|
1 | 1 | """Chunking strategies.""" |
2 | 2 |
|
3 | | -from typing import Dict, List |
| 3 | +import re |
| 4 | +from typing import Dict, List, Optional, Tuple |
4 | 5 |
|
5 | 6 | try: |
6 | 7 | import tiktoken |
|
18 | 19 | LANGCHAIN_AVAILABLE = False |
19 | 20 | RecursiveCharacterTextSplitter = None |
20 | 21 |
|
| 22 | +try: |
| 23 | + from sentence_transformers import SentenceTransformer |
| 24 | + import numpy as np |
| 25 | + |
| 26 | + EMBEDDINGS_AVAILABLE = True |
| 27 | +except ImportError: |
| 28 | + EMBEDDINGS_AVAILABLE = False |
| 29 | + SentenceTransformer = None |
| 30 | + np = None |
| 31 | + |
21 | 32 |
|
22 | 33 | def tokenize( |
23 | 34 | text: str, use_tiktoken: bool = False, model: str = "gpt-3.5-turbo" |
@@ -187,6 +198,324 @@ def recursive_character_chunks( |
     return [{"id": i, "text": t} for i, t in enumerate(texts)]


+def _split_into_sentences(text: str) -> List[str]:
+    """Split text into sentences using basic punctuation."""
+    # Simple sentence splitter: splits on . ! ? followed by any whitespace
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    return [s.strip() for s in sentences if s.strip()]
+
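+# Usage sketch for the splitter above (illustrative transcript, not a test):
+#
+#     >>> _split_into_sentences("Dr. Smith arrived. He sat down.")
+#     ['Dr.', 'Smith arrived.', 'He sat down.']
+#
+# Note the limitation: abbreviations such as "Dr." also end a "sentence".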
+
+
+def _extract_markdown_sections(text: str) -> List[Tuple[str, str, int]]:
+    """Extract markdown sections based on headers.
+
+    Returns:
+        List of (header_text, content, level) tuples, where level is 1-6
+        for h1-h6 and 0 for content that precedes the first header.
+    """
+    sections = []
+    lines = text.split('\n')
+    current_header = ""
+    current_content = []
+    current_level = 0
+
+    for line in lines:
+        # Check for ATX-style headers (# Header)
+        header_match = re.match(r'^(#{1,6})\s+(.+)$', line)
+        if header_match:
+            # Save the previous section (including header-only sections)
+            if current_content or current_header:
+                sections.append((
+                    current_header,
+                    '\n'.join(current_content).strip(),
+                    current_level
+                ))
+            # Start new section
+            current_level = len(header_match.group(1))
+            current_header = header_match.group(2).strip()
+            current_content = []
+        else:
+            current_content.append(line)
+
+    # Add final section
+    if current_content or current_header:
+        sections.append((
+            current_header,
+            '\n'.join(current_content).strip(),
+            current_level
+        ))
+
+    return sections
+
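+# Usage sketch for the section extractor (illustrative transcript):
+#
+#     >>> _extract_markdown_sections("# Intro\nHello.\n## Details\nMore.")
+#     [('Intro', 'Hello.', 1), ('Details', 'More.', 2)]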
+
+def hierarchical_chunk(
+    text: str,
+    levels: Optional[List[str]] = None,
+    use_tiktoken: bool = False,
+    model: str = "gpt-3.5-turbo",
+    source_path: str = "",
+) -> List[Dict]:
+    """Build multi-level chunk hierarchies.
+
+    Splits text hierarchically: section → paragraph → sentence. Levels nest,
+    so 'sentence' chunks are only produced when the level directly above them
+    is also requested (e.g. 'sentence' alongside 'section' but without
+    'paragraph' yields no sentence chunks).
+
+    Args:
+        text: Text to chunk
+        levels: List of levels to split by. Options: 'section', 'paragraph',
+            'sentence'. Default: ['section', 'paragraph']
+        use_tiktoken: If True, use tiktoken for token counting
+        model: Model name for tiktoken encoding
+        source_path: Optional source file path for metadata
+
+    Returns:
+        List of chunk dictionaries with metadata:
+        - id: unique chunk identifier
+        - text: chunk text
+        - parent_id: parent chunk id (None for top level)
+        - level: hierarchy level name
+        - start_char: start position in original text
+        - end_char: end position in original text
+        - token_count: number of tokens
+        - source_path: source file path
+        - header, header_level: present on section-level chunks only
+    """
+    if levels is None:
+        levels = ['section', 'paragraph']
+
+    chunks = []
+    chunk_id = 0
+
+    # Level 1: Try to split by sections (markdown headers)
+    if 'section' in levels:
+        sections = _extract_markdown_sections(text)
+
+        # If no sections found, fall back to treating the entire text as one section
+        if not sections or (len(sections) == 1 and sections[0][0] == "" and sections[0][2] == 0):
+            sections = [("", text, 0)]
+
+        for header, content, level in sections:
+            if not content.strip():
+                continue
+
+            # Find position in original text (first occurrence; duplicated
+            # content maps to its first match)
+            start_pos = text.find(content)
+            end_pos = start_pos + len(content) if start_pos >= 0 else len(content)
+
+            section_chunk = {
+                "id": chunk_id,
+                "text": content,
+                "parent_id": None,
+                "level": "section",
+                "start_char": start_pos if start_pos >= 0 else 0,
+                "end_char": end_pos,
+                "token_count": count_tokens(content, use_tiktoken=use_tiktoken, model=model),
+                "source_path": source_path,
+                "header": header,
+                "header_level": level,
+            }
+            chunks.append(section_chunk)
+            section_parent_id = chunk_id
+            chunk_id += 1
+
+            # Level 2: Split sections into paragraphs
+            if 'paragraph' in levels:
+                paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
+
+                for para in paragraphs:
+                    para_start = text.find(para)
+                    para_end = para_start + len(para) if para_start >= 0 else len(para)
+
+                    para_chunk = {
+                        "id": chunk_id,
+                        "text": para,
+                        "parent_id": section_parent_id,
+                        "level": "paragraph",
+                        "start_char": para_start if para_start >= 0 else 0,
+                        "end_char": para_end,
+                        "token_count": count_tokens(para, use_tiktoken=use_tiktoken, model=model),
+                        "source_path": source_path,
+                    }
+                    chunks.append(para_chunk)
+                    para_parent_id = chunk_id
+                    chunk_id += 1
+
+                    # Level 3: Split paragraphs into sentences
+                    if 'sentence' in levels:
+                        sentences = _split_into_sentences(para)
+
+                        for sent in sentences:
+                            sent_start = text.find(sent)
+                            sent_end = sent_start + len(sent) if sent_start >= 0 else len(sent)
+
+                            sent_chunk = {
+                                "id": chunk_id,
+                                "text": sent,
+                                "parent_id": para_parent_id,
+                                "level": "sentence",
+                                "start_char": sent_start if sent_start >= 0 else 0,
+                                "end_char": sent_end,
+                                "token_count": count_tokens(sent, use_tiktoken=use_tiktoken, model=model),
+                                "source_path": source_path,
+                            }
+                            chunks.append(sent_chunk)
+                            chunk_id += 1
+
+    elif 'paragraph' in levels:
+        # Start with paragraphs if 'section' not in levels
+        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
+
+        for para in paragraphs:
+            para_start = text.find(para)
+            para_end = para_start + len(para) if para_start >= 0 else len(para)
+
+            para_chunk = {
+                "id": chunk_id,
+                "text": para,
+                "parent_id": None,
+                "level": "paragraph",
+                "start_char": para_start if para_start >= 0 else 0,
+                "end_char": para_end,
+                "token_count": count_tokens(para, use_tiktoken=use_tiktoken, model=model),
+                "source_path": source_path,
+            }
+            chunks.append(para_chunk)
+            para_parent_id = chunk_id
+            chunk_id += 1
+
+            if 'sentence' in levels:
+                sentences = _split_into_sentences(para)
+
+                for sent in sentences:
+                    sent_start = text.find(sent)
+                    sent_end = sent_start + len(sent) if sent_start >= 0 else len(sent)
+
+                    sent_chunk = {
+                        "id": chunk_id,
+                        "text": sent,
+                        "parent_id": para_parent_id,
+                        "level": "sentence",
+                        "start_char": sent_start if sent_start >= 0 else 0,
+                        "end_char": sent_end,
+                        "token_count": count_tokens(sent, use_tiktoken=use_tiktoken, model=model),
+                        "source_path": source_path,
+                    }
+                    chunks.append(sent_chunk)
+                    chunk_id += 1
+
+    elif 'sentence' in levels:
+        # Start with sentences if neither section nor paragraph in levels
+        sentences = _split_into_sentences(text)
+
+        for sent in sentences:
+            sent_start = text.find(sent)
+            sent_end = sent_start + len(sent) if sent_start >= 0 else len(sent)
+
+            sent_chunk = {
+                "id": chunk_id,
+                "text": sent,
+                "parent_id": None,
+                "level": "sentence",
+                "start_char": sent_start if sent_start >= 0 else 0,
+                "end_char": sent_end,
+                "token_count": count_tokens(sent, use_tiktoken=use_tiktoken, model=model),
+                "source_path": source_path,
+            }
+            chunks.append(sent_chunk)
+            chunk_id += 1
+
+    # Fallback: if no chunks created, return entire text as one chunk
+    if not chunks:
+        chunks.append({
+            "id": 0,
+            "text": text,
+            "parent_id": None,
+            "level": "document",
+            "start_char": 0,
+            "end_char": len(text),
+            "token_count": count_tokens(text, use_tiktoken=use_tiktoken, model=model),
+            "source_path": source_path,
+        })
+
+    return chunks
+
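+# Usage sketch (illustrative transcript):
+#
+#     >>> doc = "# Title\nFirst paragraph.\n\nSecond paragraph."
+#     >>> chunks = hierarchical_chunk(doc, levels=['section', 'paragraph'])
+#     >>> [(c["level"], c["parent_id"]) for c in chunks]
+#     [('section', None), ('paragraph', 0), ('paragraph', 0)]
+#
+# The section chunk (id 0) is the parent of both paragraphs, so a retriever
+# can match a small chunk and still walk up to its surrounding context.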
+
+
+def semantic_split(
+    text: str,
+    model: str = 'all-MiniLM-L6-v2',
+    threshold: float = 0.7,
+    use_tiktoken: bool = False,
+    tiktoken_model: str = "gpt-3.5-turbo",
+) -> List[Dict]:
+    """Detect topic boundaries via semantic embeddings.
+
+    Splits text at points where the cosine similarity between consecutive
+    sentence embeddings drops below the threshold (a simple changepoint
+    heuristic).
+
+    Args:
+        text: Text to chunk
+        model: Sentence-transformers model name (default: 'all-MiniLM-L6-v2')
+        threshold: Similarity threshold for splitting (0.0-1.0, default: 0.7)
+        use_tiktoken: If True, use tiktoken for token counting
+        tiktoken_model: Model name for tiktoken encoding
+
+    Returns:
+        List of chunk dictionaries with 'id', 'text', and 'token_count' keys
+    """
+    if not EMBEDDINGS_AVAILABLE:
+        raise ImportError(
+            "sentence-transformers is required for semantic-embedding strategy. "
+            "Install with: pip install rag-chunk[embeddings]"
+        )
+
+    # Split into sentences
+    sentences = _split_into_sentences(text)
+
+    if len(sentences) <= 1:
+        # Not enough sentences to split
+        return [{
+            "id": 0,
+            "text": text,
+            "token_count": count_tokens(text, use_tiktoken=use_tiktoken, model=tiktoken_model),
+        }]
+
+    # Load model and compute embeddings
+    embedder = SentenceTransformer(model)
+    embeddings = embedder.encode(sentences)
+
+    # Compute cosine similarities between consecutive sentences
+    similarities = []
+    for i in range(len(embeddings) - 1):
+        # Cosine similarity
+        sim = np.dot(embeddings[i], embeddings[i + 1]) / (
+            np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1])
+        )
+        similarities.append(sim)
+
+    # Find split points where similarity drops below threshold
+    split_indices = [0]  # Always start at beginning
+    for i, sim in enumerate(similarities):
+        if sim < threshold:
+            split_indices.append(i + 1)
+    split_indices.append(len(sentences))  # Always end at the end
+
+    # Create chunks from split points
+    chunks = []
+    for i in range(len(split_indices) - 1):
+        start_idx = split_indices[i]
+        end_idx = split_indices[i + 1]
+        chunk_sentences = sentences[start_idx:end_idx]
+        chunk_text = ' '.join(chunk_sentences)
+
+        chunks.append({
+            "id": i,
+            "text": chunk_text,
+            "token_count": count_tokens(
+                chunk_text,
+                use_tiktoken=use_tiktoken,
+                model=tiktoken_model
+            ),
+        })
+
+    return chunks
+
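+# Usage sketch (illustrative; the embedding model is downloaded on first use):
+#
+#     >>> chunks = semantic_split(
+#     ...     "Cats purr. Kittens play. The Fed raised rates. Bonds fell.",
+#     ...     threshold=0.5,
+#     ... )
+#     >>> len(chunks)   # plausibly 2: one feline chunk, one finance chunk
+#     2
+#
+# Exact boundaries depend on the embedding model and threshold, so treat
+# this output as plausible rather than guaranteed.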
+
+
 STRATEGIES = {
     "fixed-size": (
         lambda text, chunk_size=200, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo":
@@ -221,4 +550,24 @@ def recursive_character_chunks( |
             model=model,
         )
     ),
+    "hierarchical": (
+        lambda text, chunk_size=0, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo", **kwargs:
+        hierarchical_chunk(
+            text,
+            levels=kwargs.get('levels', ['section', 'paragraph']),
+            use_tiktoken=use_tiktoken,
+            model=model,
+            source_path=kwargs.get('source_path', ''),
+        )
+    ),
+    "semantic-embedding": (
+        lambda text, chunk_size=0, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo", **kwargs:
+        semantic_split(
+            text,
+            model=kwargs.get('semantic_model', 'all-MiniLM-L6-v2'),
+            threshold=kwargs.get('threshold', 0.7),
+            use_tiktoken=use_tiktoken,
+            tiktoken_model=model,
+        )
+    ),
 }
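+
+# Dispatch sketch (illustrative): every strategy shares the same positional
+# signature, so callers can look one up by name and pass extras via kwargs.
+#
+#     >>> chunker = STRATEGIES["hierarchical"]
+#     >>> chunker("# Title\nBody text.", levels=['section'])[0]["level"]
+#     'section'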