Skip to content

Commit d3a2199

Browse files
Copilot and messkan committed
Implement hierarchical and semantic chunking strategies
Co-authored-by: messkan <[email protected]>
1 parent d3dc783 commit d3a2199

File tree

4 files changed

+475
-11
lines changed

4 files changed

+475
-11
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@ build-backend = "setuptools.build_meta"
2424
rich = ["rich>=12.0.0"]
2525
tiktoken = ["tiktoken>=0.5.0"]
2626
langchain = ["langchain>=0.1.0", "langchain-text-splitters>=0.0.1"]
27-
all = ["rich>=12.0.0", "tiktoken>=0.5.0", "langchain>=0.1.0", "langchain-text-splitters>=0.0.1"]
27+
embeddings = ["sentence-transformers>=2.0.0"]
28+
all = ["rich>=12.0.0", "tiktoken>=0.5.0", "langchain>=0.1.0", "langchain-text-splitters>=0.0.1", "sentence-transformers>=2.0.0"]

src/chunker.py

Lines changed: 350 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Chunking strategies."""
22

3-
from typing import Dict, List
3+
import re
4+
from typing import Dict, List, Optional, Tuple
45

56
try:
67
import tiktoken
@@ -18,6 +19,16 @@
1819
LANGCHAIN_AVAILABLE = False
1920
RecursiveCharacterTextSplitter = None
2021

22+
try:
23+
from sentence_transformers import SentenceTransformer
24+
import numpy as np
25+
26+
EMBEDDINGS_AVAILABLE = True
27+
except ImportError:
28+
EMBEDDINGS_AVAILABLE = False
29+
SentenceTransformer = None
30+
np = None
31+
2132

2233
def tokenize(
2334
text: str, use_tiktoken: bool = False, model: str = "gpt-3.5-turbo"
@@ -187,6 +198,324 @@ def recursive_character_chunks(
187198
return [{"id": i, "text": t} for i, t in enumerate(texts)]
188199

189200

201+
def _split_into_sentences(text: str) -> List[str]:
202+
"""Split text into sentences using basic punctuation."""
203+
# Simple sentence splitter - splits on . ! ? followed by space/newline
204+
sentences = re.split(r'(?<=[.!?])\s+', text)
205+
return [s.strip() for s in sentences if s.strip()]
206+
207+
208+
def _extract_markdown_sections(text: str) -> List[Tuple[str, str, int]]:
209+
"""Extract markdown sections based on headers.
210+
211+
Returns:
212+
List of (header_text, content, level) tuples where level is 1-6 for h1-h6
213+
"""
214+
sections = []
215+
lines = text.split('\n')
216+
current_header = ""
217+
current_content = []
218+
current_level = 0
219+
220+
for line in lines:
221+
# Check for ATX-style headers (# Header)
222+
header_match = re.match(r'^(#{1,6})\s+(.+)$', line)
223+
if header_match:
224+
# Save previous section
225+
if current_content:
226+
sections.append((
227+
current_header,
228+
'\n'.join(current_content).strip(),
229+
current_level
230+
))
231+
# Start new section
232+
current_level = len(header_match.group(1))
233+
current_header = header_match.group(2).strip()
234+
current_content = []
235+
else:
236+
current_content.append(line)
237+
238+
# Add final section
239+
if current_content:
240+
sections.append((
241+
current_header,
242+
'\n'.join(current_content).strip(),
243+
current_level
244+
))
245+
246+
return sections
247+
248+
249+
def hierarchical_chunk(
    text: str,
    levels: Optional[List[str]] = None,
    use_tiktoken: bool = False,
    model: str = "gpt-3.5-turbo",
    source_path: str = "",
) -> List[Dict]:
    """Build multi-level chunk hierarchies.

    Splits text hierarchically: section -> paragraph -> sentence. Only the
    levels named in ``levels`` are emitted; each chunk's parent is the
    nearest enabled level above it (so with ``['section', 'sentence']``
    sentences are parented directly to their section).

    Sections come from ``_extract_markdown_sections`` (if it finds no
    headers, the whole text is one section), paragraphs are separated by
    blank lines (``'\\n\\n'``), and sentences come from
    ``_split_into_sentences``.

    Args:
        text: Text to chunk
        levels: List of levels to split by. Options: 'section', 'paragraph',
            'sentence'. Default: ['section', 'paragraph']
        use_tiktoken: Passed through to ``count_tokens``
        model: Model name passed through to ``count_tokens``
        source_path: Optional source file path for metadata

    Returns:
        List of chunk dictionaries in depth-first document order:
            - id: unique chunk identifier (index in the returned list)
            - text: chunk text
            - parent_id: parent chunk id (None for top level)
            - level: hierarchy level name
            - start_char: start position in original text
            - end_char: end position in original text
            - token_count: value returned by ``count_tokens`` for the text
            - source_path: source file path
        Section chunks additionally carry ``header`` and ``header_level``.
        If nothing was produced (empty text or no recognised level), a
        single chunk with level ``"document"`` covering the whole text is
        returned.
    """
    if levels is None:
        levels = ['section', 'paragraph']

    want_sections = 'section' in levels
    want_paragraphs = 'paragraph' in levels
    want_sentences = 'sentence' in levels

    chunks: List[Dict] = []

    def add_chunk(
        chunk_text: str,
        level_name: str,
        parent_id: Optional[int],
        start: int,
        extra: Optional[Dict] = None,
    ) -> int:
        """Append one chunk record and return its id."""
        chunk = {
            "id": len(chunks),
            "text": chunk_text,
            "parent_id": parent_id,
            "level": level_name,
            "start_char": start,
            "end_char": start + len(chunk_text),
            "token_count": count_tokens(chunk_text, use_tiktoken=use_tiktoken, model=model),
            "source_path": source_path,
        }
        if extra:
            chunk.update(extra)
        chunks.append(chunk)
        return chunk["id"]

    def place(piece: str, cursor: int, limit: int):
        """Locate ``piece`` at or after ``cursor`` and return (start, next_cursor).

        Searching forward from the end of the previous sibling (and only
        inside the parent's span) gives repeated text its own offset
        instead of the offset of its first occurrence in the document.
        """
        pos = text.find(piece, cursor, limit)
        if pos >= 0:
            return pos, pos + len(piece)
        # Defensive fallback: first occurrence anywhere, else offset 0.
        return max(text.find(piece), 0), cursor

    def emit_sentences(span_text: str, span_start: int, parent_id: Optional[int]) -> None:
        """Emit one sentence chunk per sentence found in ``span_text``."""
        cursor = span_start
        limit = span_start + len(span_text)
        for sent in _split_into_sentences(span_text):
            start, cursor = place(sent, cursor, limit)
            add_chunk(sent, "sentence", parent_id, start)

    def emit_paragraphs(span_text: str, span_start: int, parent_id: Optional[int]) -> None:
        """Emit paragraph chunks, each followed by its sentences if enabled."""
        cursor = span_start
        limit = span_start + len(span_text)
        for raw_para in span_text.split('\n\n'):
            para = raw_para.strip()
            if not para:
                continue
            start, cursor = place(para, cursor, limit)
            para_id = add_chunk(para, "paragraph", parent_id, start)
            if want_sentences:
                emit_sentences(para, start, para_id)

    def emit_children(span_text: str, span_start: int, parent_id: Optional[int]) -> None:
        """Split a span by the finest-grained enabled level below it."""
        if want_paragraphs:
            emit_paragraphs(span_text, span_start, parent_id)
        elif want_sentences:
            emit_sentences(span_text, span_start, parent_id)

    if want_sections:
        sections = _extract_markdown_sections(text)

        # No headers found: treat the entire (unstripped) text as one section.
        if not sections or (
            len(sections) == 1 and sections[0][0] == "" and sections[0][2] == 0
        ):
            sections = [("", text, 0)]

        cursor = 0
        for header, content, header_level in sections:
            if header_level:
                # Step past this section's own header line so the body is
                # not matched against text that also appears in the header
                # (best effort: the first matching header line is used).
                header_line = re.compile(
                    r'^#{%d}[^\S\n]+%s[^\S\n]*$' % (header_level, re.escape(header)),
                    re.MULTILINE,
                )
                found = header_line.search(text, cursor)
                if found:
                    cursor = found.end()

            if not content.strip():
                continue

            start, cursor = place(content, cursor, len(text))
            section_id = add_chunk(
                content,
                "section",
                None,
                start,
                {"header": header, "header_level": header_level},
            )
            emit_children(content, start, section_id)
    else:
        emit_children(text, 0, None)

    # Fallback: if no chunks created, return entire text as one chunk
    if not chunks:
        chunks.append({
            "id": 0,
            "text": text,
            "parent_id": None,
            "level": "document",
            "start_char": 0,
            "end_char": len(text),
            "token_count": count_tokens(text, use_tiktoken=use_tiktoken, model=model),
            "source_path": source_path,
        })

    return chunks
437+
438+
439+
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# do not pay the model-loading cost each time.
_EMBEDDER_CACHE: Dict[str, object] = {}


def semantic_split(
    text: str,
    model: str = 'all-MiniLM-L6-v2',
    threshold: float = 0.7,
    use_tiktoken: bool = False,
    tiktoken_model: str = "gpt-3.5-turbo",
) -> List[Dict]:
    """Detect topic boundaries via semantic embeddings.

    The text is split into sentences, each sentence is embedded, and a new
    chunk starts wherever the cosine similarity between two consecutive
    sentences drops below ``threshold``. Sentences inside a chunk are
    re-joined with single spaces.

    Args:
        text: Text to chunk
        model: Sentence-transformers model name (default: 'all-MiniLM-L6-v2')
        threshold: Similarity threshold for splitting (0.0-1.0, default: 0.7)
        use_tiktoken: Passed through to ``count_tokens``
        tiktoken_model: Model name passed to ``count_tokens``

    Returns:
        List of chunk dictionaries with 'id', 'text', and 'token_count' keys.
        When the text has at most one sentence, a single chunk holding the
        original, unmodified text is returned and no model is loaded.

    Raises:
        ImportError: If the optional embedding dependencies are missing.
    """
    if not EMBEDDINGS_AVAILABLE:
        raise ImportError(
            "sentence-transformers is required for semantic-embedding strategy. "
            "Install with: pip install rag-chunk[embeddings]"
        )

    sentences = _split_into_sentences(text)

    if len(sentences) <= 1:
        # Not enough sentences to split
        return [{
            "id": 0,
            "text": text,
            "token_count": count_tokens(text, use_tiktoken=use_tiktoken, model=tiktoken_model),
        }]

    # Reuse an already loaded model; constructing one on every call is slow.
    embedder = _EMBEDDER_CACHE.get(model)
    if embedder is None:
        embedder = _EMBEDDER_CACHE[model] = SentenceTransformer(model)

    # One row per sentence (encode is expected to return an array-like of
    # shape (len(sentences), dim) - NOTE(review): confirm for custom models).
    vectors = np.asarray(embedder.encode(sentences), dtype=float)

    # Cosine similarity of every consecutive sentence pair, vectorized.
    norms = np.linalg.norm(vectors, axis=1)
    pair_norms = norms[:-1] * norms[1:]
    dots = np.sum(vectors[:-1] * vectors[1:], axis=1)

    # A zero-length embedding has no defined cosine similarity. Skip the
    # division there (avoids NaN and a RuntimeWarning) and never split on
    # such a pair, which matches the outcome of comparing NaN < threshold.
    defined = pair_norms > 0
    similarities = np.zeros_like(dots)
    np.divide(dots, pair_norms, out=similarities, where=defined)

    # Index i+1 starts a new chunk when pair (i, i+1) is dissimilar.
    boundaries = np.flatnonzero(defined & (similarities < threshold)) + 1
    split_indices = [0, *boundaries.tolist(), len(sentences)]

    chunks = []
    for chunk_id, (start_idx, end_idx) in enumerate(
        zip(split_indices, split_indices[1:])
    ):
        chunk_text = ' '.join(sentences[start_idx:end_idx])
        chunks.append({
            "id": chunk_id,
            "text": chunk_text,
            "token_count": count_tokens(
                chunk_text,
                use_tiktoken=use_tiktoken,
                model=tiktoken_model,
            ),
        })

    return chunks
517+
518+
190519
STRATEGIES = {
191520
"fixed-size": (
192521
lambda text, chunk_size=200, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo":
@@ -221,4 +550,24 @@ def recursive_character_chunks(
221550
model=model,
222551
)
223552
),
553+
"hierarchical": (
554+
lambda text, chunk_size=0, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo", **kwargs:
555+
hierarchical_chunk(
556+
text,
557+
levels=kwargs.get('levels', ['section', 'paragraph']),
558+
use_tiktoken=use_tiktoken,
559+
model=model,
560+
source_path=kwargs.get('source_path', ''),
561+
)
562+
),
563+
"semantic-embedding": (
564+
lambda text, chunk_size=0, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo", **kwargs:
565+
semantic_split(
566+
text,
567+
model=kwargs.get('semantic_model', 'all-MiniLM-L6-v2'),
568+
threshold=kwargs.get('threshold', 0.7),
569+
use_tiktoken=use_tiktoken,
570+
tiktoken_model=model,
571+
)
572+
),
224573
}

0 commit comments

Comments
 (0)