
Commit 689e915

chore: add langchain recursive strategy

1 parent a7f8232

5 files changed: +187 -24 lines changed

pyproject.toml

Lines changed: 3 additions & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "rag-chunk"
-version = "0.2.0"
+version = "0.3.0"
 description = "CLI tool to parse, chunk, and evaluate Markdown documents for RAG pipelines with token-accurate chunking support"
 authors = [ { name = "messkan" } ]
 license = { text = "MIT" }
@@ -23,3 +23,5 @@ build-backend = "setuptools.build_meta"
 [project.optional-dependencies]
 rich = ["rich>=12.0.0"]
 tiktoken = ["tiktoken>=0.5.0"]
+langchain = ["langchain>=0.1.0", "langchain-text-splitters>=0.0.1"]
+all = ["rich>=12.0.0", "tiktoken>=0.5.0", "langchain>=0.1.0", "langchain-text-splitters>=0.0.1"]

src/chunker.py

Lines changed: 69 additions & 0 deletions

@@ -10,6 +10,14 @@
     TIKTOKEN_AVAILABLE = False
     tiktoken = None
 
+try:
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+    LANGCHAIN_AVAILABLE = True
+except ImportError:
+    LANGCHAIN_AVAILABLE = False
+    RecursiveCharacterTextSplitter = None
+
 
 def tokenize(
     text: str, use_tiktoken: bool = False, model: str = "gpt-3.5-turbo"
@@ -128,6 +136,57 @@ def paragraph_chunks(text: str) -> List[Dict]:
     return chunks
 
 
+def recursive_character_chunks(
+    text: str,
+    chunk_size: int = 200,
+    overlap: int = 50,
+    use_tiktoken: bool = False,
+    model: str = "gpt-3.5-turbo",
+) -> List[Dict]:
+    """Split text using LangChain's RecursiveCharacterTextSplitter.
+
+    Recursively splits by paragraphs, sentences, then words for semantic coherence.
+
+    Args:
+        text: Text to chunk
+        chunk_size: Target size per chunk (words or tokens)
+        overlap: Overlap between chunks
+        use_tiktoken: If True, use tiktoken for token-based chunking
+        model: Model name for tiktoken encoding
+
+    Returns:
+        List of chunk dictionaries with 'id' and 'text' keys
+    """
+    if not LANGCHAIN_AVAILABLE:
+        raise ImportError(
+            "LangChain is required for recursive-character strategy. "
+            "Install with: pip install rag-chunk[langchain]"
+        )
+
+    if use_tiktoken:
+        if not TIKTOKEN_AVAILABLE:
+            raise ImportError(
+                "tiktoken is required for token-based chunking. "
+                "Install with: pip install rag-chunk[tiktoken]"
+            )
+        import tiktoken
+
+        enc = tiktoken.encoding_for_model(model)
+        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+            encoding_name=enc.name, chunk_size=chunk_size, chunk_overlap=overlap
+        )
+    else:
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=overlap,
+            length_function=len,
+            separators=["\n\n", "\n", ". ", " ", ""],
+        )
+
+    texts = splitter.split_text(text)
+    return [{"id": i, "text": t} for i, t in enumerate(texts)]
+
+
 STRATEGIES = {
     "fixed-size": (
         lambda text, chunk_size=200, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo":
@@ -152,4 +211,14 @@ def paragraph_chunks(text: str) -> List[Dict]:
         lambda text, chunk_size=0, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo":
             paragraph_chunks(text)
     ),
+    "recursive-character": (
+        lambda text, chunk_size=200, overlap=50, use_tiktoken=False, model="gpt-3.5-turbo":
+            recursive_character_chunks(
+                text,
+                chunk_size,
+                overlap,
+                use_tiktoken=use_tiktoken,
+                model=model
+            )
+    ),
 }
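
For orientation, a minimal usage sketch of the new strategy: it assumes the langchain extra is installed (pip install "rag-chunk[langchain]", as the error message above suggests) and that src/chunker.py is importable as chunker; values are illustrative.

# Illustrative only -- assumes `chunker` is importable and the langchain extra is installed.
from chunker import STRATEGIES, recursive_character_chunks

text = "# Notes\n\nFirst paragraph about retrieval.\n\nSecond paragraph about chunking."

# Direct call: character-based splitting with the default separators.
chunks = recursive_character_chunks(text, chunk_size=80, overlap=20)

# Equivalent call through the strategy registry, as the CLI does for
# --strategy recursive-character.
chunks = STRATEGIES["recursive-character"](text, chunk_size=80, overlap=20)

for c in chunks:
    print(c["id"], repr(c["text"]))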

src/cli.py

Lines changed: 49 additions & 15 deletions

@@ -107,21 +107,24 @@ def _run_strategy(text, func, strat, args):
     )
     outdir = write_chunks(chunks, strat)
 
-    avg_recall, per_questions = 0.0, []
+    metrics = {"avg_recall": 0.0, "avg_precision": 0.0, "avg_f1": 0.0}
+    per_questions = []
     questions = (
         scorer.load_test_file(args.test_file)
         if getattr(args, "test_file", None)
        else None
    )
    if questions:
-        avg_recall, per_questions = scorer.evaluate_strategy(
+        metrics, per_questions = scorer.evaluate_strategy(
            chunks, questions, args.top_k
        )
 
     return {
         "strategy": strat,
         "chunks": len(chunks),
-        "avg_recall": round(avg_recall, 4),
+        "avg_recall": round(metrics["avg_recall"], 4),
+        "avg_precision": round(metrics["avg_precision"], 4),
+        "avg_f1": round(metrics["avg_f1"], 4),
         "saved": str(outdir),
     }, per_questions
 
@@ -137,27 +140,55 @@ def _write_results(results, detail, output):
         table.add_column("strategy", style="cyan")
         table.add_column("chunks", justify="right")
         table.add_column("avg_recall", justify="right")
+        table.add_column("avg_precision", justify="right")
+        table.add_column("avg_f1", justify="right")
         table.add_column("saved")
         for r in results:
-            avg = r.get("avg_recall", 0.0)
+            recall = r.get("avg_recall", 0.0)
+            precision = r.get("avg_precision", 0.0)
+            f1 = r.get("avg_f1", 0.0)
+
+            # Format recall with color
             try:
-                pct = f"{avg*100:.2f}%"
+                recall_pct = f"{recall*100:.2f}%"
             except (TypeError, ValueError):
-                pct = str(avg)
-            if isinstance(avg, float):
-                if avg >= 0.85:
+                recall_pct = str(recall)
+            if isinstance(recall, float):
+                if recall >= 0.85:
                     color = "green"
-                elif avg >= 0.7:
+                elif recall >= 0.7:
                     color = "yellow"
                 else:
                     color = "red"
-                pct_cell = f"[{color}]{pct}[/{color}]"
+                recall_cell = f"[{color}]{recall_pct}[/{color}]"
             else:
-                pct_cell = pct
+                recall_cell = recall_pct
+
+            # Format precision
+            precision_pct = f"{precision*100:.2f}%" if isinstance(precision, float) else str(precision)
+
+            # Format F1 with color
+            try:
+                f1_pct = f"{f1*100:.2f}%"
+            except (TypeError, ValueError):
+                f1_pct = str(f1)
+            if isinstance(f1, float):
+                if f1 >= 0.85:
+                    color = "green"
+                elif f1 >= 0.7:
+                    color = "yellow"
+                else:
+                    color = "red"
+                f1_cell = f"[{color}]{f1_pct}[/{color}]"
+            else:
+                f1_cell = f1_pct
+
             table.add_row(
                 str(r.get("strategy", "")),
                 str(r.get("chunks", "")),
-                pct_cell,
+                recall_cell,
+                precision_pct,
+                f1_cell,
                 str(r.get("saved", "")),
             )
         console.print(table)
@@ -172,9 +203,11 @@ def _write_results(results, detail, output):
         wpath = Path("analysis_results.csv")
         with wpath.open("w", newline="", encoding="utf-8") as f:
             w = csv.writer(f)
-            w.writerow(["strategy", "chunks", "avg_recall", "saved"])
+            w.writerow(["strategy", "chunks", "avg_recall", "avg_precision",
+                        "avg_f1", "saved"])
             for r in results:
-                w.writerow([r["strategy"], r["chunks"], r["avg_recall"], r["saved"]])
+                w.writerow([r["strategy"], r["chunks"], r["avg_recall"],
+                            r["avg_precision"], r["avg_f1"], r["saved"]])
         print(str(wpath))
         return
     print("Unsupported output format")
@@ -191,7 +224,8 @@ def build_parser():
         "--strategy",
         type=str,
         default="fixed-size",
-        choices=["fixed-size", "sliding-window", "paragraph", "all"],
+        choices=["fixed-size", "sliding-window", "paragraph",
+                 "recursive-character", "all"],
         help="Chunking strategy or all",
     )
     analyze_p.add_argument(
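
To make the new report columns concrete, here is a sketch of one results row as _run_strategy now builds it and _write_results consumes it; the metric values and the saved path are invented for illustration.

# Illustrative only -- metric values and the output path are made up.
row = {
    "strategy": "recursive-character",
    "chunks": 37,
    "avg_recall": 0.8333,
    "avg_precision": 1.0,
    "avg_f1": 0.9091,
    "saved": "chunks/recursive-character",  # whatever write_chunks returned
}

# The CSV output carries the same fields, in this column order:
# strategy, chunks, avg_recall, avg_precision, avg_f1, saved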

src/parser.py

Lines changed: 2 additions & 2 deletions

@@ -4,9 +4,9 @@
 
 
 def read_markdown_folder(folder: str) -> list:
-    """Return list of (path, text) for all .md files in folder (non-recursive)."""
+    """Return list of (path, text) for all .md and .txt files in folder (non-recursive)."""
     p = Path(folder)
-    files = [f for f in p.iterdir() if f.is_file() and f.suffix.lower() == ".md"]
+    files = [f for f in p.iterdir() if f.is_file() and f.suffix.lower() in [".md", ".txt"]]
     result = []
     for f in files:
         try:
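
A short usage sketch, assuming src/parser.py is importable as parser: the folder is scanned non-recursively and both .md and .txt files come back as (path, text) pairs.

# Illustrative only -- assumes `parser` refers to src/parser.py.
from parser import read_markdown_folder

for path, text in read_markdown_folder("docs"):
    print(path, len(text))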

src/scorer.py

Lines changed: 64 additions & 6 deletions

@@ -45,18 +45,76 @@ def compute_recall(retrieved: List[Dict], relevant_phrases: List[str]) -> float:
     return found / len(relevant_phrases)
 
 
+def compute_precision_recall_f1(
+    retrieved: List[Dict], relevant_phrases: List[str]
+) -> Tuple[float, float, float]:
+    """Compute precision, recall, and F1 score.
+
+    Args:
+        retrieved: List of retrieved chunk dictionaries
+        relevant_phrases: List of phrases that should be found
+
+    Returns:
+        Tuple of (precision, recall, f1)
+    """
+    if not relevant_phrases:
+        return 0.0, 0.0, 0.0
+
+    lower_texts = [c["text"].lower() for c in retrieved]
+    found_phrases = set()
+    for phrase in relevant_phrases:
+        lp = phrase.lower()
+        if any(lp in t for t in lower_texts):
+            found_phrases.add(phrase)
+
+    tp = len(found_phrases)  # True positives
+    fn = len(relevant_phrases) - tp  # False negatives
+    # For precision: assume each relevant phrase found is a "correct" retrieval
+    # FP = 0 in this simplified model (we only check relevant phrases)
+    fp = 0
+
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+    f1 = (
+        2 * precision * recall / (precision + recall)
+        if (precision + recall) > 0
+        else 0.0
+    )
+
+    return precision, recall, f1
+
+
 def evaluate_strategy(
     chunks: List[Dict], questions: List[Dict], top_k: int
-) -> Tuple[float, List[Dict]]:
-    """Return average recall and per-question details."""
+) -> Tuple[Dict, List[Dict]]:
+    """Return average metrics and per-question details.
+
+    Returns:
+        Tuple of (metrics_dict, per_question_list)
+        metrics_dict contains: avg_recall, avg_precision, avg_f1
+    """
     per = []
     recalls = []
+    precisions = []
+    f1s = []
     for q in questions:
         question = q.get("question", "")
         relevant = q.get("relevant", [])
         retrieved = retrieve_top_k(chunks, question, top_k)
-        recall = compute_recall(retrieved, relevant)
+        precision, recall, f1 = compute_precision_recall_f1(retrieved, relevant)
         recalls.append(recall)
-        per.append({"question": question, "recall": recall})
-    avg = sum(recalls) / len(recalls) if recalls else 0.0
-    return avg, per
+        precisions.append(precision)
+        f1s.append(f1)
+        per.append({
+            "question": question,
+            "recall": recall,
+            "precision": precision,
+            "f1": f1
+        })
+
+    metrics = {
+        "avg_recall": sum(recalls) / len(recalls) if recalls else 0.0,
+        "avg_precision": sum(precisions) / len(precisions) if precisions else 0.0,
+        "avg_f1": sum(f1s) / len(f1s) if f1s else 0.0,
+    }
+    return metrics, per
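
A worked example of the simplified metric model above: because fp is hard-coded to 0, precision is 1.0 whenever at least one relevant phrase is found and 0.0 otherwise. The data is invented, and the import assumes src/scorer.py is importable as scorer.

# Illustrative only -- toy data; assumes `scorer` refers to src/scorer.py.
from scorer import compute_precision_recall_f1

retrieved = [
    {"id": 0, "text": "Chunking splits documents into overlapping windows."},
    {"id": 1, "text": "Recall measures how many relevant phrases were found."},
]
relevant = ["overlapping windows", "relevant phrases", "token budget"]

precision, recall, f1 = compute_precision_recall_f1(retrieved, relevant)
# tp = 2, fn = 1, fp = 0 (by construction)
# precision = 2 / 2 = 1.0
# recall    = 2 / 3
# f1        = 2 * 1.0 * (2/3) / (1.0 + 2/3) = 0.8
print(precision, recall, f1)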
