|
15 | 15 | from rich.table import Table |
16 | 16 |
|
17 | 17 | RICH_AVAILABLE = True |
18 | | - CONSOLE = Console() |
| 18 | + console = Console() |
19 | 19 | except ImportError: # pragma: no cover - optional dependency |
20 | 20 | RICH_AVAILABLE = False |
21 | | - CONSOLE = None |
| 21 | + console = None |
22 | 22 |
|
23 | 23 |
|
24 | 24 | def write_chunks(chunks, strategy: str): |
@@ -58,71 +58,72 @@ def analyze(args): |
58 | 58 | int: exit code (0 on success, non-zero on error) |
59 | 59 | """ |
60 | 60 |
|
61 | | - docs = mdparser.read_markdown_folder(args.folder) |
62 | | - if not docs: |
| 61 | + results, text = _run_all_strategies(args) |
| 62 | + if results is None: |
63 | 63 | print("No markdown files found") |
64 | 64 | return 1 |
| 65 | + _write_results(results, None, args.output) |
| 66 | + if not args.test_file: |
| 67 | + print(f"Total text length (chars): {len(text)}") |
| 68 | + return 0 |
| 69 | + |
| 70 | + |
| 71 | +def _run_all_strategies(args): |
| 72 | + """Helper to run all strategies and collect results.""" |
| 73 | + docs = mdparser.read_markdown_folder(args.folder) |
| 74 | + if not docs: |
| 75 | + return None, None |
65 | 76 | text = mdparser.clean_markdown_text(docs) |
66 | 77 | strategies = ( |
67 | 78 | [args.strategy] if args.strategy != "all" else list(chunker.STRATEGIES.keys()) |
68 | 79 | ) |
69 | 80 | results = [] |
70 | | - detail = {} |
71 | | - questions = None |
72 | | - if args.test_file: |
73 | | - questions = scorer.load_test_file(args.test_file) |
74 | | - |
75 | 81 | for strat in strategies: |
76 | 82 | func = chunker.STRATEGIES.get(strat) |
77 | 83 | if not func: |
78 | 84 | print(f"Unknown strategy: {strat}") |
79 | | - return 1 |
80 | | - result, per_questions = _run_strategy(text, func, strat, args, questions) |
| 85 | + continue |
| 86 | + result, per_questions = _run_strategy(text, func, strat, args) |
| 87 | + result["per_questions"] = per_questions |
81 | 88 | results.append(result) |
82 | | - detail[strat] = per_questions |
83 | | - |
84 | | - _write_results(results, detail, args.output) |
85 | | - |
86 | | - if not questions: |
87 | | - print(f"Total text length (chars): {len(text)}") |
88 | | - return 0 |
| 89 | + return results, text |
89 | 90 |
|
90 | 91 |
|
91 | | -def _run_strategy(text, func, strat, args, questions): |
| 92 | +def _run_strategy(text, func, strat, args): |
92 | 93 | """Run a single chunking strategy and return result dict and per-question details. |
93 | 94 |
|
94 | 95 | Args: |
95 | 96 | text: Full cleaned text |
96 | 97 | func: chunking function |
97 | 98 | strat: strategy name |
98 | 99 | args: argparse.Namespace containing configuration |
99 | | - questions: loaded questions list or None |
100 | 100 | """ |
101 | | - chunk_size = args.chunk_size |
102 | | - overlap = args.overlap |
103 | | - top_k = args.top_k |
104 | | - use_tiktoken = getattr(args, "use_tiktoken", False) |
105 | | - tiktoken_model = getattr(args, "tiktoken_model", "gpt-3.5-turbo") |
106 | 101 | chunks = func( |
107 | 102 | text, |
108 | | - chunk_size=chunk_size, |
109 | | - overlap=overlap, |
110 | | - use_tiktoken=use_tiktoken, |
111 | | - model=tiktoken_model, |
| 103 | + chunk_size=args.chunk_size, |
| 104 | + overlap=args.overlap, |
| 105 | + use_tiktoken=getattr(args, "use_tiktoken", False), |
| 106 | + model=getattr(args, "tiktoken_model", "gpt-3.5-turbo"), |
112 | 107 | ) |
113 | 108 | outdir = write_chunks(chunks, strat) |
114 | | - chunk_count = len(chunks) |
115 | | - avg_recall = 0.0 |
116 | | - per_questions = [] |
| 109 | + |
| 110 | + avg_recall, per_questions = 0.0, [] |
| 111 | + questions = ( |
| 112 | + scorer.load_test_file(args.test_file) |
| 113 | + if getattr(args, "test_file", None) |
| 114 | + else None |
| 115 | + ) |
117 | 116 | if questions: |
118 | | - avg_recall, per_questions = scorer.evaluate_strategy(chunks, questions, top_k) |
119 | | - result = { |
| 117 | + avg_recall, per_questions = scorer.evaluate_strategy( |
| 118 | + chunks, questions, args.top_k |
| 119 | + ) |
| 120 | + |
| 121 | + return { |
120 | 122 | "strategy": strat, |
121 | | - "chunks": chunk_count, |
| 123 | + "chunks": len(chunks), |
122 | 124 | "avg_recall": round(avg_recall, 4), |
123 | 125 | "saved": str(outdir), |
124 | | - } |
125 | | - return result, per_questions |
| 126 | + }, per_questions |
126 | 127 |
|
127 | 128 |
|
128 | 129 | def _write_results(results, detail, output): |
@@ -159,7 +160,7 @@ def _write_results(results, detail, output): |
159 | 160 | pct_cell, |
160 | 161 | str(r.get("saved", "")), |
161 | 162 | ) |
162 | | - CONSOLE.print(table) |
| 163 | + console.print(table) |
163 | 164 | return |
164 | 165 | print(format_table(results)) |
165 | 166 | return |
|
0 commit comments