Skip to content

Commit 9e404eb

Browse files
committed
chore: update ci
1 parent d58fff9 commit 9e404eb

File tree

3 files changed

+60
-45
lines changed

3 files changed

+60
-45
lines changed

.github/workflows/pylint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ jobs:
77
runs-on: ubuntu-latest
88
strategy:
99
matrix:
10-
python-version: ["3.10"]
10+
python-version: ["3.14"]
1111
steps:
1212
- uses: actions/checkout@v4
1313
- name: Set up Python ${{ matrix.python-version }}

src/chunker.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,27 @@ def paragraph_chunks(text: str) -> List[Dict]:
129129

130130

131131
STRATEGIES = {
132-
"fixed-size": lambda text, chunk_size=200, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo": fixed_size_chunks(
133-
text, chunk_size, use_tiktoken=use_tiktoken, model=model
132+
"fixed-size": (
133+
lambda text, chunk_size=200, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo":
134+
fixed_size_chunks(
135+
text,
136+
chunk_size,
137+
use_tiktoken=use_tiktoken,
138+
model=model
139+
)
134140
),
135-
"sliding-window": lambda text, chunk_size=200, overlap=50, use_tiktoken=False, model="gpt-3.5-turbo": sliding_window_chunks(
136-
text, chunk_size, overlap, use_tiktoken=use_tiktoken, model=model
141+
"sliding-window": (
142+
lambda text, chunk_size=200, overlap=50, use_tiktoken=False, model="gpt-3.5-turbo":
143+
sliding_window_chunks(
144+
text,
145+
chunk_size,
146+
overlap,
147+
use_tiktoken=use_tiktoken,
148+
model=model
149+
)
137150
),
138-
"paragraph": lambda text, chunk_size=0, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo": paragraph_chunks(
139-
text
151+
"paragraph": (
152+
lambda text, chunk_size=0, overlap=0, use_tiktoken=False, model="gpt-3.5-turbo":
153+
paragraph_chunks(text)
140154
),
141155
}

src/cli.py

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
from rich.table import Table
1616

1717
RICH_AVAILABLE = True
18-
CONSOLE = Console()
18+
console = Console()
1919
except ImportError: # pragma: no cover - optional dependency
2020
RICH_AVAILABLE = False
21-
CONSOLE = None
21+
console = None
2222

2323

2424
def write_chunks(chunks, strategy: str):
@@ -58,71 +58,72 @@ def analyze(args):
5858
int: exit code (0 on success, non-zero on error)
5959
"""
6060

61-
docs = mdparser.read_markdown_folder(args.folder)
62-
if not docs:
61+
results, text = _run_all_strategies(args)
62+
if results is None:
6363
print("No markdown files found")
6464
return 1
65+
_write_results(results, None, args.output)
66+
if not args.test_file:
67+
print(f"Total text length (chars): {len(text)}")
68+
return 0
69+
70+
71+
def _run_all_strategies(args):
72+
"""Helper to run all strategies and collect results."""
73+
docs = mdparser.read_markdown_folder(args.folder)
74+
if not docs:
75+
return None, None
6576
text = mdparser.clean_markdown_text(docs)
6677
strategies = (
6778
[args.strategy] if args.strategy != "all" else list(chunker.STRATEGIES.keys())
6879
)
6980
results = []
70-
detail = {}
71-
questions = None
72-
if args.test_file:
73-
questions = scorer.load_test_file(args.test_file)
74-
7581
for strat in strategies:
7682
func = chunker.STRATEGIES.get(strat)
7783
if not func:
7884
print(f"Unknown strategy: {strat}")
79-
return 1
80-
result, per_questions = _run_strategy(text, func, strat, args, questions)
85+
continue
86+
result, per_questions = _run_strategy(text, func, strat, args)
87+
result["per_questions"] = per_questions
8188
results.append(result)
82-
detail[strat] = per_questions
83-
84-
_write_results(results, detail, args.output)
85-
86-
if not questions:
87-
print(f"Total text length (chars): {len(text)}")
88-
return 0
89+
return results, text
8990

9091

91-
def _run_strategy(text, func, strat, args, questions):
92+
def _run_strategy(text, func, strat, args):
9293
"""Run a single chunking strategy and return result dict and per-question details.
9394
9495
Args:
9596
text: Full cleaned text
9697
func: chunking function
9798
strat: strategy name
9899
args: argparse.Namespace containing configuration
99-
questions: loaded questions list or None
100100
"""
101-
chunk_size = args.chunk_size
102-
overlap = args.overlap
103-
top_k = args.top_k
104-
use_tiktoken = getattr(args, "use_tiktoken", False)
105-
tiktoken_model = getattr(args, "tiktoken_model", "gpt-3.5-turbo")
106101
chunks = func(
107102
text,
108-
chunk_size=chunk_size,
109-
overlap=overlap,
110-
use_tiktoken=use_tiktoken,
111-
model=tiktoken_model,
103+
chunk_size=args.chunk_size,
104+
overlap=args.overlap,
105+
use_tiktoken=getattr(args, "use_tiktoken", False),
106+
model=getattr(args, "tiktoken_model", "gpt-3.5-turbo"),
112107
)
113108
outdir = write_chunks(chunks, strat)
114-
chunk_count = len(chunks)
115-
avg_recall = 0.0
116-
per_questions = []
109+
110+
avg_recall, per_questions = 0.0, []
111+
questions = (
112+
scorer.load_test_file(args.test_file)
113+
if getattr(args, "test_file", None)
114+
else None
115+
)
117116
if questions:
118-
avg_recall, per_questions = scorer.evaluate_strategy(chunks, questions, top_k)
119-
result = {
117+
avg_recall, per_questions = scorer.evaluate_strategy(
118+
chunks, questions, args.top_k
119+
)
120+
121+
return {
120122
"strategy": strat,
121-
"chunks": chunk_count,
123+
"chunks": len(chunks),
122124
"avg_recall": round(avg_recall, 4),
123125
"saved": str(outdir),
124-
}
125-
return result, per_questions
126+
}, per_questions
126127

127128

128129
def _write_results(results, detail, output):
@@ -159,7 +160,7 @@ def _write_results(results, detail, output):
159160
pct_cell,
160161
str(r.get("saved", "")),
161162
)
162-
CONSOLE.print(table)
163+
console.print(table)
163164
return
164165
print(format_table(results))
165166
return

0 commit comments

Comments (0)