Commit 18ba7cb

feat(indexer): add progress reporting and improve indexing
- Add progress bar using tqdm for document indexing
- Refactor document collection and processing for better efficiency
- Improve error handling and logging
- Add JSON support for scoring weights
- Split indexing into collection and processing phases

Co-authored-by: Bob <[email protected]>
1 parent 803b86a commit 18ba7cb

4 files changed: +201 -132 lines changed

gptme_rag/cli.py

Lines changed: 39 additions & 16 deletions
@@ -1,3 +1,4 @@
+import json
 import logging
 import os
 import signal
@@ -9,13 +10,15 @@
 from rich.console import Console
 from rich.logging import RichHandler
 from rich.syntax import Syntax
+from tqdm import tqdm
 
 from .benchmark import RagBenchmark
 from .indexing.indexer import Indexer
 from .indexing.watcher import FileWatcher
 from .query.context_assembler import ContextAssembler
 
 console = Console()
+logger = logging.getLogger(__name__)
 
 # TODO: change this to a more appropriate location
 default_persist_dir = Path.home() / ".cache" / "gptme" / "rag"
@@ -53,23 +56,45 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
 
     try:
         indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
-        total_indexed = 0
-
-        for path in paths:
-            if path.is_file():
-                console.print(f"Indexing file: {path}")
-                n_indexed = indexer.index_file(path)
-                if n_indexed is not None:
-                    total_indexed += n_indexed
-            else:
-                console.print(f"Indexing files in {path} with pattern {pattern}")
-                n_indexed = indexer.index_directory(path, pattern)
-                if n_indexed is not None:
-                    total_indexed += n_indexed
 
-        console.print(f"✅ Successfully indexed {total_indexed} files", style="green")
+        # First, collect all documents
+        all_documents = []
+        with console.status("Collecting documents...") as status:
+            for path in paths:
+                if path.is_file():
+                    status.update(f"Processing file: {path}")
+                else:
+                    status.update(f"Processing directory: {path}")
+                documents = indexer.collect_documents(path)
+                all_documents.extend(documents)
+
+        if not all_documents:
+            console.print("No documents found to index", style="yellow")
+            return
+
+        # Then process them with a progress bar
+        n_files = len(set(doc.metadata.get("source", "") for doc in all_documents))
+        n_chunks = len(all_documents)
+
+        logger.info(f"Found {n_files} files to index ({n_chunks} chunks)")
+
+        with tqdm(
+            total=n_chunks,
+            desc="Indexing documents",
+            unit="chunk",
+            disable=not sys.stdout.isatty(),
+        ) as pbar:
+            for progress in indexer.add_documents_progress(all_documents):
+                pbar.update(progress)
+
+        console.print(
+            f"✅ Successfully indexed {n_files} files ({n_chunks} chunks)",
+            style="green",
+        )
     except Exception as e:
         console.print(f"❌ Error indexing directory: {e}", style="red")
+        if logger.isEnabledFor(logging.DEBUG):
+            console.print_exception()
 
 
 @cli.command()
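
The index command above calls two Indexer methods whose diffs are not shown on this page: collect_documents(), used in the collection phase, and add_documents_progress(), the generator that drives the tqdm bar. A minimal sketch of the contract the CLI assumes follows; the batch size, the add_documents() helper, and the class skeleton are illustrative assumptions, not the actual implementation.

from collections.abc import Generator


class Indexer:
    # Sketch only; the real class lives in gptme_rag/indexing/indexer.py.
    def add_documents(self, documents: list) -> None:
        """Assumed batched insert into the underlying vector store."""
        ...

    def add_documents_progress(
        self, documents: list, batch_size: int = 100
    ) -> Generator[int, None, None]:
        """Index documents in batches, yielding the number of chunks
        completed per batch so callers can advance a progress bar."""
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            self.add_documents(batch)
            yield len(batch)

Each yielded count feeds pbar.update(progress), which is why the bar's total is n_chunks rather than n_files.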
@@ -111,8 +136,6 @@ def search(
     scoring_weights = None
     if weights:
         try:
-            import json
-
             scoring_weights = json.loads(weights)
         except json.JSONDecodeError as e:
             console.print(f"❌ Invalid weights JSON: {e}", style="red")
