Skip to content

Commit fd0db34

Browse files
committed
perf: only index new/modified files
Track file modification times and skip unchanged files during indexing. This improves performance by avoiding unnecessary re-indexing.

Key changes:
- Track file modification times in document metadata
- Compare mtimes to detect modified files
- Only process new or modified files
- Improve logging and status messages
1 parent 5bec137 commit fd0db34

File tree

2 files changed

+54
-4
lines changed

2 files changed

+54
-4
lines changed

gptme_rag/cli.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,26 +57,68 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
5757
try:
5858
indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
5959

60-
# First, collect all documents
60+
# Get existing files and their metadata from the index, using absolute paths
61+
existing_docs = indexer.get_all_documents()
62+
logger.debug("Found %d existing documents in index", len(existing_docs))
63+
64+
existing_files = {}
65+
for doc in existing_docs:
66+
if "source" in doc.metadata:
67+
abs_path = os.path.abspath(doc.metadata["source"])
68+
mtime = doc.metadata.get("mtime", 0)
69+
existing_files[abs_path] = mtime
70+
logger.debug("Existing file: %s (mtime: %s)", abs_path, mtime)
71+
72+
logger.debug("Loaded %d existing files from index", len(existing_files))
73+
74+
# First, collect all documents and filter for new/modified
6175
all_documents = []
6276
with console.status("Collecting documents...") as status:
6377
for path in paths:
6478
if path.is_file():
6579
status.update(f"Processing file: {path}")
6680
else:
6781
status.update(f"Processing directory: {path}")
82+
6883
documents = indexer.collect_documents(path)
69-
all_documents.extend(documents)
84+
85+
# Filter for new or modified documents
86+
filtered_documents = []
87+
for doc in documents:
88+
source = doc.metadata.get("source")
89+
if source:
90+
# Resolve to absolute path for consistent comparison
91+
abs_source = os.path.abspath(source)
92+
doc.metadata["source"] = abs_source
93+
current_mtime = os.path.getmtime(abs_source)
94+
doc.metadata["mtime"] = current_mtime
95+
96+
# Include if file is new or modified
97+
if abs_source not in existing_files:
98+
logger.debug("New file: %s", abs_source)
99+
filtered_documents.append(doc)
100+
elif current_mtime > existing_files[abs_source]:
101+
logger.debug(
102+
"Modified file: %s (current: %s, stored: %s)",
103+
abs_source,
104+
current_mtime,
105+
existing_files[abs_source],
106+
)
107+
filtered_documents.append(doc)
108+
else:
109+
logger.debug("Unchanged file: %s", abs_source)
110+
111+
all_documents.extend(filtered_documents)
70112

71113
if not all_documents:
72-
console.print("No documents found to index", style="yellow")
114+
console.print("No new or modified documents to index", style="yellow")
73115
return
74116

75117
# Then process them with a progress bar
76118
n_files = len(set(doc.metadata.get("source", "") for doc in all_documents))
77119
n_chunks = len(all_documents)
78120

79-
logger.info(f"Found {n_files} files to index ({n_chunks} chunks)")
121+
logger.info(f"Found {n_files} new/modified files to index ({n_chunks} chunks)")
80122

81123
with tqdm(
82124
total=n_chunks,

gptme_rag/indexing/indexer.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -905,3 +905,11 @@ def index_file(self, path: Path) -> int:
905905
self.add_documents(documents)
906906
return len(documents)
907907
return 0
908+
909+
def get_all_documents(self) -> list[Document]:
    """Fetch every document currently stored in the index.

    Returns:
        All indexed documents as individual chunks — no grouping by
        source file is applied.
    """
    # Delegate to list_documents with grouping disabled so each chunk
    # comes back as its own Document.
    ungrouped = self.list_documents(group_by_source=False)
    return ungrouped

0 commit comments

Comments
 (0)