Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions gptme_rag/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import signal
import sys
import time
from datetime import datetime
from pathlib import Path

import click
Expand Down Expand Up @@ -65,9 +66,21 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
for doc in existing_docs:
if "source" in doc.metadata:
abs_path = os.path.abspath(doc.metadata["source"])
mtime = doc.metadata.get("mtime", 0)
existing_files[abs_path] = mtime
logger.debug("Existing file: %s (mtime: %s)", abs_path, mtime)
last_modified = doc.metadata.get("last_modified")
if last_modified:
try:
# Parse ISO format timestamp to float
existing_files[abs_path] = datetime.fromisoformat(
last_modified
).timestamp()
except ValueError:
logger.warning(
"Invalid last_modified format: %s", last_modified
)
existing_files[abs_path] = 0
else:
existing_files[abs_path] = 0
# logger.debug("Existing file: %s", abs_path) # Too spammy

logger.debug("Loaded %d existing files from index", len(existing_files))

Expand All @@ -91,13 +104,15 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
abs_source = os.path.abspath(source)
doc.metadata["source"] = abs_source
current_mtime = os.path.getmtime(abs_source)
doc.metadata["mtime"] = current_mtime

# Include if file is new or modified
if abs_source not in existing_files:
logger.debug("New file: %s", abs_source)
filtered_documents.append(doc)
elif current_mtime > existing_files[abs_source]:
# Round to microseconds (6 decimal places) for comparison
elif round(current_mtime, 6) > round(
existing_files[abs_source], 6
):
logger.debug(
"Modified file: %s (current: %s, stored: %s)",
abs_source,
Expand Down
10 changes: 9 additions & 1 deletion gptme_rag/indexing/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def __init__(
self.persist_directory = Path(persist_directory).expanduser().resolve()
self.persist_directory.mkdir(parents=True, exist_ok=True)
logger.info(f"Using persist directory: {self.persist_directory}")

settings.persist_directory = str(self.persist_directory)
self.client = chromadb.PersistentClient(
path=str(self.persist_directory), settings=settings
Expand Down Expand Up @@ -516,6 +517,9 @@ def list_documents(self, group_by_source: bool = True) -> list[Document]:
"""
# Get all documents from collection
results = self.collection.get()
logger.debug("ChromaDB returned %d documents", len(results["ids"]))
if results["ids"]:
logger.debug("First document metadata: %s", results["metadatas"][0])

if not results["ids"]:
return []
Expand Down Expand Up @@ -912,4 +916,8 @@ def get_all_documents(self) -> list[Document]:
Returns:
List of all documents in the index, including all chunks.
"""
return self.list_documents(group_by_source=False)
logger.debug("Getting all documents from index")
docs = self.list_documents(group_by_source=False)
for doc in docs:
logger.debug("Retrieved document with metadata: %s", doc.metadata)
return docs
5 changes: 4 additions & 1 deletion gptme_rag/indexing/watcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import time
from pathlib import Path
from datetime import datetime

from watchdog.events import FileSystemEvent, FileSystemEventHandler
from watchdog.observers import Observer
Expand Down Expand Up @@ -321,7 +322,9 @@ def _process_updates(self) -> None:

# Sort updates by modification time to get latest versions
updates = sorted(
existing_updates, key=lambda p: p.stat().st_mtime, reverse=True
existing_updates,
key=lambda p: datetime.fromtimestamp(p.stat().st_mtime),
reverse=True,
)
logger.debug(f"Sorted updates: {[str(p) for p in updates]}")

Expand Down
Loading