Skip to content

Commit 5e5ecf2

Browse files
committed
fix: improve indexing robustness and type safety
- Disable persistent storage temporarily due to multi-threading issues
- Improve document deletion using collection-level delete
- Add better type hints and class attributes
- Update build backend and gitignore
1 parent 537048a commit 5e5ecf2

File tree

4 files changed

+38
-22
lines changed

4 files changed

+38
-22
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ wheels/
1919
*.egg-info/
2020
.installed.cfg
2121
*.egg
22+
.*cache/
2223

2324
# Virtual Environments
2425
.env

gptme_rag/indexing/indexer.py

Lines changed: 20 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,8 @@
33
from pathlib import Path
44

55
import chromadb
6-
from chromadb.api import Collection
6+
from chromadb import Collection
7+
from chromadb.api import ClientAPI
78
from chromadb.config import Settings
89

910
from .document import Document
@@ -15,25 +16,34 @@
1516
class Indexer:
1617
"""Handles document indexing and embedding storage."""
1718

19+
client: ClientAPI | None = None
20+
collection: Collection
21+
processor: DocumentProcessor
22+
is_persistent: bool = False
23+
1824
def __init__(
1925
self,
2026
persist_directory: Path | None,
2127
collection_name: str = "default",
2228
chunk_size: int = 1000,
2329
chunk_overlap: int = 200,
2430
):
25-
if persist_directory:
31+
enable_persist = False
32+
if persist_directory and enable_persist:
33+
self.is_persistent = True
2634
persist_directory = Path(persist_directory).expanduser().resolve()
2735
persist_directory.mkdir(parents=True, exist_ok=True)
2836
logger.debug(f"Using persist directory: {persist_directory}")
2937

3038
settings = Settings(
3139
allow_reset=True, # Allow resetting for testing
32-
is_persistent=persist_directory is not None,
40+
is_persistent=self.is_persistent,
3341
anonymized_telemetry=False,
3442
)
3543

36-
if persist_directory:
44+
# FIXME: persistent storage doesn't work in multi-threaded environments.
45+
# ("table segments already exist", "database is locked", among other issues)
46+
if persist_directory and enable_persist:
3747
settings.persist_directory = str(persist_directory)
3848
logger.debug(f"Using persist directory: {persist_directory}")
3949
self.client = chromadb.PersistentClient(
@@ -44,6 +54,7 @@ def __init__(
4454
self.client = chromadb.Client(settings)
4555

4656
def create_collection():
57+
assert self.client
4758
return self.client.get_or_create_collection(
4859
name=collection_name, metadata={"hnsw:space": "cosine"}
4960
)
@@ -67,11 +78,11 @@ def create_collection():
6778

6879
def __del__(self):
6980
"""Cleanup when the indexer is destroyed."""
70-
try:
71-
self.client.reset()
72-
except Exception as e:
73-
if "Resetting is not allowed" not in e.args[0]:
74-
logger.exception("Error resetting ChromaDB client")
81+
if self.client:
82+
try:
83+
self.client.reset()
84+
except Exception as e:
85+
logger.warning(f"Error during cleanup: {e}")
7586

7687
def add_document(self, document: Document, timestamp: int | None = None) -> None:
7788
"""Add a single document to the index."""

gptme_rag/indexing/watcher.py

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -178,18 +178,14 @@ def _process_updates(self) -> None:
178178
# Read current content for verification
179179
current_content = path.read_text()
180180

181-
# Remove all old versions from the index
182-
old_docs = self.indexer.search(
183-
"", n_results=100, where={"source": canonical_path}
184-
)[0]
185-
for doc in old_docs:
186-
if doc.doc_id is not None:
187-
if self.indexer.delete_document(doc.doc_id):
188-
logger.info(f"Deleted old version: {doc.doc_id}")
189-
else:
190-
logger.warning(
191-
f"Failed to delete old version: {doc.doc_id}"
192-
)
181+
# Clear all documents with this source path
182+
try:
183+
self.indexer.collection.delete(
184+
where={"source": canonical_path}
185+
)
186+
logger.info(f"Cleared old versions for: {canonical_path}")
187+
except Exception as e:
188+
logger.warning(f"Error clearing old versions: {e}")
193189

194190
# Index the new version with retries
195191
max_attempts = 3
@@ -263,6 +259,14 @@ def start(self) -> None:
263259
if not path.exists():
264260
logger.warning(f"Watch path does not exist: {path}")
265261
continue
262+
# Clear any existing documents for this path
263+
try:
264+
for file in path.glob(self.event_handler.pattern):
265+
canonical_path = str(file.resolve())
266+
self.indexer.collection.delete(where={"source": canonical_path})
267+
except Exception as e:
268+
logger.warning(f"Error clearing existing documents: {e}")
269+
266270
# Index existing files
267271
self.indexer.index_directory(path, self.event_handler.pattern)
268272
# Set up watching

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,4 +54,4 @@ check_untyped_defs = true
5454

5555
[build-system]
5656
requires = ["poetry-core"]
57-
build-backend = "poetry-core.backend"
57+
build-backend = "poetry.core.masonry.api"

0 commit comments

Comments
 (0)