
Commit b0f424b

feat: fix persistence and enable in cli

Parent: 085deea

2 files changed (+71 / -34 lines)

gptme_rag/cli.py

Lines changed: 16 additions & 21 deletions
@@ -1,23 +1,32 @@
+import logging
 import os
+import signal
 import sys
+import time
 from pathlib import Path

 import click
 from rich.console import Console

+from .benchmark import RagBenchmark
 from .indexing.indexer import Indexer
+from .indexing.watcher import FileWatcher
 from .query.context_assembler import ContextAssembler

 console = Console()

 # TODO: change this to a more appropriate location
-default_persist_dir = Path(__file__).parent / "data"
+default_persist_dir = Path.home() / ".cache" / "gptme" / "rag"


 @click.group()
-def cli():
+@click.option("--verbose/-v", is_flag=True, help="Enable verbose output")
+def cli(verbose: bool):
     """RAG implementation for gptme context management."""
-    pass
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format="%(levelname)s - %(name)s - %(message)s",
+    )


 @cli.command()
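The new `--verbose` flag wires basic logging into every subcommand. As an illustration only (not part of the commit), a minimal sketch of exercising the flag through click's test runner; the `gptme_rag.cli` import path follows the file shown here, while the `index` arguments are placeholder assumptions.

```python
# Minimal sketch (assumes the package is installed as gptme_rag).
from click.testing import CliRunner

from gptme_rag.cli import cli

runner = CliRunner()
# --verbose switches logging.basicConfig to DEBUG, per the cli() body above.
result = runner.invoke(cli, ["--verbose", "index", "."])
print(result.exit_code, result.output)
```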
@@ -36,20 +45,13 @@ def cli():
 def index(directory: Path, pattern: str, persist_dir: Path):
     """Index documents in a directory."""
     try:
-        indexer = Indexer(persist_directory=persist_dir)
+        indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
         console.print(f"Indexing files in {directory} with pattern {pattern}")

-        # List files that will be indexed
-        files = list(directory.glob(pattern))
-        console.print(f"Found {len(files)} files:")
-        for file in files:
-            console.print(f"  - {file}")
-
         # Index the files
-        with console.status(f"Indexing {len(files)} files..."):
-            indexer.index_directory(directory, pattern)
+        n_indexed = indexer.index_directory(directory, pattern)

-        console.print(f"✅ Successfully indexed {len(files)} files", style="green")
+        console.print(f"✅ Successfully indexed {n_indexed} files", style="green")
     except Exception as e:
         console.print(f"❌ Error indexing directory: {e}", style="red")

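Outside the CLI, the same persistence-enabled indexing path can be driven directly. A hedged sketch using only names visible in this diff; the glob pattern is a placeholder, and the assumption that `index_directory` returns the number of indexed files is inferred from `n_indexed = ...` above rather than confirmed by this hunk.

```python
# Sketch: library-level equivalent of the `index` command above.
from pathlib import Path

from gptme_rag.indexing.indexer import Indexer

indexer = Indexer(
    persist_directory=Path.home() / ".cache" / "gptme" / "rag",  # new default from this commit
    enable_persist=True,  # keep the index on disk between runs (what the commit enables)
)
# Assumed to return the number of files indexed (see n_indexed above).
n_indexed = indexer.index_directory(Path("docs"), "**/*.md")
print(f"Indexed {n_indexed} files")
```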
@@ -80,7 +82,7 @@ def search(
     stdout = sys.stdout
     sys.stdout = open(os.devnull, "w")
     try:
-        indexer = Indexer(persist_directory=persist_dir)
+        indexer = Indexer(persist_directory=persist_dir, enable_persist=True)
         assembler = ContextAssembler(max_tokens=max_tokens)
         documents, distances = indexer.search(query, n_results=n_results)
     finally:
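For the search path, a similarly hedged sketch of what the CLI wraps, using only the calls visible in this hunk; the query, token budget, and result count are placeholder values.

```python
# Sketch: the search calls from the hunk above, outside the CLI wrapper.
from pathlib import Path

from gptme_rag.indexing.indexer import Indexer
from gptme_rag.query.context_assembler import ContextAssembler

indexer = Indexer(
    persist_directory=Path.home() / ".cache" / "gptme" / "rag",
    enable_persist=True,
)
assembler = ContextAssembler(max_tokens=2000)  # placeholder budget
documents, distances = indexer.search("persistence", n_results=5)
for doc, distance in zip(documents, distances):
    print(distance, doc.doc_id)
```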
@@ -165,7 +167,6 @@ def watch(directory: Path, pattern: str, persist_dir: Path, ignore_patterns: lis
     indexer.index_directory(directory, pattern)

     console.print("Starting file watcher...")
-    from .indexing.watcher import FileWatcher

     try:
         file_watcher = FileWatcher(
@@ -174,14 +175,11 @@ def watch(directory: Path, pattern: str, persist_dir: Path, ignore_patterns: lis
         with file_watcher:
             console.print("Watching for changes. Press Ctrl+C to stop.")
             # Keep the main thread alive
-            import signal

             try:
                 signal.pause()
             except AttributeError:  # Windows doesn't have signal.pause
                 while True:
-                    import time
-
                     time.sleep(1)
     except KeyboardInterrupt:
         console.print("\nStopping file watcher...")
@@ -212,7 +210,6 @@ def benchmark():
 )
 def indexing(directory: Path, pattern: str, persist_dir: Path | None):
     """Benchmark document indexing performance."""
-    from .benchmark import RagBenchmark

     benchmark = RagBenchmark(index_dir=persist_dir)

@@ -252,7 +249,6 @@ def search_benchmark(
     persist_dir: Path | None,
 ):
     """Benchmark search performance."""
-    from .benchmark import RagBenchmark

     benchmark = RagBenchmark(index_dir=persist_dir)

@@ -296,7 +292,6 @@ def watch_perf(
     persist_dir: Path | None,
 ):
     """Benchmark file watching performance."""
-    from .benchmark import RagBenchmark

     benchmark = RagBenchmark(index_dir=persist_dir)

gptme_rag/indexing/indexer.py

Lines changed: 55 additions & 13 deletions
@@ -172,7 +172,7 @@ def add_documents(self, documents: list[Document], batch_size: int = 100) -> Non

     def _load_gitignore(self, directory: Path) -> list[str]:
         """Load gitignore patterns from all .gitignore files up to root."""
-        patterns: list[str] = []
+        patterns: list[str] = [".git/", ".sqlite3", ".db"]
         current_dir = directory.resolve()
         max_depth = 10  # Limit traversal to avoid infinite loops

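Seeding `patterns` with `.git/`, `.sqlite3`, and `.db` means the database files that previously needed a hard-coded suffix check (removed in the next hunk) are now filtered through the same ignore mechanism as .gitignore entries. As an illustration only, and not the project's `_is_ignored` implementation (which is not shown in this diff), a minimal gitignore-style matcher might look like this:

```python
# Illustrative gitignore-style matching (not the project's _is_ignored).
# Directory patterns end with "/" and match any path component; bare
# suffixes like ".sqlite3" are treated as filename suffix matches here.
from fnmatch import fnmatch
from pathlib import Path


def is_ignored(path: Path, patterns: list[str]) -> bool:
    for pattern in patterns:
        if pattern.endswith("/"):
            if pattern.rstrip("/") in path.parts:
                return True
        elif path.name.endswith(pattern) or fnmatch(path.name, pattern):
            return True
    return False


print(is_ignored(Path(".git/config"), [".git/", ".sqlite3", ".db"]))         # True
print(is_ignored(Path("data/index.sqlite3"), [".git/", ".sqlite3", ".db"]))  # True
print(is_ignored(Path("README.md"), [".git/", ".sqlite3", ".db"]))           # False
```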
@@ -225,14 +225,10 @@ def index_directory(
         gitignore_patterns = self._load_gitignore(directory)

         # Filter files
-        valid_files = []
+        valid_files = set()
         for f in files:
-            if (
-                f.is_file()
-                and not f.name.endswith((".sqlite3", ".db"))
-                and not self._is_ignored(f, gitignore_patterns)
-            ):
-                valid_files.append(f)
+            if f.is_file() and not self._is_ignored(f, gitignore_patterns):
+                valid_files.add(f)

         # Check file limit
         if len(valid_files) >= file_limit:
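Switching `valid_files` from a list to a set presumably guards against the same path being collected more than once (for example from overlapping globs) before the `file_limit` check; a small illustration of the deduplication effect, with hypothetical patterns:

```python
# Hypothetical example: overlapping globs yield duplicate paths, a set removes them.
from pathlib import Path

directory = Path(".")
files = list(directory.glob("**/*.py")) + list(directory.glob("*.py"))

valid_files = {f for f in files if f.is_file()}
print(f"{len(files)} globbed paths -> {len(valid_files)} unique files")
```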
@@ -257,20 +253,20 @@ def index_directory(
         current_batch = []

         for file_path in valid_files:
+            logger.debug(f"Processing file: {file_path}")
             # Process each file into chunks
             for doc in Document.from_file(file_path, processor=self.processor):
+                logger.debug(f"Processing chunk: {doc.source_path} ({doc.chunk_index})")
                 current_batch.append(doc)
                 if len(current_batch) >= batch_size:
+                    logger.info(f"Adding {len(current_batch)} documents")
                     self.add_documents(current_batch)
                     current_batch = []

         # Add any remaining documents
         if current_batch:
-            logger.debug(
-                f"Adding {len(current_batch)} remaining documents. "
-                f"First doc preview: {current_batch[0].content[:100]}. "
-                f"Paths: {[doc.source_path for doc in current_batch]}"
-            )
+            self.add_documents(current_batch)
+            logger.info(f"Adding {len(current_batch)} documents.")
             self.add_documents(current_batch)

         logger.info(f"Indexed {len(valid_files)} documents from {directory}")
@@ -340,6 +336,52 @@ def search(

         return documents, distances[: len(documents)]

+    def list_documents(self, group_by_source: bool = True) -> list[Document]:
+        """List all documents in the index.
+
+        Args:
+            group_by_source: Whether to group chunks from the same document
+
+        Returns:
+            List of documents
+        """
+        # Get all documents from collection
+        results = self.collection.get()
+
+        if not results["ids"]:
+            return []
+
+        if group_by_source:
+            # Group chunks by source document
+            doc_groups: dict[str, list[Document]] = {}
+
+            for i, doc_id in enumerate(results["ids"]):
+                doc = Document(
+                    content=results["documents"][i],
+                    metadata=results["metadatas"][i],
+                    doc_id=doc_id,
+                )
+
+                # Get source document ID (remove chunk suffix if present)
+                source_id = doc_id.split("#chunk")[0]
+
+                if source_id not in doc_groups:
+                    doc_groups[source_id] = []
+                doc_groups[source_id].append(doc)
+
+            # Return first chunk from each document group
+            return [chunks[0] for chunks in doc_groups.values()]
+        else:
+            # Return all documents/chunks
+            return [
+                Document(
+                    content=results["documents"][i],
+                    metadata=results["metadatas"][i],
+                    doc_id=doc_id,
+                )
+                for i, doc_id in enumerate(results["ids"])
+            ]
+
     def get_document_chunks(self, doc_id: str) -> list[Document]:
         """Get all chunks for a document.

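A hedged usage sketch of the new `list_documents` method; the constructor arguments mirror the CLI defaults above, and the `Document` attribute names are taken from the constructor calls in this hunk.

```python
# Sketch: enumerating indexed documents, one representative chunk per source.
from pathlib import Path

from gptme_rag.indexing.indexer import Indexer

indexer = Indexer(
    persist_directory=Path.home() / ".cache" / "gptme" / "rag",
    enable_persist=True,
)
for doc in indexer.list_documents(group_by_source=True):
    print(doc.doc_id, f"({len(doc.content)} chars)")
```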