
Commit 77fbd13

refactor: improve logging and test clarity
Replace debug print statements with proper logging in indexer. Clean up test files by removing debug prints and improving assertions. Add more descriptive error messages in tests.

Co-authored-by: Bob <[email protected]>
1 parent d79047d commit 77fbd13
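
The change replaces ad-hoc print() calls with the standard library logging module. For reference, a minimal sketch of the usual module-level logger pattern; the messages mirror the ones in the diffs below (with a hypothetical path), and the basicConfig call is only an assumption so the sketch produces output when run standalone, not the repository's actual configuration:

import logging

# Module-level logger, named after the importing module
# (e.g. "gptme_rag.indexing.indexer").
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # Assumed configuration so the sketch emits the messages when run directly.
    logging.basicConfig(level=logging.DEBUG)
    logger.debug("Using git ls-files for file listing")
    logger.warning("Error resolving symlink: %s", "example/broken-link")  # hypothetical path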

File tree

4 files changed: +14, -79 lines changed


gptme_rag/indexing/indexer.py

Lines changed: 2 additions & 14 deletions
@@ -316,42 +316,30 @@ def index_directory(
             logger.debug("Using git ls-files for file listing")
         except subprocess.CalledProcessError:
             # Not a git repo or git not available, fall back to glob
-            print("\nFalling back to glob + gitignore for file listing")
             files = list(directory.glob(glob_pattern))
-            print(
-                f"Found {len(files)} files matching glob pattern: {[str(f) for f in files]}"
-            )
             gitignore_patterns = self._load_gitignore(directory)
-            print(f"Loaded gitignore patterns: {gitignore_patterns}")
-            print(f"\nProcessing files in {directory}")
-        for f in files:
-            print(f"\nChecking file: {f}")
 
+        for f in files:
             if not f.is_file():
-                print(" Skip: Not a file")
                 continue
 
             # Check gitignore patterns if in glob mode
             if gitignore_patterns and self._is_ignored(f, gitignore_patterns):
-                print(" Skip: Matches gitignore pattern")
                 continue
 
             # Filter by glob pattern
             rel_path = str(f.relative_to(directory))
             # Convert glob pattern to fnmatch pattern
             fnmatch_pattern = glob_pattern.replace("**/*", "*")
             if not fnmatch_path(rel_path, fnmatch_pattern):
-                print(f" Skip: Does not match pattern {fnmatch_pattern}")
                 continue
-            print(f" Pass: Matches pattern {fnmatch_pattern}")
 
             # Resolve symlinks to target
             try:
                 resolved = f.resolve()
                 valid_files.add(resolved)
-                print(f" Added: {resolved}")
             except Exception as e:
-                print(f" Error: Could not resolve path - {e}")
+                logger.warning(f"Error resolving symlink: {f} -> {e}")
 
             # Check file limit
             if len(valid_files) >= file_limit:
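
For reference, the glob-to-fnmatch rewrite kept in the hunk above can be exercised on its own. A minimal sketch using the standard fnmatch module; the fnmatch_path helper in the diff is assumed to behave like fnmatch.fnmatch, and the pattern and paths below are hypothetical:

from fnmatch import fnmatch

glob_pattern = "**/*.py"
fnmatch_pattern = glob_pattern.replace("**/*", "*")  # same rewrite as in the diff

for rel_path in ("docs/readme.md", "gptme_rag/indexing/indexer.py"):
    # fnmatch's "*" also crosses path separators, so "*.py" matches nested files.
    status = "pass" if fnmatch(rel_path, fnmatch_pattern) else "skip"
    print(f"{status}: {rel_path}")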

tests/test_chunking.py

Lines changed: 8 additions & 39 deletions
@@ -50,30 +50,11 @@ def test_document_chunking(test_file):
 
 def test_indexing_with_chunks(test_file, indexer):
     """Test indexing documents with chunking enabled."""
-    # Debug: Print test file content
-    content = test_file.read_text()
-    print("\nTest file content:")
-    print(f"Size: {len(content)} chars")
-    print("First 200 chars:")
-    print(content[:200])
-
     # Index the test file
-    print("\nIndexing directory:", test_file.parent)
-    n_indexed = indexer.index_directory(test_file.parent)
-    print(f"Indexed {n_indexed} files")
-
-    # Debug collection state
-    print("\nCollection state:")
-    indexer.debug_collection()
+    indexer.index_directory(test_file.parent)
 
     # Search should return results
-    print("\nSearching for 'Lorem ipsum'...")
     docs, distances, _ = indexer.search("Lorem ipsum", n_results=5)
-    print(f"Found {len(docs)} documents")
-    for i, doc in enumerate(docs):
-        print(f"\nDoc {i}:")
-        print(f"ID: {doc.doc_id}")
-        print(f"Content: {doc.content[:100]}...")
 
     assert len(docs) > 0, "No documents found in search results"
     assert len(distances) == len(docs), "Distances don't match documents"
@@ -127,24 +108,11 @@ def test_document_reconstruction(test_file, indexer):
 
 def test_chunk_retrieval(test_file, indexer):
     """Test retrieving all chunks for a document."""
-    # Debug: Print test file content
-    content = test_file.read_text()
-    print(f"\nTest file size: {len(content)} chars")
-    print(f"Token count: {len(indexer.processor.encoding.encode(content))}")
-
     # Index the test file
-    print("\nIndexing file...")
     indexer.index_file(test_file)
 
     # Get a document ID from search results
-    print("\nSearching...")
-    docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
-    print(f"Found {len(docs)} documents")
-    for i, doc in enumerate(docs):
-        print(f"\nDoc {i}:")
-        print(f"ID: {doc.doc_id}")
-        print(f"Content length: {len(doc.content)}")
-        print(f"Is chunk: {doc.is_chunk}")
+    docs, _, _ = indexer.search("Lorem ipsum")
     base_doc_id = docs[0].doc_id
     assert base_doc_id is not None
     doc_id = base_doc_id.split("#chunk")[0]
@@ -153,11 +121,12 @@ def test_chunk_retrieval(test_file, indexer):
     chunks = indexer.get_document_chunks(doc_id)
 
     # Check chunks
-    assert len(chunks) > 1
-    assert all(chunk.is_chunk for chunk in chunks)
+    assert len(chunks) > 1, "Document should be split into multiple chunks"
+    assert all(chunk.is_chunk for chunk in chunks), "All items should be chunks"
     assert all(
         chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) for chunk in chunks
-    )
+    ), "All chunks should belong to the same document"
+
     # Check chunks are in order
-    chunk_indices = [chunk.chunk_index or 0 for chunk in chunks]  # Default to 0 if None
-    assert chunk_indices == sorted(chunk_indices)
+    chunk_indices = [chunk.chunk_index or 0 for chunk in chunks]
+    assert chunk_indices == sorted(chunk_indices), "Chunks should be in order"
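
The added assertion messages use pytest's plain assert style: the string after the comma is shown in the failure report next to the re-evaluated expression. A tiny self-contained illustration with hypothetical values, deliberately failing to show where the message surfaces:

chunk_indices = [0, 2, 1]  # hypothetical out-of-order chunk indices
assert chunk_indices == sorted(chunk_indices), "Chunks should be in order"
# AssertionError: Chunks should be in order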

tests/test_document_processor.py

Lines changed: 0 additions & 25 deletions
@@ -15,14 +15,6 @@ def test_process_text_basic():
 
     chunks = list(processor.process_text(text))
 
-    # Print debug info
-    print(f"\nTotal tokens in text: {len(processor.encoding.encode(text))}")
-    for i, chunk in enumerate(chunks):
-        print(f"\nChunk {i}:")
-        print(f"Token count: {chunk['metadata']['token_count']}")
-        print(f"Content length: {len(chunk['text'])}")
-        print(f"First 50 chars: {chunk['text'][:50]}")
-
     assert len(chunks) > 1  # Should split into multiple chunks
     assert all(isinstance(c["text"], str) for c in chunks)
     assert all(isinstance(c["metadata"], dict) for c in chunks)
@@ -103,23 +95,6 @@ def test_token_estimation():
     assert chunks > 0
 
 
-def test_content_size():
-    """Test actual content size in tokens for test data."""
-    processor = DocumentProcessor()
-    content = "\n\n".join(
-        [
-            f"This is paragraph {i} with some content that should be indexed."
-            for i in range(10)
-        ]
-    )
-    tokens = processor.encoding.encode(content)
-    print(f"Total tokens: {len(tokens)}")
-    print(f"Content length: {len(content)}")
-    for i, para in enumerate(content.split("\n\n")):
-        para_tokens = processor.encoding.encode(para)
-        print(f"Paragraph {i}: {len(para_tokens)} tokens, {len(para)} chars")
-
-
 def test_optimal_chunk_size():
     """Test optimal chunk size calculation."""
     processor = DocumentProcessor(chunk_overlap=10)
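
The deleted test_content_size measured token counts for synthetic paragraphs via processor.encoding.encode. That measurement can be reproduced outside the test suite; a minimal sketch, assuming the processor's encoding is a tiktoken encoding (the cl100k_base choice here is an assumption, not taken from the repository):

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # assumed encoding
content = "\n\n".join(
    f"This is paragraph {i} with some content that should be indexed."
    for i in range(10)
)
tokens = encoding.encode(content)
print(f"{len(tokens)} tokens across {len(content)} characters")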

tests/test_watcher.py

Lines changed: 4 additions & 1 deletion
@@ -122,6 +122,7 @@ def verify_content(content: str, timeout: float = 5.0) -> bool:
             if results and content in results[0].content:
                 return True
             time.sleep(0.5)
+        logger.debug(f"Content not found within timeout: {content}")
         return False
 
     with FileWatcher(indexer, [str(tmp_path)], update_delay=0.5):
@@ -133,7 +134,9 @@ def verify_content(content: str, timeout: float = 5.0) -> bool:
             content = f"Content version {i}"
             test_file.write_text(content)
             time.sleep(1.0)  # Wait between updates
-            assert verify_content(content), f"Content not found: {content}"
+            if not verify_content(content):
+                logger.error(f"Failed to verify content: {content}")
+                raise AssertionError(f"Content not found: {content}")
 
         # Verify final state
         results, _, _ = indexer.search("Content version")
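
The verify_content helper above is an instance of a common poll-until-timeout pattern: retry a check at a fixed interval, log when the deadline passes, and let the caller decide whether to fail. A generic sketch of that pattern; wait_for and its arguments are hypothetical names, not part of the repository:

import logging
import time

logger = logging.getLogger(__name__)


def wait_for(predicate, timeout: float = 5.0, interval: float = 0.5) -> bool:
    """Poll a zero-argument callable until it returns True or the timeout elapses."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    logger.debug("Condition not met within %.1fs", timeout)
    return False

A caller uses it the same way the diff uses verify_content: check the return value and raise AssertionError with a descriptive message when it is False.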
