7 | 7 |
8 | 8 | from gptme_rag.indexing.document import Document |
9 | 9 | from gptme_rag.indexing.document_processor import DocumentProcessor |
10 | | -from gptme_rag.indexing.indexer import Indexer |
11 | 10 |
12 | 11 |
13 | 12 | @pytest.fixture |
@@ -49,153 +48,116 @@ def test_document_chunking(test_file): |
49 | 48 | assert all(id_ is not None and "#chunk" in id_ for id_ in chunk_ids) |
50 | 49 |
51 | 50 |
52 | | -def test_indexing_with_chunks(test_file): |
| 51 | +def test_indexing_with_chunks(test_file, indexer): |
53 | 52 | """Test indexing documents with chunking enabled.""" |
54 | | - with tempfile.TemporaryDirectory() as index_dir: |
55 | | - # Debug: Print test file content |
56 | | - content = test_file.read_text() |
57 | | - print("\nTest file content:") |
58 | | - print(f"Size: {len(content)} chars") |
59 | | - print("First 200 chars:") |
60 | | - print(content[:200]) |
61 | | - |
62 | | - indexer = Indexer( |
63 | | - persist_directory=Path(index_dir), |
64 | | - chunk_size=200, # Increased chunk size |
65 | | - chunk_overlap=50, # Increased overlap |
66 | | - enable_persist=True, # Ensure persistence |
67 | | - ) |
68 | | - |
69 | | - # Index the test file |
70 | | - print("\nIndexing directory:", test_file.parent) |
71 | | - n_indexed = indexer.index_directory(test_file.parent) |
72 | | - print(f"Indexed {n_indexed} files") |
73 | | - |
74 | | - # Debug collection state |
75 | | - print("\nCollection state:") |
76 | | - indexer.debug_collection() |
77 | | - |
78 | | - # Search should return results |
79 | | - print("\nSearching for 'Lorem ipsum'...") |
80 | | - docs, distances, _ = indexer.search("Lorem ipsum", n_results=5) |
81 | | - print(f"Found {len(docs)} documents") |
82 | | - for i, doc in enumerate(docs): |
83 | | - print(f"\nDoc {i}:") |
84 | | - print(f"ID: {doc.doc_id}") |
85 | | - print(f"Content: {doc.content[:100]}...") |
86 | | - |
87 | | - assert len(docs) > 0, "No documents found in search results" |
88 | | - assert len(distances) == len(docs), "Distances don't match documents" |
89 | | - assert all(doc.is_chunk for doc in docs), "Not all results are chunks" |
90 | | - |
91 | | - |
92 | | -def test_chunk_grouping(test_file): |
| 53 | + # Debug: Print test file content |
| 54 | + content = test_file.read_text() |
| 55 | + print("\nTest file content:") |
| 56 | + print(f"Size: {len(content)} chars") |
| 57 | + print("First 200 chars:") |
| 58 | + print(content[:200]) |
| 59 | + |
| 60 | + # Index the test file |
| 61 | + print("\nIndexing directory:", test_file.parent) |
| 62 | + n_indexed = indexer.index_directory(test_file.parent) |
| 63 | + print(f"Indexed {n_indexed} files") |
| 64 | + |
| 65 | + # Debug collection state |
| 66 | + print("\nCollection state:") |
| 67 | + indexer.debug_collection() |
| 68 | + |
| 69 | + # Search should return results |
| 70 | + print("\nSearching for 'Lorem ipsum'...") |
| 71 | + docs, distances, _ = indexer.search("Lorem ipsum", n_results=5) |
| 72 | + print(f"Found {len(docs)} documents") |
| 73 | + for i, doc in enumerate(docs): |
| 74 | + print(f"\nDoc {i}:") |
| 75 | + print(f"ID: {doc.doc_id}") |
| 76 | + print(f"Content: {doc.content[:100]}...") |
| 77 | + |
| 78 | + assert len(docs) > 0, "No documents found in search results" |
| 79 | + assert len(distances) == len(docs), "Distances don't match documents" |
| 80 | + assert all(doc.is_chunk for doc in docs), "Not all results are chunks" |
| 81 | + |
| 82 | + |
| 83 | +def test_chunk_grouping(test_file, indexer): |
93 | 84 | """Test that chunks are properly grouped in search results.""" |
94 | | - with tempfile.TemporaryDirectory() as index_dir: |
95 | | - indexer = Indexer( |
96 | | - persist_directory=Path(index_dir), |
97 | | - chunk_size=50, # Smaller chunk size to ensure multiple chunks |
98 | | - chunk_overlap=10, |
99 | | - enable_persist=True, # Enable persistent storage |
100 | | - collection_name="test_chunk_grouping", # Unique collection name |
101 | | - ) |
102 | | - |
103 | | - # Index the test file |
104 | | - indexer.index_directory(test_file.parent) |
105 | | - |
106 | | - # Search with and without grouping |
107 | | - grouped_docs, _, _ = indexer.search( |
108 | | - "Lorem ipsum", n_results=3, group_chunks=True |
109 | | - ) |
110 | | - ungrouped_docs, _, _ = indexer.search( |
111 | | - "Lorem ipsum", n_results=3, group_chunks=False |
112 | | - ) |
113 | | - |
114 | | - # Grouped results should have unique source documents |
115 | | - grouped_sources = set( |
116 | | - doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs |
117 | | - ) |
118 | | - assert len(grouped_sources) == len(grouped_docs) |
119 | | - |
120 | | - # Ungrouped results might have multiple chunks from same document |
121 | | - ungrouped_sources = set( |
122 | | - doc.doc_id.split("#chunk")[0] if doc.doc_id else "" |
123 | | - for doc in ungrouped_docs |
124 | | - ) |
125 | | - assert len(ungrouped_sources) <= len(ungrouped_docs) |
126 | | - |
127 | | - |
128 | | -def test_document_reconstruction(test_file): |
129 | | - """Test reconstructing full documents from chunks.""" |
130 | | - with tempfile.TemporaryDirectory() as index_dir: |
131 | | - indexer = Indexer( |
132 | | - persist_directory=Path(index_dir), |
133 | | - chunk_size=50, # Smaller chunk size to ensure multiple chunks |
134 | | - chunk_overlap=10, |
135 | | - ) |
| 85 | + # Index the test file |
| 86 | + indexer.index_directory(test_file.parent) |
| 87 | + |
| 88 | + # Search with and without grouping |
| 89 | + grouped_docs, _, _ = indexer.search("Lorem ipsum", n_results=3, group_chunks=True) |
| 90 | + ungrouped_docs, _, _ = indexer.search( |
| 91 | + "Lorem ipsum", n_results=3, group_chunks=False |
| 92 | + ) |
| 93 | + |
| 94 | + # Grouped results should have unique source documents |
| 95 | + grouped_sources = set( |
| 96 | + doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs |
| 97 | + ) |
| 98 | + assert len(grouped_sources) == len(grouped_docs) |
| 99 | + |
| 100 | + # Ungrouped results might have multiple chunks from same document |
| 101 | + ungrouped_sources = set( |
| 102 | + doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in ungrouped_docs |
| 103 | + ) |
| 104 | + assert len(ungrouped_sources) <= len(ungrouped_docs) |
136 | 105 |
137 | | - # Index the test file |
138 | | - indexer.index_directory(test_file.parent) |
139 | 106 |
140 | | - # Get a document ID from search results |
141 | | - docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
142 | | - base_doc_id = docs[0].doc_id |
143 | | - assert base_doc_id is not None |
144 | | - doc_id = base_doc_id.split("#chunk")[0] |
| 107 | +def test_document_reconstruction(test_file, indexer): |
| 108 | + """Test reconstructing full documents from chunks.""" |
| 109 | + # Index the test file |
| 110 | + indexer.index_directory(test_file.parent) |
| 111 | + |
| 112 | + # Get a document ID from search results |
| 113 | + docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
| 114 | + base_doc_id = docs[0].doc_id |
| 115 | + assert base_doc_id is not None |
| 116 | + doc_id = base_doc_id.split("#chunk")[0] |
145 | 117 |
146 | | - # Reconstruct the document |
147 | | - full_doc = indexer.reconstruct_document(doc_id) |
| 118 | + # Reconstruct the document |
| 119 | + full_doc = indexer.reconstruct_document(doc_id) |
148 | 120 |
149 | | - # Check the reconstructed document |
150 | | - assert not full_doc.is_chunk |
151 | | - assert full_doc.doc_id == doc_id |
152 | | - assert "chunk_index" not in full_doc.metadata |
153 | | - assert len(full_doc.content) > len(docs[0].content) |
| 121 | + # Check the reconstructed document |
| 122 | + assert not full_doc.is_chunk |
| 123 | + assert full_doc.doc_id == doc_id |
| 124 | + assert "chunk_index" not in full_doc.metadata |
| 125 | + assert len(full_doc.content) > len(docs[0].content) |
154 | 126 |
155 | 127 |
156 | | -def test_chunk_retrieval(test_file): |
| 128 | +def test_chunk_retrieval(test_file, indexer): |
157 | 129 | """Test retrieving all chunks for a document.""" |
158 | | - with tempfile.TemporaryDirectory() as index_dir: |
159 | | - indexer = Indexer( |
160 | | - persist_directory=Path(index_dir), |
161 | | - chunk_size=50, # Smaller chunk size to ensure multiple chunks |
162 | | - chunk_overlap=10, |
163 | | - ) |
164 | | - |
165 | | - # Debug: Print test file content |
166 | | - content = test_file.read_text() |
167 | | - print(f"\nTest file size: {len(content)} chars") |
168 | | - print(f"Token count: {len(indexer.processor.encoding.encode(content))}") |
169 | | - |
170 | | - # Index the test file |
171 | | - print("\nIndexing file...") |
172 | | - indexer.index_file(test_file) |
173 | | - |
174 | | - # Get a document ID from search results |
175 | | - print("\nSearching...") |
176 | | - docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
177 | | - print(f"Found {len(docs)} documents") |
178 | | - for i, doc in enumerate(docs): |
179 | | - print(f"\nDoc {i}:") |
180 | | - print(f"ID: {doc.doc_id}") |
181 | | - print(f"Content length: {len(doc.content)}") |
182 | | - print(f"Is chunk: {doc.is_chunk}") |
183 | | - base_doc_id = docs[0].doc_id |
184 | | - assert base_doc_id is not None |
185 | | - doc_id = base_doc_id.split("#chunk")[0] |
186 | | - |
187 | | - # Get all chunks |
188 | | - chunks = indexer.get_document_chunks(doc_id) |
189 | | - |
190 | | - # Check chunks |
191 | | - assert len(chunks) > 1 |
192 | | - assert all(chunk.is_chunk for chunk in chunks) |
193 | | - assert all( |
194 | | - chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) |
195 | | - for chunk in chunks |
196 | | - ) |
197 | | - # Check chunks are in order |
198 | | - chunk_indices = [ |
199 | | - chunk.chunk_index or 0 for chunk in chunks |
200 | | - ] # Default to 0 if None |
201 | | - assert chunk_indices == sorted(chunk_indices) |
| 130 | + # Debug: Print test file content |
| 131 | + content = test_file.read_text() |
| 132 | + print(f"\nTest file size: {len(content)} chars") |
| 133 | + print(f"Token count: {len(indexer.processor.encoding.encode(content))}") |
| 134 | + |
| 135 | + # Index the test file |
| 136 | + print("\nIndexing file...") |
| 137 | + indexer.index_file(test_file) |
| 138 | + |
| 139 | + # Get a document ID from search results |
| 140 | + print("\nSearching...") |
| 141 | + docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
| 142 | + print(f"Found {len(docs)} documents") |
| 143 | + for i, doc in enumerate(docs): |
| 144 | + print(f"\nDoc {i}:") |
| 145 | + print(f"ID: {doc.doc_id}") |
| 146 | + print(f"Content length: {len(doc.content)}") |
| 147 | + print(f"Is chunk: {doc.is_chunk}") |
| 148 | + base_doc_id = docs[0].doc_id |
| 149 | + assert base_doc_id is not None |
| 150 | + doc_id = base_doc_id.split("#chunk")[0] |
| 151 | + |
| 152 | + # Get all chunks |
| 153 | + chunks = indexer.get_document_chunks(doc_id) |
| 154 | + |
| 155 | + # Check chunks |
| 156 | + assert len(chunks) > 1 |
| 157 | + assert all(chunk.is_chunk for chunk in chunks) |
| 158 | + assert all( |
| 159 | + chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) for chunk in chunks |
| 160 | + ) |
| 161 | + # Check chunks are in order |
| 162 | + chunk_indices = [chunk.chunk_index or 0 for chunk in chunks] # Default to 0 if None |
| 163 | + assert chunk_indices == sorted(chunk_indices) |
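
Note: the refactored tests take an `indexer` fixture instead of constructing an Indexer inside each test, but the fixture definition is outside the shown hunks (presumably added near the existing fixtures or in a conftest). A minimal sketch of what such a fixture could look like, reusing only the constructor arguments that appear in the removed inline setup; the use of pytest's `tmp_path` and the specific chunk sizes are assumptions, chosen small so the test files split into multiple chunks as `test_chunk_retrieval` expects:

import pytest

from gptme_rag.indexing.indexer import Indexer


@pytest.fixture
def indexer(tmp_path):
    """Shared Indexer backed by a temporary persist directory (assumed fixture)."""
    return Indexer(
        persist_directory=tmp_path / "index",
        chunk_size=50,     # small chunks so documents are split into several pieces
        chunk_overlap=10,
        enable_persist=True,
    )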