
Commit d79047d

refactor(tests): extract common test fixtures to conftest.py
- Add shared indexer fixture with automatic cleanup
- Add cleanup_chroma fixture to reset ChromaDB between tests
- Refactor test files to use shared fixtures
- Remove duplicated setup code
- Use tmp_path fixture instead of custom temp_dir

Co-authored-by: Bob <[email protected]>
1 parent 95b6585 commit d79047d

File tree

4 files changed: +171 -259 lines changed

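For orientation, a minimal sketch of how a test consumes the shared fixtures after this refactor. The test name, file name, and query string below are hypothetical; `indexer` comes from the new tests/conftest.py and `tmp_path` is pytest's built-in per-test temporary directory.

# Hypothetical example test, not part of this commit: pytest discovers
# tests/conftest.py automatically, so `indexer` can be requested by name and the
# autouse `cleanup_chroma` fixture runs around the test without being mentioned.
def test_example_search(indexer, tmp_path):
    docs_dir = tmp_path / "docs"
    docs_dir.mkdir()
    (docs_dir / "note.txt").write_text("Python is a programming language")

    # The shared fixture already reset its per-test collection before this point.
    indexer.index_directory(docs_dir)

    results, distances, _ = indexer.search("Python")
    assert len(results) > 0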

tests/conftest.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+import pytest
+import chromadb
+
+
+@pytest.fixture(autouse=True)
+def cleanup_chroma():
+    """Clean up ChromaDB between tests."""
+    yield
+    # Reset the ChromaDB client system
+    if hasattr(chromadb.api.client.SharedSystemClient, "_identifer_to_system"):
+        chromadb.api.client.SharedSystemClient._identifer_to_system = {}
+
+
+@pytest.fixture
+def indexer(request, tmp_path):
+    """Create an indexer with a unique collection name based on the test name."""
+    from gptme_rag.indexing.indexer import Indexer
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    collection_name = request.node.name.replace("[", "_").replace("]", "_")
+    idx = Indexer(
+        persist_directory=tmp_path / "index",
+        chunk_size=50,  # Smaller chunk size to ensure multiple chunks
+        chunk_overlap=10,
+        enable_persist=True,  # Enable persistent storage
+        collection_name=collection_name,  # Unique collection name per test
+    )
+
+    # Reset collection before test
+    idx.reset_collection()
+    logger.debug("Reset collection before test")
+
+    yield idx
+
+    # Cleanup after test
+    idx.reset_collection()
+    logger.debug("Reset collection after test")

tests/test_chunking.py

Lines changed: 104 additions & 142 deletions
@@ -7,7 +7,6 @@
 
 from gptme_rag.indexing.document import Document
 from gptme_rag.indexing.document_processor import DocumentProcessor
-from gptme_rag.indexing.indexer import Indexer
 
 
 @pytest.fixture
@@ -49,153 +48,116 @@ def test_document_chunking(test_file):
     assert all(id_ is not None and "#chunk" in id_ for id_ in chunk_ids)
 
 
-def test_indexing_with_chunks(test_file):
+def test_indexing_with_chunks(test_file, indexer):
     """Test indexing documents with chunking enabled."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        # Debug: Print test file content
-        content = test_file.read_text()
-        print("\nTest file content:")
-        print(f"Size: {len(content)} chars")
-        print("First 200 chars:")
-        print(content[:200])
-
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=200,  # Increased chunk size
-            chunk_overlap=50,  # Increased overlap
-            enable_persist=True,  # Ensure persistence
-        )
-
-        # Index the test file
-        print("\nIndexing directory:", test_file.parent)
-        n_indexed = indexer.index_directory(test_file.parent)
-        print(f"Indexed {n_indexed} files")
-
-        # Debug collection state
-        print("\nCollection state:")
-        indexer.debug_collection()
-
-        # Search should return results
-        print("\nSearching for 'Lorem ipsum'...")
-        docs, distances, _ = indexer.search("Lorem ipsum", n_results=5)
-        print(f"Found {len(docs)} documents")
-        for i, doc in enumerate(docs):
-            print(f"\nDoc {i}:")
-            print(f"ID: {doc.doc_id}")
-            print(f"Content: {doc.content[:100]}...")
-
-        assert len(docs) > 0, "No documents found in search results"
-        assert len(distances) == len(docs), "Distances don't match documents"
-        assert all(doc.is_chunk for doc in docs), "Not all results are chunks"
-
-
-def test_chunk_grouping(test_file):
+    # Debug: Print test file content
+    content = test_file.read_text()
+    print("\nTest file content:")
+    print(f"Size: {len(content)} chars")
+    print("First 200 chars:")
+    print(content[:200])
+
+    # Index the test file
+    print("\nIndexing directory:", test_file.parent)
+    n_indexed = indexer.index_directory(test_file.parent)
+    print(f"Indexed {n_indexed} files")
+
+    # Debug collection state
+    print("\nCollection state:")
+    indexer.debug_collection()
+
+    # Search should return results
+    print("\nSearching for 'Lorem ipsum'...")
+    docs, distances, _ = indexer.search("Lorem ipsum", n_results=5)
+    print(f"Found {len(docs)} documents")
+    for i, doc in enumerate(docs):
+        print(f"\nDoc {i}:")
+        print(f"ID: {doc.doc_id}")
+        print(f"Content: {doc.content[:100]}...")
+
+    assert len(docs) > 0, "No documents found in search results"
+    assert len(distances) == len(docs), "Distances don't match documents"
+    assert all(doc.is_chunk for doc in docs), "Not all results are chunks"
+
+
+def test_chunk_grouping(test_file, indexer):
     """Test that chunks are properly grouped in search results."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=50,  # Smaller chunk size to ensure multiple chunks
-            chunk_overlap=10,
-            enable_persist=True,  # Enable persistent storage
-            collection_name="test_chunk_grouping",  # Unique collection name
-        )
-
-        # Index the test file
-        indexer.index_directory(test_file.parent)
-
-        # Search with and without grouping
-        grouped_docs, _, _ = indexer.search(
-            "Lorem ipsum", n_results=3, group_chunks=True
-        )
-        ungrouped_docs, _, _ = indexer.search(
-            "Lorem ipsum", n_results=3, group_chunks=False
-        )
-
-        # Grouped results should have unique source documents
-        grouped_sources = set(
-            doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs
-        )
-        assert len(grouped_sources) == len(grouped_docs)
-
-        # Ungrouped results might have multiple chunks from same document
-        ungrouped_sources = set(
-            doc.doc_id.split("#chunk")[0] if doc.doc_id else ""
-            for doc in ungrouped_docs
-        )
-        assert len(ungrouped_sources) <= len(ungrouped_docs)
-
-
-def test_document_reconstruction(test_file):
-    """Test reconstructing full documents from chunks."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=50,  # Smaller chunk size to ensure multiple chunks
-            chunk_overlap=10,
-        )
+    # Index the test file
+    indexer.index_directory(test_file.parent)
+
+    # Search with and without grouping
+    grouped_docs, _, _ = indexer.search("Lorem ipsum", n_results=3, group_chunks=True)
+    ungrouped_docs, _, _ = indexer.search(
+        "Lorem ipsum", n_results=3, group_chunks=False
+    )
+
+    # Grouped results should have unique source documents
+    grouped_sources = set(
+        doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs
+    )
+    assert len(grouped_sources) == len(grouped_docs)
+
+    # Ungrouped results might have multiple chunks from same document
+    ungrouped_sources = set(
+        doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in ungrouped_docs
+    )
+    assert len(ungrouped_sources) <= len(ungrouped_docs)
 
-        # Index the test file
-        indexer.index_directory(test_file.parent)
 
-        # Get a document ID from search results
-        docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
-        base_doc_id = docs[0].doc_id
-        assert base_doc_id is not None
-        doc_id = base_doc_id.split("#chunk")[0]
+def test_document_reconstruction(test_file, indexer):
+    """Test reconstructing full documents from chunks."""
+    # Index the test file
+    indexer.index_directory(test_file.parent)
+
+    # Get a document ID from search results
+    docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
+    base_doc_id = docs[0].doc_id
+    assert base_doc_id is not None
+    doc_id = base_doc_id.split("#chunk")[0]
 
-        # Reconstruct the document
-        full_doc = indexer.reconstruct_document(doc_id)
+    # Reconstruct the document
+    full_doc = indexer.reconstruct_document(doc_id)
 
-        # Check the reconstructed document
-        assert not full_doc.is_chunk
-        assert full_doc.doc_id == doc_id
-        assert "chunk_index" not in full_doc.metadata
-        assert len(full_doc.content) > len(docs[0].content)
+    # Check the reconstructed document
+    assert not full_doc.is_chunk
+    assert full_doc.doc_id == doc_id
+    assert "chunk_index" not in full_doc.metadata
+    assert len(full_doc.content) > len(docs[0].content)
 
 
-def test_chunk_retrieval(test_file):
+def test_chunk_retrieval(test_file, indexer):
     """Test retrieving all chunks for a document."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=50,  # Smaller chunk size to ensure multiple chunks
-            chunk_overlap=10,
-        )
-
-        # Debug: Print test file content
-        content = test_file.read_text()
-        print(f"\nTest file size: {len(content)} chars")
-        print(f"Token count: {len(indexer.processor.encoding.encode(content))}")
-
-        # Index the test file
-        print("\nIndexing file...")
-        indexer.index_file(test_file)
-
-        # Get a document ID from search results
-        print("\nSearching...")
-        docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
-        print(f"Found {len(docs)} documents")
-        for i, doc in enumerate(docs):
-            print(f"\nDoc {i}:")
-            print(f"ID: {doc.doc_id}")
-            print(f"Content length: {len(doc.content)}")
-            print(f"Is chunk: {doc.is_chunk}")
-        base_doc_id = docs[0].doc_id
-        assert base_doc_id is not None
-        doc_id = base_doc_id.split("#chunk")[0]
-
-        # Get all chunks
-        chunks = indexer.get_document_chunks(doc_id)
-
-        # Check chunks
-        assert len(chunks) > 1
-        assert all(chunk.is_chunk for chunk in chunks)
-        assert all(
-            chunk.doc_id is not None and chunk.doc_id.startswith(doc_id)
-            for chunk in chunks
-        )
-        # Check chunks are in order
-        chunk_indices = [
-            chunk.chunk_index or 0 for chunk in chunks
-        ]  # Default to 0 if None
-        assert chunk_indices == sorted(chunk_indices)
+    # Debug: Print test file content
+    content = test_file.read_text()
+    print(f"\nTest file size: {len(content)} chars")
+    print(f"Token count: {len(indexer.processor.encoding.encode(content))}")
+
+    # Index the test file
+    print("\nIndexing file...")
+    indexer.index_file(test_file)
+
+    # Get a document ID from search results
+    print("\nSearching...")
+    docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
+    print(f"Found {len(docs)} documents")
+    for i, doc in enumerate(docs):
+        print(f"\nDoc {i}:")
+        print(f"ID: {doc.doc_id}")
+        print(f"Content length: {len(doc.content)}")
+        print(f"Is chunk: {doc.is_chunk}")
+    base_doc_id = docs[0].doc_id
+    assert base_doc_id is not None
+    doc_id = base_doc_id.split("#chunk")[0]
+
+    # Get all chunks
+    chunks = indexer.get_document_chunks(doc_id)
+
+    # Check chunks
+    assert len(chunks) > 1
+    assert all(chunk.is_chunk for chunk in chunks)
+    assert all(
+        chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) for chunk in chunks
+    )
+    # Check chunks are in order
+    chunk_indices = [chunk.chunk_index or 0 for chunk in chunks]  # Default to 0 if None
+    assert chunk_indices == sorted(chunk_indices)
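The assertions in this file rely on the chunk ID convention: a chunk's ID appears to be the source document ID plus a `#chunk` suffix, so splitting on `#chunk` recovers the base document ID. A small sketch; the concrete ID and suffix index below are hypothetical.

# Illustrative only: recovering a base document ID from a chunk ID.
chunk_id = "docs/example.txt#chunk0"  # hypothetical chunk ID
base_doc_id = chunk_id.split("#chunk")[0]
assert base_doc_id == "docs/example.txt"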

tests/test_indexing.py

Lines changed: 10 additions & 39 deletions
@@ -1,18 +1,5 @@
-from pathlib import Path
 import pytest
-import tempfile
-import chromadb
 from gptme_rag.indexing.document import Document
-from gptme_rag.indexing.indexer import Indexer
-
-
-@pytest.fixture(autouse=True)
-def cleanup_chroma():
-    """Clean up ChromaDB between tests."""
-    yield
-    # Reset the ChromaDB client system
-    if hasattr(chromadb.api.client.SharedSystemClient, "_identifer_to_system"):
-        chromadb.api.client.SharedSystemClient._identifer_to_system = {}
 
 
 @pytest.fixture
@@ -31,15 +18,9 @@ def test_docs():
     ]
 
 
-@pytest.fixture
-def temp_dir():
-    with tempfile.TemporaryDirectory() as tmpdir:
-        yield Path(tmpdir)
-
-
-def test_document_from_file(temp_dir):
+def test_document_from_file(tmp_path):
     # Create a test file
-    test_file = temp_dir / "test.txt"
+    test_file = tmp_path / "test.txt"
     test_content = "Test content"
     test_file.write_text(test_content)
 
@@ -54,9 +35,7 @@ def test_document_from_file(temp_dir):
     assert doc.metadata["extension"] == ".txt"
 
 
-def test_indexer_add_document(temp_dir, test_docs):
-    indexer = Indexer(persist_directory=temp_dir)
-
+def test_indexer_add_document(indexer, test_docs):
     # Add single document
     indexer.add_document(test_docs[0])
     results, distances, _ = indexer.search("Python programming")
@@ -66,14 +45,7 @@ def test_indexer_add_document(temp_dir, test_docs):
     assert len(distances) > 0
 
 
-def test_indexer_add_documents(temp_dir, test_docs):
-    # Create indexer with unique collection name
-    indexer = Indexer(
-        persist_directory=temp_dir,
-        collection_name="test_add_documents",
-        enable_persist=True,
-    )
-
+def test_indexer_add_documents(indexer, test_docs):
     # Reset collection to ensure clean state
     indexer.reset_collection()
 
@@ -99,15 +71,14 @@ def test_indexer_add_documents(temp_dir, test_docs):
     assert len(ml_distances) > 0, "No distances returned"
 
 
-def test_indexer_directory(temp_dir):
+def test_indexer_directory(indexer, tmp_path):
     # Create test files
-    (temp_dir / "test1.txt").write_text("Content about Python")
-    (temp_dir / "test2.txt").write_text("Content about JavaScript")
-    (temp_dir / "subdir").mkdir()
-    (temp_dir / "subdir" / "test3.txt").write_text("Content about TypeScript")
+    (tmp_path / "test1.txt").write_text("Content about Python")
+    (tmp_path / "test2.txt").write_text("Content about JavaScript")
+    (tmp_path / "subdir").mkdir()
+    (tmp_path / "subdir" / "test3.txt").write_text("Content about TypeScript")
 
-    indexer = Indexer(persist_directory=temp_dir / "index")
-    indexer.index_directory(temp_dir)
+    indexer.index_directory(tmp_path)
 
     # Search for programming languages
     python_results, python_distances, _ = indexer.search("Python")
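On the last commit bullet: pytest's built-in `tmp_path` fixture already yields a unique, per-test `pathlib.Path` directory that pytest creates and cleans up itself, which is why the hand-rolled `temp_dir` fixture removed above is redundant. A rough sketch of the equivalence; the test below is hypothetical.

import tempfile
from pathlib import Path

import pytest


# What the removed fixture did by hand:
@pytest.fixture
def temp_dir():
    with tempfile.TemporaryDirectory() as tmpdir:
        yield Path(tmpdir)


# What the built-in fixture provides for free: a fresh pathlib.Path per test,
# managed by pytest with no manual cleanup.
def test_write_and_read(tmp_path):
    f = tmp_path / "example.txt"
    f.write_text("hello")
    assert f.read_text() == "hello"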
