@@ -7,7 +7,6 @@
 
 from gptme_rag.indexing.document import Document
 from gptme_rag.indexing.document_processor import DocumentProcessor
-from gptme_rag.indexing.indexer import Indexer
 
 
 @pytest.fixture
@@ -49,153 +48,116 @@ def test_document_chunking(test_file):
     assert all(id_ is not None and "#chunk" in id_ for id_ in chunk_ids)
 
 
-def test_indexing_with_chunks(test_file):
+def test_indexing_with_chunks(test_file, indexer):
     """Test indexing documents with chunking enabled."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        # Debug: Print test file content
-        content = test_file.read_text()
-        print("\nTest file content:")
-        print(f"Size: {len(content)} chars")
-        print("First 200 chars:")
-        print(content[:200])
-
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=200,  # Increased chunk size
-            chunk_overlap=50,  # Increased overlap
-            enable_persist=True,  # Ensure persistence
-        )
-
-        # Index the test file
-        print("\nIndexing directory:", test_file.parent)
-        n_indexed = indexer.index_directory(test_file.parent)
-        print(f"Indexed {n_indexed} files")
-
-        # Debug collection state
-        print("\nCollection state:")
-        indexer.debug_collection()
-
-        # Search should return results
-        print("\nSearching for 'Lorem ipsum'...")
-        docs, distances, _ = indexer.search("Lorem ipsum", n_results=5)
-        print(f"Found {len(docs)} documents")
-        for i, doc in enumerate(docs):
-            print(f"\nDoc {i}:")
-            print(f"ID: {doc.doc_id}")
-            print(f"Content: {doc.content[:100]}...")
-
-        assert len(docs) > 0, "No documents found in search results"
-        assert len(distances) == len(docs), "Distances don't match documents"
-        assert all(doc.is_chunk for doc in docs), "Not all results are chunks"
-
-
-def test_chunk_grouping(test_file):
+    # Debug: Print test file content
+    content = test_file.read_text()
+    print("\nTest file content:")
+    print(f"Size: {len(content)} chars")
+    print("First 200 chars:")
+    print(content[:200])
+
+    # Index the test file
+    print("\nIndexing directory:", test_file.parent)
+    n_indexed = indexer.index_directory(test_file.parent)
+    print(f"Indexed {n_indexed} files")
+
+    # Debug collection state
+    print("\nCollection state:")
+    indexer.debug_collection()
+
+    # Search should return results
+    print("\nSearching for 'Lorem ipsum'...")
+    docs, distances, _ = indexer.search("Lorem ipsum", n_results=5)
+    print(f"Found {len(docs)} documents")
+    for i, doc in enumerate(docs):
+        print(f"\nDoc {i}:")
+        print(f"ID: {doc.doc_id}")
+        print(f"Content: {doc.content[:100]}...")
+
+    assert len(docs) > 0, "No documents found in search results"
+    assert len(distances) == len(docs), "Distances don't match documents"
+    assert all(doc.is_chunk for doc in docs), "Not all results are chunks"
+
+
+def test_chunk_grouping(test_file, indexer):
     """Test that chunks are properly grouped in search results."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=50,  # Smaller chunk size to ensure multiple chunks
-            chunk_overlap=10,
-            enable_persist=True,  # Enable persistent storage
-            collection_name="test_chunk_grouping",  # Unique collection name
-        )
-
-        # Index the test file
-        indexer.index_directory(test_file.parent)
-
-        # Search with and without grouping
-        grouped_docs, _, _ = indexer.search(
-            "Lorem ipsum", n_results=3, group_chunks=True
-        )
-        ungrouped_docs, _, _ = indexer.search(
-            "Lorem ipsum", n_results=3, group_chunks=False
-        )
-
-        # Grouped results should have unique source documents
-        grouped_sources = set(
-            doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs
-        )
-        assert len(grouped_sources) == len(grouped_docs)
-
-        # Ungrouped results might have multiple chunks from same document
-        ungrouped_sources = set(
-            doc.doc_id.split("#chunk")[0] if doc.doc_id else ""
-            for doc in ungrouped_docs
-        )
-        assert len(ungrouped_sources) <= len(ungrouped_docs)
-
-
-def test_document_reconstruction(test_file):
-    """Test reconstructing full documents from chunks."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=50,  # Smaller chunk size to ensure multiple chunks
-            chunk_overlap=10,
-        )
+    # Index the test file
+    indexer.index_directory(test_file.parent)
+
+    # Search with and without grouping
+    grouped_docs, _, _ = indexer.search("Lorem ipsum", n_results=3, group_chunks=True)
+    ungrouped_docs, _, _ = indexer.search(
+        "Lorem ipsum", n_results=3, group_chunks=False
+    )
+
+    # Grouped results should have unique source documents
+    grouped_sources = set(
+        doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs
+    )
+    assert len(grouped_sources) == len(grouped_docs)
+
+    # Ungrouped results might have multiple chunks from same document
+    ungrouped_sources = set(
+        doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in ungrouped_docs
+    )
+    assert len(ungrouped_sources) <= len(ungrouped_docs)
 
-        # Index the test file
-        indexer.index_directory(test_file.parent)
 
-        # Get a document ID from search results
-        docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
-        base_doc_id = docs[0].doc_id
-        assert base_doc_id is not None
-        doc_id = base_doc_id.split("#chunk")[0]
+def test_document_reconstruction(test_file, indexer):
+    """Test reconstructing full documents from chunks."""
+    # Index the test file
+    indexer.index_directory(test_file.parent)
+
+    # Get a document ID from search results
+    docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
+    base_doc_id = docs[0].doc_id
+    assert base_doc_id is not None
+    doc_id = base_doc_id.split("#chunk")[0]
 
-        # Reconstruct the document
-        full_doc = indexer.reconstruct_document(doc_id)
+    # Reconstruct the document
+    full_doc = indexer.reconstruct_document(doc_id)
 
-        # Check the reconstructed document
-        assert not full_doc.is_chunk
-        assert full_doc.doc_id == doc_id
-        assert "chunk_index" not in full_doc.metadata
-        assert len(full_doc.content) > len(docs[0].content)
+    # Check the reconstructed document
+    assert not full_doc.is_chunk
+    assert full_doc.doc_id == doc_id
+    assert "chunk_index" not in full_doc.metadata
+    assert len(full_doc.content) > len(docs[0].content)
 
 
-def test_chunk_retrieval(test_file):
+def test_chunk_retrieval(test_file, indexer):
     """Test retrieving all chunks for a document."""
-    with tempfile.TemporaryDirectory() as index_dir:
-        indexer = Indexer(
-            persist_directory=Path(index_dir),
-            chunk_size=50,  # Smaller chunk size to ensure multiple chunks
-            chunk_overlap=10,
-        )
-
-        # Debug: Print test file content
-        content = test_file.read_text()
-        print(f"\nTest file size: {len(content)} chars")
-        print(f"Token count: {len(indexer.processor.encoding.encode(content))}")
-
-        # Index the test file
-        print("\nIndexing file...")
-        indexer.index_file(test_file)
-
-        # Get a document ID from search results
-        print("\nSearching...")
-        docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
-        print(f"Found {len(docs)} documents")
-        for i, doc in enumerate(docs):
-            print(f"\nDoc {i}:")
-            print(f"ID: {doc.doc_id}")
-            print(f"Content length: {len(doc.content)}")
-            print(f"Is chunk: {doc.is_chunk}")
-        base_doc_id = docs[0].doc_id
-        assert base_doc_id is not None
-        doc_id = base_doc_id.split("#chunk")[0]
-
-        # Get all chunks
-        chunks = indexer.get_document_chunks(doc_id)
-
-        # Check chunks
-        assert len(chunks) > 1
-        assert all(chunk.is_chunk for chunk in chunks)
-        assert all(
-            chunk.doc_id is not None and chunk.doc_id.startswith(doc_id)
-            for chunk in chunks
-        )
-        # Check chunks are in order
-        chunk_indices = [
-            chunk.chunk_index or 0 for chunk in chunks
-        ]  # Default to 0 if None
-        assert chunk_indices == sorted(chunk_indices)
+    # Debug: Print test file content
+    content = test_file.read_text()
+    print(f"\nTest file size: {len(content)} chars")
+    print(f"Token count: {len(indexer.processor.encoding.encode(content))}")
+
+    # Index the test file
+    print("\nIndexing file...")
+    indexer.index_file(test_file)
+
+    # Get a document ID from search results
+    print("\nSearching...")
+    docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
+    print(f"Found {len(docs)} documents")
+    for i, doc in enumerate(docs):
+        print(f"\nDoc {i}:")
+        print(f"ID: {doc.doc_id}")
+        print(f"Content length: {len(doc.content)}")
+        print(f"Is chunk: {doc.is_chunk}")
+    base_doc_id = docs[0].doc_id
+    assert base_doc_id is not None
+    doc_id = base_doc_id.split("#chunk")[0]
+
+    # Get all chunks
+    chunks = indexer.get_document_chunks(doc_id)
+
+    # Check chunks
+    assert len(chunks) > 1
+    assert all(chunk.is_chunk for chunk in chunks)
+    assert all(
+        chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) for chunk in chunks
+    )
+    # Check chunks are in order
+    chunk_indices = [chunk.chunk_index or 0 for chunk in chunks]  # Default to 0 if None
+    assert chunk_indices == sorted(chunk_indices)
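The refactored tests receive `indexer` as a pytest fixture instead of building an `Indexer` against a `tempfile.TemporaryDirectory()` inside each test body. The fixture definition itself is not part of this hunk, so its exact shape is an assumption; a minimal sketch of what such a fixture could look like, reusing only the `persist_directory`, `chunk_size`, and `chunk_overlap` parameters visible in the removed code plus pytest's built-in `tmp_path`, is:

# Hypothetical sketch of the shared fixture; the real one defined elsewhere in the PR may differ.
import pytest

from gptme_rag.indexing.indexer import Indexer


@pytest.fixture
def indexer(tmp_path):
    """Provide a fresh Indexer backed by a pytest-managed temporary directory."""
    # Small chunk size so a short test file still splits into multiple chunks,
    # mirroring the values used by the removed per-test setup.
    return Indexer(
        persist_directory=tmp_path / "index",
        chunk_size=50,
        chunk_overlap=10,
    )

With a fixture like this, pytest injects a fresh, isolated index into every test that lists `indexer` as a parameter, which is what lets the diff drop the repeated `with tempfile.TemporaryDirectory()` and `Indexer(...)` boilerplate and de-indent the test bodies.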