7 | 7 |
8 | 8 | from gptme_rag.indexing.document import Document |
9 | 9 | from gptme_rag.indexing.document_processor import DocumentProcessor |
10 | | -from gptme_rag.indexing.indexer import Indexer |
11 | 10 |
12 | 11 |
13 | 12 | @pytest.fixture |
@@ -49,153 +48,116 @@ def test_document_chunking(test_file): |
49 | 48 | assert all(id_ is not None and "#chunk" in id_ for id_ in chunk_ids) |
50 | 49 |
51 | 50 |
52 | | -def test_indexing_with_chunks(test_file): |
| 51 | +def test_indexing_with_chunks(test_file, indexer): |
53 | 52 | """Test indexing documents with chunking enabled.""" |
54 | | - with tempfile.TemporaryDirectory() as index_dir: |
55 | | - # Debug: Print test file content |
56 | | - content = test_file.read_text() |
57 | | - print("\nTest file content:") |
58 | | - print(f"Size: {len(content)} chars") |
59 | | - print("First 200 chars:") |
60 | | - print(content[:200]) |
61 | | - |
62 | | - indexer = Indexer( |
63 | | - persist_directory=Path(index_dir), |
64 | | - chunk_size=200, # Increased chunk size |
65 | | - chunk_overlap=50, # Increased overlap |
66 | | - enable_persist=True, # Ensure persistence |
67 | | - ) |
68 | | - |
69 | | - # Index the test file |
70 | | - print("\nIndexing directory:", test_file.parent) |
71 | | - n_indexed = indexer.index_directory(test_file.parent) |
72 | | - print(f"Indexed {n_indexed} files") |
73 | | - |
74 | | - # Debug collection state |
75 | | - print("\nCollection state:") |
76 | | - indexer.debug_collection() |
77 | | - |
78 | | - # Search should return results |
79 | | - print("\nSearching for 'Lorem ipsum'...") |
80 | | - docs, distances, _ = indexer.search("Lorem ipsum", n_results=5) |
81 | | - print(f"Found {len(docs)} documents") |
82 | | - for i, doc in enumerate(docs): |
83 | | - print(f"\nDoc {i}:") |
84 | | - print(f"ID: {doc.doc_id}") |
85 | | - print(f"Content: {doc.content[:100]}...") |
86 | | - |
87 | | - assert len(docs) > 0, "No documents found in search results" |
88 | | - assert len(distances) == len(docs), "Distances don't match documents" |
89 | | - assert all(doc.is_chunk for doc in docs), "Not all results are chunks" |
90 | | - |
91 | | - |
92 | | -def test_chunk_grouping(test_file): |
| 53 | + # Debug: Print test file content |
| 54 | + content = test_file.read_text() |
| 55 | + print("\nTest file content:") |
| 56 | + print(f"Size: {len(content)} chars") |
| 57 | + print("First 200 chars:") |
| 58 | + print(content[:200]) |
| 59 | + |
| 60 | + # Index the test file |
| 61 | + print("\nIndexing directory:", test_file.parent) |
| 62 | + n_indexed = indexer.index_directory(test_file.parent) |
| 63 | + print(f"Indexed {n_indexed} files") |
| 64 | + |
| 65 | + # Debug collection state |
| 66 | + print("\nCollection state:") |
| 67 | + indexer.debug_collection() |
| 68 | + |
| 69 | + # Search should return results |
| 70 | + print("\nSearching for 'Lorem ipsum'...") |
| 71 | + docs, distances, _ = indexer.search("Lorem ipsum", n_results=5) |
| 72 | + print(f"Found {len(docs)} documents") |
| 73 | + for i, doc in enumerate(docs): |
| 74 | + print(f"\nDoc {i}:") |
| 75 | + print(f"ID: {doc.doc_id}") |
| 76 | + print(f"Content: {doc.content[:100]}...") |
| 77 | + |
| 78 | + assert len(docs) > 0, "No documents found in search results" |
| 79 | + assert len(distances) == len(docs), "Distances don't match documents" |
| 80 | + assert all(doc.is_chunk for doc in docs), "Not all results are chunks" |
| 81 | + |
| 82 | + |
| 83 | +def test_chunk_grouping(test_file, indexer): |
93 | 84 | """Test that chunks are properly grouped in search results.""" |
94 | | - with tempfile.TemporaryDirectory() as index_dir: |
95 | | - indexer = Indexer( |
96 | | - persist_directory=Path(index_dir), |
97 | | - chunk_size=50, # Smaller chunk size to ensure multiple chunks |
98 | | - chunk_overlap=10, |
99 | | - enable_persist=True, # Enable persistent storage |
100 | | - collection_name="test_chunk_grouping", # Unique collection name |
101 | | - ) |
102 | | - |
103 | | - # Index the test file |
104 | | - indexer.index_directory(test_file.parent) |
105 | | - |
106 | | - # Search with and without grouping |
107 | | - grouped_docs, _, _ = indexer.search( |
108 | | - "Lorem ipsum", n_results=3, group_chunks=True |
109 | | - ) |
110 | | - ungrouped_docs, _, _ = indexer.search( |
111 | | - "Lorem ipsum", n_results=3, group_chunks=False |
112 | | - ) |
113 | | - |
114 | | - # Grouped results should have unique source documents |
115 | | - grouped_sources = set( |
116 | | - doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs |
117 | | - ) |
118 | | - assert len(grouped_sources) == len(grouped_docs) |
119 | | - |
120 | | - # Ungrouped results might have multiple chunks from same document |
121 | | - ungrouped_sources = set( |
122 | | - doc.doc_id.split("#chunk")[0] if doc.doc_id else "" |
123 | | - for doc in ungrouped_docs |
124 | | - ) |
125 | | - assert len(ungrouped_sources) <= len(ungrouped_docs) |
126 | | - |
127 | | - |
128 | | -def test_document_reconstruction(test_file): |
129 | | - """Test reconstructing full documents from chunks.""" |
130 | | - with tempfile.TemporaryDirectory() as index_dir: |
131 | | - indexer = Indexer( |
132 | | - persist_directory=Path(index_dir), |
133 | | - chunk_size=50, # Smaller chunk size to ensure multiple chunks |
134 | | - chunk_overlap=10, |
135 | | - ) |
| 85 | + # Index the test file |
| 86 | + indexer.index_directory(test_file.parent) |
| 87 | + |
| 88 | + # Search with and without grouping |
| 89 | + grouped_docs, _, _ = indexer.search("Lorem ipsum", n_results=3, group_chunks=True) |
| 90 | + ungrouped_docs, _, _ = indexer.search( |
| 91 | + "Lorem ipsum", n_results=3, group_chunks=False |
| 92 | + ) |
| 93 | + |
| 94 | + # Grouped results should have unique source documents |
| 95 | + grouped_sources = set( |
| 96 | + doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in grouped_docs |
| 97 | + ) |
| 98 | + assert len(grouped_sources) == len(grouped_docs) |
| 99 | + |
| 100 | + # Ungrouped results might have multiple chunks from same document |
| 101 | + ungrouped_sources = set( |
| 102 | + doc.doc_id.split("#chunk")[0] if doc.doc_id else "" for doc in ungrouped_docs |
| 103 | + ) |
| 104 | + assert len(ungrouped_sources) <= len(ungrouped_docs) |
136 | 105 |
137 | | - # Index the test file |
138 | | - indexer.index_directory(test_file.parent) |
139 | 106 |
140 | | - # Get a document ID from search results |
141 | | - docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
142 | | - base_doc_id = docs[0].doc_id |
143 | | - assert base_doc_id is not None |
144 | | - doc_id = base_doc_id.split("#chunk")[0] |
| 107 | +def test_document_reconstruction(test_file, indexer): |
| 108 | + """Test reconstructing full documents from chunks.""" |
| 109 | + # Index the test file |
| 110 | + indexer.index_directory(test_file.parent) |
| 111 | + |
| 112 | + # Get a document ID from search results |
| 113 | + docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
| 114 | + base_doc_id = docs[0].doc_id |
| 115 | + assert base_doc_id is not None |
| 116 | + doc_id = base_doc_id.split("#chunk")[0] |
145 | 117 |
146 | | - # Reconstruct the document |
147 | | - full_doc = indexer.reconstruct_document(doc_id) |
| 118 | + # Reconstruct the document |
| 119 | + full_doc = indexer.reconstruct_document(doc_id) |
148 | 120 |
149 | | - # Check the reconstructed document |
150 | | - assert not full_doc.is_chunk |
151 | | - assert full_doc.doc_id == doc_id |
152 | | - assert "chunk_index" not in full_doc.metadata |
153 | | - assert len(full_doc.content) > len(docs[0].content) |
| 121 | + # Check the reconstructed document |
| 122 | + assert not full_doc.is_chunk |
| 123 | + assert full_doc.doc_id == doc_id |
| 124 | + assert "chunk_index" not in full_doc.metadata |
| 125 | + assert len(full_doc.content) > len(docs[0].content) |
154 | 126 |
155 | 127 |
156 | | -def test_chunk_retrieval(test_file): |
| 128 | +def test_chunk_retrieval(test_file, indexer): |
157 | 129 | """Test retrieving all chunks for a document.""" |
158 | | - with tempfile.TemporaryDirectory() as index_dir: |
159 | | - indexer = Indexer( |
160 | | - persist_directory=Path(index_dir), |
161 | | - chunk_size=50, # Smaller chunk size to ensure multiple chunks |
162 | | - chunk_overlap=10, |
163 | | - ) |
164 | | - |
165 | | - # Debug: Print test file content |
166 | | - content = test_file.read_text() |
167 | | - print(f"\nTest file size: {len(content)} chars") |
168 | | - print(f"Token count: {len(indexer.processor.encoding.encode(content))}") |
169 | | - |
170 | | - # Index the test file |
171 | | - print("\nIndexing file...") |
172 | | - indexer.index_file(test_file) |
173 | | - |
174 | | - # Get a document ID from search results |
175 | | - print("\nSearching...") |
176 | | - docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
177 | | - print(f"Found {len(docs)} documents") |
178 | | - for i, doc in enumerate(docs): |
179 | | - print(f"\nDoc {i}:") |
180 | | - print(f"ID: {doc.doc_id}") |
181 | | - print(f"Content length: {len(doc.content)}") |
182 | | - print(f"Is chunk: {doc.is_chunk}") |
183 | | - base_doc_id = docs[0].doc_id |
184 | | - assert base_doc_id is not None |
185 | | - doc_id = base_doc_id.split("#chunk")[0] |
186 | | - |
187 | | - # Get all chunks |
188 | | - chunks = indexer.get_document_chunks(doc_id) |
189 | | - |
190 | | - # Check chunks |
191 | | - assert len(chunks) > 1 |
192 | | - assert all(chunk.is_chunk for chunk in chunks) |
193 | | - assert all( |
194 | | - chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) |
195 | | - for chunk in chunks |
196 | | - ) |
197 | | - # Check chunks are in order |
198 | | - chunk_indices = [ |
199 | | - chunk.chunk_index or 0 for chunk in chunks |
200 | | - ] # Default to 0 if None |
201 | | - assert chunk_indices == sorted(chunk_indices) |
| 130 | + # Debug: Print test file content |
| 131 | + content = test_file.read_text() |
| 132 | + print(f"\nTest file size: {len(content)} chars") |
| 133 | + print(f"Token count: {len(indexer.processor.encoding.encode(content))}") |
| 134 | + |
| 135 | + # Index the test file |
| 136 | + print("\nIndexing file...") |
| 137 | + indexer.index_file(test_file) |
| 138 | + |
| 139 | + # Get a document ID from search results |
| 140 | + print("\nSearching...") |
| 141 | + docs, _, _ = indexer.search("Lorem ipsum") # Search for text we know exists |
| 142 | + print(f"Found {len(docs)} documents") |
| 143 | + for i, doc in enumerate(docs): |
| 144 | + print(f"\nDoc {i}:") |
| 145 | + print(f"ID: {doc.doc_id}") |
| 146 | + print(f"Content length: {len(doc.content)}") |
| 147 | + print(f"Is chunk: {doc.is_chunk}") |
| 148 | + base_doc_id = docs[0].doc_id |
| 149 | + assert base_doc_id is not None |
| 150 | + doc_id = base_doc_id.split("#chunk")[0] |
| 151 | + |
| 152 | + # Get all chunks |
| 153 | + chunks = indexer.get_document_chunks(doc_id) |
| 154 | + |
| 155 | + # Check chunks |
| 156 | + assert len(chunks) > 1 |
| 157 | + assert all(chunk.is_chunk for chunk in chunks) |
| 158 | + assert all( |
| 159 | + chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) for chunk in chunks |
| 160 | + ) |
| 161 | + # Check chunks are in order |
| 162 | + chunk_indices = [chunk.chunk_index or 0 for chunk in chunks] # Default to 0 if None |
| 163 | + assert chunk_indices == sorted(chunk_indices) |
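
Note: the refactored tests take an `indexer` fixture instead of constructing an Indexer inside each test, but the fixture definition is outside the shown hunks (presumably added near the existing fixtures or in a conftest). A minimal sketch of what such a fixture could look like, reusing only the constructor arguments that appear in the removed inline setup; the use of pytest's `tmp_path` and the specific chunk sizes are assumptions, chosen small so the test files split into multiple chunks as `test_chunk_retrieval` expects:

import pytest

from gptme_rag.indexing.indexer import Indexer


@pytest.fixture
def indexer(tmp_path):
    """Shared Indexer backed by a temporary persist directory (assumed fixture)."""
    return Indexer(
        persist_directory=tmp_path / "index",
        chunk_size=50,     # small chunks so documents are split into several pieces
        chunk_overlap=10,
        enable_persist=True,
    )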