
Commit 77fbd13

refactor: improve logging and test clarity
Replace debug print statements with proper logging in indexer. Clean up test files by removing debug prints and improving assertions. Add more descriptive error messages in tests.

Co-authored-by: Bob <[email protected]>
1 parent d79047d commit 77fbd13
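
The change replaces ad-hoc print() calls with the standard library logging module. For reference, a minimal sketch of the usual module-level logger pattern; the messages mirror the ones in the diffs below (with a hypothetical path), and the basicConfig call is only an assumption so the sketch produces output when run standalone, not the repository's actual configuration:

import logging

# Module-level logger, named after the importing module
# (e.g. "gptme_rag.indexing.indexer").
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # Assumed configuration so the sketch emits the messages when run directly.
    logging.basicConfig(level=logging.DEBUG)
    logger.debug("Using git ls-files for file listing")
    logger.warning("Error resolving symlink: %s", "example/broken-link")  # hypothetical path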

File tree

4 files changed: +14, -79 lines changed


gptme_rag/indexing/indexer.py

Lines changed: 2 additions & 14 deletions
@@ -316,42 +316,30 @@ def index_directory(
             logger.debug("Using git ls-files for file listing")
         except subprocess.CalledProcessError:
             # Not a git repo or git not available, fall back to glob
-            print("\nFalling back to glob + gitignore for file listing")
             files = list(directory.glob(glob_pattern))
-            print(
-                f"Found {len(files)} files matching glob pattern: {[str(f) for f in files]}"
-            )
             gitignore_patterns = self._load_gitignore(directory)
-            print(f"Loaded gitignore patterns: {gitignore_patterns}")
-            print(f"\nProcessing files in {directory}")
-        for f in files:
-            print(f"\nChecking file: {f}")
 
+        for f in files:
             if not f.is_file():
-                print(" Skip: Not a file")
                 continue
 
             # Check gitignore patterns if in glob mode
             if gitignore_patterns and self._is_ignored(f, gitignore_patterns):
-                print(" Skip: Matches gitignore pattern")
                 continue
 
             # Filter by glob pattern
             rel_path = str(f.relative_to(directory))
             # Convert glob pattern to fnmatch pattern
             fnmatch_pattern = glob_pattern.replace("**/*", "*")
             if not fnmatch_path(rel_path, fnmatch_pattern):
-                print(f" Skip: Does not match pattern {fnmatch_pattern}")
                 continue
-            print(f" Pass: Matches pattern {fnmatch_pattern}")
 
             # Resolve symlinks to target
             try:
                 resolved = f.resolve()
                 valid_files.add(resolved)
-                print(f" Added: {resolved}")
             except Exception as e:
-                print(f" Error: Could not resolve path - {e}")
+                logger.warning(f"Error resolving symlink: {f} -> {e}")
 
             # Check file limit
             if len(valid_files) >= file_limit:
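
For reference, the glob-to-fnmatch rewrite kept in the hunk above can be exercised on its own. A minimal sketch using the standard fnmatch module; the fnmatch_path helper in the diff is assumed to behave like fnmatch.fnmatch, and the pattern and paths below are hypothetical:

from fnmatch import fnmatch

glob_pattern = "**/*.py"
fnmatch_pattern = glob_pattern.replace("**/*", "*")  # same rewrite as in the diff

for rel_path in ("docs/readme.md", "gptme_rag/indexing/indexer.py"):
    # fnmatch's "*" also crosses path separators, so "*.py" matches nested files.
    status = "pass" if fnmatch(rel_path, fnmatch_pattern) else "skip"
    print(f"{status}: {rel_path}")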

tests/test_chunking.py

Lines changed: 8 additions & 39 deletions
@@ -50,30 +50,11 @@ def test_document_chunking(test_file):
 
 def test_indexing_with_chunks(test_file, indexer):
     """Test indexing documents with chunking enabled."""
-    # Debug: Print test file content
-    content = test_file.read_text()
-    print("\nTest file content:")
-    print(f"Size: {len(content)} chars")
-    print("First 200 chars:")
-    print(content[:200])
-
     # Index the test file
-    print("\nIndexing directory:", test_file.parent)
-    n_indexed = indexer.index_directory(test_file.parent)
-    print(f"Indexed {n_indexed} files")
-
-    # Debug collection state
-    print("\nCollection state:")
-    indexer.debug_collection()
+    indexer.index_directory(test_file.parent)
 
     # Search should return results
-    print("\nSearching for 'Lorem ipsum'...")
     docs, distances, _ = indexer.search("Lorem ipsum", n_results=5)
-    print(f"Found {len(docs)} documents")
-    for i, doc in enumerate(docs):
-        print(f"\nDoc {i}:")
-        print(f"ID: {doc.doc_id}")
-        print(f"Content: {doc.content[:100]}...")
 
     assert len(docs) > 0, "No documents found in search results"
     assert len(distances) == len(docs), "Distances don't match documents"
@@ -127,24 +108,11 @@ def test_document_reconstruction(test_file, indexer):
 
 def test_chunk_retrieval(test_file, indexer):
     """Test retrieving all chunks for a document."""
-    # Debug: Print test file content
-    content = test_file.read_text()
-    print(f"\nTest file size: {len(content)} chars")
-    print(f"Token count: {len(indexer.processor.encoding.encode(content))}")
-
     # Index the test file
-    print("\nIndexing file...")
     indexer.index_file(test_file)
 
     # Get a document ID from search results
-    print("\nSearching...")
-    docs, _, _ = indexer.search("Lorem ipsum")  # Search for text we know exists
-    print(f"Found {len(docs)} documents")
-    for i, doc in enumerate(docs):
-        print(f"\nDoc {i}:")
-        print(f"ID: {doc.doc_id}")
-        print(f"Content length: {len(doc.content)}")
-        print(f"Is chunk: {doc.is_chunk}")
+    docs, _, _ = indexer.search("Lorem ipsum")
     base_doc_id = docs[0].doc_id
     assert base_doc_id is not None
     doc_id = base_doc_id.split("#chunk")[0]
@@ -153,11 +121,12 @@ def test_chunk_retrieval(test_file, indexer):
     chunks = indexer.get_document_chunks(doc_id)
 
     # Check chunks
-    assert len(chunks) > 1
-    assert all(chunk.is_chunk for chunk in chunks)
+    assert len(chunks) > 1, "Document should be split into multiple chunks"
+    assert all(chunk.is_chunk for chunk in chunks), "All items should be chunks"
     assert all(
         chunk.doc_id is not None and chunk.doc_id.startswith(doc_id) for chunk in chunks
-    )
+    ), "All chunks should belong to the same document"
+
     # Check chunks are in order
-    chunk_indices = [chunk.chunk_index or 0 for chunk in chunks]  # Default to 0 if None
-    assert chunk_indices == sorted(chunk_indices)
+    chunk_indices = [chunk.chunk_index or 0 for chunk in chunks]
+    assert chunk_indices == sorted(chunk_indices), "Chunks should be in order"
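
The added assertion messages use pytest's plain assert style: the string after the comma is shown in the failure report next to the re-evaluated expression. A tiny self-contained illustration with hypothetical values, deliberately failing to show where the message surfaces:

chunk_indices = [0, 2, 1]  # hypothetical out-of-order chunk indices
assert chunk_indices == sorted(chunk_indices), "Chunks should be in order"
# AssertionError: Chunks should be in order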

tests/test_document_processor.py

Lines changed: 0 additions & 25 deletions
@@ -15,14 +15,6 @@ def test_process_text_basic():
 
     chunks = list(processor.process_text(text))
 
-    # Print debug info
-    print(f"\nTotal tokens in text: {len(processor.encoding.encode(text))}")
-    for i, chunk in enumerate(chunks):
-        print(f"\nChunk {i}:")
-        print(f"Token count: {chunk['metadata']['token_count']}")
-        print(f"Content length: {len(chunk['text'])}")
-        print(f"First 50 chars: {chunk['text'][:50]}")
-
     assert len(chunks) > 1  # Should split into multiple chunks
     assert all(isinstance(c["text"], str) for c in chunks)
     assert all(isinstance(c["metadata"], dict) for c in chunks)
@@ -103,23 +95,6 @@ def test_token_estimation():
     assert chunks > 0
 
 
-def test_content_size():
-    """Test actual content size in tokens for test data."""
-    processor = DocumentProcessor()
-    content = "\n\n".join(
-        [
-            f"This is paragraph {i} with some content that should be indexed."
-            for i in range(10)
-        ]
-    )
-    tokens = processor.encoding.encode(content)
-    print(f"Total tokens: {len(tokens)}")
-    print(f"Content length: {len(content)}")
-    for i, para in enumerate(content.split("\n\n")):
-        para_tokens = processor.encoding.encode(para)
-        print(f"Paragraph {i}: {len(para_tokens)} tokens, {len(para)} chars")
-
-
 def test_optimal_chunk_size():
     """Test optimal chunk size calculation."""
     processor = DocumentProcessor(chunk_overlap=10)
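
The deleted test_content_size measured token counts for synthetic paragraphs via processor.encoding.encode. That measurement can be reproduced outside the test suite; a minimal sketch, assuming the processor's encoding is a tiktoken encoding (the cl100k_base choice here is an assumption, not taken from the repository):

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # assumed encoding
content = "\n\n".join(
    f"This is paragraph {i} with some content that should be indexed."
    for i in range(10)
)
tokens = encoding.encode(content)
print(f"{len(tokens)} tokens across {len(content)} characters")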

tests/test_watcher.py

Lines changed: 4 additions & 1 deletion
@@ -122,6 +122,7 @@ def verify_content(content: str, timeout: float = 5.0) -> bool:
             if results and content in results[0].content:
                 return True
             time.sleep(0.5)
+        logger.debug(f"Content not found within timeout: {content}")
         return False
 
     with FileWatcher(indexer, [str(tmp_path)], update_delay=0.5):
@@ -133,7 +134,9 @@ def verify_content(content: str, timeout: float = 5.0) -> bool:
             content = f"Content version {i}"
             test_file.write_text(content)
             time.sleep(1.0)  # Wait between updates
-            assert verify_content(content), f"Content not found: {content}"
+            if not verify_content(content):
+                logger.error(f"Failed to verify content: {content}")
+                raise AssertionError(f"Content not found: {content}")
 
         # Verify final state
         results, _, _ = indexer.search("Content version")
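
The verify_content helper above is an instance of a common poll-until-timeout pattern: retry a check at a fixed interval, log when the deadline passes, and let the caller decide whether to fail. A generic sketch of that pattern; wait_for and its arguments are hypothetical names, not part of the repository:

import logging
import time

logger = logging.getLogger(__name__)


def wait_for(predicate, timeout: float = 5.0, interval: float = 0.5) -> bool:
    """Poll a zero-argument callable until it returns True or the timeout elapses."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    logger.debug("Condition not met within %.1fs", timeout)
    return False

A caller uses it the same way the diff uses verify_content: check the return value and raise AssertionError with a descriptive message when it is False.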
