Skip to content

Commit 5f04004

Browse files
committed
fix: don't include duplicate chunks in context assembly (fixes #7)
1 parent 4c9a7dd commit 5f04004

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

gptme_rag/query/context_assembler.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
import logging
12
from dataclasses import dataclass
23

34
import tiktoken
45

56
from ..indexing.document import Document
67

8+
logger = logging.getLogger(__name__)
9+
710

811
@dataclass
912
class ContextWindow:
@@ -40,9 +43,10 @@ def assemble_context(
4043
Assemble a context window from documents, staying within token limit.
4144
4245
Documents should be pre-sorted by relevance.
46+
Duplicate documents will be filtered out and a warning will be logged.
4347
"""
4448
total_tokens = 0
45-
included_docs = []
49+
included_docs: list[Document] = []
4650
context_parts = []
4751
truncated = False
4852

@@ -64,6 +68,11 @@ def assemble_context(
6468
formatted_doc = self._format_document(doc)
6569
doc_tokens = self._count_tokens(formatted_doc)
6670

71+
# check if document content is duplicate
72+
if doc.content in [d.content for d in included_docs]:
73+
logger.warning(f"Duplicate document found: {doc.metadata['source']}")
74+
continue
75+
6776
if total_tokens + doc_tokens > self.max_tokens:
6877
truncated = True
6978
break

0 commit comments

Comments
 (0)