File tree 1 file changed +10
-1
lines changed
1 file changed +10
-1
lines changed Original file line number Diff line number Diff line change
1
+ import logging
1
2
from dataclasses import dataclass
2
3
3
4
import tiktoken
4
5
5
6
from ..indexing .document import Document
6
7
8
+ logger = logging .getLogger (__name__ )
9
+
7
10
8
11
@dataclass
9
12
class ContextWindow :
@@ -40,9 +43,10 @@ def assemble_context(
40
43
Assemble a context window from documents, staying within token limit.
41
44
42
45
Documents should be pre-sorted by relevance.
46
+ Documents whose content duplicates an already-included document are skipped, and a warning is logged for each one.
43
47
"""
44
48
total_tokens = 0
45
- included_docs = []
49
+ included_docs : list [ Document ] = []
46
50
context_parts = []
47
51
truncated = False
48
52
@@ -64,6 +68,11 @@ def assemble_context(
64
68
formatted_doc = self ._format_document (doc )
65
69
doc_tokens = self ._count_tokens (formatted_doc )
66
70
71
+ # Skip any document whose content exactly matches one that is already included.
72
+ if doc .content in [d .content for d in included_docs ]:
73
+ logger .warning (f"Duplicate document found: { doc .metadata ['source' ]} " )
74
+ continue
75
+
67
76
if total_tokens + doc_tokens > self .max_tokens :
68
77
truncated = True
69
78
break
You can’t perform that action at this time.
0 commit comments