@@ -1,25 +1,26 @@
-''' Full-text searcher for headwords/phrases/examples/definitions'''
+""" Full-text searcher for headwords/phrases/examples/definitions"""
 
-from __future__ import absolute_import
-
-import re
+import fnmatch
 import os.path
+import re
 from operator import itemgetter
-import fnmatch
 
 from whoosh import index as wh_index
-from whoosh.fields import Schema, STORED, IDLIST, ID, TEXT
-from whoosh.analysis import StandardAnalyzer, Filter
-from whoosh.query import Variations, Term, Or, And
-from whoosh.qparser import QueryParser, \
-    RangePlugin, BoostPlugin, WildcardPlugin, OperatorsPlugin
-from whoosh.highlight import WholeFragmenter, HtmlFormatter
-from whoosh.collectors import WrappingCollector, \
-    UnlimitedCollector, TopCollector
+from whoosh.analysis import Filter, StandardAnalyzer
+from whoosh.collectors import TopCollector, UnlimitedCollector, WrappingCollector
+from whoosh.fields import ID, IDLIST, STORED, TEXT, Schema
+from whoosh.highlight import HtmlFormatter, WholeFragmenter
+from whoosh.qparser import (
+    BoostPlugin,
+    OperatorsPlugin,
+    QueryParser,
+    RangePlugin,
+    WildcardPlugin,
+)
+from whoosh.query import And, Or, Term, Variations
 
-from .utils.cdb import CDBReader, CDBMaker, CDBError
-from .utils.text import normalize_token, normalize_index_key,\
-    enc_utf8, dec_utf8
+from .utils.cdb import CDBError, CDBMaker, CDBReader
+from .utils.text import dec_utf8, enc_utf8, normalize_index_key, normalize_token
 
 
 class IndexError(Exception):
@@ -46,9 +47,10 @@ def abort(self):
         self._aborted = True
 
 
-#-----------------
+# -----------------
 # Word Variations
-#-----------------
+# -----------------
+
 
 class VariationsReader(object):
     def __init__(self, path):
@@ -68,13 +70,13 @@ def close(self):
         self._reader = None
 
     def get_variations(self, word):
-        r = set((word, ))
+        r = set((word,))
         try:
             s = self._reader[enc_utf8(word)]
         except KeyError:
             return r
 
-        r.update(dec_utf8(w) for w in s.split(b'\0'))
+        r.update(dec_utf8(w) for w in s.split(b"\0"))
         return r
 
 
@@ -83,18 +85,18 @@ def __init__(self, f):
         self._writer = CDBMaker(f)
 
     def add(self, word, variations):
-        self._writer.add(
-            enc_utf8(word),
-            b'\0'.join(enc_utf8(v) for v in variations))
+        self._writer.add(enc_utf8(word), b"\0".join(enc_utf8(v) for v in variations))
 
     def finalize(self):
         self._writer.finalize()
 
 
 def my_variations(var_reader):
     if var_reader:
+
         def f(fieldname, text, boost=1.0):
             return MyVariations(var_reader, fieldname, text, boost)
+
         return f
     else:
         return Term
@@ -116,43 +118,47 @@ def _words(self, ixreader):
             return cache[text]
         else:
             fieldname = self.fieldname
-            words = [word for word in self.__var_reader.get_variations(text)
-                     if (fieldname, word) in ixreader]
+            words = [
+                word
+                for word in self.__var_reader.get_variations(text)
+                if (fieldname, word) in ixreader
+            ]
             cache[text] = words
             return words
 
     def __deepcopy__(self, x):
-        return MyVariations(self.__var_reader,
-                            self.__fieldname, self.__text, self.__boost)
+        return MyVariations(
+            self.__var_reader, self.__fieldname, self.__text, self.__boost
+        )
 
 
-#-----------------
+# -----------------
 # Index Schema
-#-----------------
+# -----------------
+
 
 class _AccentFilter(Filter):
     def __call__(self, tokens):
         for t in tokens:
             t.text = normalize_token(t.text)
             yield t
 
-_stopwords = frozenset(('a', 'an'))
-_analyzer = (StandardAnalyzer(stoplist=_stopwords) | _AccentFilter())
+
+_stopwords = frozenset(("a", "an"))
+_analyzer = StandardAnalyzer(stoplist=_stopwords) | _AccentFilter()
 _schema = Schema(
-    content=TEXT(
-        stored=True,
-        spelling=True,
-        analyzer=_analyzer),
+    content=TEXT(stored=True, spelling=True, analyzer=_analyzer),
     data=STORED,  # tuple (label, path, prio, sortkey)
     itemtype=ID,
-    asfilter=IDLIST
+    asfilter=IDLIST,
 )
-_schema['content'].scorable = False
+_schema["content"].scorable = False
 
 
-#-----------------
+# -----------------
 # Maker
-#-----------------
+# -----------------
+
 
 class Maker(object):
     def __init__(self, index_dir):
@@ -167,13 +173,12 @@ def __init__(self, index_dir):
         self._writer = index.writer()
         self._committed = False
 
-    def add_item(self,
-                 itemtype, content, asfilter, label, path, prio, sortkey):
+    def add_item(self, itemtype, content, asfilter, label, path, prio, sortkey):
         self._writer.add_document(
             itemtype=itemtype,
             content=content,
             asfilter=asfilter,
-            data=(label, path, prio, normalize_index_key(sortkey))
+            data=(label, path, prio, normalize_index_key(sortkey)),
         )
 
     def commit(self):
@@ -189,9 +194,10 @@ def close(self):
         self._writer = None
 
 
-#-----------------
+# -----------------
 # Searcher
-#-----------------
+# -----------------
+
 
 class Searcher(object):
     def __init__(self, index_dir, var_path):
@@ -204,26 +210,33 @@ def __init__(self, index_dir, var_path):
         self._var_reader = self._make_var_reader(var_path)
 
         op = OperatorsPlugin(
-            And=r"\bAND\b|&", Or=None,  # r"\bOR\b|\|",
-            Not=r"\bNOT\b|\s+-", AndMaybe=None, Require=None)
-        parser = QueryParser('content', _schema,
-                             termclass=my_variations(self._var_reader))
+            And=r"\bAND\b|&",
+            Or=None,  # r"\bOR\b|\|",
+            Not=r"\bNOT\b|\s+-",
+            AndMaybe=None,
+            Require=None,
+        )
+        parser = QueryParser(
+            "content", _schema, termclass=my_variations(self._var_reader)
+        )
         parser.remove_plugin_class(RangePlugin)
         parser.remove_plugin_class(BoostPlugin)
         parser.remove_plugin_class(WildcardPlugin)
         parser.replace_plugin(op)
         self._parser = parser
 
-        parser_wild = QueryParser('content', _schema,
-                                  termclass=my_variations(self._var_reader))
+        parser_wild = QueryParser(
+            "content", _schema, termclass=my_variations(self._var_reader)
+        )
         parser_wild.remove_plugin_class(RangePlugin)
         parser_wild.remove_plugin_class(BoostPlugin)
         parser_wild.replace_plugin(op)
         self._parser_wild = parser_wild
 
-        op_filter = OperatorsPlugin(And=r"\bAND\b", Or=r"\bOR\b",
-                                    Not=None, AndMaybe=None, Require=None)
-        asf_parser = QueryParser('asfilter', _schema)
+        op_filter = OperatorsPlugin(
+            And=r"\bAND\b", Or=r"\bOR\b", Not=None, AndMaybe=None, Require=None
+        )
+        asf_parser = QueryParser("asfilter", _schema)
         asf_parser.replace_plugin(op_filter)
         self._asf_parser = asf_parser
 
@@ -257,16 +270,17 @@ def make_collector(self, limit=None):
         else:
             return AbortableCollector(TopCollector(limit))
 
-    def search(self, collector, query_str1=None, query_str2=None,
-               itemtypes=(), highlight=False):
+    def search(
+        self, collector, query_str1=None, query_str2=None, itemtypes=(), highlight=False
+    ):
 
         # rejects '*' and '?'
         if query_str1:
             for kw in (s.strip() for s in query_str1.split()):
                 if not kw.replace("*", "").replace("?", "").strip():
                     return []
 
-        wildcard = (query_str1 and any(c in query_str1 for c in "*?"))
+        wildcard = query_str1 and any(c in query_str1 for c in "*?")
 
         parser = self._parser_wild if wildcard else self._parser
         asf_parser = self._asf_parser
@@ -283,10 +297,9 @@ def search(self, collector, query_str1=None, query_str2=None,
 
         if itemtypes:
             if len(itemtypes) > 1:
-                andlist.append(
-                    Or([Term('itemtype', t) for t in itemtypes]))
+                andlist.append(Or([Term("itemtype", t) for t in itemtypes]))
             else:
-                andlist.append(Term('itemtype', itemtypes[0]))
+                andlist.append(Term("itemtype", itemtypes[0]))
 
         query = And(andlist)
 
@@ -296,7 +309,8 @@ def search(self, collector, query_str1=None, query_str2=None,
         if highlight:
             hits.fragmenter = WholeFragmenter()
             hits.formatter = HtmlFormatter(
-                tagname='span', classname='s_match', termclass='s_term')
+                tagname="span", classname="s_match", termclass="s_term"
+            )
 
         if wildcard and query_str1:
             pat = query_str1.replace("-", "").replace(" ", "")
@@ -307,17 +321,17 @@ def search(self, collector, query_str1=None, query_str2=None,
         for hit in hits:
             if collector.aborted:
                 return []
-            (label, path, prio, sortkey) = hit['data']
+            (label, path, prio, sortkey) = hit["data"]
 
             if wildcard and query_str1:
                 if not wildmatch.match(sortkey):
                     continue
 
             if highlight:
                 if query_str1:
-                    text = hit.highlights('content')
+                    text = hit.highlights("content")
                 else:
-                    text = hit['content']
+                    text = hit["content"]
             else:
                 text = None
 
@@ -328,4 +342,3 @@ def search(self, collector, query_str1=None, query_str2=None,
 
         # Return
         return results
-