1 | # Copyright (C) 2009, 2010, 2011 David Sauve |
---|
2 | # Copyright (C) 2009, 2010 Trapeze |
---|
3 | # 2011: Modifications by LinObject |
---|
4 | |
---|
5 | __author__ = 'David Sauve' |
---|
6 | __version__ = (1, 1, 6, 'beta') |
---|
7 | |
---|
8 | import time |
---|
9 | import datetime |
---|
10 | import cPickle as pickle |
---|
11 | import os |
---|
12 | import re |
---|
13 | import shutil |
---|
14 | import sys |
---|
15 | import warnings |
---|
16 | |
---|
17 | from django.conf import settings |
---|
18 | from django.core.exceptions import ImproperlyConfigured |
---|
19 | from django.utils.encoding import smart_unicode, force_unicode |
---|
20 | |
---|
21 | from haystack.backends import BaseSearchBackend, BaseSearchQuery, SearchNode, log_query |
---|
22 | from haystack.constants import ID, DJANGO_CT, DJANGO_ID |
---|
23 | from haystack.exceptions import HaystackError, MissingDependency, MoreLikeThisError |
---|
24 | from haystack.fields import DateField, DateTimeField, IntegerField, FloatField, BooleanField, MultiValueField |
---|
25 | from haystack.models import SearchResult |
---|
26 | from haystack.utils import get_identifier |
---|
27 | |
---|
28 | try: |
---|
29 | import xapian |
---|
30 | except ImportError: |
---|
31 | raise MissingDependency("The 'xapian' backend requires the installation of 'xapian'. Please refer to the documentation.") |
---|
32 | |
---|
33 | |
---|
# Prefix of the unique per-document identifier term (`Q` + the object's identifier).
DOCUMENT_ID_TERM_PREFIX = 'Q'
# Base prefix of field-specific terms; the upper-cased field name is appended to it.
DOCUMENT_CUSTOM_TERM_PREFIX = 'X'
# Prefix of the content-type term, i.e. 'XCONTENTTYPE'.
DOCUMENT_CT_TERM_PREFIX = DOCUMENT_CUSTOM_TERM_PREFIX + 'CONTENTTYPE'

# Value of `HAYSTACK_XAPIAN_PATH` for which `SearchBackend.__init__` creates no directory.
MEMORY_DB_NAME = ':memory:'

BACKEND_NAME = 'xapian'

# Query parser flags used by `SearchBackend.parse_query` unless
# `settings.HAYSTACK_XAPIAN_FLAGS` overrides them.
DEFAULT_XAPIAN_FLAGS = (
    xapian.QueryParser.FLAG_PHRASE |
    xapian.QueryParser.FLAG_BOOLEAN |
    xapian.QueryParser.FLAG_LOVEHATE |
    xapian.QueryParser.FLAG_WILDCARD |
    xapian.QueryParser.FLAG_PURE_NOT
)
---|
49 | |
---|
50 | |
---|
class InvalidIndexError(HaystackError):
    """Signals that an index could not be opened."""
---|
54 | |
---|
55 | |
---|
class XHValueRangeProcessor(xapian.ValueRangeProcessor):
    """
    Range processor that maps the two halves of a range expression onto a
    value column of the backend schema.
    """

    def __init__(self, backend):
        # Fall back to a freshly constructed backend when none is supplied.
        self.backend = backend or SearchBackend()
        xapian.ValueRangeProcessor.__init__(self)

    @staticmethod
    def _smallest_value(field_type):
        """Return the lower bound for an open range, or None if unknown."""
        if field_type == 'text':
            return u'a'  # TODO: A better way of getting a min text value?
        if field_type == 'long':
            return -sys.maxint - 1
        if field_type == 'float':
            return float('-inf')
        if field_type in ('date', 'datetime'):
            return u'00010101000000'
        return None

    @staticmethod
    def _largest_value(field_type):
        """Return the upper bound for an open range, or None if unknown."""
        if field_type == 'text':
            return u'z' * 100  # TODO: A better way of getting a max text value?
        if field_type == 'long':
            return sys.maxint
        if field_type == 'float':
            return float('inf')
        if field_type in ('date', 'datetime'):
            return u'99990101000000'
        return None

    def __call__(self, begin, end):
        """
        Construct a tuple for value range processing.

        `begin` -- a string in the format '<field_name>:[low_range]'
                   If 'low_range' is omitted, assume the smallest possible value.
        `end` -- a string in the format '[high_range|*]'. If '*', assume
                 the highest possible value.

        Returns a tuple (column, low, high). `column` is
        `xapian.BAD_VALUENO` when `begin` carries no '<field_name>:' part
        or names a field that is not in the backend schema.
        """
        field_name, separator, low = begin.partition(':')
        if not separator:
            # Without a colon there is no field name to resolve; slicing on
            # the -1 returned by `find` would silently drop a character.
            return xapian.BAD_VALUENO, str(begin), str(end)
        begin = low

        for field_dict in self.backend.schema:
            if field_dict['field_name'] != field_name:
                continue

            field_type = field_dict['type']

            # The two ends are resolved independently so that a range which
            # is open on both sides gets both of its bounds filled in.
            if not begin:
                default = self._smallest_value(field_type)
                if default is not None:
                    begin = default
            if end == '*':
                default = self._largest_value(field_type)
                if default is not None:
                    end = default

            if field_type == 'float':
                begin = _marshal_value(float(begin))
                end = _marshal_value(float(end))
            elif field_type == 'long':
                begin = _marshal_value(long(begin))
                end = _marshal_value(long(end))
            return field_dict['column'], str(begin), str(end)

        return xapian.BAD_VALUENO, str(begin), str(end)
---|
101 | |
---|
class XHExpandDecider(xapian.ExpandDecider):
    def __call__(self, term):
        """
        Decide whether `term` may be used to expand the search query.

        Every term is accepted except those that start with
        `DOCUMENT_CT_TERM_PREFIX`, for which False is returned.
        """
        return not term.startswith(DOCUMENT_CT_TERM_PREFIX)
---|
113 | |
---|
114 | |
---|
115 | class SearchBackend(BaseSearchBackend): |
---|
116 | """ |
---|
117 | `SearchBackend` defines the Xapian search backend for use with the Haystack |
---|
118 | API for Django search. |
---|
119 | |
---|
120 | It uses the Xapian Python bindings to interface with Xapian, and as |
---|
121 | such is subject to this bug: <http://trac.xapian.org/ticket/364> when |
---|
122 | Django is running with mod_python or mod_wsgi under Apache. |
---|
123 | |
---|
124 | Until this issue has been fixed by Xapian, it is neccessary to set |
---|
125 | `WSGIApplicationGroup to %{GLOBAL}` when using mod_wsgi, or |
---|
126 | `PythonInterpreter main_interpreter` when using mod_python. |
---|
127 | |
---|
128 | In order to use this backend, `HAYSTACK_XAPIAN_PATH` must be set in |
---|
129 | your settings. This should point to a location where you would your |
---|
130 | indexes to reside. |
---|
131 | """ |
---|
132 | |
---|
133 | inmemory_db = None |
---|
134 | |
---|
def __init__(self, site=None, language=None):
    """
    Set up a `SearchBackend`.

    Optional arguments:
        `site` -- The site to associate the backend with (default = None)
        `language` -- No longer accepted: any truthy value raises
                      `AttributeError`. Use `settings.HAYSTACK_XAPIAN_LANGUAGE`.

    Raises `ImproperlyConfigured` when `HAYSTACK_XAPIAN_PATH` is missing
    from the settings. The index directory is created when it does not
    exist, unless the path equals `MEMORY_DB_NAME`.
    """
    super(SearchBackend, self).__init__(site)

    if not hasattr(settings, 'HAYSTACK_XAPIAN_PATH'):
        raise ImproperlyConfigured('You must specify a HAYSTACK_XAPIAN_PATH in your settings.')

    if language:
        raise AttributeError('Language arg is now deprecated. Please use settings.HAYSTACK_XAPIAN_LANGUAGE instead.')

    index_path = settings.HAYSTACK_XAPIAN_PATH
    stored_on_disk = index_path != MEMORY_DB_NAME
    if stored_on_disk and not os.path.exists(index_path):
        os.makedirs(index_path)

    # Both are filled in lazily by the `schema` / `content_field_name` properties.
    self._schema = None
    self._content_field_name = None
    self.language = getattr(settings, 'HAYSTACK_XAPIAN_LANGUAGE', 'english')
---|
158 | |
---|
@property
def schema(self):
    """
    The schema field list, built from the site's search fields whenever
    the cached value is empty. Building also refreshes the cached content
    field name.
    """
    if self._schema:
        return self._schema
    content_name, field_list = self.build_schema(self.site.all_searchfields())
    self._content_field_name = content_name
    self._schema = field_list
    return self._schema
---|
164 | |
---|
@property
def content_field_name(self):
    """
    The name of the document (content) field, resolved by building the
    schema whenever the cached value is empty. Building also refreshes the
    cached schema.
    """
    if self._content_field_name:
        return self._content_field_name
    content_name, field_list = self.build_schema(self.site.all_searchfields())
    self._content_field_name = content_name
    self._schema = field_list
    return self._content_field_name
---|
170 | |
---|
def update(self, index, iterable):
    """
    Updates the `index` with any objects in `iterable` by adding/updating
    the database as needed.

    Required arguments:
        `index` -- The `SearchIndex` to process
        `iterable` -- An iterable of model instances to index

    For each object in `iterable`, a document is created containing all
    of the terms extracted from `index.full_prepare(obj)` with field prefixes,
    and 'as-is' as needed. Also, if the field type is 'text' it will be
    stemmed and stored with the 'Z' prefix as well.

    eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`

    Each document also contains an extra term in the format:

    `XCONTENTTYPE<app_name>.<model_name>`

    As well as a unique identifier in the format:

    `Q<app_name>.<model_name>.<pk>`

    eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

    This is useful for querying for a specific document corresponding to
    a model instance.

    The document data field holds a pickled tuple of
    `(app_label, module_name, pk, prepared_data)`.

    Finally, the value of every single-valued field is stored in the
    document value slot given by the field's schema `column`, so that it
    can be used for sorting (column zero is the one `build_schema` assigns
    to the document ID field).

    A `UnicodeDecodeError` raised while processing aborts the remainder of
    `iterable`; 'Chunk failed.' is written to stderr and the error is not
    propagated.
    """
    database = self._database(writable=True)
    try:
        # Neither of these depends on the object being indexed, so they are
        # looked up once rather than once per object.
        include_spelling = getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True
        weights = index.get_field_weights()

        for obj in iterable:
            document = xapian.Document()

            term_generator = xapian.TermGenerator()
            term_generator.set_database(database)
            term_generator.set_stemmer(xapian.Stem(self.language))
            if include_spelling:
                term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
            term_generator.set_document(document)

            document_id = DOCUMENT_ID_TERM_PREFIX + get_identifier(obj)
            data = index.full_prepare(obj)

            for field in self.schema:
                field_name = field['field_name']
                if field_name not in data:
                    continue

                prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field_name.upper()
                value = data[field_name]
                try:
                    weight = int(weights[field_name])
                except KeyError:
                    weight = 1

                is_text = field['type'] == 'text'
                single_valued = field['multi_valued'] == 'false'
                if single_valued:
                    raw_terms = (value,)
                elif value is None:
                    # A multi-valued field without a value has nothing to index.
                    raw_terms = ()
                else:
                    raw_terms = value

                for raw_term in raw_terms:
                    term = _marshal_term(raw_term)
                    if is_text:
                        # Free text is indexed both unprefixed and field-prefixed.
                        term_generator.index_text(term, weight)
                        term_generator.index_text(term, weight, prefix)
                    if len(term.split()) == 1:
                        # Single-word values are additionally stored verbatim.
                        document.add_term(term, weight)
                        document.add_term(prefix + term, weight)

                if single_valued:
                    document.add_value(field['column'], _marshal_value(value))

            document.set_data(pickle.dumps(
                (obj._meta.app_label, obj._meta.module_name, obj.pk, data),
                pickle.HIGHEST_PROTOCOL
            ))
            document.add_term(document_id)
            document.add_term(
                DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
                (obj._meta.app_label, obj._meta.module_name)
            )
            database.replace_document(document_id, document)

    except UnicodeDecodeError:
        # Deliberate best effort: report and give up on the rest of the chunk.
        sys.stderr.write('Chunk failed.\n')

    finally:
        # Drop the reference to the writable database.
        database = None
---|
280 | |
---|
def remove(self, obj):
    """
    Delete the index entries of `obj` from the database.

    Every document carrying the term `Q<app_name>.<model_name>.<pk>` is
    deleted; that term should be unique to this object.
    """
    writable_database = self._database(writable=True)
    document_id = DOCUMENT_ID_TERM_PREFIX + get_identifier(obj)
    writable_database.delete_document(document_id)
---|
290 | |
---|
def clear(self, models=None):
    """
    Clear all instances of `models` from the database or all models, if
    not specified.

    Optional Arguments:
        `models` -- Models to clear from the database (default = None,
                    meaning every model)

    If `models` is empty, the whole `HAYSTACK_XAPIAN_PATH` directory is
    removed (when it exists) instead of deleting documents one by one.

    Otherwise, for each model, a `delete_document` call is issued with
    the term `XCONTENTTYPE<app_name>.<model_name>`. This will delete
    all documents with the specified model type.
    """
    if not models:
        # Because there does not appear to be a "clear all" method,
        # it's much quicker to remove the contents of the `HAYSTACK_XAPIAN_PATH`
        # folder than it is to remove each document one at a time.
        # No database handle is opened here, so nothing holds the index
        # open while its directory is being deleted.
        index_path = settings.HAYSTACK_XAPIAN_PATH
        if os.path.exists(index_path):
            shutil.rmtree(index_path)
        return

    database = self._database(writable=True)
    for model in models:
        database.delete_document(
            DOCUMENT_CT_TERM_PREFIX + '%s.%s' %
            (model._meta.app_label, model._meta.module_name)
        )
---|
319 | |
---|
def document_count(self):
    """
    Return the number of documents in the database, or 0 when
    `InvalidIndexError` is raised while obtaining it.
    """
    try:
        total = self._database().get_doccount()
    except InvalidIndexError:
        total = 0
    return total
---|
325 | |
---|
@log_query
def search(self, query, sort_by=None, start_offset=0, end_offset=None,
           fields='', highlight=False, facets=None, date_facets=None,
           query_facets=None, narrow_queries=None, spelling_query=None,
           limit_to_registered_models=True, result_class=None, **kwargs):
    """
    Run the Xapian query `query` and collect the matching documents.

    Required arguments:
        `query` -- Search query to execute

    Optional arguments:
        `sort_by` -- Sort results by specified field (default = None)
        `start_offset` -- Slice results from `start_offset` (default = 0)
        `end_offset` -- Slice results at `end_offset` (default = None), if None, then all documents
        `fields` -- Filter results on `fields` (default = ''); not read by this method
        `highlight` -- Highlight terms in results (default = False)
        `facets` -- Facet results on fields (default = None)
        `date_facets` -- Facet results on date ranges (default = None)
        `query_facets` -- Facet results on queries (default = None)
        `narrow_queries` -- Narrow queries (default = None)
        `spelling_query` -- An optional query to execute spelling suggestion on
        `limit_to_registered_models` -- Limit returned results to models registered in the current `SearchSite` (default = True)
        `result_class` -- Class used to build each result (default = `SearchResult`)

    Returns:
        A dictionary with the following keys:
            `results` -- A list of `result_class` instances
            `hits` -- The total available results
            `facets` -- A dictionary with the keys `fields`, `dates` and
                        `queries`; each stays an empty dictionary unless the
                        corresponding facet argument was given
            `spelling_suggestion` -- The spelling suggestion, or '' when
                        `HAYSTACK_INCLUDE_SPELLING` is not enabled

    An empty `query` short-circuits to a dictionary holding only an empty
    `results` list and a `hits` of 0.

    Field and date facets are computed from the results collected by this
    call, not from the whole index.
    """
    if self.site:
        site = self.site
    else:
        from haystack import site

    if xapian.Query.empty(query):
        return {
            'results': [],
            'hits': 0,
        }

    database = self._database()

    if result_class is None:
        result_class = SearchResult

    spelling_suggestion = ''
    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
        spelling_suggestion = self._do_spelling_suggestion(database, query, spelling_query)

    if narrow_queries is not None:
        # The query must match, together with at least one narrowing query.
        narrowing = [self.parse_query(narrow_query) for narrow_query in narrow_queries]
        query = xapian.Query(
            xapian.Query.OP_AND, query, xapian.Query(xapian.Query.OP_OR, narrowing)
        )

    if limit_to_registered_models:
        registered_models = self.build_registered_models_list()

        if len(registered_models) > 0:
            content_type_queries = [
                xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model))
                for model in registered_models
            ]
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(xapian.Query.OP_OR, content_type_queries)
            )

    enquire = xapian.Enquire(database)
    if hasattr(settings, 'HAYSTACK_XAPIAN_WEIGHTING_SCHEME'):
        enquire.set_weighting_scheme(xapian.BM25Weight(*settings.HAYSTACK_XAPIAN_WEIGHTING_SCHEME))
    enquire.set_query(query)

    if sort_by:
        sorter = xapian.MultiValueSorter()

        for sort_field in sort_by:
            # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311
            descending = sort_field.startswith('-')
            if descending:
                sort_field = sort_field[1:]  # Strip the '-'
            sorter.add(self._value_column(sort_field), descending)

        enquire.set_sort_by_key_then_relevance(sorter, True)

    if not end_offset:
        end_offset = database.get_doccount() - start_offset

    results = []
    for match in self._get_enquire_mset(database, enquire, start_offset, end_offset):
        stored = pickle.loads(self._get_document_data(database, match.document))
        app_label, module_name, pk, model_data = stored
        if highlight:
            content_field = self.content_field_name
            model_data['highlighted'] = {
                content_field: self._do_highlight(model_data.get(content_field), query)
            }
        results.append(
            result_class(app_label, module_name, pk, match.percent, searchsite=site, **model_data)
        )

    facets_dict = {
        'fields': {},
        'dates': {},
        'queries': {},
    }
    if facets:
        facets_dict['fields'] = self._do_field_facets(results, facets)
    if date_facets:
        facets_dict['dates'] = self._do_date_facets(results, date_facets)
    if query_facets:
        facets_dict['queries'] = self._do_query_facets(results, query_facets)

    return {
        'results': results,
        'hits': self._get_hit_count(database, enquire),
        'facets': facets_dict,
        'spelling_suggestion': spelling_suggestion,
    }
---|
463 | |
---|
def more_like_this(self, model_instance, additional_query=None,
                   start_offset=0, end_offset=None,
                   limit_to_registered_models=True, result_class=None, **kwargs):
    """
    Given a model instance, returns a result set of similar documents.

    Required arguments:
        `model_instance` -- The model instance to use as a basis for
                            retrieving similar documents.

    Optional arguments:
        `additional_query` -- An additional query to narrow results
        `start_offset` -- The starting offset (default=0)
        `end_offset` -- The ending offset (default=None), if None, then all documents
        `limit_to_registered_models` -- Limit returned results to models registered in the current `SearchSite` (default = True)
        `result_class` -- Class used to build each result (default = `SearchResult`)

    Returns:
        A dictionary with the following keys:
            `results` -- A list of `result_class` instances
            `hits` -- The total available results
            `facets` -- A dictionary with empty `fields`, `dates` and `queries` entries
            `spelling_suggestion` -- Always None

    Opens a database connection, then builds a simple query using the
    `model_instance` to build the unique identifier.

    For each document retrieved (should always be one), adds an entry into
    an RSet (relevance set) with the document id, then uses the RSet
    to query for an ESet (a set of terms that can be used to suggest
    expansions to the original query), omitting any document that was in
    the original query.

    If `model_instance` is not in the index there is nothing to expand
    from, and an empty result set is returned.

    Finally, processes the resulting matches and returns.
    """
    if not self.site:
        from haystack import site
    else:
        site = self.site

    database = self._database()

    if result_class is None:
        result_class = SearchResult

    instance_term = DOCUMENT_ID_TERM_PREFIX + get_identifier(model_instance)
    query = xapian.Query(instance_term)

    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    rset = xapian.RSet()

    if not end_offset:
        end_offset = database.get_doccount()

    source_match = None
    for match in self._get_enquire_mset(database, enquire, 0, end_offset):
        rset.add_document(match.docid)
        source_match = match

    if source_match is None:
        # The instance has no document in the index; without this guard the
        # code below would fail on the unbound loop variable.
        return {
            'results': [],
            'hits': 0,
            'facets': {
                'fields': {},
                'dates': {},
                'queries': {},
            },
            'spelling_suggestion': None,
        }

    term_count = source_match.document.termlist_count()
    query = xapian.Query(
        xapian.Query.OP_ELITE_SET,
        [expand.term for expand in enquire.get_eset(term_count, rset, XHExpandDecider())],
        term_count
    )
    # Exclude the instance itself from its own list of similar documents.
    query = xapian.Query(
        xapian.Query.OP_AND_NOT, [query, instance_term]
    )
    if limit_to_registered_models:
        registered_models = self.build_registered_models_list()

        if len(registered_models) > 0:
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(
                    xapian.Query.OP_OR, [
                        xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model)) for model in registered_models
                    ]
                )
            )
    if additional_query:
        query = xapian.Query(
            xapian.Query.OP_AND, query, additional_query
        )

    enquire.set_query(query)

    results = []
    matches = self._get_enquire_mset(database, enquire, start_offset, end_offset)

    for match in matches:
        app_label, module_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document))
        results.append(
            result_class(app_label, module_name, pk, match.percent, searchsite=site, **model_data)
        )

    return {
        'results': results,
        'hits': self._get_hit_count(database, enquire),
        'facets': {
            'fields': {},
            'dates': {},
            'queries': {},
        },
        'spelling_suggestion': None,
    }
---|
565 | |
---|
def parse_query(self, query_string):
    """
    Turn `query_string` into a xapian.Query.

    Required arguments:
        ``query_string`` -- A query string to parse

    '*' yields a query built from the empty term (match everything) and
    the empty string yields an empty query (match nothing). Anything else
    goes through a query parser that knows every schema field as a prefix
    and uses the flags from `settings.HAYSTACK_XAPIAN_FLAGS`, falling back
    to `DEFAULT_XAPIAN_FLAGS`.

    Returns a xapian.Query
    """
    if query_string == '*':
        return xapian.Query('')  # Match everything
    if query_string == '':
        return xapian.Query()  # Match nothing

    parser = xapian.QueryParser()
    parser.set_database(self._database())
    parser.set_stemmer(xapian.Stem(self.language))
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    parser.add_boolean_prefix('django_ct', DOCUMENT_CT_TERM_PREFIX)

    for field_dict in self.schema:
        name = field_dict['field_name']
        parser.add_prefix(name, DOCUMENT_CUSTOM_TERM_PREFIX + name.upper())

    parser.add_valuerangeprocessor(XHValueRangeProcessor(self))

    parser_flags = getattr(settings, 'HAYSTACK_XAPIAN_FLAGS', DEFAULT_XAPIAN_FLAGS)
    return parser.parse_query(query_string, parser_flags)
---|
597 | |
---|
def build_schema(self, fields):
    """
    Derive the backend schema from `fields`.

    Required arguments:
        ``fields`` -- A dictionary mapping field names to field instances

    Fields are visited in sorted name order. Each indexed field gets one
    schema entry with the keys `field_name`, `type`, `multi_valued` and
    `column`; columns are numbered consecutively starting at 1, column 0
    being taken by the entry for the document ID field.

    Returns a tuple `(content_field_name, schema_fields)` where
    `content_field_name` is the index name of the document field, or ''
    when no field is flagged as the document.
    """
    # Haystack field types that do not map onto the default 'text' type.
    schema_types = {
        'date': 'date',
        'datetime': 'date',
        'integer': 'long',
        'float': 'float',
        'boolean': 'boolean',
    }

    content_field_name = ''
    schema_fields = [
        {'field_name': ID, 'type': 'text', 'multi_valued': 'false', 'column': 0},
    ]

    for name in sorted(fields):
        field_class = fields[name]

        if field_class.document is True:
            content_field_name = field_class.index_fieldname

        if field_class.indexed is not True:
            continue

        schema_fields.append({
            'field_name': field_class.index_fieldname,
            'type': schema_types.get(field_class.field_type, 'text'),
            'multi_valued': 'true' if field_class.is_multivalued else 'false',
            # The next free column equals the number of entries so far.
            'column': len(schema_fields),
        })

    return (content_field_name, schema_fields)
---|
642 | |
---|
643 | def _do_highlight(self, content, query, tag='em'): |
---|
644 | """ |
---|
645 | Highlight `query` terms in `content` with html `tag`. |
---|
646 | |
---|
647 | This method assumes that the input text (`content`) does not contain |
---|
648 | any special formatting. That is, it does not contain any html tags |
---|
649 | or similar markup that could be screwed up by the highlighting. |
---|
650 | |
---|
651 | Required arguments: |
---|
652 | `content` -- Content to search for instances of `text` |
---|
653 | `text` -- The text to be highlighted |
---|
654 | """ |
---|
655 | for term in query: |
---|
656 | for match in re.findall('[^A-Z]+', term): # Ignore field identifiers |
---|
657 | match_re = re.compile(match, re.I) |
---|
658 | content = match_re.sub('<%s>%s</%s>' % (tag, match, tag), content) |
---|
659 | # remove non highlighted line |
---|
660 | content = "...".join(line for line in content.splitlines() if "<em>" in line) |
---|
661 | return content |
---|
662 | |
---|
663 | def _do_field_facets(self, results, field_facets): |
---|
664 | """ |
---|
665 | Private method that facets a document by field name. |
---|
666 | |
---|
667 | Fields of type MultiValueField will be faceted on each item in the |
---|
668 | (containing) list. |
---|
669 | |
---|
670 | Required arguments: |
---|
671 | `results` -- A list SearchResults to facet |
---|
672 | `field_facets` -- A list of fields to facet on |
---|
673 | """ |
---|
674 | facet_dict = {} |
---|
675 | |
---|
676 | # DS_TODO: Improve this algorithm. Currently, runs in O(N^2), ouch. |
---|
677 | for field in field_facets: |
---|
678 | facet_list = {} |
---|
679 | |
---|
680 | for result in results: |
---|
681 | field_value = getattr(result, field) |
---|
682 | if self._multi_value_field(field): |
---|
683 | for item in field_value: # Facet each item in a MultiValueField |
---|
684 | facet_list[item] = facet_list.get(item, 0) + 1 |
---|
685 | else: |
---|
686 | facet_list[field_value] = facet_list.get(field_value, 0) + 1 |
---|
687 | |
---|
688 | facet_dict[field] = facet_list.items() |
---|
689 | |
---|
690 | return facet_dict |
---|
691 | |
---|
692 | def _do_date_facets(self, results, date_facets): |
---|
693 | """ |
---|
694 | Private method that facets a document by date ranges |
---|
695 | |
---|
696 | Required arguments: |
---|
697 | `results` -- A list SearchResults to facet |
---|
698 | `date_facets` -- A dictionary containing facet parameters: |
---|
699 | {'field': {'start_date': ..., 'end_date': ...: 'gap_by': '...', 'gap_amount': n}} |
---|
700 | nb., gap must be one of the following: |
---|
701 | year|month|day|hour|minute|second |
---|
702 | |
---|
703 | For each date facet field in `date_facets`, generates a list |
---|
704 | of date ranges (from `start_date` to `end_date` by `gap_by`) then |
---|
705 | iterates through `results` and tallies the count for each date_facet. |
---|
706 | |
---|
707 | Returns a dictionary of date facets (fields) containing a list with |
---|
708 | entries for each range and a count of documents matching the range. |
---|
709 | |
---|
710 | eg. { |
---|
711 | 'pub_date': [ |
---|
712 | ('2009-01-01T00:00:00Z', 5), |
---|
713 | ('2009-02-01T00:00:00Z', 0), |
---|
714 | ('2009-03-01T00:00:00Z', 0), |
---|
715 | ('2009-04-01T00:00:00Z', 1), |
---|
716 | ('2009-05-01T00:00:00Z', 2), |
---|
717 | ], |
---|
718 | } |
---|
719 | """ |
---|
720 | facet_dict = {} |
---|
721 | |
---|
722 | for date_facet, facet_params in date_facets.iteritems(): |
---|
723 | gap_type = facet_params.get('gap_by') |
---|
724 | gap_value = facet_params.get('gap_amount', 1) |
---|
725 | date_range = facet_params['start_date'] |
---|
726 | facet_list = [] |
---|
727 | while date_range < facet_params['end_date']: |
---|
728 | facet_list.append((date_range.isoformat(), 0)) |
---|
729 | if gap_type == 'year': |
---|
730 | date_range = date_range.replace( |
---|
731 | year=date_range.year + int(gap_value) |
---|
732 | ) |
---|
733 | elif gap_type == 'month': |
---|
734 | if date_range.month + int(gap_value) > 12: |
---|
735 | date_range = date_range.replace( |
---|
736 | month=((date_range.month + int(gap_value)) % 12), |
---|
737 | year=(date_range.year + (date_range.month + int(gap_value)) / 12) |
---|
738 | ) |
---|
739 | else: |
---|
740 | date_range = date_range.replace( |
---|
741 | month=date_range.month + int(gap_value) |
---|
742 | ) |
---|
743 | elif gap_type == 'day': |
---|
744 | date_range += datetime.timedelta(days=int(gap_value)) |
---|
745 | elif gap_type == 'hour': |
---|
746 | date_range += datetime.timedelta(hours=int(gap_value)) |
---|
747 | elif gap_type == 'minute': |
---|
748 | date_range += datetime.timedelta(minutes=int(gap_value)) |
---|
749 | elif gap_type == 'second': |
---|
750 | date_range += datetime.timedelta(seconds=int(gap_value)) |
---|
751 | |
---|
752 | facet_list = sorted(facet_list, key=lambda n:n[0], reverse=True) |
---|
753 | |
---|
754 | for result in results: |
---|
755 | result_date = getattr(result, date_facet) |
---|
756 | if result_date: |
---|
757 | if not isinstance(result_date, datetime.datetime): |
---|
758 | result_date = datetime.datetime( |
---|
759 | year=result_date.year, |
---|
760 | month=result_date.month, |
---|
761 | day=result_date.day, |
---|
762 | ) |
---|
763 | for n, facet_date in enumerate(facet_list): |
---|
764 | if result_date > datetime.datetime(*(time.strptime(facet_date[0], '%Y-%m-%dT%H:%M:%S')[0:6])): |
---|
765 | facet_list[n] = (facet_list[n][0], (facet_list[n][1] + 1)) |
---|
766 | break |
---|
767 | |
---|
768 | facet_dict[date_facet] = facet_list |
---|
769 | |
---|
770 | return facet_dict |
---|
771 | |
---|
772 | def _do_query_facets(self, results, query_facets): |
---|
773 | """ |
---|
774 | Private method that facets a document by query |
---|
775 | |
---|
776 | Required arguments: |
---|
777 | `results` -- A list SearchResults to facet |
---|
778 | `query_facets` -- A dictionary containing facet parameters: |
---|
779 | {'field': 'query', [...]} |
---|
780 | |
---|
781 | For each query in `query_facets`, generates a dictionary entry with |
---|
782 | the field name as the key and a tuple with the query and result count |
---|
783 | as the value. |
---|
784 | |
---|
785 | eg. {'name': ('a*', 5)} |
---|
786 | """ |
---|
787 | facet_dict = {} |
---|
788 | |
---|
789 | for field, query in query_facets.iteritems(): |
---|
790 | facet_dict[field] = (query, self.search(self.parse_query(query))['hits']) |
---|
791 | |
---|
792 | return facet_dict |
---|
793 | |
---|
794 | def _do_spelling_suggestion(self, database, query, spelling_query): |
---|
795 | """ |
---|
796 | Private method that returns a single spelling suggestion based on |
---|
797 | `spelling_query` or `query`. |
---|
798 | |
---|
799 | Required arguments: |
---|
800 | `database` -- The database to check spelling against |
---|
801 | `query` -- The query to check |
---|
802 | `spelling_query` -- If not None, this will be checked instead of `query` |
---|
803 | |
---|
804 | Returns a string with a suggested spelling |
---|
805 | """ |
---|
806 | if spelling_query: |
---|
807 | if ' ' in spelling_query: |
---|
808 | return ' '.join([database.get_spelling_suggestion(term) for term in spelling_query.split()]) |
---|
809 | else: |
---|
810 | return database.get_spelling_suggestion(spelling_query) |
---|
811 | |
---|
812 | term_set = set() |
---|
813 | for term in query: |
---|
814 | for match in re.findall('[^A-Z]+', term): # Ignore field identifiers |
---|
815 | term_set.add(database.get_spelling_suggestion(match)) |
---|
816 | |
---|
817 | return ' '.join(term_set) |
---|
818 | |
---|
def _database(self, writable=False):
    """
    Private method that hands back a xapian database handle.

    Optional arguments:
        ``writable`` -- Open the database in read/write mode (default=False)

    When `settings.HAYSTACK_XAPIAN_PATH` equals `MEMORY_DB_NAME`, a single
    in-memory database cached on `SearchBackend.inmemory_db` is returned
    regardless of `writable`.

    Returns an instance of a xapian.Database or xapian.WritableDatabase.
    Raises `InvalidIndexError` when a read-only open fails with
    `xapian.DatabaseOpeningError`.
    """
    index_path = settings.HAYSTACK_XAPIAN_PATH

    if index_path == MEMORY_DB_NAME:
        # Create the shared in-memory database on first use only.
        if not SearchBackend.inmemory_db:
            SearchBackend.inmemory_db = xapian.inmemory_open()
        return SearchBackend.inmemory_db

    if writable:
        return xapian.WritableDatabase(index_path, xapian.DB_CREATE_OR_OPEN)

    try:
        return xapian.Database(index_path)
    except xapian.DatabaseOpeningError:
        raise InvalidIndexError(u'Unable to open index at %s' % index_path)
---|
841 | |
---|
842 | def _get_enquire_mset(self, database, enquire, start_offset, end_offset): |
---|
843 | """ |
---|
844 | A safer version of Xapian.enquire.get_mset |
---|
845 | |
---|
846 | Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`, |
---|
847 | attempting a `database.reopen` as needed. |
---|
848 | |
---|
849 | Required arguments: |
---|
850 | `database` -- The database to be read |
---|
851 | `enquire` -- An instance of an Xapian.enquire object |
---|
852 | `start_offset` -- The start offset to pass to `enquire.get_mset` |
---|
853 | `end_offset` -- The end offset to pass to `enquire.get_mset` |
---|
854 | """ |
---|
855 | try: |
---|
856 | return enquire.get_mset(start_offset, end_offset) |
---|
857 | except xapian.DatabaseModifiedError: |
---|
858 | database.reopen() |
---|
859 | return enquire.get_mset(start_offset, end_offset) |
---|
860 | |
---|
861 | def _get_document_data(self, database, document): |
---|
862 | """ |
---|
863 | A safer version of Xapian.document.get_data |
---|
864 | |
---|
865 | Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`, |
---|
866 | attempting a `database.reopen` as needed. |
---|
867 | |
---|
868 | Required arguments: |
---|
869 | `database` -- The database to be read |
---|
870 | `document` -- An instance of an Xapian.document object |
---|
871 | """ |
---|
872 | try: |
---|
873 | return document.get_data() |
---|
874 | except xapian.DatabaseModifiedError: |
---|
875 | database.reopen() |
---|
876 | return document.get_data() |
---|
877 | |
---|
878 | def _get_hit_count(self, database, enquire): |
---|
879 | """ |
---|
880 | Given a database and enquire instance, returns the estimated number |
---|
881 | of matches. |
---|
882 | |
---|
883 | Required arguments: |
---|
884 | `database` -- The database to be queried |
---|
885 | `enquire` -- The enquire instance |
---|
886 | """ |
---|
887 | return self._get_enquire_mset( |
---|
888 | database, enquire, 0, database.get_doccount() |
---|
889 | ).size() |
---|
890 | |
---|
891 | def _value_column(self, field): |
---|
892 | """ |
---|
893 | Private method that returns the column value slot in the database |
---|
894 | for a given field. |
---|
895 | |
---|
896 | Required arguemnts: |
---|
897 | `field` -- The field to lookup |
---|
898 | |
---|
899 | Returns an integer with the column location (0 indexed). |
---|
900 | """ |
---|
901 | for field_dict in self.schema: |
---|
902 | if field_dict['field_name'] == field: |
---|
903 | return field_dict['column'] |
---|
904 | return 0 |
---|
905 | |
---|
906 | def _multi_value_field(self, field): |
---|
907 | """ |
---|
908 | Private method that returns `True` if a field is multi-valued, else |
---|
909 | `False`. |
---|
910 | |
---|
911 | Required arguemnts: |
---|
912 | `field` -- The field to lookup |
---|
913 | |
---|
914 | Returns a boolean value indicating whether the field is multi-valued. |
---|
915 | """ |
---|
916 | for field_dict in self.schema: |
---|
917 | if field_dict['field_name'] == field: |
---|
918 | return field_dict['multi_valued'] == 'true' |
---|
919 | return False |
---|
920 | |
---|
921 | |
---|
class SearchQuery(BaseSearchQuery):
    """
    The Xapian specific version of the SearchQuery class.

    It sits between the ``SearchQuerySet`` and the ``SearchBackend``,
    translating filters into ``xapian.Query`` objects.
    """
    def __init__(self, backend=None, site=None):
        """
        Create a new instance of the SearchQuery bound to `backend`.
        When no backend is supplied, a Xapian `SearchBackend` is created.

        Optional arguments:
            ``backend`` -- The ``SearchBackend`` to use (default = None)
            ``site`` -- The site to use (default = None)
        """
        super(SearchQuery, self).__init__(backend=backend)
        if not backend:
            backend = SearchBackend(site=site)
        self.backend = backend

    def build_params(self, *args, **kwargs):
        """
        Extend the parent's parameters: when `end_offset` is set, it is
        replaced by the distance between `end_offset` and `start_offset`.
        """
        params = super(SearchQuery, self).build_params(*args, **kwargs)

        if self.end_offset is None:
            return params

        params['end_offset'] = self.end_offset - self.start_offset
        return params

    def build_query(self):
        """
        Assemble the final xapian.Query from the filter tree, the model
        restrictions (`self.models`) and the boosts (`self.boost`).

        Returns:
            A xapian.Query
        """
        if self.query_filter:
            query = self._query_from_search_node(self.query_filter)
        else:
            query = xapian.Query('')

        if self.models:
            model_queries = []
            for model in self.models:
                content_type_query = xapian.Query('%s%s.%s' % (
                    DOCUMENT_CT_TERM_PREFIX,
                    model._meta.app_label, model._meta.module_name
                ))
                # Pure boolean sub-query
                model_queries.append(xapian.Query(
                    xapian.Query.OP_SCALE_WEIGHT, content_type_query, 0
                ))
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(xapian.Query.OP_OR, model_queries)
            )

        if self.boost:
            boost_queries = []
            for term, weight in self.boost.iteritems():
                boost_queries.append(xapian.Query(
                    xapian.Query.OP_SCALE_WEIGHT,
                    self._content_field(term, False), weight
                ))
            query = xapian.Query(
                xapian.Query.OP_AND_MAYBE, query,
                xapian.Query(xapian.Query.OP_OR, boost_queries)
            )

        return query

    def _query_from_search_node(self, search_node, is_not=False):
        """
        Recursively convert a `SearchNode` into a xapian.Query.

        Required arguments:
            ``search_node`` -- The node whose children are converted

        Optional arguments:
            ``is_not`` -- Invert the leaf filters of this node (default = False)

        Returns:
            A xapian.Query joining the children with OR when the node's
            connector is 'OR', and with AND otherwise.
        """
        filter_builders = {
            'exact': self._filter_exact,
            'gt': self._filter_gt,
            'gte': self._filter_gte,
            'lt': self._filter_lt,
            'lte': self._filter_lte,
            'startswith': self._filter_startswith,
            'in': self._filter_in,
        }
        subqueries = []

        for child in search_node.children:
            if isinstance(child, SearchNode):
                subqueries.append(
                    self._query_from_search_node(child, child.negated)
                )
                continue

            expression, term = child
            field, filter_type = search_node.split_expression(expression)

            # Handle when we've got a ``ValuesListQuerySet``...
            if hasattr(term, 'values_list'):
                term = list(term)

            if isinstance(term, (list, tuple)):
                term = [_marshal_term(item) for item in term]
            else:
                term = _marshal_term(term)

            if field == 'content':
                subqueries.append(self._content_field(term, is_not))
                continue

            # Unknown filter types contribute nothing to the query.
            build_filter = filter_builders.get(filter_type)
            if build_filter is not None:
                subqueries.append(build_filter(term, field, is_not))

        if search_node.connector == 'OR':
            operator = xapian.Query.OP_OR
        else:
            operator = xapian.Query.OP_AND
        return xapian.Query(operator, subqueries)

    def _negate_if(self, query, is_not):
        """
        Private helper that returns `query` unchanged, or, when `is_not` is
        truthy, the match-everything query AND_NOT `query`.
        """
        if is_not:
            return xapian.Query(
                xapian.Query.OP_AND_NOT, self._all_query(), query
            )
        return query

    def _content_field(self, term, is_not):
        """
        Private method that returns a xapian.Query that searches for `term`
        in the backend's content field.

        Required arguments:
            ``term`` -- The term to search for
            ``is_not`` -- Invert the search results

        Returns:
            A xapian.Query
        """
        if ' ' in term:
            # Several words: match them as a phrase.
            matcher = self._phrase_query(
                term.split(), self.backend.content_field_name
            )
        else:
            matcher = self.backend.parse_query(term)
        return self._negate_if(matcher, is_not)

    def _filter_exact(self, term, field, is_not):
        """
        Private method that returns a xapian.Query that searches for `term`
        in a specified `field`.

        Required arguments:
            ``term`` -- The term to search for
            ``field`` -- The field to search
            ``is_not`` -- Invert the search results

        Returns:
            A xapian.Query
        """
        if ' ' in term:
            matcher = self._phrase_query(term.split(), field)
        else:
            matcher = self._term_query(term, field)
        return self._negate_if(matcher, is_not)

    def _filter_in(self, term_list, field, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        of `term_list` in a specified `field`.

        Required arguments:
            ``term_list`` -- The terms to search for
            ``field`` -- The field to search
            ``is_not`` -- Invert the search results

        Returns:
            A xapian.Query
        """
        alternatives = []
        for candidate in term_list:
            if ' ' in candidate:
                alternatives.append(self._phrase_query(candidate.split(), field))
            else:
                alternatives.append(self._term_query(candidate, field))

        any_of = xapian.Query(xapian.Query.OP_OR, alternatives)
        return self._negate_if(any_of, is_not)

    def _filter_startswith(self, term, field, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        that begins with `term` in a specified `field`.

        Required arguments:
            ``term`` -- The terms to search for
            ``field`` -- The field to search
            ``is_not`` -- Invert the search results

        Returns:
            A xapian.Query
        """
        wildcard = self.backend.parse_query('%s:%s*' % (field, term))
        return self._negate_if(wildcard, is_not)

    def _filter_gt(self, term, field, is_not):
        """
        Private method expressing "greater than" as the inverse of
        `_filter_lte`.
        """
        inverted = (is_not != True)
        return self._filter_lte(term, field, is_not=inverted)

    def _filter_lt(self, term, field, is_not):
        """
        Private method expressing "less than" as the inverse of
        `_filter_gte`.
        """
        inverted = (is_not != True)
        return self._filter_gte(term, field, is_not=inverted)

    def _filter_gte(self, term, field, is_not):
        """
        Private method that returns a value-range xapian.Query for `field`
        starting at `term` with an open upper bound ('*').
        """
        vrp = XHValueRangeProcessor(self.backend)
        pos, begin, end = vrp('%s:%s' % (field, _marshal_value(term)), '*')
        range_query = xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
        return self._negate_if(range_query, is_not)

    def _filter_lte(self, term, field, is_not):
        """
        Private method that returns a value-range xapian.Query for `field`
        ending at `term` with an open lower bound.
        """
        vrp = XHValueRangeProcessor(self.backend)
        pos, begin, end = vrp('%s:' % field, '%s' % _marshal_value(term))
        range_query = xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
        return self._negate_if(range_query, is_not)

    def _all_query(self):
        """
        Private method that returns a xapian.Query that returns all documents.

        Returns:
            A xapian.Query
        """
        return xapian.Query('')

    def _term_query(self, term, field=None):
        """
        Private method that returns a term based xapian.Query that searches
        for `term`.

        Required arguments:
            ``term`` -- The term to search for
            ``field`` -- The field to search (If `None`, all fields)

        Returns:
            A xapian.Query
        """
        stem = xapian.Stem(self.backend.language)

        # The id and content-type fields use their own prefixes and are
        # looked up verbatim, without a stemmed variant.
        if field == 'id':
            return xapian.Query('%s%s' % (DOCUMENT_ID_TERM_PREFIX, term))
        if field == 'django_ct':
            return xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, term))

        if field:
            stemmed = 'Z%s%s%s' % (
                DOCUMENT_CUSTOM_TERM_PREFIX, field.upper(), stem(term)
            )
            unstemmed = '%s%s%s' % (
                DOCUMENT_CUSTOM_TERM_PREFIX, field.upper(), term
            )
        else:
            stemmed = 'Z%s' % stem(term)
            unstemmed = term

        # Match either the stemmed or the literal form of the term.
        stemmed_query = xapian.Query(stemmed)
        unstemmed_query = xapian.Query(unstemmed)
        return xapian.Query(xapian.Query.OP_OR, stemmed_query, unstemmed_query)

    def _phrase_query(self, term_list, field=None):
        """
        Private method that returns a phrase based xapian.Query that searches
        for terms in `term_list`.

        Required arguments:
            ``term_list`` -- The terms to search for
            ``field`` -- The field to search (If `None`, all fields)

        Returns:
            A xapian.Query
        """
        if not field:
            return xapian.Query(xapian.Query.OP_PHRASE, term_list)

        prefixed_terms = [
            '%s%s%s' % (DOCUMENT_CUSTOM_TERM_PREFIX, field.upper(), word)
            for word in term_list
        ]
        return xapian.Query(xapian.Query.OP_PHRASE, prefixed_terms)
---|
1228 | |
---|
1229 | def _marshal_value(value): |
---|
1230 | """ |
---|
1231 | Private utility method that converts Python values to a string for Xapian values. |
---|
1232 | """ |
---|
1233 | if isinstance(value, datetime.datetime): |
---|
1234 | value = _marshal_datetime(value) |
---|
1235 | elif isinstance(value, datetime.date): |
---|
1236 | value = _marshal_date(value) |
---|
1237 | elif isinstance(value, bool): |
---|
1238 | if value: |
---|
1239 | value = u't' |
---|
1240 | else: |
---|
1241 | value = u'f' |
---|
1242 | elif isinstance(value, float): |
---|
1243 | value = xapian.sortable_serialise(value) |
---|
1244 | elif isinstance(value, (int, long)): |
---|
1245 | value = u'%012d' % value |
---|
1246 | else: |
---|
1247 | value = force_unicode(value).lower() |
---|
1248 | return value |
---|
1249 | |
---|
1250 | |
---|
def _marshal_term(term):
    """
    Private utility method that converts Python terms to a string for Xapian terms.

    Datetimes and dates go through their dedicated marshallers; anything
    else is coerced with `force_unicode` and lower-cased.
    """
    if isinstance(term, datetime.datetime):
        return _marshal_datetime(term)
    if isinstance(term, datetime.date):
        return _marshal_date(term)
    return force_unicode(term).lower()
---|
1262 | |
---|
1263 | |
---|
1264 | def _marshal_date(d): |
---|
1265 | return u'%04d%02d%02d000000' % (d.year, d.month, d.day) |
---|
1266 | |
---|
1267 | |
---|
1268 | def _marshal_datetime(dt): |
---|
1269 | if dt.microsecond: |
---|
1270 | return u'%04d%02d%02d%02d%02d%02d%06d' % ( |
---|
1271 | dt.year, dt.month, dt.day, dt.hour, |
---|
1272 | dt.minute, dt.second, dt.microsecond |
---|
1273 | ) |
---|
1274 | else: |
---|
1275 | return u'%04d%02d%02d%02d%02d%02d' % ( |
---|
1276 | dt.year, dt.month, dt.day, dt.hour, |
---|
1277 | dt.minute, dt.second |
---|
1278 | ) |
---|