forked from django-haystack/django-haystack
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhighlighting.py
More file actions
166 lines (122 loc) · 5.55 KB
/
highlighting.py
File metadata and controls
166 lines (122 loc) · 5.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# encoding: utf-8
from django.utils.html import strip_tags
class Highlighter(object):
css_class = "highlighted"
html_tag = "span"
max_length = 200
text_block = ""
def __init__(self, query, **kwargs):
self.query = query
if "max_length" in kwargs:
self.max_length = int(kwargs["max_length"])
if "html_tag" in kwargs:
self.html_tag = kwargs["html_tag"]
if "css_class" in kwargs:
self.css_class = kwargs["css_class"]
self.query_words = set(
[word.lower() for word in self.query.split() if not word.startswith("-")]
)
def highlight(self, text_block):
self.text_block = strip_tags(text_block)
highlight_locations = self.find_highlightable_words()
start_offset, end_offset = self.find_window(highlight_locations)
return self.render_html(highlight_locations, start_offset, end_offset)
def find_highlightable_words(self):
# Use a set so we only do this once per unique word.
word_positions = {}
# Pre-compute the length.
end_offset = len(self.text_block)
lower_text_block = self.text_block.lower()
for word in self.query_words:
if word not in word_positions:
word_positions[word] = []
start_offset = 0
while start_offset < end_offset:
next_offset = lower_text_block.find(word, start_offset, end_offset)
# If we get a -1 out of find, it wasn't found. Bomb out and
# start the next word.
if next_offset == -1:
break
word_positions[word].append(next_offset)
start_offset = next_offset + len(word)
return word_positions
def find_window(self, highlight_locations):
best_start = 0
best_end = self.max_length
# First, make sure we have words.
if not len(highlight_locations):
return (best_start, best_end)
words_found = []
# Next, make sure we found any words at all.
for word, offset_list in highlight_locations.items():
if len(offset_list):
# Add all of the locations to the list.
words_found.extend(offset_list)
if not len(words_found):
return (best_start, best_end)
if len(words_found) == 1:
return (words_found[0], words_found[0] + self.max_length)
# Sort the list so it's in ascending order.
words_found = sorted(words_found)
# We now have a denormalized list of all positions were a word was
# found. We'll iterate through and find the densest window we can by
# counting the number of found offsets (-1 to fit in the window).
highest_density = 0
if words_found[:-1][0] > self.max_length:
best_start = words_found[:-1][0]
best_end = best_start + self.max_length
for count, start in enumerate(words_found[:-1]):
current_density = 1
for end in words_found[count + 1 :]:
if end - start < self.max_length:
current_density += 1
else:
current_density = 0
# Only replace if we have a bigger (not equal density) so we
# give deference to windows earlier in the document.
if current_density > highest_density:
best_start = start
best_end = start + self.max_length
highest_density = current_density
return (best_start, best_end)
def render_html(self, highlight_locations=None, start_offset=None, end_offset=None):
# Start by chopping the block down to the proper window.
text = self.text_block[start_offset:end_offset]
# Invert highlight_locations to a location -> term list
term_list = []
for term, locations in highlight_locations.items():
term_list += [(loc - start_offset, term) for loc in locations]
loc_to_term = sorted(term_list)
# Prepare the highlight template
if self.css_class:
hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
else:
hl_start = "<%s>" % (self.html_tag)
hl_end = "</%s>" % self.html_tag
# Copy the part from the start of the string to the first match,
# and there replace the match with a highlighted version.
highlighted_chunk = ""
matched_so_far = 0
prev = 0
prev_str = ""
for cur, cur_str in loc_to_term:
# This can be in a different case than cur_str
actual_term = text[cur : cur + len(cur_str)]
# Handle incorrect highlight_locations by first checking for the term
if actual_term.lower() == cur_str:
if cur < prev + len(prev_str):
continue
highlighted_chunk += (
text[prev + len(prev_str) : cur] + hl_start + actual_term + hl_end
)
prev = cur
prev_str = cur_str
# Keep track of how far we've copied so far, for the last step
matched_so_far = cur + len(actual_term)
# Don't forget the chunk after the last term
highlighted_chunk += text[matched_so_far:]
if start_offset > 0:
highlighted_chunk = "...%s" % highlighted_chunk
if end_offset < len(self.text_block):
highlighted_chunk = "%s..." % highlighted_chunk
return highlighted_chunk