1 |
adcroft |
1.1 |
#======================================================================= |
2 |
|
|
# Phrase Highlighting Code |
3 |
|
|
# |
4 |
|
|
# $Id: PhraseHighlight.pm,v 1.4 2002/04/06 17:52:39 whmoseley Exp $ |
5 |
|
|
#======================================================================= |
6 |
|
|
package PhraseHighlight; |
7 |
|
|
use strict; |
8 |
|
|
|
9 |
|
|
use constant DEBUG_HIGHLIGHT => 0; |
10 |
|
|
|
11 |
|
|
sub new { |
12 |
|
|
my ( $class, $results, $metaname ) = @_; |
13 |
|
|
|
14 |
|
|
|
15 |
|
|
my $self = bless { |
16 |
|
|
results => $results, # just in case we need a method |
17 |
|
|
settings=> $results->config('highlight'), |
18 |
|
|
metaname=> $metaname, |
19 |
|
|
}, $class; |
20 |
|
|
|
21 |
|
|
# parse out the query into words |
22 |
|
|
my $query = $results->extract_query_match; |
23 |
|
|
|
24 |
|
|
|
25 |
|
|
# Do words exist for this layer (all text at this time) and metaname? |
26 |
|
|
# This is a reference to an array of phrases and words |
27 |
|
|
|
28 |
|
|
$self->{description_prop} = $results->config('description_prop') || ''; |
29 |
|
|
|
30 |
|
|
|
31 |
|
|
|
32 |
|
|
if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) { |
33 |
|
|
eval { require SWISH::Stemmer }; |
34 |
|
|
if ( $@ ) { |
35 |
|
|
$results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@); |
36 |
|
|
} else { |
37 |
|
|
$self->{stemmer_function} = \&SWISH::Stemmer::SwishStem; |
38 |
|
|
} |
39 |
|
|
} |
40 |
|
|
|
41 |
|
|
|
42 |
|
|
|
43 |
|
|
my %stopwords = map { $_, 1 } split /\s+/, $results->header('stopwords'); |
44 |
|
|
$self->{stopwords} = \%stopwords; |
45 |
|
|
|
46 |
|
|
|
47 |
|
|
if ( $query && exists $query->{text}{$metaname} ) { |
48 |
|
|
$self->{query} = $query->{text}{$metaname}; |
49 |
|
|
|
50 |
|
|
$self->set_match_regexp; |
51 |
|
|
} |
52 |
|
|
|
53 |
|
|
return $self; |
54 |
|
|
} |
55 |
|
|
|
56 |
|
|
sub highlight { |
57 |
|
|
my ( $self, $properties ) = @_; |
58 |
|
|
|
59 |
|
|
|
60 |
|
|
return unless $self->{query}; |
61 |
|
|
|
62 |
|
|
my $phrase_array = $self->{query}; |
63 |
|
|
|
64 |
|
|
my $settings = $self->{settings}; |
65 |
|
|
my $metaname = $self->{metaname}; |
66 |
|
|
|
67 |
|
|
# Do we care about this meta? |
68 |
|
|
return unless exists $settings->{meta_to_prop_map}{$metaname}; |
69 |
|
|
|
70 |
|
|
# Get the related properties |
71 |
|
|
my @props = @{ $settings->{meta_to_prop_map}{$metaname} }; |
72 |
|
|
|
73 |
|
|
my %checked; |
74 |
|
|
|
75 |
|
|
for ( @props ) { |
76 |
|
|
if ( $properties->{$_} ) { |
77 |
|
|
$checked{$_}++; |
78 |
|
|
$self->highlight_text( \$properties->{$_}, $phrase_array ); |
79 |
|
|
} |
80 |
|
|
} |
81 |
|
|
|
82 |
|
|
|
83 |
|
|
# Truncate the description, if not processed. |
84 |
|
|
my $description = $self->{description_prop}; |
85 |
|
|
if ( $description && !$checked{ $description } && $properties->{$description} ) { |
86 |
|
|
my $max_words = $settings->{max_words} || 100; |
87 |
|
|
my @words = split /\s+/, $properties->{$description}; |
88 |
|
|
if ( @words > $max_words ) { |
89 |
|
|
$properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>'; |
90 |
|
|
} |
91 |
|
|
} |
92 |
|
|
|
93 |
|
|
} |
94 |
|
|
|
95 |
|
|
|
96 |
|
|
|
97 |
|
|
#========================================================================== |
98 |
|
|
# |
99 |
|
|
|
100 |
|
|
sub highlight_text { |
101 |
|
|
|
102 |
|
|
my ( $self, $text_ref, $phrase_array ) = @_; |
103 |
|
|
|
104 |
|
|
my $wc_regexp = $self->{wc_regexp}; |
105 |
|
|
my $extract_regexp = $self->{extract_regexp}; |
106 |
|
|
|
107 |
|
|
|
108 |
|
|
my $last = 0; |
109 |
|
|
|
110 |
|
|
my $settings = $self->{settings}; |
111 |
|
|
|
112 |
|
|
my $Show_Words = $settings->{show_words} || 10; |
113 |
|
|
my $Occurrences = $settings->{occurrences} || 5; |
114 |
|
|
my $Max_Words = $settings->{max_words} || 100; |
115 |
|
|
|
116 |
|
|
|
117 |
|
|
|
118 |
|
|
my $On = $settings->{highlight_on} || '<b>'; |
119 |
|
|
my $Off = $settings->{highlight_off} || '</b>'; |
120 |
|
|
|
121 |
|
|
my $on_flag = 'sw' . time . 'on'; |
122 |
|
|
my $off_flag = 'sw' . time . 'off'; |
123 |
|
|
|
124 |
|
|
|
125 |
|
|
my $stemmer_function = $self->{stemmer_function}; |
126 |
|
|
|
127 |
|
|
# Should really call unescapeHTML(), but then would need to escape <b> from escaping. |
128 |
|
|
|
129 |
|
|
# Split into words. For speed, should work on a stream method. |
130 |
|
|
my @words = split /$wc_regexp/, $$text_ref; |
131 |
|
|
|
132 |
|
|
|
133 |
|
|
return 'No Content saved: Check StoreDescription setting' unless @words; |
134 |
|
|
|
135 |
|
|
my @flags; # This marks where to start and stop display. |
136 |
|
|
$flags[$#words] = 0; # Extend array. |
137 |
|
|
|
138 |
|
|
my $occurrences = $Occurrences ; |
139 |
|
|
|
140 |
|
|
|
141 |
|
|
my $word_pos = $words[0] eq '' ? 2 : 0; # Start depends on if first word was wordcharacters or not |
142 |
|
|
|
143 |
|
|
my @phrases = @{ $self->{query} }; |
144 |
|
|
|
145 |
|
|
# Remember, that the swish words are every other in @words. |
146 |
|
|
|
147 |
|
|
WORD: |
148 |
|
|
while ( $Show_Words && $word_pos * 2 < @words ) { |
149 |
|
|
|
150 |
|
|
PHRASE: |
151 |
|
|
foreach my $phrase ( @phrases ) { |
152 |
|
|
|
153 |
|
|
print STDERR " Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT; |
154 |
|
|
next PHRASE if ($word_pos + @$phrase -1) * 2 > @words; # phrase is longer than what's left |
155 |
|
|
|
156 |
|
|
|
157 |
|
|
my $end_pos = 0; # end offset of the current phrase |
158 |
|
|
|
159 |
|
|
# now compare all the words in the phrase |
160 |
|
|
|
161 |
|
|
my ( $begin, $word, $end ); |
162 |
|
|
|
163 |
|
|
for my $match_word ( @$phrase ) { |
164 |
|
|
|
165 |
|
|
my $cur_word = $words[ ($word_pos + $end_pos) * 2 ]; |
166 |
|
|
unless ( $cur_word =~ /$extract_regexp/ ) { |
167 |
|
|
|
168 |
|
|
my $idx = ($word_pos + $end_pos) * 2; |
169 |
|
|
my ( $s, $e ) = ( $idx - 10, $idx + 10 ); |
170 |
|
|
$s = 0 if $s < 0; |
171 |
|
|
$e = @words-1 if $e >= @words; |
172 |
|
|
|
173 |
|
|
|
174 |
|
|
warn "Failed to parse IgnoreFirst/Last from word '" |
175 |
|
|
. (defined $cur_word ? $cur_word : '*undef') |
176 |
|
|
. "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:" |
177 |
|
|
. scalar @words |
178 |
|
|
. "\n-search pharse words-\n" |
179 |
|
|
. join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 ) |
180 |
|
|
. "\n-Words-\n" |
181 |
|
|
. join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e ) |
182 |
|
|
. "\n"; |
183 |
|
|
|
184 |
|
|
next PHRASE; |
185 |
|
|
} |
186 |
|
|
|
187 |
|
|
|
188 |
|
|
|
189 |
|
|
|
190 |
|
|
# Strip ignorefirst and ignorelast |
191 |
|
|
( $begin, $word, $end ) = ( $1, $2, $3 ); # this is a waste, as it can operate on the same word over and over |
192 |
|
|
|
193 |
|
|
my $check_word = lc $word; |
194 |
|
|
|
195 |
|
|
if ( $end_pos && exists $self->{stopwords}{$check_word} ) { |
196 |
|
|
$end_pos++; |
197 |
|
|
print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT; |
198 |
|
|
redo if ( $word_pos + $end_pos ) * 2 < @words; # go on to check this match word with the next word. |
199 |
|
|
|
200 |
|
|
# No more words to match with, so go on to next pharse. |
201 |
|
|
next PHRASE; |
202 |
|
|
} |
203 |
|
|
|
204 |
|
|
if ( $stemmer_function ) { |
205 |
|
|
my $w = $stemmer_function->($check_word); |
206 |
|
|
$check_word = $w if $w; |
207 |
|
|
} |
208 |
|
|
|
209 |
|
|
|
210 |
|
|
|
211 |
|
|
print STDERR " comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT; |
212 |
|
|
|
213 |
|
|
if ( substr( $match_word, -1 ) eq '*' ) { |
214 |
|
|
next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0; |
215 |
|
|
|
216 |
|
|
} else { |
217 |
|
|
next PHRASE if $check_word ne $match_word; |
218 |
|
|
} |
219 |
|
|
|
220 |
|
|
|
221 |
|
|
print STDERR " *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT; |
222 |
|
|
$end_pos++; |
223 |
|
|
} |
224 |
|
|
|
225 |
|
|
print STDERR " *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT; |
226 |
|
|
|
227 |
|
|
|
228 |
|
|
# We are currently at the end word, so it's easy to set that highlight |
229 |
|
|
|
230 |
|
|
$end_pos--; |
231 |
|
|
|
232 |
|
|
if ( !$end_pos ) { # only one word |
233 |
|
|
$words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end"; |
234 |
|
|
} else { |
235 |
|
|
$words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end"; |
236 |
|
|
|
237 |
|
|
#Now, reload first word of match |
238 |
|
|
$words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?"; |
239 |
|
|
# Strip ignorefirst and ignorelast |
240 |
|
|
( $begin, $word, $end ) = ( $1, $2, $3 ); # probably should cache this! |
241 |
|
|
$words[$word_pos * 2] = "$begin$on_flag$word$end"; |
242 |
|
|
} |
243 |
|
|
|
244 |
|
|
|
245 |
|
|
# Now, flag the words around to be shown |
246 |
|
|
my $start = ($word_pos - $Show_Words + 1) * 2; |
247 |
|
|
my $stop = ($word_pos + $end_pos + $Show_Words - 2) * 2; |
248 |
|
|
if ( $start < 0 ) { |
249 |
|
|
$stop = $stop - $start; |
250 |
|
|
$start = 0; |
251 |
|
|
} |
252 |
|
|
|
253 |
|
|
$stop = $#words if $stop > $#words; |
254 |
|
|
|
255 |
|
|
$flags[$_]++ for $start .. $stop; |
256 |
|
|
|
257 |
|
|
|
258 |
|
|
# All done, and mark where to stop looking |
259 |
|
|
if ( $occurrences-- <= 0 ) { |
260 |
|
|
$last = $stop; |
261 |
|
|
last WORD; |
262 |
|
|
} |
263 |
|
|
|
264 |
|
|
|
265 |
|
|
# Now reset $word_pos to word following |
266 |
|
|
$word_pos += $end_pos; # continue will still be executed |
267 |
|
|
next WORD; |
268 |
|
|
} |
269 |
|
|
} continue { |
270 |
|
|
$word_pos ++; |
271 |
|
|
} |
272 |
|
|
|
273 |
|
|
|
274 |
|
|
|
275 |
|
|
|
276 |
|
|
my $dotdotdot = ' ... '; |
277 |
|
|
|
278 |
|
|
|
279 |
|
|
my @output; |
280 |
|
|
|
281 |
|
|
my $printing; |
282 |
|
|
my $first = 1; |
283 |
|
|
my $some_printed; |
284 |
|
|
|
285 |
|
|
if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words |
286 |
|
|
for my $i ( 0 ..$#words ) { |
287 |
|
|
|
288 |
|
|
|
289 |
|
|
if ( $last && $i >= $last && $i < $#words ) { |
290 |
|
|
push @output, $dotdotdot; |
291 |
|
|
last; |
292 |
|
|
} |
293 |
|
|
|
294 |
|
|
if ( $flags[$i] ) { |
295 |
|
|
|
296 |
|
|
push @output, $dotdotdot if !$printing++ && !$first; |
297 |
|
|
push @output, $words[$i]; |
298 |
|
|
$some_printed++; |
299 |
|
|
|
300 |
|
|
} else { |
301 |
|
|
$printing = 0; |
302 |
|
|
} |
303 |
|
|
|
304 |
|
|
$first = 0; |
305 |
|
|
|
306 |
|
|
|
307 |
|
|
} |
308 |
|
|
} |
309 |
|
|
|
310 |
|
|
if ( !$some_printed ) { |
311 |
|
|
for my $i ( 0 .. $Max_Words ) { |
312 |
|
|
if ( $i > $#words ) { |
313 |
|
|
$printing++; |
314 |
|
|
last; |
315 |
|
|
} |
316 |
|
|
push @output, $words[$i]; |
317 |
|
|
} |
318 |
|
|
} |
319 |
|
|
|
320 |
|
|
|
321 |
|
|
|
322 |
|
|
push @output, $dotdotdot if !$printing; |
323 |
|
|
|
324 |
|
|
$$text_ref = join '', @output; |
325 |
|
|
my %entities = ( |
326 |
|
|
'&' => '&', |
327 |
|
|
'>' => '>', |
328 |
|
|
'<' => '<', |
329 |
|
|
'"' => '"', |
330 |
|
|
); |
331 |
|
|
my %highlight = ( |
332 |
|
|
$on_flag => $On, |
333 |
|
|
$off_flag => $Off, |
334 |
|
|
); |
335 |
|
|
|
336 |
|
|
|
337 |
|
|
$$text_ref =~ s/([&"<>])/$entities{$1}/ge; |
338 |
|
|
|
339 |
|
|
$$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge; |
340 |
|
|
|
341 |
|
|
|
342 |
|
|
|
343 |
|
|
|
344 |
|
|
# $$text_ref = join '', @words; # interesting that this seems reasonably faster |
345 |
|
|
|
346 |
|
|
|
347 |
|
|
} |
348 |
|
|
|
349 |
|
|
#============================================ |
350 |
|
|
# Returns compiled regular expressions for matching |
351 |
|
|
# |
352 |
|
|
# |
353 |
|
|
|
354 |
|
|
sub set_match_regexp { |
355 |
|
|
my $self = shift; |
356 |
|
|
|
357 |
|
|
my $results = $self->{results}; |
358 |
|
|
|
359 |
|
|
|
360 |
|
|
my $wc = $results->header('wordcharacters'); |
361 |
|
|
my $ignoref = $results->header('ignorefirstchar'); |
362 |
|
|
my $ignorel = $results->header('ignorelastchar'); |
363 |
|
|
|
364 |
|
|
|
365 |
|
|
$wc = quotemeta $wc; |
366 |
|
|
|
367 |
|
|
#Convert query into regular expressions |
368 |
|
|
|
369 |
|
|
|
370 |
|
|
for ( $ignoref, $ignorel ) { |
371 |
|
|
if ( $_ ) { |
372 |
|
|
$_ = quotemeta; |
373 |
|
|
$_ = "([$_]*)"; |
374 |
|
|
} else { |
375 |
|
|
$_ = '()'; |
376 |
|
|
} |
377 |
|
|
} |
378 |
|
|
|
379 |
|
|
|
380 |
|
|
$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing |
381 |
|
|
|
382 |
|
|
|
383 |
|
|
# Now, wait a minute. Look at this more, as I'd hope that making a |
384 |
|
|
# qr// go out of scope would release the compiled pattern. |
385 |
|
|
|
386 |
|
|
if ( $ENV{MOD_PERL} ) { |
387 |
|
|
$self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words |
388 |
|
|
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare |
389 |
|
|
|
390 |
|
|
} else { |
391 |
|
|
$self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words |
392 |
|
|
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare |
393 |
|
|
} |
394 |
|
|
} |
395 |
|
|
|
396 |
|
|
1; |
397 |
|
|
|
398 |
|
|
|
399 |
|
|
|