/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/example/modules/PhraseHighlight.pm
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/example/modules/PhraseHighlight.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 #=======================================================================
2 # Phrase Highlighting Code
3 #
4 # $Id: PhraseHighlight.pm,v 1.4 2002/04/06 17:52:39 whmoseley Exp $
5 #=======================================================================
6 package PhraseHighlight;
7 use strict;
8
9 use constant DEBUG_HIGHLIGHT => 0;
10
11 sub new {
12 my ( $class, $results, $metaname ) = @_;
13
14
15 my $self = bless {
16 results => $results, # just in case we need a method
17 settings=> $results->config('highlight'),
18 metaname=> $metaname,
19 }, $class;
20
21 # parse out the query into words
22 my $query = $results->extract_query_match;
23
24
25 # Do words exist for this layer (all text at this time) and metaname?
26 # This is a reference to an array of phrases and words
27
28 $self->{description_prop} = $results->config('description_prop') || '';
29
30
31
32 if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
33 eval { require SWISH::Stemmer };
34 if ( $@ ) {
35 $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
36 } else {
37 $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
38 }
39 }
40
41
42
43 my %stopwords = map { $_, 1 } split /\s+/, $results->header('stopwords');
44 $self->{stopwords} = \%stopwords;
45
46
47 if ( $query && exists $query->{text}{$metaname} ) {
48 $self->{query} = $query->{text}{$metaname};
49
50 $self->set_match_regexp;
51 }
52
53 return $self;
54 }
55
56 sub highlight {
57 my ( $self, $properties ) = @_;
58
59
60 return unless $self->{query};
61
62 my $phrase_array = $self->{query};
63
64 my $settings = $self->{settings};
65 my $metaname = $self->{metaname};
66
67 # Do we care about this meta?
68 return unless exists $settings->{meta_to_prop_map}{$metaname};
69
70 # Get the related properties
71 my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
72
73 my %checked;
74
75 for ( @props ) {
76 if ( $properties->{$_} ) {
77 $checked{$_}++;
78 $self->highlight_text( \$properties->{$_}, $phrase_array );
79 }
80 }
81
82
83 # Truncate the description, if not processed.
84 my $description = $self->{description_prop};
85 if ( $description && !$checked{ $description } && $properties->{$description} ) {
86 my $max_words = $settings->{max_words} || 100;
87 my @words = split /\s+/, $properties->{$description};
88 if ( @words > $max_words ) {
89 $properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
90 }
91 }
92
93 }
94
95
96
97 #==========================================================================
98 #
99
100 sub highlight_text {
101
102 my ( $self, $text_ref, $phrase_array ) = @_;
103
104 my $wc_regexp = $self->{wc_regexp};
105 my $extract_regexp = $self->{extract_regexp};
106
107
108 my $last = 0;
109
110 my $settings = $self->{settings};
111
112 my $Show_Words = $settings->{show_words} || 10;
113 my $Occurrences = $settings->{occurrences} || 5;
114 my $Max_Words = $settings->{max_words} || 100;
115
116
117
118 my $On = $settings->{highlight_on} || '<b>';
119 my $Off = $settings->{highlight_off} || '</b>';
120
121 my $on_flag = 'sw' . time . 'on';
122 my $off_flag = 'sw' . time . 'off';
123
124
125 my $stemmer_function = $self->{stemmer_function};
126
127 # Should really call unescapeHTML(), but then would need to escape <b> from escaping.
128
129 # Split into words. For speed, should work on a stream method.
130 my @words = split /$wc_regexp/, $$text_ref;
131
132
133 return 'No Content saved: Check StoreDescription setting' unless @words;
134
135 my @flags; # This marks where to start and stop display.
136 $flags[$#words] = 0; # Extend array.
137
138 my $occurrences = $Occurrences ;
139
140
141 my $word_pos = $words[0] eq '' ? 2 : 0; # Start depends on if first word was wordcharacters or not
142
143 my @phrases = @{ $self->{query} };
144
145 # Remember, that the swish words are every other in @words.
146
147 WORD:
148 while ( $Show_Words && $word_pos * 2 < @words ) {
149
150 PHRASE:
151 foreach my $phrase ( @phrases ) {
152
153 print STDERR " Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
154 next PHRASE if ($word_pos + @$phrase -1) * 2 > @words; # phrase is longer than what's left
155
156
157 my $end_pos = 0; # end offset of the current phrase
158
159 # now compare all the words in the phrase
160
161 my ( $begin, $word, $end );
162
163 for my $match_word ( @$phrase ) {
164
165 my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
166 unless ( $cur_word =~ /$extract_regexp/ ) {
167
168 my $idx = ($word_pos + $end_pos) * 2;
169 my ( $s, $e ) = ( $idx - 10, $idx + 10 );
170 $s = 0 if $s < 0;
171 $e = @words-1 if $e >= @words;
172
173
174 warn "Failed to parse IgnoreFirst/Last from word '"
175 . (defined $cur_word ? $cur_word : '*undef')
176 . "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
177 . scalar @words
178 . "\n-search pharse words-\n"
179 . join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
180 . "\n-Words-\n"
181 . join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
182 . "\n";
183
184 next PHRASE;
185 }
186
187
188
189
190 # Strip ignorefirst and ignorelast
191 ( $begin, $word, $end ) = ( $1, $2, $3 ); # this is a waste, as it can operate on the same word over and over
192
193 my $check_word = lc $word;
194
195 if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
196 $end_pos++;
197 print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
198 redo if ( $word_pos + $end_pos ) * 2 < @words; # go on to check this match word with the next word.
199
200 # No more words to match with, so go on to next pharse.
201 next PHRASE;
202 }
203
204 if ( $stemmer_function ) {
205 my $w = $stemmer_function->($check_word);
206 $check_word = $w if $w;
207 }
208
209
210
211 print STDERR " comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
212
213 if ( substr( $match_word, -1 ) eq '*' ) {
214 next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;
215
216 } else {
217 next PHRASE if $check_word ne $match_word;
218 }
219
220
221 print STDERR " *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT;
222 $end_pos++;
223 }
224
225 print STDERR " *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;
226
227
228 # We are currently at the end word, so it's easy to set that highlight
229
230 $end_pos--;
231
232 if ( !$end_pos ) { # only one word
233 $words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
234 } else {
235 $words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";
236
237 #Now, reload first word of match
238 $words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
239 # Strip ignorefirst and ignorelast
240 ( $begin, $word, $end ) = ( $1, $2, $3 ); # probably should cache this!
241 $words[$word_pos * 2] = "$begin$on_flag$word$end";
242 }
243
244
245 # Now, flag the words around to be shown
246 my $start = ($word_pos - $Show_Words + 1) * 2;
247 my $stop = ($word_pos + $end_pos + $Show_Words - 2) * 2;
248 if ( $start < 0 ) {
249 $stop = $stop - $start;
250 $start = 0;
251 }
252
253 $stop = $#words if $stop > $#words;
254
255 $flags[$_]++ for $start .. $stop;
256
257
258 # All done, and mark where to stop looking
259 if ( $occurrences-- <= 0 ) {
260 $last = $stop;
261 last WORD;
262 }
263
264
265 # Now reset $word_pos to word following
266 $word_pos += $end_pos; # continue will still be executed
267 next WORD;
268 }
269 } continue {
270 $word_pos ++;
271 }
272
273
274
275
276 my $dotdotdot = ' ... ';
277
278
279 my @output;
280
281 my $printing;
282 my $first = 1;
283 my $some_printed;
284
285 if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words
286 for my $i ( 0 ..$#words ) {
287
288
289 if ( $last && $i >= $last && $i < $#words ) {
290 push @output, $dotdotdot;
291 last;
292 }
293
294 if ( $flags[$i] ) {
295
296 push @output, $dotdotdot if !$printing++ && !$first;
297 push @output, $words[$i];
298 $some_printed++;
299
300 } else {
301 $printing = 0;
302 }
303
304 $first = 0;
305
306
307 }
308 }
309
310 if ( !$some_printed ) {
311 for my $i ( 0 .. $Max_Words ) {
312 if ( $i > $#words ) {
313 $printing++;
314 last;
315 }
316 push @output, $words[$i];
317 }
318 }
319
320
321
322 push @output, $dotdotdot if !$printing;
323
324 $$text_ref = join '', @output;
325 my %entities = (
326 '&' => '&amp;',
327 '>' => '&gt;',
328 '<' => '&lt;',
329 '"' => '&quot;',
330 );
331 my %highlight = (
332 $on_flag => $On,
333 $off_flag => $Off,
334 );
335
336
337 $$text_ref =~ s/([&"<>])/$entities{$1}/ge;
338
339 $$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;
340
341
342
343
344 # $$text_ref = join '', @words; # interesting that this seems reasonably faster
345
346
347 }
348
349 #============================================
350 # Returns compiled regular expressions for matching
351 #
352 #
353
354 sub set_match_regexp {
355 my $self = shift;
356
357 my $results = $self->{results};
358
359
360 my $wc = $results->header('wordcharacters');
361 my $ignoref = $results->header('ignorefirstchar');
362 my $ignorel = $results->header('ignorelastchar');
363
364
365 $wc = quotemeta $wc;
366
367 #Convert query into regular expressions
368
369
370 for ( $ignoref, $ignorel ) {
371 if ( $_ ) {
372 $_ = quotemeta;
373 $_ = "([$_]*)";
374 } else {
375 $_ = '()';
376 }
377 }
378
379
380 $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
381
382
383 # Now, wait a minute. Look at this more, as I'd hope that making a
384 # qr// go out of scope would release the compiled pattern.
385
386 if ( $ENV{MOD_PERL} ) {
387 $self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words
388 $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare
389
390 } else {
391 $self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words
392 $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare
393 }
394 }
395
396 1;
397
398
399

  ViewVC Help
Powered by ViewVC 1.1.22