/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/example/modules/PhraseHighlight.pm
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/example/modules/PhraseHighlight.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 adcroft 1.1 #=======================================================================
2     # Phrase Highlighting Code
3     #
4     # $Id: PhraseHighlight.pm,v 1.4 2002/04/06 17:52:39 whmoseley Exp $
5     #=======================================================================
6     package PhraseHighlight;
7     use strict;
8    
9     use constant DEBUG_HIGHLIGHT => 0;
10    
11     sub new {
12     my ( $class, $results, $metaname ) = @_;
13    
14    
15     my $self = bless {
16     results => $results, # just in case we need a method
17     settings=> $results->config('highlight'),
18     metaname=> $metaname,
19     }, $class;
20    
21     # parse out the query into words
22     my $query = $results->extract_query_match;
23    
24    
25     # Do words exist for this layer (all text at this time) and metaname?
26     # This is a reference to an array of phrases and words
27    
28     $self->{description_prop} = $results->config('description_prop') || '';
29    
30    
31    
32     if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
33     eval { require SWISH::Stemmer };
34     if ( $@ ) {
35     $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
36     } else {
37     $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
38     }
39     }
40    
41    
42    
43     my %stopwords = map { $_, 1 } split /\s+/, $results->header('stopwords');
44     $self->{stopwords} = \%stopwords;
45    
46    
47     if ( $query && exists $query->{text}{$metaname} ) {
48     $self->{query} = $query->{text}{$metaname};
49    
50     $self->set_match_regexp;
51     }
52    
53     return $self;
54     }
55    
56     sub highlight {
57     my ( $self, $properties ) = @_;
58    
59    
60     return unless $self->{query};
61    
62     my $phrase_array = $self->{query};
63    
64     my $settings = $self->{settings};
65     my $metaname = $self->{metaname};
66    
67     # Do we care about this meta?
68     return unless exists $settings->{meta_to_prop_map}{$metaname};
69    
70     # Get the related properties
71     my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
72    
73     my %checked;
74    
75     for ( @props ) {
76     if ( $properties->{$_} ) {
77     $checked{$_}++;
78     $self->highlight_text( \$properties->{$_}, $phrase_array );
79     }
80     }
81    
82    
83     # Truncate the description, if not processed.
84     my $description = $self->{description_prop};
85     if ( $description && !$checked{ $description } && $properties->{$description} ) {
86     my $max_words = $settings->{max_words} || 100;
87     my @words = split /\s+/, $properties->{$description};
88     if ( @words > $max_words ) {
89     $properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
90     }
91     }
92    
93     }
94    
95    
96    
97     #==========================================================================
98     #
99    
100     sub highlight_text {
101    
102     my ( $self, $text_ref, $phrase_array ) = @_;
103    
104     my $wc_regexp = $self->{wc_regexp};
105     my $extract_regexp = $self->{extract_regexp};
106    
107    
108     my $last = 0;
109    
110     my $settings = $self->{settings};
111    
112     my $Show_Words = $settings->{show_words} || 10;
113     my $Occurrences = $settings->{occurrences} || 5;
114     my $Max_Words = $settings->{max_words} || 100;
115    
116    
117    
118     my $On = $settings->{highlight_on} || '<b>';
119     my $Off = $settings->{highlight_off} || '</b>';
120    
121     my $on_flag = 'sw' . time . 'on';
122     my $off_flag = 'sw' . time . 'off';
123    
124    
125     my $stemmer_function = $self->{stemmer_function};
126    
127     # Should really call unescapeHTML(), but then would need to escape <b> from escaping.
128    
129     # Split into words. For speed, should work on a stream method.
130     my @words = split /$wc_regexp/, $$text_ref;
131    
132    
133     return 'No Content saved: Check StoreDescription setting' unless @words;
134    
135     my @flags; # This marks where to start and stop display.
136     $flags[$#words] = 0; # Extend array.
137    
138     my $occurrences = $Occurrences ;
139    
140    
141     my $word_pos = $words[0] eq '' ? 2 : 0; # Start depends on if first word was wordcharacters or not
142    
143     my @phrases = @{ $self->{query} };
144    
145     # Remember, that the swish words are every other in @words.
146    
147     WORD:
148     while ( $Show_Words && $word_pos * 2 < @words ) {
149    
150     PHRASE:
151     foreach my $phrase ( @phrases ) {
152    
153     print STDERR " Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
154     next PHRASE if ($word_pos + @$phrase -1) * 2 > @words; # phrase is longer than what's left
155    
156    
157     my $end_pos = 0; # end offset of the current phrase
158    
159     # now compare all the words in the phrase
160    
161     my ( $begin, $word, $end );
162    
163     for my $match_word ( @$phrase ) {
164    
165     my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
166     unless ( $cur_word =~ /$extract_regexp/ ) {
167    
168     my $idx = ($word_pos + $end_pos) * 2;
169     my ( $s, $e ) = ( $idx - 10, $idx + 10 );
170     $s = 0 if $s < 0;
171     $e = @words-1 if $e >= @words;
172    
173    
174     warn "Failed to parse IgnoreFirst/Last from word '"
175     . (defined $cur_word ? $cur_word : '*undef')
176     . "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
177     . scalar @words
178     . "\n-search pharse words-\n"
179     . join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
180     . "\n-Words-\n"
181     . join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
182     . "\n";
183    
184     next PHRASE;
185     }
186    
187    
188    
189    
190     # Strip ignorefirst and ignorelast
191     ( $begin, $word, $end ) = ( $1, $2, $3 ); # this is a waste, as it can operate on the same word over and over
192    
193     my $check_word = lc $word;
194    
195     if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
196     $end_pos++;
197     print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
198     redo if ( $word_pos + $end_pos ) * 2 < @words; # go on to check this match word with the next word.
199    
200     # No more words to match with, so go on to next pharse.
201     next PHRASE;
202     }
203    
204     if ( $stemmer_function ) {
205     my $w = $stemmer_function->($check_word);
206     $check_word = $w if $w;
207     }
208    
209    
210    
211     print STDERR " comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
212    
213     if ( substr( $match_word, -1 ) eq '*' ) {
214     next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;
215    
216     } else {
217     next PHRASE if $check_word ne $match_word;
218     }
219    
220    
221     print STDERR " *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT;
222     $end_pos++;
223     }
224    
225     print STDERR " *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;
226    
227    
228     # We are currently at the end word, so it's easy to set that highlight
229    
230     $end_pos--;
231    
232     if ( !$end_pos ) { # only one word
233     $words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
234     } else {
235     $words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";
236    
237     #Now, reload first word of match
238     $words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
239     # Strip ignorefirst and ignorelast
240     ( $begin, $word, $end ) = ( $1, $2, $3 ); # probably should cache this!
241     $words[$word_pos * 2] = "$begin$on_flag$word$end";
242     }
243    
244    
245     # Now, flag the words around to be shown
246     my $start = ($word_pos - $Show_Words + 1) * 2;
247     my $stop = ($word_pos + $end_pos + $Show_Words - 2) * 2;
248     if ( $start < 0 ) {
249     $stop = $stop - $start;
250     $start = 0;
251     }
252    
253     $stop = $#words if $stop > $#words;
254    
255     $flags[$_]++ for $start .. $stop;
256    
257    
258     # All done, and mark where to stop looking
259     if ( $occurrences-- <= 0 ) {
260     $last = $stop;
261     last WORD;
262     }
263    
264    
265     # Now reset $word_pos to word following
266     $word_pos += $end_pos; # continue will still be executed
267     next WORD;
268     }
269     } continue {
270     $word_pos ++;
271     }
272    
273    
274    
275    
276     my $dotdotdot = ' ... ';
277    
278    
279     my @output;
280    
281     my $printing;
282     my $first = 1;
283     my $some_printed;
284    
285     if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words
286     for my $i ( 0 ..$#words ) {
287    
288    
289     if ( $last && $i >= $last && $i < $#words ) {
290     push @output, $dotdotdot;
291     last;
292     }
293    
294     if ( $flags[$i] ) {
295    
296     push @output, $dotdotdot if !$printing++ && !$first;
297     push @output, $words[$i];
298     $some_printed++;
299    
300     } else {
301     $printing = 0;
302     }
303    
304     $first = 0;
305    
306    
307     }
308     }
309    
310     if ( !$some_printed ) {
311     for my $i ( 0 .. $Max_Words ) {
312     if ( $i > $#words ) {
313     $printing++;
314     last;
315     }
316     push @output, $words[$i];
317     }
318     }
319    
320    
321    
322     push @output, $dotdotdot if !$printing;
323    
324     $$text_ref = join '', @output;
325     my %entities = (
326     '&' => '&amp;',
327     '>' => '&gt;',
328     '<' => '&lt;',
329     '"' => '&quot;',
330     );
331     my %highlight = (
332     $on_flag => $On,
333     $off_flag => $Off,
334     );
335    
336    
337     $$text_ref =~ s/([&"<>])/$entities{$1}/ge;
338    
339     $$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;
340    
341    
342    
343    
344     # $$text_ref = join '', @words; # interesting that this seems reasonably faster
345    
346    
347     }
348    
349     #============================================
350     # Returns compiled regular expressions for matching
351     #
352     #
353    
354     sub set_match_regexp {
355     my $self = shift;
356    
357     my $results = $self->{results};
358    
359    
360     my $wc = $results->header('wordcharacters');
361     my $ignoref = $results->header('ignorefirstchar');
362     my $ignorel = $results->header('ignorelastchar');
363    
364    
365     $wc = quotemeta $wc;
366    
367     #Convert query into regular expressions
368    
369    
370     for ( $ignoref, $ignorel ) {
371     if ( $_ ) {
372     $_ = quotemeta;
373     $_ = "([$_]*)";
374     } else {
375     $_ = '()';
376     }
377     }
378    
379    
380     $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
381    
382    
383     # Now, wait a minute. Look at this more, as I'd hope that making a
384     # qr// go out of scope would release the compiled pattern.
385    
386     if ( $ENV{MOD_PERL} ) {
387     $self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words
388     $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare
389    
390     } else {
391     $self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words
392     $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare
393     }
394     }
395    
396     1;
397    
398    
399    

  ViewVC Help
Powered by ViewVC 1.1.22