example/modules/PhraseHighlight.pm

#=======================================================================
#  Phrase Highlighting Code
#
#    $Id: PhraseHighlight.pm,v 1.4 2002/04/06 17:52:39 whmoseley Exp $
#=======================================================================
package PhraseHighlight;
use strict;

use constant DEBUG_HIGHLIGHT => 0;

sub new {
    my ( $class, $results, $metaname ) = @_;


    my $self = bless {
        results => $results,  # just in case we need a method
        settings=> $results->config('highlight'),
        metaname=> $metaname,
    }, $class;

    # parse out the query into words
    my $query = $results->extract_query_match;


    # Do words exist for this layer (all text at this time) and metaname?
    # This is a reference to an array of phrases and words

    $self->{description_prop} = $results->config('description_prop') || '';


    if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
        eval { require SWISH::Stemmer };
        if ( $@ ) {
            $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
        } else {
            $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
        }
    }


    my %stopwords =  map { $_, 1 } split /\s+/, $results->header('stopwords');
    $self->{stopwords} = \%stopwords;


    if ( $query && exists $query->{text}{$metaname} ) {
        $self->{query} = $query->{text}{$metaname};

        $self->set_match_regexp;
    }

    return $self;
}

sub highlight {
    my ( $self, $properties ) = @_;


    return unless $self->{query};

    my $phrase_array = $self->{query};

    my $settings = $self->{settings};
    my $metaname = $self->{metaname};

    # Do we care about this meta?
    return unless exists $settings->{meta_to_prop_map}{$metaname};

    # Get the related properties
    my @props = @{ $settings->{meta_to_prop_map}{$metaname} };

    my %checked;

    for ( @props ) {
        if ( $properties->{$_} ) {
            $checked{$_}++;
            $self->highlight_text( \$properties->{$_}, $phrase_array );
        }
    }


    # Truncate the description, if not processed.
    my $description = $self->{description_prop};
    if ( $description && !$checked{ $description } && $properties->{$description} ) {
        my $max_words = $settings->{max_words} || 100;
        my @words = split /\s+/, $properties->{$description};
        if ( @words > $max_words ) {
            $properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
        }
    }

}


#==========================================================================
#

sub highlight_text {

    my ( $self, $text_ref, $phrase_array ) = @_;
    
    my $wc_regexp = $self->{wc_regexp};
    my $extract_regexp = $self->{extract_regexp};


    my $last = 0;

    my $settings = $self->{settings};

    my $Show_Words = $settings->{show_words} || 10;
    my $Occurrences = $settings->{occurrences} || 5;
    my $Max_Words = $settings->{max_words} || 100;


    my $On = $settings->{highlight_on} || '<b>';
    my $Off = $settings->{highlight_off} || '</b>';

    my $on_flag  = 'sw' . time . 'on';
    my $off_flag = 'sw' . time . 'off';


    my $stemmer_function = $self->{stemmer_function};

    # Should really call unescapeHTML(), but then would need to escape <b> from escaping.

    # Split into words.  For speed, should work on a stream method.
    my @words = split /$wc_regexp/, $$text_ref;


    return 'No Content saved: Check StoreDescription setting' unless @words;

    my @flags;  # This marks where to start and stop display.
    $flags[$#words] = 0;  # Extend array.

    my $occurrences = $Occurrences ;


    my $word_pos = $words[0] eq '' ? 2 : 0;  # Start depends on if first word was wordcharacters or not

    my @phrases = @{ $self->{query} };

    # Remember, that the swish words are every other in @words.

    WORD:
    while ( $Show_Words && $word_pos * 2 < @words ) {

        PHRASE:
        foreach my $phrase ( @phrases ) {

            print STDERR "  Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
            next PHRASE if ($word_pos + @$phrase -1) * 2 > @words;  # phrase is longer than what's left
            

            my $end_pos = 0;  # end offset of the current phrase

            # now compare all the words in the phrase

            my ( $begin, $word, $end );
            
            for my $match_word ( @$phrase ) {

                my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
                unless ( $cur_word =~ /$extract_regexp/ ) {

                    my $idx = ($word_pos + $end_pos) * 2;
                    my ( $s, $e ) = ( $idx - 10, $idx + 10 );
                    $s = 0 if $s < 0;
                    $e = @words-1 if $e >= @words;
                   
                
                    warn  "Failed to parse IgnoreFirst/Last from word '"
                    . (defined $cur_word ? $cur_word : '*undef')
                    . "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
                    . scalar @words
                    . "\n-search pharse words-\n"
                    . join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
                    . "\n-Words-\n"
                    . join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
                    . "\n";

                    next PHRASE;
                }


                # Strip ignorefirst and ignorelast
                ( $begin, $word, $end ) = ( $1, $2, $3 );  # this is a waste, as it can operate on the same word over and over

                my $check_word = lc $word;

                if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
                    $end_pos++;
                    print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
                    redo if  ( $word_pos + $end_pos ) * 2 < @words;  # go on to check this match word with the next word.

                    # No more words to match with, so go on to next pharse.
                    next PHRASE;
                }

                if ( $stemmer_function ) {
                    my $w = $stemmer_function->($check_word);
                    $check_word = $w if $w;
                }


                print STDERR "     comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
    
                if ( substr( $match_word, -1 ) eq '*' ) {
                    next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;

                } else {
                    next PHRASE if $check_word ne $match_word;
                }


                print STDERR "      *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT;
                $end_pos++;  
            }

            print STDERR "      *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;


            # We are currently at the end word, so it's easy to set that highlight

            $end_pos--;

            if ( !$end_pos ) { # only one word
                $words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
            } else {
                $words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";

                #Now, reload first word of match
                $words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
                # Strip ignorefirst and ignorelast
                ( $begin, $word, $end ) = ( $1, $2, $3 );  # probably should cache this!
                $words[$word_pos * 2] = "$begin$on_flag$word$end";
            }


            # Now, flag the words around to be shown
            my $start = ($word_pos - $Show_Words + 1) * 2;
            my $stop   = ($word_pos + $end_pos + $Show_Words - 2) * 2;
            if ( $start < 0 ) {
                $stop = $stop - $start;
                $start = 0;
            }
            
            $stop = $#words if $stop > $#words;

            $flags[$_]++ for $start .. $stop;


            # All done, and mark where to stop looking
            if ( $occurrences-- <= 0 ) {
                $last = $stop;
                last WORD;
            }


            # Now reset $word_pos to word following
            $word_pos += $end_pos; # continue will still be executed
            next WORD;
        }
    } continue {
        $word_pos ++;
    }


    my $dotdotdot = ' ... ';


    my @output;

    my $printing;
    my $first = 1;
    my $some_printed;

    if ( $Show_Words && @words > 50 ) {  # don't limit context if a small number of words
        for my $i ( 0 ..$#words ) {


            if ( $last && $i >= $last && $i < $#words ) {
                push @output, $dotdotdot;
                last;
            }

            if ( $flags[$i] ) {

                push @output, $dotdotdot if !$printing++ && !$first;
                push @output, $words[$i];
                $some_printed++;

            } else {
                $printing = 0;
            }

        $first = 0;

        
        }
    }

    if ( !$some_printed ) {
        for my $i ( 0 .. $Max_Words ) {
            if ( $i > $#words ) {
                $printing++;
                last;
            }
            push @output, $words[$i];
        }
    }
        
        
    push @output, $dotdotdot if !$printing;

    $$text_ref = join '', @output;
    my %entities = (
        '&' => '&amp;',
        '>' => '&gt;',
        '<' => '&lt;',
        '"' => '&quot;',
    );
    my %highlight = (
        $on_flag => $On,
        $off_flag => $Off,
    );
        

    $$text_ref =~ s/([&"<>])/$entities{$1}/ge;

    $$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;


    # $$text_ref = join '', @words;  # interesting that this seems reasonably faster


}

#============================================
# Returns compiled regular expressions for matching
#
#

sub set_match_regexp {
    my $self = shift;

    my $results = $self->{results};


    my $wc = $results->header('wordcharacters');
    my $ignoref = $results->header('ignorefirstchar');
    my $ignorel = $results->header('ignorelastchar');


    $wc = quotemeta $wc;

    #Convert query into regular expressions


    for ( $ignoref, $ignorel ) {
        if ( $_ ) {
            $_ = quotemeta;
            $_ = "([$_]*)";
        } else {
            $_ = '()';
        }
    }


    $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';  # Warning: dependent on tolower used while indexing


    # Now, wait a minute.  Look at this more, as I'd hope that making a
    # qr// go out of scope would release the compiled pattern.

    if ( $ENV{MOD_PERL} ) {
        $self->{wc_regexp}      = qr/([^$wc]+)/;                     # regexp for splitting into swish-words
        $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i;  # regexp for extracting out the words to compare

     } else {
        $self->{wc_regexp}      = qr/([^$wc]+)/o;                    # regexp for splitting into swish-words
        $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi;  # regexp for extracting out the words to compare
     }
}    

1;


1	adcroft	1.1	#=======================================================================
2			# Phrase Highlighting Code
3			#
4			# $Id: PhraseHighlight.pm,v 1.4 2002/04/06 17:52:39 whmoseley Exp $
5			#=======================================================================
6			package PhraseHighlight;
7			use strict;
8
9			use constant DEBUG_HIGHLIGHT => 0;
10
11			sub new {
12			my ( $class, $results, $metaname ) = @_;
13
14
15			my $self = bless {
16			results => $results, # just in case we need a method
17			settings=> $results->config('highlight'),
18			metaname=> $metaname,
19			}, $class;
20
21			# parse out the query into words
22			my $query = $results->extract_query_match;
23
24
25			# Do words exist for this layer (all text at this time) and metaname?
26			# This is a reference to an array of phrases and words
27
28			$self->{description_prop} = $results->config('description_prop') \|\| '';
29
30
31
32			if ( $results->header('stemming applied') =~ /^(?:1\|yes)$/i ) {
33			eval { require SWISH::Stemmer };
34			if ( $@ ) {
35			$results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
36			} else {
37			$self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
38			}
39			}
40
41
42
43			my %stopwords = map { $_, 1 } split /\s+/, $results->header('stopwords');
44			$self->{stopwords} = \%stopwords;
45
46
47			if ( $query && exists $query->{text}{$metaname} ) {
48			$self->{query} = $query->{text}{$metaname};
49
50			$self->set_match_regexp;
51			}
52
53			return $self;
54			}
55
56			sub highlight {
57			my ( $self, $properties ) = @_;
58
59
60			return unless $self->{query};
61
62			my $phrase_array = $self->{query};
63
64			my $settings = $self->{settings};
65			my $metaname = $self->{metaname};
66
67			# Do we care about this meta?
68			return unless exists $settings->{meta_to_prop_map}{$metaname};
69
70			# Get the related properties
71			my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
72
73			my %checked;
74
75			for ( @props ) {
76			if ( $properties->{$_} ) {
77			$checked{$_}++;
78			$self->highlight_text( \$properties->{$_}, $phrase_array );
79			}
80			}
81
82
83			# Truncate the description, if not processed.
84			my $description = $self->{description_prop};
85			if ( $description && !$checked{ $description } && $properties->{$description} ) {
86			my $max_words = $settings->{max_words} \|\| 100;
87			my @words = split /\s+/, $properties->{$description};
88			if ( @words > $max_words ) {
89			$properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
90			}
91			}
92
93			}
94
95
96
97			#==========================================================================
98			#
99
100			sub highlight_text {
101
102			my ( $self, $text_ref, $phrase_array ) = @_;
103
104			my $wc_regexp = $self->{wc_regexp};
105			my $extract_regexp = $self->{extract_regexp};
106
107
108			my $last = 0;
109
110			my $settings = $self->{settings};
111
112			my $Show_Words = $settings->{show_words} \|\| 10;
113			my $Occurrences = $settings->{occurrences} \|\| 5;
114			my $Max_Words = $settings->{max_words} \|\| 100;
115
116
117
118			my $On = $settings->{highlight_on} \|\| '<b>';
119			my $Off = $settings->{highlight_off} \|\| '</b>';
120
121			my $on_flag = 'sw' . time . 'on';
122			my $off_flag = 'sw' . time . 'off';
123
124
125			my $stemmer_function = $self->{stemmer_function};
126
127			# Should really call unescapeHTML(), but then would need to escape <b> from escaping.
128
129			# Split into words. For speed, should work on a stream method.
130			my @words = split /$wc_regexp/, $$text_ref;
131
132
133			return 'No Content saved: Check StoreDescription setting' unless @words;
134
135			my @flags; # This marks where to start and stop display.
136			$flags[$#words] = 0; # Extend array.
137
138			my $occurrences = $Occurrences ;
139
140
141			my $word_pos = $words[0] eq '' ? 2 : 0; # Start depends on if first word was wordcharacters or not
142
143			my @phrases = @{ $self->{query} };
144
145			# Remember, that the swish words are every other in @words.
146
147			WORD:
148			while ( $Show_Words && $word_pos * 2 < @words ) {
149
150			PHRASE:
151			foreach my $phrase ( @phrases ) {
152
153			print STDERR " Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
154			next PHRASE if ($word_pos + @$phrase -1) * 2 > @words; # phrase is longer than what's left
155
156
157			my $end_pos = 0; # end offset of the current phrase
158
159			# now compare all the words in the phrase
160
161			my ( $begin, $word, $end );
162
163			for my $match_word ( @$phrase ) {
164
165			my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
166			unless ( $cur_word =~ /$extract_regexp/ ) {
167
168			my $idx = ($word_pos + $end_pos) * 2;
169			my ( $s, $e ) = ( $idx - 10, $idx + 10 );
170			$s = 0 if $s < 0;
171			$e = @words-1 if $e >= @words;
172
173
174			warn "Failed to parse IgnoreFirst/Last from word '"
175			. (defined $cur_word ? $cur_word : '*undef')
176			. "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
177			. scalar @words
178			. "\n-search pharse words-\n"
179			. join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
180			. "\n-Words-\n"
181			. join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
182			. "\n";
183
184			next PHRASE;
185			}
186
187
188
189
190			# Strip ignorefirst and ignorelast
191			( $begin, $word, $end ) = ( $1, $2, $3 ); # this is a waste, as it can operate on the same word over and over
192
193			my $check_word = lc $word;
194
195			if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
196			$end_pos++;
197			print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
198			redo if ( $word_pos + $end_pos ) * 2 < @words; # go on to check this match word with the next word.
199
200			# No more words to match with, so go on to next pharse.
201			next PHRASE;
202			}
203
204			if ( $stemmer_function ) {
205			my $w = $stemmer_function->($check_word);
206			$check_word = $w if $w;
207			}
208
209
210
211			print STDERR " comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
212
213			if ( substr( $match_word, -1 ) eq '*' ) {
214			next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;
215
216			} else {
217			next PHRASE if $check_word ne $match_word;
218			}
219
220
221			print STDERR " * Word Matched '$check_word' * \n" if DEBUG_HIGHLIGHT;
222			$end_pos++;
223			}
224
225			print STDERR " * PHRASE MATCHED (word:$word_pos offset:$end_pos) * \n" if DEBUG_HIGHLIGHT;
226
227
228			# We are currently at the end word, so it's easy to set that highlight
229
230			$end_pos--;
231
232			if ( !$end_pos ) { # only one word
233			$words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
234			} else {
235			$words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";
236
237			#Now, reload first word of match
238			$words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
239			# Strip ignorefirst and ignorelast
240			( $begin, $word, $end ) = ( $1, $2, $3 ); # probably should cache this!
241			$words[$word_pos * 2] = "$begin$on_flag$word$end";
242			}
243
244
245			# Now, flag the words around to be shown
246			my $start = ($word_pos - $Show_Words + 1) * 2;
247			my $stop = ($word_pos + $end_pos + $Show_Words - 2) * 2;
248			if ( $start < 0 ) {
249			$stop = $stop - $start;
250			$start = 0;
251			}
252
253			$stop = $#words if $stop > $#words;
254
255			$flags[$_]++ for $start .. $stop;
256
257
258			# All done, and mark where to stop looking
259			if ( $occurrences-- <= 0 ) {
260			$last = $stop;
261			last WORD;
262			}
263
264
265			# Now reset $word_pos to word following
266			$word_pos += $end_pos; # continue will still be executed
267			next WORD;
268			}
269			} continue {
270			$word_pos ++;
271			}
272
273
274
275
276			my $dotdotdot = ' ... ';
277
278
279			my @output;
280
281			my $printing;
282			my $first = 1;
283			my $some_printed;
284
285			if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words
286			for my $i ( 0 ..$#words ) {
287
288
289			if ( $last && $i >= $last && $i < $#words ) {
290			push @output, $dotdotdot;
291			last;
292			}
293
294			if ( $flags[$i] ) {
295
296			push @output, $dotdotdot if !$printing++ && !$first;
297			push @output, $words[$i];
298			$some_printed++;
299
300			} else {
301			$printing = 0;
302			}
303
304			$first = 0;
305
306
307			}
308			}
309
310			if ( !$some_printed ) {
311			for my $i ( 0 .. $Max_Words ) {
312			if ( $i > $#words ) {
313			$printing++;
314			last;
315			}
316			push @output, $words[$i];
317			}
318			}
319
320
321
322			push @output, $dotdotdot if !$printing;
323
324			$$text_ref = join '', @output;
325			my %entities = (
326			'&' => '&',
327			'>' => '>',
328			'<' => '<',
329			'"' => '"',
330			);
331			my %highlight = (
332			$on_flag => $On,
333			$off_flag => $Off,
334			);
335
336
337			$$text_ref =~ s/([&"<>])/$entities{$1}/ge;
338
339			$$text_ref =~ s/($on_flag\|$off_flag)/$highlight{$1}/ge;
340
341
342
343
344			# $$text_ref = join '', @words; # interesting that this seems reasonably faster
345
346
347			}
348
349			#============================================
350			# Returns compiled regular expressions for matching
351			#
352			#
353
354			sub set_match_regexp {
355			my $self = shift;
356
357			my $results = $self->{results};
358
359
360			my $wc = $results->header('wordcharacters');
361			my $ignoref = $results->header('ignorefirstchar');
362			my $ignorel = $results->header('ignorelastchar');
363
364
365			$wc = quotemeta $wc;
366
367			#Convert query into regular expressions
368
369
370			for ( $ignoref, $ignorel ) {
371			if ( $_ ) {
372			$_ = quotemeta;
373			$_ = "([$_]*)";
374			} else {
375			$_ = '()';
376			}
377			}
378
379
380			$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
381
382
383			# Now, wait a minute. Look at this more, as I'd hope that making a
384			# qr// go out of scope would release the compiled pattern.
385
386			if ( $ENV{MOD_PERL} ) {
387			$self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words
388			$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare
389
390			} else {
391			$self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words
392			$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare
393			}
394			}
395
396			1;
397
398
399