#======================================================================= # Phrase Highlighting Code # # $Id: PhraseHighlight.pm,v 1.1.1.1 2002/09/20 19:47:30 adcroft Exp $ #======================================================================= package PhraseHighlight; use strict; use constant DEBUG_HIGHLIGHT => 0; sub new { my ( $class, $results, $metaname ) = @_; my $self = bless { results => $results, # just in case we need a method settings=> $results->config('highlight'), metaname=> $metaname, }, $class; # parse out the query into words my $query = $results->extract_query_match; # Do words exist for this layer (all text at this time) and metaname? # This is a reference to an array of phrases and words $self->{description_prop} = $results->config('description_prop') || ''; if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) { eval { require SWISH::Stemmer }; if ( $@ ) { $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@); } else { $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem; } } my %stopwords = map { $_, 1 } split /\s+/, $results->header('stopwords'); $self->{stopwords} = \%stopwords; if ( $query && exists $query->{text}{$metaname} ) { $self->{query} = $query->{text}{$metaname}; $self->set_match_regexp; } return $self; } sub highlight { my ( $self, $properties ) = @_; return unless $self->{query}; my $phrase_array = $self->{query}; my $settings = $self->{settings}; my $metaname = $self->{metaname}; # Do we care about this meta? return unless exists $settings->{meta_to_prop_map}{$metaname}; # Get the related properties my @props = @{ $settings->{meta_to_prop_map}{$metaname} }; my %checked; for ( @props ) { if ( $properties->{$_} ) { $checked{$_}++; $self->highlight_text( \$properties->{$_}, $phrase_array ); } } # Truncate the description, if not processed. my $description = $self->{description_prop}; if ( $description && !$checked{ $description } && $properties->{$description} ) { my $max_words = $settings->{max_words} || 100; my @words = split /\s+/, $properties->{$description}; if ( @words > $max_words ) { $properties->{$description} = join ' ', @words[0..$max_words], '...'; } } } #========================================================================== # sub highlight_text { my ( $self, $text_ref, $phrase_array ) = @_; my $wc_regexp = $self->{wc_regexp}; my $extract_regexp = $self->{extract_regexp}; my $last = 0; my $settings = $self->{settings}; my $Show_Words = $settings->{show_words} || 10; my $Occurrences = $settings->{occurrences} || 5; my $Max_Words = $settings->{max_words} || 100; my $On = $settings->{highlight_on} || ''; my $Off = $settings->{highlight_off} || ''; my $on_flag = 'sw' . time . 'on'; my $off_flag = 'sw' . time . 'off'; my $stemmer_function = $self->{stemmer_function}; # Should really call unescapeHTML(), but then would need to escape from escaping. # Split into words. For speed, should work on a stream method. my @words = split /$wc_regexp/, $$text_ref; return 'No Content saved: Check StoreDescription setting' unless @words; my @flags; # This marks where to start and stop display. $flags[$#words] = 0; # Extend array. my $occurrences = $Occurrences ; my $word_pos = $words[0] eq '' ? 2 : 0; # Start depends on if first word was wordcharacters or not my @phrases = @{ $self->{query} }; # Remember, that the swish words are every other in @words. WORD: while ( $Show_Words && $word_pos * 2 < @words ) { PHRASE: foreach my $phrase ( @phrases ) { print STDERR " Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT; next PHRASE if ($word_pos + @$phrase -1) * 2 > @words; # phrase is longer than what's left my $end_pos = 0; # end offset of the current phrase # now compare all the words in the phrase my ( $begin, $word, $end ); for my $match_word ( @$phrase ) { my $cur_word = $words[ ($word_pos + $end_pos) * 2 ]; unless ( $cur_word =~ /$extract_regexp/ ) { my $idx = ($word_pos + $end_pos) * 2; my ( $s, $e ) = ( $idx - 10, $idx + 10 ); $s = 0 if $s < 0; $e = @words-1 if $e >= @words; warn "Failed to parse IgnoreFirst/Last from word '" . (defined $cur_word ? $cur_word : '*undef') . "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:" . scalar @words . "\n-search pharse words-\n" . join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 ) . "\n-Words-\n" . join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e ) . "\n"; next PHRASE; } # Strip ignorefirst and ignorelast ( $begin, $word, $end ) = ( $1, $2, $3 ); # this is a waste, as it can operate on the same word over and over my $check_word = lc $word; if ( $end_pos && exists $self->{stopwords}{$check_word} ) { $end_pos++; print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT; redo if ( $word_pos + $end_pos ) * 2 < @words; # go on to check this match word with the next word. # No more words to match with, so go on to next pharse. next PHRASE; } if ( $stemmer_function ) { my $w = $stemmer_function->($check_word); $check_word = $w if $w; } print STDERR " comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT; if ( substr( $match_word, -1 ) eq '*' ) { next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0; } else { next PHRASE if $check_word ne $match_word; } print STDERR " *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT; $end_pos++; } print STDERR " *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT; # We are currently at the end word, so it's easy to set that highlight $end_pos--; if ( !$end_pos ) { # only one word $words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end"; } else { $words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end"; #Now, reload first word of match $words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?"; # Strip ignorefirst and ignorelast ( $begin, $word, $end ) = ( $1, $2, $3 ); # probably should cache this! $words[$word_pos * 2] = "$begin$on_flag$word$end"; } # Now, flag the words around to be shown my $start = ($word_pos - $Show_Words + 1) * 2; my $stop = ($word_pos + $end_pos + $Show_Words - 2) * 2; if ( $start < 0 ) { $stop = $stop - $start; $start = 0; } $stop = $#words if $stop > $#words; $flags[$_]++ for $start .. $stop; # All done, and mark where to stop looking if ( $occurrences-- <= 0 ) { $last = $stop; last WORD; } # Now reset $word_pos to word following $word_pos += $end_pos; # continue will still be executed next WORD; } } continue { $word_pos ++; } my $dotdotdot = ' ... '; my @output; my $printing; my $first = 1; my $some_printed; if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words for my $i ( 0 ..$#words ) { if ( $last && $i >= $last && $i < $#words ) { push @output, $dotdotdot; last; } if ( $flags[$i] ) { push @output, $dotdotdot if !$printing++ && !$first; push @output, $words[$i]; $some_printed++; } else { $printing = 0; } $first = 0; } } if ( !$some_printed ) { for my $i ( 0 .. $Max_Words ) { if ( $i > $#words ) { $printing++; last; } push @output, $words[$i]; } } push @output, $dotdotdot if !$printing; $$text_ref = join '', @output; my %entities = ( '&' => '&', '>' => '>', '<' => '<', '"' => '"', ); my %highlight = ( $on_flag => $On, $off_flag => $Off, ); $$text_ref =~ s/([&"<>])/$entities{$1}/ge; $$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge; # $$text_ref = join '', @words; # interesting that this seems reasonably faster } #============================================ # Returns compiled regular expressions for matching # # sub set_match_regexp { my $self = shift; my $results = $self->{results}; my $wc = $results->header('wordcharacters'); my $ignoref = $results->header('ignorefirstchar'); my $ignorel = $results->header('ignorelastchar'); $wc = quotemeta $wc; #Convert query into regular expressions for ( $ignoref, $ignorel ) { if ( $_ ) { $_ = quotemeta; $_ = "([$_]*)"; } else { $_ = '()'; } } $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing # Now, wait a minute. Look at this more, as I'd hope that making a # qr// go out of scope would release the compiled pattern. if ( $ENV{MOD_PERL} ) { $self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare } else { $self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare } } 1;