example/modules/DefaultHighlight.pm

#=======================================================================
#  Default Highlighting Code
#
#    $Id: DefaultHighlight.pm,v 1.1 2001/12/06 07:32:11 whmoseley Exp $
#=======================================================================
package DefaultHighlight;
use strict;

sub new {
    my ( $class, $results, $metaname ) = @_;


    my $self = bless {
        results => $results,  # just in case we need a method
        settings=> $results->config('highlight'),
        metaname=> $metaname,
    }, $class;

    # parse out the query into words
    my $query = $results->extract_query_match;


    # Do words exist for this layer (all text at this time) and metaname?
    # This is a reference to an array of phrases and words

    $self->{description_prop} = $results->config('description_prop') || '';

    if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
        eval { require SWISH::Stemmer };
        if ( $@ ) {
            $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
        } else {
            $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
        }
    }


    if ( $query && exists $query->{text}{$metaname} ) {
        $self->{query} = $query->{text}{$metaname};

        $self->set_match_regexp;
    }

    return $self;
}

sub highlight {
    my ( $self, $properties ) = @_;


    return unless $self->{query};

    my $phrase_array = $self->{query};

    my $settings = $self->{settings};
    my $metaname = $self->{metaname};

    # Do we care about this meta?
    return unless exists $settings->{meta_to_prop_map}{$metaname};

    # Get the related properties
    my @props = @{ $settings->{meta_to_prop_map}{$metaname} };

    my %checked;

    for ( @props ) {
        if ( $properties->{$_} ) {
            $checked{$_}++;
            $self->highlight_text( \$properties->{$_}, $phrase_array );
        }
    }


    # Truncate the description, if not processed.
    my $description = $self->{description_prop};
    if ( $description && !$checked{ $description } && $properties->{$description} ) {
        my $max_words = $settings->{max_words} || 100;
        my @words = split /\s+/, $properties->{$description};
        if ( @words > $max_words ) {
            $properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
        }
    }

}


#==========================================================================
#

sub highlight_text {

    my ( $self, $text_ref, $phrase_array ) = @_;
    
    my $wc_regexp = $self->{wc_regexp};
    my $extract_regexp = $self->{extract_regexp};
    my $match_regexp = $self->{match_regexp};


    my $last = 0;

    my $settings = $self->{settings};

    my $Show_Words = $settings->{show_words} || 10;
    my $Occurrences = $settings->{occurrences} || 5;
    my $Max_Words = $settings->{max_words} || 100;
    my $On = $settings->{highlight_on} || '<b>';
    my $Off = $settings->{highlight_off} || '</b>';


    my $stemmer_function = $self->{stemmer_function};


    # Should really call unescapeHTML(), but then would need to escape <b> from escaping.
    my @words = split /$wc_regexp/, $$text_ref;


    return 'No Content saved: Check StoreDescription setting' unless @words;

    my @flags;
    $flags[$#words] = 0;  # Extend array.

    my $occurrences = $Occurrences ;


    my $pos = 0;

    while ( $Show_Words && $pos <= $#words ) {

        # Check if the word is a swish word (ignoring begin and end chars)
        if ( $words[$pos] =~ /$extract_regexp/ ) {


            my ( $begin, $word, $end ) = ( $1, $2, $3 );

            my $test = $stemmer_function
                       ? $stemmer_function->($word)
                       : lc $word;

            $test ||= lc $word;                       

            # Not check if word matches
            if ( $test =~ /$match_regexp/ ) {

                $words[$pos] = "$begin$On$word$Off$end";


                my $start = $pos - ($Show_Words-1)* 2;
                my $end   = $pos + ($Show_Words-1)* 2;
                if ( $start < 0 ) {
                    $end = $end - $start;
                    $start = 0;
                }
                
                $end = $#words if $end > $#words;

                $flags[$_]++ for $start .. $end;


                # All done, and mark where to stop looking
                if ( $occurrences-- <= 0 ) {
                    $last = $end;
                    last;
                }
            }
        }

       $pos += 2;  # Skip to next wordchar word
    }


    my $dotdotdot = ' <b>...</b> ';


    my @output;

    my $printing;
    my $first = 1;
    my $some_printed;

    if ( $Show_Words && @words > 50 ) {  # don't limit context if a small number of words
        for my $i ( 0 ..$#words ) {


            if ( $last && $i >= $last && $i < $#words ) {
                push @output, $dotdotdot;
                last;
            }

            if ( $flags[$i] ) {

                push @output, $dotdotdot if !$printing++ && !$first;
                push @output, $words[$i];
                $some_printed++;

            } else {
                $printing = 0;
            }

        $first = 0;

        
        }
    }

    if ( !$some_printed ) {
        for my $i ( 0 .. $Max_Words ) {
            if ( $i > $#words ) {
                $printing++;
                last;
            }
            push @output, $words[$i];
        }
    }
        
        
    push @output, $dotdotdot if !$printing;

    $$text_ref = join '', @output;


}

#============================================
# Returns compiled regular expressions for matching
#

sub set_match_regexp {
    my $self = shift;

    my $results = $self->{results};


    my $wc = $results->header('wordcharacters');
    my $ignoref = $results->header('ignorefirstchar');
    my $ignorel = $results->header('ignorelastchar');


    my $query = join ' ', map { join ' ', @$_} @{$self->{query}};  # join everything together!


    $wc = quotemeta $wc;


    my $match_string =
        join '|',
           map { substr( $_, -1, 1 ) eq '*'
                    ? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?"
                    : quotemeta
               }
                grep { ! /^(and|or|not|["()=])$/oi }  # left over code
                    split /\s+/, $query;


    return unless $match_string;

    for ( $ignoref, $ignorel ) {
        if ( $_ ) {
            $_ = quotemeta;
            $_ = "([$_]*)";
        } else {
            $_ = '()';
        }
    }


    # Yuck!
    $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';  # Warning: dependent on tolower used while indexing


    # Now, wait a minute.  Look at this more, as I'd hope that making a
    # qr// go out of scope would release the compiled pattern.

    if ( $ENV{MOD_PERL} ) {
        $self->{wc_regexp}      = qr/([^$wc]+)/;                     # regexp for splitting into swish-words
        $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i;  # regexp for extracting out the words to compare
        $self->{match_regexp}   = qr/^(?:$match_string)$/;           # regexp for comparing extracted words to query

     } else {
        $self->{wc_regexp}      = qr/([^$wc]+)/o;                    # regexp for splitting into swish-words
        $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi;  # regexp for extracting out the words to compare
        $self->{match_regexp}   = qr/^(?:$match_string)$/o;          # regexp for comparing extracted words to query
     }
}    


1;

1	adcroft	1.1	#=======================================================================
2			# Default Highlighting Code
3			#
4			# $Id: DefaultHighlight.pm,v 1.1 2001/12/06 07:32:11 whmoseley Exp $
5			#=======================================================================
6			package DefaultHighlight;
7			use strict;
8
9			sub new {
10			my ( $class, $results, $metaname ) = @_;
11
12
13			my $self = bless {
14			results => $results, # just in case we need a method
15			settings=> $results->config('highlight'),
16			metaname=> $metaname,
17			}, $class;
18
19			# parse out the query into words
20			my $query = $results->extract_query_match;
21
22
23			# Do words exist for this layer (all text at this time) and metaname?
24			# This is a reference to an array of phrases and words
25
26			$self->{description_prop} = $results->config('description_prop') \|\| '';
27
28			if ( $results->header('stemming applied') =~ /^(?:1\|yes)$/i ) {
29			eval { require SWISH::Stemmer };
30			if ( $@ ) {
31			$results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
32			} else {
33			$self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
34			}
35			}
36
37
38			if ( $query && exists $query->{text}{$metaname} ) {
39			$self->{query} = $query->{text}{$metaname};
40
41			$self->set_match_regexp;
42			}
43
44			return $self;
45			}
46
47			sub highlight {
48			my ( $self, $properties ) = @_;
49
50
51			return unless $self->{query};
52
53			my $phrase_array = $self->{query};
54
55			my $settings = $self->{settings};
56			my $metaname = $self->{metaname};
57
58			# Do we care about this meta?
59			return unless exists $settings->{meta_to_prop_map}{$metaname};
60
61			# Get the related properties
62			my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
63
64			my %checked;
65
66			for ( @props ) {
67			if ( $properties->{$_} ) {
68			$checked{$_}++;
69			$self->highlight_text( \$properties->{$_}, $phrase_array );
70			}
71			}
72
73
74			# Truncate the description, if not processed.
75			my $description = $self->{description_prop};
76			if ( $description && !$checked{ $description } && $properties->{$description} ) {
77			my $max_words = $settings->{max_words} \|\| 100;
78			my @words = split /\s+/, $properties->{$description};
79			if ( @words > $max_words ) {
80			$properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
81			}
82			}
83
84			}
85
86
87
88			#==========================================================================
89			#
90
91			sub highlight_text {
92
93			my ( $self, $text_ref, $phrase_array ) = @_;
94
95			my $wc_regexp = $self->{wc_regexp};
96			my $extract_regexp = $self->{extract_regexp};
97			my $match_regexp = $self->{match_regexp};
98
99
100			my $last = 0;
101
102			my $settings = $self->{settings};
103
104			my $Show_Words = $settings->{show_words} \|\| 10;
105			my $Occurrences = $settings->{occurrences} \|\| 5;
106			my $Max_Words = $settings->{max_words} \|\| 100;
107			my $On = $settings->{highlight_on} \|\| '<b>';
108			my $Off = $settings->{highlight_off} \|\| '</b>';
109
110
111			my $stemmer_function = $self->{stemmer_function};
112
113
114			# Should really call unescapeHTML(), but then would need to escape <b> from escaping.
115			my @words = split /$wc_regexp/, $$text_ref;
116
117
118			return 'No Content saved: Check StoreDescription setting' unless @words;
119
120			my @flags;
121			$flags[$#words] = 0; # Extend array.
122
123			my $occurrences = $Occurrences ;
124
125
126			my $pos = 0;
127
128			while ( $Show_Words && $pos <= $#words ) {
129
130			# Check if the word is a swish word (ignoring begin and end chars)
131			if ( $words[$pos] =~ /$extract_regexp/ ) {
132
133
134			my ( $begin, $word, $end ) = ( $1, $2, $3 );
135
136			my $test = $stemmer_function
137			? $stemmer_function->($word)
138			: lc $word;
139
140			$test \|\|= lc $word;
141
142			# Not check if word matches
143			if ( $test =~ /$match_regexp/ ) {
144
145			$words[$pos] = "$begin$On$word$Off$end";
146
147
148			my $start = $pos - ($Show_Words-1)* 2;
149			my $end = $pos + ($Show_Words-1)* 2;
150			if ( $start < 0 ) {
151			$end = $end - $start;
152			$start = 0;
153			}
154
155			$end = $#words if $end > $#words;
156
157			$flags[$_]++ for $start .. $end;
158
159
160			# All done, and mark where to stop looking
161			if ( $occurrences-- <= 0 ) {
162			$last = $end;
163			last;
164			}
165			}
166			}
167
168			$pos += 2; # Skip to next wordchar word
169			}
170
171
172			my $dotdotdot = ' <b>...</b> ';
173
174
175			my @output;
176
177			my $printing;
178			my $first = 1;
179			my $some_printed;
180
181			if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words
182			for my $i ( 0 ..$#words ) {
183
184
185			if ( $last && $i >= $last && $i < $#words ) {
186			push @output, $dotdotdot;
187			last;
188			}
189
190			if ( $flags[$i] ) {
191
192			push @output, $dotdotdot if !$printing++ && !$first;
193			push @output, $words[$i];
194			$some_printed++;
195
196			} else {
197			$printing = 0;
198			}
199
200			$first = 0;
201
202
203			}
204			}
205
206			if ( !$some_printed ) {
207			for my $i ( 0 .. $Max_Words ) {
208			if ( $i > $#words ) {
209			$printing++;
210			last;
211			}
212			push @output, $words[$i];
213			}
214			}
215
216
217
218			push @output, $dotdotdot if !$printing;
219
220			$$text_ref = join '', @output;
221
222
223			}
224
225			#============================================
226			# Returns compiled regular expressions for matching
227			#
228
229			sub set_match_regexp {
230			my $self = shift;
231
232			my $results = $self->{results};
233
234
235			my $wc = $results->header('wordcharacters');
236			my $ignoref = $results->header('ignorefirstchar');
237			my $ignorel = $results->header('ignorelastchar');
238
239
240
241			my $query = join ' ', map { join ' ', @$_} @{$self->{query}}; # join everything together!
242
243
244			$wc = quotemeta $wc;
245
246
247			my $match_string =
248			join '\|',
249			map { substr( $_, -1, 1 ) eq '*'
250			? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?"
251			: quotemeta
252			}
253			grep { ! /^(and\|or\|not\|["()=])$/oi } # left over code
254			split /\s+/, $query;
255
256
257
258			return unless $match_string;
259
260			for ( $ignoref, $ignorel ) {
261			if ( $_ ) {
262			$_ = quotemeta;
263			$_ = "([$_]*)";
264			} else {
265			$_ = '()';
266			}
267			}
268
269
270			# Yuck!
271			$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
272
273
274			# Now, wait a minute. Look at this more, as I'd hope that making a
275			# qr// go out of scope would release the compiled pattern.
276
277			if ( $ENV{MOD_PERL} ) {
278			$self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words
279			$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare
280			$self->{match_regexp} = qr/^(?:$match_string)$/; # regexp for comparing extracted words to query
281
282			} else {
283			$self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words
284			$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare
285			$self->{match_regexp} = qr/^(?:$match_string)$/o; # regexp for comparing extracted words to query
286			}
287			}
288
289
290			1;
291