1 |
adcroft |
1.1 |
#======================================================================= |
2 |
|
|
# Default Highlighting Code |
3 |
|
|
# |
4 |
|
|
# $Id: DefaultHighlight.pm,v 1.1 2001/12/06 07:32:11 whmoseley Exp $ |
5 |
|
|
#======================================================================= |
6 |
|
|
package DefaultHighlight; |
7 |
|
|
use strict; |
8 |
|
|
|
9 |
|
|
sub new { |
10 |
|
|
my ( $class, $results, $metaname ) = @_; |
11 |
|
|
|
12 |
|
|
|
13 |
|
|
my $self = bless { |
14 |
|
|
results => $results, # just in case we need a method |
15 |
|
|
settings=> $results->config('highlight'), |
16 |
|
|
metaname=> $metaname, |
17 |
|
|
}, $class; |
18 |
|
|
|
19 |
|
|
# parse out the query into words |
20 |
|
|
my $query = $results->extract_query_match; |
21 |
|
|
|
22 |
|
|
|
23 |
|
|
# Do words exist for this layer (all text at this time) and metaname? |
24 |
|
|
# This is a reference to an array of phrases and words |
25 |
|
|
|
26 |
|
|
$self->{description_prop} = $results->config('description_prop') || ''; |
27 |
|
|
|
28 |
|
|
if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) { |
29 |
|
|
eval { require SWISH::Stemmer }; |
30 |
|
|
if ( $@ ) { |
31 |
|
|
$results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@); |
32 |
|
|
} else { |
33 |
|
|
$self->{stemmer_function} = \&SWISH::Stemmer::SwishStem; |
34 |
|
|
} |
35 |
|
|
} |
36 |
|
|
|
37 |
|
|
|
38 |
|
|
if ( $query && exists $query->{text}{$metaname} ) { |
39 |
|
|
$self->{query} = $query->{text}{$metaname}; |
40 |
|
|
|
41 |
|
|
$self->set_match_regexp; |
42 |
|
|
} |
43 |
|
|
|
44 |
|
|
return $self; |
45 |
|
|
} |
46 |
|
|
|
47 |
|
|
sub highlight { |
48 |
|
|
my ( $self, $properties ) = @_; |
49 |
|
|
|
50 |
|
|
|
51 |
|
|
return unless $self->{query}; |
52 |
|
|
|
53 |
|
|
my $phrase_array = $self->{query}; |
54 |
|
|
|
55 |
|
|
my $settings = $self->{settings}; |
56 |
|
|
my $metaname = $self->{metaname}; |
57 |
|
|
|
58 |
|
|
# Do we care about this meta? |
59 |
|
|
return unless exists $settings->{meta_to_prop_map}{$metaname}; |
60 |
|
|
|
61 |
|
|
# Get the related properties |
62 |
|
|
my @props = @{ $settings->{meta_to_prop_map}{$metaname} }; |
63 |
|
|
|
64 |
|
|
my %checked; |
65 |
|
|
|
66 |
|
|
for ( @props ) { |
67 |
|
|
if ( $properties->{$_} ) { |
68 |
|
|
$checked{$_}++; |
69 |
|
|
$self->highlight_text( \$properties->{$_}, $phrase_array ); |
70 |
|
|
} |
71 |
|
|
} |
72 |
|
|
|
73 |
|
|
|
74 |
|
|
# Truncate the description, if not processed. |
75 |
|
|
my $description = $self->{description_prop}; |
76 |
|
|
if ( $description && !$checked{ $description } && $properties->{$description} ) { |
77 |
|
|
my $max_words = $settings->{max_words} || 100; |
78 |
|
|
my @words = split /\s+/, $properties->{$description}; |
79 |
|
|
if ( @words > $max_words ) { |
80 |
|
|
$properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>'; |
81 |
|
|
} |
82 |
|
|
} |
83 |
|
|
|
84 |
|
|
} |
85 |
|
|
|
86 |
|
|
|
87 |
|
|
|
88 |
|
|
#========================================================================== |
89 |
|
|
# |
90 |
|
|
|
91 |
|
|
sub highlight_text { |
92 |
|
|
|
93 |
|
|
my ( $self, $text_ref, $phrase_array ) = @_; |
94 |
|
|
|
95 |
|
|
my $wc_regexp = $self->{wc_regexp}; |
96 |
|
|
my $extract_regexp = $self->{extract_regexp}; |
97 |
|
|
my $match_regexp = $self->{match_regexp}; |
98 |
|
|
|
99 |
|
|
|
100 |
|
|
my $last = 0; |
101 |
|
|
|
102 |
|
|
my $settings = $self->{settings}; |
103 |
|
|
|
104 |
|
|
my $Show_Words = $settings->{show_words} || 10; |
105 |
|
|
my $Occurrences = $settings->{occurrences} || 5; |
106 |
|
|
my $Max_Words = $settings->{max_words} || 100; |
107 |
|
|
my $On = $settings->{highlight_on} || '<b>'; |
108 |
|
|
my $Off = $settings->{highlight_off} || '</b>'; |
109 |
|
|
|
110 |
|
|
|
111 |
|
|
my $stemmer_function = $self->{stemmer_function}; |
112 |
|
|
|
113 |
|
|
|
114 |
|
|
# Should really call unescapeHTML(), but then would need to escape <b> from escaping. |
115 |
|
|
my @words = split /$wc_regexp/, $$text_ref; |
116 |
|
|
|
117 |
|
|
|
118 |
|
|
return 'No Content saved: Check StoreDescription setting' unless @words; |
119 |
|
|
|
120 |
|
|
my @flags; |
121 |
|
|
$flags[$#words] = 0; # Extend array. |
122 |
|
|
|
123 |
|
|
my $occurrences = $Occurrences ; |
124 |
|
|
|
125 |
|
|
|
126 |
|
|
my $pos = 0; |
127 |
|
|
|
128 |
|
|
while ( $Show_Words && $pos <= $#words ) { |
129 |
|
|
|
130 |
|
|
# Check if the word is a swish word (ignoring begin and end chars) |
131 |
|
|
if ( $words[$pos] =~ /$extract_regexp/ ) { |
132 |
|
|
|
133 |
|
|
|
134 |
|
|
my ( $begin, $word, $end ) = ( $1, $2, $3 ); |
135 |
|
|
|
136 |
|
|
my $test = $stemmer_function |
137 |
|
|
? $stemmer_function->($word) |
138 |
|
|
: lc $word; |
139 |
|
|
|
140 |
|
|
$test ||= lc $word; |
141 |
|
|
|
142 |
|
|
# Not check if word matches |
143 |
|
|
if ( $test =~ /$match_regexp/ ) { |
144 |
|
|
|
145 |
|
|
$words[$pos] = "$begin$On$word$Off$end"; |
146 |
|
|
|
147 |
|
|
|
148 |
|
|
my $start = $pos - ($Show_Words-1)* 2; |
149 |
|
|
my $end = $pos + ($Show_Words-1)* 2; |
150 |
|
|
if ( $start < 0 ) { |
151 |
|
|
$end = $end - $start; |
152 |
|
|
$start = 0; |
153 |
|
|
} |
154 |
|
|
|
155 |
|
|
$end = $#words if $end > $#words; |
156 |
|
|
|
157 |
|
|
$flags[$_]++ for $start .. $end; |
158 |
|
|
|
159 |
|
|
|
160 |
|
|
# All done, and mark where to stop looking |
161 |
|
|
if ( $occurrences-- <= 0 ) { |
162 |
|
|
$last = $end; |
163 |
|
|
last; |
164 |
|
|
} |
165 |
|
|
} |
166 |
|
|
} |
167 |
|
|
|
168 |
|
|
$pos += 2; # Skip to next wordchar word |
169 |
|
|
} |
170 |
|
|
|
171 |
|
|
|
172 |
|
|
my $dotdotdot = ' <b>...</b> '; |
173 |
|
|
|
174 |
|
|
|
175 |
|
|
my @output; |
176 |
|
|
|
177 |
|
|
my $printing; |
178 |
|
|
my $first = 1; |
179 |
|
|
my $some_printed; |
180 |
|
|
|
181 |
|
|
if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words |
182 |
|
|
for my $i ( 0 ..$#words ) { |
183 |
|
|
|
184 |
|
|
|
185 |
|
|
if ( $last && $i >= $last && $i < $#words ) { |
186 |
|
|
push @output, $dotdotdot; |
187 |
|
|
last; |
188 |
|
|
} |
189 |
|
|
|
190 |
|
|
if ( $flags[$i] ) { |
191 |
|
|
|
192 |
|
|
push @output, $dotdotdot if !$printing++ && !$first; |
193 |
|
|
push @output, $words[$i]; |
194 |
|
|
$some_printed++; |
195 |
|
|
|
196 |
|
|
} else { |
197 |
|
|
$printing = 0; |
198 |
|
|
} |
199 |
|
|
|
200 |
|
|
$first = 0; |
201 |
|
|
|
202 |
|
|
|
203 |
|
|
} |
204 |
|
|
} |
205 |
|
|
|
206 |
|
|
if ( !$some_printed ) { |
207 |
|
|
for my $i ( 0 .. $Max_Words ) { |
208 |
|
|
if ( $i > $#words ) { |
209 |
|
|
$printing++; |
210 |
|
|
last; |
211 |
|
|
} |
212 |
|
|
push @output, $words[$i]; |
213 |
|
|
} |
214 |
|
|
} |
215 |
|
|
|
216 |
|
|
|
217 |
|
|
|
218 |
|
|
push @output, $dotdotdot if !$printing; |
219 |
|
|
|
220 |
|
|
$$text_ref = join '', @output; |
221 |
|
|
|
222 |
|
|
|
223 |
|
|
} |
224 |
|
|
|
225 |
|
|
#============================================ |
226 |
|
|
# Returns compiled regular expressions for matching |
227 |
|
|
# |
228 |
|
|
|
229 |
|
|
sub set_match_regexp { |
230 |
|
|
my $self = shift; |
231 |
|
|
|
232 |
|
|
my $results = $self->{results}; |
233 |
|
|
|
234 |
|
|
|
235 |
|
|
my $wc = $results->header('wordcharacters'); |
236 |
|
|
my $ignoref = $results->header('ignorefirstchar'); |
237 |
|
|
my $ignorel = $results->header('ignorelastchar'); |
238 |
|
|
|
239 |
|
|
|
240 |
|
|
|
241 |
|
|
my $query = join ' ', map { join ' ', @$_} @{$self->{query}}; # join everything together! |
242 |
|
|
|
243 |
|
|
|
244 |
|
|
$wc = quotemeta $wc; |
245 |
|
|
|
246 |
|
|
|
247 |
|
|
my $match_string = |
248 |
|
|
join '|', |
249 |
|
|
map { substr( $_, -1, 1 ) eq '*' |
250 |
|
|
? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?" |
251 |
|
|
: quotemeta |
252 |
|
|
} |
253 |
|
|
grep { ! /^(and|or|not|["()=])$/oi } # left over code |
254 |
|
|
split /\s+/, $query; |
255 |
|
|
|
256 |
|
|
|
257 |
|
|
|
258 |
|
|
return unless $match_string; |
259 |
|
|
|
260 |
|
|
for ( $ignoref, $ignorel ) { |
261 |
|
|
if ( $_ ) { |
262 |
|
|
$_ = quotemeta; |
263 |
|
|
$_ = "([$_]*)"; |
264 |
|
|
} else { |
265 |
|
|
$_ = '()'; |
266 |
|
|
} |
267 |
|
|
} |
268 |
|
|
|
269 |
|
|
|
270 |
|
|
# Yuck! |
271 |
|
|
$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing |
272 |
|
|
|
273 |
|
|
|
274 |
|
|
# Now, wait a minute. Look at this more, as I'd hope that making a |
275 |
|
|
# qr// go out of scope would release the compiled pattern. |
276 |
|
|
|
277 |
|
|
if ( $ENV{MOD_PERL} ) { |
278 |
|
|
$self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words |
279 |
|
|
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare |
280 |
|
|
$self->{match_regexp} = qr/^(?:$match_string)$/; # regexp for comparing extracted words to query |
281 |
|
|
|
282 |
|
|
} else { |
283 |
|
|
$self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words |
284 |
|
|
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare |
285 |
|
|
$self->{match_regexp} = qr/^(?:$match_string)$/o; # regexp for comparing extracted words to query |
286 |
|
|
} |
287 |
|
|
} |
288 |
|
|
|
289 |
|
|
|
290 |
|
|
1; |
291 |
|
|
|