1 |
adcroft |
1.1 |
#!/usr/local/bin/perl -w |
2 |
|
|
package SwishSearch; |
3 |
|
|
use strict; |
4 |
|
|
|
5 |
|
|
use lib qw( modules ); ### This may need to be adjusted! |
6 |
|
|
### It should point to the location of the |
7 |
|
|
### associated script modules directory |
8 |
|
|
|
9 |
|
|
my $DEFAULT_CONFIG_FILE = '.swishcgi.conf'; |
10 |
|
|
|
11 |
|
|
################################################################################### |
12 |
|
|
# |
13 |
|
|
# If this text is displayed on your browser then your web server |
14 |
|
|
# is not configured to run .cgi programs. Contact your web server administrator. |
15 |
|
|
# |
16 |
|
|
# To display documentation for this program type "perldoc swish.cgi" |
17 |
|
|
# |
18 |
|
|
# swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley swishscript@hank.org |
19 |
|
|
# Example CGI program for searching with SWISH-E |
20 |
|
|
# |
21 |
|
|
# This example program will only run under an OS that supports fork(). |
22 |
|
|
# Ok, piped opens. |
23 |
|
|
# |
24 |
|
|
# |
25 |
|
|
# This program is free software; you can redistribute it and/or |
26 |
|
|
# modify it under the terms of the GNU General Public License |
27 |
|
|
# as published by the Free Software Foundation; either version |
28 |
|
|
# 2 of the License, or (at your option) any later version. |
29 |
|
|
# |
30 |
|
|
# This program is distributed in the hope that it will be useful, |
31 |
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
32 |
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
33 |
|
|
# GNU General Public License for more details. |
34 |
|
|
# |
35 |
|
|
# The above lines must remain at the top of this program |
36 |
|
|
# |
37 |
|
|
# $Id: swish.cgi,v 1.33 2002/08/13 23:08:54 whmoseley Exp $ |
38 |
|
|
# |
39 |
|
|
#################################################################################### |
40 |
|
|
|
41 |
|
|
# This is written this way so the script can be used as a CGI script or a mod_perl |
42 |
|
|
# module without any code changes. |
43 |
|
|
|
44 |
|
|
# use CGI (); # might not be needed if using Apache::Request |
45 |
|
|
|
46 |
|
|
#================================================================================= |
47 |
|
|
# CGI entry point |
48 |
|
|
# |
49 |
|
|
#================================================================================= |
50 |
|
|
|
51 |
|
|
|
52 |
|
|
|
53 |
|
|
# Run the script -- entry point if running as a CGI script |
54 |
|
|
|
55 |
|
|
if ( !$ENV{MOD_PERL} ) {

    # Plain CGI run: start from the built-in defaults, overlay whatever the
    # on-disk config file provides, then serve the request.
    my $merged_config = merge_read_config( default_config() );
    process_request( $merged_config );
}
62 |
|
|
|
63 |
|
|
|
64 |
|
|
|
65 |
|
|
|
66 |
|
|
#================================================================================== |
67 |
|
|
# This sets the default configuration parameters |
68 |
|
|
# |
69 |
|
|
# Any configuration read from disk is merged with these settings. |
70 |
|
|
# |
71 |
|
|
# Only a few settings are actually required. Some reasonable defaults are used |
72 |
|
|
# for most. In fact, you can probably create a complete config as: |
73 |
|
|
# |
74 |
|
|
# return { |
75 |
|
|
# swish_binary => '/usr/local/bin/swish-e', |
76 |
|
|
# swish_index => '/usr/local/share/swish/index.swish-e', |
77 |
|
|
# title_property => 'swishtitle', # Not required, but recommended |
78 |
|
|
# }; |
79 |
|
|
# |
80 |
|
|
# But, that doesn't really show all the options. |
81 |
|
|
# |
82 |
|
|
# You can modify the options below, or you can use a config file. The config file |
83 |
|
|
# is .swishcgi.conf by default (read from the current directory) that must return |
84 |
|
|
# a hash reference. For example, to create a config file that changes the default |
85 |
|
|
# title and index file name, plus uses Template::Toolkit to generate output |
86 |
|
|
# create a config file as: |
87 |
|
|
# |
88 |
|
|
# # Example config file -- returns a hash reference |
89 |
|
|
# { |
90 |
|
|
# title => 'Search Our Site', |
91 |
|
|
# swish_index => 'index.web', |
92 |
|
|
# |
93 |
|
|
# template => { |
94 |
|
|
# package => 'TemplateToolkit', |
95 |
|
|
# file => 'search.tt', |
96 |
|
|
# options => { |
97 |
|
|
# INCLUDE_PATH => '/home/user/swish-e/example', |
98 |
|
|
# }, |
99 |
|
|
# }; |
100 |
|
|
# |
101 |
|
|
# |
102 |
|
|
#----------------------------------------------------------------------------------- |
103 |
|
|
|
104 |
|
|
sub default_config {

    # Returns a reference to a freshly built hash holding the built-in
    # configuration.  The CGI entry point and the mod_perl handler() both
    # start from this hash; merge_read_config() overlays the settings read
    # from the config file on top of it.

    ##### Configuration Parameters #########

    # This lists all the options, with many commented out.
    # You should adjust for your site, and how your swish index was created.

    ##>>
    ##>>  Please don't post this entire section on the swish-e list if looking for help!
    ##>>
    ##>>  Send a small example, without all the comments.

    #======================================================================
    # *** NOTES ****
    #   Items beginning with an "x" or "#" are commented out.
    #   The "x" form simply renames (hides) that setting.  It's used
    #   to make it easy to disable a multi-line configuration setting.
    #
    #   If you do not understand a setting then it is best to leave the default.
    #
    #   Please follow the documentation (perldoc swish.cgi) and set up
    #   a test using the defaults before making changes.  It's much easier
    #   to modify a working example than to get a modified example to work...
    #
    #   Again, this is a Perl data structure.  Commas are important.
    #======================================================================

    my %settings = (

        title        => 'Search our site',    # Title of your choice.  Displays on the search page
        swish_binary => './swish-e',          # Location of swish-e binary

        # By default, this script tries to read a config file.  You should
        # probably comment this out if not used, to save a disk stat.
        config_file => $DEFAULT_CONFIG_FILE,  # Default config file

        # The location of your index file.  Typically, this would not be in
        # your web tree.
        # If you have more than one index to search then specify an array
        # reference.  e.g. swish_index => [ qw( index1 index2 index3 ) ],
        # See "select_indexes" below for how to select more than one index.
        swish_index => 'index.swish-e',       # Location of your index file

        page_size => 15,                      # Number of results per page - default 15

        # Property name to use as the main link text to the indexed document.
        # Typically, this will be 'swishtitle' if you have indexed html documents,
        # but you can specify any PropertyName defined in your document.
        # By default, swish will return the pathname for documents that do not
        # have a title.
        # In other words, this is used for the text of the links of the search results:
        #   <a href="prepend_path/swishdocpath">title_property</a>
        title_property => 'swishtitle',

        # Prepend this path to the filename (swishdocpath) returned by swish.
        # This is used to make the href link back to the original document.
        # Comment out to disable.
        #prepend_path => 'http://localhost/mydocs',

        # Swish has a configuration directive "StoreDescription" that will save
        # part or all of a document's contents in the index file.  This can then
        # be displayed along with results.  If you are indexing a lot of files
        # this can use a lot of disk space, so test carefully before indexing
        # your entire site.  Building swish with zlib can greatly reduce the
        # space used by StoreDescription.
        #
        # This setting tells this script to display this description.
        # Normally, this should be 'swishdescription', but you can specify
        # another property name.  There is no default.
        description_prop => 'swishdescription',

        # Property names listed here will be displayed in a table below each result.
        # You may wish to modify this list if you are using document properties
        # (PropertyNames) in your swish-e index configuration.
        # There is no default.
        display_props => [ qw( swishlastmodified swishdocsize swishdocpath ) ],

        # Results can be sorted by any of the properties listed here.
        # They will be displayed in a drop-down list.
        # Again, you may modify this list if you are using document properties
        # of your own creation.  Swish uses the rank as the default sort.
        sorts => [ qw( swishrank swishlastmodified swishtitle swishdocpath ) ],

        # secondary_sort is used to sort within a sort.
        # You may enter a property name followed by a direction (asc|desc).
        secondary_sort => [ qw( swishlastmodified desc ) ],

        # You can limit by MetaNames here.  Names listed here will be displayed
        # in a line of radio buttons.
        # The default is to not allow any metaname selection.
        # To use this feature you must define MetaNames while indexing.
        #
        # The special "swishdefault" says to search any text that was not
        # indexed as a specific metaname (e.g. typically the body of a HTML
        # document and its title).
        #
        # To see how this might work, add to your config file:
        #   MetaNames swishtitle swishdocpath
        # reindex and try:
        metanames => [ qw( swishdefault swishtitle swishdocpath ) ],

        # Add "all" to metanames to test the meta_groups feature described below.

        # Another example: if you indexed an email archive that defined the
        # metanames subject name email (as in the swish-e discussion archive)
        # you might use:
        #metanames => [ qw( body subject name email ) ],

        # Note that you can do a real "all" search if you use nested metanames
        # in your source documents.  Nesting metanames is most common with XML
        # documents.
        #
        # You can also group metanames into "meta-metanames".
        # Example: say you defined metanames "author", "comment" and "keywords".
        # You want to allow searching "author", "comment" and the document body
        # ("swishdefault"), but you would also like an "all" search that searches
        # all metanames, including "keywords":
        #
        #   metanames => [ qw( swishdefault author comment all ) ],
        #
        # Now, the "all" metaname is not a real metaname.  It must be expanded
        # into its individual metanames.
        #
        # "meta_groups" maps a fake metaname to a list of real metanames:
        #
        #   meta_groups => {
        #       all => [ qw( swishdefault author comment keywords ) ],
        #   },
        #
        # swish.cgi will then turn a query like
        #
        #   all=(query words)
        #
        # into the query
        #
        #   swishdefault=(query words) OR author=(query words) OR comment=(query words) OR keywords=(query words)
        #
        # This is not ideal, but should work for most cases
        # (might fail under windows since the query is passed through the shell).
        #
        # To enable this group add "all" to the list of metanames.
        meta_groups => {
            all => [ qw( swishdefault swishtitle swishdocpath ) ],
        },

        # "name_labels" is used to map MetaNames and PropertyNames to
        # user-friendly names on the form.
        name_labels => {
            swishdefault      => 'Title & Body',
            swishtitle        => 'Title',
            swishrank         => 'Rank',
            swishlastmodified => 'Last Modified Date',
            swishdocpath      => 'Document Path',
            swishdocsize      => 'Document Size',
            all               => 'All',                # group of metanames

            subject           => 'Message Subject',    # other examples
            name              => "Poster's Name",
            email             => "Poster's Email",
            sent              => 'Message Date',
        },

        timeout => 10,    # limit time used by swish when fetching results - DoS protection.

        # Limit length of query string.  Swish also has a limit (default is 40).
        # You might want to set swish-e's limit higher, and use this to get a
        # somewhat more friendly message.
        max_query_length => 100,

        # These settings will use some crude highlighting code to highlight
        # search terms in the property specified above as the description_prop
        # (normally, 'swishdescription').

        # If "highlight" is not defined, then just truncate the description to
        # this many *chars*.  If you want to go by *words*, enable highlighting,
        # and then comment-out show_words.  It will be a little slower.
        max_chars => 500,

        # This structure defines term highlighting, and what type of
        # highlighting to use.  If you are using metanames in your searches and
        # they map to properties that you will display, you may need to adjust
        # the "meta_to_prop_map".
        highlight => {

            # Pick highlighting module -- you must make sure the module can be found.

            # Ok speed, but doesn't handle phrases.
            # Deals with stemming, but not stopwords.
            #package => 'DefaultHighlight',

            # Somewhat slow, but deals with phrases, stopwords, and stemming.
            # Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
            package => 'PhraseHighlight',

            # Fast: phrases without regard to wordcharacter settings;
            # doesn't do context display, so must match in first X words;
            # doesn't handle stemming or stopwords.
            #package => 'SimpleHighlight',

            show_words  => 10,     # Number of swish words to show around highlighted word
            max_words   => 100,    # If no words are found to highlight then show this many words
            occurrences => 6,      # Limit number of occurrences of highlighted words

            #highlight_on  => '<b>',    # HTML highlighting codes
            #highlight_off => '</b>',
            highlight_on  => '<font style="background:#FFFF99">',
            highlight_off => '</font>',

            # This maps search metatags to display properties.
            meta_to_prop_map => {
                swishdefault => [ qw( swishtitle swishdescription ) ],
                swishtitle   => [ qw( swishtitle ) ],
                swishdocpath => [ qw( swishdocpath ) ],
                all          => [ qw( swishtitle swishdescription swishdocpath ) ],
            },
        },

        # If you specify more than one index file (as an array reference) you
        # can set this to allow selection of which indexes to search.
        # The default is to search all indexes specified if this is not used.
        # When used, the first index is the default index.
        #
        # You need to specify your indexes as an array reference:
        #swish_index => [ qw( index.swish-e index.other index2.other index3.other index4.other ) ],

        Xselect_indexes => {
            #method => 'radio_group',    # pick radio_group, popup_menu, or checkbox_group
            method => 'checkbox_group',
            #method => 'popup_menu',
            columns     => 3,
            labels      => [ 'Main Index', 'Other Index', qw( two three four ) ],    # Must match up one-to-one
            description => 'Select Site: ',
        },

        # Similar to select_indexes, this adds a metaname search based on a
        # metaname.  You can use any metaname, and this will add an "AND" search
        # to limit results to a subset of your records, i.e. it adds something
        # like 'site=(foo or bar or baz)' if foo, bar, and baz were selected.
        #
        # Swish-e's ExtractPath would work well with this.  For example, the apache docs:
        #   ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
        #   ExtractPathDefault site other

        Xselect_by_meta => {
            #method => 'radio_group',    # pick: radio_group, popup_menu, or checkbox_group
            method => 'checkbox_group',
            #method => 'popup_menu',
            columns  => 3,
            metaname => 'site',          # Can't be a metaname used elsewhere!
            values   => [ qw( misc mod vhosts other ) ],
            labels   => {
                misc   => 'General Apache docs',
                mod    => 'Apache Modules',
                vhosts => 'Virutal hosts',
            },
            description => 'Limit search to these areas: ',
        },

        # The 'template' setting defines what generates the output.
        # The default is "TemplateDefault" which is reasonably ugly.
        # Note that some of the above options may not be available for
        # templating, as it's up to you to lay out the form and results in your
        # template.
        #
        # Every example below is disabled by the leading "x"; rename exactly one
        # of them to "template" to use it.

        xtemplate => {
            package => 'TemplateDefault',
        },

        xtemplate => {
            package => 'TemplateDumper',
        },

        xtemplate => {
            package => 'TemplateToolkit',
            file    => 'search.tt',
            options => {
                INCLUDE_PATH => '/home/user/swish-e/example',
                #PRE_PROCESS => 'config',
            },
        },

        xtemplate => {
            package => 'TemplateHTMLTemplate',
            options => {
                filename          => 'swish.tmpl',
                die_on_bad_params => 0,
                loop_context_vars => 1,
                cache             => 1,
            },
        },

        # The "on_intranet" setting is just a flag that can be used to say you
        # do not have an external internet connection.  It's here because the
        # default page generation includes links to images on swish-e.org and on
        # www.w3.org.  If this is set to one then those images will not be shown.
        # (This only affects the default output module TemplateDefault.)
        on_intranet => 0,

        # Here you can hard-code debugging options.  They will help you find
        # where you made your mistake ;)
        # Using all at once will generate a lot of messages to STDERR.
        # Please see the documentation before using these.
        # Typically, you will set these from the command line instead of in the
        # configuration.
        # debug_options => 'basic, command, headers, output, summary, dump',

        # This defines the package object for reading CGI parameters.
        # Defaults to CGI.  Might be useful with mod_perl.
        # request_package => 'CGI',
        # request_package => 'Apache::Request',

        # Minor adjustment to page display.  The page navigation normally looks like:
        #   Page: 1 5 6 7 8 9 24
        # where the first page and last page are always displayed.  These can
        # be disabled by setting to true values ( 1 ).
        no_first_page_navigation => 0,
        no_last_page_navigation  => 0,

        # Limit to date ranges.
        #
        # This adds in the date_range limiting options.
        # You will need the DateRanges.pm module from the author to use that feature.
        #
        # Normally, you will want to limit by the last modified date, so specify
        # "swishlastmodified" as the property_name.  If indexing a mail archive,
        # and, for example, you store the date (a unix timestamp) as "date" then
        # specify "date" as the property_name.
        date_ranges => {
            property_name => 'swishlastmodified',    # property name to limit by

            # what you specify here depends on the DateRanges.pm module.
            time_periods => [
                'All',
                'Today',
                'Yesterday',
                #'Yesterday onward',
                'This Week',
                'Last Week',
                'Last 90 Days',
                'This Month',
                'Last Month',
                #'Past',
                #'Future',
                #'Next 30 Days',
            ],

            line_break => 0,
            default    => 'All',
            date_range => 1,
        },
    );

    return \%settings;
}
511 |
|
|
|
512 |
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^ end of user config ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
513 |
|
|
#======================================================================================== |
514 |
|
|
|
515 |
|
|
|
516 |
|
|
|
517 |
|
|
#================================================================================= |
518 |
|
|
# mod_perl entry point |
519 |
|
|
# |
520 |
|
|
# As an example, you might use a PerlSetVar to point to paths to different |
521 |
|
|
# config files, and then cache the different configurations by path. |
522 |
|
|
# |
523 |
|
|
#================================================================================= |
524 |
|
|
|
525 |
|
|
my %cached_configs; |
526 |
|
|
|
527 |
|
|
sub handler {

    # mod_perl entry point.  Takes the request object, serves one search
    # request and returns Apache::Constants::OK().
    #
    # When the PerlSetVar 'Swish_Conf_File' is set, the defaults are merged with
    # that config file once and the merged result is kept in %cached_configs,
    # keyed by the file path, for later requests.
    my $r = shift;

    my $config_path = $r->dir_config( 'Swish_Conf_File' );

    # No config file named for this location: use the hard-coded config as is.
    unless ( $config_path ) {
        process_request( default_config() );
        return Apache::Constants::OK();
    }

    # First request for this path: load the file and remember the merged config.
    unless ( $cached_configs{ $config_path } ) {
        my $config = default_config();
        $config->{config_file} = $config_path;
        $cached_configs{ $config_path } = merge_read_config( $config );
    }

    process_request( $cached_configs{ $config_path } );
    return Apache::Constants::OK();
}
556 |
|
|
|
557 |
|
|
|
558 |
|
|
#============================================================================ |
559 |
|
|
# Read config settings from disk, and merge |
560 |
|
|
# Note, all errors are ignored since by default this script looks for a |
561 |
|
|
# config file. |
562 |
|
|
# |
563 |
|
|
#============================================================================ |
564 |
|
|
sub merge_read_config {

    # Overlay the settings returned by the config file on top of $config.
    #
    # Parameter: $config -- hash reference of settings; its 'config_file'
    #            entry names the file to load (no file is read when it is false).
    # Returns:   $config itself when nothing was loaded, otherwise a NEW hash
    #            reference in which the file's settings override $config's.
    # Dies:      when the file cannot be loaded or does not return a hash
    #            reference.  The only failure that is ignored is a missing
    #            *default* config file, since the script looks for that file
    #            whether or not the user created one.
    my $config = shift;

    set_default_debug_flags();

    set_debug( $config );    # get from config or from %ENV

    my $config_file = $config->{config_file};
    return $config unless $config_file;

    # "do FILE" looks a relative name up in @INC, and '.' is no longer part of
    # @INC as of Perl 5.26, so a config file sitting in the current directory
    # would not be found.  When the file exists relative to the current
    # directory load it by absolute path; otherwise keep the @INC lookup.
    require File::Spec;
    my $load_path = $config_file;
    if ( -e $config_file && !File::Spec->file_name_is_absolute( $config_file ) ) {
        $load_path = File::Spec->rel2abs( $config_file );
    }

    my $return = do $load_path;    # load the config file

    # Grab both error variables right away, before anything can clobber them.
    my $eval_error = $@;
    my $os_error   = "$!";

    unless ( ref $return eq 'HASH' ) {

        # First, check for file not found for the default config, which we can ignore
        if ( $config_file eq $DEFAULT_CONFIG_FILE && !-e $config_file ) {
            warn "Config file '$config_file': $!" if $config->{debug};
            return $config;
        }

        # $@ is set when the file failed to compile or run, $! when it could
        # not be read.  A file that ran but returned something other than a
        # hash reference sets neither reliably, so say so explicitly.
        my $error = $eval_error                       ? $eval_error
                  : ( !defined $return && $os_error ) ? $os_error
                  :                                     'did not return a hash reference';

        die "Config file '$config_file': $error";
    }

    if ( $config->{debug} || $return->{debug} ) {
        require Data::Dumper;
        print STDERR "\n---------- Read config parameters from '$config->{config_file}' ------\n",
            Data::Dumper::Dumper( $return ),
            "-------------------------\n";
    }

    set_debug( $return );

    # Merge settings -- values from the file win over the defaults
    return { %$config, %$return };
}
606 |
|
|
|
607 |
|
|
#-------------------------------------------------------------------------------------------------- |
608 |
|
|
sub set_default_debug_flags {

    # Define the debug bit flags as package variables.  Each flag is a distinct
    # bit so several can be combined in a single debug level.
    (
        $SwishSearch::DEBUG_BASIC,        # Basic debugging
        $SwishSearch::DEBUG_COMMAND,      # Show command used to run swish
        $SwishSearch::DEBUG_HEADERS,      # Swish output headers
        $SwishSearch::DEBUG_OUTPUT,       # Swish output besides headers
        $SwishSearch::DEBUG_SUMMARY,      # Summary of results parsed
        $SwishSearch::DEBUG_DUMP_DATA,    # dump data that is sent to templating modules
    ) = ( 1, 2, 4, 8, 16, 32 );

    # Same value the sub has always yielded (that of the last flag assigned).
    return $SwishSearch::DEBUG_DUMP_DATA;
}
618 |
|
|
|
619 |
|
|
|
620 |
|
|
|
621 |
|
|
|
622 |
|
|
#--------------------------------------------------------------------------------------------------- |
623 |
|
|
sub set_debug {

    # Work out the debug level and store it in $conf->{debug}.
    #
    # Parameter: $conf -- configuration hash reference; modified in place.
    #
    # The option names come from the SWISH_DEBUG environment variable or, when
    # that is not set, from the 'debug_options' config setting.  Both are a
    # comma separated list of the names in %debug below.  With neither set,
    # debug is 0.  Otherwise debug starts at 1 and the flag of every named
    # option is OR-ed in.  An unknown name prints the list of valid names to
    # STDERR and exits the program.
    my $conf = shift;

    # The environment takes precedence.  Previously 'debug_options' switched
    # debugging on but its contents were never parsed.
    my $requested = $ENV{SWISH_DEBUG} || $conf->{debug_options};

    unless ( $requested ) {
        $conf->{debug} = 0;
        return;
    }

    # option name => [ bit flag, description ]
    my %debug = (
        basic   => [$SwishSearch::DEBUG_BASIC,     'Basic debugging'],
        command => [$SwishSearch::DEBUG_COMMAND,   'Show command used to run swish'],
        headers => [$SwishSearch::DEBUG_HEADERS,   'Show headers returned from swish'],
        output  => [$SwishSearch::DEBUG_OUTPUT,    'Show output from swish'],
        summary => [$SwishSearch::DEBUG_SUMMARY,   'Show summary of results'],
        dump    => [$SwishSearch::DEBUG_DUMP_DATA, 'Show all data available to templates'],
    );

    $conf->{debug} = 1;

    for my $option ( split /\s*,\s*/, $requested ) {
        my $name = lc $option;

        if ( exists $debug{$name} ) {
            $conf->{debug} |= $debug{$name}->[0];
            next;
        }

        # List the valid options ordered by flag value, then give up.
        print STDERR "Unknown debug option '$option'. Must be one of:\n",
            join( "\n", map { sprintf(' %10s: %10s', $_, $debug{$_}->[1]) } sort { $debug{$a}->[0] <=> $debug{$b}->[0] } keys %debug ),
            "\n\n";
        exit;
    }

    print STDERR "Debug level set to: $conf->{debug}\n";
}
657 |
|
|
|
658 |
|
|
|
659 |
|
|
#============================================================================ |
660 |
|
|
# |
661 |
|
|
# This is the main entry point, where a config hash is passed in. |
662 |
|
|
# |
663 |
|
|
#============================================================================ |
664 |
|
|
|
665 |
|
|
sub process_request {

    # Serve one search request.
    #
    # Parameter: $conf -- configuration hash reference.
    #
    # Builds the request object, runs the swish query, reports on STDERR when
    # debugging is enabled, and finally hands the results to the template
    # package named in the configuration (TemplateDefault when none is set).
    my $conf = shift;

    # Use CGI.pm by default.  Some::Class becomes Some/Class.pm for require.
    my $request_class = $conf->{request_package} || 'CGI';
    ( my $request_file = "$request_class.pm" ) =~ s[::][/]g;
    require $request_file;

    my $request_object = $request_class->new;

    # When debugging, the query and the page size are read from STDIN.
    if ( $conf->{debug} ) {
        print STDERR 'Enter a query [all]: ';
        my $typed_query = <STDIN>;
        $typed_query =~ tr/\r//d;
        chomp $typed_query;

        if ( !$typed_query ) {
            print STDERR "Using 'not asdfghjklzxcv' to match all records\n";
            $typed_query = 'not asdfghjklzxcv';
        }

        $request_object->param( 'query', $typed_query );

        print STDERR 'Enter max results to display [1]: ';
        my $typed_max = <STDIN>;
        chomp $typed_max;

        # Anything that is not a positive whole number falls back to 1.
        $conf->{page_size} = ( $typed_max && $typed_max =~ /^\d+$/ ) ? $typed_max : 1;
    }

    # create search object
    my $search = SwishQuery->new(
        config  => $conf,
        request => $request_object,
    );

    # run the query -- currently, results is just the $search object
    my $results = $search->run_query;

    # Debug report: a full dump takes precedence over the summary, and with
    # neither flag set only the hit count (or the error) is shown.
    if ( my $level = $conf->{debug} ) {

        if ( $level & $SwishSearch::DEBUG_DUMP_DATA ) {
            require Data::Dumper;
            print STDERR "\n------------- Results structure passed to template ------------\n",
                Data::Dumper::Dumper( $results ),
                "--------------------------\n";
        }
        elsif ( $level & $SwishSearch::DEBUG_SUMMARY ) {
            print STDERR "\n------------- Results Summary ------------\n";

            if ( $results->{hits} ) {
                require Data::Dumper;
                print STDERR "Showing $results->{navigation}{showing} of $results->{navigation}{hits}\n",
                    Data::Dumper::Dumper( $results->{_results} );
            }
            else {
                print STDERR "** NO RESULTS **\n";
            }

            print STDERR "--------------------------\n";
        }
        else {
            my $outcome = $results->{hits}
                ? "Found $results->{hits} results\n"
                : "Failed to find any results\n" . $results->errstr . "\n";
            print STDERR $outcome, "\n";
        }
    }

    # Load the output module.
    my $template = $conf->{template} || { package => 'TemplateDefault' };
    my $package  = $template->{package};

    ( my $template_file = "$package.pm" ) =~ s[::][/]g;

    eval { require $template_file };

    # The module could not be loaded: log the reason, show a generic error
    # page and stop.
    if ( $@ ) {
        warn "$0 $@\n";
        print <<ERROR_PAGE;
Content-Type: text/html

<html>
<head><title>Software Error</title></head>
<body><h2>Software Error</h2><p>Please check error log</p></body>
</html>
ERROR_PAGE

        exit;
    }

    $package->show_template( $template, $results );
}
755 |
|
|
|
756 |
|
|
|
757 |
|
|
|
758 |
|
|
|
759 |
|
|
|
760 |
|
|
#================================================================================================== |
761 |
|
|
package SwishQuery; |
762 |
|
|
#================================================================================================== |
763 |
|
|
|
764 |
|
|
use Carp; |
765 |
|
|
# Or use this instead -- PLEASE see perldoc CGI::Carp for details |
766 |
|
|
# <opinion>CGI::Carp doesn't help that much</opinion> |
767 |
|
|
#use CGI::Carp; # qw(fatalsToBrowser); |
768 |
|
|
|
769 |
|
|
|
770 |
|
|
#-------------------------------------------------------------------------------- |
771 |
|
|
# new() doesn't do much, just create the object |
772 |
|
|
#-------------------------------------------------------------------------------- |
773 |
|
|
sub new {
    # Build a SwishQuery object from named options:
    #   config  => hash ref of settings ('swish_index' and 'swish_binary' required)
    #   request => the request (CGI-style) object, stored under the 'q' key
    # Croaks when either required setting is missing.
    my ( $class, %options ) = @_;

    my $conf = $options{config};

    unless ( $conf->{swish_index} ) {
        croak "Failed to set the swish index files in config setting 'swish_index'";
    }

    unless ( $conf->{swish_binary} ) {
        croak "Failed to specify 'swish_binary' in configuration";
    }

    # The search state; 'hits' is filled in once swish has been run.
    my %search = (
        prog     => $conf->{swish_binary},
        config   => $conf,
        q        => $options{request},
        hits     => 0,
        MOD_PERL => $ENV{MOD_PERL},
    );

    return bless \%search, $class;
}
793 |
|
|
|
794 |
|
|
|
795 |
|
|
# Accessor: number of results stored under the 'hits' key.
sub hits {
    my ($self) = @_;
    return $self->{hits};
}
796 |
|
|
|
797 |
|
|
# Get (and optionally set) a configuration setting.
#
#   $self->config( $setting )          returns the current value, or undef
#   $self->config( $setting, $value )  stores $value and returns the OLD value
#
# A false $value (undef, 0, '') is treated as "no value passed", so a setting
# cannot be changed to a false value through this method.
# Croaks if no setting name is given.
sub config {
    my ( $self, $setting, $value ) = @_;

    croak "Failed to pass 'config' a setting" unless $setting;

    # Declare and initialise unconditionally: "my $x = ... if COND" has
    # undefined behaviour (see perlsyn, Statement Modifiers).
    my $cur = exists $self->{config}{$setting}
        ? $self->{config}{$setting}
        : undef;

    $self->{config}{$setting} = $value if $value;

    return $cur;
}
808 |
|
|
|
809 |
|
|
# Look up one header value by (lower-cased) name in the '_headers' hash.
# Returns nothing if no headers hash is present, and '' for a header that
# is missing or false.
sub header {
    my ( $self, $name ) = @_;

    my $headers = $self->{_headers};
    return unless ref($headers) eq 'HASH';

    my $value = $headers->{$name};
    return $value ? $value : '';
}
815 |
|
|
|
816 |
|
|
|
817 |
|
|
# return a ref to an array |
818 |
|
|
sub results {
    # Whatever is stored under '_results', or undef when that is unset/false.
    my ($self) = @_;

    my $rows = $self->{_results};
    return $rows ? $rows : undef;
}
822 |
|
|
|
823 |
|
|
# Look up one field of the 'navigation' hash.
# Returns nothing if the navigation hash is not set, and '' for a field
# that does not exist.
sub navigation {
    my ( $self, $field ) = @_;

    my $nav = $self->{navigation};
    return unless ref($nav) eq 'HASH';

    return '' unless exists $nav->{$field};
    return $nav->{$field};
}
829 |
|
|
|
830 |
|
|
# Accessor: the request object stored under the 'q' key.
sub CGI {
    my ($self) = @_;
    return $self->{q};
}
831 |
|
|
|
832 |
|
|
|
833 |
|
|
|
834 |
|
|
|
835 |
|
|
# Accumulate, or fetch, the argument list passed to the swish binary.
#
#   $self->swish_command( @args )  appends @args; returns the new argument count
#   $self->swish_command           returns the accumulated arguments
#
# With nothing accumulated the getter returns an empty list in list context
# (undef in scalar context), so it can be used directly in exec/join argument
# lists without injecting an undef element.
sub swish_command {
    my $self = shift;

    unless ( @_ ) {
        return $self->{swish_command} ? @{ $self->{swish_command} } : ();
    }

    return push @{ $self->{swish_command} }, @_;
}
845 |
|
|
|
846 |
|
|
|
847 |
|
|
sub errstr {
    # Get/set the error message.  A true $message replaces the stored one;
    # the (possibly updated) message is returned, or '' when there is none.
    my ( $self, $message ) = @_;

    if ( $message ) {
        $self->{_errstr} = $message;
    }

    my $current = $self->{_errstr};
    return $current ? $current : '';
}
855 |
|
|
|
856 |
|
|
|
857 |
|
|
|
858 |
|
|
|
859 |
|
|
|
860 |
|
|
|
861 |
|
|
#============================================ |
862 |
|
|
# This returns "$self" just in case we want to separate out into two objects later |
863 |
|
|
|
864 |
|
|
|
865 |
|
|
sub run_query {
    # Build the swish command from the request, run swish, and fill in the
    # fields the templates use (query_href, my_url, hits, navigation).
    # Always returns $self; on failure the reason is available via errstr().

    my $self = shift;

    my $q    = $self->{q};
    my $conf = $self->{config};


    # Sets the query string, and any -L limits.
    return $self unless $self->build_query;


    # Set the starting position (which is offset by one)

    my $start = $q->param('start') || 0;
    $start = 0 unless $start =~ /^\d+$/ && $start >= 0;

    $self->swish_command( '-b', $start + 1 );


    # Set the max hits

    my $page_size = $self->config('page_size') || 15;
    $self->swish_command( '-m', $page_size );


    return $self unless $self->set_index_file;


    # Set the sort option, if any
    return $self unless $self->set_sort_order;


    my $timeout   = $self->config('timeout') || 0;
    my $can_alarm = $^O !~ /Win32/i;

    eval {
        local $SIG{ALRM} = sub { die "Timed out\n" };
        alarm $timeout if $timeout && $can_alarm;

        # Trap failures here so the alarm is always cancelled while our
        # handler is still installed.  Otherwise a pending alarm could fire
        # later with the default action and kill the process.
        my $ok        = eval { $self->run_swish; 1 };
        my $run_error = $@;

        alarm 0 if $can_alarm;

        die( $run_error || "run_swish failed\n" ) unless $ok;

        waitpid $self->{pid}, 0 if $self->{pid};  # for IPC::Open2
    };

    if ( $@ ) {
        warn "$0 $@";  # if $conf->{debug};
        $self->errstr( "Service currently unavailable" );
        return $self;
    }


    my $hits = $self->hits;
    return $self unless $hits;


    # Build href for repeated search via GET (forward, backward links)
    # param() is forced into scalar context so a repeated parameter cannot
    # pass extra arguments to escape().

    my @query_string =
        map  { "$_=" . $q->escape( scalar $q->param($_) ) }
        grep { $q->param($_) } qw/query metaname sort reverse/;


    # These parameters may legitimately be repeated.
    for my $p ( qw/si sbm/ ) {
        my @settings = $q->param($p);
        next unless @settings;
        push @query_string, "$p=" . $q->escape( $_ ) for @settings;
    }


    if ( $conf->{date_ranges} ) {
        my $dr = DateRanges::GetDateRangeArgs( $q );
        push @query_string, $dr if $dr;
    }


    $self->{query_href} = $q->script_name . '?' . join '&', @query_string;


    # Return the template fields

    $self->{my_url} = $q->script_name;

    $self->{hits} = $hits;

    $self->{navigation} = {
        showing     => $hits,
        from        => $start + 1,
        to          => $start + $hits,
        hits        => $self->header('number of hits') || 0,
        run_time    => $self->header('run time') || 'unknown',
        search_time => $self->header('search time') || 'unknown',
    };


    $self->set_page( $page_size );

    return $self;

}
973 |
|
|
|
974 |
|
|
|
975 |
|
|
#============================================================ |
976 |
|
|
# Build a query string from swish |
977 |
|
|
# Just builds the -w string |
978 |
|
|
#------------------------------------------------------------ |
979 |
|
|
|
980 |
|
|
sub build_query {
    # Turn the request's 'query', 'metaname' and 'sbm' parameters (plus any
    # date limits) into the swish "-w" argument.
    # Returns 1 on success; returns nothing (usually after setting errstr)
    # when there is no usable query.
    my $self = shift;
    my $q    = $self->{q};

    # Fetch and trim the user's query.
    my $query = $q->param('query') || '';
    $query =~ s/\s+$//;
    $query =~ s/^\s+//;

    $self->{query_simple} = $query;     # without metaname
    $q->param( 'query', $query );       # store the cleaned-up query back in the request

    # Read in the date limits, if any.  This can create a new query.
    return unless $self->get_date_limits( \$query );

    if ( !$query ) {
        $self->errstr('Please enter a query string') if $q->param('submit');
        return;
    }

    if ( length( $query ) > $self->{config}{max_query_length} ) {
        $self->errstr('Please enter a shorter query');
        return;
    }

    # *Everything* is a metaname search; default to 'swishdefault'.
    my $metaname = $q->param('metaname') || 'swishdefault';

    # Only 'swishdefault' and the configured metanames are accepted.
    my $conf             = $self->{config};
    my $configured_metas = $self->config('metanames');
    my %meta_lookup      =
        map { $_ => 1 } 'swishdefault', ( $configured_metas ? @$configured_metas : () );

    if ( !$meta_lookup{$metaname} ) {
        $self->errstr('Bad MetaName provided');
        return;
    }

    # Prepend the metaname (or every member of a meta group) to the query.
    my $group = $conf->{meta_groups} && $conf->{meta_groups}{$metaname};

    if ( $group ) {
        $query = join ' OR ', map { "$_=($query)" } @{ $conf->{meta_groups}{$metaname} };

        # Used to create a fake entry in the parsed query so highlighting
        # can find the query words.
        $self->{real_metaname} = $conf->{meta_groups}{$metaname}[0];
    }
    else {
        $query = $metaname . "=($query)";
    }

    # Save the metaname so we know what field to highlight.
    # Note that this might be a fake (group) metaname.
    $self->{metaname} = $metaname;

    # Optional "select by metaname" limit -- perhaps used with ExtractPath.
    # The submitted 'sbm' values are placed in the query unvalidated.
    # Note that this could be messed up by ending the query in a NOT or OR.
    my $limits = $self->config('select_by_meta');
    my @limits = $q->param('sbm');

    if ( @limits && ref $limits eq 'HASH' && $limits->{metaname} ) {
        $query .= ' and ' . $limits->{metaname} . '=(' . join( ' or ', @limits ) . ')';
    }

    $self->swish_command( '-w', $query );

    return 1;
}
1070 |
|
|
|
1071 |
|
|
#======================================================================== |
1072 |
|
|
# Get the index files from the form, or from simple the config settings |
1073 |
|
|
#------------------------------------------------------------------------ |
1074 |
|
|
|
1075 |
|
|
sub set_index_file {
    # Add the "-f" index file argument(s) to the swish command.
    #
    # When 'select_indexes' is set and 'swish_index' is an array ref, the
    # request's 'si' parameters are taken as positions into that array and
    # only the selected index files are searched.  Otherwise all configured
    # index files are used.
    #
    # Returns 1 on success.  Returns nothing (false) after setting errstr when
    # no source, or no valid source, was selected.
    my $self = shift;

    my $q = $self->CGI;

    my $configured = $self->config('swish_index');

    if ( $self->config('select_indexes') && ref $configured eq 'ARRAY' ) {

        my @choices = $q->param('si');
        if ( !@choices ) {
            $self->errstr('Please select a source to search');
            return;
        }

        my @indexes = @$configured;

        # Keep only choices that are valid positions in @indexes.
        my @selected_indexes = grep { /^\d+$/ && $_ >= 0 && $_ < @indexes } @choices;

        if ( !@selected_indexes ) {
            $self->errstr('Invalid source selected');
            # Must be false: the caller treats a true value as success and
            # would otherwise run swish without any -f argument.
            return;
        }

        $self->swish_command( '-f', @indexes[ @selected_indexes ] );

    } else {
        $self->swish_command( '-f', ref $configured ? @$configured : $configured );
    }

    return 1;
}
1109 |
|
|
|
1110 |
|
|
#================================================================================ |
1111 |
|
|
# Parse out the date limits from the form or from GET request |
1112 |
|
|
# |
1113 |
|
|
#--------------------------------------------------------------------------------- |
1114 |
|
|
|
1115 |
|
|
sub get_date_limits {
    # Apply the request's date range selection, if the feature is enabled.
    #   $query_ref - reference to the query string; may be replaced with a
    #                match-everything query for a date-only search.
    # Returns 1 to continue; returns nothing (with errstr set) on a bad
    # date range selection.

    my ( $self, $query_ref ) = @_;

    my $conf = $self->{config};

    # Nothing to do unless date ranges are enabled.
    return 1 unless $conf->{date_ranges};

    # Load the helper module on demand; if that fails, switch the feature off
    # and carry on without it.
    eval { require DateRanges };
    if ( my $load_error = $@ ) {
        if ( $conf->{debug} ) {
            print STDERR "\n------ Can't use DateRanges feature ------------\n",
                         "\nScript will run, but you can't use the date range feature\n",
                         $load_error,
                         "\n--------------\n";
        }

        delete $conf->{date_ranges};
        return 1;
    }

    my $q = $self->{q};
    my %limits;

    if ( !DateRanges::DateRangeParse( $q, \%limits ) ) {
        $self->errstr( $limits{DateRanges_error} || 'Bad date range selection' );
        return;
    }

    my ( $low, $high ) = @limits{qw/DateRanges_time_low DateRanges_time_high/};

    # Store the values for later
    $self->{DateRanges_time_low}  = $low;
    $self->{DateRanges_time_high} = $high;

    # Allow searches by date alone (no query words) when a date limit is set.
    # $$$ should place some limits here, and provide a switch to disable
    if ( !$$query_ref && $high ) {
        $$query_ref = 'not skaisikdeekk';
        $self->{_search_all}++;     # flag
    }

    my $limit_prop = $conf->{date_ranges}{property_name} || 'swishlastmodified';

    if ( $low && $high ) {
        $self->swish_command( '-L', $limit_prop, $low, $high );
    }

    return 1;
}
1168 |
|
|
|
1169 |
|
|
|
1170 |
|
|
|
1171 |
|
|
#================================================================ |
1172 |
|
|
# Set the sort order |
1173 |
|
|
# Just builds the -s string |
1174 |
|
|
#---------------------------------------------------------------- |
1175 |
|
|
|
1176 |
|
|
sub set_sort_order {
    # Add the "-s" sort arguments to the swish command.
    #
    # Does nothing (returns 1) when no 'sorts' are configured.  The requested
    # sort ('sort' parameter, default 'swishrank') must be one of the
    # configured sorts; otherwise errstr is set and nothing is returned.
    # 'swishrank' sorts descending by default, every other property
    # ascending; the 'reverse' parameter flips that.
    # 'secondary_sort' (scalar or array ref) is appended unless its first
    # element is the primary sort property.
    my $self = shift;

    my $q = $self->{q};

    my $sorts_array = $self->config('sorts');
    return 1 unless $sorts_array;

    my $conf = $self->{config};

    # Now set sort option - if a valid option submitted (or you could let swish-e return the error).
    my %sorts = map { $_, 1 } @$sorts_array;

    my $sortby = $q->param('sort') || 'swishrank';

    unless ( $sorts{$sortby} ) {
        $self->errstr( 'Invalid Sort Option Selected' );
        return;
    }

    my $reverse   = $q->param('reverse');
    my $direction = $sortby eq 'swishrank'
        ? ( $reverse ? 'asc'  : 'desc' )
        : ( $reverse ? 'desc' : 'asc'  );

    $self->swish_command( '-s', $sortby, $direction );

    # Normalise secondary_sort to a list first: indexing a plain scalar as an
    # array ref would die under "use strict".
    my $secondary = $conf->{secondary_sort};
    if ( $secondary ) {
        my @secondary = ref $secondary ? @$secondary : ( $secondary );

        $self->swish_command( @secondary )
            if @secondary && $sortby ne $secondary[0];
    }

    return 1;
}
1212 |
|
|
|
1213 |
|
|
|
1214 |
|
|
|
1215 |
|
|
#======================================================== |
1216 |
|
|
# Sets prev and next page links. |
1217 |
|
|
# Feel free to clean this code up! |
1218 |
|
|
# |
1219 |
|
|
# Pass: |
1220 |
|
|
# $results - reference to a hash (for access to the headers returned by swish) |
1221 |
|
|
# $q - CGI object |
1222 |
|
|
# |
1223 |
|
|
# Returns: |
1224 |
|
|
# Sets entries in the $results hash |
1225 |
|
|
# |
1226 |
|
|
|
1227 |
|
|
sub set_page {
    # Fill in the paging entries of $self->{navigation}:
    #   prev / prev_count  - start offset and size of the previous page
    #   next / next_count  - start offset and size of the next page
    #   pages              - HTML string of page-number links
    # Expects navigation->{from}, navigation->{hits}, $self->{hits} and
    # $self->{query_href} to be set already.  Offsets are zero based.

    my ( $self, $Page_Size ) = @_;

    my $nav   = $self->{navigation};
    my $start = $nav->{from} - 1;       # Current starting record

    # Previous page: only offered when we are not already at the start.
    my $prev = $start > $Page_Size ? $start - $Page_Size : 0;

    if ( $prev < $start ) {
        @{$nav}{qw/prev prev_count/} = ( $prev, $start - $prev );
    }

    # Next page: only offered when records remain after the current page.
    my $last = $nav->{hits} - 1;

    my $next = $start + $Page_Size;
    $next = $last if $next > $last;

    my $cur_end = $start + $self->{hits} - 1;

    if ( $next > $cur_end ) {
        my $remaining = $last - $next + 1;

        $nav->{next}       = $next;
        $nav->{next_count} = $next + $Page_Size > $last ? $remaining : $Page_Size;
    }

    # Index of the last page; zero means everything fits on one page and no
    # page links are generated.
    my $last_page = int( ( $nav->{hits} - 1 ) / $Page_Size );

    if ( $last_page ) {

        my @page_numbers = 0 .. $last_page;

        my $max_pages = 10;

        if ( @page_numbers > $max_pages ) {
            # Too many pages to list: show a window of $max_pages pages
            # around the current one.
            my $first_shown = int( $start / $Page_Size - $max_pages / 2 );
            $first_shown = 0 if $first_shown < 0;
            $first_shown = $last_page - $max_pages
                if $first_shown + $max_pages - 1 > $last_page;

            my $last_shown = $first_shown + $max_pages - 1;

            @page_numbers = $first_shown .. $last_shown;

            # Optionally add links to the very first and very last page.
            unshift @page_numbers, 0
                if $first_shown && !$self->{config}{no_first_page_navigation};

            push @page_numbers, $last_page
                unless $last_shown == $last_page || $self->{config}{no_last_page_navigation};
        }

        # The current page is shown as plain text, all others as links.
        my @links;
        for my $index ( @page_numbers ) {
            my $page_start = $index * $Page_Size;
            my $label      = $index + 1;

            push @links, $page_start == $start
                ? $label
                : qq[<a href="$self->{query_href}&start=$page_start">$label</a>];
        }

        $nav->{pages} = join ' ', @links;
    }

}
1295 |
|
|
|
1296 |
|
|
#================================================== |
1297 |
|
|
# Format and return the date range options in HTML |
1298 |
|
|
# |
1299 |
|
|
#-------------------------------------------------- |
1300 |
|
|
sub get_date_ranges {
    # Returns the HTML for the date range controls, or '' when date ranges
    # are not enabled.

    my $self = shift;

    my ( $q, $conf ) = @{$self}{qw/q config/};

    return '' unless $conf->{date_ranges};

    # DateRangeForm is handed the request, the date range settings, and a
    # hash for the generated form fields.
    my %fields;
    DateRanges::DateRangeForm( $q, $conf->{date_ranges}, \%fields );

    # Lay the pieces out, skipping any that were not generated.
    my $html = '<br>Limit to: ';

    $html .= "$fields{buttons}<br>"     if $fields{buttons};
    $html .= $fields{date_range_button} if $fields{date_range_button};
    $html .= " $fields{date_range_low} through $fields{date_range_high}"
        if $fields{date_range_low};

    return $html;
}
1327 |
|
|
|
1328 |
|
|
|
1329 |
|
|
|
1330 |
|
|
#============================================ |
1331 |
|
|
# Run swish-e and gathers headers and results |
1332 |
|
|
# Currently requires fork() to run. |
1333 |
|
|
# |
1334 |
|
|
# Pass: |
1335 |
|
|
# $sh - an array with search parameters |
1336 |
|
|
# |
1337 |
|
|
# Returns: |
1338 |
|
|
# a reference to a hash that contains the headers and results |
1339 |
|
|
# or possibly a scalar with an error message. |
1340 |
|
|
# |
1341 |
|
|
|
1342 |
|
|
|
1343 |
|
|
sub run_swish {
    # Run swish-e with the accumulated command and parse its output.
    #
    # On return:
    #   $self->{_headers}  - "# Name: value" header lines, keyed by lower-cased name
    #   $self->{_results}  - array ref of result hashes (only set when there are results)
    #   $self->{hits}      - number of results read
    #   $self->{COMMAND}   - the command line, for debugging
    #
    # An "err:" line from swish is stored via errstr() and ends processing.
    # Dies if swish returns output that is not recognised.

    my $self = shift;

    my $conf = $self->{config};


    # Gather up the properties to request, each one only once.
    # 'swishreccount' goes first so every result line starts with a number.

    my @properties = ( 'swishreccount' );
    my %seen       = ( swishreccount => 1 );

    for my $setting ( qw/ title_property description_prop display_props / ) {
        my $value = $conf->{$setting} or next;

        for my $prop ( ref $value ? @$value : $value ) {
            push @properties, $prop unless $seen{$prop}++;
        }
    }

    # Add in the default props
    for my $prop ( qw/swishrank swishdocpath/ ) {
        push @properties, $prop unless $seen{$prop}++;
    }


    # '\t' and '\n' are passed literally; swish expands them in the -x format.
    $self->swish_command( '-x', join( '\t', map { "<$_>" } @properties ) . '\n' );

    $self->swish_command( '-H', 9 );

    my $fh = $^O =~ /Win32/i
        ? windows_fork( $conf, $self )
        : real_fork( $conf, $self );


    $self->{COMMAND} = join ' ', $self->{prog}, $self->swish_command;


    # read in from child

    my @results;

    my $trim_prop = $self->config('description_prop');

    my $highlight = $self->config('highlight');
    my $highlight_object;

    my %stops_removed;

    my $unknown_output = '';


    # Loop through values returned from swish.

    while ( my $line = <$fh> ) {

        chomp $line;
        $line =~ tr/\r//d;

        # This will not work correctly with multiple indexes when different values are used.
        if ( $line =~ /^# ([^:]+):\s+(.+)$/ ) {
            my ( $h, $value ) = ( lc $1, $2 );

            print STDERR "$line\n" if $conf->{debug} & $SwishSearch::DEBUG_HEADERS;

            $self->{_headers}{$h} = $value;

            push @{ $self->{_headers}{'removed stopwords'} }, $value
                if $h eq 'removed stopword' && !$stops_removed{$value}++;

            next;
        } elsif ( $conf->{debug} & $SwishSearch::DEBUG_OUTPUT ) {
            print STDERR "$line\n";
        }


        # return swish errors as a message to the script
        if ( $line =~ /^err:\s*(.+)/ ) {
            $self->errstr($1);
            return;
        }

        # Or, if you want to log the errors and just say "Service Unavailable" use this:
        #die "$1\n" if $line =~ /^err:\s*(.+)/;


        # Found a result
        if ( $line =~ /^\d/ ) {

            my %h;
            @h{@properties} = split /\t/, $line;
            push @results, \%h;

            # There's a chance that the docpath could be modified by highlighting
            # when used in a "display_props".
            $h{saved_swishdocpath} = $h{swishdocpath};

            my $docpath = $h{swishdocpath};
            $docpath =~ s/\s/%20/g;  # Replace spaces
            $h{swishdocpath_href} = ( $self->config('prepend_path') || '' ) . $docpath;


            # Now do any formatting
            if ( $highlight ) {
                if ( !$highlight_object ) {
                    my $package = $highlight->{package} || 'DefaultHighlight';

                    # Convert Package::Name to Package/Name.pm; require() with
                    # a string wants a file path, not a package name.
                    ( my $file = "$package.pm" ) =~ s[::][/]g;

                    eval { require $file };
                    if ( $@ ) {
                        $self->errstr( "Failed to load Highlighting Module - check error log" );
                        warn "$0: $@";
                        $highlight = '';
                        next;
                    } else {
                        $highlight_object = $package->new( $self, $self->{metaname} );
                    }
                }

                # Highlight any fields, as needed
                $highlight_object->highlight( \%h );

                next;
            }


            # Trim down the description if no highlight, or if highlighting some other property
            # Not very nice.  The highlighting code would limit by words

            if ( $trim_prop && $h{$trim_prop} ) {
                my $max = $conf->{max_chars} || 500;

                if ( length $h{$trim_prop} > $max ) {
                    $h{$trim_prop} = substr( $h{$trim_prop}, 0, $max ) . ' <b>...</b>';
                }
            }

            next;

        } elsif ( $line =~ /^\.$/ ) {
            # A line holding a single dot ends the results.
            last;

        } else {
            next if $line =~ /^#/;
        }

        $unknown_output .= "'$line'\n";

    }

    die "Swish returned unknown output: $unknown_output\n" if $unknown_output;

    $self->{hits}     = @results;
    $self->{_results} = \@results if @results;

    return;
}
1508 |
|
|
|
1509 |
|
|
#================================================================== |
1510 |
|
|
# Run swish-e by forking |
1511 |
|
|
# |
1512 |
|
|
|
1513 |
|
|
use Symbol; |
1514 |
|
|
|
1515 |
|
|
sub real_fork {
    # Fork with a piped open and exec swish in the child.
    # Returns the filehandle the parent reads the child's output from.
    # Dies if the fork fails.
    my ( $conf, $self ) = @_;

    my $fh  = gensym;
    my $pid = open( $fh, '-|' );

    defined $pid or die "Failed to fork: $!\n";

    # Parent: hand back the read end of the pipe.
    return $fh if $pid;

    # From here on we are in the child.
    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {
        my @shown = map { /[^\/.\-\w\d]/ ? qq['$_'] : $_ } $self->{prog}, $self->swish_command;

        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( "  \\\n", @shown );
        print STDERR "\n-----------------------------------------------\n";
    }

    # exec only returns on failure.
    unless ( exec $self->{prog}, $self->swish_command ) {
        warn "Child process Failed to exec '$self->{prog}' Error: $!";
        print "Failed to exec Swish";  # send this message to parent.
        exit;
    }

    return $fh;
}
1544 |
|
|
|
1545 |
|
|
|
1546 |
|
|
#===================================================================================== |
1547 |
|
|
# Windows work around |
1548 |
|
|
# from perldoc perlfork -- na, that doesn't work. Try IPC::Open2 |
1549 |
|
|
# |
1550 |
|
|
sub windows_fork {
    # Windows replacement for real_fork(): run swish through IPC::Open2.
    # Returns the filehandle to read swish's output from, and stores the
    # child's pid in $self->{pid}.
    my ( $conf, $self ) = @_;

    # Escape double quotes on COPIES of the program name and arguments.
    # Substituting on the aliased values would modify $self->{prog} itself,
    # and with debugging on it would be escaped a second time.
    my @command = map { ( my $arg = $_ ) =~ s/"/\\"/g; $arg }
                  $self->{prog}, $self->swish_command;

    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {
        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( ' ', map { /[^.\-\w\d]/ ? qq["$_"] : $_ } @command );
        print STDERR "\n-----------------------------------------------\n";
    }


    require IPC::Open2;
    my ( $rdrfh, $wtrfh );

    my $pid = IPC::Open2::open2( $rdrfh, $wtrfh, @command );

    $self->{pid} = $pid;

    return $rdrfh;
}
1572 |
|
|
|
1573 |
|
|
#===================================================================================== |
1574 |
|
|
# This method parses out the query from the "Parsed words" returned by swish |
1575 |
|
|
# for use in highlighting routines |
1576 |
|
|
# This returns a hash ref: |
1577 |
|
|
# $query->{text} # everything is currently at level "text" |
1578 |
|
|
# {$metaname} # the meta name |
1579 |
|
|
# [ array of phrases ] |
1580 |
|
|
# each phrase is made up of an array of words |
1581 |
|
|
|
1582 |
|
|
|
1583 |
|
|
|
1584 |
|
|
|
1585 |
|
|
|
1586 |
|
|
use constant DEBUG_QUERY_PARSED => 0; |
1587 |
|
|
|
1588 |
|
|
sub extract_query_match {
    # Break the "parsed words" header returned by swish into phrases, for
    # use by the highlighting routines.
    # Returns (and stores in $self->{query_match}) a hash ref of the form
    #   { text => { $metaname => [ [ words of phrase ], ... ] } }
    # with each metaname's phrases sorted longest first.
    my $self = shift;

    my $parsed = $self->header('parsed words');  # grab query parsed by swish

    my %query_match;  # keywords broken down by layer and field.
    $self->{query_match} = \%query_match;


    # Walk through each "metaname = words" section of the parsed query.

    while ( $parsed =~ /([a-z]+)\s+=\s+(.+?)(?=\s+[a-z]+\s+=|$)/g ) {

        my ( $field, $words ) = ( $1, $2 );

        my $in_phrase;      # true while between a pair of '"' tokens
        my $phrase;         # array ref of the words collected so far
        my %seen_single;    # single words already recorded for this field

        my $layer = 'text';  # This might be used in the future to highlight tags when matching a href.

        # Expand group searches -- not currently used
        my @fields = ( $field );


        for my $token ( split /\s+/, $words ) {

            # XXX This list of swish operators could change "and or not" and is dependent on stopwords.
            # remove control words and parens
            next if !$in_phrase && $token =~ /^(and|or|not|\(|\))$/;

            # Start a fresh phrase unless we are in the middle of a quoted one.
            $phrase = [] unless $in_phrase;

            if ( $token ne '"' ) {
                push @$phrase, $token;
            }
            elsif ( $in_phrase ) {
                # closing quote -- the phrase is complete
                $in_phrase = 0;
            }
            else {
                # opening quote -- start collecting
                $in_phrase++;
                next;
            }

            next if $in_phrase;


            # Only record single words once (this will probably break something)
            # Reason: to reduce the number of matches must check
            next if @$phrase == 1 && $seen_single{ $phrase->[0] }++;


            push @{ $query_match{$layer}{$_} }, $phrase foreach @fields;

        }
    }


    # Here's a hack to make metaname expansion work
    # this will make an entry like all => [qw/ query words /]; for use with fake metanames

    my $real_metaname = $self->{real_metaname};

    if ( $real_metaname && $query_match{text}{$real_metaname} ) {
        $query_match{text}{ $self->{metaname} } = $query_match{text}{$real_metaname};
    }


    # Now, sort in descending order of phrase length

    foreach my $layer ( keys %query_match ) {
        print STDERR " LAYER: $layer\n" if DEBUG_QUERY_PARSED;

        foreach my $tag ( keys %{ $query_match{$layer} } ) {

            my $phrases = $query_match{$layer}{$tag};
            @$phrases = sort { @$b <=> @$a } @$phrases;

            if ( DEBUG_QUERY_PARSED ) {
                print STDERR " TAG: '$tag'\n";
                print STDERR " : '@$_'\n" foreach @{ $query_match{$layer}{$tag} };
            }
        }
    }


    # display parsed query instead of the title for debugging
    # use Data::Dumper;
    # $self->config('title',"<pre><font size=3>Query:\n$query\n" . Dumper(\%query_match) . '</font></pre>');


    return \%query_match;
}
1688 |
|
|
|
1689 |
|
|
|
1690 |
|
|
1; |
1691 |
|
|
|
1692 |
|
|
|
1693 |
|
|
__END__ |
1694 |
|
|
|
1695 |
|
|
=head1 NAME |
1696 |
|
|
|
1697 |
|
|
swish.cgi -- Example Perl script for searching with the SWISH-E search engine. |
1698 |
|
|
|
1699 |
|
|
=head1 DESCRIPTION |
1700 |
|
|
|
1701 |
|
|
C<swish.cgi> is a CGI script for searching with the SWISH-E search engine version 2.1-dev and above. |
1702 |
|
|
It returns results a page at a time, with matching words from the source document highlighted, showing a |
1703 |
|
|
few words of content on either side of the highlighted word. |
1704 |
|
|
|
1705 |
|
|
The script is highly configurable; you can search multiple (or selectable) indexes, limit searches to |
1706 |
|
|
part of the index, allow sorting by a number of different properties, limit results to a date range, and so on. |
1707 |
|
|
|
1708 |
|
|
The standard configuration (i.e. not using a config file) should work with most swish index files. |
1709 |
|
|
Customization of the parameters will be |
1710 |
|
|
needed if you are indexing special meta data and want to search and/or display the meta data. The |
1711 |
|
|
configuration can be modified by editing this script directly, or by using a configuration file (.swishcgi.conf |
1712 |
|
|
by default). |
1713 |
|
|
|
1714 |
|
|
You are strongly encouraged to get the default configuration working before making changes. Most problems |
1715 |
|
|
using this script are the result of configuration modifications. |
1716 |
|
|
|
1717 |
|
|
The script is modular in design.  Both the highlighting code and output generation are handled by modules, which |
1718 |
|
|
are included in the F<example/modules> directory. This allows for easy customization of the output without |
1719 |
|
|
changing the main CGI script. A module exists to generate standard HTML output. There are also modules and |
1720 |
|
|
template examples to use with the popular Perl templating systems HTML::Template and Template-Toolkit. This allows |
1721 |
|
|
you to tightly integrate this script with the look of an existing template-driven web site. |
1722 |
|
|
HTML::Template and Template-Toolkit are available from the CPAN (http://search.cpan.org). |
1723 |
|
|
|
1724 |
|
|
This script can also run basically unmodified as a mod_perl handler, providing much better performance than |
1725 |
|
|
running as a CGI script. |
1726 |
|
|
|
1727 |
|
|
Please read the rest of the documentation. There's a C<DEBUGGING> section, and a C<FAQ> section. |
1728 |
|
|
|
1729 |
|
|
This script should work on Windows, but security may be an issue. |
1730 |
|
|
|
1731 |
|
|
=head1 REQUIREMENTS |
1732 |
|
|
|
1733 |
|
|
You should be running a reasonably current version of Perl. 5.00503 or above is recommended (anything older |
1734 |
|
|
will not be supported). |
1735 |
|
|
|
1736 |
|
|
If you wish to use the date range feature you will need to install the Date::Calc module. This is available |
1737 |
|
|
from http://search.cpan.org. |
1738 |
|
|
|
1739 |
|
|
|
1740 |
|
|
=head1 INSTALLATION |
1741 |
|
|
|
1742 |
|
|
Here's an example installation session. Please get a simple installation working before modifying the |
1743 |
|
|
configuration file. Most problems reported for using this script have been due to improper configuration. |
1744 |
|
|
|
1745 |
|
|
The script's default settings are set up for initial testing. By default the settings expect to find |
1746 |
|
|
most files and the swish-e binary in the same directory as the script. |
1747 |
|
|
|
1748 |
|
|
For I<security> reasons, once you have tested the script you will want to change settings to limit access |
1749 |
|
|
to some of these files by the web server |
1750 |
|
|
(either by moving them out of web space, or using access control such as F<.htaccess>). |
1751 |
|
|
An example of using F<.htaccess> on Apache is given below. |
1752 |
|
|
|
1753 |
|
|
It's expected that you have already unpacked the swish-e distribution |
1754 |
|
|
and built the swish-e binary (if using a source distribution). |
1755 |
|
|
|
1756 |
|
|
Below is a (unix) session where we create a directory, move required files into this directory, adjust |
1757 |
|
|
permissions, index some documents, and symlink into the web server. |
1758 |
|
|
|
1759 |
|
|
=over 4 |
1760 |
|
|
|
1761 |
|
|
=item 1 Move required files into their own directory. |
1762 |
|
|
|
1763 |
|
|
This assumes that swish-e was unpacked and built in the ~/swish-e directory. |
1764 |
|
|
|
1765 |
|
|
~ >mkdir swishdir |
1766 |
|
|
~ >cd swishdir |
1767 |
|
|
~/swishdir >cp ~/swish-e/example/swish.cgi . |
1768 |
|
|
~/swishdir >cp -rp ~/swish-e/example/modules . |
1769 |
|
|
~/swishdir >cp ~/swish-e/src/swish-e . |
1770 |
|
|
~/swishdir >chmod 755 swish.cgi |
1771 |
|
|
~/swishdir >chmod 644 modules/* |
1772 |
|
|
|
1773 |
|
|
|
1774 |
|
|
=item 2 Create an index |
1775 |
|
|
|
1776 |
|
|
In this step you will create a simple configuration file. In this example the Apache documentation |
1777 |
|
|
is indexed. Last we run a simple query to test swish. |
1778 |
|
|
|
1779 |
|
|
~/swishdir >cat swish.conf |
1780 |
|
|
IndexDir /usr/local/apache/htdocs |
1781 |
|
|
IndexOnly .html .htm |
1782 |
|
|
DefaultContents HTML |
1783 |
|
|
StoreDescription HTML <body> 200000 |
1784 |
|
|
MetaNames swishdocpath swishtitle |
1785 |
|
|
|
1786 |
|
|
~/swishdir >./swish-e -c swish.conf |
1787 |
|
|
Indexing Data Source: "File-System" |
1788 |
|
|
Indexing "/usr/local/apache/htdocs" |
1789 |
|
|
Removing very common words... |
1790 |
|
|
no words removed. |
1791 |
|
|
Writing main index... |
1792 |
|
|
Sorting words ... |
1793 |
|
|
Sorting 7005 words alphabetically |
1794 |
|
|
Writing header ... |
1795 |
|
|
Writing index entries ... |
1796 |
|
|
Writing word text: Complete |
1797 |
|
|
Writing word hash: Complete |
1798 |
|
|
Writing word data: Complete |
1799 |
|
|
7005 unique words indexed. |
1800 |
|
|
5 properties sorted. |
1801 |
|
|
124 files indexed. 1485844 total bytes. 171704 total words. |
1802 |
|
|
Elapsed time: 00:00:02 CPU time: 00:00:02 |
1803 |
|
|
Indexing done! |
1804 |
|
|
|
1805 |
|
|
Now, verify that the index can be searched: |
1806 |
|
|
|
1807 |
|
|
~/swishdir >./swish-e -w install -m 1 |
1808 |
|
|
# SWISH format: 2.1-dev-25 |
1809 |
|
|
# Search words: install |
1810 |
|
|
# Number of hits: 14 |
1811 |
|
|
# Search time: 0.001 seconds |
1812 |
|
|
# Run time: 0.040 seconds |
1813 |
|
|
1000 /usr/local/apache/htdocs/manual/dso.html "Apache 1.3 Dynamic Shared Object (DSO) support" 17341 |
1814 |
|
|
. |
1815 |
|
|
|
1816 |
|
|
Let's see what files we have in our directory now: |
1817 |
|
|
|
1818 |
|
|
~/swishdir >ls -1 -F |
1819 |
|
|
index.swish-e |
1820 |
|
|
index.swish-e.prop |
1821 |
|
|
modules/ |
1822 |
|
|
swish-e* |
1823 |
|
|
swish.cgi* |
1824 |
|
|
swish.conf |
1825 |
|
|
|
1826 |
|
|
=item 3 Test the CGI script |
1827 |
|
|
|
1828 |
|
|
This is a simple step, but often overlooked. You should test from the command line instead of jumping |
1829 |
|
|
ahead and testing with the web server. See the C<DEBUGGING> section below for more information. |
1830 |
|
|
|
1831 |
|
|
~/swishdir >./swish.cgi | head |
1832 |
|
|
Content-Type: text/html; charset=ISO-8859-1 |
1833 |
|
|
|
1834 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
1835 |
|
|
<html> |
1836 |
|
|
<head> |
1837 |
|
|
<title> |
1838 |
|
|
Search our site |
1839 |
|
|
</title> |
1840 |
|
|
</head> |
1841 |
|
|
<body> |
1842 |
|
|
|
1843 |
|
|
The above shows that the script can be run directly, and generates a correct HTTP header and HTML. |
1844 |
|
|
|
1845 |
|
|
If you run the above and see something like this: |
1846 |
|
|
|
1847 |
|
|
~/swishdir >./swish.cgi |
1848 |
|
|
bash: ./swish.cgi: No such file or directory |
1849 |
|
|
|
1850 |
|
|
then you probably need to edit the script to point to the correct location of your perl program. |
1851 |
|
|
Here's one way to find out where perl is located (again, on unix): |
1852 |
|
|
|
1853 |
|
|
~/swishdir >which perl |
1854 |
|
|
/usr/local/bin/perl |
1855 |
|
|
|
1856 |
|
|
~/swishdir >/usr/local/bin/perl -v |
1857 |
|
|
This is perl, v5.6.0 built for i586-linux |
1858 |
|
|
... |
1859 |
|
|
|
1860 |
|
|
Good! We are using a reasonably current version of perl. You should be running |
1861 |
|
|
at least perl 5.005 (5.00503 really). You may have problems otherwise. |
1862 |
|
|
|
1863 |
|
|
Now that we know perl is at F</usr/local/bin/perl> we can adjust the "shebang" line |
1864 |
|
|
in the perl script (i.e. the first line of the script): |
1865 |
|
|
|
1866 |
|
|
~/swishdir >pico swish.cgi |
1867 |
|
|
(edit the #! line) |
1868 |
|
|
~/swishdir >head -1 swish.cgi |
1869 |
|
|
#!/usr/local/bin/perl -w |
1870 |
|
|
|
1871 |
|
|
=item 4 Test with your web server |
1872 |
|
|
|
1873 |
|
|
How you do this is completely dependent on your web server, and you may need to talk to your web |
1874 |
|
|
server admin to get this working. Often files with the .cgi extension are automatically set up to |
1875 |
|
|
run as CGI scripts, but not always. In other words, this step is really up to you to figure out! |
1876 |
|
|
|
1877 |
|
|
First, I create a symlink in Apache's document root to point to my test directory "swishdir". This will work |
1878 |
|
|
because I know my Apache server is configured to follow symbolic links. |
1879 |
|
|
|
1880 |
|
|
~/swishdir >su -c 'ln -s /home/bill/swishdir /usr/local/apache/htdocs/swishdir' |
1881 |
|
|
Password: ********* |
1882 |
|
|
|
1883 |
|
|
If your account is on an ISP and your web directory is F<~/public_html> then you might just move the entire |
1884 |
|
|
directory: |
1885 |
|
|
|
1886 |
|
|
mv ~/swishdir ~/public_html |
1887 |
|
|
|
1888 |
|
|
Now, let's make a real HTTP request. I happen to have Apache set up on a local port: |
1889 |
|
|
|
1890 |
|
|
~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head -3 |
1891 |
|
|
#!/usr/local/bin/perl -w |
1892 |
|
|
package SwishSearch; |
1893 |
|
|
use strict; |
1894 |
|
|
|
1895 |
|
|
Oh, darn. It looks like Apache is not running the script and instead returning it as a |
1896 |
|
|
static page. I need to tell Apache that swish.cgi is a CGI script. |
1897 |
|
|
|
1898 |
|
|
In my case F<.htaccess> comes to the rescue: |
1899 |
|
|
|
1900 |
|
|
~/swishdir >cat .htaccess |
1901 |
|
|
|
1902 |
|
|
# Deny everything by default |
1903 |
|
|
Deny From All |
1904 |
|
|
|
1905 |
|
|
# But allow just CGI script |
1906 |
|
|
<files swish.cgi> |
1907 |
|
|
Options ExecCGI |
1908 |
|
|
Allow From All |
1909 |
|
|
SetHandler cgi-script |
1910 |
|
|
</files> |
1911 |
|
|
|
1912 |
|
|
Let's try the request one more time: |
1913 |
|
|
|
1914 |
|
|
~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head |
1915 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
1916 |
|
|
<html> |
1917 |
|
|
<head> |
1918 |
|
|
<title> |
1919 |
|
|
Search our site |
1920 |
|
|
</title> |
1921 |
|
|
</head> |
1922 |
|
|
<body> |
1923 |
|
|
<h2> |
1924 |
|
|
<a href="http://swish-e.org"> |
1925 |
|
|
|
1926 |
|
|
That looks better! Now use your web browser to test. |
1927 |
|
|
|
1928 |
|
|
Make sure you look at your web server's error log file while testing the script. |
1929 |
|
|
|
1930 |
|
|
BTW - "GET" is a program included with Perl's LWP library. If you do not have this you might |
1931 |
|
|
try something like: |
1932 |
|
|
|
1933 |
|
|
wget -O - http://localhost:8000/swishdir/swish.cgi | head |
1934 |
|
|
|
1935 |
|
|
and if nothing else, you can always telnet to the web server and make a basic request. |
1936 |
|
|
|
1937 |
|
|
~/swishtest > telnet localhost 8000 |
1938 |
|
|
Trying 127.0.0.1... |
1939 |
|
|
Connected to localhost. |
1940 |
|
|
Escape character is '^]'. |
1941 |
|
|
GET /swishtest/swish.cgi http/1.0 |
1942 |
|
|
|
1943 |
|
|
HTTP/1.1 200 OK |
1944 |
|
|
Date: Wed, 13 Feb 2002 20:14:31 GMT |
1945 |
|
|
Server: Apache/1.3.20 (Unix) mod_perl/1.25_01 |
1946 |
|
|
Connection: close |
1947 |
|
|
Content-Type: text/html; charset=ISO-8859-1 |
1948 |
|
|
|
1949 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
1950 |
|
|
<html> |
1951 |
|
|
<head> |
1952 |
|
|
<title> |
1953 |
|
|
Search our site |
1954 |
|
|
</title> |
1955 |
|
|
</head> |
1956 |
|
|
<body> |
1957 |
|
|
|
1958 |
|
|
This may seem like a lot of work compared to using a browser, but browsers |
1959 |
|
|
are a poor tool for basic CGI debugging. |
1960 |
|
|
|
1961 |
|
|
|
1962 |
|
|
=back |
1963 |
|
|
|
1964 |
|
|
If you have problems check the C<DEBUGGING> section below. |
1965 |
|
|
|
1966 |
|
|
=head1 CONFIGURATION |
1967 |
|
|
|
1968 |
|
|
If you want to change the location of the swish-e binary or the index file, use multiple indexes, add additional metanames and properties, |
1969 |
|
|
change the default highlighting behavior, etc., you will need to adjust the script's configuration settings. |
1970 |
|
|
|
1971 |
|
|
Please get a test setup working with the default parameters before making changes to any configuration settings. |
1972 |
|
|
Better to debug one thing at a time... |
1973 |
|
|
|
1974 |
|
|
In general, you will need to adjust the script's settings to match the index file you are searching. For example, |
1975 |
|
|
if you are indexing a hypermail list archive you may want to make the script |
1976 |
|
|
use metanames/properties of Subject, Author, and Email address. Or you may wish to provide a way to limit |
1977 |
|
|
searches to parts of your index file (e.g. parts of your directory tree). |
1978 |
|
|
|
1979 |
|
|
To make things somewhat "simple", the configuration parameters are included near the top of the swish.cgi program. |
1980 |
|
|
That is the only place that the individual parameters are defined and explained, so you will need to open up |
1981 |
|
|
the swish.cgi script in an editor to view the options. Further questions about individual settings should |
1982 |
|
|
be referred to the swish-e discussion list. |
1983 |
|
|
|
1984 |
|
|
The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should |
1985 |
|
|
get you going. The perl hash structure may seem a bit confusing, but it makes it easy to create nested and complex |
1986 |
|
|
parameters. Syntax is important, so cut-n-paste should be your best defense if you are not a perl programmer. |
1987 |
|
|
|
1988 |
|
|
By the way, Perl has a number of quote operators. For example, to quote a string you might write: |
1989 |
|
|
|
1990 |
|
|
title => 'Search My Site', |
1991 |
|
|
|
1992 |
|
|
Some options take more than one parameter, where each parameter must be quoted. For example: |
1993 |
|
|
|
1994 |
|
|
metanames => [ 'swishdefault', 'swishtitle', 'swishdocpath' ], |
1995 |
|
|
|
1996 |
|
|
Which assigns an array ( [...] ) of three strings to the "metanames" variable. |
1997 |
|
|
Lists of quoted strings are so common in perl that there's a special operator called "qw" (quote word): |
1998 |
|
|
|
1999 |
|
|
metanames => [ qw/ swishdefault swishtitle swishdocpath / ], |
2000 |
|
|
|
2001 |
|
|
or to use parentheses as the quote characters (you can pick any): |
2002 |
|
|
|
2003 |
|
|
metanames => [ qw( swishdefault swishtitle swishdocpath ) ], |
2004 |
|
|
|
2005 |
|
|
|
2006 |
|
|
You have two options for changing the configuration settings from their default values: |
2007 |
|
|
you may edit the script directly, or you may use a configuration file. In either case, the configuration |
2008 |
|
|
settings are a basic perl hash reference. |
2009 |
|
|
|
2010 |
|
|
Using a configuration file is described below, but contains the same hash structure. |
2011 |
|
|
|
2012 |
|
|
There are many configuration settings, and some of them are commented out either by using |
2013 |
|
|
a "#" symbol, or by simply renaming the configuration directive (e.g. by adding an "x" to the parameter |
2014 |
|
|
name). |
2015 |
|
|
|
2016 |
|
|
A very basic configuration setup might look like: |
2017 |
|
|
|
2018 |
|
|
return { |
2019 |
|
|
title => 'Search the Swish-e list', # Title of your choice. |
2020 |
|
|
swish_binary => './swish-e', # Location of swish-e binary |
2021 |
|
|
swish_index => 'index.swish-e', # Location of your index file |
2022 |
|
|
}; |
2023 |
|
|
|
2024 |
|
|
Or if searching more than one index: |
2025 |
|
|
|
2026 |
|
|
return { |
2027 |
|
|
title => 'Search the Swish-e list', |
2028 |
|
|
swish_binary => './swish-e', |
2029 |
|
|
swish_index => ['index.swish-e', 'index2'], |
2030 |
|
|
}; |
2031 |
|
|
|
2032 |
|
|
Both of these examples return a reference to a perl hash ( C<return {...}> ). In the second example, |
2033 |
|
|
the multiple index files are set as an array reference. |
2034 |
|
|
|
2035 |
|
|
Note that in the example above the swish-e binary file is relative to the current directory. |
2036 |
|
|
If running under mod_perl you will typically need to use absolute paths. |
2037 |
|
|
|
2038 |
|
|
B<Using A Configuration File> |
2039 |
|
|
|
2040 |
|
|
As mentioned above, you can either edit the F<swish.cgi> script directly and modify the configuration settings, or |
2041 |
|
|
use an external configuration file. The settings in the configuration file are merged with (override) |
2042 |
|
|
the settings defined in the script. |
2043 |
|
|
|
2044 |
|
|
The advantage of using a configuration script is that you are not editing the swish.cgi script directly, and |
2045 |
|
|
downloading a new version won't mean re-editing the cgi script. Also, if running under mod_perl you can use the same |
2046 |
|
|
script loaded into Apache to manage many different search pages. |
2047 |
|
|
|
2048 |
|
|
By default, the script will attempt to read from the file F<.swishcgi.conf>. |
2049 |
|
|
For example, you might only wish to change the title used |
2050 |
|
|
in the script. Simply create a file called F<.swishcgi.conf> in the same directory as the CGI script: |
2051 |
|
|
|
2052 |
|
|
> cat .swishcgi.conf |
2053 |
|
|
# Example swish.cgi configuration script. |
2054 |
|
|
return { |
2055 |
|
|
title => 'Search Our Mailing List Archive', |
2056 |
|
|
}; |
2057 |
|
|
|
2058 |
|
|
The settings you use will depend on the index you create with swish. Here's a basic configuration: |
2059 |
|
|
|
2060 |
|
|
return { |
2061 |
|
|
title => 'Search the Apache documentation', |
2062 |
|
|
swish_binary => './swish-e', |
2063 |
|
|
swish_index => 'index.swish-e', |
2064 |
|
|
metanames => [qw/swishdefault swishdocpath swishtitle/], |
2065 |
|
|
display_props => [qw/swishtitle swishlastmodified swishdocsize swishdocpath/], |
2066 |
|
|
title_property => 'swishdocpath', |
2067 |
|
|
prepend_path => 'http://myhost/apachedocs', |
2068 |
|
|
|
2069 |
|
|
name_labels => { |
2070 |
|
|
swishdefault => 'Search All', |
2071 |
|
|
swishtitle => 'Title', |
2072 |
|
|
swishrank => 'Rank', |
2073 |
|
|
swishlastmodified => 'Last Modified Date', |
2074 |
|
|
swishdocpath => 'Document Path', |
2075 |
|
|
swishdocsize => 'Document Size', |
2076 |
|
|
}, |
2077 |
|
|
|
2078 |
|
|
}; |
2079 |
|
|
|
2080 |
|
|
The above configuration defines metanames to use on the form. |
2081 |
|
|
Searches can be limited to these metanames. |
2082 |
|
|
|
2083 |
|
|
"display_props" tells the script to display the property "swishlastmodified" (the last modified |
2084 |
|
|
date of the file), the document size, and path with the search results. |
2085 |
|
|
|
2086 |
|
|
The parameter "name_labels" is a hash (reference) |
2087 |
|
|
that is used to give friendly names to the metanames. |
2088 |
|
|
|
2089 |
|
|
Here's another example. Say you want to search either (or both) the Apache 1.3 documentation or the |
2090 |
|
|
Apache 2.0 documentation: |
2091 |
|
|
|
2092 |
|
|
return { |
2093 |
|
|
title => 'Search the Apache Documentation', |
2094 |
|
|
date_ranges => 0, |
2095 |
|
|
swish_index => [ qw/ index.apache index.apache2 / ], |
2096 |
|
|
select_indexes => { |
2097 |
|
|
method => 'checkbox_group', |
2098 |
|
|
labels => [ '1.3.23 docs', '2.0 docs' ], # Must match up one-to-one to swish_index |
2099 |
|
|
description => 'Select: ', |
2100 |
|
|
}, |
2101 |
|
|
|
2102 |
|
|
}; |
2103 |
|
|
|
2104 |
|
|
Now you can select either or both sets of documentation while searching. |
2105 |
|
|
|
2106 |
|
|
|
2107 |
|
|
Please refer to the default configuration settings near the top of the script for details on |
2108 |
|
|
the available settings. |
2109 |
|
|
|
2110 |
|
|
=head1 DEBUGGING |
2111 |
|
|
|
2112 |
|
|
Most problems with using this script have been a result of improper configuration. Please |
2113 |
|
|
get the script working with default settings before adjusting the configuration settings. |
2114 |
|
|
|
2115 |
|
|
The key to debugging CGI scripts is to run them from the command line, not with a browser. |
2116 |
|
|
|
2117 |
|
|
First, make sure the program compiles correctly: |
2118 |
|
|
|
2119 |
|
|
> perl -c swish.cgi |
2120 |
|
|
swish.cgi syntax OK |
2121 |
|
|
|
2122 |
|
|
Next, simply try running the program: |
2123 |
|
|
|
2124 |
|
|
> ./swish.cgi | head |
2125 |
|
|
Content-Type: text/html; charset=ISO-8859-1 |
2126 |
|
|
|
2127 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
2128 |
|
|
<html> |
2129 |
|
|
<head> |
2130 |
|
|
<title> |
2131 |
|
|
Search our site |
2132 |
|
|
</title> |
2133 |
|
|
</head> |
2134 |
|
|
<body> |
2135 |
|
|
|
2136 |
|
|
Now, you know that the program compiles and will run from the command line. |
2137 |
|
|
Next, try accessing the script from a web browser. |
2138 |
|
|
|
2139 |
|
|
If you see the contents of the CGI script instead of its output then your web server is |
2140 |
|
|
not configured to run the script. You will need to look at settings like ScriptAlias, SetHandler, |
2141 |
|
|
and Options. |
2142 |
|
|
|
2143 |
|
|
If an error is reported (such as Internal Server Error or Forbidden) |
2144 |
|
|
you need to locate your web server's error_log file |
2145 |
|
|
and carefully read what the problem is. Contact your web administrator for help. |
2146 |
|
|
|
2147 |
|
|
If you don't have access to the web server's error_log file, you can modify the script to report |
2148 |
|
|
errors to the browser screen. Open the script and search for "CGI::Carp". (Author's suggestion is |
2149 |
|
|
to debug from the command line -- adding the browser and web server into the equation only complicates |
2150 |
|
|
debugging.) |
2151 |
|
|
|
2152 |
|
|
The script does offer some basic debugging options that allow debugging from the command line. |
2153 |
|
|
The debugging options are enabled by setting |
2154 |
|
|
an environment variable "SWISH_DEBUG". How that is set depends on your operating system and the |
2155 |
|
|
shell you are using. These examples are using the "bash" shell syntax. |
2156 |
|
|
|
2157 |
|
|
Note: You can also use the "debug_options" configuration setting, but the recommended method |
2158 |
|
|
is to set the environment variable. |
2159 |
|
|
|
2160 |
|
|
You can list the available debugging options like this: |
2161 |
|
|
|
2162 |
|
|
>SWISH_DEBUG=help ./swish.cgi >outfile |
2163 |
|
|
Unknown debug option 'help'. Must be one of: |
2164 |
|
|
basic: Basic debugging |
2165 |
|
|
command: Show command used to run swish |
2166 |
|
|
headers: Show headers returned from swish |
2167 |
|
|
output: Show output from swish |
2168 |
|
|
summary: Show summary of results |
2169 |
|
|
dump: Show all data available to templates |
2170 |
|
|
|
2171 |
|
|
As you work your way down the list you will get more detailed output. You can combine |
2172 |
|
|
options like: |
2173 |
|
|
|
2174 |
|
|
>SWISH_DEBUG=command,headers,summary ./swish.cgi >outfile |
2175 |
|
|
|
2176 |
|
|
You will be asked for an input query and the max number of results to return. You can use the defaults |
2177 |
|
|
in most cases. It's a good idea to redirect output to a file. Any error messages are sent to stderr, so |
2178 |
|
|
those will still be displayed (unless you redirect stderr, too). |
2179 |
|
|
|
2180 |
|
|
Here are some examples: |
2181 |
|
|
|
2182 |
|
|
~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile |
2183 |
|
|
Debug level set to: 1 |
2184 |
|
|
Enter a query [all]: |
2185 |
|
|
Using 'not asdfghjklzxcv' to match all records |
2186 |
|
|
Enter max results to display [1]: |
2187 |
|
|
|
2188 |
|
|
------ Can't use DateRanges feature ------------ |
2189 |
|
|
|
2190 |
|
|
Script will run, but you can't use the date range feature |
2191 |
|
|
Can't locate Date/Calc.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at modules/DateRanges.pm line 107, <STDIN> line 2. |
2192 |
|
|
BEGIN failed--compilation aborted at modules/DateRanges.pm line 107, <STDIN> line 2. |
2193 |
|
|
Compilation failed in require at ./swish.cgi line 971, <STDIN> line 2. |
2194 |
|
|
|
2195 |
|
|
-------------- |
2196 |
|
|
Can't exec "./swish-e": No such file or directory at ./swish.cgi line 1245, <STDIN> line 2. |
2197 |
|
|
Child process Failed to exec './swish-e' Error: No such file or directory at ./swish.cgi line 1246, <STDIN> line 2. |
2198 |
|
|
Failed to find any results |
2199 |
|
|
|
2200 |
|
|
The above told me about two problems. First, it's telling me that the Date::Calc module is not installed. |
2201 |
|
|
The Date::Calc module is needed to use the date limiting feature of the script. |
2202 |
|
|
|
2203 |
|
|
The second problem is a bit more serious. It's saying that the script can't find the swish-e binary file. |
2204 |
|
|
I simply forgot to copy it. |
2205 |
|
|
|
2206 |
|
|
~/swishtest >cp ~/swish-e/src/swish-e . |
2207 |
|
|
~/swishtest >cat .swishcgi.conf |
2208 |
|
|
return { |
2209 |
|
|
title => 'Search the Apache Documentation', |
2210 |
|
|
date_ranges => 0, |
2211 |
|
|
}; |
2212 |
|
|
|
2213 |
|
|
Now, let's try again: |
2214 |
|
|
|
2215 |
|
|
~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile |
2216 |
|
|
Debug level set to: 1 |
2217 |
|
|
|
2218 |
|
|
---------- Read config parameters from '.swishcgi.conf' ------ |
2219 |
|
|
$VAR1 = { |
2220 |
|
|
'date_ranges' => 0, |
2221 |
|
|
'title' => 'Search the Apache Documentation' |
2222 |
|
|
}; |
2223 |
|
|
------------------------- |
2224 |
|
|
Enter a query [all]: |
2225 |
|
|
Using 'not asdfghjklzxcv' to match all records |
2226 |
|
|
Enter max results to display [1]: |
2227 |
|
|
Found 1 results |
2228 |
|
|
|
2229 |
|
|
Can't locate TemplateDefault.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at ./swish.cgi line 608. |
2230 |
|
|
|
2231 |
|
|
Bother. I fixed the first two problems, but now there's this new error. Oh, I somehow forgot to |
2232 |
|
|
copy the modules directory. The obvious way to fix that is to copy the directory. But, there may |
2233 |
|
|
be times where you want to put the module directory in another location. So, let's modify the |
2234 |
|
|
F<.swishcgi.conf> file and add a "use lib" setting: |
2235 |
|
|
|
2236 |
|
|
~/swishtest >cat .swishcgi.conf |
2237 |
|
|
use lib '/home/bill/swish-e/example/modules'; |
2238 |
|
|
|
2239 |
|
|
return { |
2240 |
|
|
title => 'Search the Apache Documentation', |
2241 |
|
|
date_ranges => 0, |
2242 |
|
|
}; |
2243 |
|
|
|
2244 |
|
|
~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile |
2245 |
|
|
Debug level set to: 1 |
2246 |
|
|
|
2247 |
|
|
---------- Read config parameters from '.swishcgi.conf' ------ |
2248 |
|
|
$VAR1 = { |
2249 |
|
|
'date_ranges' => 0, |
2250 |
|
|
'title' => 'Search the Apache Documentation' |
2251 |
|
|
}; |
2252 |
|
|
------------------------- |
2253 |
|
|
Enter a query [all]: |
2254 |
|
|
Using 'not asdfghjklzxcv' to match all records |
2255 |
|
|
Enter max results to display [1]: |
2256 |
|
|
Found 1 results |
2257 |
|
|
|
2258 |
|
|
Now we're talking. |
2259 |
|
|
|
2260 |
|
|
Here's a common problem. Everything checks out, but when you run the script you see the message: |
2261 |
|
|
|
2262 |
|
|
Swish returned unknown output |
2263 |
|
|
|
2264 |
|
|
Ok, let's find out what output it is returning: |
2265 |
|
|
|
2266 |
|
|
~/swishtest >SWISH_DEBUG=headers,output ./swish.cgi >outfile |
2267 |
|
|
Debug level set to: 13 |
2268 |
|
|
|
2269 |
|
|
---------- Read config parameters from '.swishcgi.conf' ------ |
2270 |
|
|
$VAR1 = { |
2271 |
|
|
'swish_binary' => '/usr/local/bin/swish-e', |
2272 |
|
|
'date_ranges' => 0, |
2273 |
|
|
'title' => 'Search the Apache Documentation' |
2274 |
|
|
}; |
2275 |
|
|
------------------------- |
2276 |
|
|
Enter a query [all]: |
2277 |
|
|
Using 'not asdfghjklzxcv' to match all records |
2278 |
|
|
Enter max results to display [1]: |
2279 |
|
|
usage: swish [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)] |
2280 |
|
|
... |
2281 |
|
|
version: 2.0 |
2282 |
|
|
docs: http://sunsite.berkeley.edu/SWISH-E/ |
2283 |
|
|
|
2284 |
|
|
*** 9872 Failed to run swish: 'Swish returned unknown output' *** |
2285 |
|
|
Failed to find any results |
2286 |
|
|
|
2287 |
|
|
Oh, looks like /usr/local/bin/swish-e is version 2.0 of swish. We need 2.1-dev and above! |
2288 |
|
|
|
2289 |
|
|
=head1 Frequently Asked Questions |
2290 |
|
|
|
2291 |
|
|
Here are some common questions and answers. |
2292 |
|
|
|
2293 |
|
|
=head2 How do I change the way the output looks? |
2294 |
|
|
|
2295 |
|
|
The script uses a module to generate output. By default it uses the TemplateDefault.pm module. |
2296 |
|
|
The module used can be selected in the configuration file. |
2297 |
|
|
|
2298 |
|
|
If you want to make simple changes you can edit the TemplateDefault.pm module directly. If you want to |
2299 |
|
|
copy a module, you must also change the "package" statement at the top of the module. For example: |
2300 |
|
|
|
2301 |
|
|
cp TemplateDefault.pm MyTemplateDefault.pm |
2302 |
|
|
|
2303 |
|
|
Then at the top of the module adjust the "package" line to: |
2304 |
|
|
|
2305 |
|
|
package MyTemplateDefault; |
2306 |
|
|
|
2307 |
|
|
To use this module you need to adjust the configuration settings (either at the top of F<swish.cgi> or in |
2308 |
|
|
a configuration file): |
2309 |
|
|
|
2310 |
|
|
|
2311 |
|
|
template => { |
2312 |
|
|
package => 'MyTemplateDefault', |
2313 |
|
|
}, |
2314 |
|
|
|
2315 |
|
|
|
2316 |
|
|
=head2 How do I use a templating system with swish.cgi? |
2317 |
|
|
|
2318 |
|
|
In addition to the TemplateDefault.pm module, the swish-e distribution includes two other Perl modules for |
2319 |
|
|
generating output using the templating systems HTML::Template and Template-Toolkit. |
2320 |
|
|
|
2321 |
|
|
Templating systems use template files to generate the HTML, and make maintaining the look of a large (or small) site |
2322 |
|
|
much easier. HTML::Template and Template-Toolkit are separate packages and can be downloaded from the CPAN. |
2323 |
|
|
See http://search.cpan.org. |
2324 |
|
|
|
2325 |
|
|
Two basic templates are provided as examples for generating output using these templating systems. |
2326 |
|
|
The example templates are located in the F<example> directory. |
2327 |
|
|
The module F<TemplateHTMLTemplate.pm> uses the file F<swish.tmpl> to generate its output, while the |
2328 |
|
|
module F<TemplateToolkit.pm> uses the F<search.tt> file. |
2329 |
|
|
|
2330 |
|
|
To use either of these modules you will need to adjust the "template" configuration setting. Examples for |
2331 |
|
|
both templating systems are provided in the configuration settings near the top of the F<swish.cgi> program. |
2332 |
|
|
|
2333 |
|
|
Use of these modules is an advanced usage of F<swish.cgi>, and they are provided as examples only. |
2334 |
|
|
|
2335 |
|
|
All of the output generation modules are passed a hash with the results from the search, plus other data used to create the |
2336 |
|
|
output page. You can see this hash by using the debugging option "dump" or by using the TemplateDumper.pm |
2337 |
|
|
module: |
2338 |
|
|
|
2339 |
|
|
~/swishtest >cat .swishcgi.conf |
2340 |
|
|
return { |
2341 |
|
|
title => 'Search the Apache Documentation', |
2342 |
|
|
template => { |
2343 |
|
|
package => 'TemplateDumper', |
2344 |
|
|
}, |
2345 |
|
|
}; |
2346 |
|
|
|
2347 |
|
|
And run a query. For example: |
2348 |
|
|
|
2349 |
|
|
http://localhost:8000/swishtest/swish.cgi?query=install |
2350 |
|
|
|
2351 |
|
|
=head2 Why are there three different highlighting modules? |
2352 |
|
|
|
2353 |
|
|
There are three highlighting modules included with the swish-e distribution. |
2354 |
|
|
Each is a trade-off of speed vs. accuracy: |
2355 |
|
|
|
2356 |
|
|
DefaultHighlight.pm - reasonably fast, but does not highlight phrases |
2357 |
|
|
PhraseHighlight.pm - reasonably slow, but is reasonably accurate |
2358 |
|
|
SimpleHighlight.pm - fast, some phrases, but least accurate |
2359 |
|
|
|
2360 |
|
|
Eh, the default is actually "PhraseHighlight.pm". Oh well. |
2361 |
|
|
|
2362 |
|
|
Optimizations to these modules are welcome! |
2363 |
|
|
|
2364 |
|
|
=head2 My ISP doesn't provide access to the web server logs |
2365 |
|
|
|
2366 |
|
|
There are a number of options. One way is to use the CGI::Carp module. Search in the |
2367 |
|
|
swish.cgi script for: |
2368 |
|
|
|
2369 |
|
|
use Carp; |
2370 |
|
|
# Or use this instead -- PLEASE see perldoc CGI::Carp for details |
2371 |
|
|
# use CGI::Carp qw(fatalsToBrowser warningsToBrowser); |
2372 |
|
|
|
2373 |
|
|
And change it to look like: |
2374 |
|
|
|
2375 |
|
|
#use Carp; |
2376 |
|
|
# Or use this instead -- PLEASE see perldoc CGI::Carp for details |
2377 |
|
|
use CGI::Carp qw(fatalsToBrowser warningsToBrowser); |
2378 |
|
|
|
2379 |
|
|
This should be only for debugging purposes, as if used in production you may end up sending |
2380 |
|
|
quite ugly and confusing messages to your browsers. |
2381 |
|
|
|
2382 |
|
|
=head2 Why does the output show (NULL)? |
2383 |
|
|
|
2384 |
|
|
The most common reason is that you did not use StoreDescription in your config file while indexing. |
2385 |
|
|
|
2386 |
|
|
StoreDescription HTML <body> 200000 |
2387 |
|
|
|
2388 |
|
|
That tells swish to store the first 200,000 characters of text extracted from the body of each document parsed |
2389 |
|
|
by the HTML parser. The text is stored as property "swishdescription". Running: |
2390 |
|
|
|
2391 |
|
|
~/swishtest > ./swish-e -T index_metanames |
2392 |
|
|
|
2393 |
|
|
will display the properties defined in your index file. |
2394 |
|
|
|
2395 |
|
|
This can happen with other properties, too. |
2396 |
|
|
For example, this will happen when you are asking for a property to display that is not defined in swish. |
2397 |
|
|
|
2398 |
|
|
~/swishtest > ./swish-e -w install -m 1 -p foo |
2399 |
|
|
# SWISH format: 2.1-dev-25 |
2400 |
|
|
# Search words: install |
2401 |
|
|
err: Unknown Display property name "foo" |
2402 |
|
|
. |
2403 |
|
|
|
2404 |
|
|
~/swishtest > ./swish-e -w install -m 1 -x 'Property foo=<foo>\n' |
2405 |
|
|
# SWISH format: 2.1-dev-25 |
2406 |
|
|
# Search words: install |
2407 |
|
|
# Number of hits: 14 |
2408 |
|
|
# Search time: 0.000 seconds |
2409 |
|
|
# Run time: 0.038 seconds |
2410 |
|
|
Property foo=(NULL) |
2411 |
|
|
. |
2412 |
|
|
|
2413 |
|
|
To check that a property exists in your index you can run: |
2414 |
|
|
|
2415 |
|
|
~/swishtest > ./swish-e -w not dkdk -T index_metanames | grep foo |
2416 |
|
|
foo : id=10 type=70 META_PROP:STRING(case:ignore) *presorted* |
2417 |
|
|
|
2418 |
|
|
Ok, in this case we see that "foo" is really defined as a property. Now let's make sure F<swish.cgi> |
2419 |
|
|
is asking for "foo" (sorry for the long lines): |
2420 |
|
|
|
2421 |
|
|
~/swishtest > SWISH_DEBUG=command ./swish.cgi > /dev/null |
2422 |
|
|
Debug level set to: 3 |
2423 |
|
|
Enter a query [all]: |
2424 |
|
|
Using 'not asdfghjklzxcv' to match all records |
2425 |
|
|
Enter max results to display [1]: |
2426 |
|
|
---- Running swish with the following command and parameters ---- |
2427 |
|
|
./swish-e \ |
2428 |
|
|
-w \ |
2429 |
|
|
'swishdefault=(not asdfghjklzxcv)' \ |
2430 |
|
|
-b \ |
2431 |
|
|
1 \ |
2432 |
|
|
-m \ |
2433 |
|
|
1 \ |
2434 |
|
|
-f \ |
2435 |
|
|
index.swish-e \ |
2436 |
|
|
-s \ |
2437 |
|
|
swishrank \ |
2438 |
|
|
desc \ |
2439 |
|
|
swishlastmodified \ |
2440 |
|
|
desc \ |
2441 |
|
|
-x \ |
2442 |
|
|
'<swishreccount>\t<swishtitle>\t<swishdescription>\t<swishlastmodified>\t<swishdocsize>\t<swishdocpath>\t<fos>\t<swishrank>\t<swishdocpath>\n' \ |
2443 |
|
|
-H \ |
2444 |
|
|
9 |
2445 |
|
|
|
2446 |
|
|
If you look carefully you will see that the -x parameter has "fos" instead of "foo", so there's our problem. |
2447 |
|
|
|
2448 |
|
|
|
2449 |
|
|
=head1 MOD_PERL |
2450 |
|
|
|
2451 |
|
|
This script can be run under mod_perl (see http://perl.apache.org). |
2452 |
|
|
This will improve the response time of the script compared to running under CGI. |
2453 |
|
|
|
2454 |
|
|
Configuration is simple. In your httpd.conf or your startup.pl file you need to |
2455 |
|
|
load the script. For example, in httpd.conf you can use a perl section: |
2456 |
|
|
|
2457 |
|
|
<perl> |
2458 |
|
|
use lib '/usr/local/apache/cgi-bin'; |
2459 |
|
|
use lib '/home/yourname/swish-e/example/modules'; |
2460 |
|
|
require "swish.cgi"; |
2461 |
|
|
</perl> |
2462 |
|
|
|
2463 |
|
|
Again, note that the paths used will depend on where you installed the script and the modules. |
2464 |
|
|
When running under mod_perl the swish.cgi script becomes a perl module, and therefore the script |
2465 |
|
|
does not need to be installed in the cgi-bin directory. (But, you can actually use the same script as |
2466 |
|
|
both a CGI script and a mod_perl module at the same time, read from the same location.) |
2467 |
|
|
|
2468 |
|
|
The above loads the script into mod_perl. Then to configure the script to run add this to your httpd.conf |
2469 |
|
|
configuration file: |
2470 |
|
|
|
2471 |
|
|
<location /search> |
2472 |
|
|
allow from all |
2473 |
|
|
SetHandler perl-script |
2474 |
|
|
PerlHandler SwishSearch |
2475 |
|
|
</location> |
2476 |
|
|
|
2477 |
|
|
Unlike CGI, mod_perl does not change the current directory to the location of the perl module, so |
2478 |
|
|
your settings for the swish binary and the path to your index files must be absolute |
2479 |
|
|
paths (or relative to the server root). |
2480 |
|
|
|
2481 |
|
|
Take a look at the C<handler()> routine in this script for ideas how to use PerlSetVar commands |
2482 |
|
|
in httpd.conf to control the script. |
2483 |
|
|
|
2484 |
|
|
Please post to the swish-e discussion list if you have any questions about running this |
2485 |
|
|
script under mod_perl. |
2486 |
|
|
|
2487 |
|
|
|
2488 |
|
|
=head1 Spidering |
2489 |
|
|
|
2490 |
|
|
There are two ways to spider with swish-e. One uses the "http" input method that uses code that's |
2491 |
|
|
part of swish. The other way is to use the new "prog" method along with a perl helper program called |
2492 |
|
|
C<spider.pl>. |
2493 |
|
|
|
2494 |
|
|
Here's an example of a configuration file for spidering with the "http" input method. |
2495 |
|
|
You can see that the configuration is not much different than the file system input method. |
2496 |
|
|
(But, don't use the http input method -- use the -S prog method shown below.) |
2497 |
|
|
|
2498 |
|
|
# Define what to index |
2499 |
|
|
IndexDir http://www.myserver.name/index.html |
2500 |
|
|
IndexOnly .html .htm |
2501 |
|
|
|
2502 |
|
|
IndexContents HTML .html .htm |
2503 |
|
|
DefaultContents HTML |
2504 |
|
|
StoreDescription HTML <body> 200000 |
2505 |
|
|
MetaNames swishdocpath swishtitle |
2506 |
|
|
|
2507 |
|
|
# Define http method specific settings -- see swish-e documentation |
2508 |
|
|
SpiderDirectory ../swish-e/src/ |
2509 |
|
|
Delay 0 |
2510 |
|
|
|
2511 |
|
|
You index with the command: |
2512 |
|
|
|
2513 |
|
|
swish-e -S http -c spider.conf |
2514 |
|
|
|
2515 |
|
|
Note that this does take longer. For example, spidering the Apache documentation on |
2516 |
|
|
a local web server with this method took over a minute, where indexing with the |
2517 |
|
|
file system took less than two seconds. Using the "prog" method can speed this up. |
2518 |
|
|
|
2519 |
|
|
Here's an example configuration file for using the "prog" input method: |
2520 |
|
|
|
2521 |
|
|
# Define the location of the spider helper program |
2522 |
|
|
IndexDir ../swish-e/prog-bin/spider.pl |
2523 |
|
|
|
2524 |
|
|
# Tell the spider what to index. |
2525 |
|
|
SwishProgParameters default http://www.myserver.name/index.html |
2526 |
|
|
|
2527 |
|
|
IndexContents HTML .html .htm |
2528 |
|
|
DefaultContents HTML |
2529 |
|
|
StoreDescription HTML <body> 200000 |
2530 |
|
|
MetaNames swishdocpath swishtitle |
2531 |
|
|
|
2532 |
|
|
Then to index you use the command: |
2533 |
|
|
|
2534 |
|
|
swish-e -c prog.conf -S prog -v 0 |
2535 |
|
|
|
2536 |
|
|
Spidering with this method took nine seconds. |
2537 |
|
|
|
2538 |
|
|
|
2539 |
|
|
=head1 Stemmed Indexes |
2540 |
|
|
|
2541 |
|
|
Many people enable a feature of swish called word stemming to provide "fuzzy" search |
2542 |
|
|
options to their users. |
2543 |
|
|
The stemming code does not actually find the "stem" of a word, but rather removes and/or replaces |
2544 |
|
|
common endings on words. |
2545 |
|
|
Stemming is far from perfect, and many words do not stem as you might expect. But, it can |
2546 |
|
|
be a helpful tool for searching your site. You may wish to create both a stemmed and non-stemmed index, and |
2547 |
|
|
provide a checkbox for selecting the index file. |
2548 |
|
|
|
2549 |
|
|
To enable a stemmed index you simply add to your configuration file: |
2550 |
|
|
|
2551 |
|
|
UseStemming yes |
2552 |
|
|
|
2553 |
|
|
If you want to use a stemmed index with this program and continue to highlight search terms you will need |
2554 |
|
|
to install a perl module that will stem words. This section explains how to do this. |
2555 |
|
|
|
2556 |
|
|
The perl module is included with the swish-e distribution. It can be found in the examples directory (where |
2557 |
|
|
you found this file) and is called something like: |
2558 |
|
|
|
2559 |
|
|
SWISH-Stemmer-0.05.tar.gz |
2560 |
|
|
|
2561 |
|
|
The module should also be available on CPAN (http://search.cpan.org/). |
2562 |
|
|
|
2563 |
|
|
Here's an example session for installing the module. (There will be quite a bit of output |
2564 |
|
|
when running make.) |
2565 |
|
|
|
2566 |
|
|
|
2567 |
|
|
% gzip -dc SWISH-Stemmer-0.05.tar.gz |tar xof - |
2568 |
|
|
% cd SWISH-Stemmer-0.05 |
2569 |
|
|
% perl Makefile.PL |
2570 |
|
|
or |
2571 |
|
|
% perl Makefile.PL PREFIX=$HOME/perl_lib |
2572 |
|
|
% make |
2573 |
|
|
% make test |
2574 |
|
|
|
2575 |
|
|
(perhaps su root at this point if you did not use a PREFIX) |
2576 |
|
|
% make install |
2577 |
|
|
% cd .. |
2578 |
|
|
|
2579 |
|
|
Use the B<PREFIX> if you do not have root access or you want to install the modules |
2580 |
|
|
in a local library. If you do use a PREFIX setting, add a C<use lib> statement to the top of this |
2581 |
|
|
swish.cgi program. |
2582 |
|
|
|
2583 |
|
|
For example: |
2584 |
|
|
|
2585 |
|
|
use lib qw( |
2586 |
|
|
/home/bmoseley/perl_lib/lib/site_perl/5.6.0 |
2587 |
|
|
/home/bmoseley/perl_lib/lib/site_perl/5.6.0/i386-linux/ |
2588 |
|
|
); |
2589 |
|
|
|
2590 |
|
|
Once the stemmer module is installed, and you are using a stemmed index, the C<swish.cgi> script will automatically |
2591 |
|
|
detect this and use the stemmer module. |
2592 |
|
|
|
2593 |
|
|
=head1 DISCLAIMER |
2594 |
|
|
|
2595 |
|
|
Please use this CGI script at your own risk. |
2596 |
|
|
|
2597 |
|
|
This script has been tested and used without problem, but you should still be aware that |
2598 |
|
|
any code running on your server represents a risk. If you have any concerns please carefully |
2599 |
|
|
review the code. |
2600 |
|
|
|
2601 |
|
|
See http://www.w3.org/Security/Faq/www-security-faq.html |
2602 |
|
|
|
2603 |
|
|
Security on Windows is questionable. |
2604 |
|
|
|
2605 |
|
|
=head1 SUPPORT |
2606 |
|
|
|
2607 |
|
|
The SWISH-E discussion list is the place to ask for any help regarding SWISH-E or this example |
2608 |
|
|
script. See http://swish-e.org. |
2609 |
|
|
|
2610 |
|
|
Before posting please review: |
2611 |
|
|
|
2612 |
|
|
http://swish-e.org/2.2/docs/INSTALL.html#When_posting_please_provide_the_ |
2613 |
|
|
|
2614 |
|
|
Please do not contact the author or any of the swish-e developers directly. |
2615 |
|
|
|
2616 |
|
|
=head1 LICENSE |
2617 |
|
|
|
2618 |
|
|
swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley search@hank.org |
2619 |
|
|
Example CGI program for searching with SWISH-E |
2620 |
|
|
|
2621 |
|
|
|
2622 |
|
|
This program is free software; you can redistribute it and/or |
2623 |
|
|
modify it under the terms of the GNU General Public License |
2624 |
|
|
as published by the Free Software Foundation; either version |
2625 |
|
|
2 of the License, or (at your option) any later version. |
2626 |
|
|
|
2627 |
|
|
This program is distributed in the hope that it will be useful, |
2628 |
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
2629 |
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
2630 |
|
|
GNU General Public License for more details. |
2631 |
|
|
|
2632 |
|
|
|
2633 |
|
|
=head1 AUTHOR |
2634 |
|
|
|
2635 |
|
|
Bill Moseley -- search@hank.org |
2636 |
|
|
|
2637 |
|
|
=cut |
2638 |
|
|
|
2639 |
|
|
|