| 1 |
adcroft |
1.1 |
#!/usr/local/bin/perl -w |
| 2 |
|
|
package SwishSearch; |
| 3 |
|
|
use strict; |
| 4 |
|
|
|
| 5 |
|
|
use lib qw( modules ); ### This may need to be adjusted! |
| 6 |
|
|
### It should point to the location of the |
| 7 |
|
|
### associated script modules directory |
| 8 |
|
|
|
| 9 |
|
|
my $DEFAULT_CONFIG_FILE = '.swishcgi.conf'; |
| 10 |
|
|
|
| 11 |
|
|
################################################################################### |
| 12 |
|
|
# |
| 13 |
|
|
# If this text is displayed on your browser then your web server |
| 14 |
|
|
# is not configured to run .cgi programs. Contact your web server administrator. |
| 15 |
|
|
# |
| 16 |
|
|
# To display documentation for this program type "perldoc swish.cgi" |
| 17 |
|
|
# |
| 18 |
|
|
# swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley swishscript@hank.org |
| 19 |
|
|
# Example CGI program for searching with SWISH-E |
| 20 |
|
|
# |
| 21 |
|
|
# This example program will only run under an OS that supports fork(). |
| 22 |
|
|
# Ok, piped opens. |
| 23 |
|
|
# |
| 24 |
|
|
# |
| 25 |
|
|
# This program is free software; you can redistribute it and/or |
| 26 |
|
|
# modify it under the terms of the GNU General Public License |
| 27 |
|
|
# as published by the Free Software Foundation; either version |
| 28 |
|
|
# 2 of the License, or (at your option) any later version. |
| 29 |
|
|
# |
| 30 |
|
|
# This program is distributed in the hope that it will be useful, |
| 31 |
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 32 |
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 33 |
|
|
# GNU General Public License for more details. |
| 34 |
|
|
# |
| 35 |
|
|
# The above lines must remain at the top of this program |
| 36 |
|
|
# |
| 37 |
|
|
# $Id: swish.cgi,v 1.33 2002/08/13 23:08:54 whmoseley Exp $ |
| 38 |
|
|
# |
| 39 |
|
|
#################################################################################### |
| 40 |
|
|
|
| 41 |
|
|
# This is written this way so the script can be used as a CGI script or a mod_perl |
| 42 |
|
|
# module without any code changes. |
| 43 |
|
|
|
| 44 |
|
|
# use CGI (); # might not be needed if using Apache::Request |
| 45 |
|
|
|
| 46 |
|
|
#================================================================================= |
| 47 |
|
|
# CGI entry point |
| 48 |
|
|
# |
| 49 |
|
|
#================================================================================= |
| 50 |
|
|
|
| 51 |
|
|
|
| 52 |
|
|
|
| 53 |
|
|
# CGI entry point.  When not running under mod_perl (where handler() is the
# entry point instead), build the default configuration, merge it with the
# optional on-disk config file and serve the request.

if ( !$ENV{MOD_PERL} ) {
    process_request( merge_read_config( default_config() ) );
}
| 62 |
|
|
|
| 63 |
|
|
|
| 64 |
|
|
|
| 65 |
|
|
|
| 66 |
|
|
#================================================================================== |
| 67 |
|
|
# This sets the default configuration parameters |
| 68 |
|
|
# |
| 69 |
|
|
# Any configuration read from disk is merged with these settings. |
| 70 |
|
|
# |
| 71 |
|
|
# Only a few settings are actually required. Some reasonable defaults are used |
| 72 |
|
|
# for most. In fact, you can probably create a complete config as: |
| 73 |
|
|
# |
| 74 |
|
|
# return { |
| 75 |
|
|
# swish_binary => '/usr/local/bin/swish-e', |
| 76 |
|
|
# swish_index => '/usr/local/share/swish/index.swish-e', |
| 77 |
|
|
# title_property => 'swishtitle', # Not required, but recommended |
| 78 |
|
|
# }; |
| 79 |
|
|
# |
| 80 |
|
|
# But, that doesn't really show all the options. |
| 81 |
|
|
# |
| 82 |
|
|
# You can modify the options below, or you can use a config file. The config file |
| 83 |
|
|
# is .swishcgi.conf by default (read from the current directory) that must return |
| 84 |
|
|
# a hash reference. For example, to create a config file that changes the default |
| 85 |
|
|
# title and index file name, plus uses Template::Toolkit to generate output |
| 86 |
|
|
# create a config file as: |
| 87 |
|
|
# |
| 88 |
|
|
# # Example config file -- returns a hash reference |
| 89 |
|
|
# { |
| 90 |
|
|
# title => 'Search Our Site', |
| 91 |
|
|
# swish_index => 'index.web', |
| 92 |
|
|
# |
| 93 |
|
|
# template => { |
| 94 |
|
|
# package => 'TemplateToolkit', |
| 95 |
|
|
# file => 'search.tt', |
| 96 |
|
|
# options => { |
| 97 |
|
|
# INCLUDE_PATH => '/home/user/swish-e/example', |
| 98 |
|
|
# }, |
| 99 |
|
|
# }, |
# }; |
| 100 |
|
|
# |
| 101 |
|
|
# |
| 102 |
|
|
#----------------------------------------------------------------------------------- |
| 103 |
|
|
|
| 104 |
|
|
#-----------------------------------------------------------------------------------
# default_config()
#
# Returns a NEW hash reference holding the built-in configuration on every
# call.  The CGI entry point and handler() merge an optional config file over
# these values with merge_read_config().
#
# NOTES
#   * This is a Perl hash structure.  Commas are important.
#   * Settings starting with "#" are commented out.  Settings whose key starts
#     with an "x"/"X" (e.g. "xtemplate", "Xselect_indexes") are disabled by
#     renaming the key; this makes it easy to switch off a multi-line setting.
#     Remove the prefix to enable the setting.
#   * If you do not understand a setting, leave the default.  Set up a test
#     with the defaults first (see "perldoc swish.cgi"): it is much easier to
#     modify a working example than to get a modified example to work.
#   * Please don't post this entire section on the swish-e list when looking
#     for help.  Send a small example, without all the comments.
#-----------------------------------------------------------------------------------
sub default_config {

    return {
        title        => 'Search our site',    # Title displayed on the search page
        swish_binary => './swish-e',          # Location of the swish-e binary

        # By default this script tries to read a config file.  Comment this
        # out if no config file is used, to save a disk stat.
        config_file => $DEFAULT_CONFIG_FILE,

        # Location of your index file; typically NOT inside your web tree.
        # To search more than one index, specify an array reference, e.g.
        #     swish_index => [ qw( index1 index2 index3 ) ],
        # See "select_indexes" below for letting the user pick indexes.
        swish_index => 'index.swish-e',

        page_size => 15,                      # Number of results per page

        # Property used as the link text of each search result:
        #     <a href="prepend_path/swishdocpath">title_property</a>
        # Typically 'swishtitle' for HTML documents, but any PropertyName
        # defined in your index can be used.  Swish returns the pathname for
        # documents that have no title.
        title_property => 'swishtitle',

        # Prepended to the file name (swishdocpath) returned by swish to build
        # the href back to the original document.  Comment out to disable.
        #prepend_path => 'http://localhost/mydocs',

        # Swish's "StoreDescription" directive saves part or all of a
        # document's contents in the index so it can be shown with the
        # results.  This can use a lot of disk space when indexing many files
        # (building swish with zlib reduces it), so test carefully.
        # This setting names the property to display as the description;
        # normally 'swishdescription'.  There is no default.
        description_prop => 'swishdescription',

        # Properties displayed in a table below each result.  Adjust if you
        # use your own PropertyNames.  There is no default.
        display_props => [qw/swishlastmodified swishdocsize swishdocpath/],

        # Properties the results can be sorted by (shown in a drop-down list).
        # Swish uses the rank as the default sort.
        sorts => [qw/swishrank swishlastmodified swishtitle swishdocpath/],

        # Sort used within the primary sort: a property name followed by a
        # direction (asc|desc).
        secondary_sort => [qw/swishlastmodified desc/],

        # MetaNames the search can be limited to, displayed as a line of radio
        # buttons.  The MetaNames must have been defined while indexing; the
        # default is to not allow any metaname selection.
        #
        # The special "swishdefault" searches any text that was not indexed as
        # a specific metaname (typically the body and title of an HTML
        # document).  To try this out, add to your swish-e config file
        #     MetaNames swishtitle swishdocpath
        # and reindex.
        #
        # Add "all" to this list to test the meta_groups feature below.
        metanames => [qw/swishdefault swishtitle swishdocpath /],

        # Another example: an email archive indexed with the metanames
        # subject, name and email (as in the swish-e discussion archive):
        #metanames => [qw/body subject name email/],

        # "meta_groups" maps a fake metaname to a list of real metanames, so
        # that a query such as
        #     all=(query words)
        # is expanded into
        #     swishdefault=(query words) OR swishtitle=(query words) OR ...
        # This is not ideal, but should work for most cases (it might fail
        # under Windows since the query is passed through the shell).
        # Note that a real "all" search is possible with nested metanames in
        # the source documents, which is most common with XML documents.
        #
        # To enable a group, add its name (here "all") to "metanames".
        meta_groups => {
            all => [qw/swishdefault swishtitle swishdocpath/],
        },

        # Maps MetaNames and PropertyNames to user-friendly names on the form.
        name_labels => {
            swishdefault      => 'Title & Body',
            swishtitle        => 'Title',
            swishrank         => 'Rank',
            swishlastmodified => 'Last Modified Date',
            swishdocpath      => 'Document Path',
            swishdocsize      => 'Document Size',
            all               => 'All',                # group of metanames

            subject => 'Message Subject',              # other examples
            name    => "Poster's Name",
            email   => "Poster's Email",
            sent    => 'Message Date',
        },

        timeout => 10,    # Limit time used by swish when fetching results - DoS protection.

        # Limit the length of the query string.  Swish has its own limit
        # (default 40); you might set swish-e's limit higher and use this
        # one to get a somewhat more friendly message.
        max_query_length => 100,

        # If "highlight" is not defined, the description is simply truncated
        # to this many *chars*.  To go by *words* instead, enable highlighting
        # and comment out show_words (a little slower).
        max_chars => 500,

        # Term highlighting in the property named by description_prop.
        # If your searches use metanames that map to displayed properties you
        # may need to adjust "meta_to_prop_map".
        highlight => {

            # Pick the highlighting module; it must be findable by Perl.

            # Ok speed, deals with stemming, but not phrases or stopwords.
            #package => 'DefaultHighlight',

            # Somewhat slow, but deals with phrases, stopwords and stemming.
            # Takes WordCharacters, IgnoreFirstChars and IgnoreLastChars
            # into consideration.
            package => 'PhraseHighlight',

            # Fast: phrases without regard to WordCharacters settings.  No
            # context display (must match in the first X words), no stemming
            # or stopword handling.
            #package => 'SimpleHighlight',

            show_words  => 10,     # Number of swish words to show around a highlighted word
            max_words   => 100,    # If no words are highlighted, show this many words
            occurrences => 6,      # Limit number of occurrences of highlighted words

            # HTML highlighting codes
            #highlight_on  => '<b>',
            #highlight_off => '</b>',
            highlight_on  => '<font style="background:#FFFF99">',
            highlight_off => '</font>',

            # Maps search metanames to the display properties to highlight.
            meta_to_prop_map => {
                swishdefault => [ qw/swishtitle swishdescription/ ],
                swishtitle   => [ qw/swishtitle/ ],
                swishdocpath => [ qw/swishdocpath/ ],
                all          => [ qw/swishtitle swishdescription swishdocpath/ ],
            },
        },

        # (Disabled by the "X" prefix.)  With more than one index file,
        # "select_indexes" lets the user choose which indexes to search.
        # When unused, all listed indexes are searched; when used, the first
        # index is the default.  The indexes must be an array reference:
        #swish_index => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ],
        Xselect_indexes => {
            #method => 'radio_group',    # pick radio_group, popup_menu, or checkbox_group
            method => 'checkbox_group',
            #method => 'popup_menu',
            columns     => 3,
            labels      => [ 'Main Index', 'Other Index', qw/ two three four/ ],    # Must match up one-to-one
            description => 'Select Site: ',
        },

        # (Disabled by the "X" prefix.)  Similar to select_indexes, but adds
        # an "AND" search on a metaname to limit results to a subset of the
        # records, e.g. 'site=(foo or bar or baz)' if foo, bar and baz were
        # selected.  Swish-e's ExtractPath works well with this; for the
        # apache docs:
        #     ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
        #     ExtractPathDefault site other
        Xselect_by_meta => {
            #method => 'radio_group',    # pick: radio_group, popup_menu, or checkbox_group
            method => 'checkbox_group',
            #method => 'popup_menu',
            columns  => 3,
            metaname => 'site',          # Can't be a metaname used elsewhere!
            values   => [qw/misc mod vhosts other/],
            labels   => {
                misc   => 'General Apache docs',
                mod    => 'Apache Modules',
                vhosts => 'Virtual hosts',
            },
            description => 'Limit search to these areas: ',
        },

        # The 'template' setting defines what generates the output.  The
        # default is "TemplateDefault", which is reasonably ugly.  Some of the
        # options above may not be available when templating, as it is up to
        # you to lay out the form and results in your template.
        #
        # The four examples below are all disabled by the "x" prefix; rename
        # ONE of them to "template" to use it.  (Because they share the key
        # "xtemplate", only the last one is actually present in the hash.)
        xtemplate => {
            package => 'TemplateDefault',
        },

        xtemplate => {
            package => 'TemplateDumper',
        },

        xtemplate => {
            package => 'TemplateToolkit',
            file    => 'search.tt',
            options => {
                INCLUDE_PATH => '/home/user/swish-e/example',
                #PRE_PROCESS => 'config',
            },
        },

        xtemplate => {
            package => 'TemplateHTMLTemplate',
            options => {
                filename          => 'swish.tmpl',
                die_on_bad_params => 0,
                loop_context_vars => 1,
                cache             => 1,
            },
        },

        # Flag saying there is no external internet connection.  The default
        # page generation links to images on swish-e.org and www.w3.org; when
        # this is set to 1 those images are not shown.  (This only affects the
        # default output module, TemplateDefault.)
        on_intranet => 0,

        # Hard-coded debugging options.  They will help you find where you
        # made your mistake ;)  Using all at once generates a lot of messages
        # on STDERR; please see the documentation before using them.
        # Typically these are set from the command line rather than here.
        # debug_options => 'basic, command, headers, output, summary, dump',

        # Package used for reading CGI parameters.  Defaults to CGI; might be
        # useful with mod_perl.
        # request_package => 'CGI',
        # request_package => 'Apache::Request',

        # Minor adjustment to page display.  The page navigation normally
        # looks like
        #     Page: 1 5 6 7 8 9 24
        # where the first and last pages are always displayed.  Set these to
        # a true value ( 1 ) to disable that.
        no_first_page_navigation => 0,
        no_last_page_navigation  => 0,

        # Date range limiting.  Requires the DateRanges.pm module from the
        # author.  Normally you limit by the last modified date, so use
        # "swishlastmodified" as the property_name.  If, for example, a mail
        # archive stores the date (a unix timestamp) as "date", then specify
        # "date" instead.
        date_ranges => {
            property_name => 'swishlastmodified',    # property name to limit by

            # What can be listed here depends on the DateRanges.pm module.
            time_periods => [
                'All',
                'Today',
                'Yesterday',
                #'Yesterday onward',
                'This Week',
                'Last Week',
                'Last 90 Days',
                'This Month',
                'Last Month',
                #'Past',
                #'Future',
                #'Next 30 Days',
            ],

            line_break => 0,
            default    => 'All',
            date_range => 1,
        },

    };

}
| 511 |
|
|
|
| 512 |
|
|
#^^^^^^^^^^^^^^^^^^^^^^^^^ end of user config ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| 513 |
|
|
#======================================================================================== |
| 514 |
|
|
|
| 515 |
|
|
|
| 516 |
|
|
|
| 517 |
|
|
#=================================================================================
# mod_perl entry point
#
# The path of the config file is read from the "Swish_Conf_File" directory
# configuration value (e.g. set with PerlSetVar).  Each configuration is
# built once per path and then served from the cache below.
#=================================================================================

my %cached_configs;    # merged configurations, keyed by config file path

# handler( $r )
#
#   $r - request object; only its dir_config() method is used here.
#
# Always returns Apache::Constants::OK().
sub handler {
    my $r = shift;

    my $config_path = $r->dir_config('Swish_Conf_File');

    # No config file configured: run with the hard-coded defaults (not cached).
    unless ($config_path) {
        process_request( default_config() );
        return Apache::Constants::OK();
    }

    # First request for this path: build the merged configuration and cache it.
    unless ( $cached_configs{$config_path} ) {
        my $defaults = default_config();
        $defaults->{config_file} = $config_path;
        $cached_configs{$config_path} = merge_read_config($defaults);
    }

    process_request( $cached_configs{$config_path} );
    return Apache::Constants::OK();
}
| 556 |
|
|
|
| 557 |
|
|
|
| 558 |
|
|
#============================================================================
# merge_read_config( $config )
#
# Initializes the debug flags, then loads the optional on-disk config file
# named by $config->{config_file} and merges it over the passed-in settings.
#
#   $config - hash reference of settings (normally from default_config()).
#
# Returns $config itself when no config file is configured, or when the
# *default* config file does not exist (it is optional).  Otherwise returns
# a NEW hash reference in which the file's top-level keys override those of
# $config (shallow merge).
#
# Dies when the config file cannot be read or compiled, or when it does not
# evaluate to a hash reference.
#============================================================================
sub merge_read_config {
    my $config = shift;

    set_default_debug_flags();
    set_debug($config);    # sets $config->{debug}

    my $file = $config->{config_file};
    return $config unless $file;

    # "do FILE" searches @INC for relative names, and since Perl 5.26 the
    # current directory is no longer part of @INC.  When the file exists
    # relative to the current directory, load it by absolute path so it is
    # found on every Perl version; otherwise keep the plain @INC search.
    require File::Spec;
    my $path = -e $file ? File::Spec->rel2abs($file) : $file;

    $! = 0;    # so a leftover errno is not reported as the load error below
    my $return = do $path;

    unless ( ref $return eq 'HASH' ) {

        # Work out the reason right away: $@ for compile errors, $! when the
        # file could not be read, otherwise the file returned the wrong thing.
        my $error = $@
            || ( defined $return ? '' : "$!" )
            || 'did not return a hash reference';

        # A missing *default* config file is not an error.
        if ( $file eq $DEFAULT_CONFIG_FILE && !-e $file ) {
            warn "Config file '$file': $!" if $config->{debug};
            return $config;
        }

        die "Config file '$file': $error";
    }

    if ( $config->{debug} || $return->{debug} ) {
        require Data::Dumper;
        print STDERR "\n---------- Read config parameters from '$file' ------\n",
            Data::Dumper::Dumper($return),
            "-------------------------\n";
    }

    set_debug($return);

    # Merge settings: keys from the config file win over the defaults.
    return { %$config, %$return };
}
| 606 |
|
|
|
| 607 |
|
|
#--------------------------------------------------------------------------------------------------
# set_default_debug_flags()
#
# Defines the debug bit masks as package variables of SwishSearch.  Every
# flag is a distinct power of two, so several can be combined in one number.
#
# Returns the value assigned to $SwishSearch::DEBUG_DUMP_DATA (32).
#--------------------------------------------------------------------------------------------------
sub set_default_debug_flags {

    (
        $SwishSearch::DEBUG_BASIC,        #  1: basic debugging
        $SwishSearch::DEBUG_COMMAND,      #  2: show command used to run swish
        $SwishSearch::DEBUG_HEADERS,      #  4: swish output headers
        $SwishSearch::DEBUG_OUTPUT,       #  8: swish output besides headers
        $SwishSearch::DEBUG_SUMMARY,      # 16: summary of results parsed
        $SwishSearch::DEBUG_DUMP_DATA,    # 32: dump data that is sent to templating modules
    ) = map { 1 << $_ } 0 .. 5;

    return $SwishSearch::DEBUG_DUMP_DATA;
}
| 618 |
|
|
|
| 619 |
|
|
|
| 620 |
|
|
|
| 621 |
|
|
|
| 622 |
|
|
#---------------------------------------------------------------------------------------------------
# set_debug( $conf )
#
# Sets $conf->{debug} from a comma-separated list of debug option names.
# The list is taken from $ENV{SWISH_DEBUG} when that is set, otherwise from
# $conf->{debug_options}.
#
#   $conf - configuration hash reference; its "debug" entry is overwritten.
#
# With no options at all, $conf->{debug} is set to 0.  Otherwise it starts at
# 1 and the $SwishSearch::DEBUG_* bit of every listed option is OR-ed in
# (option names are case-insensitive), and the resulting level is reported
# on STDERR.
#
# An unknown option name prints the list of valid options to STDERR and
# exits the program.
#---------------------------------------------------------------------------------------------------
sub set_debug {
    my $conf = shift;

    # The environment takes precedence over the configuration setting.
    my $options = $ENV{SWISH_DEBUG} || $conf->{debug_options};

    unless ($options) {
        $conf->{debug} = 0;
        return;
    }

    # option name => [ bit mask, description ]
    my %debug = (
        basic   => [ $SwishSearch::DEBUG_BASIC,     'Basic debugging' ],
        command => [ $SwishSearch::DEBUG_COMMAND,   'Show command used to run swish' ],
        headers => [ $SwishSearch::DEBUG_HEADERS,   'Show headers returned from swish' ],
        output  => [ $SwishSearch::DEBUG_OUTPUT,    'Show output from swish' ],
        summary => [ $SwishSearch::DEBUG_SUMMARY,   'Show summary of results' ],
        dump    => [ $SwishSearch::DEBUG_DUMP_DATA, 'Show all data available to templates' ],
    );

    $conf->{debug} = 1;

    for ( split /\s*,\s*/, $options ) {
        if ( exists $debug{ lc $_ } ) {
            $conf->{debug} |= $debug{ lc $_ }->[0];
            next;
        }

        # Unknown option: list the valid ones (ordered by bit value) and stop.
        print STDERR "Unknown debug option '$_'. Must be one of:\n",
            join( "\n",
                map  { sprintf( ' %10s: %10s', $_, $debug{$_}->[1] ) }
                sort { $debug{$a}->[0] <=> $debug{$b}->[0] } keys %debug ),
            "\n\n";
        exit;
    }

    print STDERR "Debug level set to: $conf->{debug}\n";
}
| 657 |
|
|
|
| 658 |
|
|
|
| 659 |
|
|
#============================================================================
# process_request( $conf )
#
# This is the main entry point, where a config hash is passed in.
#
#   $conf - configuration hash reference (see default_config()).
#
# Steps:
#   1. Load the request class ($conf->{request_package}, CGI by default)
#      and create the request object.
#   2. When $conf->{debug} is set, prompt on STDERR and read a query and a
#      page size from STDIN.  NOTE: this overwrites $conf->{page_size}.
#   3. Create a SwishQuery object and run the query.
#   4. When $conf->{debug} is set, report the results on STDERR.
#   5. Load the template class ($conf->{template}{package}, TemplateDefault
#      by default) and call its show_template() class method.
#
# If the template module cannot be loaded, the error is sent to warn(), a
# minimal HTML error page is printed and the program exits.
#============================================================================

sub process_request {
    my $conf = shift;    # configuration parameters

    # Use CGI.pm by default
    my $request_class = $conf->{request_package} || 'CGI';
    ( my $request_file = "$request_class.pm" ) =~ s[::][/]g;
    require $request_file;

    my $request_object = $request_class->new;

    if ( $conf->{debug} ) {
        print STDERR 'Enter a query [all]: ';
        my $query = <STDIN>;
        $query = '' unless defined $query;    # EOF on STDIN
        $query =~ tr/\r//d;
        chomp $query;
        unless ($query) {
            print STDERR "Using 'not asdfghjklzxcv' to match all records\n";
            $query = 'not asdfghjklzxcv';
        }

        $request_object->param( 'query', $query );

        print STDERR 'Enter max results to display [1]: ';
        my $max = <STDIN>;
        $max = '' unless defined $max;        # EOF on STDIN
        $max =~ tr/\r//d;                     # CRLF input, as for the query above
        chomp $max;
        $max = 1 unless $max && $max =~ /^\d+$/;

        $conf->{page_size} = $max;
    }

    # create search object
    my $search = SwishQuery->new(
        config  => $conf,
        request => $request_object,
    );

    # run the query
    my $results = $search->run_query;    # currently, results is just the $search object

    if ( $conf->{debug} ) {
        if ( $conf->{debug} & $SwishSearch::DEBUG_DUMP_DATA ) {
            require Data::Dumper;
            print STDERR "\n------------- Results structure passed to template ------------\n",
                Data::Dumper::Dumper($results),
                "--------------------------\n";
        }
        elsif ( $conf->{debug} & $SwishSearch::DEBUG_SUMMARY ) {
            print STDERR "\n------------- Results Summary ------------\n";
            if ( $results->{hits} ) {
                require Data::Dumper;
                print STDERR "Showing $results->{navigation}{showing} of $results->{navigation}{hits}\n",
                    Data::Dumper::Dumper( $results->{_results} );
            }
            else {
                print STDERR "** NO RESULTS **\n";
            }

            print STDERR "--------------------------\n";
        }
        else {
            print STDERR ( ($results->{hits} ? "Found $results->{hits} results\n" : "Failed to find any results\n" . $results->errstr . "\n" ),"\n" );
        }
    }

    # Pick the output module; TemplateDefault unless configured otherwise.
    my $template = $conf->{template} || { package => 'TemplateDefault' };

    my $package = $template->{package};

    my $file = "$package.pm";
    $file =~ s[::][/]g;

    eval { require $file };
    if ($@) {
        warn "$0 $@\n";
        print <<EOF;
Content-Type: text/html

<html>
<head><title>Software Error</title></head>
<body><h2>Software Error</h2><p>Please check error log</p></body>
</html>
EOF

        exit;
    }

    $package->show_template( $template, $results );
}
| 755 |
|
|
|
| 756 |
|
|
|
| 757 |
|
|
|
| 758 |
|
|
|
| 759 |
|
|
|
| 760 |
|
|
#================================================================================================== |
| 761 |
|
|
package SwishQuery; |
| 762 |
|
|
#================================================================================================== |
| 763 |
|
|
|
| 764 |
|
|
use Carp; |
| 765 |
|
|
# Or use this instead -- PLEASE see perldoc CGI::Carp for details |
| 766 |
|
|
# <opinion>CGI::Carp doesn't help that much</opinion> |
| 767 |
|
|
#use CGI::Carp; # qw(fatalsToBrowser); |
| 768 |
|
|
|
| 769 |
|
|
|
| 770 |
|
|
#--------------------------------------------------------------------------------
# new() -- constructor; it only validates the config and creates the object
#
#   Pass (as a key/value list):
#       config  => hash ref of settings; must contain true values for
#                  'swish_index' and 'swish_binary' (croaks otherwise)
#       request => the request/query object, stored in the object under 'q'
#
#   Returns:
#       the new object, blessed into the invoking class
#--------------------------------------------------------------------------------
sub new {
    my ( $class, %args ) = @_;

    my $config = $args{config};

    $config->{swish_index}
        or croak "Failed to set the swish index files in config setting 'swish_index'";

    $config->{swish_binary}
        or croak "Failed to specify 'swish_binary' in configuration";

    # The search object starts out with no hits; everything else is
    # filled in while the query is being run.
    my %object = (
        prog     => $config->{swish_binary},
        config   => $config,
        q        => $args{request},
        hits     => 0,
        MOD_PERL => $ENV{MOD_PERL},
    );

    return bless \%object, $class;
}
| 793 |
|
|
|
| 794 |
|
|
|
| 795 |
|
|
# Accessor: returns whatever is currently stored in the object's 'hits' slot.
sub hits {
    my ($self) = @_;
    return $self->{hits};
}
| 796 |
|
|
|
| 797 |
|
|
# Read, and optionally update, one configuration setting.
#
#   Pass:
#       $setting - name of the setting (required; croaks when false)
#       $value   - optional new value; it is only stored when it is true
#
#   Returns:
#       the value the setting had BEFORE this call, or undef when the
#       setting did not exist
sub config {
    my ( $self, $setting, $value ) = @_;

    croak "Failed to pass 'config' a setting" unless $setting;

    # Do not write "my $x = ... if COND": perlsyn documents that construct
    # as undefined behaviour (the lexical may keep a value from an earlier
    # call).  Use an explicit conditional expression instead.
    my $previous = exists $self->{config}{$setting}
        ? $self->{config}{$setting}
        : undef;

    $self->{config}{$setting} = $value if $value;

    return $previous;
}
| 808 |
|
|
|
| 809 |
|
|
# Look up one entry in the headers hash stored under '_headers'.
#
#   Pass:    the header name (the key into the hash)
#   Returns: the stored value, '' when the value is missing or false,
#            or nothing at all when no headers hash exists yet
sub header {
    my ( $self, $name ) = @_;

    my $headers = $self->{_headers};
    return unless ref $headers eq 'HASH';

    return $headers->{$name} || '';
}
| 815 |
|
|
|
| 816 |
|
|
|
| 817 |
|
|
# Returns the array ref stored under '_results', or undef when that slot
# is empty/false.
sub results {
    my ($self) = @_;

    my $rows = $self->{_results};

    return $rows ? $rows : undef;
}
| 822 |
|
|
|
| 823 |
|
|
# Look up one entry in the 'navigation' hash.
#
#   Pass:    the navigation key
#   Returns: the stored value (even when it is false), '' when the key does
#            not exist, or nothing when there is no navigation hash
sub navigation {
    my ( $self, $field ) = @_;

    my $nav = $self->{navigation};
    return unless ref $nav eq 'HASH';

    return '' unless exists $nav->{$field};
    return $nav->{$field};
}
| 829 |
|
|
|
| 830 |
|
|
# Accessor: returns the request object stored under 'q'.
sub CGI {
    my ($self) = @_;
    return $self->{q};
}
| 831 |
|
|
|
| 832 |
|
|
|
| 833 |
|
|
|
| 834 |
|
|
|
| 835 |
|
|
# Accumulate, or fetch, the command line arguments for swish.
#
#   With arguments:
#       appends them to the stored argument list and returns the value of
#       push (the new number of stored arguments).
#
#   Without arguments:
#       returns the stored arguments as a list (their count in scalar
#       context).  When nothing has been stored yet it returns an empty
#       list (undef in scalar context) -- never a one element list holding
#       undef, which would turn into a bogus argument when the result is
#       passed on to exec() or join().
sub swish_command {
    my ( $self, @args ) = @_;

    return push @{ $self->{swish_command} }, @args if @args;

    return unless $self->{swish_command};

    return @{ $self->{swish_command} };
}
| 845 |
|
|
|
| 846 |
|
|
|
| 847 |
|
|
# Get, and optionally set, the error message kept in '_errstr'.
#
#   Pass:    an optional new message (ignored when false)
#   Returns: the current message, or '' when there is none
sub errstr {
    my $self    = shift;
    my $message = shift;

    if ($message) {
        $self->{_errstr} = $message;
    }

    return '' unless $self->{_errstr};
    return $self->{_errstr};
}
| 855 |
|
|
|
| 856 |
|
|
|
| 857 |
|
|
|
| 858 |
|
|
|
| 859 |
|
|
|
| 860 |
|
|
|
| 861 |
|
|
#============================================
# Build the swish command from the request, run it, and fill in the fields
# used for display: query_href, my_url, hits and the navigation hash.
#
# Returns:
#   $self in every case (just in case we want to separate out into two
#   objects later).  When one of the set-up steps fails the method returns
#   early; when running swish dies (including a timeout) the error is sent
#   to the log with warn() and errstr is set to
#   "Service currently unavailable".

sub run_query {

    my $self = shift;

    my $q    = $self->{q};
    my $conf = $self->{config};


    # Sets the query string, and any -L limits.
    return $self unless $self->build_query;


    # Set the starting position (which is offset by one)

    my $start = $q->param('start') || 0;
    $start = 0 unless $start =~ /^\d+$/ && $start >= 0;

    $self->swish_command( '-b', $start + 1 );


    # Set the max hits

    my $page_size = $self->config('page_size') || 15;
    $self->swish_command( '-m', $page_size );


    return $self unless $self->set_index_file;


    # Set the sort option, if any
    return $self unless $self->set_sort_order;


    my $timeout   = $self->config('timeout') || 0;
    my $can_alarm = $^O !~ /Win32/i;

    eval {
        local $SIG{ALRM} = sub { die "Timed out\n" };
        alarm $timeout if $timeout && $can_alarm;
        $self->run_swish;
        alarm 0 if $can_alarm;
        waitpid $self->{pid}, 0 if $self->{pid};    # for IPC::Open2
    };
    my $error = $@;

    # If run_swish died for a reason other than the timeout, the "alarm 0"
    # inside the eval was never reached.  Cancel the timer here so it cannot
    # fire later, when the local ALRM handler is no longer installed.
    alarm 0 if $can_alarm;

    if ( $error ) {
        warn "$0 $error";    # if $conf->{debug};
        $self->errstr( "Service currently unavailable" );
        return $self;
    }


    my $hits = $self->hits;
    return $self unless $hits;


    # Build href for repeated search via GET (forward, backward links)
    # param() is forced into scalar context so that a multi-valued
    # parameter can never expand into extra arguments to escape().

    my @query_string =
        map  { "$_=" . $q->escape( scalar $q->param($_) ) }
        grep { $q->param($_) } qw/query metaname sort reverse/;


    for my $p ( qw/si sbm/ ) {
        my @settings = $q->param($p);
        next unless @settings;
        push @query_string, "$p=" . $q->escape( $_ ) for @settings;
    }


    if ( $conf->{date_ranges} ) {
        my $dr = DateRanges::GetDateRangeArgs( $q );
        push @query_string, $dr if $dr;
    }


    $self->{query_href} = $q->script_name . '?' . join '&', @query_string;


    # Return the template fields

    $self->{my_url} = $q->script_name;

    $self->{hits} = $hits;

    $self->{navigation} = {
        showing     => $hits,
        from        => $start + 1,
        to          => $start + $hits,
        hits        => $self->header('number of hits') || 0,
        run_time    => $self->header('run time')       || 'unknown',
        search_time => $self->header('search time')    || 'unknown',
    };


    $self->set_page( $page_size );

    return $self;

}
| 973 |
|
|
|
| 974 |
|
|
|
| 975 |
|
|
#============================================================
# Build the query string that is handed to swish
#   Just builds the -w string (get_date_limits, called from here,
#   may add to the command on its own).
#
# Side effects:
#   stores the trimmed query back into the request ('query') and into
#   $self->{query_simple}; sets $self->{metaname} and, for a meta group,
#   $self->{real_metaname}.
#
# Returns:
#   1 on success, nothing on failure (errstr is set, except for an empty
#   query that was not submitted with the 'submit' parameter)
#------------------------------------------------------------

sub build_query {
    my $self = shift;

    my $cgi    = $self->{q};
    my $config = $self->{config};


    # Fetch the query from the request and trim white space off both ends.
    my $query = $cgi->param('query') || '';
    $query =~ s/\s+$//;
    $query =~ s/^\s+//;

    $self->{query_simple} = $query;      # without metaname
    $cgi->param( 'query', $query );      # clean up the query, if needed.


    # Read in the date limits, if any.  This can create a new query
    $self->get_date_limits( \$query ) or return;


    if ( !$query ) {
        $self->errstr('Please enter a query string') if $cgi->param('submit');
        return;
    }


    if ( length( $query ) > $config->{max_query_length} ) {
        $self->errstr('Please enter a shorter query');
        return;
    }


    # Adjust the query string for metaname search
    # *Everything* is a metaname search
    # Might also like to allow searching more than one metaname at the same time

    my $metaname = $cgi->param('metaname') || 'swishdefault';


    # make sure it's a valid metaname: 'swishdefault' plus the configured ones

    my %is_valid_meta = ( swishdefault => 1 );
    if ( my $configured = $self->config('metanames') ) {
        $is_valid_meta{$_} = 1 for @$configured;
    }

    if ( !$is_valid_meta{$metaname} ) {
        $self->errstr('Bad MetaName provided');
        return;
    }


    # prepend metaname to query

    my $group = $config->{meta_groups} && $config->{meta_groups}{$metaname};

    if ( $group ) {
        $query = join ' OR ', map { "$_=($query)" } @$group;

        # This is used to create a fake entry in the parsed query so highlighting
        # can find the query words
        $self->{real_metaname} = $group->[0];
    }
    else {
        $query = $metaname . "=($query)";
    }

    # save the metaname so we know what field to highlight
    # Note that this might be a fake metaname
    $self->{metaname} = $metaname;


    ## Look for a "limit" metaname -- perhaps used with ExtractPath
    # Here we don't worry about user supplied data

    my $select_by_meta = $self->config('select_by_meta');
    my @selected       = $cgi->param('sbm');    # Select By Metaname


    # Note that this could be messed up by ending the query in a NOT or OR
    # Should look into doing:
    # $query = "( $query ) AND " . $limits->{metaname} . '=(' . join( ' OR ', @limits ) . ')';
    if ( @selected && ref $select_by_meta eq 'HASH' && $select_by_meta->{metaname} ) {
        my $alternatives = join ' or ', @selected;
        $query .= ' and ' . $select_by_meta->{metaname} . '=(' . $alternatives . ')';
    }


    $self->swish_command( '-w', $query );

    return 1;
}
| 1070 |
|
|
|
| 1071 |
|
|
#========================================================================
# Get the index files from the form, or simply from the config settings
#
# When 'select_indexes' is set and 'swish_index' is an array ref, the 'si'
# request parameters are taken as positions in that array and only the
# selected index files are passed to swish with -f.  Otherwise every
# configured index is used.
#
# Returns:
#   1 on success, nothing on failure (errstr is set)
#------------------------------------------------------------------------

sub set_index_file {
    my $self = shift;

    my $q       = $self->CGI;
    my $indexes = $self->config('swish_index');

    # No selection offered: search everything that is configured.
    unless ( $self->config('select_indexes') && ref $indexes eq 'ARRAY' ) {
        $self->swish_command( '-f', ref $indexes ? @$indexes : $indexes );
        return 1;
    }

    my @choices = $q->param('si');
    if ( !@choices ) {
        $self->errstr('Please select a source to search');
        return;
    }

    # Only accept values that are valid positions in the index list.
    my @selected_indexes = grep { /^\d+$/ && $_ >= 0 && $_ < @$indexes } @choices;

    if ( !@selected_indexes ) {
        $self->errstr('Invalid source selected');

        # This must be a false return.  It used to return $self, which the
        # caller treated as success and went on to run swish without -f.
        return;
    }

    $self->swish_command( '-f', @{$indexes}[ @selected_indexes ] );

    return 1;
}
| 1109 |
|
|
|
| 1110 |
|
|
#================================================================================
# Parse out the date limits from the form or from GET request
#
# Pass:
#   $query_ref - reference to the query string; when the query is empty but
#                an upper date limit was parsed, it is replaced by a
#                placeholder query and $self->{_search_all} is incremented
#
# Returns:
#   1 when there is nothing to do or the limits were handled,
#   nothing when the date range could not be parsed (errstr is set)
#---------------------------------------------------------------------------------

sub get_date_limits {

    my ( $self, $query_ref ) = @_;

    my $config = $self->{config};

    # Are date ranges enabled?
    return 1 unless $config->{date_ranges};


    # The feature needs the DateRanges module.  Without it the feature is
    # switched off for the rest of the request and the search carries on.
    eval { require DateRanges };
    if ( $@ ) {
        if ( $config->{debug} ) {
            print STDERR "\n------ Can't use DateRanges feature ------------\n",
                         "\nScript will run, but you can't use the date range feature\n",
                         $@,
                         "\n--------------\n";
        }

        delete $config->{date_ranges};
        return 1;
    }


    my %parsed;

    if ( !DateRanges::DateRangeParse( $self->{q}, \%parsed ) ) {
        $self->errstr( $parsed{DateRanges_error} || 'Bad date range selection' );
        return;
    }

    my ( $time_low, $time_high ) = @parsed{qw/DateRanges_time_low DateRanges_time_high/};

    # Store the values for later

    $self->{DateRanges_time_low}  = $time_low;
    $self->{DateRanges_time_high} = $time_high;


    # Allow searches just by date if not an "All dates" search
    # $$$ should place some limits here, and provide a switch to disable
    if ( !$$query_ref && $time_high ) {
        $$query_ref = 'not skaisikdeekk';
        $self->{_search_all}++;    # flag
    }


    my $property = $config->{date_ranges}{property_name} || 'swishlastmodified';

    $self->swish_command( '-L', $property, $time_low, $time_high )
        if $time_low && $time_high;

    return 1;
}
| 1168 |
|
|
|
| 1169 |
|
|
|
| 1170 |
|
|
|
| 1171 |
|
|
#================================================================
# Set the sort order
#   Just builds the -s string
#
# Nothing is done (and 1 is returned) when the 'sorts' setting is not
# configured.  Otherwise the 'sort' request parameter (default
# 'swishrank') must be one of the configured sorts.  'swishrank' sorts
# descending and every other property ascending, unless the 'reverse'
# parameter is set.  The 'secondary_sort' setting, which may be a single
# string or an array ref, is appended unless its first entry is the
# property already being sorted on.
#
# Returns:
#   1 on success, nothing on failure (errstr is set)
#----------------------------------------------------------------

sub set_sort_order {
    my $self = shift;

    my $q = $self->{q};

    my $sorts_array = $self->config('sorts');
    return 1 unless $sorts_array;


    my $conf = $self->{config};


    # Now set sort option - if a valid option submitted (or you could let swish-e return the error).
    my %sorts = map { $_, 1 } @$sorts_array;

    my $sortby = $q->param('sort') || 'swishrank';

    unless ( $sorts{$sortby} ) {
        $self->errstr( 'Invalid Sort Option Selected' );
        return;
    }

    my $reverse   = $q->param('reverse');
    my $direction = $sortby eq 'swishrank'
        ? ( $reverse ? 'asc'  : 'desc' )
        : ( $reverse ? 'desc' : 'asc'  );

    $self->swish_command( '-s', $sortby, $direction );


    # Normalize the setting into a list BEFORE looking at its first entry.
    # Indexing a plain string as an array dies under "use strict".
    my $secondary = $conf->{secondary_sort};
    my @secondary = !$secondary    ? ()
                  : ref $secondary ? @$secondary
                  :                  ( $secondary );

    if ( @secondary && $sortby ne $secondary[0] ) {
        $self->swish_command( @secondary );
    }

    return 1;
}
| 1212 |
|
|
|
| 1213 |
|
|
|
| 1214 |
|
|
|
| 1215 |
|
|
#========================================================
# Sets prev and next page links.
#
# Pass:
#   $Page_Size - the number of results shown per page
#
# Reads:
#   $self->{navigation}{from} and {hits}, $self->{hits},
#   $self->{query_href}, and the config settings
#   'no_first_page_navigation' / 'no_last_page_navigation'
#
# Sets, in $self->{navigation}, where they apply:
#   prev, prev_count - start and size of the previous page
#   next, next_count - start and size of the next page
#   pages            - HTML links to the individual pages
#

sub set_page {

    my ( $self, $Page_Size ) = @_;

    my $nav = $self->{navigation};

    my $first = $nav->{from} - 1;    # zero based start of the current page
    my $final = $nav->{hits} - 1;    # zero based position of the very last hit


    # Previous page: step back one page, but never before the first record.

    my $back = $first - $Page_Size;
    $back = 0 if $back < 0;

    if ( $back < $first ) {
        $nav->{prev}       = $back;
        $nav->{prev_count} = $first - $back;
    }


    # Next page: step forward one page, but never past the last record.
    # There is a next page only when that lands after the records shown now.

    my $forward = $first + $Page_Size;
    $forward = $final if $forward > $final;

    my $shown_end = $first + $self->{hits} - 1;

    if ( $forward > $shown_end ) {
        $nav->{next}       = $forward;
        $nav->{next_count} =
            $forward + $Page_Size > $final
                ? $final - $forward + 1
                : $Page_Size;
    }


    # Page links.  $last_page is the zero based number of the last page,
    # so it is false when everything fits on a single page.

    my $last_page = int( ( $nav->{hits} - 1 ) / $Page_Size );

    if ( $last_page ) {

        my @page_list = ( 0 .. $last_page );

        my $window = 10;    # most page links to show in a row

        if ( @page_list > $window ) {
            my $low = int( $first / $Page_Size - $window / 2 );
            $low = 0 if $low < 0;
            $low = $last_page - $window if $low + $window - 1 > $last_page;

            my $high = $low + $window - 1;

            @page_list = ( $low .. $high );

            unshift @page_list, 0
                if $low && !$self->{config}{no_first_page_navigation};

            push @page_list, $last_page
                unless $high == $last_page || $self->{config}{no_last_page_navigation};
        }


        # The current page is shown as a plain number, the others as links.
        my @links;
        for my $index ( @page_list ) {
            my $page_start = $index * $Page_Size;
            my $page       = $index + 1;

            push @links,
                $page_start == $first
                    ? $page
                    : qq[<a href="$self->{query_href}&start=$page_start">$page</a>];
        }

        $nav->{pages} = join ' ', @links;
    }

}
| 1295 |
|
|
|
| 1296 |
|
|
#==================================================
# Format and return the date range options in HTML
#
# Returns:
#   '' when the 'date_ranges' setting is not enabled, otherwise an HTML
#   fragment built from the fields filled in by DateRanges::DateRangeForm
#--------------------------------------------------
sub get_date_ranges {

    my $self = shift;

    my $config = $self->{config};

    return '' unless $config->{date_ranges};

    # pass parameters, and a hash to store the returned values.

    my %fields;

    DateRanges::DateRangeForm( $self->{q}, $config->{date_ranges}, \%fields );


    # Set the layout: each piece is left out when its field is empty.

    my @pieces = ( '<br>Limit to: ' );

    push @pieces, $fields{buttons} ? "$fields{buttons}<br>" : '';

    push @pieces, $fields{date_range_button} || '';

    push @pieces, $fields{date_range_low}
        ? " $fields{date_range_low} through $fields{date_range_high}"
        : '';

    return join '', @pieces;
}
| 1327 |
|
|
|
| 1328 |
|
|
|
| 1329 |
|
|
|
| 1330 |
|
|
#============================================
# Run swish-e and gather the headers and results
#   Uses real_fork() (a piped open), or windows_fork() on Win32.
#
# Pass:
#   nothing besides the object itself
#
# Sets in the object:
#   COMMAND   - the command that was run, as one string
#   _headers  - the "# name: value" header lines (names in lower case)
#   hits      - the number of results read
#   _results  - array ref with one hash per result (only when there are any)
#   errstr    - when swish reports an "err:" line (the method then returns
#               at once) or the highlighting module can not be loaded
#
# Dies when swish prints output that is not recognized.
#

sub run_swish {

    my $self = shift;

    my $conf = $self->{config};


    # Gather up the properties to request from swish, each one only once.
    # swishreccount goes first because a result line must begin with a number.

    my @properties = ( 'swishreccount' );
    my %seen       = ( swishreccount => 1 );

    for my $setting ( qw/ title_property description_prop display_props / ) {
        my $value = $conf->{$setting};
        next unless $value;

        # %seen is keyed on the property NAME so the default properties
        # below are not requested a second time.
        for my $prop ( ref $value ? @$value : $value ) {
            push @properties, $prop unless $seen{$prop}++;
        }
    }

    # Add in the default props
    for my $prop ( qw/swishrank swishdocpath/ ) {
        push @properties, $prop unless $seen{$prop}++;
    }


    # The \t and \n are deliberately in single quotes: they are passed to
    # swish as written and it is swish that turns them into tab and newline.
    $self->swish_command( '-x', join( '\t', map { "<$_>" } @properties ) . '\n' );

    $self->swish_command( '-H', 9 );

    my $fh = $^O =~ /Win32/i
        ? windows_fork( $conf, $self )
        : real_fork( $conf, $self );


    $self->{COMMAND} = join ' ', $self->{prog}, $self->swish_command;


    # read in from child

    my @results;

    my $trim_prop = $self->config('description_prop');

    my $highlight = $self->config('highlight');
    my $highlight_object;

    my %stops_removed;

    my $unknown_output = '';


    # Loop through values returned from swish.  A named variable is used
    # rather than $_ because modules are loaded and called inside the loop.

    LINE:
    while ( my $line = <$fh> ) {

        chomp $line;
        $line =~ tr/\r//d;

        # This will not work correctly with multiple indexes when different values are used.
        if ( $line =~ /^# ([^:]+):\s+(.+)$/ ) {
            my ( $name, $value ) = ( lc $1, $2 );

            print STDERR "$line\n" if $conf->{debug} & $SwishSearch::DEBUG_HEADERS;

            $self->{_headers}{$name} = $value;

            push @{ $self->{_headers}{'removed stopwords'} }, $value
                if $name eq 'removed stopword' && !$stops_removed{$value}++;

            next LINE;
        }

        print STDERR "$line\n" if $conf->{debug} & $SwishSearch::DEBUG_OUTPUT;


        # return swish errors as a message to the script
        # Or, if you want to log the errors and just say "Service Unavailable"
        # die with the message here instead.
        if ( $line =~ /^err:\s*(.+)/ ) {
            $self->errstr($1);
            return;
        }


        # Found a result
        if ( $line =~ /^\d/ ) {

            my %h;
            @h{@properties} = split /\t/, $line;
            push @results, \%h;

            # There's a chance that the docpath could be modified by highlighting
            # when used in a "display_props".
            $h{saved_swishdocpath} = $h{swishdocpath};

            my $docpath = $h{swishdocpath};
            $docpath =~ s/\s/%20/g;    # Replace spaces
            $h{swishdocpath_href} = ( $self->config('prepend_path') || '' ) . $docpath;


            # Now do any formatting
            if ( $highlight ) {
                if ( !$highlight_object ) {
                    my $package = $highlight->{package} || 'DefaultHighlight';

                    # Turn Foo::Bar into Foo/Bar.pm, the same way the
                    # template module is loaded.
                    ( my $file = "$package.pm" ) =~ s[::][/]g;

                    eval { require $file };
                    if ( $@ ) {
                        $self->errstr( "Failed to load Highlighting Module - check error log" );
                        warn "$0: $@";
                        $highlight = '';
                        next LINE;
                    }

                    $highlight_object = $package->new( $self, $self->{metaname} );
                }

                # Highlight any fields, as needed
                $highlight_object->highlight( \%h );

                next LINE;
            }


            # Trim down the description if no highlight, or if highlighting some other property
            # Not very nice.  The highlighting code would limit by words

            if ( $trim_prop && $h{$trim_prop} ) {
                my $max = $conf->{max_chars} || 500;

                if ( length $h{$trim_prop} > $max ) {
                    $h{$trim_prop} = substr( $h{$trim_prop}, 0, $max ) . ' <b>...</b>';
                }
            }

            next LINE;
        }

        # A lone dot marks the end of the results.
        last LINE if $line =~ /^\.$/;

        # Any other comment line is ignored.
        next LINE if $line =~ /^#/;

        $unknown_output .= "'$line'\n";
    }

    die "Swish returned unknown output: $unknown_output\n" if $unknown_output;

    $self->{hits} = @results;
    $self->{_results} = \@results if @results;

}
| 1508 |
|
|
|
| 1509 |
|
|
#==================================================================
# Run swish-e by forking
#
# Pass:
#   $conf - the configuration hash (only 'debug' is looked at here)
#   $self - the search object; supplies the program and its arguments
#
# Returns:
#   in the parent, the file handle to read the child's output from.
#   The child replaces itself with swish and does not return.
#

use Symbol;

sub real_fork {
    my ( $conf, $self ) = @_;


    # Run swish: a piped open to '-' forks, with the child's STDOUT
    # connected to the handle in the parent.
    my $reader = gensym;
    my $child  = open( $reader, '-|' );

    defined $child or die "Failed to fork: $!\n";

    # Non-zero pid: this is the parent.
    return $reader if $child;


    # From here on we are in the child.

    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {
        my @shown = map { /[^\/.\-\w\d]/ ? qq['$_'] : $_ } $self->{prog}, $self->swish_command;

        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( " \\\n", @shown );
        print STDERR "\n-----------------------------------------------\n";
    }


    # exec only returns when it failed.
    unless ( exec $self->{prog}, $self->swish_command ) {
        warn "Child process Failed to exec '$self->{prog}' Error: $!";
        print "Failed to exec Swish";    # send this message to parent.
        exit;
    }

    return $reader;
}
| 1544 |
|
|
|
| 1545 |
|
|
|
| 1546 |
|
|
#=====================================================================================
# Windows work around
# from perldoc perlfork -- na, that doesn't work.  Try IPC::Open2
#
# Pass:
#   $conf - the configuration hash (only 'debug' is looked at here)
#   $self - the search object; supplies the program and its arguments
#
# Returns:
#   the file handle to read swish's output from.  The process id is saved
#   in $self->{pid}.
#
sub windows_fork {
    my ( $conf, $self ) = @_;


    # Escape embedded double quotes.  This is done once, on COPIES of the
    # values: substituting on $_ inside map would change $self->{prog}
    # itself, and escape it a second time when debugging is switched on.
    my @command =
        map { ( my $arg = $_ ) =~ s/"/\\"/g; $arg }
        $self->{prog}, $self->swish_command;


    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {
        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( ' ', map { /[^.\-\w\d]/ ? qq["$_"] : $_ } @command );
        print STDERR "\n-----------------------------------------------\n";
    }


    require IPC::Open2;
    my ( $rdrfh, $wtrfh );

    my $pid = IPC::Open2::open2( $rdrfh, $wtrfh, @command );


    $self->{pid} = $pid;

    return $rdrfh;
}
| 1572 |
|
|
|
| 1573 |
|
|
#=====================================================================================
# This method parses out the query from the "Parsed words" returned by swish
# for use in highlighting routines
#
# Returns a hash ref, which is also saved in $self->{query_match}:
#   $query->{text}           # everything is currently at level "text"
#                 {$metaname}    # the meta name
#                     [ array of phrases ]
#                         each phrase is made up of an array of words
#
# The phrases of each meta name are sorted with the longest phrase first.
#

use constant DEBUG_QUERY_PARSED => 0;

sub extract_query_match {
    my $self = shift;

    my $parsed = $self->header('parsed words');    # grab query parsed by swish


    my %query_match;    # keywords broken down by layer and field.
    $self->{query_match} = \%query_match;

    # This might be used in the future to highlight tags when matching a href.
    my $layer = 'text';


    # Loop through the query, one "field = words" section at a time

    while ( $parsed =~ /([a-z]+)\s+=\s+(.+?)(?=\s+[a-z]+\s+=|$)/g ) {

        my ( $field, $words ) = ( $1, $2 );

        my $in_phrase;      # true while between a pair of double quotes
        my $phrase;         # array ref of the words collected so far
        my %seen_single;    # single words already recorded for this section

        WORD:
        for my $word ( split /\s+/, $words ) {

            # XXX This list of swish operators could change "and or not" and is dependent on stopwords.
            # remove control words and parens
            next WORD if !$in_phrase && $word =~ /^(and|or|not|\(|\))$/;

            # Outside of quotes every word starts a new phrase.
            $phrase = [] unless $in_phrase;

            if ( $word ne '"' ) {
                push @$phrase, $word;
            }
            elsif ( $in_phrase ) {
                $in_phrase = 0;     # closing quote: the phrase is complete
            }
            else {
                $in_phrase++;       # opening quote: start collecting
                next WORD;
            }

            next WORD if $in_phrase;


            # Only record single words once (this will probably break something)
            # Reason: to reduce the number of matches must check
            next WORD if @$phrase == 1 && $seen_single{ $phrase->[0] }++;

            push @{ $query_match{$layer}{$field} }, $phrase;
        }
    }


    # Here's a hack to make metaname expansion work
    # this will make an entry like all => [qw/ query words /]; for use with fake metanames

    $query_match{text}{ $self->{metaname} } = $query_match{text}{ $self->{real_metaname} }
        if $self->{real_metaname} && $query_match{text}{ $self->{real_metaname} };


    # Now, sort in descending order of phrase length

    for my $layer_name ( keys %query_match ) {
        print STDERR " LAYER: $layer_name\n" if DEBUG_QUERY_PARSED;

        my $by_tag = $query_match{$layer_name};

        for my $tag ( keys %$by_tag ) {
            my $phrases = $by_tag->{$tag};

            @$phrases = sort { @$b <=> @$a } @$phrases;

            next unless DEBUG_QUERY_PARSED;

            print STDERR " TAG: '$tag'\n";
            print STDERR " : '@$_'\n" foreach @$phrases;
        }
    }


    # display parsed query instead of the title for debugging
    # use Data::Dumper;
    # $self->config('title',"<pre><font size=3>Query:\n$query\n" . Dumper(\%query_match) . '</font></pre>');


    return \%query_match;
}
| 1688 |
|
|
|
| 1689 |
|
|
|
| 1690 |
|
|
1; |
| 1691 |
|
|
|
| 1692 |
|
|
|
| 1693 |
|
|
__END__ |
| 1694 |
|
|
|
| 1695 |
|
|
=head1 NAME |
| 1696 |
|
|
|
| 1697 |
|
|
swish.cgi -- Example Perl script for searching with the SWISH-E search engine. |
| 1698 |
|
|
|
| 1699 |
|
|
=head1 DESCRIPTION |
| 1700 |
|
|
|
| 1701 |
|
|
C<swish.cgi> is a CGI script for searching with the SWISH-E search engine version 2.1-dev and above. |
| 1702 |
|
|
It returns results a page at a time, with matching words from the source document highlighted, showing a |
| 1703 |
|
|
few words of content on either side of the highlighted word. |
| 1704 |
|
|
|
| 1705 |
|
|
The script is highly configurable; you can search multiple (or selectable) indexes, limit searches to |
| 1706 |
|
|
part of the index, allow sorting by a number of different properties, limit results to a date range, and so on. |
| 1707 |
|
|
|
| 1708 |
|
|
The standard configuration (i.e. not using a config file) should work with most swish index files. |
| 1709 |
|
|
Customization of the parameters will be |
| 1710 |
|
|
needed if you are indexing special meta data and want to search and/or display the meta data. The |
| 1711 |
|
|
configuration can be modified by editing this script directly, or by using a configuration file (.swishcgi.conf |
| 1712 |
|
|
by default). |
| 1713 |
|
|
|
| 1714 |
|
|
You are strongly encouraged to get the default configuration working before making changes. Most problems |
| 1715 |
|
|
using this script are the result of configuration modifications. |
| 1716 |
|
|
|
| 1717 |
|
|
The script is modular in design. Both the highlighting code and output generation are handled by modules, which |
| 1718 |
|
|
are included in the F<example/modules> directory. This allows for easy customization of the output without |
| 1719 |
|
|
changing the main CGI script. A module exists to generate standard HTML output. There are also modules and |
| 1720 |
|
|
template examples to use with the popular Perl templating systems HTML::Template and Template-Toolkit. This allows |
| 1721 |
|
|
you to tightly integrate this script with the look of an existing template-driven web site. |
| 1722 |
|
|
HTML::Template and Template-Toolkit are available from the CPAN (http://search.cpan.org). |
| 1723 |
|
|
|
| 1724 |
|
|
This script can also run basically unmodified as a mod_perl handler, providing much better performance than |
| 1725 |
|
|
running as a CGI script. |
| 1726 |
|
|
|
| 1727 |
|
|
Please read the rest of the documentation. There's a C<DEBUGGING> section, and a C<FAQ> section. |
| 1728 |
|
|
|
| 1729 |
|
|
This script should work on Windows, but security may be an issue. |
| 1730 |
|
|
|
| 1731 |
|
|
=head1 REQUIREMENTS |
| 1732 |
|
|
|
| 1733 |
|
|
You should be running a reasonably current version of Perl. 5.00503 or above is recommended (anything older |
| 1734 |
|
|
will not be supported). |
| 1735 |
|
|
|
| 1736 |
|
|
If you wish to use the date range feature you will need to install the Date::Calc module. This is available |
| 1737 |
|
|
from http://search.cpan.org. |
| 1738 |
|
|
|
| 1739 |
|
|
|
| 1740 |
|
|
=head1 INSTALLATION |
| 1741 |
|
|
|
| 1742 |
|
|
Here's an example installation session. Please get a simple installation working before modifying the |
| 1743 |
|
|
configuration file. Most problems reported for using this script have been due to improper configuration. |
| 1744 |
|
|
|
| 1745 |
|
|
The script's default settings are set up for initial testing. By default the settings expect to find |
| 1746 |
|
|
most files and the swish-e binary in the same directory as the script. |
| 1747 |
|
|
|
| 1748 |
|
|
For I<security> reasons, once you have tested the script you will want to change settings to limit access |
| 1749 |
|
|
to some of these files by the web server |
| 1750 |
|
|
(either by moving them out of web space, or using access control such as F<.htaccess>). |
| 1751 |
|
|
An example of using F<.htaccess> on Apache is given below. |
| 1752 |
|
|
|
| 1753 |
|
|
It's expected that you have already unpacked the swish-e distribution |
| 1754 |
|
|
and built the swish-e binary (if using a source distribution). |
| 1755 |
|
|
|
| 1756 |
|
|
Below is a (unix) session where we create a directory, move required files into this directory, adjust |
| 1757 |
|
|
permissions, index some documents, and symlink into the web server. |
| 1758 |
|
|
|
| 1759 |
|
|
=over 4 |
| 1760 |
|
|
|
| 1761 |
|
|
=item 1 Move required files into their own directory. |
| 1762 |
|
|
|
| 1763 |
|
|
This assumes that swish-e was unpacked and built in the ~/swish-e directory. |
| 1764 |
|
|
|
| 1765 |
|
|
~ >mkdir swishdir |
| 1766 |
|
|
~ >cd swishdir |
| 1767 |
|
|
~/swishdir >cp ~/swish-e/example/swish.cgi . |
| 1768 |
|
|
~/swishdir >cp -rp ~/swish-e/example/modules . |
| 1769 |
|
|
~/swishdir >cp ~/swish-e/src/swish-e . |
| 1770 |
|
|
~/swishdir >chmod 755 swish.cgi |
| 1771 |
|
|
~/swishdir >chmod 644 modules/* |
| 1772 |
|
|
|
| 1773 |
|
|
|
| 1774 |
|
|
=item 2 Create an index |
| 1775 |
|
|
|
| 1776 |
|
|
In this step you will create a simple configuration file. In this example the Apache documentation |
| 1777 |
|
|
is indexed. Last we run a simple query to test swish. |
| 1778 |
|
|
|
| 1779 |
|
|
~/swishdir >cat swish.conf |
| 1780 |
|
|
IndexDir /usr/local/apache/htdocs |
| 1781 |
|
|
IndexOnly .html .htm |
| 1782 |
|
|
DefaultContents HTML |
| 1783 |
|
|
StoreDescription HTML <body> 200000 |
| 1784 |
|
|
MetaNames swishdocpath swishtitle |
| 1785 |
|
|
|
| 1786 |
|
|
~/swishdir >./swish-e -c swish.conf |
| 1787 |
|
|
Indexing Data Source: "File-System" |
| 1788 |
|
|
Indexing "/usr/local/apache/htdocs" |
| 1789 |
|
|
Removing very common words... |
| 1790 |
|
|
no words removed. |
| 1791 |
|
|
Writing main index... |
| 1792 |
|
|
Sorting words ... |
| 1793 |
|
|
Sorting 7005 words alphabetically |
| 1794 |
|
|
Writing header ... |
| 1795 |
|
|
Writing index entries ... |
| 1796 |
|
|
Writing word text: Complete |
| 1797 |
|
|
Writing word hash: Complete |
| 1798 |
|
|
Writing word data: Complete |
| 1799 |
|
|
7005 unique words indexed. |
| 1800 |
|
|
5 properties sorted. |
| 1801 |
|
|
124 files indexed. 1485844 total bytes. 171704 total words. |
| 1802 |
|
|
Elapsed time: 00:00:02 CPU time: 00:00:02 |
| 1803 |
|
|
Indexing done! |
| 1804 |
|
|
|
| 1805 |
|
|
Now, verify that the index can be searched: |
| 1806 |
|
|
|
| 1807 |
|
|
~/swishdir >./swish-e -w install -m 1 |
| 1808 |
|
|
# SWISH format: 2.1-dev-25 |
| 1809 |
|
|
# Search words: install |
| 1810 |
|
|
# Number of hits: 14 |
| 1811 |
|
|
# Search time: 0.001 seconds |
| 1812 |
|
|
# Run time: 0.040 seconds |
| 1813 |
|
|
1000 /usr/local/apache/htdocs/manual/dso.html "Apache 1.3 Dynamic Shared Object (DSO) support" 17341 |
| 1814 |
|
|
. |
| 1815 |
|
|
|
| 1816 |
|
|
Let's see what files we have in our directory now: |
| 1817 |
|
|
|
| 1818 |
|
|
~/swishdir >ls -1 -F |
| 1819 |
|
|
index.swish-e |
| 1820 |
|
|
index.swish-e.prop |
| 1821 |
|
|
modules/ |
| 1822 |
|
|
swish-e* |
| 1823 |
|
|
swish.cgi* |
| 1824 |
|
|
swish.conf |
| 1825 |
|
|
|
| 1826 |
|
|
=item 3 Test the CGI script |
| 1827 |
|
|
|
| 1828 |
|
|
This is a simple step, but often overlooked. You should test from the command line instead of jumping |
| 1829 |
|
|
ahead and testing with the web server. See the C<DEBUGGING> section below for more information. |
| 1830 |
|
|
|
| 1831 |
|
|
~/swishdir >./swish.cgi | head |
| 1832 |
|
|
Content-Type: text/html; charset=ISO-8859-1 |
| 1833 |
|
|
|
| 1834 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
| 1835 |
|
|
<html> |
| 1836 |
|
|
<head> |
| 1837 |
|
|
<title> |
| 1838 |
|
|
Search our site |
| 1839 |
|
|
</title> |
| 1840 |
|
|
</head> |
| 1841 |
|
|
<body> |
| 1842 |
|
|
|
| 1843 |
|
|
The above shows that the script can be run directly, and generates a correct HTTP header and HTML. |
| 1844 |
|
|
|
| 1845 |
|
|
If you run the above and see something like this: |
| 1846 |
|
|
|
| 1847 |
|
|
~/swishdir >./swish.cgi |
| 1848 |
|
|
bash: ./swish.cgi: No such file or directory |
| 1849 |
|
|
|
| 1850 |
|
|
then you probably need to edit the script to point to the correct location of your perl program. |
| 1851 |
|
|
Here's one way to find out where perl is located (again, on unix): |
| 1852 |
|
|
|
| 1853 |
|
|
~/swishdir >which perl |
| 1854 |
|
|
/usr/local/bin/perl |
| 1855 |
|
|
|
| 1856 |
|
|
~/swishdir >/usr/local/bin/perl -v |
| 1857 |
|
|
This is perl, v5.6.0 built for i586-linux |
| 1858 |
|
|
... |
| 1859 |
|
|
|
| 1860 |
|
|
Good! We are using a reasonably current version of perl. You should be running |
| 1861 |
|
|
at least perl 5.005 (5.00503 really). You may have problems otherwise. |
| 1862 |
|
|
|
| 1863 |
|
|
Now that we know perl is at F</usr/local/bin/perl> we can adjust the "shebang" line |
| 1864 |
|
|
in the perl script (e.g. the first line of the script): |
| 1865 |
|
|
|
| 1866 |
|
|
~/swishdir >pico swish.cgi |
| 1867 |
|
|
(edit the #! line) |
| 1868 |
|
|
~/swishdir >head -1 swish.cgi |
| 1869 |
|
|
#!/usr/local/bin/perl -w |
| 1870 |
|
|
|
| 1871 |
|
|
=item 4 Test with your web server |
| 1872 |
|
|
|
| 1873 |
|
|
How you do this is completely dependent on your web server, and you may need to talk to your web |
| 1874 |
|
|
server admin to get this working. Often files with the .cgi extension are automatically set up to |
| 1875 |
|
|
run as CGI scripts, but not always. In other words, this step is really up to you to figure out! |
| 1876 |
|
|
|
| 1877 |
|
|
First, I create a symlink in Apache's document root to point to my test directory "swishdir". This will work |
| 1878 |
|
|
because I know my Apache server is configured to follow symbolic links. |
| 1879 |
|
|
|
| 1880 |
|
|
~/swishdir >su -c 'ln -s /home/bill/swishdir /usr/local/apache/htdocs/swishdir' |
| 1881 |
|
|
Password: ********* |
| 1882 |
|
|
|
| 1883 |
|
|
If your account is on an ISP and your web directory is F<~/public_html> then you might just move the entire |
| 1884 |
|
|
directory: |
| 1885 |
|
|
|
| 1886 |
|
|
mv ~/swishdir ~/public_html |
| 1887 |
|
|
|
| 1888 |
|
|
Now, let's make a real HTTP request. I happen to have Apache set up on a local port: |
| 1889 |
|
|
|
| 1890 |
|
|
~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head -3 |
| 1891 |
|
|
#!/usr/local/bin/perl -w |
| 1892 |
|
|
package SwishSearch; |
| 1893 |
|
|
use strict; |
| 1894 |
|
|
|
| 1895 |
|
|
Oh, darn. It looks like Apache is not running the script and instead returning it as a |
| 1896 |
|
|
static page. I need to tell Apache that swish.cgi is a CGI script. |
| 1897 |
|
|
|
| 1898 |
|
|
In my case F<.htaccess> comes to the rescue: |
| 1899 |
|
|
|
| 1900 |
|
|
~/swishdir >cat .htaccess |
| 1901 |
|
|
|
| 1902 |
|
|
# Deny everything by default |
| 1903 |
|
|
Deny From All |
| 1904 |
|
|
|
| 1905 |
|
|
# But allow just CGI script |
| 1906 |
|
|
<files swish.cgi> |
| 1907 |
|
|
Options ExecCGI |
| 1908 |
|
|
Allow From All |
| 1909 |
|
|
SetHandler cgi-script |
| 1910 |
|
|
</files> |
| 1911 |
|
|
|
| 1912 |
|
|
Let's try the request one more time: |
| 1913 |
|
|
|
| 1914 |
|
|
~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head |
| 1915 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
| 1916 |
|
|
<html> |
| 1917 |
|
|
<head> |
| 1918 |
|
|
<title> |
| 1919 |
|
|
Search our site |
| 1920 |
|
|
</title> |
| 1921 |
|
|
</head> |
| 1922 |
|
|
<body> |
| 1923 |
|
|
<h2> |
| 1924 |
|
|
<a href="http://swish-e.org"> |
| 1925 |
|
|
|
| 1926 |
|
|
That looks better! Now use your web browser to test. |
| 1927 |
|
|
|
| 1928 |
|
|
Make sure you look at your web server's error log file while testing the script. |
| 1929 |
|
|
|
| 1930 |
|
|
BTW - "GET" is a program included with Perl's LWP library. If you do not have this you might |
| 1931 |
|
|
try something like: |
| 1932 |
|
|
|
| 1933 |
|
|
wget -O - http://localhost:8000/swishdir/swish.cgi | head |
| 1934 |
|
|
|
| 1935 |
|
|
and if nothing else, you can always telnet to the web server and make a basic request. |
| 1936 |
|
|
|
| 1937 |
|
|
~/swishtest > telnet localhost 8000 |
| 1938 |
|
|
Trying 127.0.0.1... |
| 1939 |
|
|
Connected to localhost. |
| 1940 |
|
|
Escape character is '^]'. |
| 1941 |
|
|
GET /swishtest/swish.cgi http/1.0 |
| 1942 |
|
|
|
| 1943 |
|
|
HTTP/1.1 200 OK |
| 1944 |
|
|
Date: Wed, 13 Feb 2002 20:14:31 GMT |
| 1945 |
|
|
Server: Apache/1.3.20 (Unix) mod_perl/1.25_01 |
| 1946 |
|
|
Connection: close |
| 1947 |
|
|
Content-Type: text/html; charset=ISO-8859-1 |
| 1948 |
|
|
|
| 1949 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
| 1950 |
|
|
<html> |
| 1951 |
|
|
<head> |
| 1952 |
|
|
<title> |
| 1953 |
|
|
Search our site |
| 1954 |
|
|
</title> |
| 1955 |
|
|
</head> |
| 1956 |
|
|
<body> |
| 1957 |
|
|
|
| 1958 |
|
|
This may seem like a lot of work compared to using a browser, but browsers |
| 1959 |
|
|
are a poor tool for basic CGI debugging. |
| 1960 |
|
|
|
| 1961 |
|
|
|
| 1962 |
|
|
=back |
| 1963 |
|
|
|
| 1964 |
|
|
If you have problems check the C<DEBUGGING> section below. |
| 1965 |
|
|
|
| 1966 |
|
|
=head1 CONFIGURATION |
| 1967 |
|
|
|
| 1968 |
|
|
If you want to change the location of the swish-e binary or the index file, use multiple indexes, add additional metanames and properties, |
| 1969 |
|
|
change the default highlighting behavior, etc., you will need to adjust the script's configuration settings. |
| 1970 |
|
|
|
| 1971 |
|
|
Please get a test setup working with the default parameters before making changes to any configuration settings. |
| 1972 |
|
|
Better to debug one thing at a time... |
| 1973 |
|
|
|
| 1974 |
|
|
In general, you will need to adjust the script's settings to match the index file you are searching. For example, |
| 1975 |
|
|
if you are indexing a hypermail list archive you may want to make the script |
| 1976 |
|
|
use metanames/properties of Subject, Author, and, Email address. Or you may wish to provide a way to limit |
| 1977 |
|
|
searches to parts of your index file (e.g. parts of your directory tree). |
| 1978 |
|
|
|
| 1979 |
|
|
To make things somewhat "simple", the configuration parameters are included near the top of the swish.cgi program. |
| 1980 |
|
|
That is the only place that the individual parameters are defined and explained, so you will need to open up |
| 1981 |
|
|
the swish.cgi script in an editor to view the options. Further questions about individual settings should |
| 1982 |
|
|
be referred to the swish-e discussion list. |
| 1983 |
|
|
|
| 1984 |
|
|
The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should |
| 1985 |
|
|
get you going. The perl hash structure may seem a bit confusing, but it makes it easy to create nested and complex |
| 1986 |
|
|
parameters. Syntax is important, so cut-n-paste should be your best defense if you are not a perl programmer. |
| 1987 |
|
|
|
| 1988 |
|
|
By the way, Perl has a number of quote operators. For example, to quote a string you might write: |
| 1989 |
|
|
|
| 1990 |
|
|
title => 'Search My Site', |
| 1991 |
|
|
|
| 1992 |
|
|
Some options take more than one parameter, where each parameter must be quoted. For example: |
| 1993 |
|
|
|
| 1994 |
|
|
metanames => [ 'swishdefault', 'swishtitle', 'swishdocpath' ], |
| 1995 |
|
|
|
| 1996 |
|
|
Which assigns an array ( [...] ) of three strings to the "metanames" variable. |
| 1997 |
|
|
Lists of quoted strings are so common in perl that there's a special operator called "qw" (quote word): |
| 1998 |
|
|
|
| 1999 |
|
|
metanames => [ qw/ swishdefault swishtitle swishdocpath / ], |
| 2000 |
|
|
|
| 2001 |
|
|
or to use the parenthesis as the quote character (you can pick any): |
| 2002 |
|
|
|
| 2003 |
|
|
metanames => [ qw( swishdefault swishtitle swishdocpath ) ], |
| 2004 |
|
|
|
| 2005 |
|
|
|
| 2006 |
|
|
You have two options for changing the configuration settings from their default values: |
| 2007 |
|
|
you may edit the script directly, or you may use a configuration file. In either case, the configuration |
| 2008 |
|
|
settings are a basic perl hash reference. |
| 2009 |
|
|
|
| 2010 |
|
|
Using a configuration file is described below; the configuration file contains the same hash structure. |
| 2011 |
|
|
|
| 2012 |
|
|
There are many configuration settings, and some of them are commented out either by using |
| 2013 |
|
|
a "#" symbol, or by simply renaming the configuration directive (e.g. by adding an "x" to the parameter |
| 2014 |
|
|
name). |
| 2015 |
|
|
|
| 2016 |
|
|
A very basic configuration setup might look like: |
| 2017 |
|
|
|
| 2018 |
|
|
return { |
| 2019 |
|
|
title => 'Search the Swish-e list', # Title of your choice. |
| 2020 |
|
|
swish_binary => './swish-e', # Location of swish-e binary |
| 2021 |
|
|
swish_index => 'index.swish-e', # Location of your index file |
| 2022 |
|
|
}; |
| 2023 |
|
|
|
| 2024 |
|
|
Or if searching more than one index: |
| 2025 |
|
|
|
| 2026 |
|
|
return { |
| 2027 |
|
|
title => 'Search the Swish-e list', |
| 2028 |
|
|
swish_binary => './swish-e', |
| 2029 |
|
|
swish_index => ['index.swish-e', 'index2'], |
| 2030 |
|
|
}; |
| 2031 |
|
|
|
| 2032 |
|
|
Both of these examples return a reference to a perl hash ( C<return {...}> ). In the second example, |
| 2033 |
|
|
the multiple index files are set as an array reference. |
| 2034 |
|
|
|
| 2035 |
|
|
Note that in the example above the swish-e binary file is relative to the current directory. |
| 2036 |
|
|
If running under mod_perl you will typically need to use absolute paths. |
| 2037 |
|
|
|
| 2038 |
|
|
B<Using A Configuration File> |
| 2039 |
|
|
|
| 2040 |
|
|
As mentioned above, you can either edit the F<swish.cgi> script directly and modify the configuration settings, or |
| 2041 |
|
|
use an external configuration file. The settings in the configuration file are merged with (override) |
| 2042 |
|
|
the settings defined in the script. |
| 2043 |
|
|
|
| 2044 |
|
|
The advantage of using a configuration script is that you are not editing the swish.cgi script directly, and |
| 2045 |
|
|
downloading a new version won't mean re-editing the cgi script. Also, if running under mod_perl you can use the same |
| 2046 |
|
|
script loaded into Apache to manage many different search pages. |
| 2047 |
|
|
|
| 2048 |
|
|
By default, the script will attempt to read from the file F<.swishcgi.conf>. |
| 2049 |
|
|
For example, you might only wish to change the title used |
| 2050 |
|
|
in the script. Simply create a file called F<.swishcgi.conf> in the same directory as the CGI script: |
| 2051 |
|
|
|
| 2052 |
|
|
> cat .swishcgi.conf |
| 2053 |
|
|
# Example swish.cgi configuration script. |
| 2054 |
|
|
return { |
| 2055 |
|
|
title => 'Search Our Mailing List Archive', |
| 2056 |
|
|
}; |
| 2057 |
|
|
|
| 2058 |
|
|
The settings you use will depend on the index you create with swish. Here's a basic configuration: |
| 2059 |
|
|
|
| 2060 |
|
|
return { |
| 2061 |
|
|
title => 'Search the Apache documentation', |
| 2062 |
|
|
swish_binary => './swish-e', |
| 2063 |
|
|
swish_index => 'index.swish-e', |
| 2064 |
|
|
metanames => [qw/swishdefault swishdocpath swishtitle/], |
| 2065 |
|
|
display_props => [qw/swishtitle swishlastmodified swishdocsize swishdocpath/], |
| 2066 |
|
|
title_property => 'swishdocpath', |
| 2067 |
|
|
prepend_path => 'http://myhost/apachedocs', |
| 2068 |
|
|
|
| 2069 |
|
|
name_labels => { |
| 2070 |
|
|
swishdefault => 'Search All', |
| 2071 |
|
|
swishtitle => 'Title', |
| 2072 |
|
|
swishrank => 'Rank', |
| 2073 |
|
|
swishlastmodified => 'Last Modified Date', |
| 2074 |
|
|
swishdocpath => 'Document Path', |
| 2075 |
|
|
swishdocsize => 'Document Size', |
| 2076 |
|
|
}, |
| 2077 |
|
|
|
| 2078 |
|
|
}; |
| 2079 |
|
|
|
| 2080 |
|
|
The above configuration defines metanames to use on the form. |
| 2081 |
|
|
Searches can be limited to these metanames. |
| 2082 |
|
|
|
| 2083 |
|
|
"display_props" tells the script to display the property "swishlastmodified" (the last modified |
| 2084 |
|
|
date of the file), the document size, and path with the search results. |
| 2085 |
|
|
|
| 2086 |
|
|
The parameter "name_labels" is a hash (reference) |
| 2087 |
|
|
that is used to give friendly names to the metanames. |
| 2088 |
|
|
|
| 2089 |
|
|
Here's another example. Say you want to search either (or both) the Apache 1.3 documentation or the |
| 2090 |
|
|
Apache 2.0 documentation: |
| 2091 |
|
|
|
| 2092 |
|
|
return { |
| 2093 |
|
|
title => 'Search the Apache Documentation', |
| 2094 |
|
|
date_ranges => 0, |
| 2095 |
|
|
swish_index => [ qw/ index.apache index.apache2 / ], |
| 2096 |
|
|
select_indexes => { |
| 2097 |
|
|
method => 'checkbox_group', |
| 2098 |
|
|
labels => [ '1.3.23 docs', '2.0 docs' ], # Must match up one-to-one to swish_index |
| 2099 |
|
|
description => 'Select: ', |
| 2100 |
|
|
}, |
| 2101 |
|
|
|
| 2102 |
|
|
}; |
| 2103 |
|
|
|
| 2104 |
|
|
Now you can select either or both sets of documentation while searching. |
| 2105 |
|
|
|
| 2106 |
|
|
|
| 2107 |
|
|
Please refer to the default configuration settings near the top of the script for details on |
| 2108 |
|
|
the available settings. |
| 2109 |
|
|
|
| 2110 |
|
|
=head1 DEBUGGING |
| 2111 |
|
|
|
| 2112 |
|
|
Most problems with using this script have been a result of improper configuration. Please |
| 2113 |
|
|
get the script working with default settings before adjusting the configuration settings. |
| 2114 |
|
|
|
| 2115 |
|
|
The key to debugging CGI scripts is to run them from the command line, not with a browser. |
| 2116 |
|
|
|
| 2117 |
|
|
First, make sure the program compiles correctly: |
| 2118 |
|
|
|
| 2119 |
|
|
> perl -c swish.cgi |
| 2120 |
|
|
swish.cgi syntax OK |
| 2121 |
|
|
|
| 2122 |
|
|
Next, simply try running the program: |
| 2123 |
|
|
|
| 2124 |
|
|
> ./swish.cgi | head |
| 2125 |
|
|
Content-Type: text/html; charset=ISO-8859-1 |
| 2126 |
|
|
|
| 2127 |
|
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> |
| 2128 |
|
|
<html> |
| 2129 |
|
|
<head> |
| 2130 |
|
|
<title> |
| 2131 |
|
|
Search our site |
| 2132 |
|
|
</title> |
| 2133 |
|
|
</head> |
| 2134 |
|
|
<body> |
| 2135 |
|
|
|
| 2136 |
|
|
Now, you know that the program compiles and will run from the command line. |
| 2137 |
|
|
Next, try accessing the script from a web browser. |
| 2138 |
|
|
|
| 2139 |
|
|
If you see the contents of the CGI script instead of its output then your web server is |
| 2140 |
|
|
not configured to run the script. You will need to look at settings like ScriptAlias, SetHandler, |
| 2141 |
|
|
and Options. |
| 2142 |
|
|
|
| 2143 |
|
|
If an error is reported (such as Internal Server Error or Forbidden) |
| 2144 |
|
|
you need to locate your web server's error_log file |
| 2145 |
|
|
and carefully read what the problem is. Contact your web administrator for help. |
| 2146 |
|
|
|
| 2147 |
|
|
If you don't have access to the web server's error_log file, you can modify the script to report |
| 2148 |
|
|
errors to the browser screen. Open the script and search for "CGI::Carp". (Author's suggestion is |
| 2149 |
|
|
to debug from the command line -- adding the browser and web server into the equation only complicates |
| 2150 |
|
|
debugging.) |
| 2151 |
|
|
|
| 2152 |
|
|
The script does offer some basic debugging options that allow debugging from the command line. |
| 2153 |
|
|
The debugging options are enabled by setting |
| 2154 |
|
|
an environment variable "SWISH_DEBUG". How that is set depends on your operating system and the |
| 2155 |
|
|
shell you are using. These examples are using the "bash" shell syntax. |
| 2156 |
|
|
|
| 2157 |
|
|
Note: You can also use the "debug_options" configuration setting, but the recommended method |
| 2158 |
|
|
is to set the environment variable. |
| 2159 |
|
|
|
| 2160 |
|
|
You can list the available debugging options like this: |
| 2161 |
|
|
|
| 2162 |
|
|
>SWISH_DEBUG=help ./swish.cgi >outfile |
| 2163 |
|
|
Unknown debug option 'help'. Must be one of: |
| 2164 |
|
|
basic: Basic debugging |
| 2165 |
|
|
command: Show command used to run swish |
| 2166 |
|
|
headers: Show headers returned from swish |
| 2167 |
|
|
output: Show output from swish |
| 2168 |
|
|
summary: Show summary of results |
| 2169 |
|
|
dump: Show all data available to templates |
| 2170 |
|
|
|
| 2171 |
|
|
As you work your way down the list you will get more detailed output. You can combine |
| 2172 |
|
|
options like: |
| 2173 |
|
|
|
| 2174 |
|
|
>SWISH_DEBUG=command,headers,summary ./swish.cgi >outfile |
| 2175 |
|
|
|
| 2176 |
|
|
You will be asked for an input query and the max number of results to return. You can use the defaults |
| 2177 |
|
|
in most cases. It's a good idea to redirect output to a file. Any error messages are sent to stderr, so |
| 2178 |
|
|
those will still be displayed (unless you redirect stderr, too). |
| 2179 |
|
|
|
| 2180 |
|
|
Here are some examples: |
| 2181 |
|
|
|
| 2182 |
|
|
~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile |
| 2183 |
|
|
Debug level set to: 1 |
| 2184 |
|
|
Enter a query [all]: |
| 2185 |
|
|
Using 'not asdfghjklzxcv' to match all records |
| 2186 |
|
|
Enter max results to display [1]: |
| 2187 |
|
|
|
| 2188 |
|
|
------ Can't use DateRanges feature ------------ |
| 2189 |
|
|
|
| 2190 |
|
|
Script will run, but you can't use the date range feature |
| 2191 |
|
|
Can't locate Date/Calc.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at modules/DateRanges.pm line 107, <STDIN> line 2. |
| 2192 |
|
|
BEGIN failed--compilation aborted at modules/DateRanges.pm line 107, <STDIN> line 2. |
| 2193 |
|
|
Compilation failed in require at ./swish.cgi line 971, <STDIN> line 2. |
| 2194 |
|
|
|
| 2195 |
|
|
-------------- |
| 2196 |
|
|
Can't exec "./swish-e": No such file or directory at ./swish.cgi line 1245, <STDIN> line 2. |
| 2197 |
|
|
Child process Failed to exec './swish-e' Error: No such file or directory at ./swish.cgi line 1246, <STDIN> line 2. |
| 2198 |
|
|
Failed to find any results |
| 2199 |
|
|
|
| 2200 |
|
|
The above told me about two problems. First, it's telling me that the Date::Calc module is not installed. |
| 2201 |
|
|
The Date::Calc module is needed to use the date limiting feature of the script. |
| 2202 |
|
|
|
| 2203 |
|
|
The second problem is a bit more serious. It's saying that the script can't find the swish-e binary file. |
| 2204 |
|
|
I simply forgot to copy it. |
| 2205 |
|
|
|
| 2206 |
|
|
~/swishtest >cp ~/swish-e/src/swish-e . |
| 2207 |
|
|
~/swishtest >cat .swishcgi.conf |
| 2208 |
|
|
return { |
| 2209 |
|
|
title => 'Search the Apache Documentation', |
| 2210 |
|
|
date_ranges => 0, |
| 2211 |
|
|
}; |
| 2212 |
|
|
|
| 2213 |
|
|
Now, let's try again: |
| 2214 |
|
|
|
| 2215 |
|
|
~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile |
| 2216 |
|
|
Debug level set to: 1 |
| 2217 |
|
|
|
| 2218 |
|
|
---------- Read config parameters from '.swishcgi.conf' ------ |
| 2219 |
|
|
$VAR1 = { |
| 2220 |
|
|
'date_ranges' => 0, |
| 2221 |
|
|
'title' => 'Search the Apache Documentation' |
| 2222 |
|
|
}; |
| 2223 |
|
|
------------------------- |
| 2224 |
|
|
Enter a query [all]: |
| 2225 |
|
|
Using 'not asdfghjklzxcv' to match all records |
| 2226 |
|
|
Enter max results to display [1]: |
| 2227 |
|
|
Found 1 results |
| 2228 |
|
|
|
| 2229 |
|
|
Can't locate TemplateDefault.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at ./swish.cgi line 608. |
| 2230 |
|
|
|
| 2231 |
|
|
Bother. I fixed the first two problems, but now there's this new error. Oh, I somehow forgot to |
| 2232 |
|
|
copy the modules directory. The obvious way to fix that is to copy the directory. But, there may |
| 2233 |
|
|
be times where you want to put the module directory in another location. So, let's modify the |
| 2234 |
|
|
F<.swishcgi.conf> file and add a "use lib" setting: |
| 2235 |
|
|
|
| 2236 |
|
|
~/swishtest >cat .swishcgi.conf |
| 2237 |
|
|
use lib '/home/bill/swish-e/example/modules'; |
| 2238 |
|
|
|
| 2239 |
|
|
return { |
| 2240 |
|
|
title => 'Search the Apache Documentation', |
| 2241 |
|
|
date_ranges => 0, |
| 2242 |
|
|
}; |
| 2243 |
|
|
|
| 2244 |
|
|
~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile |
| 2245 |
|
|
Debug level set to: 1 |
| 2246 |
|
|
|
| 2247 |
|
|
---------- Read config parameters from '.swishcgi.conf' ------ |
| 2248 |
|
|
$VAR1 = { |
| 2249 |
|
|
'date_ranges' => 0, |
| 2250 |
|
|
'title' => 'Search the Apache Documentation' |
| 2251 |
|
|
}; |
| 2252 |
|
|
------------------------- |
| 2253 |
|
|
Enter a query [all]: |
| 2254 |
|
|
Using 'not asdfghjklzxcv' to match all records |
| 2255 |
|
|
Enter max results to display [1]: |
| 2256 |
|
|
Found 1 results |
| 2257 |
|
|
|
| 2258 |
|
|
Now we're talking. |
| 2259 |
|
|
|
| 2260 |
|
|
Here's a common problem. Everything checks out, but when you run the script you see the message: |
| 2261 |
|
|
|
| 2262 |
|
|
Swish returned unknown output |
| 2263 |
|
|
|
| 2264 |
|
|
Ok, let's find out what output it is returning: |
| 2265 |
|
|
|
| 2266 |
|
|
~/swishtest >SWISH_DEBUG=headers,output ./swish.cgi >outfile |
| 2267 |
|
|
Debug level set to: 13 |
| 2268 |
|
|
|
| 2269 |
|
|
---------- Read config parameters from '.swishcgi.conf' ------ |
| 2270 |
|
|
$VAR1 = { |
| 2271 |
|
|
'swish_binary' => '/usr/local/bin/swish-e', |
| 2272 |
|
|
'date_ranges' => 0, |
| 2273 |
|
|
'title' => 'Search the Apache Documentation' |
| 2274 |
|
|
}; |
| 2275 |
|
|
------------------------- |
| 2276 |
|
|
Enter a query [all]: |
| 2277 |
|
|
Using 'not asdfghjklzxcv' to match all records |
| 2278 |
|
|
Enter max results to display [1]: |
| 2279 |
|
|
usage: swish [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)] |
| 2280 |
|
|
... |
| 2281 |
|
|
version: 2.0 |
| 2282 |
|
|
docs: http://sunsite.berkeley.edu/SWISH-E/ |
| 2283 |
|
|
|
| 2284 |
|
|
*** 9872 Failed to run swish: 'Swish returned unknown output' *** |
| 2285 |
|
|
Failed to find any results |
| 2286 |
|
|
|
| 2287 |
|
|
Oh, looks like /usr/local/bin/swish-e is version 2.0 of swish. We need 2.1-dev and above! |
| 2288 |
|
|
|
| 2289 |
|
|
=head1 Frequently Asked Questions |
| 2290 |
|
|
|
| 2291 |
|
|
Here are some common questions and answers. |
| 2292 |
|
|
|
| 2293 |
|
|
=head2 How do I change the way the output looks? |
| 2294 |
|
|
|
| 2295 |
|
|
The script uses a module to generate output. By default it uses the TemplateDefault.pm module. |
| 2296 |
|
|
The module used can be selected in the configuration file. |
| 2297 |
|
|
|
| 2298 |
|
|
If you want to make simple changes you can edit the TemplateDefault.pm module directly. If you want to |
| 2299 |
|
|
copy a module, you must also change the "package" statement at the top of the module. For example: |
| 2300 |
|
|
|
| 2301 |
|
|
cp TemplateDefault.pm MyTemplateDefault.pm |
| 2302 |
|
|
|
| 2303 |
|
|
Then at the top of the module adjust the "package" line to: |
| 2304 |
|
|
|
| 2305 |
|
|
package MyTemplateDefault; |
| 2306 |
|
|
|
| 2307 |
|
|
To use this module you need to adjust the configuration settings (either at the top of F<swish.cgi> or in |
| 2308 |
|
|
a configuration file): |
| 2309 |
|
|
|
| 2310 |
|
|
|
| 2311 |
|
|
template => { |
| 2312 |
|
|
package => 'MyTemplateDefault', |
| 2313 |
|
|
}, |
| 2314 |
|
|
|
| 2315 |
|
|
|
| 2316 |
|
|
=head2 How do I use a templating system with swish.cgi? |
| 2317 |
|
|
|
| 2318 |
|
|
In addition to the TemplateDefault.pm module, the swish-e distribution includes two other Perl modules for |
| 2319 |
|
|
generating output using the templating systems HTML::Template and Template-Toolkit. |
| 2320 |
|
|
|
| 2321 |
|
|
Templating systems use template files to generate the HTML, and make maintaining the look of a large (or small) site |
| 2322 |
|
|
much easier. HTML::Template and Template-Toolkit are separate packages and can be downloaded from the CPAN. |
| 2323 |
|
|
See http://search.cpan.org. |
| 2324 |
|
|
|
| 2325 |
|
|
Two basic templates are provided as examples for generating output using these templating systems. |
| 2326 |
|
|
The example templates are located in the F<example> directory. |
| 2327 |
|
|
The module F<TemplateHTMLTemplate.pm> uses the file F<swish.tmpl> to generate its output, while the |
| 2328 |
|
|
module F<TemplateToolkit.pm> uses the F<search.tt> file. |
| 2329 |
|
|
|
| 2330 |
|
|
To use either of these modules you will need to adjust the "template" configuration setting. Examples for |
| 2331 |
|
|
both templating systems are provided in the configuration settings near the top of the F<swish.cgi> program. |
| 2332 |
|
|
|
| 2333 |
|
|
Use of these modules is an advanced usage of F<swish.cgi>, and they are provided as examples only. |
| 2334 |
|
|
|
| 2335 |
|
|
All of the output generation modules are passed a hash with the results from the search, plus other data used to create the |
| 2336 |
|
|
output page. You can see this hash by using the debugging option "dump" or by using the TemplateDumper.pm |
| 2337 |
|
|
module: |
| 2338 |
|
|
|
| 2339 |
|
|
~/swishtest >cat .swishcgi.conf |
| 2340 |
|
|
return { |
| 2341 |
|
|
title => 'Search the Apache Documentation', |
| 2342 |
|
|
template => { |
| 2343 |
|
|
package => 'TemplateDumper', |
| 2344 |
|
|
}, |
| 2345 |
|
|
}; |
| 2346 |
|
|
|
| 2347 |
|
|
And run a query. For example: |
| 2348 |
|
|
|
| 2349 |
|
|
http://localhost:8000/swishtest/swish.cgi?query=install |
| 2350 |
|
|
|
| 2351 |
|
|
=head2 Why are there three different highlighting modules? |
| 2352 |
|
|
|
| 2353 |
|
|
There are three highlighting modules included with the swish-e distribution. |
| 2354 |
|
|
Each is a trade-off of speed vs. accuracy: |
| 2355 |
|
|
|
| 2356 |
|
|
DefaultHighlight.pm - reasonably fast, but does not highlight phrases |
| 2357 |
|
|
PhraseHighlight.pm - reasonably slow, but is reasonably accurate |
| 2358 |
|
|
SimpleHighlight.pm - fast, some phrases, but least accurate |
| 2359 |
|
|
|
| 2360 |
|
|
Eh, the default is actually "PhraseHighlight.pm". Oh well. |
| 2361 |
|
|
|
| 2362 |
|
|
Optimizations to these modules are welcome! |
| 2363 |
|
|
|
| 2364 |
|
|
=head2 My ISP doesn't provide access to the web server logs |
| 2365 |
|
|
|
| 2366 |
|
|
There are a number of options. One way is to use the CGI::Carp module. Search in the |
| 2367 |
|
|
swish.cgi script for: |
| 2368 |
|
|
|
| 2369 |
|
|
use Carp; |
| 2370 |
|
|
# Or use this instead -- PLEASE see perldoc CGI::Carp for details |
| 2371 |
|
|
# use CGI::Carp qw(fatalsToBrowser warningsToBrowser); |
| 2372 |
|
|
|
| 2373 |
|
|
And change it to look like: |
| 2374 |
|
|
|
| 2375 |
|
|
#use Carp; |
| 2376 |
|
|
# Or use this instead -- PLEASE see perldoc CGI::Carp for details |
| 2377 |
|
|
use CGI::Carp qw(fatalsToBrowser warningsToBrowser); |
| 2378 |
|
|
|
| 2379 |
|
|
This should be only for debugging purposes, as if used in production you may end up sending |
| 2380 |
|
|
quite ugly and confusing messages to your browsers. |
| 2381 |
|
|
|
| 2382 |
|
|
=head2 Why does the output show (NULL)? |
| 2383 |
|
|
|
| 2384 |
|
|
The most common reason is that you did not use StoreDescription in your config file while indexing. |
| 2385 |
|
|
|
| 2386 |
|
|
StoreDescription HTML <body> 200000 |
| 2387 |
|
|
|
| 2388 |
|
|
That tells swish to store the first 200,000 characters of text extracted from the body of each document parsed |
| 2389 |
|
|
by the HTML parser. The text is stored as property "swishdescription". Running: |
| 2390 |
|
|
|
| 2391 |
|
|
~/swishtest > ./swish-e -T index_metanames |
| 2392 |
|
|
|
| 2393 |
|
|
will display the properties defined in your index file. |
| 2394 |
|
|
|
| 2395 |
|
|
This can happen with other properties, too. |
| 2396 |
|
|
For example, this will happen when you are asking for a property to display that is not defined in swish. |
| 2397 |
|
|
|
| 2398 |
|
|
~/swishtest > ./swish-e -w install -m 1 -p foo |
| 2399 |
|
|
# SWISH format: 2.1-dev-25 |
| 2400 |
|
|
# Search words: install |
| 2401 |
|
|
err: Unknown Display property name "foo" |
| 2402 |
|
|
. |
| 2403 |
|
|
|
| 2404 |
|
|
~/swishtest > ./swish-e -w install -m 1 -x 'Property foo=<foo>\n' |
| 2405 |
|
|
# SWISH format: 2.1-dev-25 |
| 2406 |
|
|
# Search words: install |
| 2407 |
|
|
# Number of hits: 14 |
| 2408 |
|
|
# Search time: 0.000 seconds |
| 2409 |
|
|
# Run time: 0.038 seconds |
| 2410 |
|
|
Property foo=(NULL) |
| 2411 |
|
|
. |
| 2412 |
|
|
|
| 2413 |
|
|
To check that a property exists in your index you can run: |
| 2414 |
|
|
|
| 2415 |
|
|
~/swishtest > ./swish-e -w not dkdk -T index_metanames | grep foo |
| 2416 |
|
|
foo : id=10 type=70 META_PROP:STRING(case:ignore) *presorted* |
| 2417 |
|
|
|
| 2418 |
|
|
Ok, in this case we see that "foo" is really defined as a property. Now let's make sure F<swish.cgi> |
| 2419 |
|
|
is asking for "foo" (sorry for the long lines): |
| 2420 |
|
|
|
| 2421 |
|
|
~/swishtest > SWISH_DEBUG=command ./swish.cgi > /dev/null |
| 2422 |
|
|
Debug level set to: 3 |
| 2423 |
|
|
Enter a query [all]: |
| 2424 |
|
|
Using 'not asdfghjklzxcv' to match all records |
| 2425 |
|
|
Enter max results to display [1]: |
| 2426 |
|
|
---- Running swish with the following command and parameters ---- |
| 2427 |
|
|
./swish-e \ |
| 2428 |
|
|
-w \ |
| 2429 |
|
|
'swishdefault=(not asdfghjklzxcv)' \ |
| 2430 |
|
|
-b \ |
| 2431 |
|
|
1 \ |
| 2432 |
|
|
-m \ |
| 2433 |
|
|
1 \ |
| 2434 |
|
|
-f \ |
| 2435 |
|
|
index.swish-e \ |
| 2436 |
|
|
-s \ |
| 2437 |
|
|
swishrank \ |
| 2438 |
|
|
desc \ |
| 2439 |
|
|
swishlastmodified \ |
| 2440 |
|
|
desc \ |
| 2441 |
|
|
-x \ |
| 2442 |
|
|
'<swishreccount>\t<swishtitle>\t<swishdescription>\t<swishlastmodified>\t<swishdocsize>\t<swishdocpath>\t<fos>\t<swishrank>\t<swishdocpath>\n' \ |
| 2443 |
|
|
-H \ |
| 2444 |
|
|
9 |
| 2445 |
|
|
|
| 2446 |
|
|
If you look carefully you will see that the -x parameter has "fos" instead of "foo", so there's our problem. |
| 2447 |
|
|
|
| 2448 |
|
|
|
| 2449 |
|
|
=head1 MOD_PERL |
| 2450 |
|
|
|
| 2451 |
|
|
This script can be run under mod_perl (see http://perl.apache.org). |
| 2452 |
|
|
This will improve the response time of the script compared to running under CGI. |
| 2453 |
|
|
|
| 2454 |
|
|
Configuration is simple. In your httpd.conf or your startup.pl file you need to |
| 2455 |
|
|
load the script. For example, in httpd.conf you can use a perl section: |
| 2456 |
|
|
|
| 2457 |
|
|
<perl> |
| 2458 |
|
|
use lib '/usr/local/apache/cgi-bin'; |
| 2459 |
|
|
use lib '/home/yourname/swish-e/example/modules'; |
| 2460 |
|
|
require "swish.cgi"; |
| 2461 |
|
|
</perl> |
| 2462 |
|
|
|
| 2463 |
|
|
Again, note that the paths used will depend on where you installed the script and the modules. |
| 2464 |
|
|
When running under mod_perl the swish.cgi script becomes a perl module, and therefore the script |
| 2465 |
|
|
does not need to be installed in the cgi-bin directory. (But, you can actually use the same script as |
| 2466 |
|
|
both a CGI script and a mod_perl module at the same time, read from the same location.) |
| 2467 |
|
|
|
| 2468 |
|
|
The above loads the script into mod_perl. Then to configure the script to run add this to your httpd.conf |
| 2469 |
|
|
configuration file: |
| 2470 |
|
|
|
| 2471 |
|
|
<location /search> |
| 2472 |
|
|
allow from all |
| 2473 |
|
|
SetHandler perl-script |
| 2474 |
|
|
PerlHandler SwishSearch |
| 2475 |
|
|
</location> |
| 2476 |
|
|
|
| 2477 |
|
|
Unlike CGI, mod_perl does not change the current directory to the location of the perl module, so |
| 2478 |
|
|
your settings for the swish binary and the path to your index files must be absolute |
| 2479 |
|
|
paths (or relative to the server root). |
| 2480 |
|
|
|
| 2481 |
|
|
Take a look at the C<handler()> routine in this script for ideas on how to use PerlSetVar commands |
| 2482 |
|
|
in httpd.conf to control the script. |
| 2483 |
|
|
|
| 2484 |
|
|
Please post to the swish-e discussion list if you have any questions about running this |
| 2485 |
|
|
script under mod_perl. |
| 2486 |
|
|
|
| 2487 |
|
|
|
| 2488 |
|
|
=head1 Spidering |
| 2489 |
|
|
|
| 2490 |
|
|
There are two ways to spider with swish-e. One uses the "http" input method that uses code that's |
| 2491 |
|
|
part of swish. The other way is to use the new "prog" method along with a perl helper program called |
| 2492 |
|
|
C<spider.pl>. |
| 2493 |
|
|
|
| 2494 |
|
|
Here's an example of a configuration file for spidering with the "http" input method. |
| 2495 |
|
|
You can see that the configuration is not much different than the file system input method. |
| 2496 |
|
|
(But, don't use the http input method -- use the -S prog method shown below.) |
| 2497 |
|
|
|
| 2498 |
|
|
# Define what to index |
| 2499 |
|
|
IndexDir http://www.myserver.name/index.html |
| 2500 |
|
|
IndexOnly .html .htm |
| 2501 |
|
|
|
| 2502 |
|
|
IndexContents HTML .html .htm |
| 2503 |
|
|
DefaultContents HTML |
| 2504 |
|
|
StoreDescription HTML <body> 200000 |
| 2505 |
|
|
MetaNames swishdocpath swishtitle |
| 2506 |
|
|
|
| 2507 |
|
|
# Define http method specific settings -- see swish-e documentation |
| 2508 |
|
|
SpiderDirectory ../swish-e/src/ |
| 2509 |
|
|
Delay 0 |
| 2510 |
|
|
|
| 2511 |
|
|
You index with the command: |
| 2512 |
|
|
|
| 2513 |
|
|
swish-e -S http -c spider.conf |
| 2514 |
|
|
|
| 2515 |
|
|
Note that this does take longer. For example, spidering the Apache documentation on |
| 2516 |
|
|
a local web server with this method took over a minute, where indexing with the |
| 2517 |
|
|
file system took less than two seconds. Using the "prog" method can speed this up. |
| 2518 |
|
|
|
| 2519 |
|
|
Here's an example configuration file for using the "prog" input method: |
| 2520 |
|
|
|
| 2521 |
|
|
# Define the location of the spider helper program |
| 2522 |
|
|
IndexDir ../swish-e/prog-bin/spider.pl |
| 2523 |
|
|
|
| 2524 |
|
|
# Tell the spider what to index. |
| 2525 |
|
|
SwishProgParameters default http://www.myserver.name/index.html |
| 2526 |
|
|
|
| 2527 |
|
|
IndexContents HTML .html .htm |
| 2528 |
|
|
DefaultContents HTML |
| 2529 |
|
|
StoreDescription HTML <body> 200000 |
| 2530 |
|
|
MetaNames swishdocpath swishtitle |
| 2531 |
|
|
|
| 2532 |
|
|
Then to index you use the command: |
| 2533 |
|
|
|
| 2534 |
|
|
swish-e -c prog.conf -S prog -v 0 |
| 2535 |
|
|
|
| 2536 |
|
|
Spidering with this method took nine seconds. |
| 2537 |
|
|
|
| 2538 |
|
|
|
| 2539 |
|
|
=head1 Stemmed Indexes |
| 2540 |
|
|
|
| 2541 |
|
|
Many people enable a feature of swish called word stemming to provide "fuzzy" search |
| 2542 |
|
|
options to their users. |
| 2543 |
|
|
The stemming code does not actually find the "stem" of a word; rather, it removes and/or replaces |
| 2544 |
|
|
common endings on words. |
| 2545 |
|
|
Stemming is far from perfect, and many words do not stem as you might expect. But, it can |
| 2546 |
|
|
be a helpful tool for searching your site. You may wish to create both a stemmed and non-stemmed index, and |
| 2547 |
|
|
provide a checkbox for selecting the index file. |
| 2548 |
|
|
|
| 2549 |
|
|
To enable a stemmed index you simply add to your configuration file: |
| 2550 |
|
|
|
| 2551 |
|
|
UseStemming yes |
| 2552 |
|
|
|
| 2553 |
|
|
If you want to use a stemmed index with this program and continue to highlight search terms you will need |
| 2554 |
|
|
to install a perl module that will stem words. This section explains how to do this. |
| 2555 |
|
|
|
| 2556 |
|
|
The perl module is included with the swish-e distribution. It can be found in the examples directory (where |
| 2557 |
|
|
you found this file) and is called something like: |
| 2558 |
|
|
|
| 2559 |
|
|
SWISH-Stemmer-0.05.tar.gz |
| 2560 |
|
|
|
| 2561 |
|
|
The module should also be available on CPAN (http://search.cpan.org/). |
| 2562 |
|
|
|
| 2563 |
|
|
Here's an example session for installing the module. (There will be quite a bit of output |
| 2564 |
|
|
when running make.) |
| 2565 |
|
|
|
| 2566 |
|
|
|
| 2567 |
|
|
% gzip -dc SWISH-Stemmer-0.05.tar.gz |tar xof - |
| 2568 |
|
|
% cd SWISH-Stemmer-0.05 |
| 2569 |
|
|
% perl Makefile.PL |
| 2570 |
|
|
or |
| 2571 |
|
|
% perl Makefile.PL PREFIX=$HOME/perl_lib |
| 2572 |
|
|
% make |
| 2573 |
|
|
% make test |
| 2574 |
|
|
|
| 2575 |
|
|
(perhaps su root at this point if you did not use a PREFIX) |
| 2576 |
|
|
% make install |
| 2577 |
|
|
% cd .. |
| 2578 |
|
|
|
| 2579 |
|
|
Use the B<PREFIX> if you do not have root access or you want to install the modules |
| 2580 |
|
|
in a local library. If you do use a PREFIX setting, add a C<use lib> statement to the top of this |
| 2581 |
|
|
swish.cgi program. |
| 2582 |
|
|
|
| 2583 |
|
|
For example: |
| 2584 |
|
|
|
| 2585 |
|
|
use lib qw( |
| 2586 |
|
|
/home/bmoseley/perl_lib/lib/site_perl/5.6.0 |
| 2587 |
|
|
/home/bmoseley/perl_lib/lib/site_perl/5.6.0/i386-linux/ |
| 2588 |
|
|
); |
| 2589 |
|
|
|
| 2590 |
|
|
Once the stemmer module is installed, and you are using a stemmed index, the C<swish.cgi> script will automatically |
| 2591 |
|
|
detect this and use the stemmer module. |
| 2592 |
|
|
|
| 2593 |
|
|
=head1 DISCLAIMER |
| 2594 |
|
|
|
| 2595 |
|
|
Please use this CGI script at your own risk. |
| 2596 |
|
|
|
| 2597 |
|
|
This script has been tested and used without problem, but you should still be aware that |
| 2598 |
|
|
any code running on your server represents a risk. If you have any concerns please carefully |
| 2599 |
|
|
review the code. |
| 2600 |
|
|
|
| 2601 |
|
|
See http://www.w3.org/Security/Faq/www-security-faq.html |
| 2602 |
|
|
|
| 2603 |
|
|
Security on Windows is questionable. |
| 2604 |
|
|
|
| 2605 |
|
|
=head1 SUPPORT |
| 2606 |
|
|
|
| 2607 |
|
|
The SWISH-E discussion list is the place to ask for any help regarding SWISH-E or this example |
| 2608 |
|
|
script. See http://swish-e.org. |
| 2609 |
|
|
|
| 2610 |
|
|
Before posting please review: |
| 2611 |
|
|
|
| 2612 |
|
|
http://swish-e.org/2.2/docs/INSTALL.html#When_posting_please_provide_the_ |
| 2613 |
|
|
|
| 2614 |
|
|
Please do not contact the author or any of the swish-e developers directly. |
| 2615 |
|
|
|
| 2616 |
|
|
=head1 LICENSE |
| 2617 |
|
|
|
| 2618 |
|
|
swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley search@hank.org |
| 2619 |
|
|
Example CGI program for searching with SWISH-E |
| 2620 |
|
|
|
| 2621 |
|
|
|
| 2622 |
|
|
This program is free software; you can redistribute it and/or |
| 2623 |
|
|
modify it under the terms of the GNU General Public License |
| 2624 |
|
|
as published by the Free Software Foundation; either version |
| 2625 |
|
|
2 of the License, or (at your option) any later version. |
| 2626 |
|
|
|
| 2627 |
|
|
This program is distributed in the hope that it will be useful, |
| 2628 |
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 2629 |
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 2630 |
|
|
GNU General Public License for more details. |
| 2631 |
|
|
|
| 2632 |
|
|
|
| 2633 |
|
|
=head1 AUTHOR |
| 2634 |
|
|
|
| 2635 |
|
|
Bill Moseley -- search@hank.org |
| 2636 |
|
|
|
| 2637 |
|
|
=cut |
| 2638 |
|
|
|
| 2639 |
|
|
|