/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/example/swish.cgi
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/example/swish.cgi

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 adcroft 1.1 #!/usr/local/bin/perl -w
2     package SwishSearch;
3     use strict;
4    
5     use lib qw( modules ); ### This may need to be adjusted!
6     ### It should point to the location of the
7     ### associated script modules directory
8    
9     my $DEFAULT_CONFIG_FILE = '.swishcgi.conf';
10    
11     ###################################################################################
12     #
13     # If this text is displayed on your browser then your web server
14     # is not configured to run .cgi programs. Contact your web server administrator.
15     #
16     # To display documentation for this program type "perldoc swish.cgi"
17     #
18     # swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley swishscript@hank.org
19     # Example CGI program for searching with SWISH-E
20     #
21     # This example program will only run under an OS that supports fork().
22     # Ok, piped opens.
23     #
24     #
25     # This program is free software; you can redistribute it and/or
26     # modify it under the terms of the GNU General Public License
27     # as published by the Free Software Foundation; either version
28     # 2 of the License, or (at your option) any later version.
29     #
30     # This program is distributed in the hope that it will be useful,
31     # but WITHOUT ANY WARRANTY; without even the implied warranty of
32     # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33     # GNU General Public License for more details.
34     #
35     # The above lines must remain at the top of this program
36     #
37     # $Id: swish.cgi,v 1.33 2002/08/13 23:08:54 whmoseley Exp $
38     #
39     ####################################################################################
40    
41     # This is written this way so the script can be used as a CGI script or a mod_perl
42     # module without any code changes.
43    
44     # use CGI (); # might not be needed if using Apache::Request
45    
46     #=================================================================================
47     # CGI entry point
48     #
49     #=================================================================================
50    
51    
52    
53     # Run the script -- entry point if running as a CGI script
54    
# Plain CGI entry point: when not running under mod_perl, build the default
# configuration, merge it with the on-disk config file and serve the request.
if ( !$ENV{MOD_PERL} ) {
    process_request( merge_read_config( default_config() ) );
}
62    
63    
64    
65    
66     #==================================================================================
67     # This sets the default configuration parameters
68     #
69     # Any configuration read from disk is merged with these settings.
70     #
71     # Only a few settings are actually required. Some reasonable defaults are used
72     # for most. In fact, you can probably create a complete config as:
73     #
74     # return = {
75     # swish_binary => '/usr/local/bin/swish-e',
76     # swish_index => '/usr/local/share/swish/index.swish-e',
77     # title_property => 'swishtitle', # Not required, but recommended
78     # };
79     #
80     # But, that doesn't really show all the options.
81     #
82     # You can modify the options below, or you can use a config file. The config file
83     # is .swishcgi.conf by default (read from the current directory) that must return
84     # a hash reference. For example, to create a config file that changes the default
85     # title and index file name, plus uses Template::Toolkit to generate output
86     # create a config file as:
87     #
88     # # Example config file -- returns a hash reference
89     # {
90     # title => 'Search Our Site',
91     # swish_index => 'index.web',
92     #
93     # template => {
94     # package => 'TemplateToolkit',
95     # file => 'search.tt',
96     # options => {
97     # INCLUDE_PATH => '/home/user/swish-e/example',
98     # },
99     # };
100     #
101     #
102     #-----------------------------------------------------------------------------------
103    
104     sub default_config {
105    
106    
107    
108     ##### Configuration Parameters #########
109    
110     #---- This lists all the options, with many commented out ---
111     # By default, this config is used -- see the process_request() call below.
112    
113     # You should adjust for your site, and how your swish index was created.
114    
115     ##>>
116     ##>> Please don't post this entire section on the swish-e list if looking for help!
117     ##>>
118     ##>> Send a small example, without all the comments.
119    
120     #======================================================================
121     # *** NOTES ****
122     # Items beginning with an "x" or "#" are commented out
123     # the "x" form simply renames (hides) that setting. It's used
124     # to make it easy to disable a multi-line configuration setting.
125     #
126     # If you do not understand a setting then best to leave the default.
127     #
128     # Please follow the documentation (perldoc swish.cgi) and set up
129     # a test using the defaults before making changes. It's much easier
130     # to modify a working example than to try to get a modified example to work...
131     #
132     # Again, this is a Perl hash structure. Commas are important.
133     #======================================================================
134    
135     return {
136     title => 'Search our site', # Title of your choice. Displays on the search page
137     swish_binary => './swish-e', # Location of swish-e binary
138    
139    
140     # By default, this script tries to read a config file. You should probably
141     # comment this out if not used, to save a disk stat
142     config_file => $DEFAULT_CONFIG_FILE, # Default config file
143    
144    
145     # The location of your index file. Typically, this would not be in
146     # your web tree.
147     # If you have more than one index to search then specify an array
148     # reference. e.g. swish_index =>[ qw( index1 index2 index3 )],
149    
150     swish_index => 'index.swish-e', # Location of your index file
151    
152     # See "select_indexes" below for how to
153     # select more than one index.
154    
155     page_size => 15, # Number of results per page - default 15
156    
157    
158     # Property name to use as the main link text to the indexed document.
159     # Typically, this will be 'swishtitle' if have indexed html documents,
160     # But you can specify any PropertyName defined in your document.
161     # By default, swish will return the pathname for documents that do not
162     # have a title.
163     # In other words, this is used for the text of the links of the search results.
164     # <a href="prepend_path/swishdocpath">title_property</a>
165    
166     title_property => 'swishtitle',
167    
168    
169    
170     # prepend this path to the filename (swishdocpath) returned by swish. This is used to
171     # make the href link back to the original document. Comment out to disable.
172    
173     #prepend_path => 'http://localhost/mydocs',
174    
175    
176     # Swish has a configuration directive "StoreDescription" that will save part or
177     # all of a document's contents in the index file. This can then be displayed
178     # along with results. If you are indexing a lot of files this can use a lot of disk
179     # space, so test carefully before indexing your entire site.
180     # Building swish with zlib can greatly reduce the space used by StoreDescription
181     #
182     # This settings tells this script to display this description.
183     # Normally, this should be 'swishdescription', but you can specify another property name.
184     # There is no default.
185    
186     description_prop => 'swishdescription',
187    
188    
189    
190     # Property names listed here will be displayed in a table below each result
191     # You may wish to modify this list if you are using document properties (PropertyNames)
192     # in your swish-e index configuration
193     # There is no default.
194    
195     display_props => [qw/swishlastmodified swishdocsize swishdocpath/],
196    
197    
198    
199     # Results can be sorted by any of the properties listed here
200     # They will be displayed in a drop-down list
201     # Again, you may modify this list if you are using document properties of your own creation
202     # Swish uses the rank as the default sort
203    
204     sorts => [qw/swishrank swishlastmodified swishtitle swishdocpath/],
205    
206    
207     # Secondary_sort is used to sort within a sort
208     # You may enter a property name followed by a direction (asc|desc)
209    
210     secondary_sort => [qw/swishlastmodified desc/],
211    
212    
213    
214    
215    
216     # You can limit by MetaNames here. Names listed here will be displayed in
217     # a line of radio buttons.
218     # The default is to not allow any metaname selection.
219     # To use this feature you must define MetaNames while indexing.
220    
221     # The special "swishdefault" says to search any text that was not indexed
222     # as a specific metaname (e.g. typically the body of a HTML document and its title).
223    
224     # To see how this might work, add to your config file:
225     # MetaNames swishtitle swishdocpath
226     # reindex and try:
227    
228     metanames => [qw/swishdefault swishtitle swishdocpath /],
229    
230     # Add "all" to metanames to test the meta_groups feature described below
231    
232    
233    
234     # Another example: if you indexed an email archive
235     # that defined the metanames subject name email (as in the swish-e discussion archive)
236     # you might use:
237     #metanames => [qw/body subject name email/],
238    
239    
240     # Note that you can do a real "all" search if you use nested metanames in your source documents.
241     # Nesting metanames is most common with XML documents.
242    
243     # You can also group metanames into "meta-metanames".
244     # Example: Say you defined metanames "author", "comment" and "keywords"
245     # You want to allow searching "author", "comment" and the document body ("swishdefault")
246     # But you would also like an "all" search that searches all metanames, including "keywords":
247     #
248     # metanames => [qw/swishdefault author comment all/],
249     #
250     # Now, the "all" metaname is not a real metaname. It must be expanded into its
251     # individual metanames
252     #
253     # "meta_groups" maps a fake metaname to a list of real metanames
254     #
255     # meta_groups => {
256     # all => [qw/swishdefault author comment keywords / ],
257     # },
258     #
259     # swish.cgi will then take a query like
260     #
261     # all=(query words)
262     #
263     # into the query
264     #
265     # swishdefault=(query words) OR author=(query words) OR comment=(query words) OR keywords=(query words)
266     #
267     # This is not ideal, but should work for most cases
268     # (might fail under windows since the query is passed through the shell).
269    
270     # To enable this group add "all" to the list of metanames
271     meta_groups => {
272     all => [qw/swishdefault swishtitle swishdocpath/],
273     },
274    
275    
276    
277     # "name_labels" is used to map MetaNames and PropertyNames to user-friendly names
278     # on the form.
279    
280     name_labels => {
281     swishdefault => 'Title & Body',
282     swishtitle => 'Title',
283     swishrank => 'Rank',
284     swishlastmodified => 'Last Modified Date',
285     swishdocpath => 'Document Path',
286     swishdocsize => 'Document Size',
287     all => 'All', # group of metanames
288    
289     subject => 'Message Subject', # other examples
290     name => "Poster's Name",
291     email => "Poster's Email",
292     sent => 'Message Date',
293     },
294    
295    
296     timeout => 10, # limit time used by swish when fetching results - DoS protection.
297    
298     max_query_length => 100, # limit length of query string. Swish also has a limit (default is 40)
299     # You might want to set swish-e's limit higher, and use this to get a
300     # somewhat more friendly message.
301    
302    
303     # These settings will use some crude highlighting code to highlight search terms in the
304     # property specified above as the description_prop (normally, 'swishdescription').
305    
306    
307     max_chars => 500, # If "highlight" is not defined, then just truncate the description to this many *chars*.
308     # If you want to go by *words*, enable highlighting,
309     # and then comment-out show_words. It will be a little slower.
310    
311    
312     # This structure defines term highlighting, and what type of highlighting to use
313     # If you are using metanames in your searches and they map to properties that you
314     # will display, you may need to adjust the "meta_to_prop_map".
315    
316     highlight => {
317    
318     # Pick highlighting module -- you must make sure the module can be found
319    
320     # Ok speed, but doesn't handle phrases.
321     #Deals with stemming, but not stopwords
322     #package => 'DefaultHighlight',
323    
324     # Somewhat slow, but deals with phrases, stopwords, and stemming.
325     # Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
326     package => 'PhraseHighlight',
327    
328     # Fast: phrases without regard to wordcharacter settings
329     # doesn't do context display, so must match in first X words,
330     # doesn't handle stemming or stopwords.
331     #package => 'SimpleHighlight',
332    
333     show_words => 10, # Number of swish words words to show around highlighted word
334     max_words => 100, # If no words are found to highlighted then show this many words
335     occurrences => 6, # Limit number of occurrences of highlighted words
336     #highlight_on => '<b>', # HTML highlighting codes
337     #highlight_off => '</b>',
338     highlight_on => '<font style="background:#FFFF99">',
339     highlight_off => '</font>',
340    
341     # This maps search metatags to display properties.
342     meta_to_prop_map => {
343     swishdefault => [ qw/swishtitle swishdescription/ ],
344     swishtitle => [ qw/swishtitle/ ],
345     swishdocpath => [ qw/swishdocpath/ ],
346     all => [ qw/swishtitle swishdescription swishdocpath/ ],
347     },
348     },
349    
350    
351    
352     # If you specify more than one index file (as an array reference) you
353     # can set this to allow selection of which indexes to search.
354     # The default is to search all indexes specified if this is not used.
355     # When used, the first index is the default index.
356    
357     # You need to specify your indexes as an array reference:
358     #swish_index => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ],
359    
360     Xselect_indexes => {
361     #method => 'radio_group', # pick radio_group, popup_menu, or checkbox_group
362     method => 'checkbox_group',
363     #method => 'popup_menu',
364     columns => 3,
365     labels => [ 'Main Index', 'Other Index', qw/ two three four/ ], # Must match up one-to-one
366     description => 'Select Site: ',
367     },
368    
369    
370     # Similar to select_indexes, this adds a metaname search
371     # based on a metaname. You can use any metaname, and this will
372     # add an "AND" search to limit results to a subset of your records.
373     # i.e. it adds something like 'site=(foo or bar or baz)' if foo, bar, and baz were selected.
374    
375     # Swish-e's ExtractPath would work well with this. For example, the apache docs:
376     # ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
377     # ExtractPathDefault site other
378    
379    
380     Xselect_by_meta => {
381     #method => 'radio_group', # pick: radio_group, popup_menu, or checkbox_group
382     method => 'checkbox_group',
383     #method => 'popup_menu',
384     columns => 3,
385     metaname => 'site', # Can't be a metaname used elsewhere!
386     values => [qw/misc mod vhosts other/],
387     labels => {
388     misc => 'General Apache docs',
389     mod => 'Apache Modules',
390     vhosts => 'Virutal hosts',
391     },
392     description => 'Limit search to these areas: ',
393     },
394    
395    
396    
397    
398     # The 'template' setting defines what generates the output
399     # The default is "TemplateDefault" which is reasonably ugly.
400     # Note that some of the above options may not be available
401     # for templating, as it's up to you to lay out the form
402     # and results in your template.
403    
404    
405     xtemplate => {
406     package => 'TemplateDefault',
407     },
408    
409     xtemplate => {
410     package => 'TemplateDumper',
411     },
412    
413     xtemplate => {
414     package => 'TemplateToolkit',
415     file => 'search.tt',
416     options => {
417     INCLUDE_PATH => '/home/user/swish-e/example',
418     #PRE_PROCESS => 'config',
419     },
420     },
421    
422     xtemplate => {
423     package => 'TemplateHTMLTemplate',
424     options => {
425     filename => 'swish.tmpl',
426     die_on_bad_params => 0,
427     loop_context_vars => 1,
428     cache => 1,
429     },
430     },
431    
432    
433    
434     # The "on_intranet" setting is just a flag that can be used to say you do
435     # not have an external internet connection. It's here because the default
436     # page generation includes links to images on swish-e.org and on www.w3.org.
437     # If this is set to one then those images will not be shown.
438     # (This only affects the default output module TemplateDefault)
439    
440     on_intranet => 0,
441    
442    
443    
444     # Here you can hard-code debugging options. They will help you find
445     # where you made your mistake ;)
446     # Using all at once will generate a lot of messages to STDERR
447     # Please see the documentation before using these.
448     # Typically, you will set these from the command line instead of in the configuration.
449    
450     # debug_options => 'basic, command, headers, output, summary, dump',
451    
452    
453    
454     # This defines the package object for reading CGI parameters
455     # Defaults to CGI. Might be useful with mod_perl.
456     # request_package => 'CGI',
457     # request_package => 'Apache::Request',
458    
459    
460    
461     # Minor adjustment to page display. The page navigation normally looks like:
462     # Page: 1 5 6 7 8 9 24
463     # where the first page and last page are always displayed. These can be disabled
464     # by setting to true values ( 1 )
465    
466     no_first_page_navigation => 0,
467     no_last_page_navigation => 0,
468    
469    
470    
471    
472     # Limit to date ranges
473    
474    
475    
476     # This adds in the date_range limiting options
477     # You will need the DateRanges.pm module from the author to use that feature
478    
479     # Normally, you will want to limit by the last modified date, so specify
480     # "swishlastmodified" as the property_name. If indexing a mail archive, and, for
481     # example, you store the date (a unix timestamp) as "date" then specify
482     # "date" as the property_name.
483    
484     date_ranges => {
485     property_name => 'swishlastmodified', # property name to limit by
486    
487     # what you specify here depends on the DateRanges.pm module.
488     time_periods => [
489     'All',
490     'Today',
491     'Yesterday',
492     #'Yesterday onward',
493     'This Week',
494     'Last Week',
495     'Last 90 Days',
496     'This Month',
497     'Last Month',
498     #'Past',
499     #'Future',
500     #'Next 30 Days',
501     ],
502    
503     line_break => 0,
504     default => 'All',
505     date_range => 1,
506     },
507    
508     };
509    
510     }
511    
512     #^^^^^^^^^^^^^^^^^^^^^^^^^ end of user config ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
513     #========================================================================================
514    
515    
516    
517     #=================================================================================
518     # mod_perl entry point
519     #
520     # As an example, you might use a PerlSetVar to point to paths to different
521     # config files, and then cache the different configurations by path.
522     #
523     #=================================================================================
524    
# Merged configurations, keyed by the config file path they were read from.
my %cached_configs;

# mod_perl content handler.
#
# Takes the request object, looks up the 'Swish_Conf_File' directory setting
# and serves the request with the configuration merged from that file; the
# merged configuration is built once per path and reused afterwards.
# Without that setting the hard-coded default configuration is used.
# Always returns Apache::Constants::OK().
sub handler {
    my $r = shift;

    my $config_path = $r->dir_config( 'Swish_Conf_File' );

    # No per-directory config file: use the hard-coded config
    unless ( $config_path ) {
        process_request( default_config() );
        return Apache::Constants::OK();
    }

    # First request for this path: load the defaults, point them at the
    # requested file and remember the merged result
    unless ( $cached_configs{ $config_path } ) {
        my $fresh = default_config();
        $fresh->{config_file} = $config_path;
        $cached_configs{ $config_path } = merge_read_config( $fresh );
    }

    process_request( $cached_configs{ $config_path } );
    return Apache::Constants::OK();
}
556    
557    
558     #============================================================================
559     # Read config settings from disk, and merge
560     # Note, all errors are ignored since by default this script looks for a
561     # config file.
562     #
563     #============================================================================
# merge_read_config( $config )
#
# Sets up the debug flags, then (when $config->{config_file} is set) loads
# that file with "do" and merges the hash reference it returns over $config.
#
# Returns: $config itself when there is no config file (or the *default*
#          config file simply does not exist), otherwise a new hash
#          reference holding the merged settings.
# Dies:    when the file cannot be read/compiled or does not return a hash
#          reference.
sub merge_read_config {
    my $config = shift;

    set_default_debug_flags();

    set_debug( $config );    # get from config or from %ENV

    my $file = $config->{config_file};
    return $config unless $file;

    # "do FILE" looks a bare relative name (such as '.swishcgi.conf') up in
    # @INC, and "." is no longer part of @INC since Perl 5.26.  The config
    # file is documented as being read from the current directory, so when
    # it exists there make the path explicitly relative.  Absolute paths and
    # paths already starting with ./ or ../ are passed through untouched.
    require File::Spec;
    my $load_path = $file;
    if (   -e $file
        && !File::Spec->file_name_is_absolute( $file )
        && $file !~ m{^\.\.?/} )
    {
        $load_path = "./$file";
    }

    my $return = do $load_path;    # load the config file

    unless ( ref $return eq 'HASH' ) {

        # Capture the reason right away -- the -e test below may reset $!
        my $error =
              $@               ? $@
            : !defined $return ? ( $! || 'file returned an undefined value' )
            :                    'file did not return a hash reference';

        # First, let's check for file not found for the default config, which we can ignore

        if ( $file eq $DEFAULT_CONFIG_FILE && !-e $file ) {
            warn "Config file '$file': $!" if $config->{debug};
            return $config;
        }

        die "Config file '$file': $error";
    }

    if ( $config->{debug} || $return->{debug} ) {
        require Data::Dumper;
        print STDERR "\n---------- Read config parameters from '$config->{config_file}' ------\n",
            Data::Dumper::Dumper($return),
            "-------------------------\n";
    }

    # Debug settings from the file override those computed from the defaults
    set_debug( $return );

    # Merge settings -- values from the file win
    return { %$config, %$return };
}
606    
607     #--------------------------------------------------------------------------------------------------
# Defines the debug bit flags as package globals in SwishSearch.
# Each flag is a distinct power of two so that they can be OR-ed together
# into a single debug level.
sub set_default_debug_flags {
    (
        $SwishSearch::DEBUG_BASIC,        #  1: basic debugging
        $SwishSearch::DEBUG_COMMAND,      #  2: show command used to run swish
        $SwishSearch::DEBUG_HEADERS,      #  4: swish output headers
        $SwishSearch::DEBUG_OUTPUT,       #  8: swish output besides headers
        $SwishSearch::DEBUG_SUMMARY,      # 16: summary of results parsed
        $SwishSearch::DEBUG_DUMP_DATA,    # 32: dump data that is sent to templating modules
    ) = map { 1 << $_ } 0 .. 5;

    # Same value the final assignment used to yield implicitly
    return $SwishSearch::DEBUG_DUMP_DATA;
}
618    
619    
620    
621    
622     #---------------------------------------------------------------------------------------------------
# set_debug( $conf )
#
# Computes $conf->{debug} from a comma separated list of debug option names.
# The list is taken from $ENV{SWISH_DEBUG} when set, otherwise from
# $conf->{debug_options}.  With neither present, debug is set to 0.
# Otherwise debug starts at 1 and the flag of every named option is OR-ed in.
#
# An unknown option name prints the list of valid options to STDERR and
# exits the program.  Relies on the $SwishSearch::DEBUG_* flags being set.
sub set_debug {
    my $conf = shift;

    # The environment takes precedence over the configuration setting
    my $options = $ENV{SWISH_DEBUG} || $conf->{debug_options};

    unless ( $options ) {
        $conf->{debug} = 0;
        return;
    }

    my %debug = (
        basic   => [$SwishSearch::DEBUG_BASIC,     'Basic debugging'],
        command => [$SwishSearch::DEBUG_COMMAND,   'Show command used to run swish'],
        headers => [$SwishSearch::DEBUG_HEADERS,   'Show headers returned from swish'],
        output  => [$SwishSearch::DEBUG_OUTPUT,    'Show output from swish'],
        summary => [$SwishSearch::DEBUG_SUMMARY,   'Show summary of results'],
        dump    => [$SwishSearch::DEBUG_DUMP_DATA, 'Show all data available to templates'],
    );

    $conf->{debug} = 1;

    for my $option ( split /\s*,\s*/, $options ) {
        my $key = lc $option;

        if ( exists $debug{$key} ) {
            $conf->{debug} |= $debug{$key}->[0];
            next;
        }

        # Unknown name: list the valid ones, ordered by flag value, and stop
        print STDERR "Unknown debug option '$option'. Must be one of:\n",
            join( "\n", map { sprintf(' %10s: %10s', $_, $debug{$_}->[1]) } sort { $debug{$a}->[0] <=> $debug{$b}->[0] } keys %debug ),
            "\n\n";
        exit;
    }

    print STDERR "Debug level set to: $conf->{debug}\n";
}
657    
658    
659     #============================================================================
660     #
661     # This is the main entry point, where a config hash is passed in.
662     #
663     #============================================================================
664    
# process_request( $conf )
#
# Main entry point: takes the configuration hash reference, creates the
# request object (CGI by default, or $conf->{request_package}), runs the
# swish query and hands the results to the configured template package
# (TemplateDefault when $conf->{template} is not set).
#
# In debug mode the query and page size are read interactively from STDIN
# and a summary of the results is written to STDERR.
# If the template package cannot be loaded an error page is printed and the
# program exits.
sub process_request {
    my $conf = shift;    # configuration parameters

    # Use CGI.pm by default
    my $request_package = $conf->{request_package} || 'CGI';
    ( my $request_file = "$request_package.pm" ) =~ s[::][/]g;
    require $request_file;

    my $request_object = $request_package->new;

    if ( $conf->{debug} ) {
        print STDERR 'Enter a query [all]: ';
        my $query = <STDIN>;
        $query = '' unless defined $query;    # STDIN closed / at EOF
        $query =~ tr/\r//d;
        chomp $query;
        unless ( $query ) {
            print STDERR "Using 'not asdfghjklzxcv' to match all records\n";
            $query = 'not asdfghjklzxcv';
        }

        $request_object->param('query', $query );

        print STDERR 'Enter max results to display [1]: ';
        my $max = <STDIN>;
        $max = '' unless defined $max;        # STDIN closed / at EOF
        chomp $max;
        $max = 1 unless $max && $max =~/^\d+$/;

        $conf->{page_size} = $max;
    }

    # create search object
    my $search = SwishQuery->new(
        config  => $conf,
        request => $request_object,
    );

    # run the query
    my $results = $search->run_query;    # currently, results is the just the $search object

    if ( $conf->{debug} ) {
        if ( $conf->{debug} & $SwishSearch::DEBUG_DUMP_DATA ) {
            require Data::Dumper;
            print STDERR "\n------------- Results structure passed to template ------------\n",
                Data::Dumper::Dumper( $results ),
                "--------------------------\n";
        }
        elsif ( $conf->{debug} & $SwishSearch::DEBUG_SUMMARY ) {
            print STDERR "\n------------- Results Summary ------------\n";
            if ( $results->{hits} ) {
                require Data::Dumper;
                print STDERR "Showing $results->{navigation}{showing} of $results->{navigation}{hits}\n",
                    Data::Dumper::Dumper( $results->{_results} );
            }
            else {
                print STDERR "** NO RESULTS **\n";
            }

            print STDERR "--------------------------\n";
        }
        else {
            print STDERR ( ($results->{hits} ? "Found $results->{hits} results\n" : "Failed to find any results\n" . $results->errstr . "\n" ),"\n" );
        }
    }

    # Load the output (template) package on demand
    my $template = $conf->{template} || { package => 'TemplateDefault' };

    my $package = $template->{package};

    my $file = "$package.pm";
    $file =~ s[::][/]g;

    eval { require $file };
    if ( $@ ) {
        # Details go to the error log; the browser only gets a generic page
        warn "$0 $@\n";
        print <<EOF;
Content-Type: text/html

<html>
<head><title>Software Error</title></head>
<body><h2>Software Error</h2><p>Please check error log</p></body>
</html>
EOF

        exit;
    }

    $package->show_template( $template, $results );
}
755    
756    
757    
758    
759    
760     #==================================================================================================
761     package SwishQuery;
762     #==================================================================================================
763    
764     use Carp;
765     # Or use this instead -- PLEASE see perldoc CGI::Carp for details
766     # <opinion>CGI::Carp doesn't help that much</opinion>
767     #use CGI::Carp; # qw(fatalsToBrowser);
768    
769    
770     #--------------------------------------------------------------------------------
771     # new() doesn't do much, just create the object
772     #--------------------------------------------------------------------------------
# Constructor: new( config => \%conf, request => $request_object )
#
# Only builds the object.  Croaks unless the configuration names both the
# swish index file(s) and the swish binary.
sub new {
    my ( $class, %options ) = @_;

    my $conf = $options{config};

    $conf->{swish_index}
        or croak "Failed to set the swish index files in config setting 'swish_index'";
    $conf->{swish_binary}
        or croak "Failed to specify 'swish_binary' in configuration";

    # initial state of the search object
    my %search = (
        prog     => $conf->{swish_binary},
        config   => $conf,
        q        => $options{request},
        hits     => 0,
        MOD_PERL => $ENV{MOD_PERL},
    );

    return bless \%search, $class;
}
793    
794    
# Accessor: value stored under the object's 'hits' key.
sub hits {
    my $self = shift;
    return $self->{hits};
}
796    
# config( $setting [, $value] )
#
# Returns the current value of the named configuration setting (undef when
# the setting does not exist).  When a defined $value is passed the setting
# is updated to it -- including false values such as 0 or '' -- and the
# previous value is returned.
# Croaks when no setting name is given.
sub config {
    my ( $self, $setting, $value ) = @_;

    croak "Failed to pass 'config' a setting" unless $setting;

    # Plain conditional expression: "my $x = ... if COND" has undefined
    # behaviour in Perl and must not be used.
    my $cur = exists $self->{config}{$setting}
        ? $self->{config}{$setting}
        : undef;

    $self->{config}{$setting} = $value if defined $value;

    return $cur;
}
808    
# header( $name )
#
# Returns the named entry of the object's '_headers' hash, or '' when that
# entry is missing or false.  Returns nothing when '_headers' is not a hash
# reference.
sub header {
    my ( $self, $name ) = @_;

    my $headers = $self->{_headers};
    return unless ref $headers eq 'HASH';

    my $value = $headers->{$name};
    return $value ? $value : '';
}
815    
816    
# Returns whatever is stored under '_results' (an array reference once set),
# or undef when nothing true is stored there.
sub results {
    my ($self) = @_;

    my $rows = $self->{_results};
    return $rows ? $rows : undef;
}
822    
# navigation( $key )
#
# Returns the named entry of the object's 'navigation' hash, or '' when the
# key does not exist.  Returns nothing when 'navigation' is not a hash
# reference.
sub navigation {
    my ( $self, $key ) = @_;

    my $nav = $self->{navigation};
    return unless ref $nav eq 'HASH';

    return '' unless exists $nav->{$key};
    return $nav->{$key};
}
829    
# Accessor: the request object stored under the 'q' key.
sub CGI {
    my ($self) = @_;
    return $self->{q};
}
831    
832    
833    
834    
# swish_command( [@args] )
#
# With arguments: appends them to the stored command list (the value of the
# push is returned).
# Without arguments: returns the stored command list, or undef when nothing
# has been stored yet.
sub swish_command {
    my ( $self, @args ) = @_;

    if ( @args ) {
        return push @{ $self->{swish_command} }, @args;
    }

    my $stored = $self->{swish_command};
    return $stored ? @$stored : undef;
}
845    
846    
# errstr( [$message] )
#
# Stores $message as the current error string when a true value is passed,
# then returns the current error string ('' when none is set).
sub errstr {
    my $self    = shift;
    my $message = shift;

    if ( $message ) {
        $self->{_errstr} = $message;
    }

    my $current = $self->{_errstr};
    return $current ? $current : '';
}
855    
856    
857    
858    
859    
860    
861     #============================================
862     # This returns "$self" just in case we want to separate out into two objects later
863    
864    
865     sub run_query {
866    
867     my $self = shift;
868    
869     my $q = $self->{q};
870     my $conf = $self->{config};
871    
872    
873     # Sets the query string, and any -L limits.
874     return $self unless $self->build_query;
875    
876    
877    
878     # Set the starting position (which is offset by one)
879    
880     my $start = $q->param('start') || 0;
881     $start = 0 unless $start =~ /^\d+$/ && $start >= 0;
882    
883     $self->swish_command( '-b', $start+1 );
884    
885    
886    
887     # Set the max hits
888    
889     my $page_size = $self->config('page_size') || 15;
890     $self->swish_command( '-m', $page_size );
891    
892    
893     return $self unless $self->set_index_file;
894    
895    
896    
897     # Set the sort option, if any
898     return $self unless $self->set_sort_order;
899    
900    
901    
902     my $timeout = $self->config('timeout') || 0;
903    
904     eval {
905     local $SIG{ALRM} = sub { die "Timed out\n" };
906     alarm $timeout if $timeout && $^O !~ /Win32/i;
907     $self->run_swish;
908     alarm 0 unless $^O =~ /Win32/i;
909     waitpid $self->{pid}, 0 if $self->{pid}; # for IPC::Open2
910     };
911    
912     if ( $@ ) {
913     warn "$0 $@"; # if $conf->{debug};
914     $self->errstr( "Service currently unavailable" );
915     return $self;
916     }
917    
918    
919    
920     my $hits = $self->hits;
921     return $self unless $hits;
922    
923    
924    
925     # Build href for repeated search via GET (forward, backward links)
926    
927    
928     my @query_string =
929     map { "$_=" . $q->escape( $q->param($_) ) }
930     grep { $q->param($_) } qw/query metaname sort reverse/;
931    
932    
933     for my $p ( qw/si sbm/ ) {
934     my @settings = $q->param($p);
935     next unless @settings;
936     push @query_string, "$p=" . $q->escape( $_ ) for @settings;
937     }
938    
939    
940    
941    
942     if ( $conf->{date_ranges} ) {
943     my $dr = DateRanges::GetDateRangeArgs( $q );
944     push @query_string, $dr, if $dr;
945     }
946    
947    
948     $self->{query_href} = $q->script_name . '?' . join '&amp;', @query_string;
949    
950    
951    
952     # Return the template fields
953    
954     $self->{my_url} = $q->script_name;
955    
956     $self->{hits} = $hits;
957    
958     $self->{navigation} = {
959     showing => $hits,
960     from => $start + 1,
961     to => $start + $hits,
962     hits => $self->header('number of hits') || 0,
963     run_time => $self->header('run time') || 'unknown',
964     search_time => $self->header('search time') || 'unknown',
965     };
966    
967    
968     $self->set_page ( $page_size );
969    
970     return $self;
971    
972     }
973    
974    
#============================================================
# build_query()
#
# Builds the search expression for swish from the request and registers it
# with swish_command() as the -w argument.
#
# Side effects on success or along the way:
#   $self->{query_simple}  - trimmed query text, without the metaname
#   $self->{metaname}      - metaname requested (may be a meta_groups name)
#   $self->{real_metaname} - first member of the group, for meta_groups names
#   the request's 'query' parameter is replaced by the trimmed text
#
# Returns:
#   1 on success; a false value otherwise (an error message is stored with
#   errstr() where there is something to report).
#------------------------------------------------------------

sub build_query {
    my $self = shift;

    my $cgi = $self->{q};


    # Fetch the query text and trim white space at both ends
    my $query = $cgi->param('query') || '';
    $query =~ s/\s+$//;
    $query =~ s/^\s+//;

    $self->{query_simple} = $query;       # without metaname
    $cgi->param( 'query', $query );       # clean up the query, if needed.


    # Read in the date limits, if any.  This can create a new query
    return unless $self->get_date_limits( \$query );


    if ( !$query ) {
        $self->errstr('Please enter a query string') if $cgi->param('submit');
        return;
    }

    if ( length( $query ) > $self->{config}{max_query_length} ) {
        $self->errstr('Please enter a shorter query');
        return;
    }


    # Adjust the query string for metaname search
    # *Everything* is a metaname search
    # Might also like to allow searching more than one metaname at the same time

    my $metaname = $cgi->param('metaname') || 'swishdefault';


    # make sure it's a valid metaname: 'swishdefault' plus the configured ones

    my $conf = $self->{config};

    my %is_valid_meta = ( swishdefault => 1 );
    if ( $self->config('metanames') ) {
        $is_valid_meta{$_} = 1 for @{ $self->config('metanames') };
    }

    if ( !$is_valid_meta{$metaname} ) {
        $self->errstr('Bad MetaName provided');
        return;
    }


    # prepend metaname to query

    my $group = $conf->{meta_groups} && $conf->{meta_groups}{$metaname};

    if ( $group ) {
        # A group name expands to one clause per member, joined with OR
        my @clauses;
        push @clauses, "$_=($query)" for @$group;
        $query = join ' OR ', @clauses;

        # This is used to create a fake entry in the parsed query so highlighting
        # can find the query words
        $self->{real_metaname} = $group->[0];
    }
    else {
        $query = "$metaname=($query)";
    }

    # save the metaname so we know what field to highlight
    # Note that this might be a fake metaname
    $self->{metaname} = $metaname;


    ## Look for a "limit" metaname -- perhaps used with ExtractPath
    # Here we don't worry about user supplied data

    my $sbm_conf   = $self->config('select_by_meta');
    my @sbm_values = $cgi->param('sbm');      # Select By Metaname


    # Note that this could be messed up by ending the query in a NOT or OR
    # Should look into doing:
    # $query = "( $query ) AND " . $limits->{metaname} . '=(' . join( ' OR ', @limits ) . ')';
    if ( @sbm_values && ref $sbm_conf eq 'HASH' && $sbm_conf->{metaname} ) {
        $query .= ' and ' . $sbm_conf->{metaname} . '=(' . join( ' or ', @sbm_values ) . ')';
    }


    $self->swish_command( '-w', $query );

    return 1;
}
1070    
#========================================================================
# set_index_file()
#
# Adds the -f option (index files) to the swish command.
#
# When the 'select_indexes' setting is true and 'swish_index' is an array
# ref, the request's 'si' parameters select the indexes by position in that
# array. Otherwise every configured index is used.
#
# Returns:
#   1 on success; a false value, with errstr() set, when an index had to be
#   selected but nothing (or nothing valid) was selected.
#------------------------------------------------------------------------

sub set_index_file {
    my $self = shift;

    my $q = $self->CGI;

    my $indexes = $self->config('swish_index');

    # No selection by the user: search all configured index files
    unless ( $self->config('select_indexes') && ref $indexes eq 'ARRAY' ) {
        $self->swish_command( '-f', ref $indexes ? @$indexes : $indexes );
        return 1;
    }


    my @choices = $q->param('si');
    if ( !@choices ) {
        $self->errstr('Please select a source to search');
        return;
    }

    # Keep only the choices that are valid positions in the index list
    my @selected_indexes = grep { /^\d+$/ && $_ >= 0 && $_ < @$indexes } @choices;

    if ( !@selected_indexes ) {
        $self->errstr('Invalid source selected');

        # Must be false. This used to return $self, which the caller took as
        # success and went on to run swish without any -f option.
        return;
    }

    $self->swish_command( '-f', @{$indexes}[ @selected_indexes ] );

    return 1;
}
1109    
#================================================================================
# get_date_limits( \$query )
#
# Parses the date range selection out of the request (form or GET) with the
# DateRanges module and adds a -L limit to the swish command.
#
# Pass:
#   \$query - reference to the query text; when the query is empty but a
#             date range was selected it is replaced by a "match anything"
#             query so a search by date only is possible.
#
# Returns:
#   1 when there is nothing to do or the limits were set (this includes the
#   case where DateRanges cannot be loaded: the feature is then switched off
#   by deleting the date_ranges setting);
#   a false value, with errstr() set, when the selection could not be parsed.
#---------------------------------------------------------------------------------

sub get_date_limits {

    my ( $self, $query_ref ) = @_;

    my $conf = $self->{config};

    # Are date ranges enabled?
    return 1 unless $conf->{date_ranges};


    # The module is optional -- carry on without the feature if it won't load
    eval { require DateRanges };
    my $load_error = $@;

    if ( $load_error ) {
        if ( $conf->{debug} ) {
            print STDERR "\n------ Can't use DateRanges feature ------------\n",
                         "\nScript will run, but you can't use the date range feature\n",
                         $load_error,
                         "\n--------------\n";
        }

        delete $conf->{date_ranges};
        return 1;
    }


    my %limits;

    if ( !DateRanges::DateRangeParse( $self->{q}, \%limits ) ) {
        $self->errstr( $limits{DateRanges_error} || 'Bad date range selection' );
        return;
    }

    my ( $low, $high ) = @limits{ qw/DateRanges_time_low DateRanges_time_high/ };

    # Store the values for later

    $self->{DateRanges_time_low}  = $low;
    $self->{DateRanges_time_high} = $high;


    # Allow searches just by date if not "All dates" search
    # $$$ should place some limits here, and provide a switch to disable
    if ( !$$query_ref && $high ) {
        $$query_ref = 'not skaisikdeekk';
        $self->{_search_all}++;     # flag
    }


    my $limit_prop = $conf->{date_ranges}{property_name} || 'swishlastmodified';

    $self->swish_command( '-L', $limit_prop, $low, $high ) if $low && $high;

    return 1;
}
1168    
1169    
1170    
#================================================================
# set_sort_order()
#
# Adds the -s option (sort order) to the swish command.
#
# Nothing is added when no 'sorts' setting is configured. Otherwise the
# request's 'sort' parameter (default 'swishrank') must be one of the
# configured sorts. 'reverse' flips the default direction, which is
# descending for swishrank and ascending for everything else. A configured
# 'secondary_sort' (string or array ref) is appended unless it starts with
# the property already sorted by.
#
# Returns:
#   1 on success; a false value, with errstr() set, for an unknown sort.
#----------------------------------------------------------------

sub set_sort_order {
    my $self = shift;

    my $q = $self->{q};

    my $sorts_array = $self->config('sorts');
    return 1 unless $sorts_array;


    my $conf = $self->{config};


    # Now set sort option - if a valid option submitted (or you could let swish-e return the error).
    my %sorts = map { $_, 1 } @$sorts_array;

    my $sortby = $q->param('sort') || 'swishrank';

    unless ( $sorts{ $sortby } ) {
        $self->errstr( 'Invalid Sort Option Selected' );
        return;
    }


    my $direction = $sortby eq 'swishrank'
        ? $q->param('reverse') ? 'asc'  : 'desc'
        : $q->param('reverse') ? 'desc' : 'asc';

    $self->swish_command( '-s', $sortby, $direction );


    # secondary_sort may be a plain string or an array ref. Turn it into a
    # list before looking at its first element: indexing a plain string as
    # an array dies under "use strict".
    my @secondary;
    if ( my $setting = $conf->{secondary_sort} ) {
        @secondary = ref $setting ? @$setting : ( $setting );
    }

    if ( @secondary && $sortby ne $secondary[0] ) {
        $self->swish_command( @secondary );
    }

    return 1;
}
1212    
1213    
1214    
#========================================================
# set_page( $Page_Size )
#
# Sets prev and next page links.
# Feel free to clean this code up!
#
# Pass:
#   $Page_Size - number of results shown per page
#
# Reads:
#   $self->{navigation}{from}  - number of the first result shown (1 based)
#   $self->{navigation}{hits}  - total number of hits
#   $self->{hits}              - number of results on the current page
#   $self->{query_href}        - href used for the page links
#
# Sets in $self->{navigation}:
#   prev, prev_count - start and size of the previous page (if any)
#   next, next_count - start and size of the next page (if any)
#   pages            - page number links (only when more than one page)
#

sub set_page {

    my ( $self, $Page_Size ) = @_;

    my $nav = $self->{navigation};

    my $start = $nav->{from} - 1;      # Current starting record (0 based)


    # Previous page: one page back, but never before the first record

    my $prev = $start - $Page_Size;
    $prev = 0 if $prev < 0;

    if ( $prev < $start ) {
        @{$nav}{ qw/prev prev_count/ } = ( $prev, $start - $prev );
    }


    # Next page: one page on, but never past the last record

    my $last_record = $nav->{hits} - 1;
    my $page_end    = $start + $self->{hits} - 1;    # last record shown now

    my $next = $start + $Page_Size;
    $next = $last_record if $next > $last_record;

    if ( $next > $page_end ) {
        $nav->{next} = $next;

        if ( $next + $Page_Size > $last_record ) {
            $nav->{next_count} = $last_record - $next + 1;
        }
        else {
            $nav->{next_count} = $Page_Size;
        }
    }


    # Calculate pages ( is this -1 correct here? )
    # $last_page is the 0 based number of the last page

    my $last_page = int( ( $nav->{hits} - 1 ) / $Page_Size );

    if ( $last_page ) {

        my @shown = ( 0 .. $last_page );

        my $window = 10;      # most page links listed in a row

        if ( @shown > $window ) {

            # Start the window about half a window before the current page
            my $first = int( $start / $Page_Size - $window / 2 );
            $first = 0 if $first < 0;
            $first = $last_page - $window if $first + $window - 1 > $last_page;

            my $window_end = $first + $window - 1;

            @shown = ( $first .. $window_end );

            unshift @shown, 0
                if $first && !$self->{config}{no_first_page_navigation};

            push @shown, $last_page
                unless $window_end == $last_page || $self->{config}{no_last_page_navigation};
        }


        # The current page is shown as a plain number, the others as links
        my @links;
        for my $page_index ( @shown ) {
            my $page_start = $page_index * $Page_Size;
            my $label      = $page_index + 1;

            if ( $page_start == $start ) {
                push @links, $label;
            }
            else {
                push @links, qq[<a href="$self->{query_href}&amp;start=$page_start">$label</a>];
            }
        }

        $nav->{pages} = join ' ', @links;
    }

}
1295    
#==================================================
# get_date_ranges()
#
# Format and return the date range options in HTML.
# The form fields themselves are produced by DateRanges::DateRangeForm.
#
# Returns:
#   the HTML string, or '' when date ranges are not enabled.
#--------------------------------------------------
sub get_date_ranges {

    my $self = shift;

    my $conf = $self->{config};

    return '' unless $conf->{date_ranges};

    # pass parameters, and a hash to store the returned values.

    my %fields;

    DateRanges::DateRangeForm( $self->{q}, $conf->{date_ranges}, \%fields );


    # Set the layout:

    my @parts = ( '<br>Limit to: ' );

    push @parts, "$fields{buttons}<br>" if $fields{buttons};

    push @parts, $fields{date_range_button} || '';

    push @parts, " $fields{date_range_low} through $fields{date_range_high}"
        if $fields{date_range_low};

    return join '', @parts;
}
1327    
1328    
1329    
#============================================
# run_swish()
#
# Runs swish-e and gathers headers and results.
# Currently requires fork() to run (IPC::Open2 is used on Windows).
#
# Adds the -x (output format) and -H options to the command collected with
# swish_command(), starts swish through real_fork() or windows_fork() and
# reads its output line by line:
#
#   "# name: value"  header lines are stored in $self->{_headers}, keyed by
#                    the lower-cased name
#   "err: message"   stores the message with errstr() and returns at once
#   lines starting with a digit are results: the tab separated values are
#                    stored in a hash keyed by property name
#   "."              marks the end of the results
#
# Sets:
#   $self->{COMMAND}  - the command, joined with spaces
#   $self->{hits}     - number of results read
#   $self->{_results} - reference to the array of result hashes (only when
#                       there are results)
#
# Dies when swish prints a line that is not understood.
#

sub run_swish {

    my $self = shift;

    my $conf = $self->{config};


    # Gather up the properties to ask for, each of them once only.
    # %seen is keyed by property name, so a property listed in more than one
    # setting (or one of the defaults listed in display_props) is not
    # requested twice.

    my @properties = ( 'swishreccount' );     # a number must be first
    my %seen       = ( swishreccount => 1 );

    for my $setting ( qw/ title_property description_prop display_props / ) {
        my $value = $conf->{$setting} or next;

        for my $prop ( ref $value ? @$value : $value ) {
            push @properties, $prop unless $seen{$prop}++;
        }
    }

    # Add in the default props
    for my $prop ( qw/swishrank swishdocpath/ ) {
        push @properties, $prop unless $seen{$prop}++;
    }


    # The \t and \n are passed on literally (single quotes)
    $self->swish_command( '-x', join( '\t', map { "<$_>" } @properties ) . '\n' );

    $self->swish_command( '-H', 9 );

    my $fh = $^O =~ /Win32/i
        ? windows_fork( $conf, $self )
        : real_fork( $conf, $self );


    $self->{COMMAND} = join ' ', $self->{prog}, $self->swish_command;


    # read in from child

    my @results;

    my $trim_prop = $self->config('description_prop');

    my $highlight = $self->config('highlight');
    my $highlight_object;      # created when the first result is seen

    my %stops_removed;

    my $unknown_output = '';


    # Loop through values returned from swish.

    while (<$fh>) {

        chomp;
        tr/\r//d;

        # This will not work correctly with multiple indexes when different values are used.
        if ( /^# ([^:]+):\s+(.+)$/ ) {

            print STDERR "$_\n" if $conf->{debug} & $SwishSearch::DEBUG_HEADERS;

            my $name  = lc $1;
            my $value = $2;
            $self->{_headers}{$name} = $value;

            # Collect every distinct removed stopword as well
            push @{$self->{_headers}{'removed stopwords'}}, $value
                if $name eq 'removed stopword' && !$stops_removed{$value}++;

            next;
        } elsif ( $conf->{debug} & $SwishSearch::DEBUG_OUTPUT ) {
            print STDERR "$_\n";
        }


        # return swish errors as a message to the script
        $self->errstr($1), return if /^err:\s*(.+)/;

        # Or, if you want to log the errors and just say "Service Unavailable" use this:
        #die "$1\n" if /^err:\s*(.+)/;


        # Found a result
        if ( /^\d/ ) {

            my %hit;
            @hit{@properties} = split /\t/;
            push @results, \%hit;

            # There's a chance that the docpath could be modified by highlighting
            # when used in a "display_props".
            $hit{saved_swishdocpath} = $hit{swishdocpath};

            my $docpath = $hit{swishdocpath};
            $docpath =~ s/\s/%20/g;      # Replace spaces
            $hit{swishdocpath_href} = ( $self->config('prepend_path') || '' ) . $docpath;


            # Now do any formatting
            if ( $highlight ) {
                if ( !$highlight_object ) {
                    my $package = $highlight->{package} || 'DefaultHighlight';

                    eval { require "$package.pm" };
                    if ( $@ ) {
                        $self->errstr( "Failed to load Highlighting Module - check error log" );
                        warn "$0: $@";
                        $highlight = '';      # don't try again for later results
                        next;
                    } else {
                        $highlight_object = $package->new( $self, $self->{metaname} );
                    }
                }

                # Highlight any fields, as needed
                $highlight_object->highlight( \%hit );

                next;
            }


            # Trim down the description if no highlight, or if highlighting some other property
            # Not very nice.  The highlighting code would limit by words

            if ( $trim_prop && $hit{$trim_prop} ) {
                my $max = $conf->{max_chars} || 500;

                if ( length $hit{$trim_prop} > $max ) {
                    $hit{$trim_prop} = substr( $hit{$trim_prop}, 0, $max) . ' <b>...</b>';
                }
            }

            next;

        } elsif ( /^\.$/ ) {
            last;

        } else {
            next if /^#/;
        }

        # Anything else is unexpected; reported after the loop
        $unknown_output .= "'$_'\n";

    }

    die "Swish returned unknown output: $unknown_output\n" if $unknown_output;

    $self->{hits} = @results;
    $self->{_results} = \@results if @results;

}
1508    
1509     #==================================================================
1510     # Run swish-e by forking
1511     #
1512    
1513     use Symbol;
1514    
# real_fork( $conf, $self )
#
# Starts swish with a forking piped open.
#
# Pass:
#   $conf - the configuration hash (only 'debug' is looked at)
#   $self - the search object; supplies {prog} and swish_command()
#
# Returns:
#   (in the parent) a file handle to read the output of swish from.
#   The child never returns: it execs swish, or reports the failure and exits.
#
# Dies when the fork fails.
sub real_fork {
    my ( $conf, $self ) = @_;

    # Run swish
    my $fh  = gensym;
    my $pid = open( $fh, '-|' );

    die "Failed to fork: $!\n" unless defined $pid;

    # The parent just reads from the handle
    return $fh if $pid;


    # From here on we are in the child

    my @command = ( $self->{prog}, $self->swish_command );

    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {
        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( " \\\n", map { /[^\/.\-\w\d]/ ? qq['$_'] : $_ } @command );
        print STDERR "\n-----------------------------------------------\n";
    }

    # exec only returns on failure
    exec( @command ) or do {
        warn "Child process Failed to exec '$self->{prog}' Error: $!";
        print "Failed to exec Swish";      # send this message to parent.
        exit;
    };
}
1544    
1545    
1546     #=====================================================================================
1547     # Windows work around
1548     # from perldoc perlfok -- na, that doesn't work. Try IPC::Open2
1549     #
# windows_fork( $conf, $self )
#
# Starts swish through IPC::Open2, for systems without a working forking
# piped open.
#
# Pass:
#   $conf - the configuration hash (only 'debug' is looked at)
#   $self - the search object; supplies {prog} and swish_command()
#
# Returns:
#   a file handle to read the output of swish from. The process id is stored
#   in $self->{pid} so the caller can wait for the child.
sub windows_fork {
    my ( $conf, $self ) = @_;

    # Escape double quotes in a *copy* of the command. Doing the substitution
    # inside map { } on the list itself changed $self->{prog} in place (map
    # aliases $_ to it), and escaped it a second time when the debug output
    # below was enabled.
    my @command = ( $self->{prog}, $self->swish_command );
    s/"/\\"/g for @command;

    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {
        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( ' ', map { /[^.\-\w\d]/ ? qq["$_"] : $_ } @command );
        print STDERR "\n-----------------------------------------------\n";
    }


    require IPC::Open2;
    my ( $rdrfh, $wtrfh );

    my $pid = IPC::Open2::open2( $rdrfh, $wtrfh, @command );

    $self->{pid} = $pid;

    return $rdrfh;
}
1572    
1573     #=====================================================================================
1574     # This method parses out the query from the "Parsed words" returned by swish
1575     # for use in highlighting routines
1576     # This returns a hash ref:
1577     # $query->{text} # evertying is currently at level "text"
1578     # {$metaname} # the meta name
1579     # [ array of phrases ]
1580     # each phrase is made up of an array of words
1581    
1582    
1583    
1584    
1585    
# Set to a true value to print the parsed query structure to STDERR
use constant DEBUG_QUERY_PARSED => 0;

# extract_query_match()
#
# Breaks the "parsed words" header returned by swish into the words and
# phrases searched for, per metaname, for use by the highlighting code.
#
# The header is read as a series of "metaname = words" sections. The
# operators and, or, not and parentheses are dropped (outside of quotes),
# words between a pair of '"' words form one phrase, and a single word is
# recorded once per section only.
#
# Returns:
#   a reference to the hash described above, { text => { metaname => [ phrases ] } },
#   with the phrases sorted longest first. The same reference is stored in
#   $self->{query_match}.
sub extract_query_match {
    my $self = shift;

    my $parsed = $self->header('parsed words');     # grab query parsed by swish


    my %query_match;      # keywords broken down by layer and field.
    $self->{query_match} = \%query_match;


    # Loop through the query, one "field = words" section at a time

    while ( $parsed =~ /([a-z]+)\s+=\s+(.+?)(?=\s+[a-z]+\s+=|$)/g ) {

        my ( $field, $words ) = ( $1, $2 );

        my $layer = 'text';     # This might be used in the future to highlight tags when matching a href.

        # Expand group searches -- not currently used
        my @fields = ( $field );

        my $in_phrase;        # true between an opening and a closing quote
        my $phrase;           # words of the phrase being collected
        my %seen_single;      # single words already recorded for this section


        WORD:
        for my $word ( split /\s+/, $words ) {

            # XXX This list of swish operators could change "and or not" and is dependent on stopwords.
            # remove control words and parens
            next WORD if !$in_phrase && $word =~ /^(and|or|not|\(|\))$/;

            # Outside of quotes every word starts a new phrase
            $phrase = [] unless $in_phrase;

            if ( $word ne '"' ) {
                push @$phrase, $word;
            }
            elsif ( $in_phrase ) {
                $in_phrase = 0;       # closing quote: phrase is complete
            }
            else {
                $in_phrase++;         # opening quote
                next WORD;
            }

            next WORD if $in_phrase;


            # Only record single words once (this will probably break something)
            # Reason: to reduce the number of matches must check
            next WORD if @$phrase == 1 && $seen_single{ $phrase->[0] }++;

            push @{ $query_match{$layer}{$_} }, $phrase for @fields;
        }
    }


    # Here's a hack to make metaname expansion work
    # this will make an entry like all => [qw/ query words /]; for use with fake metanames

    my $real = $self->{real_metaname};

    if ( $real && $query_match{text}{$real} ) {
        $query_match{text}{ $self->{metaname} } = $query_match{text}{$real};
    }


    # Now, sort in descending order of phrase length

    for my $layer ( keys %query_match ) {
        print STDERR " LAYER: $layer\n" if DEBUG_QUERY_PARSED;

        for my $tag ( keys %{ $query_match{$layer} } ) {

            my $phrases = $query_match{$layer}{$tag};
            @$phrases = sort { @$b <=> @$a } @$phrases;

            if ( DEBUG_QUERY_PARSED ) {
                print STDERR " TAG: '$tag'\n";
                print STDERR " : '@$_'\n" foreach @$phrases;
            }
        }
    }


    # display parsed query instead of the title for debugging
    # use Data::Dumper;
    # $self->config('title',"<pre><font size=3>Query:\n$query\n" . Dumper(\%query_match) . '</font></pre>');


    return \%query_match;
}
1688    
1689    
1690     1;
1691    
1692    
1693     __END__
1694    
1695     =head1 NAME
1696    
1697     swish.cgi -- Example Perl script for searching with the SWISH-E search engine.
1698    
1699     =head1 DESCRIPTION
1700    
1701     C<swish.cgi> is a CGI script for searching with the SWISH-E search engine version 2.1-dev and above.
1702     It returns results a page at a time, with matching words from the source document highlighted, showing a
1703     few words of content on either side of the highlighted word.
1704    
1705     The script is highly configurable; you can search multiple (or selectable) indexes, limit searches to
1706     part of the index, allow sorting by a number of different properties, limit results to a date range, and so on.
1707    
1708     The standard configuration (i.e. not using a config file) should work with most swish index files.
1709     Customization of the parameters will be
1710     needed if you are indexing special meta data and want to search and/or display the meta data. The
1711     configuration can be modified by editing this script directly, or by using a configuration file (.swishcgi.conf
1712     by default).
1713    
1714     You are strongly encouraged to get the default configuration working before making changes. Most problems
1715     using this script are the result of configuration modifications.
1716    
1717     The script is modular in design. Both the highlighting code and output generation are handled by modules, which
1718     are included in the F<example/modules> directory. This allows for easy customization of the output without
1719     changing the main CGI script. A module exists to generate standard HTML output. There are also modules and
1720     template examples to use with the popular Perl templating systems HTML::Template and Template-Toolkit. This allows
1721     you to tightly integrate this script with the look of an existing template-driven web site.
1722     HTML::Template and Template-Toolkit are available from the CPAN (http://search.cpan.org).
1723    
1724     This script can also run basically unmodified as a mod_perl handler, providing much better performance than
1725     running as a CGI script.
1726    
1727     Please read the rest of the documentation. There's a C<DEBUGGING> section, and a C<FAQ> section.
1728    
1729     This script should work on Windows, but security may be an issue.
1730    
1731     =head1 REQUIREMENTS
1732    
1733     You should be running a reasonably current version of Perl. 5.00503 or above is recommended (anything older
1734     will not be supported).
1735    
1736     If you wish to use the date range feature you will need to install the Date::Calc module. This is available
1737     from http://search.cpan.org.
1738    
1739    
1740     =head1 INSTALLATION
1741    
1742     Here's an example installation session. Please get a simple installation working before modifying the
1743     configuration file. Most problems reported for using this script have been due to improper configuration.
1744    
1745     The script's default settings are setup for initial testing. By default the settings expect to find
1746     most files and the swish-e binary in the same directory as the script.
1747    
1748     For I<security> reasons, once you have tested the script you will want to change settings to limit access
1749     to some of these files by the web server
1750     (either by moving them out of web space, or using access control such as F<.htaccess>).
1751     An example of using F<.htaccess> on Apache is given below.
1752    
1753     It's expected that you have already unpacked the swish-e distribution
1754     and built the swish-e binary (if using a source distribution).
1755    
1756     Below is a (unix) session where we create a directory, move required files into this directory, adjust
1757     permissions, index some documents, and symlink into the web server.
1758    
1759     =over 4
1760    
1761     =item 1 Move required files into their own directory.
1762    
1763     This assumes that swish-e was unpacked and built in the ~/swish-e directory.
1764    
1765     ~ >mkdir swishdir
1766     ~ >cd swishdir
1767     ~/swishdir >cp ~/swish-e/example/swish.cgi .
1768     ~/swishdir >cp -rp ~/swish-e/example/modules .
1769     ~/swishdir >cp ~/swish-e/src/swish-e .
1770     ~/swishdir >chmod 755 swish.cgi
1771     ~/swishdir >chmod 644 modules/*
1772    
1773    
1774     =item 2 Create an index
1775    
1776     In this step you will create a simple configuration file. In this example the Apache documentation
1777     is indexed. Last we run a simple query to test swish.
1778    
1779     ~/swishdir >cat swish.conf
1780     IndexDir /usr/local/apache/htdocs
1781     IndexOnly .html .htm
1782     DefaultContents HTML
1783     StoreDescription HTML <body> 200000
1784     MetaNames swishdocpath swishtitle
1785    
1786     ~/swishdir >./swish-e -c swish.conf
1787     Indexing Data Source: "File-System"
1788     Indexing "/usr/local/apache/htdocs"
1789     Removing very common words...
1790     no words removed.
1791     Writing main index...
1792     Sorting words ...
1793     Sorting 7005 words alphabetically
1794     Writing header ...
1795     Writing index entries ...
1796     Writing word text: Complete
1797     Writing word hash: Complete
1798     Writing word data: Complete
1799     7005 unique words indexed.
1800     5 properties sorted.
1801     124 files indexed. 1485844 total bytes. 171704 total words.
1802     Elapsed time: 00:00:02 CPU time: 00:00:02
1803     Indexing done!
1804    
1805     Now, verify that the index can be searched:
1806    
1807     ~/swishdir >./swish-e -w install -m 1
1808     # SWISH format: 2.1-dev-25
1809     # Search words: install
1810     # Number of hits: 14
1811     # Search time: 0.001 seconds
1812     # Run time: 0.040 seconds
1813     1000 /usr/local/apache/htdocs/manual/dso.html "Apache 1.3 Dynamic Shared Object (DSO) support" 17341
1814     .
1815    
1816     Let's see what files we have in our directory now:
1817    
1818     ~/swishdir >ls -1 -F
1819     index.swish-e
1820     index.swish-e.prop
1821     modules/
1822     swish-e*
1823     swish.cgi*
1824     swish.conf
1825    
1826     =item 3 Test the CGI script
1827    
1828     This is a simple step, but often overlooked. You should test from the command line instead of jumping
1829     ahead and testing with the web server. See the C<DEBUGGING> section below for more information.
1830    
1831     ~/swishdir >./swish.cgi | head
1832     Content-Type: text/html; charset=ISO-8859-1
1833    
1834     <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1835     <html>
1836     <head>
1837     <title>
1838     Search our site
1839     </title>
1840     </head>
1841     <body>
1842    
1843     The above shows that the script can be run directly, and generates a correct HTTP header and HTML.
1844    
1845     If you run the above and see something like this:
1846    
1847     ~/swishdir >./swish.cgi
1848     bash: ./swish.cgi: No such file or directory
1849    
1850     then you probably need to edit the script to point to the correct location of your perl program.
1851     Here's one way to find out where perl is located (again, on unix):
1852    
1853     ~/swishdir >which perl
1854     /usr/local/bin/perl
1855    
1856     ~/swishdir >/usr/local/bin/perl -v
1857     This is perl, v5.6.0 built for i586-linux
1858     ...
1859    
1860     Good! We are using a reasonably current version of perl. You should be running
1861     at least perl 5.005 (5.00503 really). You may have problems otherwise.
1862    
1863     Now that we know perl is at F</usr/local/bin/perl> we can adjust the "shebang" line
1864     in the perl script (e.g. the first line of the script):
1865    
1866     ~/swishdir >pico swish.cgi
1867     (edit the #! line)
1868     ~/swishdir >head -1 swish.cgi
1869     #!/usr/local/bin/perl -w
1870    
1871     =item 4 Test with your web server
1872    
1873     How you do this is completely dependent on your web server, and you may need to talk to your web
1874     server admin to get this working. Often files with the .cgi extension are automatically set up to
1875     run as CGI scripts, but not always. In other words, this step is really up to you to figure out!
1876    
1877     First, I create a symlink in Apache's document root to point to my test directory "swishdir". This will work
1878     because I know my Apache server is configured to follow symbolic links.
1879    
1880     ~/swishdir >su -c 'ln -s /home/bill/swishdir /usr/local/apache/htdocs/swishdir'
1881     Password: *********
1882    
1883     If your account is on an ISP and your web directory is F<~/public_html> then you might just move the entire
1884     directory:
1885    
1886     mv ~/swishdir ~/public_html
1887    
1888     Now, let's make a real HTTP request. I happen to have Apache set up on a local port:
1889    
1890     ~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head -3
1891     #!/usr/local/bin/perl -w
1892     package SwishSearch;
1893     use strict;
1894    
1895     Oh, darn. It looks like Apache is not running the script and instead returning it as a
1896     static page. I need to tell Apache that swish.cgi is a CGI script.
1897    
1898     In my case F<.htaccess> comes to the rescue:
1899    
1900     ~/swishdir >cat .htaccess
1901    
1902     # Deny everything by default
1903     Deny From All
1904    
1905     # But allow just CGI script
1906     <files swish.cgi>
1907     Options ExecCGI
1908     Allow From All
1909     SetHandler cgi-script
1910     </files>
1911    
1912     Let's try the request one more time:
1913    
1914     ~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head
1915     <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1916     <html>
1917     <head>
1918     <title>
1919     Search our site
1920     </title>
1921     </head>
1922     <body>
1923     <h2>
1924     <a href="http://swish-e.org">
1925    
1926     That looks better! Now use your web browser to test.
1927    
1928     Make sure you look at your web server's error log file while testing the script.
1929    
1930     BTW - "GET" is a program included with Perl's LWP library. If you do not have this you might
1931     try something like:
1932    
1933     wget -O - http://localhost:8000/swishdir/swish.cgi | head
1934    
1935     and if nothing else, you can always telnet to the web server and make a basic request.
1936    
1937     ~/swishtest > telnet localhost 8000
1938     Trying 127.0.0.1...
1939     Connected to localhost.
1940     Escape character is '^]'.
1941     GET /swishtest/swish.cgi http/1.0
1942    
1943     HTTP/1.1 200 OK
1944     Date: Wed, 13 Feb 2002 20:14:31 GMT
1945     Server: Apache/1.3.20 (Unix) mod_perl/1.25_01
1946     Connection: close
1947     Content-Type: text/html; charset=ISO-8859-1
1948    
1949     <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1950     <html>
1951     <head>
1952     <title>
1953     Search our site
1954     </title>
1955     </head>
1956     <body>
1957    
1958     This may seem like a lot of work compared to using a browser, but browsers
1959     are a poor tool for basic CGI debugging.
1960    
1961    
1962     =back
1963    
1964     If you have problems check the C<DEBUGGING> section below.
1965    
1966     =head1 CONFIGURATION
1967    
1968     If you want to change the location of the swish-e binary or the index file, use multiple indexes, add additional metanames and properties,
1969     change the default highlighting behavior, etc., you will need to adjust the script's configuration settings.
1970    
1971     Please get a test setup working with the default parameters before making changes to any configuration settings.
1972     Better to debug one thing at a time...
1973    
1974     In general, you will need to adjust the script's settings to match the index file you are searching. For example,
1975     if you are indexing a hypermail list archive you may want to make the script
1976     use metanames/properties of Subject, Author, and, Email address. Or you may wish to provide a way to limit
1977     searches to parts of your index file (e.g. parts of your directory tree).
1978    
1979     To make things somewhat "simple", the configuration parameters are included near the top of the swish.cgi program.
1980     That is the only place that the individual parameters are defined and explained, so you will need to open up
1981     the swish.cgi script in an editor to view the options. Further questions about individual settings should
1982     be referred to the swish-e discussion list.
1983    
1984     The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should
1985     get you going. The perl hash structure may seem a bit confusing, but it makes it easy to create nested and complex
1986     parameters. Syntax is important, so cut-n-paste should be your best defense if you are not a perl programmer.
1987    
1988     By the way, Perl has a number of quote operators. For example, to quote a string you might write:
1989    
1990     title => 'Search My Site',
1991    
1992     Some options take more than one parameter, where each parameter must be quoted. For example:
1993    
1994     metanames => [ 'swishdefault', 'swishtitle', 'swishdocpath' ],
1995    
1996     Which assigns an array ( [...] ) of three strings to the "metanames" variable.
1997     Lists of quoted strings are so common in perl that there's a special operator called "qw" (quote word):
1998    
1999     metanames => [ qw/ swishdefault swishtitle swishdocpath / ],
2000    
2001     or to use parentheses as the quote characters (you can pick any):
2002    
2003     metanames => [ qw( swishdefault swishtitle swishdocpath ) ],
2004    
2005    
2006     You have two options for changing the configuration settings from their default values:
2007     you may edit the script directly, or you may use a configuration file. In either case, the configuration
2008     settings are a basic perl hash reference.
2009    
2010     Using a configuration file is described below; the file contains the same hash structure.
2011    
2012     There are many configuration settings, and some of them are commented out either by using
2013     a "#" symbol, or by simply renaming the configuration directive (e.g. by adding an "x" to the parameter
2014     name).
2015    
2016     A very basic configuration setup might look like:
2017    
2018     return {
2019     title => 'Search the Swish-e list', # Title of your choice.
2020     swish_binary => './swish-e', # Location of swish-e binary
2021     swish_index => 'index.swish-e', # Location of your index file
2022     };
2023    
2024     Or if searching more than one index:
2025    
2026     return {
2027     title => 'Search the Swish-e list',
2028     swish_binary => './swish-e',
2029     swish_index => ['index.swish-e', 'index2'],
2030     };
2031    
2032     Both of these examples return a reference to a perl hash ( C<return {...}> ). In the second example,
2033     the multiple index files are set as an array reference.
2034    
2035     Note that in the example above the swish-e binary file is relative to the current directory.
2036     If running under mod_perl you will typically need to use absolute paths.
2037    
2038     B<Using A Configuration File>
2039    
2040     As mentioned above, you can either edit the F<swish.cgi> script directly and modify the configuration settings, or
2041     use an external configuration file. The settings in the configuration file are merged with (override)
2042     the settings defined in the script.
2043    
2044     The advantage of using a configuration script is that you are not editing the swish.cgi script directly, and
2045     downloading a new version won't mean re-editing the cgi script. Also, if running under mod_perl you can use the same
2046     script loaded into Apache to manage many different search pages.
2047    
2048     By default, the script will attempt to read from the file F<.swishcgi.conf>.
2049     For example, you might only wish to change the title used
2050     in the script. Simply create a file called F<.swishcgi.conf> in the same directory as the CGI script:
2051    
2052     > cat .swishcgi.conf
2053     # Example swish.cgi configuration script.
2054     return {
2055     title => 'Search Our Mailing List Archive',
2056     };
2057    
2058     The settings you use will depend on the index you create with swish. Here's a basic configuration:
2059    
2060     return {
2061     title => 'Search the Apache documentation',
2062     swish_binary => './swish-e',
2063     swish_index => 'index.swish-e',
2064     metanames => [qw/swishdefault swishdocpath swishtitle/],
2065     display_props => [qw/swishtitle swishlastmodified swishdocsize swishdocpath/],
2066     title_property => 'swishdocpath',
2067     prepend_path => 'http://myhost/apachedocs',
2068    
2069     name_labels => {
2070     swishdefault => 'Search All',
2071     swishtitle => 'Title',
2072     swishrank => 'Rank',
2073     swishlastmodified => 'Last Modified Date',
2074     swishdocpath => 'Document Path',
2075     swishdocsize => 'Document Size',
2076     },
2077    
2078     };
2079    
2080     The above configuration defines metanames to use on the form.
2081     Searches can be limited to these metanames.
2082    
2083     "display_props" tells the script to display the property "swishlastmodified" (the last modified
2084     date of the file), the document size, and path with the search results.
2085    
2086     The parameter "name_labels" is a hash (reference)
2087     that is used to give friendly names to the metanames.
2088    
2089     Here's another example. Say you want to search either (or both) the Apache 1.3 documentation or the
2090     Apache 2.0 documentation:
2091    
2092     return {
2093     title => 'Search the Apache Documentation',
2094     date_ranges => 0,
2095     swish_index => [ qw/ index.apache index.apache2 / ],
2096     select_indexes => {
2097     method => 'checkbox_group',
2098     labels => [ '1.3.23 docs', '2.0 docs' ], # Must match up one-to-one to swish_index
2099     description => 'Select: ',
2100     },
2101    
2102     };
2103    
2104     Now you can select either or both sets of documentation while searching.
2105    
2106    
2107     Please refer to the default configuration settings near the top of the script for details on
2108     the available settings.
2109    
2110     =head1 DEBUGGING
2111    
2112     Most problems with using this script have been a result of improper configuration. Please
2113     get the script working with default settings before adjusting the configuration settings.
2114    
2115     The key to debugging CGI scripts is to run them from the command line, not with a browser.
2116    
2117     First, make sure the program compiles correctly:
2118    
2119     > perl -c swish.cgi
2120     swish.cgi syntax OK
2121    
2122     Next, simply try running the program:
2123    
2124     > ./swish.cgi | head
2125     Content-Type: text/html; charset=ISO-8859-1
2126    
2127     <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2128     <html>
2129     <head>
2130     <title>
2131     Search our site
2132     </title>
2133     </head>
2134     <body>
2135    
2136     Now, you know that the program compiles and will run from the command line.
2137     Next, try accessing the script from a web browser.
2138    
2139     If you see the contents of the CGI script instead of its output then your web server is
2140     not configured to run the script. You will need to look at settings like ScriptAlias, SetHandler,
2141     and Options.
2142    
2143     If an error is reported (such as Internal Server Error or Forbidden)
2144     you need to locate your web server's error_log file
2145     and carefully read what the problem is. Contact your web administrator for help.
2146    
2147     If you don't have access to the web server's error_log file, you can modify the script to report
2148     errors to the browser screen. Open the script and search for "CGI::Carp". (Author's suggestion is
2149     to debug from the command line -- adding the browser and web server into the equation only complicates
2150     debugging.)
2151    
2152     The script does offer some basic debugging options that allow debugging from the command line.
2153     The debugging options are enabled by setting
2154     an environment variable "SWISH_DEBUG". How that is set depends on your operating system and the
2155     shell you are using. These examples are using the "bash" shell syntax.
2156    
2157     Note: You can also use the "debug_options" configuration setting, but the recommended method
2158     is to set the environment variable.
2159    
2160     You can list the available debugging options like this:
2161    
2162     >SWISH_DEBUG=help ./swish.cgi >outfile
2163     Unknown debug option 'help'. Must be one of:
2164     basic: Basic debugging
2165     command: Show command used to run swish
2166     headers: Show headers returned from swish
2167     output: Show output from swish
2168     summary: Show summary of results
2169     dump: Show all data available to templates
2170    
2171     As you work your way down the list you will get more detailed output. You can combine
2172     options like:
2173    
2174     >SWISH_DEBUG=command,headers,summary ./swish.cgi >outfile
2175    
2176     You will be asked for an input query and the max number of results to return. You can use the defaults
2177     in most cases. It's a good idea to redirect output to a file. Any error messages are sent to stderr, so
2178     those will still be displayed (unless you redirect stderr, too).
2179    
2180     Here are some examples:
2181    
2182     ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile
2183     Debug level set to: 1
2184     Enter a query [all]:
2185     Using 'not asdfghjklzxcv' to match all records
2186     Enter max results to display [1]:
2187    
2188     ------ Can't use DateRanges feature ------------
2189    
2190     Script will run, but you can't use the date range feature
2191     Can't locate Date/Calc.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at modules/DateRanges.pm line 107, <STDIN> line 2.
2192     BEGIN failed--compilation aborted at modules/DateRanges.pm line 107, <STDIN> line 2.
2193     Compilation failed in require at ./swish.cgi line 971, <STDIN> line 2.
2194    
2195     --------------
2196     Can't exec "./swish-e": No such file or directory at ./swish.cgi line 1245, <STDIN> line 2.
2197     Child process Failed to exec './swish-e' Error: No such file or directory at ./swish.cgi line 1246, <STDIN> line 2.
2198     Failed to find any results
2199    
2200     The above told me about two problems. First, it's telling me that the Date::Calc module is not installed.
2201     The Date::Calc module is needed to use the date limiting feature of the script.
2202    
2203     The second problem is a bit more serious. It's saying that the script can't find the swish-e binary file.
2204     I simply forgot to copy it.
2205    
2206     ~/swishtest >cp ~/swish-e/src/swish-e .
2207     ~/swishtest >cat .swishcgi.conf
2208     return {
2209     title => 'Search the Apache Documentation',
2210     date_ranges => 0,
2211     };
2212    
2213     Now, let's try again:
2214    
2215     ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile
2216     Debug level set to: 1
2217    
2218     ---------- Read config parameters from '.swishcgi.conf' ------
2219     $VAR1 = {
2220     'date_ranges' => 0,
2221     'title' => 'Search the Apache Documentation'
2222     };
2223     -------------------------
2224     Enter a query [all]:
2225     Using 'not asdfghjklzxcv' to match all records
2226     Enter max results to display [1]:
2227     Found 1 results
2228    
2229     Can't locate TemplateDefault.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at ./swish.cgi line 608.
2230    
2231     Bother. I fixed the first two problems, but now there's this new error. Oh, I somehow forgot to
2232     copy the modules directory. The obvious way to fix that is to copy the directory. But, there may
2233     be times where you want to put the module directory in another location. So, let's modify the
2234     F<.swishcgi.conf> file and add a "use lib" setting:
2235    
2236     ~/swishtest >cat .swishcgi.conf
2237     use lib '/home/bill/swish-e/example/modules';
2238    
2239     return {
2240     title => 'Search the Apache Documentation',
2241     date_ranges => 0,
2242     };
2243    
2244     ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile
2245     Debug level set to: 1
2246    
2247     ---------- Read config parameters from '.swishcgi.conf' ------
2248     $VAR1 = {
2249     'date_ranges' => 0,
2250     'title' => 'Search the Apache Documentation'
2251     };
2252     -------------------------
2253     Enter a query [all]:
2254     Using 'not asdfghjklzxcv' to match all records
2255     Enter max results to display [1]:
2256     Found 1 results
2257    
2258     Now we're talking.
2259    
2260     Here's a common problem. Everything checks out, but when you run the script you see the message:
2261    
2262     Swish returned unknown output
2263    
2264     Ok, let's find out what output it is returning:
2265    
2266     ~/swishtest >SWISH_DEBUG=headers,output ./swish.cgi >outfile
2267     Debug level set to: 13
2268    
2269     ---------- Read config parameters from '.swishcgi.conf' ------
2270     $VAR1 = {
2271     'swish_binary' => '/usr/local/bin/swish-e',
2272     'date_ranges' => 0,
2273     'title' => 'Search the Apache Documentation'
2274     };
2275     -------------------------
2276     Enter a query [all]:
2277     Using 'not asdfghjklzxcv' to match all records
2278     Enter max results to display [1]:
2279     usage: swish [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)]
2280     ...
2281     version: 2.0
2282     docs: http://sunsite.berkeley.edu/SWISH-E/
2283    
2284     *** 9872 Failed to run swish: 'Swish returned unknown output' ***
2285     Failed to find any results
2286    
2287     Oh, looks like /usr/local/bin/swish-e is version 2.0 of swish. We need 2.1-dev and above!
2288    
2289     =head1 Frequently Asked Questions
2290    
2291     Here are some common questions and answers.
2292    
2293     =head2 How do I change the way the output looks?
2294    
2295     The script uses a module to generate output. By default it uses the TemplateDefault.pm module.
2296     The module used can be selected in the configuration file.
2297    
2298     If you want to make simple changes you can edit the TemplateDefault.pm module directly. If you want to
2299     copy a module, you must also change the "package" statement at the top of the module. For example:
2300    
2301     cp TemplateDefault.pm MyTemplateDefault.pm
2302    
2303     Then at the top of the module adjust the "package" line to:
2304    
2305     package MyTemplateDefault;
2306    
2307     To use this module you need to adjust the configuration settings (either at the top of F<swish.cgi> or in
2308     a configuration file):
2309    
2310    
2311     template => {
2312     package => 'MyTemplateDefault',
2313     },
2314    
2315    
2316     =head2 How do I use a templating system with swish.cgi?
2317    
2318     In addition to the TemplateDefault.pm module, the swish-e distribution includes two other Perl modules for
2319     generating output using the templating systems HTML::Template and Template-Toolkit.
2320    
2321     Templating systems use template files to generate the HTML, and make maintaining the look of a large (or small) site
2322     much easier. HTML::Template and Template-Toolkit are separate packages and can be downloaded from the CPAN.
2323     See http://search.cpan.org.
2324    
2325     Two basic templates are provided as examples for generating output using these templating systems.
2326     The example templates are located in the F<example> directory.
2327     The module F<TemplateHTMLTemplate.pm> uses the file F<swish.tmpl> to generate its output, while the
2328     module F<TemplateToolkit.pm> uses the F<search.tt> file.
2329    
2330     To use either of these modules you will need to adjust the "template" configuration setting. Examples for
2331     both templating systems are provided in the configuration settings near the top of the F<swish.cgi> program.
2332    
2333     Using these modules is an advanced use of F<swish.cgi>; they are provided as examples only.
2334    
2335     All of the output generation modules are passed a hash with the results from the search, plus other data used to create the
2336     output page. You can see this hash by using the debugging option "dump" or by using the TemplateDumper.pm
2337     module:
2338    
2339     ~/swishtest >cat .swishcgi.conf
2340     return {
2341     title => 'Search the Apache Documentation',
2342     template => {
2343     package => 'TemplateDumper',
2344     },
2345     };
2346    
2347     And run a query. For example:
2348    
2349     http://localhost:8000/swishtest/swish.cgi?query=install
2350    
2351     =head2 Why are there three different highlighting modules?
2352    
2353     There are three highlighting modules included with the swish-e distribution.
2354     Each is a trade-off of speed vs. accuracy:
2355    
2356     DefaultHighlight.pm - reasonably fast, but does not highlight phrases
2357     PhraseHighlight.pm - reasonably slow, but is reasonably accurate
2358     SimpleHighlight.pm - fast, some phrases, but least accurate
2359    
2360     Eh, the default is actually "PhraseHighlight.pm". Oh well.
2361    
2362     Optimizations to these modules are welcome!
2363    
2364     =head2 My ISP doesn't provide access to the web server logs
2365    
2366     There are a number of options. One way is to use the CGI::Carp module. Search in the
2367     swish.cgi script for:
2368    
2369     use Carp;
2370     # Or use this instead -- PLEASE see perldoc CGI::Carp for details
2371     # use CGI::Carp qw(fatalsToBrowser warningsToBrowser);
2372    
2373     And change it to look like:
2374    
2375     #use Carp;
2376     # Or use this instead -- PLEASE see perldoc CGI::Carp for details
2377     use CGI::Carp qw(fatalsToBrowser warningsToBrowser);
2378    
2379     This should be only for debugging purposes, as if used in production you may end up sending
2380     quite ugly and confusing messages to your browsers.
2381    
2382     =head2 Why does the output show (NULL)?
2383    
2384     The most common reason is that you did not use StoreDescription in your config file while indexing.
2385    
2386     StoreDescription HTML <body> 200000
2387    
2388     That tells swish to store the first 200,000 characters of text extracted from the body of each document parsed
2389     by the HTML parser. The text is stored as property "swishdescription". Running:
2390    
2391     ~/swishtest > ./swish-e -T index_metanames
2392    
2393     will display the properties defined in your index file.
2394    
2395     This can happen with other properties, too.
2396     For example, this will happen when you are asking for a property to display that is not defined in swish.
2397    
2398     ~/swishtest > ./swish-e -w install -m 1 -p foo
2399     # SWISH format: 2.1-dev-25
2400     # Search words: install
2401     err: Unknown Display property name "foo"
2402     .
2403    
2404     ~/swishtest > ./swish-e -w install -m 1 -x 'Property foo=<foo>\n'
2405     # SWISH format: 2.1-dev-25
2406     # Search words: install
2407     # Number of hits: 14
2408     # Search time: 0.000 seconds
2409     # Run time: 0.038 seconds
2410     Property foo=(NULL)
2411     .
2412    
2413     To check that a property exists in your index you can run:
2414    
2415     ~/swishtest > ./swish-e -w not dkdk -T index_metanames | grep foo
2416     foo : id=10 type=70 META_PROP:STRING(case:ignore) *presorted*
2417    
2418     Ok, in this case we see that "foo" is really defined as a property. Now let's make sure F<swish.cgi>
2419     is asking for "foo" (sorry for the long lines):
2420    
2421     ~/swishtest > SWISH_DEBUG=command ./swish.cgi > /dev/null
2422     Debug level set to: 3
2423     Enter a query [all]:
2424     Using 'not asdfghjklzxcv' to match all records
2425     Enter max results to display [1]:
2426     ---- Running swish with the following command and parameters ----
2427     ./swish-e \
2428     -w \
2429     'swishdefault=(not asdfghjklzxcv)' \
2430     -b \
2431     1 \
2432     -m \
2433     1 \
2434     -f \
2435     index.swish-e \
2436     -s \
2437     swishrank \
2438     desc \
2439     swishlastmodified \
2440     desc \
2441     -x \
2442     '<swishreccount>\t<swishtitle>\t<swishdescription>\t<swishlastmodified>\t<swishdocsize>\t<swishdocpath>\t<fos>\t<swishrank>\t<swishdocpath>\n' \
2443     -H \
2444     9
2445    
2446     If you look carefully you will see that the -x parameter has "fos" instead of "foo", so there's our problem.
2447    
2448    
2449     =head1 MOD_PERL
2450    
2451     This script can be run under mod_perl (see http://perl.apache.org).
2452     This will improve the response time of the script compared to running under CGI.
2453    
2454     Configuration is simple. In your httpd.conf or your startup.pl file you need to
2455     load the script. For example, in httpd.conf you can use a perl section:
2456    
2457     <perl>
2458     use lib '/usr/local/apache/cgi-bin';
2459     use lib '/home/yourname/swish-e/example/modules';
2460     require "swish.cgi";
2461     </perl>
2462    
2463     Again, note that the paths used will depend on where you installed the script and the modules.
2464     When running under mod_perl the swish.cgi script becomes a perl module, and therefore the script
2465     does not need to be installed in the cgi-bin directory. (But, you can actually use the same script as
2466     both a CGI script and a mod_perl module at the same time, read from the same location.)
2467    
2468     The above loads the script into mod_perl. Then to configure the script to run add this to your httpd.conf
2469     configuration file:
2470    
2471     <location /search>
2472     allow from all
2473     SetHandler perl-script
2474     PerlHandler SwishSearch
2475     </location>
2476    
2477     Unlike CGI, mod_perl does not change the current directory to the location of the perl module, so
2478     your settings for the swish binary and the path to your index files must be absolute
2479     paths (or relative to the server root).
2480    
2481     Take a look at the C<handler()> routine in this script for ideas how to use PerlSetVar commands
2482     in httpd.conf to control the script.
2483    
2484     Please post to the swish-e discussion list if you have any questions about running this
2485     script under mod_perl.
2486    
2487    
2488     =head1 Spidering
2489    
2490     There are two ways to spider with swish-e. One uses the "http" input method that uses code that's
2491     part of swish. The other way is to use the new "prog" method along with a perl helper program called
2492     C<spider.pl>.
2493    
2494     Here's an example of a configuration file for spidering with the "http" input method.
2495     You can see that the configuration is not much different than the file system input method.
2496     (But, don't use the http input method -- use the -S prog method shown below.)
2497    
2498     # Define what to index
2499     IndexDir http://www.myserver.name/index.html
2500     IndexOnly .html .htm
2501    
2502     IndexContents HTML .html .htm
2503     DefaultContents HTML
2504     StoreDescription HTML <body> 200000
2505     MetaNames swishdocpath swishtitle
2506    
2507     # Define http method specific settings -- see swish-e documentation
2508     SpiderDirectory ../swish-e/src/
2509     Delay 0
2510    
2511     You index with the command:
2512    
2513     swish-e -S http -c spider.conf
2514    
2515     Note that this does take longer. For example, spidering the Apache documentation on
2516     a local web server with this method took over a minute, where indexing with the
2517     file system took less than two seconds. Using the "prog" method can speed this up.
2518    
2519     Here's an example configuration file for using the "prog" input method:
2520    
2521     # Define the location of the spider helper program
2522     IndexDir ../swish-e/prog-bin/spider.pl
2523    
2524     # Tell the spider what to index.
2525     SwishProgParameters default http://www.myserver.name/index.html
2526    
2527     IndexContents HTML .html .htm
2528     DefaultContents HTML
2529     StoreDescription HTML <body> 200000
2530     MetaNames swishdocpath swishtitle
2531    
2532     Then to index you use the command:
2533    
2534     swish-e -c prog.conf -S prog -v 0
2535    
2536     Spidering with this method took nine seconds.
2537    
2538    
2539     =head1 Stemmed Indexes
2540    
2541     Many people enable a feature of swish called word stemming to provide "fuzzy" search
2542     options to their users.
2543     The stemming code does not actually find the "stem" of a word, rather removes and/or replaces
2544     common endings on words.
2545     Stemming is far from perfect, and many words do not stem as you might expect. But, it can
2546     be a helpful tool for searching your site. You may wish to create both a stemmed and non-stemmed index, and
2547     provide a checkbox for selecting the index file.
2548    
2549     To enable a stemmed index you simply add to your configuration file:
2550    
2551     UseStemming yes
2552    
2553     If you want to use a stemmed index with this program and continue to highlight search terms you will need
2554     to install a perl module that will stem words. This section explains how to do this.
2555    
2556     The perl module is included with the swish-e distribution. It can be found in the examples directory (where
2557     you found this file) and called something like:
2558    
2559     SWISH-Stemmer-0.05.tar.gz
2560    
2561     The module should also be available on CPAN (http://search.cpan.org/).
2562    
2563     Here's an example session for installing the module. (There will be quite a bit of output
2564     when running make.)
2565    
2566    
2567     % gzip -dc SWISH-Stemmer-0.05.tar.gz |tar xof -
2568     % cd SWISH-Stemmer-0.05
2569     % perl Makefile.PL
2570     or
2571     % perl Makefile.PL PREFIX=$HOME/perl_lib
2572     % make
2573     % make test
2574    
2575     (perhaps su root at this point if you did not use a PREFIX)
2576     % make install
2577     % cd ..
2578    
2579     Use the B<PREFIX> if you do not have root access or you want to install the modules
2580     in a local library. If you do use a PREFIX setting, add a C<use lib> statement to the top of this
2581     swish.cgi program.
2582    
2583     For example:
2584    
2585     use lib qw(
2586     /home/bmoseley/perl_lib/lib/site_perl/5.6.0
2587     /home/bmoseley/perl_lib/lib/site_perl/5.6.0/i386-linux/
2588     );
2589    
2590     Once the stemmer module is installed, and you are using a stemmed index, the C<swish.cgi> script will automatically
2591     detect this and use the stemmer module.
2592    
2593     =head1 DISCLAIMER
2594    
2595     Please use this CGI script at your own risk.
2596    
2597     This script has been tested and used without problem, but you should still be aware that
2598     any code running on your server represents a risk. If you have any concerns please carefully
2599     review the code.
2600    
2601     See http://www.w3.org/Security/Faq/www-security-faq.html
2602    
2603     Security on Windows is questionable.
2604    
2605     =head1 SUPPORT
2606    
2607     The SWISH-E discussion list is the place to ask for any help regarding SWISH-E or this example
2608     script. See http://swish-e.org.
2609    
2610     Before posting please review:
2611    
2612     http://swish-e.org/2.2/docs/INSTALL.html#When_posting_please_provide_the_
2613    
2614     Please do not contact the author or any of the swish-e developers directly.
2615    
2616     =head1 LICENSE
2617    
2618     swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley search@hank.org
2619     Example CGI program for searching with SWISH-E
2620    
2621    
2622     This program is free software; you can redistribute it and/or
2623     modify it under the terms of the GNU General Public License
2624     as published by the Free Software Foundation; either version
2625     2 of the License, or (at your option) any later version.
2626    
2627     This program is distributed in the hope that it will be useful,
2628     but WITHOUT ANY WARRANTY; without even the implied warranty of
2629     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2630     GNU General Public License for more details.
2631    
2632    
2633     =head1 AUTHOR
2634    
2635     Bill Moseley -- search@hank.org
2636    
2637     =cut
2638    
2639    

  ViewVC Help
Powered by ViewVC 1.1.22