/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/example/swish.cgi
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/example/swish.cgi

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:30 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
Importing web-site building process.

1 #!/usr/local/bin/perl -w
2 package SwishSearch;
3 use strict;
4
5 use lib qw( modules ); ### This may need to be adjusted!
6 ### It should point to the location of the
7 ### associated script modules directory
8
9 my $DEFAULT_CONFIG_FILE = '.swishcgi.conf';
10
11 ###################################################################################
12 #
13 # If this text is displayed on your browser then your web server
14 # is not configured to run .cgi programs. Contact your web server administrator.
15 #
16 # To display documentation for this program type "perldoc swish.cgi"
17 #
18 # swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley swishscript@hank.org
19 # Example CGI program for searching with SWISH-E
20 #
21 # This example program will only run under an OS that supports fork().
22 # Ok, piped opens.
23 #
24 #
25 # This program is free software; you can redistribute it and/or
26 # modify it under the terms of the GNU General Public License
27 # as published by the Free Software Foundation; either version
28 # 2 of the License, or (at your option) any later version.
29 #
30 # This program is distributed in the hope that it will be useful,
31 # but WITHOUT ANY WARRANTY; without even the implied warranty of
32 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 # GNU General Public License for more details.
34 #
35 # The above lines must remain at the top of this program
36 #
37 # $Id: swish.cgi,v 1.33 2002/08/13 23:08:54 whmoseley Exp $
38 #
39 ####################################################################################
40
41 # This is written this way so the script can be used as a CGI script or a mod_perl
42 # module without any code changes.
43
44 # use CGI (); # might not be needed if using Apache::Request
45
46 #=================================================================================
47 # CGI entry point
48 #
49 #=================================================================================
50
51
52
53 # Run the script -- entry point if running as a CGI script
54
# CGI entry point.  When the script is not loaded under mod_perl, build the
# built-in defaults, overlay the on-disk config file (if any), and serve the
# request right away.  Under mod_perl nothing runs here; handler() is called
# by Apache instead.
if ( !$ENV{MOD_PERL} ) {
    process_request( merge_read_config( default_config() ) );
}
62
63
64
65
66 #==================================================================================
67 # This sets the default configuration parameters
68 #
69 # Any configuration read from disk is merged with these settings.
70 #
71 # Only a few settings are actually required. Some reasonable defaults are used
72 # for most. In fact, you can probably create a complete config as:
73 #
74 # return {
75 # swish_binary => '/usr/local/bin/swish-e',
76 # swish_index => '/usr/local/share/swish/index.swish-e',
77 # title_property => 'swishtitle', # Not required, but recommended
78 # };
79 #
80 # But, that doesn't really show all the options.
81 #
82 # You can modify the options below, or you can use a config file. The config file
83 # is .swishcgi.conf by default (read from the current directory) that must return
84 # a hash reference. For example, to create a config file that changes the default
85 # title and index file name, plus uses Template::Toolkit to generate output
86 # create a config file as:
87 #
88 # # Example config file -- returns a hash reference
89 # {
90 # title => 'Search Our Site',
91 # swish_index => 'index.web',
92 #
93 # template => {
94 # package => 'TemplateToolkit',
95 # file => 'search.tt',
96 # options => {
97 # INCLUDE_PATH => '/home/user/swish-e/example',
98 # },
99 # };
100 #
101 #
102 #-----------------------------------------------------------------------------------
103
104 sub default_config {
105
106
107
108 ##### Configuration Parameters #########
109
110 #---- This lists all the options, with many commented out ---
111 # By default, this config is used -- see the process_request() call below.
112
113 # You should adjust for your site, and how your swish index was created.
114
115 ##>>
116 ##>> Please don't post this entire section on the swish-e list if looking for help!
117 ##>>
118 ##>> Send a small example, without all the comments.
119
120 #======================================================================
121 # *** NOTES ****
122 # Items beginning with an "x" or "#" are commented out
123 # the "x" form simply renames (hides) that setting. It's used
124 # to make it easy to disable a multi-line configuration setting.
125 #
126 # If you do not understand a setting then best to leave the default.
127 #
128 # Please follow the documentation (perldoc swish.cgi) and set up
129 # a test using the defaults before making changes. It's much easier
130 # to modify a working example than to try to get a modified example to work...
131 #
132 # Again, this is a Perl hash structure. Commas are important.
133 #======================================================================
134
135 return {
136 title => 'Search our site', # Title of your choice. Displays on the search page
137 swish_binary => './swish-e', # Location of swish-e binary
138
139
140 # By default, this script tries to read a config file. You should probably
141 # comment this out if not used, to save a disk stat
142 config_file => $DEFAULT_CONFIG_FILE, # Default config file
143
144
145 # The location of your index file. Typically, this would not be in
146 # your web tree.
147 # If you have more than one index to search then specify an array
148 # reference. e.g. swish_index =>[ qw( index1 index2 index3 )],
149
150 swish_index => 'index.swish-e', # Location of your index file
151
152 # See "select_indexes" below for how to
153 # select more than one index.
154
155 page_size => 15, # Number of results per page - default 15
156
157
158 # Property name to use as the main link text to the indexed document.
159 # Typically, this will be 'swishtitle' if have indexed html documents,
160 # But you can specify any PropertyName defined in your document.
161 # By default, swish will return the pathname for documents that do not
162 # have a title.
163 # In other words, this is used for the text of the links of the search results.
164 # <a href="prepend_path/swishdocpath">title_property</a>
165
166 title_property => 'swishtitle',
167
168
169
170 # prepend this path to the filename (swishdocpath) returned by swish. This is used to
171 # make the href link back to the original document. Comment out to disable.
172
173 #prepend_path => 'http://localhost/mydocs',
174
175
176 # Swish has a configuration directive "StoreDescription" that will save part or
177 # all of a document's contents in the index file. This can then be displayed
178 # along with results. If you are indexing a lot of files this can use a lot of disk
179 # space, so test carefully before indexing your entire site.
180 # Building swish with zlib can greatly reduce the space used by StoreDescription
181 #
182 # This setting tells this script to display this description.
183 # Normally, this should be 'swishdescription', but you can specify another property name.
184 # There is no default.
185
186 description_prop => 'swishdescription',
187
188
189
190 # Property names listed here will be displayed in a table below each result
191 # You may wish to modify this list if you are using document properties (PropertyNames)
192 # in your swish-e index configuration
193 # There is no default.
194
195 display_props => [qw/swishlastmodified swishdocsize swishdocpath/],
196
197
198
199 # Results can be sorted by any of the properties listed here
200 # They will be displayed in a drop-down list
201 # Again, you may modify this list if you are using document properties of your own creation
202 # Swish uses the rank as the default sort
203
204 sorts => [qw/swishrank swishlastmodified swishtitle swishdocpath/],
205
206
207 # Secondary_sort is used to sort within a sort
208 # You may enter a property name followed by a direction (asc|desc)
209
210 secondary_sort => [qw/swishlastmodified desc/],
211
212
213
214
215
216 # You can limit by MetaNames here. Names listed here will be displayed in
217 # a line of radio buttons.
218 # The default is to not allow any metaname selection.
219 # To use this feature you must define MetaNames while indexing.
220
221 # The special "swishdefault" says to search any text that was not indexed
222 # as a specific metaname (e.g. typically the body of a HTML document and its title).
223
224 # To see how this might work, add to your config file:
225 # MetaNames swishtitle swishdocpath
226 # reindex and try:
227
228 metanames => [qw/swishdefault swishtitle swishdocpath /],
229
230 # Add "all" to metanames to test the meta_groups feature described below
231
232
233
234 # Another example: if you indexed an email archive
235 # that defined the metanames subject name email (as in the swish-e discussion archive)
236 # you might use:
237 #metanames => [qw/body subject name email/],
238
239
240 # Note that you can do a real "all" search if you use nested metanames in your source documents.
241 # Nesting metanames is most common with XML documents.
242
243 # You can also group metanames into "meta-metanames".
244 # Example: Say you defined metanames "author", "comment" and "keywords"
245 # You want to allow searching "author", "comment" and the document body ("swishdefault")
246 # But you would also like an "all" search that searches all metanames, including "keywords":
247 #
248 # metanames => [qw/swishdefault author comment all/],
249 #
250 # Now, the "all" metaname is not a real metaname. It must be expanded into its
251 # individual metanames
252 #
253 # "meta_groups" maps a fake metaname to a list of real metanames
254 #
255 # meta_groups => {
256 # all => [qw/swishdefault author comment keywords / ],
257 # },
258 #
259 # swish.cgi will then take a query like
260 #
261 # all=(query words)
262 #
263 # into the query
264 #
265 # swishdefault=(query words) OR author=(query words) OR comment=(query words) OR keywords=(query words)
266 #
267 # This is not ideal, but should work for most cases
268 # (might fail under windows since the query is passed through the shell).
269
270 # To enable this group add "all" to the list of metanames
271 meta_groups => {
272 all => [qw/swishdefault swishtitle swishdocpath/],
273 },
274
275
276
277 # "name_labels" is used to map MetaNames and PropertyNames to user-friendly names
278 # on the form.
279
280 name_labels => {
281 swishdefault => 'Title & Body',
282 swishtitle => 'Title',
283 swishrank => 'Rank',
284 swishlastmodified => 'Last Modified Date',
285 swishdocpath => 'Document Path',
286 swishdocsize => 'Document Size',
287 all => 'All', # group of metanames
288
289 subject => 'Message Subject', # other examples
290 name => "Poster's Name",
291 email => "Poster's Email",
292 sent => 'Message Date',
293 },
294
295
296 timeout => 10, # limit time used by swish when fetching results - DoS protection.
297
298 max_query_length => 100, # limit length of query string. Swish also has a limit (default is 40)
299 # You might want to set swish-e's limit higher, and use this to get a
300 # somewhat more friendly message.
301
302
303 # These settings will use some crude highlighting code to highlight search terms in the
304 # property specified above as the description_prop (normally, 'swishdescription').
305
306
307 max_chars => 500, # If "highlight" is not defined, then just truncate the description to this many *chars*.
308 # If you want to go by *words*, enable highlighting,
309 # and then comment-out show_words. It will be a little slower.
310
311
312 # This structure defines term highlighting, and what type of highlighting to use
313 # If you are using metanames in your searches and they map to properties that you
314 # will display, you may need to adjust the "meta_to_prop_map".
315
316 highlight => {
317
318 # Pick highlighting module -- you must make sure the module can be found
319
320 # Ok speed, but doesn't handle phrases.
321 #Deals with stemming, but not stopwords
322 #package => 'DefaultHighlight',
323
324 # Somewhat slow, but deals with phrases, stopwords, and stemming.
325 # Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
326 package => 'PhraseHighlight',
327
328 # Fast: phrases without regard to wordcharacter settings
329 # doesn't do context display, so must match in first X words,
330 # doesn't handle stemming or stopwords.
331 #package => 'SimpleHighlight',
332
333 show_words => 10, # Number of swish words words to show around highlighted word
334 max_words => 100, # If no words are found to highlighted then show this many words
335 occurrences => 6, # Limit number of occurrences of highlighted words
336 #highlight_on => '<b>', # HTML highlighting codes
337 #highlight_off => '</b>',
338 highlight_on => '<font style="background:#FFFF99">',
339 highlight_off => '</font>',
340
341 # This maps search metatags to display properties.
342 meta_to_prop_map => {
343 swishdefault => [ qw/swishtitle swishdescription/ ],
344 swishtitle => [ qw/swishtitle/ ],
345 swishdocpath => [ qw/swishdocpath/ ],
346 all => [ qw/swishtitle swishdescription swishdocpath/ ],
347 },
348 },
349
350
351
352 # If you specify more than one index file (as an array reference) you
353 # can set this to allow selection of which indexes to search.
354 # The default is to search all indexes specified if this is not used.
355 # When used, the first index is the default index.
356
357 # You need to specify your indexes as an array reference:
358 #swish_index => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ],
359
360 Xselect_indexes => {
361 #method => 'radio_group', # pick radio_group, popup_menu, or checkbox_group
362 method => 'checkbox_group',
363 #method => 'popup_menu',
364 columns => 3,
365 labels => [ 'Main Index', 'Other Index', qw/ two three four/ ], # Must match up one-to-one
366 description => 'Select Site: ',
367 },
368
369
370 # Similar to select_indexes, this adds a metaname search
371 # based on a metaname. You can use any metaname, and this will
372 # add an "AND" search to limit results to a subset of your records.
373 # i.e. it adds something like 'site=(foo or bar or baz)' if foo, bar, and baz were selected.
374
375 # Swish-e's ExtractPath would work well with this. For example, the apache docs:
376 # ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
377 # ExtractPathDefault site other
378
379
380 Xselect_by_meta => {
381 #method => 'radio_group', # pick: radio_group, popup_menu, or checkbox_group
382 method => 'checkbox_group',
383 #method => 'popup_menu',
384 columns => 3,
385 metaname => 'site', # Can't be a metaname used elsewhere!
386 values => [qw/misc mod vhosts other/],
387 labels => {
388 misc => 'General Apache docs',
389 mod => 'Apache Modules',
390 vhosts => 'Virutal hosts',
391 },
392 description => 'Limit search to these areas: ',
393 },
394
395
396
397
398 # The 'template' setting defines what generates the output
399 # The default is "TemplateDefault" which is reasonably ugly.
400 # Note that some of the above options may not be available
401 # for templating, as it's up to you to lay out the form
402 # and results in your template.
403
404
405 xtemplate => {
406 package => 'TemplateDefault',
407 },
408
409 xtemplate => {
410 package => 'TemplateDumper',
411 },
412
413 xtemplate => {
414 package => 'TemplateToolkit',
415 file => 'search.tt',
416 options => {
417 INCLUDE_PATH => '/home/user/swish-e/example',
418 #PRE_PROCESS => 'config',
419 },
420 },
421
422 xtemplate => {
423 package => 'TemplateHTMLTemplate',
424 options => {
425 filename => 'swish.tmpl',
426 die_on_bad_params => 0,
427 loop_context_vars => 1,
428 cache => 1,
429 },
430 },
431
432
433
434 # The "on_intranet" setting is just a flag that can be used to say you do
435 # not have an external internet connection. It's here because the default
436 # page generation includes links to images on swish-e.org and on www.w3.org.
437 # If this is set to one then those images will not be shown.
438 # (This only affects the default output module TemplateDefault)
439
440 on_intranet => 0,
441
442
443
444 # Here you can hard-code debugging options. They will help you find
445 # where you made your mistake ;)
446 # Using all at once will generate a lot of messages to STDERR
447 # Please see the documentation before using these.
448 # Typically, you will set these from the command line instead of in the configuration.
449
450 # debug_options => 'basic, command, headers, output, summary, dump',
451
452
453
454 # This defines the package object for reading CGI parameters
455 # Defaults to CGI. Might be useful with mod_perl.
456 # request_package => 'CGI',
457 # request_package => 'Apache::Request',
458
459
460
461 # Minor adjustment to page display. The page navigation normally looks like:
462 # Page: 1 5 6 7 8 9 24
463 # where the first page and last page are always displayed. These can be disabled by
464 # setting to true values ( 1 )
465
466 no_first_page_navigation => 0,
467 no_last_page_navigation => 0,
468
469
470
471
472 # Limit to date ranges
473
474
475
476 # This adds in the date_range limiting options
477 # You will need the DateRanges.pm module from the author to use that feature
478
479 # Normally, you will want to limit by the last modified date, so specify
480 # "swishlastmodified" as the property_name. If indexing a mail archive, and, for
481 # example, you store the date (a unix timestamp) as "date" then specify
482 # "date" as the property_name.
483
484 date_ranges => {
485 property_name => 'swishlastmodified', # property name to limit by
486
487 # what you specify here depends on the DateRanges.pm module.
488 time_periods => [
489 'All',
490 'Today',
491 'Yesterday',
492 #'Yesterday onward',
493 'This Week',
494 'Last Week',
495 'Last 90 Days',
496 'This Month',
497 'Last Month',
498 #'Past',
499 #'Future',
500 #'Next 30 Days',
501 ],
502
503 line_break => 0,
504 default => 'All',
505 date_range => 1,
506 },
507
508 };
509
510 }
511
512 #^^^^^^^^^^^^^^^^^^^^^^^^^ end of user config ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
513 #========================================================================================
514
515
516
517 #=================================================================================
518 # mod_perl entry point
519 #
520 # As an example, you might use a PerlSetVar to point to paths to different
521 # config files, and then cache the different configurations by path.
522 #
523 #=================================================================================
524
# Merged configurations, keyed by the config file path they were read from,
# so each file is read and merged only once per interpreter.
my %cached_configs;

# mod_perl content handler.
#
#   $r      - the Apache request object
#   returns - Apache::Constants::OK
#
# If the PerlSetVar "Swish_Conf_File" is set, the built-in defaults are merged
# with that file (the merged result is cached by path); otherwise the
# hard-coded defaults are used as they are.
sub handler {
    my ($r) = @_;

    my $config_path = $r->dir_config('Swish_Conf_File');
    my $config;

    if ($config_path) {

        # Load and merge on first use only; later requests reuse the cache.
        $cached_configs{$config_path} ||= do {
            my $fresh = default_config();
            $fresh->{config_file} = $config_path;
            merge_read_config($fresh);
        };

        $config = $cached_configs{$config_path};
    }
    else {
        $config = default_config();
    }

    process_request($config);

    return Apache::Constants::OK();
}
556
557
558 #============================================================================
559 # Read config settings from disk, and merge
560 # Note, all errors are ignored since by default this script looks for a
561 # config file.
562 #
563 #============================================================================
# Read the config file named by $config->{config_file} and merge it over the
# passed-in settings.
#
#   $config - hash ref of settings; {config_file} names the file to load
#   returns - $config itself when there is no file to read, otherwise a NEW
#             hash ref in which the file's top-level keys override $config's
#   dies    - when the file cannot be loaded or does not return a hash
#             reference (a missing *default* config file is silently ignored)
#
# Side effects: initialises the debug flag constants and sets {debug} on both
# the passed-in config and the hash read from disk.
sub merge_read_config {
    my $config = shift;

    set_default_debug_flags();

    set_debug($config);    # get from config or from %ENV

    my $file = $config->{config_file};
    return $config unless $file;

    # "do FILE" searches @INC for any relative name that does not start with
    # "./" or "../", and "." is no longer in @INC as of Perl 5.26.  The config
    # file is documented as being read from the current directory, so anchor
    # the name there when the file exists; otherwise keep the @INC search.
    require File::Spec;
    my $load_path = $file;
    if (   !File::Spec->file_name_is_absolute($file)
        && $file !~ m{^\.\.?/}
        && -e $file )
    {
        $load_path = "./$file";
    }

    my $return = do $load_path;    # load the config file

    # Capture both error sources right away; later calls may clobber them.
    my ( $eval_error, $os_error ) = ( $@, "$!" );

    unless ( ref $return eq 'HASH' ) {

        # First, let's check for file not found for the default config,
        # which we can ignore.
        if ( $file eq $DEFAULT_CONFIG_FILE && !-e $file ) {
            warn "Config file '$file': $!" if $config->{debug};
            return $config;
        }

        # $! is only meaningful when do() could not read the file; when the
        # file loaded but returned the wrong thing, say so explicitly.
        my $error = $eval_error
            || ( defined $return ? '' : $os_error )
            || 'did not return a hash reference';

        die "Config file '$file': $error";
    }

    if ( $config->{debug} || $return->{debug} ) {
        require Data::Dumper;
        print STDERR "\n---------- Read config parameters from '$file' ------\n",
            Data::Dumper::Dumper($return),
            "-------------------------\n";
    }

    set_debug($return);

    # Merge settings (shallow: file keys replace default keys wholesale)
    return { %$config, %$return };
}
606
607 #--------------------------------------------------------------------------------------------------
# Define the debug bit flags as package globals in SwishSearch.  Each flag is
# a distinct power of two so they can be OR-ed together into $conf->{debug}.
sub set_default_debug_flags {

    (
        $SwishSearch::DEBUG_BASIC,        #  1: basic debugging
        $SwishSearch::DEBUG_COMMAND,      #  2: show command used to run swish
        $SwishSearch::DEBUG_HEADERS,      #  4: swish output headers
        $SwishSearch::DEBUG_OUTPUT,       #  8: swish output besides headers
        $SwishSearch::DEBUG_SUMMARY,      # 16: summary of results parsed
        $SwishSearch::DEBUG_DUMP_DATA,    # 32: dump data sent to templating modules
    ) = map { 1 << $_ } 0 .. 5;

    # The value of the last flag is what this sub has always returned.
    return $SwishSearch::DEBUG_DUMP_DATA;
}
618
619
620
621
622 #---------------------------------------------------------------------------------------------------
# Work out the debug level for a config hash and store it in $conf->{debug}.
#
#   $conf - config hash ref; {debug_options} may hold a comma-separated list
#           of option names (basic, command, headers, output, summary, dump)
#
# The option list is taken from $ENV{SWISH_DEBUG} when set, otherwise from
# $conf->{debug_options}.  With neither set, {debug} becomes 0.  Otherwise
# {debug} starts at 1 and the bit flag of every listed option is OR-ed in.
# An unknown option name prints the valid names to STDERR and exits.
sub set_debug {
    my $conf = shift;

    # The environment wins over the config setting.  (Previously only the
    # environment was ever parsed, so debug_options had no effect.)
    my $requested = $ENV{SWISH_DEBUG} || $conf->{debug_options};

    unless ($requested) {
        $conf->{debug} = 0;
        return;
    }

    my %debug = (
        basic   => [ $SwishSearch::DEBUG_BASIC,     'Basic debugging' ],
        command => [ $SwishSearch::DEBUG_COMMAND,   'Show command used to run swish' ],
        headers => [ $SwishSearch::DEBUG_HEADERS,   'Show headers returned from swish' ],
        output  => [ $SwishSearch::DEBUG_OUTPUT,    'Show output from swish' ],
        summary => [ $SwishSearch::DEBUG_SUMMARY,   'Show summary of results' ],
        dump    => [ $SwishSearch::DEBUG_DUMP_DATA, 'Show all data available to templates' ],
    );

    $conf->{debug} = 1;

    # Trim the ends so " basic, dump " does not yield an option named " basic".
    for ($requested) {
        s/^\s+//;
        s/\s+$//;
    }

    for my $option ( split /\s*,\s*/, $requested ) {
        my $key = lc $option;

        if ( exists $debug{$key} ) {
            $conf->{debug} |= $debug{$key}->[0];
            next;
        }

        print STDERR "Unknown debug option '$option'. Must be one of:\n",
            join( "\n",
                map  { sprintf( ' %10s: %10s', $_, $debug{$_}->[1] ) }
                sort { $debug{$a}->[0] <=> $debug{$b}->[0] } keys %debug ),
            "\n\n";
        exit;
    }

    print STDERR "Debug level set to: $conf->{debug}\n";
}
657
658
659 #============================================================================
660 #
661 # This is the main entry point, where a config hash is passed in.
662 #
663 #============================================================================
664
# Main entry point: run one search request and render the result.
#
#   $conf - configuration hash ref (see default_config)
#
# Steps: load the request class (CGI unless {request_package} is set) and
# create a request object; in debug mode prompt on STDERR/STDIN for a query
# and a page size; run the query through SwishQuery; optionally report on
# STDERR; then load the template package (TemplateDefault unless {template}
# is set) and hand it the results.  If the template module cannot be loaded
# a generic HTML error page is printed and the process exits.
sub process_request {
    my ($conf) = @_;    # configuration parameters

    # Use CGI.pm by default
    my $request_class = $conf->{request_package} || 'CGI';
    ( my $request_file = "$request_class.pm" ) =~ s[::][/]g;
    require $request_file;

    my $request_object = $request_class->new;

    # Debug mode is interactive: ask for the query and the page size.
    if ( $conf->{debug} ) {
        print STDERR 'Enter a query [all]: ';
        my $typed = <STDIN>;
        $typed =~ tr/\r//d;
        chomp $typed;

        if ( !$typed ) {
            print STDERR "Using 'not asdfghjklzxcv' to match all records\n";
            $typed = 'not asdfghjklzxcv';
        }

        $request_object->param( 'query', $typed );

        print STDERR 'Enter max results to display [1]: ';
        my $limit = <STDIN>;
        chomp $limit;

        $conf->{page_size} = ( $limit && $limit =~ /^\d+$/ ) ? $limit : 1;
    }

    # create search object and run the query
    my $search = SwishQuery->new(
        config  => $conf,
        request => $request_object,
    );

    my $results = $search->run_query;    # currently the $search object itself

    if ( my $level = $conf->{debug} ) {
        my $wants_dump    = $level & $SwishSearch::DEBUG_DUMP_DATA;
        my $wants_summary = $level & $SwishSearch::DEBUG_SUMMARY;

        if ($wants_dump) {
            require Data::Dumper;
            print STDERR "\n------------- Results structure passed to template ------------\n",
                Data::Dumper::Dumper($results),
                "--------------------------\n";
        }
        elsif ($wants_summary) {
            print STDERR "\n------------- Results Summary ------------\n";

            if ( $results->{hits} ) {
                require Data::Dumper;
                print STDERR "Showing $results->{navigation}{showing} of $results->{navigation}{hits}\n",
                    Data::Dumper::Dumper( $results->{_results} );
            }
            else {
                print STDERR "** NO RESULTS **\n";
            }

            print STDERR "--------------------------\n";
        }
        else {
            my $outcome =
                $results->{hits}
                ? "Found $results->{hits} results\n"
                : "Failed to find any results\n" . $results->errstr . "\n";
            print STDERR $outcome, "\n";
        }
    }

    # Pick and load the output (template) module.
    my $template = $conf->{template} || { package => 'TemplateDefault' };
    my $package  = $template->{package};

    ( my $file = "$package.pm" ) =~ s[::][/]g;

    eval { require $file };

    if ($@) {
        warn "$0 $@\n";
        print <<EOF;
Content-Type: text/html

<html>
<head><title>Software Error</title></head>
<body><h2>Software Error</h2><p>Please check error log</p></body>
</html>
EOF

        exit;
    }

    $package->show_template( $template, $results );
}
755
756
757
758
759
760 #==================================================================================================
761 package SwishQuery;
762 #==================================================================================================
763
764 use Carp;
765 # Or use this instead -- PLEASE see perldoc CGI::Carp for details
766 # <opinion>CGI::Carp doesn't help that much</opinion>
767 #use CGI::Carp; # qw(fatalsToBrowser);
768
769
770 #--------------------------------------------------------------------------------
771 # new() doesn't do much, just create the object
772 #--------------------------------------------------------------------------------
# Constructor: validate the essentials and build the search object.
#
#   options: config  => configuration hash ref (needs swish_index, swish_binary)
#            request => request object used to read CGI parameters
#   returns: blessed hash with prog, config, q, hits (0) and MOD_PERL
#   croaks:  when swish_index or swish_binary is missing from the config
sub new {
    my ( $class, %options ) = @_;

    my $conf = $options{config};

    $conf->{swish_index}
        or croak "Failed to set the swish index files in config setting 'swish_index'";
    $conf->{swish_binary}
        or croak "Failed to specify 'swish_binary' in configuration";

    # initial state of the search object
    my %search = (
        prog     => $conf->{swish_binary},
        config   => $conf,
        q        => $options{request},
        hits     => 0,
        MOD_PERL => $ENV{MOD_PERL},
    );

    return bless \%search, $class;
}
793
794
# Accessor: the value stored under {hits} (initialised to 0 by new()).
sub hits {
    my ($self) = @_;
    return $self->{hits};
}
796
# Get, and optionally set, a single configuration setting.
#
#   $setting - name of the setting (required; croaks when false)
#   $value   - optional new value; only stored when it is a true value
#   returns  - the value the setting had BEFORE this call, or undef when the
#              setting did not exist
sub config {
    my ( $self, $setting, $value ) = @_;

    croak "Failed to pass 'config' a setting" unless $setting;

    # An explicit conditional replaces "my $cur = ... if exists ...": a my()
    # with a statement modifier has undefined behaviour (see perlsyn).
    my $cur = exists $self->{config}{$setting}
        ? $self->{config}{$setting}
        : undef;

    # A false value (0, '', undef) never overwrites the current setting.
    $self->{config}{$setting} = $value if $value;

    return $cur;
}
808
# Look up one header stored in {_headers}.
# Returns nothing when {_headers} is not a hash ref, otherwise the header's
# value, or '' when that value is missing or false.
sub header {
    my ( $self, $name ) = @_;

    my $headers = $self->{_headers};
    return unless ref $headers eq 'HASH';

    return $headers->{$name} || '';
}
815
816
817 # return a ref to an array
sub results {
    my ($self) = @_;

    # Whatever is stored under {_results} (documented as an array ref);
    # a false or missing value is reported as undef.
    my $rows = $self->{_results};
    return $rows ? $rows : undef;
}
822
# Look up one field of the {navigation} hash.
# Returns nothing when {navigation} is not a hash ref, '' when the field does
# not exist, and the stored value (even a false one) otherwise.
sub navigation {
    my ( $self, $field ) = @_;

    my $nav = $self->{navigation};
    return unless ref $nav eq 'HASH';

    return '' unless exists $nav->{$field};
    return $nav->{$field};
}
829
# Accessor: the request object stored under {q} by new().
sub CGI {
    my ($self) = @_;
    return $self->{q};
}
831
832
833
834
# Accumulate, or read back, the list stored under {swish_command}.
#
#   with arguments    - appends them to the list; returns push's result
#                       (the new list length)
#   without arguments - returns the accumulated list, or undef when nothing
#                       has been added yet
sub swish_command {
    my ( $self, @args ) = @_;

    if (@args) {
        return push @{ $self->{swish_command} }, @args;
    }

    my $stored = $self->{swish_command};
    return $stored ? @{$stored} : undef;
}
845
846
# Get, and optionally set, the error message stored under {_errstr}.
# A true $value replaces the stored message; the return value is the stored
# message, or '' when none is set.
sub errstr {
    my ( $self, $value ) = @_;

    if ($value) {
        $self->{_errstr} = $value;
    }

    my $message = $self->{_errstr};
    return $message ? $message : '';
}
855
856
857
858
859
860
861 #============================================
862 # This returns "$self" just in case we want to separate out into two objects later
863
864
# Run the search and fill in the fields the templates read.
#
#   returns - always $self.  On failure errstr() may hold a message and
#             hits() stays 0; on success {query_href}, {my_url}, {hits} and
#             {navigation} are set and set_page() has been called.
#
# Builds the swish command line (-w query, -b start, -m page size, index and
# sort options via the helper methods), then runs swish under an optional
# alarm() timeout taken from the 'timeout' config setting.
sub run_query {

    my $self = shift;

    my $q    = $self->{q};
    my $conf = $self->{config};

    # Sets the query string, and any -L limits.
    return $self unless $self->build_query;

    # Set the starting position (which is offset by one)
    my $start = $q->param('start') || 0;
    $start = 0 unless $start =~ /^\d+$/ && $start >= 0;

    $self->swish_command( '-b', $start + 1 );

    # Set the max hits
    my $page_size = $self->config('page_size') || 15;
    $self->swish_command( '-m', $page_size );

    return $self unless $self->set_index_file;

    # Set the sort option, if any
    return $self unless $self->set_sort_order;

    my $timeout   = $self->config('timeout') || 0;
    my $can_alarm = $^O !~ /Win32/i;           # no alarm() on Win32

    eval {
        local $SIG{ALRM} = sub { die "Timed out\n" };
        alarm $timeout if $timeout && $can_alarm;
        $self->run_swish;
        alarm 0 if $can_alarm;
        waitpid $self->{pid}, 0 if $self->{pid};    # for IPC::Open2
    };
    my $error = $@;

    # If run_swish died for a reason other than the timeout, the alarm is
    # still armed and the local handler is gone; left alone, the default
    # SIGALRM action would kill this process (e.g. a mod_perl child) later.
    alarm 0 if $can_alarm;

    if ($error) {
        warn "$0 $error";    # if $conf->{debug};
        $self->errstr("Service currently unavailable");
        return $self;
    }

    my $hits = $self->hits;
    return $self unless $hits;

    # Build href for repeated search via GET (forward, backward links).
    # param() is forced into scalar context: in an argument list it would
    # otherwise return every value of a repeated parameter.
    my @query_string =
        map  { "$_=" . $q->escape( scalar $q->param($_) ) }
        grep { $q->param($_) } qw/query metaname sort reverse/;

    # Multi-valued selections: selected indexes (si), select-by-meta (sbm)
    for my $p (qw/si sbm/) {
        my @settings = $q->param($p);
        next unless @settings;
        push @query_string, "$p=" . $q->escape($_) for @settings;
    }

    if ( $conf->{date_ranges} ) {
        my $dr = DateRanges::GetDateRangeArgs($q);
        push @query_string, $dr if $dr;
    }

    $self->{query_href} = $q->script_name . '?' . join '&amp;', @query_string;

    # Return the template fields

    $self->{my_url} = $q->script_name;

    $self->{hits} = $hits;

    $self->{navigation} = {
        showing     => $hits,
        from        => $start + 1,
        to          => $start + $hits,
        hits        => $self->header('number of hits') || 0,
        run_time    => $self->header('run time')       || 'unknown',
        search_time => $self->header('search time')    || 'unknown',
    };

    $self->set_page($page_size);

    return $self;

}
973
974
975 #============================================================
976 # Build a query string from swish
977 # Just builds the -w string
978 #------------------------------------------------------------
979
# Build the swish query (the -w argument) from the request parameters.
#
#   returns - 1 when a '-w' query was added via swish_command(); an empty
#             return when there is nothing to search for or the input was
#             rejected (errstr() then usually holds the reason)
#
# Reads the 'query', 'metaname', 'submit' and 'sbm' request parameters, and
# the max_query_length, metanames, meta_groups and select_by_meta settings.
# Sets {query_simple}, {metaname} and, for meta groups, {real_metaname}.
sub build_query {
    my $self = shift;

    my $q = $self->{q};

    # set up the query string to pass to swish.
    my $query = $q->param('query') || '';

    for ($query) {    # trim the query string
        s/\s+$//;
        s/^\s+//;
    }

    $self->{query_simple} = $query;    # without metaname
    $q->param( 'query', $query );      # clean up the query, if needed.

    # Read in the date limits, if any.  This can create a new query
    return unless $self->get_date_limits( \$query );

    unless ($query) {
        $self->errstr('Please enter a query string') if $q->param('submit');
        return;
    }

    # Enforce the limit only when one is configured.  An unset limit used to
    # compare as 0 and rejected every query, which broke the minimal
    # (swish_binary + swish_index) configuration.
    my $max_length = $self->{config}{max_query_length};
    if ( $max_length && length($query) > $max_length ) {
        $self->errstr('Please enter a shorter query');
        return;
    }

    # Adjust the query string for metaname search
    # *Everything* is a metaname search
    # Might also like to allow searching more than one metaname at the same time

    my $metaname = $q->param('metaname') || 'swishdefault';

    # make sure it's a valid metaname: 'swishdefault' plus the configured ones

    my $conf  = $self->{config};
    my @metas = ('swishdefault');
    push @metas, @{ $self->config('metanames') } if $self->config('metanames');
    my %meta_lookup = map { $_ => 1 } @metas;

    unless ( $meta_lookup{$metaname} ) {
        $self->errstr('Bad MetaName provided');
        return;
    }

    # prepend metaname to query

    if ( $conf->{meta_groups} && $conf->{meta_groups}{$metaname} ) {

        # A group is a fake metaname: expand it into an OR of the real ones.
        $query = join ' OR ', map { "$_=($query)" } @{ $conf->{meta_groups}{$metaname} };

        # This is used to create a fake entry in the parsed query so highlighting
        # can find the query words
        $self->{real_metaname} = $conf->{meta_groups}{$metaname}[0];
    }
    else {
        $query = $metaname . "=($query)";
    }

    # save the metaname so we know what field to highlight
    # Note that this might be a fake metaname
    $self->{metaname} = $metaname;

    ## Look for a "limit" metaname -- perhaps used with ExtractPath
    # Here we don't worry about user supplied data
    # NOTE(review): the 'sbm' values are request input and go into the query
    # unvalidated; consider checking them against select_by_meta's 'values'.

    my $limits = $self->config('select_by_meta');
    my @limits = $q->param('sbm');    # Select By Metaname

    # Note that this could be messed up by ending the query in a NOT or OR
    # Should look into doing:
    # $query = "( $query ) AND " . $limits->{metaname} . '=(' . join( ' OR ', @limits ) . ')';
    if ( @limits && ref $limits eq 'HASH' && $limits->{metaname} ) {
        $query .= ' and ' . $limits->{metaname} . '=(' . join( ' or ', @limits ) . ')';
    }

    $self->swish_command( '-w', $query );

    return 1;
}
1070
1071 #========================================================================
1072 # Get the index files from the form, or simply from the config settings
1073 #------------------------------------------------------------------------
1074
# set_index_file() -- choose the index file(s) passed to swish with "-f".
#
# When the "select_indexes" setting is true and "swish_index" is an array
# reference, the "si" CGI parameters are taken as positions in that array and
# only the selected index files are used.  Otherwise every configured index
# file is used.
#
# Returns 1 on success.  Returns nothing, after recording a message with
# $self->errstr(), when the user selected no source or only invalid ones.
sub set_index_file {
    my $self = shift;

    my $q = $self->CGI;

    my $configured = $self->config('swish_index');

    if ( $self->config('select_indexes') && ref($configured) eq 'ARRAY' ) {

        my @choices = $q->param('si');
        if ( !@choices ) {
            $self->errstr('Please select a source to search');
            return;
        }

        my @indexes = @$configured;

        # Keep only the choices that are valid positions in @indexes.
        my @selected_indexes = grep { /^\d+$/ && $_ >= 0 && $_ < @indexes } @choices;

        if ( !@selected_indexes ) {
            $self->errstr('Invalid source selected');

            # This used to "return $self", i.e. a true value, which made this
            # failure look like success to anything testing the return value.
            return;
        }

        $self->swish_command( '-f', @indexes[ @selected_indexes ] );
    }
    else {
        $self->swish_command( '-f', ref $configured ? @$configured : $configured );
    }

    return 1;
}
1109
1110 #================================================================================
1111 # Parse out the date limits from the form or from GET request
1112 #
1113 #---------------------------------------------------------------------------------
1114
# get_date_limits() -- turn the date range selection into a swish "-L" limit.
#
# Pass:
#   $query_ref - reference to the query string; it is overwritten with a
#                match-everything query when the user gave no query but did
#                select a date range with an upper bound.
#
# Does nothing (and returns 1) when the "date_ranges" setting is off.  If the
# DateRanges module cannot be loaded the setting is removed from the
# configuration and 1 is returned, so the search carries on without dates.
#
# Returns 1 on success, or nothing after calling $self->errstr() when
# DateRanges::DateRangeParse() rejects the selection.
sub get_date_limits {
    my $self      = shift;
    my $query_ref = shift;

    my $conf = $self->{config};

    # Nothing to do unless date ranges are enabled.
    return 1 if !$conf->{date_ranges};

    eval { require DateRanges };
    if ( $@ ) {
        if ( $conf->{debug} ) {
            print STDERR "\n------ Can't use DateRanges feature ------------\n",
                         "\nScript will run, but you can't use the date range feature\n",
                         $@,
                         "\n--------------\n";
        }

        # Switch the feature off for the remainder of the request.
        delete $conf->{date_ranges};
        return 1;
    }

    my %limits;

    if ( !DateRanges::DateRangeParse( $self->{q}, \%limits ) ) {
        $self->errstr( $limits{DateRanges_error} || 'Bad date range selection' );
        return;
    }

    # Keep the parsed bounds for later use.
    my ( $low, $high ) = @limits{ qw( DateRanges_time_low DateRanges_time_high ) };
    $self->{DateRanges_time_low}  = $low;
    $self->{DateRanges_time_high} = $high;

    # Allow searches just by date when this is not an "All dates" search.
    # $$$ should place some limits here, and provide a switch to disable
    if ( !$$query_ref && $high ) {
        $$query_ref = 'not skaisikdeekk';
        $self->{_search_all}++;    # flag
    }

    my $limit_prop = $conf->{date_ranges}{property_name} || 'swishlastmodified';

    $self->swish_command( '-L', $limit_prop, $low, $high ) if $low && $high;

    return 1;
}
1168
1169
1170
1171 #================================================================
1172 # Set the sort order
1173 # Just builds the -s string
1174 #----------------------------------------------------------------
1175
# set_sort_order() -- build the swish "-s" sort option.
#
# Does nothing (and returns 1) unless the "sorts" setting lists the properties
# that may be sorted on.  The "sort" CGI parameter picks the property
# (default "swishrank"); the "reverse" parameter flips the direction, which is
# descending for swishrank and ascending for everything else.
#
# The "secondary_sort" setting may be an array reference or a plain string.
# It is appended after the primary sort unless its first element is the
# primary sort property.
#
# Returns 1 on success, or nothing after calling $self->errstr() when the
# requested property is not listed in "sorts".
sub set_sort_order {
    my $self = shift;

    my $q = $self->{q};

    my $sorts_array = $self->config('sorts');
    return 1 unless $sorts_array;

    my $conf = $self->{config};

    # Only properties named in the configuration are accepted.
    my %sorts = map { $_, 1 } @$sorts_array;

    my $sortby = $q->param('sort') || 'swishrank';

    unless ( $sorts{$sortby} ) {
        $self->errstr( 'Invalid Sort Option Selected' );
        return;
    }

    my $reverse   = $q->param('reverse');
    my $direction = $sortby eq 'swishrank'
        ? ( $reverse ? 'asc'  : 'desc' )
        : ( $reverse ? 'desc' : 'asc'  );

    $self->swish_command( '-s', $sortby, $direction );

    # Normalise the secondary sort to a list before looking at its first
    # element.  The previous code indexed it as an array first, which dies
    # under "use strict" when the setting is a plain string.
    my $secondary = $conf->{secondary_sort};
    if ( $secondary ) {
        my @secondary = ref $secondary ? @$secondary : ( $secondary );

        if ( @secondary && $sortby ne $secondary[0] ) {
            $self->swish_command( @secondary );
        }
    }

    return 1;
}
1212
1213
1214
1215 #========================================================
1216 # Sets prev and next page links.
1217 # Feel free to clean this code up!
1218 #
1219 # Pass:
1220 # $Page_Size - number of results shown on each page
1221 #
1222 #
1223 # Returns:
1224 # Sets the prev, next and pages entries in the $self->{navigation} hash
1225 #
1226
# set_page() -- fill in the previous/next/page-list entries of
# $self->{navigation}.
#
# Pass:
#   $Page_Size - number of results shown on one page
#
# Reads $self->{navigation}{from} (1-based number of the first record shown),
# $self->{navigation}{hits} (total number of hits), $self->{hits} (records on
# the current page) and $self->{query_href} (base of every page link).
#
# Sets, when applicable, these keys of $self->{navigation}:
#   prev, prev_count - start offset and size of the previous page
#   next, next_count - start offset and size of the next page
#   pages            - HTML string of page numbers; every page except the
#                      current one is a link
sub set_page {

    my ( $self, $Page_Size ) = @_;

    my $navigation = $self->{navigation};

    my $start = $navigation->{from} - 1;    # Current starting record (0-based)

    # Previous page
    my $prev = $start - $Page_Size;
    $prev = 0 if $prev < 0;

    if ( $prev < $start ) {
        $navigation->{prev}       = $prev;
        $navigation->{prev_count} = $start - $prev;
    }

    # Next page
    my $last = $navigation->{hits} - 1;     # offset of the very last hit

    my $next = $start + $Page_Size;
    $next = $last if $next > $last;

    my $cur_end = $start + $self->{hits} - 1;
    if ( $next > $cur_end ) {
        $navigation->{next}       = $next;
        $navigation->{next_count} = $next + $Page_Size > $last
            ? $last - $next + 1
            : $Page_Size;
    }

    # Page list.  $pages is the 0-based number of the last page, so it is
    # zero (and no list is built) when everything fits on one page.
    my $pages = int( ( $navigation->{hits} - 1 ) / $Page_Size );
    if ( $pages ) {

        my @pages = 0 .. $pages;

        my $max_pages = 10;

        if ( @pages > $max_pages ) {

            # Show a window of $max_pages pages around the current one.
            my $first_shown = int( $start / $Page_Size - $max_pages / 2 );
            $first_shown = 0 if $first_shown < 0;

            # Clamp the window so that it ends exactly on the last page.
            # (This used to be "$pages - $max_pages", which stopped one page
            # short and dropped the last page from the list whenever
            # no_last_page_navigation was set.)
            if ( $first_shown + $max_pages - 1 > $pages ) {
                $first_shown = $pages - $max_pages + 1;
            }

            my $last_shown = $first_shown + $max_pages - 1;

            @pages = $first_shown .. $last_shown;

            # Optional shortcuts to the very first and very last page.
            unshift @pages, 0
                if $first_shown && !$self->{config}{no_first_page_navigation};
            push @pages, $pages
                unless $last_shown == $pages || $self->{config}{no_last_page_navigation};
        }

        $navigation->{pages} =
            join ' ', map {
                my $page_start = $_ * $Page_Size;
                my $page       = $_ + 1;
                $page_start == $start
                    ? $page
                    : qq[<a href="$self->{query_href}&amp;start=$page_start">$page</a>];
            } @pages;
    }

}
1295
1296 #==================================================
1297 # Format and return the date range options in HTML
1298 #
1299 #--------------------------------------------------
# get_date_ranges() -- return the HTML for the date range part of the form.
#
# Returns an empty string when the "date_ranges" setting is off.  Otherwise
# DateRanges::DateRangeForm() is asked to fill a hash of HTML fragments, and
# those that are present are strung together after a "Limit to:" label.
sub get_date_ranges {
    my $self = shift;

    my $conf = $self->{config};

    return '' unless $conf->{date_ranges};

    # DateRangeForm() gets the CGI object, the settings, and a hash in which
    # it stores the fragments it generates.
    my %fields;
    DateRanges::DateRangeForm( $self->{q}, $conf->{date_ranges}, \%fields );

    # Lay the pieces out, skipping whatever was not produced.
    my @parts = ( '<br>Limit to: ' );

    push @parts, "$fields{buttons}<br>"
        if $fields{buttons};
    push @parts, $fields{date_range_button}
        if $fields{date_range_button};
    push @parts, " $fields{date_range_low} through $fields{date_range_high}"
        if $fields{date_range_low};

    return join '', @parts;
}
1327
1328
1329
1330 #============================================
1331 # Run swish-e and gathers headers and results
1332 # Currently requires fork() to run.
1333 #
1334 # Pass:
1335 # nothing besides $self; the search parameters come from $self->swish_command
1336 #
1337 # Returns:
1338 # nothing useful; headers are stored in $self->{_headers}, results in
1339 # $self->{_results}, and swish errors are reported through $self->errstr.
1340 #
1341
1342
# run_swish() -- run the swish binary and collect its headers and results.
#
# Adds the "-x" (output format) and "-H" (header level) options to the
# command, starts swish through windows_fork() or real_fork() depending on
# $^O, and reads its output line by line:
#
#   "# Name: value"  header lines are stored, lower-cased, in
#                    $self->{_headers}; "removed stopword" values are also
#                    gathered, without repeats, under 'removed stopwords'
#   "err: message"   is passed to $self->errstr() and ends the call at once
#   lines starting   are result rows; they are split on tabs into a hash
#   with a digit     keyed by property name
#   "."              marks the end of the results
#
# Every result hash also gets saved_swishdocpath (an untouched copy of
# swishdocpath) and swishdocpath_href (the "prepend_path" setting followed by
# the path with white space replaced by %20).  When the "highlight" setting
# is present the highlighting module processes the row; otherwise the
# "description_prop" property is cut down to "max_chars" (default 500)
# characters.
#
# On completion $self->{hits} holds the number of rows read, $self->{_results}
# the rows (only set when there are any) and $self->{COMMAND} the command
# line as a string.  Dies if swish printed anything that is not recognised.
sub run_swish {

    my $self = shift;

    my $conf = $self->{config};

    # Gather the properties to ask swish for.  swishreccount goes first
    # because a result line is recognised by its leading digit.  Each property
    # is requested once only: %seen is keyed by property name.  (It used to be
    # keyed by the name of the configuration setting, which never matched a
    # property, so swishrank and swishdocpath were always added a second time.)
    my @properties = ( 'swishreccount' );
    my %seen       = ( swishreccount => 1 );

    for my $setting ( qw/ title_property description_prop display_props / ) {
        my $value = $conf->{$setting};
        next unless $value;

        for my $prop ( ref $value ? @$value : $value ) {
            push @properties, $prop unless $seen{$prop}++;
        }
    }

    # Add in the default props
    for my $prop ( qw/ swishrank swishdocpath / ) {
        push @properties, $prop unless $seen{$prop}++;
    }

    # The \t and \n are deliberately left as two-character sequences
    # (single quotes) and handed to swish in that form.
    $self->swish_command( '-x', join( '\t', map { "<$_>" } @properties ) . '\n' );

    $self->swish_command( '-H', 9 );

    my $fh = $^O =~ /Win32/i
        ? windows_fork( $conf, $self )
        : real_fork( $conf, $self );

    $self->{COMMAND} = join ' ', $self->{prog}, $self->swish_command;

    # read in from child

    my @results;

    my $trim_prop = $self->config('description_prop');

    my $highlight = $self->config('highlight');
    my $highlight_object;

    my %stops_removed;

    my $unknown_output = '';

    LINE:
    while ( defined( my $line = <$fh> ) ) {

        chomp $line;
        $line =~ tr/\r//d;

        # This will not work correctly with multiple indexes when different values are used.
        if ( $line =~ /^# ([^:]+):\s+(.+)$/ ) {

            my $h     = lc $1;
            my $value = $2;

            print STDERR "$line\n" if $conf->{debug} & $SwishSearch::DEBUG_HEADERS;

            $self->{_headers}{$h} = $value;

            push @{ $self->{_headers}{'removed stopwords'} }, $value
                if $h eq 'removed stopword' && !$stops_removed{$value}++;

            next LINE;
        }
        elsif ( $conf->{debug} & $SwishSearch::DEBUG_OUTPUT ) {
            print STDERR "$line\n";
        }

        # return swish errors as a message to the script
        if ( $line =~ /^err:\s*(.+)/ ) {
            $self->errstr($1);
            return;
        }

        # Or, if you want to log the errors and just say "Service Unavailable" use this:
        #die "$1\n" if $line =~ /^err:\s*(.+)/;

        # Found a result
        if ( $line =~ /^\d/ ) {

            my %h;
            @h{@properties} = split /\t/, $line;
            push @results, \%h;

            # There's a chance that the docpath could be modified by highlighting
            # when used in a "display_props".
            $h{saved_swishdocpath} = $h{swishdocpath};

            my $docpath = $h{swishdocpath};
            $docpath =~ s/\s/%20/g;    # Replace spaces
            $h{swishdocpath_href} = ( $self->config('prepend_path') || '' ) . $docpath;

            # Now do any formatting
            if ( $highlight ) {

                # The highlighting module is loaded when the first row arrives.
                if ( !$highlight_object ) {
                    my $package = $highlight->{package} || 'DefaultHighlight';

                    eval { require "$package.pm" };
                    if ( $@ ) {
                        $self->errstr( "Failed to load Highlighting Module - check error log" );
                        warn "$0: $@";
                        $highlight = '';    # do not try again for later rows
                        next LINE;
                    }

                    $highlight_object = $package->new( $self, $self->{metaname} );
                }

                # Highlight any fields, as needed
                $highlight_object->highlight( \%h );

                next LINE;
            }

            # Trim down the description if no highlight, or if highlighting some other property
            # Not very nice.  The highlighting code would limit by words
            if ( $trim_prop && $h{$trim_prop} ) {
                my $max = $conf->{max_chars} || 500;

                if ( length $h{$trim_prop} > $max ) {
                    $h{$trim_prop} = substr( $h{$trim_prop}, 0, $max ) . ' <b>...</b>';
                }
            }

            next LINE;
        }

        last LINE if $line =~ /^\.$/;    # end-of-results marker
        next LINE if $line =~ /^#/;      # some other comment line

        $unknown_output .= "'$line'\n";
    }

    die "Swish returned unknown output: $unknown_output\n" if $unknown_output;

    $self->{hits} = @results;
    $self->{_results} = \@results if @results;

}
1508
1509 #==================================================================
1510 # Run swish-e by forking
1511 #
1512
1513 use Symbol;
1514
# real_fork() -- start swish in a child process and return a handle on its
# standard output.
#
# Pass:
#   $conf - configuration hash (only its "debug" entry is used here)
#   $self - search object; supplies the program in $self->{prog} and the
#           arguments through $self->swish_command
#
# The child is created with a forking piped open and replaces itself with
# swish through exec().  If exec() fails the child warns, prints
# "Failed to exec Swish" on its standard output (which the parent reads)
# and exits.
#
# Returns the read handle in the parent.  Dies if the fork itself fails.
sub real_fork {
    my ( $conf, $self ) = @_;

    # Run swish
    my $fh  = gensym;
    my $pid = open( $fh, '-|' );

    defined $pid or die "Failed to fork: $!\n";

    # The parent only needs the handle; the rest runs in the child.
    return $fh if $pid;

    my @command = ( $self->{prog}, $self->swish_command );

    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {

        # Show the command, quoting whatever contains unusual characters.
        my @shown;
        for my $arg ( @command ) {
            push @shown, $arg =~ /[^\/.\-\w\d]/ ? qq['$arg'] : $arg;
        }

        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( " \\\n", @shown );
        print STDERR "\n-----------------------------------------------\n";
    }

    exec @command or do {
        warn "Child process Failed to exec '$self->{prog}' Error: $!";
        print "Failed to exec Swish";    # send this message to parent.
        exit;
    };
}
1544
1545
1546 #=====================================================================================
1547 # Windows work around
1548 # from perldoc perlfork -- na, that doesn't work. Try IPC::Open2
1549 #
# windows_fork() -- start swish with IPC::Open2 (used where a forking piped
# open is not available) and return a handle on its output.
#
# Pass:
#   $conf - configuration hash (only its "debug" entry is used here)
#   $self - search object; supplies the program in $self->{prog} and the
#           arguments through $self->swish_command
#
# Double quotes in the program name and in the arguments are backslash-escaped
# before the command is run.  The escaping is done on a private copy: the old
# code escaped through map aliases, which rewrote $self->{prog} itself, and
# did so a second time whenever the debug listing was printed.
#
# Stores the child's process id in $self->{pid} and returns the read handle.
sub windows_fork {
    my ( $conf, $self ) = @_;

    # Work on a copy so that $self->{prog} is left exactly as configured.
    my @command = ( $self->{prog}, $self->swish_command );
    for my $arg ( @command ) {
        $arg =~ s/"/\\"/g;
    }

    if ( $conf->{debug} & $SwishSearch::DEBUG_COMMAND ) {
        print STDERR "---- Running swish with the following command and parameters ----\n";
        print STDERR join( ' ', map { /[^.\-\w\d]/ ? qq["$_"] : $_ } @command );
        print STDERR "\n-----------------------------------------------\n";
    }

    require IPC::Open2;
    my ( $rdrfh, $wtrfh );

    my $pid = IPC::Open2::open2( $rdrfh, $wtrfh, @command );

    $self->{pid} = $pid;

    return $rdrfh;
}
1572
1573 #=====================================================================================
1574 # This method parses out the query from the "Parsed words" returned by swish
1575 # for use in highlighting routines
1576 # This returns a hash ref:
1577 # $query->{text} # everything is currently at level "text"
1578 # {$metaname} # the meta name
1579 # [ array of phrases ]
1580 # each phrase is made up of an array of words
1581
1582
1583
1584
1585
use constant DEBUG_QUERY_PARSED => 0;

# extract_query_match() -- break swish's "parsed words" header into the
# words and phrases used for highlighting.
#
# The header is read as a series of "field = words" sections.  Within a
# section the operators and, or, not and parentheses are dropped (unless they
# occur inside a quoted phrase), words between stand-alone '"' tokens form
# one phrase, and a single word is recorded only once per section.
#
# The result has the shape
#     { text => { field => [ [ word, ... ], ... ] } }
# with the phrases of every field sorted longest first.  If
# $self->{real_metaname} is set and has an entry, the same list is also
# reachable under $self->{metaname}.
#
# Stores the hash reference in $self->{query_match} and returns it.
sub extract_query_match {
    my $self = shift;

    my $parsed = $self->header('parsed words');    # grab query parsed by swish

    my %query_match;    # keywords broken down by layer and field.
    $self->{query_match} = \%query_match;

    # Walk through the "field = words" sections of the parsed query.
    while ( $parsed =~ /([a-z]+)\s+=\s+(.+?)(?=\s+[a-z]+\s+=|$)/g ) {

        my $field = $1;
        my $words = $2;

        my $in_phrase;      # true while between two '"' tokens
        my $phrase;         # array ref collecting the current word or phrase
        my %seen_single;    # single words already recorded for this section

        my $layer = 'text'; # This might be used in the future to highlight tags when matching a href.

        # Expand group searches -- not currently used
        my @fields = ( $field );

        WORD:
        for my $word ( split /\s+/, $words ) {

            # XXX This list of swish operators could change "and or not" and is dependent on stopwords.
            # remove control words and parens
            next WORD if !$in_phrase && $word =~ /^(and|or|not|\(|\))$/;

            # Outside a phrase every word starts a fresh buffer.
            $phrase = [] unless $in_phrase;

            if ( $word ne '"' ) {
                push @$phrase, $word;
            }
            elsif ( $in_phrase ) {
                $in_phrase = 0;     # closing quote: the phrase is complete
            }
            else {
                $in_phrase++;       # opening quote: start collecting
                next WORD;
            }

            next WORD if $in_phrase;

            # Only record single words once (this will probably break something)
            # Reason: to reduce the number of matches that must be checked
            next WORD if @$phrase == 1 && $seen_single{ $phrase->[0] }++;

            for my $name ( @fields ) {
                push @{ $query_match{$layer}{$name} }, $phrase;
            }
        }
    }

    # Here's a hack to make metaname expansion work
    # this will make an entry like all => [qw/ query words /]; for use with fake metanames
    my $real = $self->{real_metaname};
    if ( $real && $query_match{text}{$real} ) {
        $query_match{text}{ $self->{metaname} } = $query_match{text}{$real};
    }

    # Now, sort in descending order of phrase length
    for my $layer ( keys %query_match ) {
        print STDERR " LAYER: $layer\n" if DEBUG_QUERY_PARSED;

        my $by_tag = $query_match{$layer};

        for my $tag ( keys %$by_tag ) {

            my $phrases = $by_tag->{$tag};
            @$phrases = sort { @$b <=> @$a } @$phrases;

            if ( DEBUG_QUERY_PARSED ) {
                print STDERR " TAG: '$tag'\n";
                print STDERR " : '@$_'\n" foreach @$phrases;
            }
        }
    }

    # display parsed query instead of the title for debugging
    # use Data::Dumper;
    # $self->config('title',"<pre><font size=3>Query:\n$parsed\n" . Dumper(\%query_match) . '</font></pre>');

    return \%query_match;
}
1688
1689
1690 1;
1691
1692
1693 __END__
1694
1695 =head1 NAME
1696
1697 swish.cgi -- Example Perl script for searching with the SWISH-E search engine.
1698
1699 =head1 DESCRIPTION
1700
1701 C<swish.cgi> is a CGI script for searching with the SWISH-E search engine version 2.1-dev and above.
1702 It returns results a page at a time, with matching words from the source document highlighted, showing a
1703 few words of content on either side of the highlighted word.
1704
1705 The script is highly configurable; you can search multiple (or selectable) indexes, limit searches to
1706 part of the index, allow sorting by a number of different properties, limit results to a date range, and so on.
1707
1708 The standard configuration (i.e. not using a config file) should work with most swish index files.
1709 Customization of the parameters will be
1710 needed if you are indexing special meta data and want to search and/or display the meta data. The
1711 configuration can be modified by editing this script directly, or by using a configuration file (.swishcgi.conf
1712 by default).
1713
1714 You are strongly encouraged to get the default configuration working before making changes. Most problems
1715 using this script are the result of configuration modifications.
1716
1717 The script is modular in design. Both the highlighting code and output generation are handled by modules, which
1718 are included in the F<example/modules> directory. This allows for easy customization of the output without
1719 changing the main CGI script. A module exists to generate standard HTML output. There are also modules and
1720 template examples to use with the popular Perl templating systems HTML::Template and Template-Toolkit. This allows
1721 you to tightly integrate this script with the look of an existing template-driven web site.
1722 HTML::Template and Template-Toolkit are available from the CPAN (http://search.cpan.org).
1723
1724 This script can also run basically unmodified as a mod_perl handler, providing much better performance than
1725 running as a CGI script.
1726
1727 Please read the rest of the documentation. There's a C<DEBUGGING> section, and a C<FAQ> section.
1728
1729 This script should work on Windows, but security may be an issue.
1730
1731 =head1 REQUIREMENTS
1732
1733 You should be running a reasonably current version of Perl. 5.00503 or above is recommended (anything older
1734 will not be supported).
1735
1736 If you wish to use the date range feature you will need to install the Date::Calc module. This is available
1737 from http://search.cpan.org.
1738
1739
1740 =head1 INSTALLATION
1741
1742 Here's an example installation session. Please get a simple installation working before modifying the
1743 configuration file. Most problems reported for using this script have been due to improper configuration.
1744
1745 The script's default settings are setup for initial testing. By default the settings expect to find
1746 most files and the swish-e binary in the same directory as the script.
1747
1748 For I<security> reasons, once you have tested the script you will want to change settings to limit access
1749 to some of these files by the web server
1750 (either by moving them out of web space, or using access control such as F<.htaccess>).
1751 An example of using F<.htaccess> on Apache is given below.
1752
1753 It's expected that you have already unpacked the swish-e distribution
1754 and built the swish-e binary (if using a source distribution).
1755
1756 Below is a (unix) session where we create a directory, move required files into this directory, adjust
1757 permissions, index some documents, and symlink into the web server.
1758
1759 =over 4
1760
1761 =item 1 Move required files into their own directory.
1762
1763 This assumes that swish-e was unpacked and built in the ~/swish-e directory.
1764
1765 ~ >mkdir swishdir
1766 ~ >cd swishdir
1767 ~/swishdir >cp ~/swish-e/example/swish.cgi .
1768 ~/swishdir >cp -rp ~/swish-e/example/modules .
1769 ~/swishdir >cp ~/swish-e/src/swish-e .
1770 ~/swishdir >chmod 755 swish.cgi
1771 ~/swishdir >chmod 644 modules/*
1772
1773
1774 =item 2 Create an index
1775
1776 In this step you will create a simple configuration file. In this example the Apache documentation
1777 is indexed. Finally, we run a simple query to test swish.
1778
1779 ~/swishdir >cat swish.conf
1780 IndexDir /usr/local/apache/htdocs
1781 IndexOnly .html .htm
1782 DefaultContents HTML
1783 StoreDescription HTML <body> 200000
1784 MetaNames swishdocpath swishtitle
1785
1786 ~/swishdir >./swish-e -c swish.conf
1787 Indexing Data Source: "File-System"
1788 Indexing "/usr/local/apache/htdocs"
1789 Removing very common words...
1790 no words removed.
1791 Writing main index...
1792 Sorting words ...
1793 Sorting 7005 words alphabetically
1794 Writing header ...
1795 Writing index entries ...
1796 Writing word text: Complete
1797 Writing word hash: Complete
1798 Writing word data: Complete
1799 7005 unique words indexed.
1800 5 properties sorted.
1801 124 files indexed. 1485844 total bytes. 171704 total words.
1802 Elapsed time: 00:00:02 CPU time: 00:00:02
1803 Indexing done!
1804
1805 Now, verify that the index can be searched:
1806
1807 ~/swishdir >./swish-e -w install -m 1
1808 # SWISH format: 2.1-dev-25
1809 # Search words: install
1810 # Number of hits: 14
1811 # Search time: 0.001 seconds
1812 # Run time: 0.040 seconds
1813 1000 /usr/local/apache/htdocs/manual/dso.html "Apache 1.3 Dynamic Shared Object (DSO) support" 17341
1814 .
1815
1816 Let's see what files we have in our directory now:
1817
1818 ~/swishdir >ls -1 -F
1819 index.swish-e
1820 index.swish-e.prop
1821 modules/
1822 swish-e*
1823 swish.cgi*
1824 swish.conf
1825
1826 =item 3 Test the CGI script
1827
1828 This is a simple step, but often overlooked. You should test from the command line instead of jumping
1829 ahead and testing with the web server. See the C<DEBUGGING> section below for more information.
1830
1831 ~/swishdir >./swish.cgi | head
1832 Content-Type: text/html; charset=ISO-8859-1
1833
1834 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1835 <html>
1836 <head>
1837 <title>
1838 Search our site
1839 </title>
1840 </head>
1841 <body>
1842
1843 The above shows that the script can be run directly, and generates a correct HTTP header and HTML.
1844
1845 If you run the above and see something like this:
1846
1847 ~/swishdir >./swish.cgi
1848 bash: ./swish.cgi: No such file or directory
1849
1850 then you probably need to edit the script to point to the correct location of your perl program.
1851 Here's one way to find out where perl is located (again, on unix):
1852
1853 ~/swishdir >which perl
1854 /usr/local/bin/perl
1855
1856 ~/swishdir >/usr/local/bin/perl -v
1857 This is perl, v5.6.0 built for i586-linux
1858 ...
1859
1860 Good! We are using a reasonably current version of perl. You should be running
1861 at least perl 5.005 (5.00503 really). You may have problems otherwise.
1862
1863 Now that we know perl is at F</usr/local/bin/perl> we can adjust the "shebang" line
1864 in the perl script (e.g. the first line of the script):
1865
1866 ~/swishdir >pico swish.cgi
1867 (edit the #! line)
1868 ~/swishdir >head -1 swish.cgi
1869 #!/usr/local/bin/perl -w
1870
1871 =item 4 Test with your web server
1872
1873 How you do this is completely dependent on your web server, and you may need to talk to your web
1874 server admin to get this working. Often files with the .cgi extension are automatically set up to
1875 run as CGI scripts, but not always. In other words, this step is really up to you to figure out!
1876
1877 First, I create a symlink in Apache's document root to point to my test directory "swishdir". This will work
1878 because I know my Apache server is configured to follow symbolic links.
1879
1880 ~/swishdir >su -c 'ln -s /home/bill/swishdir /usr/local/apache/htdocs/swishdir'
1881 Password: *********
1882
1883 If your account is on an ISP and your web directory is F<~/public_html> then you might just move the entire
1884 directory:
1885
1886 mv ~/swishdir ~/public_html
1887
1888 Now, let's make a real HTTP request. I happen to have Apache setup on a local port:
1889
1890 ~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head -3
1891 #!/usr/local/bin/perl -w
1892 package SwishSearch;
1893 use strict;
1894
1895 Oh, darn. It looks like Apache is not running the script and instead returning it as a
1896 static page. I need to tell Apache that swish.cgi is a CGI script.
1897
1898 In my case F<.htaccess> comes to the rescue:
1899
1900 ~/swishdir >cat .htaccess
1901
1902 # Deny everything by default
1903 Deny From All
1904
1905 # But allow just CGI script
1906 <files swish.cgi>
1907 Options ExecCGI
1908 Allow From All
1909 SetHandler cgi-script
1910 </files>
1911
1912 Let's try the request one more time:
1913
1914 ~/swishdir >GET http://localhost:8000/swishdir/swish.cgi | head
1915 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1916 <html>
1917 <head>
1918 <title>
1919 Search our site
1920 </title>
1921 </head>
1922 <body>
1923 <h2>
1924 <a href="http://swish-e.org">
1925
1926 That looks better! Now use your web browser to test.
1927
1928 Make sure you look at your web server's error log file while testing the script.
1929
1930 BTW - "GET" is a program included with Perl's LWP library. If you do not have this you might
1931 try something like:
1932
1933 wget -O - http://localhost:8000/swishdir/swish.cgi | head
1934
1935 and if nothing else, you can always telnet to the web server and make a basic request.
1936
1937 ~/swishtest > telnet localhost 8000
1938 Trying 127.0.0.1...
1939 Connected to localhost.
1940 Escape character is '^]'.
1941 GET /swishtest/swish.cgi http/1.0
1942
1943 HTTP/1.1 200 OK
1944 Date: Wed, 13 Feb 2002 20:14:31 GMT
1945 Server: Apache/1.3.20 (Unix) mod_perl/1.25_01
1946 Connection: close
1947 Content-Type: text/html; charset=ISO-8859-1
1948
1949 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1950 <html>
1951 <head>
1952 <title>
1953 Search our site
1954 </title>
1955 </head>
1956 <body>
1957
1958 This may seem like a lot of work compared to using a browser, but browsers
1959 are a poor tool for basic CGI debugging.
1960
1961
1962 =back
1963
1964 If you have problems check the C<DEBUGGING> section below.
1965
1966 =head1 CONFIGURATION
1967
1968 If you want to change the location of the swish-e binary or the index file, use multiple indexes, add additional metanames and properties,
1969 change the default highlighting behavior, etc., you will need to adjust the script's configuration settings.
1970
1971 Please get a test setup working with the default parameters before making changes to any configuration settings.
1972 Better to debug one thing at a time...
1973
1974 In general, you will need to adjust the script's settings to match the index file you are searching. For example,
1975 if you are indexing a hypermail list archive you may want to make the script
1976 use metanames/properties of Subject, Author, and Email address. Or you may wish to provide a way to limit
1977 searches to parts of your index file (e.g. parts of your directory tree).
1978
1979 To make things somewhat "simple", the configuration parameters are included near the top of the swish.cgi program.
1980 That is the only place that the individual parameters are defined and explained, so you will need to open up
1981 the swish.cgi script in an editor to view the options. Further questions about individual settings should
1982 be referred to the swish-e discussion list.
1983
1984 The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should
1985 get you going. The perl hash structure may seem a bit confusing, but it makes it easy to create nested and complex
1986 parameters. Syntax is important, so cut-n-paste should be your best defense if you are not a perl programmer.
1987
1988 By the way, Perl has a number of quote operators. For example, to quote a string you might write:
1989
1990 title => 'Search My Site',
1991
1992 Some options take more than one parameter, where each parameter must be quoted. For example:
1993
1994 metanames => [ 'swishdefault', 'swishtitle', 'swishdocpath' ],
1995
1996 Which assigns an array ( [...] ) of three strings to the "metanames" variable.
1997 Lists of quoted strings are so common in perl that there's a special operator called "qw" (quote word):
1998
1999 metanames => [ qw/ swishdefault swishtitle swishdocpath / ],
2000
2001 or to use the parenthesis as the quote character (you can pick any):
2002
2003 metanames => [ qw( swishdefault swishtitle swishdocpath ) ],
2004
2005
2006 You have two options for changing the configuration settings from their default values:
2007 you may edit the script directly, or you may use a configuration file. In either case, the configuration
2008 settings are a basic perl hash reference.
2009
2010 Using a configuration file is described below, but contains the same hash structure.
2011
2012 There are many configuration settings, and some of them are commented out either by using
2013 a "#" symbol, or by simply renaming the configuration directive (e.g. by adding an "x" to the parameter
2014 name).
2015
2016 A very basic configuration setup might look like:
2017
2018 return {
2019 title => 'Search the Swish-e list', # Title of your choice.
2020 swish_binary => './swish-e', # Location of swish-e binary
2021 swish_index => 'index.swish-e', # Location of your index file
2022 };
2023
2024 Or if searching more than one index:
2025
2026 return {
2027 title => 'Search the Swish-e list',
2028 swish_binary => './swish-e',
2029 swish_index => ['index.swish-e', 'index2'],
2030 };
2031
2032 Both of these examples return a reference to a perl hash ( C<return {...}> ). In the second example,
2033 the multiple index files are set as an array reference.
2034
2035 Note that in the example above the swish-e binary file is relative to the current directory.
2036 If running under mod_perl you will typically need to use absolute paths.
2037
2038 B<Using A Configuration File>
2039
2040 As mentioned above, you can either edit the F<swish.cgi> script directly and modify the configuration settings, or
2041 use an external configuration file. The settings in the configuration file are merged with (override)
2042 the settings defined in the script.
2043
2044 The advantage of using a configuration script is that you are not editing the swish.cgi script directly, and
2045 downloading a new version won't mean re-editing the cgi script. Also, if running under mod_perl you can use the same
2046 script loaded into Apache to manage many different search pages.
2047
2048 By default, the script will attempt to read from the file F<.swishcgi.conf>.
2049 For example, you might only wish to change the title used
2050 in the script. Simply create a file called F<.swishcgi.conf> in the same directory as the CGI script:
2051
2052 > cat .swishcgi.conf
2053 # Example swish.cgi configuration script.
2054 return {
2055 title => 'Search Our Mailing List Archive',
2056 };
2057
2058 The settings you use will depend on the index you create with swish. Here's a basic configuration:
2059
2060 return {
2061 title => 'Search the Apache documentation',
2062 swish_binary => './swish-e',
2063 swish_index => 'index.swish-e',
2064 metanames => [qw/swishdefault swishdocpath swishtitle/],
2065 display_props => [qw/swishtitle swishlastmodified swishdocsize swishdocpath/],
2066 title_property => 'swishdocpath',
2067 prepend_path => 'http://myhost/apachedocs',
2068
2069 name_labels => {
2070 swishdefault => 'Search All',
2071 swishtitle => 'Title',
2072 swishrank => 'Rank',
2073 swishlastmodified => 'Last Modified Date',
2074 swishdocpath => 'Document Path',
2075 swishdocsize => 'Document Size',
2076 },
2077
2078 };
2079
2080 The above configuration defines metanames to use on the form.
2081 Searches can be limited to these metanames.
2082
2083 "display_props" tells the script to display the property "swishlastmodified" (the last modified
2084 date of the file), the document size, and path with the search results.
2085
2086 The parameter "name_labels" is a hash (reference)
2087 that is used to give friendly names to the metanames.
2088
2089 Here's another example. Say you want to search either (or both) the Apache 1.3 documentation or the
2090 Apache 2.0 documentation:
2091
2092 return {
2093 title => 'Search the Apache Documentation',
2094 date_ranges => 0,
2095 swish_index => [ qw/ index.apache index.apache2 / ],
2096 select_indexes => {
2097 method => 'checkbox_group',
2098 labels => [ '1.3.23 docs', '2.0 docs' ], # Must match up one-to-one to swish_index
2099 description => 'Select: ',
2100 },
2101
2102 };
2103
2104 Now you can select either or both sets of documentation while searching.
2105
2106
2107 Please refer to the default configuration settings near the top of the script for details on
2108 the available settings.
2109
2110 =head1 DEBUGGING
2111
2112 Most problems with using this script have been a result of improper configuration. Please
2113 get the script working with default settings before adjusting the configuration settings.
2114
2115 The key to debugging CGI scripts is to run them from the command line, not with a browser.
2116
2117 First, make sure the program compiles correctly:
2118
2119 > perl -c swish.cgi
2120 swish.cgi syntax OK
2121
2122 Next, simply try running the program:
2123
2124 > ./swish.cgi | head
2125 Content-Type: text/html; charset=ISO-8859-1
2126
2127 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2128 <html>
2129 <head>
2130 <title>
2131 Search our site
2132 </title>
2133 </head>
2134 <body>
2135
2136 Now, you know that the program compiles and will run from the command line.
2137 Next, try accessing the script from a web browser.
2138
2139 If you see the contents of the CGI script instead of its output then your web server is
2140 not configured to run the script. You will need to look at settings like ScriptAlias, SetHandler,
2141 and Options.
2142
2143 If an error is reported (such as Internal Server Error or Forbidden)
2144 you need to locate your web server's error_log file
2145 and carefully read what the problem is. Contact your web administrator for help.
2146
2147 If you don't have access to the web server's error_log file, you can modify the script to report
2148 errors to the browser screen. Open the script and search for "CGI::Carp". (Author's suggestion is
2149 to debug from the command line -- adding the browser and web server into the equation only complicates
2150 debugging.)
2151
2152 The script does offer some basic debugging options that allow debugging from the command line.
2153 The debugging options are enabled by setting
2154 an environment variable "SWISH_DEBUG". How that is set depends on your operating system and the
2155 shell you are using. These examples are using the "bash" shell syntax.
2156
2157 Note: You can also use the "debug_options" configuration setting, but the recommended method
2158 is to set the environment variable.
2159
2160 You can list the available debugging options like this:
2161
2162 >SWISH_DEBUG=help ./swish.cgi >outfile
2163 Unknown debug option 'help'. Must be one of:
2164 basic: Basic debugging
2165 command: Show command used to run swish
2166 headers: Show headers returned from swish
2167 output: Show output from swish
2168 summary: Show summary of results
2169 dump: Show all data available to templates
2170
2171 As you work your way down the list you will get more detailed output. You can combine
2172 options like:
2173
2174 >SWISH_DEBUG=command,headers,summary ./swish.cgi >outfile
2175
2176 You will be asked for an input query and the max number of results to return. You can use the defaults
2177 in most cases. It's a good idea to redirect output to a file. Any error messages are sent to stderr, so
2178 those will still be displayed (unless you redirect stderr, too).
2179
2180 Here are some examples:
2181
2182 ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile
2183 Debug level set to: 1
2184 Enter a query [all]:
2185 Using 'not asdfghjklzxcv' to match all records
2186 Enter max results to display [1]:
2187
2188 ------ Can't use DateRanges feature ------------
2189
2190 Script will run, but you can't use the date range feature
2191 Can't locate Date/Calc.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at modules/DateRanges.pm line 107, <STDIN> line 2.
2192 BEGIN failed--compilation aborted at modules/DateRanges.pm line 107, <STDIN> line 2.
2193 Compilation failed in require at ./swish.cgi line 971, <STDIN> line 2.
2194
2195 --------------
2196 Can't exec "./swish-e": No such file or directory at ./swish.cgi line 1245, <STDIN> line 2.
2197 Child process Failed to exec './swish-e' Error: No such file or directory at ./swish.cgi line 1246, <STDIN> line 2.
2198 Failed to find any results
2199
2200 The above told me about two problems. First, it's telling me that the Date::Calc module is not installed.
2201 The Date::Calc module is needed to use the date limiting feature of the script.
2202
2203 The second problem is a bit more serious. It's saying that the script can't find the swish-e binary file.
2204 I simply forgot to copy it.
2205
2206 ~/swishtest >cp ~/swish-e/src/swish-e .
2207 ~/swishtest >cat .swishcgi.conf
2208 return {
2209 title => 'Search the Apache Documentation',
2210 date_ranges => 0,
2211 };
2212
2213 Now, let's try again:
2214
2215 ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile
2216 Debug level set to: 1
2217
2218 ---------- Read config parameters from '.swishcgi.conf' ------
2219 $VAR1 = {
2220 'date_ranges' => 0,
2221 'title' => 'Search the Apache Documentation'
2222 };
2223 -------------------------
2224 Enter a query [all]:
2225 Using 'not asdfghjklzxcv' to match all records
2226 Enter max results to display [1]:
2227 Found 1 results
2228
2229 Can't locate TemplateDefault.pm in @INC (@INC contains: modules /usr/local/lib/perl5/5.6.0/i586-linux /usr/local/lib/perl5/5.6.0 /usr/local/lib/perl5/site_perl/5.6.0/i586-linux /usr/local/lib/perl5/site_perl/5.6.0 /usr/local/lib/perl5/site_perl/5.005/i586-linux /usr/local/lib/perl5/site_perl/5.005 /usr/local/lib/perl5/site_perl .) at ./swish.cgi line 608.
2230
2231 Bother. I fixed the first two problems, but now there's this new error. Oh, I somehow forgot to
2232 copy the modules directory. The obvious way to fix that is to copy the directory. But, there may
2233 be times where you want to put the module directory in another location. So, let's modify the
2234 F<.swishcgi.conf> file and add a "use lib" setting:
2235
2236 ~/swishtest >cat .swishcgi.conf
2237 use lib '/home/bill/swish-e/example/modules';
2238
2239 return {
2240 title => 'Search the Apache Documentation',
2241 date_ranges => 0,
2242 };
2243
2244 ~/swishtest >SWISH_DEBUG=basic ./swish.cgi >outfile
2245 Debug level set to: 1
2246
2247 ---------- Read config parameters from '.swishcgi.conf' ------
2248 $VAR1 = {
2249 'date_ranges' => 0,
2250 'title' => 'Search the Apache Documentation'
2251 };
2252 -------------------------
2253 Enter a query [all]:
2254 Using 'not asdfghjklzxcv' to match all records
2255 Enter max results to display [1]:
2256 Found 1 results
2257
2258 Now we're talking.
2259
2260 Here's a common problem. Everything checks out, but when you run the script you see the message:
2261
2262 Swish returned unknown output
2263
2264 Ok, let's find out what output it is returning:
2265
2266 ~/swishtest >SWISH_DEBUG=headers,output ./swish.cgi >outfile
2267 Debug level set to: 13
2268
2269 ---------- Read config parameters from '.swishcgi.conf' ------
2270 $VAR1 = {
2271 'swish_binary' => '/usr/local/bin/swish-e',
2272 'date_ranges' => 0,
2273 'title' => 'Search the Apache Documentation'
2274 };
2275 -------------------------
2276 Enter a query [all]:
2277 Using 'not asdfghjklzxcv' to match all records
2278 Enter max results to display [1]:
2279 usage: swish [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)]
2280 ...
2281 version: 2.0
2282 docs: http://sunsite.berkeley.edu/SWISH-E/
2283
2284 *** 9872 Failed to run swish: 'Swish returned unknown output' ***
2285 Failed to find any results
2286
2287 Oh, looks like /usr/local/bin/swish-e is version 2.0 of swish. We need 2.1-dev and above!
2288
2289 =head1 Frequently Asked Questions
2290
2291 Here are some common questions and answers.
2292
2293 =head2 How do I change the way the output looks?
2294
2295 The script uses a module to generate output. By default it uses the TemplateDefault.pm module.
2296 The module used can be selected in the configuration file.
2297
2298 If you want to make simple changes you can edit the TemplateDefault.pm module directly. If you want to
2299 copy a module, you must also change the "package" statement at the top of the module. For example:
2300
2301 cp TemplateDefault.pm MyTemplateDefault.pm
2302
2303 Then at the top of the module adjust the "package" line to:
2304
2305 package MyTemplateDefault;
2306
2307 To use this module you need to adjust the configuration settings (either at the top of F<swish.cgi> or in
2308 a configuration file):
2309
2310
2311 template => {
2312 package => 'MyTemplateDefault',
2313 },
2314
2315
2316 =head2 How do I use a templating system with swish.cgi?
2317
2318 In addition to the TemplateDefault.pm module, the swish-e distribution includes two other Perl modules for
2319 generating output using the templating systems HTML::Template and Template-Toolkit.
2320
2321 Templating systems use template files to generate the HTML, and make maintaining the look of a large (or small) site
2322 much easier. HTML::Template and Template-Toolkit are separate packages and can be downloaded from the CPAN.
2323 See http://search.cpan.org.
2324
2325 Two basic templates are provided as examples for generating output using these templating systems.
2326 The example templates are located in the F<example> directory.
2327 The module F<TemplateHTMLTemplate.pm> uses the file F<swish.tmpl> to generate its output, while the
2328 module F<TemplateToolkit.pm> uses the F<search.tt> file.
2329
2330 To use either of these modules you will need to adjust the "template" configuration setting. Examples for
2331 both templating systems are provided in the configuration settings near the top of the F<swish.cgi> program.
2332
2333 These modules are an advanced usage of F<swish.cgi> and are provided as examples only.
2334
2335 All of the output generation modules are passed a hash with the results from the search, plus other data used to create the
2336 output page. You can see this hash by using the debugging option "dump" or by using the TemplateDumper.pm
2337 module:
2338
2339 ~/swishtest >cat .swishcgi.conf
2340 return {
2341 title => 'Search the Apache Documentation',
2342 template => {
2343 package => 'TemplateDumper',
2344 },
2345 };
2346
2347 And run a query. For example:
2348
2349 http://localhost:8000/swishtest/swish.cgi?query=install
2350
2351 =head2 Why are there three different highlighting modules?
2352
2353 There are three highlighting modules included with the swish-e distribution.
2354 Each is a trade-off of speed vs. accuracy:
2355
2356 DefaultHighlight.pm - reasonably fast, but does not highlight phrases
2357 PhraseHighlight.pm - reasonably slow, but is reasonably accurate
2358 SimpleHighlight.pm - fast, some phrases, but least accurate
2359
2360 Eh, the default is actually "PhraseHighlight.pm". Oh well.
2361
2362 Optimizations to these modules are welcome!
2363
2364 =head2 My ISP doesn't provide access to the web server logs
2365
2366 There are a number of options. One way is to use the CGI::Carp module. Search in the
2367 swish.cgi script for:
2368
2369 use Carp;
2370 # Or use this instead -- PLEASE see perldoc CGI::Carp for details
2371 # use CGI::Carp qw(fatalsToBrowser warningsToBrowser);
2372
2373 And change it to look like:
2374
2375 #use Carp;
2376 # Or use this instead -- PLEASE see perldoc CGI::Carp for details
2377 use CGI::Carp qw(fatalsToBrowser warningsToBrowser);
2378
2379 This should be only for debugging purposes, as if used in production you may end up sending
2380 quite ugly and confusing messages to your browsers.
2381
2382 =head2 Why does the output show (NULL)?
2383
2384 The most common reason is that you did not use StoreDescription in your config file while indexing.
2385
2386 StoreDescription HTML <body> 200000
2387
2388 That tells swish to store the first 200,000 characters of text extracted from the body of each document parsed
2389 by the HTML parser. The text is stored as property "swishdescription". Running:
2390
2391 ~/swishtest > ./swish-e -T index_metanames
2392
2393 will display the properties defined in your index file.
2394
2395 This can happen with other properties, too.
2396 For example, this will happen when you are asking for a property to display that is not defined in swish.
2397
2398 ~/swishtest > ./swish-e -w install -m 1 -p foo
2399 # SWISH format: 2.1-dev-25
2400 # Search words: install
2401 err: Unknown Display property name "foo"
2402 .
2403
2404 ~/swishtest > ./swish-e -w install -m 1 -x 'Property foo=<foo>\n'
2405 # SWISH format: 2.1-dev-25
2406 # Search words: install
2407 # Number of hits: 14
2408 # Search time: 0.000 seconds
2409 # Run time: 0.038 seconds
2410 Property foo=(NULL)
2411 .
2412
2413 To check that a property exists in your index you can run:
2414
2415 ~/swishtest > ./swish-e -w not dkdk -T index_metanames | grep foo
2416 foo : id=10 type=70 META_PROP:STRING(case:ignore) *presorted*
2417
2418 Ok, in this case we see that "foo" is really defined as a property. Now let's make sure F<swish.cgi>
2419 is asking for "foo" (sorry for the long lines):
2420
2421 ~/swishtest > SWISH_DEBUG=command ./swish.cgi > /dev/null
2422 Debug level set to: 3
2423 Enter a query [all]:
2424 Using 'not asdfghjklzxcv' to match all records
2425 Enter max results to display [1]:
2426 ---- Running swish with the following command and parameters ----
2427 ./swish-e \
2428 -w \
2429 'swishdefault=(not asdfghjklzxcv)' \
2430 -b \
2431 1 \
2432 -m \
2433 1 \
2434 -f \
2435 index.swish-e \
2436 -s \
2437 swishrank \
2438 desc \
2439 swishlastmodified \
2440 desc \
2441 -x \
2442 '<swishreccount>\t<swishtitle>\t<swishdescription>\t<swishlastmodified>\t<swishdocsize>\t<swishdocpath>\t<fos>\t<swishrank>\t<swishdocpath>\n' \
2443 -H \
2444 9
2445
2446 If you look carefully you will see that the -x parameter has "fos" instead of "foo", so there's our problem.
2447
2448
2449 =head1 MOD_PERL
2450
2451 This script can be run under mod_perl (see http://perl.apache.org).
2452 This will improve the response time of the script compared to running under CGI.
2453
2454 Configuration is simple. In your httpd.conf or your startup.pl file you need to
2455 load the script. For example, in httpd.conf you can use a perl section:
2456
2457 <perl>
2458 use lib '/usr/local/apache/cgi-bin';
2459 use lib '/home/yourname/swish-e/example/modules';
2460 require "swish.cgi";
2461 </perl>
2462
2463 Again, note that the paths used will depend on where you installed the script and the modules.
2464 When running under mod_perl the swish.cgi script becomes a perl module, and therefore the script
2465 does not need to be installed in the cgi-bin directory. (But, you can actually use the same script as
2466 both a CGI script and a mod_perl module at the same time, read from the same location.)
2467
2468 The above loads the script into mod_perl. Then to configure the script to run add this to your httpd.conf
2469 configuration file:
2470
2471 <location /search>
2472 allow from all
2473 SetHandler perl-script
2474 PerlHandler SwishSearch
2475 </location>
2476
2477 Unlike CGI, mod_perl does not change the current directory to the location of the perl module, so
2478 your settings for the swish binary and the path to your index files must be absolute
2479 paths (or relative to the server root).
2480
2481 Take a look at the C<handler()> routine in this script for ideas how to use PerlSetVar commands
2482 in httpd.conf to control the script.
2483
2484 Please post to the swish-e discussion list if you have any questions about running this
2485 script under mod_perl.
2486
2487
2488 =head1 Spidering
2489
2490 There are two ways to spider with swish-e. One uses the "http" input method that uses code that's
2491 part of swish. The other way is to use the new "prog" method along with a perl helper program called
2492 C<spider.pl>.
2493
2494 Here's an example of a configuration file for spidering with the "http" input method.
2495 You can see that the configuration is not much different than the file system input method.
2496 (But, don't use the http input method -- use the -S prog method shown below.)
2497
2498 # Define what to index
2499 IndexDir http://www.myserver.name/index.html
2500 IndexOnly .html .htm
2501
2502 IndexContents HTML .html .htm
2503 DefaultContents HTML
2504 StoreDescription HTML <body> 200000
2505 MetaNames swishdocpath swishtitle
2506
2507 # Define http method specific settings -- see swish-e documentation
2508 SpiderDirectory ../swish-e/src/
2509 Delay 0
2510
2511 You index with the command:
2512
2513 swish-e -S http -c spider.conf
2514
2515 Note that this does take longer. For example, spidering the Apache documentation on
2516 a local web server with this method took over a minute, where indexing with the
2517 file system took less than two seconds. Using the "prog" method can speed this up.
2518
2519 Here's an example configuration file for using the "prog" input method:
2520
2521 # Define the location of the spider helper program
2522 IndexDir ../swish-e/prog-bin/spider.pl
2523
2524 # Tell the spider what to index.
2525 SwishProgParameters default http://www.myserver.name/index.html
2526
2527 IndexContents HTML .html .htm
2528 DefaultContents HTML
2529 StoreDescription HTML <body> 200000
2530 MetaNames swishdocpath swishtitle
2531
2532 Then to index you use the command:
2533
2534 swish-e -c prog.conf -S prog -v 0
2535
2536 Spidering with this method took nine seconds.
2537
2538
2539 =head1 Stemmed Indexes
2540
2541 Many people enable a feature of swish called word stemming to provide "fuzzy" search
2542 options to their users.
2543 The stemming code does not actually find the "stem" of a word; rather, it removes and/or replaces
2544 common endings on words.
2545 Stemming is far from perfect, and many words do not stem as you might expect. But, it can
2546 be a helpful tool for searching your site. You may wish to create both a stemmed and non-stemmed index, and
2547 provide a checkbox for selecting the index file.
2548
2549 To enable a stemmed index you simply add to your configuration file:
2550
2551 UseStemming yes
2552
2553 If you want to use a stemmed index with this program and continue to highlight search terms you will need
2554 to install a perl module that will stem words. This section explains how to do this.
2555
2556 The perl module is included with the swish-e distribution. It can be found in the examples directory (where
2557 you found this file) and is called something like:
2558
2559 SWISH-Stemmer-0.05.tar.gz
2560
2561 The module should also be available on CPAN (http://search.cpan.org/).
2562
2563 Here's an example session for installing the module. (There will be quite a bit of output
2564 when running make.)
2565
2566
2567 % gzip -dc SWISH-Stemmer-0.05.tar.gz |tar xof -
2568 % cd SWISH-Stemmer-0.05
2569 % perl Makefile.PL
2570 or
2571 % perl Makefile.PL PREFIX=$HOME/perl_lib
2572 % make
2573 % make test
2574
2575 (perhaps su root at this point if you did not use a PREFIX)
2576 % make install
2577 % cd ..
2578
2579 Use the B<PREFIX> if you do not have root access or you want to install the modules
2580 in a local library. If you do use a PREFIX setting, add a C<use lib> statement to the top of this
2581 swish.cgi program.
2582
2583 For example:
2584
2585 use lib qw(
2586 /home/bmoseley/perl_lib/lib/site_perl/5.6.0
2587 /home/bmoseley/perl_lib/lib/site_perl/5.6.0/i386-linux/
2588 );
2589
2590 Once the stemmer module is installed, and you are using a stemmed index, the C<swish.cgi> script will automatically
2591 detect this and use the stemmer module.
2592
2593 =head1 DISCLAIMER
2594
2595 Please use this CGI script at your own risk.
2596
2597 This script has been tested and used without problem, but you should still be aware that
2598 any code running on your server represents a risk. If you have any concerns please carefully
2599 review the code.
2600
2601 See http://www.w3.org/Security/Faq/www-security-faq.html
2602
2603 Security on Windows is questionable.
2604
2605 =head1 SUPPORT
2606
2607 The SWISH-E discussion list is the place to ask for any help regarding SWISH-E or this example
2608 script. See http://swish-e.org.
2609
2610 Before posting please review:
2611
2612 http://swish-e.org/2.2/docs/INSTALL.html#When_posting_please_provide_the_
2613
2614 Please do not contact the author or any of the swish-e developers directly.
2615
2616 =head1 LICENSE
2617
2618 swish.cgi $Revision: 1.33 $ Copyright (C) 2001 Bill Moseley search@hank.org
2619 Example CGI program for searching with SWISH-E
2620
2621
2622 This program is free software; you can redistribute it and/or
2623 modify it under the terms of the GNU General Public License
2624 as published by the Free Software Foundation; either version
2625 2 of the License, or (at your option) any later version.
2626
2627 This program is distributed in the hope that it will be useful,
2628 but WITHOUT ANY WARRANTY; without even the implied warranty of
2629 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2630 GNU General Public License for more details.
2631
2632
2633 =head1 AUTHOR
2634
2635 Bill Moseley -- search@hank.org
2636
2637 =cut

  ViewVC Help
Powered by ViewVC 1.1.22