/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     $Id: swish.c,v 1.98 2002/08/22 22:58:39 whmoseley Exp $
3     **
4     ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5     **
6     ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
7     **
8     ** This program and library is free software; you can redistribute it and/or
9     ** modify it under the terms of the GNU (Library) General Public License
10     ** as published by the Free Software Foundation; either version 2
11     ** of the License, or any later version.
12     **
13     ** This program is distributed in the hope that it will be useful,
14     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16     ** GNU (Library) General Public License for more details.
17     **
18     ** You should have received a copy of the GNU (Library) General Public License
19     ** along with this program; if not, write to the Free Software
20     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21     **---------------------------------------------------------
22     */
23    
24    
25     #include <limits.h> // for ULONG_MAX
26     #include "swish.h"
27     #include "mem.h"
28     #include "string.h"
29     #include "error.h"
30     #include "list.h"
31     #include "search.h"
32     #include "index.h"
33     #include "file.h"
34     #include "http.h"
35     #include "merge.h"
36     #include "docprop.h"
37     #include "metanames.h"
38     #include "parse_conffile.h"
39     #include "result_output.h"
40     #include "result_sort.h"
41     #include "keychar_out.h"
42     #include "date_time.h"
43     #include "db.h"
44     #include "fs.h"
45     #include "dump.h"
46    
47     #include "proplimit.h"
48    
49    
50     /*
51     ** This array has pointers to all the indexing data source
52     ** structures
53     */
54     extern struct _indexing_data_source_def *data_sources[];
55    
56    
57    
58    
59    
/*
** One row of the -T (trace/debug) option table.
*/
typedef struct
{
    char         *name;         /* option word, matched case-insensitively against the -T argument */
    unsigned int  bit;          /* value OR'ed into DEBUG_MASK when the option is selected */
    char         *description;  /* one-line text printed by "-T help" */
}
DEBUG_MAP;
67    
68     static DEBUG_MAP debug_map[] = {
69     /* These dump data from the index file */
70     {"INDEX_HEADER", DEBUG_INDEX_HEADER, "Show the headers from the index"},
71     {"INDEX_WORDS", DEBUG_INDEX_WORDS, "List words stored in index"},
72     {"INDEX_WORDS_ONLY", DEBUG_INDEX_WORDS_ONLY, "List only words, one per line, stored in index"},
73     {"INDEX_WORDS_META", DEBUG_INDEX_WORDS_META, "List only words and associated metaID separated by a tab"},
74     {"INDEX_WORDS_FULL", DEBUG_INDEX_WORDS_FULL, "List words stored in index (more verbose)"},
75     {"INDEX_STOPWORDS", DEBUG_INDEX_STOPWORDS, "List stopwords stored in index"},
76     {"INDEX_FILES", DEBUG_INDEX_FILES, "List file data stored in index"},
77     {"INDEX_METANAMES", DEBUG_INDEX_METANAMES, "List metaname table stored in index"},
78     {"INDEX_ALL", DEBUG_INDEX_ALL, "Dump data ALL above data from index file\n\n-- indexing --\n"},
79    
80     /* These trace indexing */
81     {"INDEXED_WORDS", DEBUG_WORDS, "Display words as they are indexed"},
82     {"PARSED_WORDS", DEBUG_PARSED_WORDS, "Display words as they are parsed from source"},
83     {"PROPERTIES", DEBUG_PROPERTIES, "Display properties associted with each file as they are indexed"},
84     {"REGEX", DEBUG_REGEX, "Debug regular expression processing"},
85     {"PARSED_TAGS", DEBUG_PARSED_TAGS, "Show meta tags as they are found"},
86     {"PARSED_TEXT", DEBUG_PARSED_TEXT, "Show text as it's parsed"},
87     };
88    
89    
/*
** Run modes.  main() dispatches on the mode left in CMDPARAMS.run_mode
** once the command line has been parsed.
*/
typedef enum {
    MODE_SEARCH = 0,    /* default mode; also selected by -w          -> cmd_search()   */
    MODE_INDEX  = 1,    /* selected by -i or -c                       -> cmd_index()    */
    MODE_DUMP   = 2,    /* selected by any "-T INDEX_..." option      -> cmd_dump()     */
    MODE_WORDS  = 3,    /* selected by -k                             -> cmd_keywords() */
    MODE_MERGE  = 4,    /* selected by -M                             -> cmd_merge()    */
    MODE_UPDATE = 5     /* selected by -u; kept when -i/-c follow     -> cmd_index()    */
}
CMD_MODE;
100    
101    
102     /* Parameters read from the command line, that are not stored in *SWISH */
103     typedef struct
104     {
105     CMD_MODE run_mode; /* selected run mode */
106     char *wordlist; /* list of -w words */
107     char keychar; /* for dumping words */
108    
109     struct swline *tmpsortprops; /* sort properties */
110     struct swline *conflist; /* Configuration file list */
111    
112     int hasverbose; /* flag if -v was used */
113    
114     int index_read_only; /* flag to not allow indexing or merging */
115     int swap_mode;
116     int structure; /* where in file to search */
117    
118     char *merge_out_file; /* the output file for merge */
119    
120     }
121     CMDPARAMS;
122    
123    
124     /************* TOC ***************************************/
125     static CMDPARAMS *new_swish_params();
126     static void printTime(double time);
127     static void get_command_line_params(SWISH *sw, char **argv, CMDPARAMS *params );
128     static void free_command_line_params( CMDPARAMS *params );
129     static unsigned int isDebugWord(char *word, CMDPARAMS *params );
130     static void printversion();
131     static void usage();
132     static int check_readonly_mode( char * );
133    
134     static void cmd_dump( SWISH *sw, CMDPARAMS *params );
135     static void cmd_index( SWISH *sw, CMDPARAMS *params );
136     static void cmd_merge( SWISH *sw, CMDPARAMS *params );
137     static void cmd_search( SWISH *sw, CMDPARAMS *params );
138     static void cmd_keywords( SWISH *sw, CMDPARAMS *params );
139     static void write_index_file( SWISH *sw, int process_stopwords, double elapsedStart, double cpuStart, int merge, int is_update);
140     /************* TOC ***************************************/
141    
142    
143     int main(int argc, char **argv)
144     {
145     SWISH *sw;
146     CMDPARAMS *params;
147    
148     setlocale(LC_CTYPE, "");
149    
150    
151    
152     /* Start a session */
153     sw = SwishNew(); /* Get swish handle */
154    
155    
156    
157     /* By default we are set up to use the first data source in the list */
158     /* I don't like this. modules.c would fix this */
159     IndexingDataSource = data_sources[0];
160    
161    
162    
163    
164     params = new_swish_params();
165     get_command_line_params(sw, argv, params );
166    
167     switch( params->run_mode )
168     {
169     case MODE_DUMP:
170     cmd_dump( sw, params ); /* first so will override */
171     break;
172    
173     case MODE_MERGE:
174     cmd_merge( sw, params );
175     break;
176    
177     case MODE_INDEX:
178     case MODE_UPDATE:
179     cmd_index( sw, params );
180     break;
181    
182     case MODE_SEARCH:
183     cmd_search( sw, params );
184     break;
185    
186     case MODE_WORDS:
187     cmd_keywords( sw ,params ); /* -k setting */
188     break;
189    
190    
191     default:
192     progerr("Invalid operation mode '%d'", (int)params->run_mode);
193     }
194    
195     free_command_line_params( params );
196    
197     SwishClose(sw);
198    
199     Mem_Summary("At end of program", 1);
200    
201     exit(0);
202    
203     return 0;
204     }
205    
/*
** Prints a running time (e.g. the time it took for indexing) on stdout
** as HH:MM:SS, without a trailing newline.
**
** time is a number of seconds; it is rounded by adding 0.5 and
** truncating to int before being split into fields.
*/

static void printTime(double time)
{
    int total_seconds = (int) (time + 0.5);
    int total_minutes = total_seconds / 60;

    printf("%02d:%02d:%02d", total_minutes / 60, total_minutes % 60, total_seconds % 60);
}
225    
226     /* Prints the SWISH usage.
227     */
228    
229     static void usage()
230     {
231     const char *defaultIndexingSystem = "";
232    
233     printf(" usage:\n");
234     printf(" swish [-e] [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)]\n");
235     printf(" swish -w word1 word2 ... [-f file1 file2 ...] \\\n");
236     printf(" [-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \\\n");
237     printf(" [-m num] [-t str] [-d delim] [-H (num)] [-x output_format]\n");
238     printf(" swish -k (char|*) [-f file1 file2 ...]\n");
239     printf(" swish -M index1 index2 ... outputfile\n");
240     printf(" swish -N /path/to/compare/file\n");
241     printf(" swish -V\n");
242     putchar('\n');
243     printf("options: defaults are in brackets\n");
244    
245    
246     printf(" -S : specify which indexing system to use.\n");
247     printf(" Valid options are:\n");
248     #ifdef ALLOW_FILESYSTEM_INDEXING_DATA_SOURCE
249     printf(" \"fs\" - index local files in your File System\n");
250     if (!*defaultIndexingSystem)
251     defaultIndexingSystem = "fs";
252     #endif
253    
254     #ifdef ALLOW_HTTP_INDEXING_DATA_SOURCE
255     printf(" \"http\" - index web site files using a web crawler\n");
256     if (!*defaultIndexingSystem)
257     defaultIndexingSystem = "http";
258     #endif
259    
260     #ifdef ALLOW_EXTERNAL_PROGRAM_DATA_SOURCE
261     printf(" \"prog\" - index files supplied by an external program\n");
262    
263     if (!*defaultIndexingSystem)
264     defaultIndexingSystem = "http";
265     #endif
266    
267     printf(" The default value is: \"%s\"\n", defaultIndexingSystem);
268    
269     printf(" -i : create an index from the specified files\n");
270     printf(" -w : search for words \"word1 word2 ...\"\n");
271     printf(" -t : tags to search in - specify as a string\n");
272     printf(" \"HBthec\" - in Head|Body|title|header|emphasized|comments\n");
273     printf(" -f : index file to create or file(s) to search from [%s]\n", INDEXFILE);
274     printf(" -c : configuration file(s) to use for indexing\n");
275     printf(" -v : indexing verbosity level (0 to 3) [-v %d]\n", VERBOSE);
276     printf(" -T : Trace options ('-T help' for info\n");
277     printf(" -l : follow symbolic links when indexing\n");
278     printf(" -b : begin results at this number\n");
279     printf(" -m : the maximum number of results to return [defaults to all results]\n");
280     printf(" -M : merges index files\n");
281     printf(" -N : index only files with a modification date newer than path supplied\n");
282     printf(" -p : include these document properties in the output \"prop1 prop2 ...\"\n");
283     printf(" -s : sort by these document properties in the output \"prop1 prop2 ...\"\n");
284     printf(" -d : next param is delimiter.\n");
285     printf(" -P : next param is Phrase delimiter.\n");
286     printf(" -V : prints the current version\n");
287     printf(" -e : \"Economic Mode\": The index proccess uses less RAM.\n");
288     printf(" -x : \"Extended Output Format\": Specify the output format.\n");
289     printf(" -H : \"Result Header Output\": verbosity (0 to 9) [1].\n");
290     printf(" -k : Print words starting with a given char.\n");
291     printf(" -E : Append errors to file specified, or stderr if file not specified.\n");
292     printf("\n");
293     printf("version: %s docs: http://swish-e.org\n", SWISH_VERSION);
294     exit(1);
295     }
296    
297     static void printversion()
298     {
299     printf("SWISH-E %s\n", SWISH_VERSION );
300     exit(0);
301     }
302    
303    
304     /*************************************************************************
305     * Deal with -T debug options
306     *
307     *
308     **************************************************************************/
309    
310     static unsigned int isDebugWord(char *word, CMDPARAMS *params)
311     {
312     int i,
313     help;
314    
315     help = strcasecmp(word, "help") == 0;
316    
317     if (help)
318     printf("\nAvailable debugging options for swish-e:\n");
319    
320     for (i = 0; i < sizeof(debug_map) / sizeof(debug_map[0]); i++)
321     if (help)
322     printf(" %20s => %s\n", debug_map[i].name, debug_map[i].description);
323     else if (strcasecmp(debug_map[i].name, word) == 0)
324     {
325     if (strncasecmp(word, "INDEX_", 6) == 0)
326     params->run_mode = MODE_DUMP;
327    
328     return debug_map[i].bit;
329     }
330    
331     if (help)
332     exit(1);
333    
334     return 0;
335     }
336    
337     /*************************************************************************
338     * Initialize the swish command parameters
339     *
340     * Call with:
341     * void
342     *
343     * Returns:
344     * pointer to CMDPARAMS
345     *
346     * To Do:
347     * The swish parameters probably should be groupped by switches and
348     * by config file (and maybe someday also by directory or path or
349     * content-type) and then merged.
350     *
351     **************************************************************************/
352    
353     static CMDPARAMS *new_swish_params()
354     {
355     CMDPARAMS *params = (CMDPARAMS *)emalloc( sizeof( CMDPARAMS ) );
356     memset( params, 0, sizeof( CMDPARAMS ) );
357    
358     params->run_mode = MODE_SEARCH; /* default run mode */
359     params->structure = IN_FILE; /* look in the file, by default */
360    
361     return params;
362     }
363    
364    
365    
366     /*************************************************************************
367     * Free the swish command parameters
368     *
369     * Call with:
370     * *CMDPARAMS
371     *
372     * Returns:
373     * void
374     *
375     * To Do:
376     * The swish parameters probably should be groupped by switches and
377     * by config file (and maybe someday also by directory or path or
378     * content-type) and then merged.
379     *
380     **************************************************************************/
381    
382     static void free_command_line_params( CMDPARAMS *params )
383     {
384     if ( params->wordlist )
385     efree( params->wordlist );
386    
387     if ( params->tmpsortprops )
388     freeswline( params->tmpsortprops );
389    
390     if ( params->conflist )
391     freeswline( params->conflist );
392    
393     efree( params );
394     }
395    
396    
397     /*************************************************************************
398     * Just checks if there is a next word
399     * Three helper functions - to be replaced by better command parsing soon...
400     **************************************************************************/
401    
/*
** Peeks at the argument after *argv without consuming it.
**
** Returns that argument when it exists and does not begin with '-'
** (i.e. it is a value, not the next switch); otherwise returns NULL.
*/
static char *is_another_param( char **argv )
{
    char *candidate = argv[1];

    if ( candidate == NULL )
        return NULL;

    if ( candidate[0] == '-' )
        return NULL;

    return candidate;
}
408    
/*
** Consumes the argument after **argv when it is a value (see
** is_another_param()): *argv is advanced onto it and it is returned.
** When there is no such value, *argv is left alone and NULL is returned.
*/
static char *next_param( char ***argv )
{
    char *word = is_another_param( *argv );

    if ( word != NULL )
        ++(*argv);

    return word;
}
421    
422    
/*
** Consumes the next command line value and converts it to an int.
**
** Call with:
**   argv - address of the argument cursor; advanced by next_param()
**   c    - the switch character being processed (used in messages only)
**
** Returns:
**   the decimal value of the argument.
**
** Errors are reported through progerr()/progerrno(), which this code
** relies on not returning: a missing or empty value, trailing
** non-numeric characters, and values that do not fit in an int.
*/
static int get_param_number(char ***argv, char c )
{
    char *badchar;
    long  num;
    char *string = next_param( argv );

    if ( !string )
        progerr(" '-%c' requires a positive integer.", c );

    num = strtol( string, &badchar, 10 ); // would base zero be more flexible?

    if ( num == LONG_MAX || num == LONG_MIN )
        progerrno("Failed to convert '-%c %s' to a number: ", c, string );

    if ( *badchar )
        progerr("Invalid char '%c' found in argument to '-%c %s'", badchar[0], c, string);

    /* Nothing was converted at all: only possible here for an empty argument */
    if ( badchar == string )
        progerr(" '-%c' requires a positive integer.", c );

    /* Where long is wider than int, refuse values the cast would silently truncate */
    if ( num > INT_MAX || num < INT_MIN )
        progerr("Value '%s' given to '-%c' is out of range.", string, c );

    return (int) num;
}
443    
444    
445    
446    
447     /*************************************************************************
448     * Gets the command line parameters, if any, and set values in the CMDPARMAS structure
449     *
450     *
451     * Returns:
452     * void (changes *sw and *params)
453     *
454     * To Do:
455     * This code is horrific. Get a structure to define the parameters, and messages!
456     * Move this into its own module!
457     *
458     * Also, mixes two structres for parameters, SWISH and CMDPARAMS. Not a great setup.
459     *
460     *
461     * I'd like to see a centeral routine for processing switches, and a way for
462     * modules to "register" what config options to parse out by the central routine.
463     *
464     **************************************************************************/
465     static void get_command_line_params(SWISH *sw, char **argv, CMDPARAMS *params )
466     {
467     char c;
468     int lenwordlist = 0;
469     char *w;
470    
471     /* not excited about this */
472     params->wordlist = (char *) emalloc((lenwordlist = MAXSTRLEN) + 1);
473     params->wordlist[0] = '\0';
474    
475    
476    
477     params->index_read_only = check_readonly_mode( *argv );
478    
479    
480    
481     if ( !*(argv + 1 ) )
482     progerr("Missing parameter. Use -h for options.", *argv);
483    
484     while ( *++argv )
485     {
486    
487     if ((*argv)[0] != '-') // every parameter starts with a dash
488     progerr("Missing switch character at '%s'. Use -h for options.", *argv);
489    
490     if ( !(c = (*argv)[1] ) ) // get single switch char
491     progerr("Missing switch character at '%s'. Use -h for options.", *argv);
492    
493     /* allow joined arguments */
494     if ( (*argv)[2] )
495     {
496     *argv += 2;
497     argv--;
498     }
499    
500    
501     /* files to index */
502     if (c == 'i')
503     {
504     if ( !is_another_param( argv ) )
505     progerr(" '-i' requires a list of index files.");
506    
507     if(params->run_mode != MODE_UPDATE) /* Preserve update mode */
508     params->run_mode = MODE_INDEX;
509    
510     while ( (w = next_param( &argv )) )
511     sw->dirlist = addswline(sw->dirlist, w );
512    
513     continue;
514     }
515    
516    
517     /* search words */
518    
519     if (c == 'w')
520     {
521     if ( !is_another_param( argv ) )
522     progerr(" '-w' requires list of search words.");
523    
524     while ( (w = next_param( &argv )) )
525     {
526     /* don't add blank words */
527     if (w[0] == '\0')
528     continue;
529    
530     if ((int)( strlen(params->wordlist) + strlen(" ") + strlen(w) ) >= lenwordlist)
531     {
532     lenwordlist = strlen(params->wordlist) + strlen(" ") + strlen(w) + 200;
533     params->wordlist = (char *) erealloc(params->wordlist, lenwordlist + 1);
534     }
535    
536     params->run_mode = MODE_SEARCH;
537     sprintf(params->wordlist, "%s%s%s", params->wordlist, (params->wordlist[0] == '\0') ? "" : " ", w);
538     }
539    
540     continue;
541     }
542    
543    
544    
545    
546     /* words to dump from index */
547    
548     if (c == 'k')
549     {
550     if ( !(w = next_param( &argv )) )
551     progerr(" '-k' requires a character (or '*').");
552    
553     if ( strlen( w ) != 1 )
554     progerr(" '-k' requires a character (or '*').");
555    
556    
557     params->run_mode = MODE_WORDS;
558     params->keychar = w[0];
559    
560     continue;
561     }
562    
563    
564    
565     /* Data source */
566    
567     else if (c == 'S')
568     {
569     struct _indexing_data_source_def **data_source;
570    
571     if ( !(w = next_param( &argv )) )
572     progerr(" '-S' requires a valid data source.");
573    
574     for (data_source = data_sources; *data_source != 0; data_source++)
575     if (strcmp(w, (*data_source)->IndexingDataSourceId) == 0)
576     break;
577    
578    
579     if (!*data_source)
580     progerr("Unknown -S option \"%s\"", w);
581     else
582     IndexingDataSource = *data_source;
583    
584     continue;
585     }
586    
587    
588    
589     /* sort properties */
590    
591     if (c == 's')
592     {
593     if ( !is_another_param( argv ) )
594     progerr(" '-s' requires list of sort properties.");
595    
596     while ( (w = next_param( &argv )) )
597     params->tmpsortprops = addswline(params->tmpsortprops, w);
598    
599     continue;
600     }
601    
602    
603     /* display properties */
604    
605     if (c == 'p')
606     {
607     if ( !is_another_param( argv ) )
608     progerr(" '-p' requires list of properties.");
609    
610     while ( (w = next_param( &argv )) )
611     addSearchResultDisplayProperty(sw, w);
612    
613     continue;
614     }
615    
616    
617    
618     /* Set limit values */
619    
620     if (c == 'L')
621     {
622     if ( !( is_another_param( argv ) && is_another_param( argv + 1 ) && is_another_param( argv + 2 )) )
623     progerr("-L requires three parameters <propname> <lorange> <highrange>");
624    
625     if ( !SetLimitParameter(sw, argv[1], argv[2], argv[3]) )
626     SwishAbortLastError( sw );
627    
628     argv += 3;
629    
630     continue;
631     }
632    
633    
634    
635     /* Index file(s) selection */
636    
637     if (c == 'f')
638     {
639     if ( !is_another_param( argv ) )
640     progerr(" '-f' requires list of index files.");
641    
642     while ( (w = next_param( &argv )) )
643     sw->indexlist = addindexfile(sw->indexlist, w);
644    
645     continue;
646     }
647    
648    
649     /* config file list */
650    
651     if (c == 'c')
652     {
653     if ( !is_another_param( argv ) )
654     progerr(" '-c' requires one or more configuration files.");
655    
656     if(params->run_mode != MODE_UPDATE) /* Preserve update mode */
657     params->run_mode = MODE_INDEX;
658    
659     while ( (w = next_param( &argv )) )
660     params->conflist = addswline(params->conflist, w);
661    
662     continue;
663     }
664    
665    
666    
667     /* Follow symbolic links */
668    
669     if (c == 'l')
670     {
671     sw->FS->followsymlinks = 1;
672     continue;
673     }
674    
675    
676     /* Set begin hit location */
677    
678     if (c == 'b')
679     {
680     sw->Search->beginhits = get_param_number( &argv, c );
681     continue;
682     }
683    
684    
685     /* Set max hits */
686    
687     if (c == 'm')
688     {
689     sw->Search->maxhits = get_param_number( &argv, c );
690     continue;
691     }
692    
693    
694    
695     /* Save the time for limiting indexing by a file date */
696    
697     if (c == 'N')
698     {
699     struct stat stat_buf;
700    
701     if ( !(w = next_param( &argv )) )
702     progerr("-N requires a path to a local file");
703    
704     if (stat( w, &stat_buf))
705     progerrno("Bad path '%s' specified with -N: ", w );
706    
707     sw->mtime_limit = stat_buf.st_mtime;
708    
709     continue;
710     }
711    
712    
713     /* limit by structure */
714    
715     if (c == 't')
716     {
717     char * c;
718    
719     if ( !(w = next_param( &argv )) )
720     progerr("Specify tag fields (HBtheca).");
721    
722    
723     params->structure = 0; /* reset to none */
724    
725     for ( c = w; *c; c++ )
726     switch ( *c )
727     {
728     case 'H':
729     params->structure |= IN_HEAD;
730     break;
731     case 'B':
732     params->structure |= IN_BODY;
733     break;
734     case 't':
735     params->structure |= IN_TITLE;
736     break;
737     case 'h':
738     params->structure |= IN_HEADER;
739     break;
740     case 'e':
741     params->structure |= IN_EMPHASIZED;
742     break;
743     case 'c':
744     params->structure |= IN_COMMENTS;
745     break;
746     case 'a':
747     params->structure |= IN_ALL;
748     break;
749     default:
750     progerr("-t must only include HBthec. Found '%c'", *c );
751     }
752     continue;
753     }
754    
755    
756    
757    
758    
759     /* verbose while indexing */
760    
761     if (c == 'v')
762     {
763     params->hasverbose = 1;
764     sw->verbose = get_param_number( &argv, c );
765     continue;
766     }
767    
768    
769    
770     /* print the version number */
771    
772     if (c == 'V')
773     printversion();
774    
775    
776    
777     /* "z" Huh? */
778    
779     if (c == 'z' || c == 'h' || c == '?')
780     usage();
781    
782    
783    
784     /* Merge settings */
785    
786     if (c == 'M')
787     {
788     if ( !is_another_param( argv ) )
789     progerr(" '-M' requires an output file name.");
790    
791     params->run_mode = MODE_MERGE;
792    
793     while ( (w = next_param( &argv )) )
794     {
795     /* Last one listed is the output file */
796     if ( is_another_param( argv ) )
797     sw->indexlist = addindexfile(sw->indexlist, w);
798     else
799     params->merge_out_file = estrdup( w );
800     }
801    
802     continue;
803     }
804    
805    
806    
807     /* Debugging options */
808    
809     if (c == 'T')
810     {
811     while ( (w = next_param( &argv )) )
812     {
813     unsigned int bit;
814    
815     if ((bit = isDebugWord( w, params) ))
816     DEBUG_MASK |= bit;
817     else
818     progerr("Invalid debugging option '%s'. Use '-T help' for help.", w);
819    
820     }
821     continue;
822     }
823    
824    
825    
826     /* Set where errors go */
827    
828     if (c == 'E')
829     {
830     if ( !is_another_param( argv ) )
831     set_error_handle( stderr ); // -E alone goes to stderr
832    
833     else
834     {
835     FILE *f;
836     w = next_param( &argv );
837     f = fopen( w, "a" );
838     if ( !f )
839     progerrno("Failed to open Error file '%s' for appending: ", w );
840    
841     set_error_handle( f );
842     }
843    
844     continue;
845     }
846    
847    
848    
849     /* Custom Phrase Delimiter - Jose Ruiz 01/00 */
850    
851     if (c == 'P')
852     {
853     if ( !(w = next_param( &argv )) )
854     progerr("'-P' requires a phrase delimiter.");
855    
856     sw->Search->PhraseDelimiter = (int) w[0];
857     continue;
858     }
859    
860    
861    
862     /* Set the custom delimiter */
863     if (c == 'd')
864     {
865     if ( !(w = next_param( &argv )) )
866     progerr("'-d' requires an output delimiter.");
867    
868     sw->ResultOutput->stdResultFieldDelimiter = estrredup(sw->ResultOutput->stdResultFieldDelimiter, w );
869    
870     /* This really doesn't work as is probably expected since it's a delimiter and not quoting the fields */
871     if (strcmp(sw->ResultOutput->stdResultFieldDelimiter, "dq") == 0)
872     strcpy( sw->ResultOutput->stdResultFieldDelimiter, "\"" );
873     else
874     {
875     int i,j;
876     int backslash = 0;
877    
878     for ( j=0, i=0; i < strlen( w ); i++ )
879     {
880     if ( !backslash )
881     {
882     if ( w[i] == '\\' )
883     {
884     backslash++;
885     continue;
886     }
887     else
888     {
889     sw->ResultOutput->stdResultFieldDelimiter[j++] = w[i];
890     continue;
891     }
892     }
893    
894    
895     switch ( w[i] )
896     {
897     case 'f':
898     sw->ResultOutput->stdResultFieldDelimiter[j++] = '\f';
899     break;
900     case 'n':
901     sw->ResultOutput->stdResultFieldDelimiter[j++] = '\n';
902     break;
903     case 'r':
904     sw->ResultOutput->stdResultFieldDelimiter[j++] = '\r';
905     break;
906     case 't':
907     sw->ResultOutput->stdResultFieldDelimiter[j++] = '\t';
908     break;
909     case '\\':
910     sw->ResultOutput->stdResultFieldDelimiter[j++] = '\\';
911     sw->ResultOutput->stdResultFieldDelimiter[j++] = '\\';
912     break;
913     default:
914     progerr("Unknown escape sequence '\\%c'. Must be one of \\f \\n \\r \\t \\\\", w[i]);
915     }
916     backslash = 0;
917     }
918     sw->ResultOutput->stdResultFieldDelimiter[j] = '\0';
919     }
920     continue;
921     }
922    
923    
924    
925     /* Econ mode */
926    
927     if (c == 'e')
928     {
929     /* Jose Ruiz 09/00 */
930     params->swap_mode = 1; /* "Economic mode": Uses less RAM */
931     /* The proccess is slower: Part of */
932     /* info is preserved in temporal */
933     /* files */
934    
935     continue;
936     }
937    
938    
939     /* $$$ These need better error reporting */
940    
941     /* Extended format */
942    
943     if (c == 'x')
944     {
945     /* Jose Ruiz 09/00 */
946     /* Search proc will show more info */
947     /* rasc 2001-02 extended -x fmtstr */
948    
949     if ( !(w = next_param( &argv )) )
950     progerr("'-x' requires an output format string.");
951    
952     {
953     char *s;
954     s = hasResultExtFmtStr(sw, w);
955     sw->ResultOutput->extendedformat = (s) ? s : w;
956     initPrintExtResult(sw, sw->ResultOutput->extendedformat);
957     }
958    
959     continue;
960     }
961    
962    
963    
964     /* Search header output control */
965     if (c == 'H')
966     {
967     sw->ResultOutput->headerOutVerbose = get_param_number( &argv, c );
968     continue;
969     }
970    
971    
972     /* Ignore sorted indexes */
973    
974     if (c == 'o')
975     {
976     sw->ResultSort->isPreSorted = 0;
977     continue;
978     }
979    
980     /* Update mode jmruiz 2002/03 */
981    
982     if (c == 'u')
983     {
984     params->run_mode = MODE_UPDATE;
985     continue;
986     }
987    
988     progerr("Unknown switch '-%c'. Use -h for options.", c );
989     }
990     }
991    
992    
/*************************************************************************
* Returns true if we think the program is called swish-search
* offers no real security
*
* Call with:
*   prog - the name the program was invoked with (argv[0]); may be NULL
*
* Returns:
*   1 if prog ends with "swish-search" (compared ignoring case),
*   0 otherwise, including when prog is NULL or shorter than that name.
*
**************************************************************************/
static int check_readonly_mode( char *prog )
{
    static const char ro_name[] = "swish-search";
    const size_t      name_len = sizeof(ro_name) - 1;
    size_t            prog_len;

    if ( !prog )
        return 0;

    /* Compare lengths rather than pointers: forming a pointer that lies
    ** before the start of prog would be undefined behavior */
    prog_len = strlen( prog );
    if ( prog_len < name_len )
        return 0;

    /* We must ignore case for WIN 32 */
    if (strcasecmp( prog + (prog_len - name_len), ro_name ) == 0)
        return 1;

    return 0;
}
1011    
1012    
1013     /*************************************************************************
1014     * Dumps the index file(s)
1015     *
1016     **************************************************************************/
1017     static void cmd_dump( SWISH *sw, CMDPARAMS *params )
1018     {
1019    
1020     /* Set the default index file */
1021     if ( sw->indexlist == NULL )
1022     sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1023    
1024     while ( sw->indexlist != NULL )
1025     {
1026    
1027     DB_decompress(sw, sw->indexlist);
1028     putchar('\n');
1029    
1030     sw->indexlist = sw->indexlist->next;
1031     }
1032     }
1033     /*************************************************************************
1034     * This runs the indexing code
1035     *
1036     **************************************************************************/
1037    
1038     static void cmd_index( SWISH *sw, CMDPARAMS *params )
1039     {
1040     int hasdir = (sw->dirlist == NULL) ? 0 : 1;
1041     int hasindex = (sw->indexlist == NULL) ? 0 : 1;
1042     double elapsedStart = TimeElapsed();
1043     double cpuStart = TimeCPU();
1044     struct swline *tmpswline;
1045    
1046     if ( params->index_read_only )
1047     progerr("Sorry, this program is in readonly mode");
1048    
1049    
1050     /* Read configuration files */
1051     {
1052     struct swline *tmp = params->conflist;
1053     while ( tmp != NULL)
1054     {
1055     getdefaults(sw, tmp->line, &hasdir, &hasindex, params->hasverbose);
1056     tmp = tmp->next;
1057     }
1058     }
1059    
1060    
1061     /* Default index file */
1062     if ( sw->indexlist == NULL )
1063     sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1064    
1065    
1066     if (!hasdir)
1067     progerr("Specify directories or files to index.");
1068    
1069    
1070     if (sw->verbose < 0)
1071     sw->verbose = 0;
1072    
1073     /* Update Economic mode */
1074     sw->Index->swap_locdata = params->swap_mode;
1075    
1076    
1077     /* Check for UPDATE_MODE jmruiz 2002/03 */
1078     if(params->run_mode == MODE_UPDATE)
1079     {
1080     /* Open the index file for read/write */
1081     sw->indexlist->DB = (void *) DB_Open(sw, sw->indexlist->line,DB_READWRITE);
1082     if ( sw->lasterror )
1083     SwishAbortLastError( sw );
1084    
1085    
1086     /* Read the header and overwrite the '-c' option and feault values - In other
1087     ** words, the header values are the good ones */
1088     read_header(sw, &sw->indexlist->header, sw->indexlist->DB);
1089     sw->TotalWords = sw->indexlist->header.totalwords;
1090     sw->TotalFiles = sw->indexlist->header.totalfiles;
1091    
1092     /* Adjust filenum to totalfiles */
1093     sw->Index->filenum = sw->TotalFiles;
1094    
1095     #ifndef USE_BTREE
1096     progerr("Invalid operation mode '%d': Update mode only supported with USE_BTREE feature", (int)params->run_mode);
1097     #endif
1098    
1099     }
1100     else
1101     {
1102     /* Create an empty File - before indexing to make sure can write to the index */
1103     sw->indexlist->DB = (void *) DB_Create(sw, sw->indexlist->line);
1104     if ( sw->lasterror )
1105     SwishAbortLastError( sw );
1106     }
1107    
1108    
1109     /* This should be printed by the module that's reading the source */
1110     if (sw->verbose >= 1)
1111     printf("Indexing Data Source: \"%s\"\n", IndexingDataSource->IndexingDataSourceName);
1112    
1113     tmpswline = sw->dirlist;
1114     while (tmpswline != NULL)
1115     {
1116     if (sw->verbose)
1117     {
1118     printf("Indexing \"%s\"\n", tmpswline->line);
1119     fflush(stdout);
1120     }
1121     indexpath(sw, tmpswline->line);
1122     tmpswline = tmpswline->next;
1123     }
1124    
1125    
1126     Mem_Summary("After indexing", 0);
1127    
1128    
1129     if (sw->verbose > 1)
1130     putchar('\n');
1131    
1132    
1133     if (sw->verbose)
1134     printf("Removing very common words...\n");
1135    
1136     fflush(stdout);
1137    
1138     write_index_file( sw, 1, elapsedStart, cpuStart, 0, params->run_mode == MODE_UPDATE?1:0);
1139     }
1140    
1141    
1142     /*************************************************************************
1143     * MERGE: prepare index files for merging, and call merge.c
1144     *
1145     * Most of this should probably be in merge.c
1146     *
1147     **************************************************************************/
1148     static void cmd_merge( SWISH *sw_input, CMDPARAMS *params )
1149     {
1150     SWISH *sw_out;
1151     double elapsedStart = TimeElapsed();
1152     double cpuStart = TimeCPU();
1153    
1154     if ( params->index_read_only )
1155     progerr("Sorry, this program is in readonly mode");
1156    
1157    
1158     if (!sw_input->indexlist)
1159     progerr("Failed to list any input files for merging");
1160    
1161    
1162     /* Open all the index files for reading */
1163     if ( !SwishAttach(sw_input) )
1164     SwishAbortLastError( sw_input );
1165    
1166    
1167     /* Check output file */
1168     if ( !params->merge_out_file )
1169     progerr("Failed to provide merge output file");
1170    
1171     if ( isfile(params->merge_out_file) )
1172     progerr("Merge output file '%s' already exists. Won't overwrite.\n", params->merge_out_file);
1173    
1174     /* create output */
1175     sw_out = SwishNew();
1176    
1177     sw_out->indexlist = addindexfile(sw_out->indexlist, params->merge_out_file);
1178    
1179    
1180     /* Update Economic mode */
1181     sw_out->Index->swap_locdata = params->swap_mode;
1182    
1183    
1184     /* Create an empty File - before indexing to make sure can write to the index */
1185     sw_out->indexlist->DB = (void *) DB_Create(sw_out, params->merge_out_file);
1186     if ( sw_out->lasterror )
1187     SwishAbortLastError( sw_out );
1188    
1189    
1190     merge_indexes( sw_input, sw_out );
1191    
1192     write_index_file( sw_out, 0, elapsedStart, cpuStart, 1, 0);
1193    
1194     SwishClose( sw_out );
1195    
1196     efree( params->merge_out_file );
1197     }
1198    
1199    
1200     /*************************************************************************
1201     * Displays all the words staring with params->keychar
1202     *
1203     **************************************************************************/
1204     static void cmd_keywords( SWISH *sw, CMDPARAMS *params )
1205     {
1206     if (!sw->indexlist)
1207     sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1208    
1209     OutputKeyChar(sw, (int) (unsigned char) params->keychar);
1210     }
1211    
1212    
1213     /*************************************************************************
1214     * Runs a swish query
1215     *
1216     **************************************************************************/
1217     static void cmd_search( SWISH *sw, CMDPARAMS *params )
1218     {
1219     int rc = 0;
1220     double elapsedStart = TimeElapsed();
1221     double elapsedSearchStart;
1222     double elapsedEnd;
1223    
1224    
1225     /* Set default index file, if none specified */
1226     if (!sw->indexlist)
1227     sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1228    
1229    
1230     /* Set the result sort order */
1231    
1232     if ( params->tmpsortprops )
1233     {
1234     int sortmode = -1; /* Ascendind by default */
1235     struct swline *tmplist;
1236     char *field;
1237    
1238     for (tmplist = params->tmpsortprops; tmplist; tmplist = tmplist->next)
1239     {
1240     field = tmplist->line;
1241     if (tmplist->next)
1242     {
1243     if (!strcasecmp(tmplist->next->line, "asc"))
1244     {
1245     sortmode = -1; /* asc sort */
1246     tmplist = tmplist->next;
1247     }
1248     else if (!strcasecmp(tmplist->next->line, "desc"))
1249     {
1250     sortmode = 1; /* desc sort */
1251     tmplist = tmplist->next;
1252     }
1253     }
1254     addSearchResultSortProperty(sw, field, sortmode);
1255     }
1256     }
1257    
1258    
1259     if (sw->Search->maxhits <= 0)
1260     sw->Search->maxhits = -1;
1261    
1262     if ( !SwishAttach(sw) )
1263     SwishAbortLastError( sw );
1264    
1265    
1266     resultHeaderOut(sw, 1, "%s\n", INDEXHEADER);
1267    
1268     /* print out "original" search words */
1269     resultHeaderOut(sw, 1, "# Search words: %s\n", params->wordlist);
1270    
1271    
1272    
1273     /* Get starting time */
1274     elapsedSearchStart = TimeElapsed();
1275    
1276     rc = search(sw, params->wordlist, params->structure);
1277    
1278     if ( rc < 0 )
1279     SwishAbortLastError( sw );
1280    
1281     resultHeaderOut(sw, 2, "#\n");
1282    
1283     if (rc > 0)
1284     {
1285     resultHeaderOut(sw, 1, "# Number of hits: %d\n", rc);
1286    
1287     elapsedEnd = TimeElapsed();
1288     resultHeaderOut(sw, 1, "# Search time: %0.3f seconds\n", elapsedEnd - elapsedSearchStart);
1289     resultHeaderOut(sw, 1, "# Run time: %0.3f seconds\n", elapsedEnd - elapsedStart);
1290     printSortedResults(sw);
1291     resultHeaderOut(sw, 1, ".\n");
1292     }
1293     else if (!rc )
1294     resultHeaderOut(sw, 1, "err: no results\n.\n");
1295    
1296    
1297     }
1298    
1299    
1300    
1301     /*************************************************************************
1302     * write_index_file -- used for both merge and for indexing
1303     *
1304     **************************************************************************/
1305    
1306     static void write_index_file( SWISH *sw, int process_stopwords, double elapsedStart, double cpuStart, int merge, int is_update)
1307     {
1308     int totalfiles = getfilecount(sw->indexlist);
1309     int stopwords = 0;
1310    
1311     /* Coalesce all remaining locations */
1312     coalesce_all_word_locations(sw, sw->indexlist);
1313    
1314     if ( process_stopwords )
1315     {
1316    
1317     /* Proccess IgnoreLimit option */
1318     getPositionsFromIgnoreLimitWords(sw);
1319    
1320     stopwords = getNumberOfIgnoreLimitWords(sw);
1321    
1322    
1323     if (sw->verbose )
1324     {
1325     if (stopwords)
1326     {
1327     int pos;
1328    
1329     /* 05/00 Jose Ruiz Adjust totalwords for IgnoreLimit ONLY */
1330     /* 2002-07 jmruiz
1331     **This is already done in getPositionsFromIgnoreLimitWords
1332     ** sw->indexlist->header.totalwords -= stopwords;
1333     */
1334    
1335     if (sw->indexlist->header.totalwords < 0)
1336     sw->indexlist->header.totalwords = 0;
1337    
1338     /* Same as "stopwords" */
1339     printf("%d words removed by IgnoreLimit:\n", sw->indexlist->header.stopPos);
1340    
1341     for (pos = 0; pos < sw->indexlist->header.stopPos; pos++)
1342     printf("%s, ", sw->indexlist->header.stopList[pos]);
1343    
1344     printf("\n");
1345     }
1346     else
1347     printf("no words removed.\n");
1348    
1349     }
1350     }
1351    
1352     if (sw->verbose)
1353     printf("Writing main index...\n");
1354    
1355     if ( !sw->indexlist->header.totalwords )
1356     /* Would be better to flag so db_native would know not to rename the (empty) index file */
1357     // printf("No unique words indexed!\n");
1358     progerr("No unique words indexed!");
1359    
1360     else
1361     {
1362    
1363    
1364     if (sw->verbose)
1365     printf("Sorting words ...\n");
1366    
1367     sort_words(sw, sw->indexlist);
1368    
1369    
1370    
1371     if (sw->verbose)
1372     printf("Writing header ...\n");
1373     fflush(stdout);
1374    
1375     write_header(sw, &sw->indexlist->header, sw->indexlist->DB, sw->indexlist->line, sw->indexlist->header.totalwords, totalfiles, merge);
1376    
1377     fflush(stdout);
1378    
1379     if (sw->verbose)
1380     printf("Writing index entries ...\n");
1381    
1382    
1383     write_index(sw, sw->indexlist);
1384    
1385    
1386     if (sw->verbose)
1387     printf("%d unique word%s indexed.\n", sw->indexlist->header.totalwords, (sw->indexlist->header.totalwords == 1) ? "" : "s");
1388    
1389    
1390     /* Sort properties -> Better search performance */
1391    
1392     /* First reopen the property file in read only mode for seek speed */
1393     DB_Reopen_PropertiesForRead( sw, sw->indexlist->DB );
1394     if ( sw->lasterror )
1395     SwishAbortLastError( sw );
1396    
1397     /* This does it all */
1398     sortFileProperties(sw,sw->indexlist);
1399     }
1400    
1401    
1402    
1403    
1404     if (sw->verbose)
1405     {
1406     if (totalfiles)
1407     printf("%d file%s indexed. %lu total bytes. %lu total words.\n",
1408     totalfiles, (totalfiles == 1) ? "" : "s", sw->indexlist->total_bytes, sw->indexlist->total_word_positions);
1409     else
1410     printf("no files indexed.\n");
1411    
1412     printf("Elapsed time: ");
1413     printTime(TimeElapsed() - elapsedStart);
1414     printf(" CPU time: ");
1415     printTime(TimeCPU() - cpuStart);
1416     printf("\n");
1417    
1418     printf("Indexing done!\n");
1419     }
1420    
1421    
1422     #ifdef INDEXPERMS
1423     chmod(sw->indexlist->line, INDEXPERMS);
1424     #endif
1425     }
1426    

  ViewVC Help
Powered by ViewVC 1.1.22