/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swish.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Error occurred while calculating annotation data.
Importing web-site building process.

1 /*
2 $Id: swish.c,v 1.98 2002/08/22 22:58:39 whmoseley Exp $
3 **
4 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5 **
6 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
7 **
8 ** This program and library is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU (Library) General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU (Library) General Public License for more details.
17 **
18 ** You should have received a copy of the GNU (Library) General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 **---------------------------------------------------------
22 */
23
24
25 #include <limits.h> // for ULONG_MAX
26 #include "swish.h"
27 #include "mem.h"
28 #include "string.h"
29 #include "error.h"
30 #include "list.h"
31 #include "search.h"
32 #include "index.h"
33 #include "file.h"
34 #include "http.h"
35 #include "merge.h"
36 #include "docprop.h"
37 #include "metanames.h"
38 #include "parse_conffile.h"
39 #include "result_output.h"
40 #include "result_sort.h"
41 #include "keychar_out.h"
42 #include "date_time.h"
43 #include "db.h"
44 #include "fs.h"
45 #include "dump.h"
46
47 #include "proplimit.h"
48
49
50 /*
51 ** This array has pointers to all the indexing data source
52 ** structures
53 */
54 extern struct _indexing_data_source_def *data_sources[];
55
56
57
58
59
/* One row of the -T debugging option table. */
typedef struct
{
    char         *name;         /* option word matched (ignoring case) against the -T argument */
    unsigned int  bit;          /* value returned by isDebugWord() for this word */
    char         *description;  /* text listed by '-T help' */
}
DEBUG_MAP;
67
68 static DEBUG_MAP debug_map[] = {
69 /* These dump data from the index file */
70 {"INDEX_HEADER", DEBUG_INDEX_HEADER, "Show the headers from the index"},
71 {"INDEX_WORDS", DEBUG_INDEX_WORDS, "List words stored in index"},
72 {"INDEX_WORDS_ONLY", DEBUG_INDEX_WORDS_ONLY, "List only words, one per line, stored in index"},
73 {"INDEX_WORDS_META", DEBUG_INDEX_WORDS_META, "List only words and associated metaID separated by a tab"},
74 {"INDEX_WORDS_FULL", DEBUG_INDEX_WORDS_FULL, "List words stored in index (more verbose)"},
75 {"INDEX_STOPWORDS", DEBUG_INDEX_STOPWORDS, "List stopwords stored in index"},
76 {"INDEX_FILES", DEBUG_INDEX_FILES, "List file data stored in index"},
77 {"INDEX_METANAMES", DEBUG_INDEX_METANAMES, "List metaname table stored in index"},
78 {"INDEX_ALL", DEBUG_INDEX_ALL, "Dump data ALL above data from index file\n\n-- indexing --\n"},
79
80 /* These trace indexing */
81 {"INDEXED_WORDS", DEBUG_WORDS, "Display words as they are indexed"},
82 {"PARSED_WORDS", DEBUG_PARSED_WORDS, "Display words as they are parsed from source"},
83 {"PROPERTIES", DEBUG_PROPERTIES, "Display properties associted with each file as they are indexed"},
84 {"REGEX", DEBUG_REGEX, "Debug regular expression processing"},
85 {"PARSED_TAGS", DEBUG_PARSED_TAGS, "Show meta tags as they are found"},
86 {"PARSED_TEXT", DEBUG_PARSED_TEXT, "Show text as it's parsed"},
87 };
88
89
/* Run modes the program can be started in; main() dispatches on this value */
typedef enum
{
    MODE_SEARCH,    /* run a query (the default mode) */
    MODE_INDEX,     /* build an index */
    MODE_DUMP,      /* dump the contents of index file(s) */
    MODE_WORDS,     /* list index words by first character (-k) */
    MODE_MERGE,     /* merge index files (-M) */
    MODE_UPDATE     /* update an existing index (-u) */
}
CMD_MODE;
100
101
102 /* Parameters read from the command line, that are not stored in *SWISH */
103 typedef struct
104 {
105 CMD_MODE run_mode; /* selected run mode */
106 char *wordlist; /* list of -w words */
107 char keychar; /* for dumping words */
108
109 struct swline *tmpsortprops; /* sort properties */
110 struct swline *conflist; /* Configuration file list */
111
112 int hasverbose; /* flag if -v was used */
113
114 int index_read_only; /* flag to not allow indexing or merging */
115 int swap_mode;
116 int structure; /* where in file to search */
117
118 char *merge_out_file; /* the output file for merge */
119
120 }
121 CMDPARAMS;
122
123
124 /************* TOC ***************************************/
125 static CMDPARAMS *new_swish_params();
126 static void printTime(double time);
127 static void get_command_line_params(SWISH *sw, char **argv, CMDPARAMS *params );
128 static void free_command_line_params( CMDPARAMS *params );
129 static unsigned int isDebugWord(char *word, CMDPARAMS *params );
130 static void printversion();
131 static void usage();
132 static int check_readonly_mode( char * );
133
134 static void cmd_dump( SWISH *sw, CMDPARAMS *params );
135 static void cmd_index( SWISH *sw, CMDPARAMS *params );
136 static void cmd_merge( SWISH *sw, CMDPARAMS *params );
137 static void cmd_search( SWISH *sw, CMDPARAMS *params );
138 static void cmd_keywords( SWISH *sw, CMDPARAMS *params );
139 static void write_index_file( SWISH *sw, int process_stopwords, double elapsedStart, double cpuStart, int merge, int is_update);
140 /************* TOC ***************************************/
141
142
143 int main(int argc, char **argv)
144 {
145 SWISH *sw;
146 CMDPARAMS *params;
147
148 setlocale(LC_CTYPE, "");
149
150
151
152 /* Start a session */
153 sw = SwishNew(); /* Get swish handle */
154
155
156
157 /* By default we are set up to use the first data source in the list */
158 /* I don't like this. modules.c would fix this */
159 IndexingDataSource = data_sources[0];
160
161
162
163
164 params = new_swish_params();
165 get_command_line_params(sw, argv, params );
166
167 switch( params->run_mode )
168 {
169 case MODE_DUMP:
170 cmd_dump( sw, params ); /* first so will override */
171 break;
172
173 case MODE_MERGE:
174 cmd_merge( sw, params );
175 break;
176
177 case MODE_INDEX:
178 case MODE_UPDATE:
179 cmd_index( sw, params );
180 break;
181
182 case MODE_SEARCH:
183 cmd_search( sw, params );
184 break;
185
186 case MODE_WORDS:
187 cmd_keywords( sw ,params ); /* -k setting */
188 break;
189
190
191 default:
192 progerr("Invalid operation mode '%d'", (int)params->run_mode);
193 }
194
195 free_command_line_params( params );
196
197 SwishClose(sw);
198
199 Mem_Summary("At end of program", 1);
200
201 exit(0);
202
203 return 0;
204 }
205
/*
** Prints a duration given in seconds as "hh:mm:ss" on stdout.
** The value is rounded to the nearest whole second first; no newline is
** written.
*/

static void printTime(double time)
{
    int total_seconds = (int) (time + 0.5);
    int total_minutes = total_seconds / 60;

    printf("%02d:%02d:%02d", total_minutes / 60, total_minutes % 60, total_seconds % 60);
}
225
226 /* Prints the SWISH usage.
227 */
228
229 static void usage()
230 {
231 const char *defaultIndexingSystem = "";
232
233 printf(" usage:\n");
234 printf(" swish [-e] [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)]\n");
235 printf(" swish -w word1 word2 ... [-f file1 file2 ...] \\\n");
236 printf(" [-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \\\n");
237 printf(" [-m num] [-t str] [-d delim] [-H (num)] [-x output_format]\n");
238 printf(" swish -k (char|*) [-f file1 file2 ...]\n");
239 printf(" swish -M index1 index2 ... outputfile\n");
240 printf(" swish -N /path/to/compare/file\n");
241 printf(" swish -V\n");
242 putchar('\n');
243 printf("options: defaults are in brackets\n");
244
245
246 printf(" -S : specify which indexing system to use.\n");
247 printf(" Valid options are:\n");
248 #ifdef ALLOW_FILESYSTEM_INDEXING_DATA_SOURCE
249 printf(" \"fs\" - index local files in your File System\n");
250 if (!*defaultIndexingSystem)
251 defaultIndexingSystem = "fs";
252 #endif
253
254 #ifdef ALLOW_HTTP_INDEXING_DATA_SOURCE
255 printf(" \"http\" - index web site files using a web crawler\n");
256 if (!*defaultIndexingSystem)
257 defaultIndexingSystem = "http";
258 #endif
259
260 #ifdef ALLOW_EXTERNAL_PROGRAM_DATA_SOURCE
261 printf(" \"prog\" - index files supplied by an external program\n");
262
263 if (!*defaultIndexingSystem)
264 defaultIndexingSystem = "http";
265 #endif
266
267 printf(" The default value is: \"%s\"\n", defaultIndexingSystem);
268
269 printf(" -i : create an index from the specified files\n");
270 printf(" -w : search for words \"word1 word2 ...\"\n");
271 printf(" -t : tags to search in - specify as a string\n");
272 printf(" \"HBthec\" - in Head|Body|title|header|emphasized|comments\n");
273 printf(" -f : index file to create or file(s) to search from [%s]\n", INDEXFILE);
274 printf(" -c : configuration file(s) to use for indexing\n");
275 printf(" -v : indexing verbosity level (0 to 3) [-v %d]\n", VERBOSE);
276 printf(" -T : Trace options ('-T help' for info\n");
277 printf(" -l : follow symbolic links when indexing\n");
278 printf(" -b : begin results at this number\n");
279 printf(" -m : the maximum number of results to return [defaults to all results]\n");
280 printf(" -M : merges index files\n");
281 printf(" -N : index only files with a modification date newer than path supplied\n");
282 printf(" -p : include these document properties in the output \"prop1 prop2 ...\"\n");
283 printf(" -s : sort by these document properties in the output \"prop1 prop2 ...\"\n");
284 printf(" -d : next param is delimiter.\n");
285 printf(" -P : next param is Phrase delimiter.\n");
286 printf(" -V : prints the current version\n");
287 printf(" -e : \"Economic Mode\": The index proccess uses less RAM.\n");
288 printf(" -x : \"Extended Output Format\": Specify the output format.\n");
289 printf(" -H : \"Result Header Output\": verbosity (0 to 9) [1].\n");
290 printf(" -k : Print words starting with a given char.\n");
291 printf(" -E : Append errors to file specified, or stderr if file not specified.\n");
292 printf("\n");
293 printf("version: %s docs: http://swish-e.org\n", SWISH_VERSION);
294 exit(1);
295 }
296
297 static void printversion()
298 {
299 printf("SWISH-E %s\n", SWISH_VERSION );
300 exit(0);
301 }
302
303
304 /*************************************************************************
305 * Deal with -T debug options
306 *
307 *
308 **************************************************************************/
309
310 static unsigned int isDebugWord(char *word, CMDPARAMS *params)
311 {
312 int i,
313 help;
314
315 help = strcasecmp(word, "help") == 0;
316
317 if (help)
318 printf("\nAvailable debugging options for swish-e:\n");
319
320 for (i = 0; i < sizeof(debug_map) / sizeof(debug_map[0]); i++)
321 if (help)
322 printf(" %20s => %s\n", debug_map[i].name, debug_map[i].description);
323 else if (strcasecmp(debug_map[i].name, word) == 0)
324 {
325 if (strncasecmp(word, "INDEX_", 6) == 0)
326 params->run_mode = MODE_DUMP;
327
328 return debug_map[i].bit;
329 }
330
331 if (help)
332 exit(1);
333
334 return 0;
335 }
336
337 /*************************************************************************
338 * Initialize the swish command parameters
339 *
340 * Call with:
341 * void
342 *
343 * Returns:
344 * pointer to CMDPARAMS
345 *
346 * To Do:
347 * The swish parameters probably should be groupped by switches and
348 * by config file (and maybe someday also by directory or path or
349 * content-type) and then merged.
350 *
351 **************************************************************************/
352
353 static CMDPARAMS *new_swish_params()
354 {
355 CMDPARAMS *params = (CMDPARAMS *)emalloc( sizeof( CMDPARAMS ) );
356 memset( params, 0, sizeof( CMDPARAMS ) );
357
358 params->run_mode = MODE_SEARCH; /* default run mode */
359 params->structure = IN_FILE; /* look in the file, by default */
360
361 return params;
362 }
363
364
365
366 /*************************************************************************
367 * Free the swish command parameters
368 *
369 * Call with:
370 * *CMDPARAMS
371 *
372 * Returns:
373 * void
374 *
375 * To Do:
376 * The swish parameters probably should be groupped by switches and
377 * by config file (and maybe someday also by directory or path or
378 * content-type) and then merged.
379 *
380 **************************************************************************/
381
382 static void free_command_line_params( CMDPARAMS *params )
383 {
384 if ( params->wordlist )
385 efree( params->wordlist );
386
387 if ( params->tmpsortprops )
388 freeswline( params->tmpsortprops );
389
390 if ( params->conflist )
391 freeswline( params->conflist );
392
393 efree( params );
394 }
395
396
/*************************************************************************
*  Just checks if there is a next word
*
*  Returns the argument that follows *argv when there is one and it does
*  not begin with a dash, otherwise NULL.  argv itself is not advanced.
*
*  Helper function - to be replaced by better command parsing soon...
**************************************************************************/

static char *is_another_param( char **argv )
{
    char *following = argv[1];

    if ( !following )
        return NULL;

    if ( following[0] == '-' )
        return NULL;

    return following;
}
408
/*
** Consumes the next non-switch argument.
** When is_another_param() finds one, *argv is advanced onto it and it is
** returned; otherwise *argv is left alone and NULL is returned.
*/
static char *next_param( char ***argv )
{
    char *value = is_another_param( *argv );

    if ( !value )
        return NULL;

    (*argv)++;
    return value;
}
421
422
/*
** Reads the next argument as a base 10 integer.
**
** Call with:
**    argv - advanced past the argument that was consumed
**    c    - the switch character, used only in error messages
**
** Returns:
**    the converted value
**
** Reports an error through progerr()/progerrno() when the argument is
** missing, is empty, contains a character that is not part of a number,
** or does not fit in an int.  Note that, despite the wording of the first
** message, negative values are accepted.
*/
static int get_param_number(char ***argv, char c )
{
    char   *badchar;
    long    num;
    char   *string = next_param( argv );

    if ( !string )
        progerr(" '-%c' requires a positive integer.", c );

    num = strtol( string, &badchar, 10 ); // would base zero be more flexible?

    if ( num == LONG_MAX || num == LONG_MIN )
        progerrno("Failed to convert '-%c %s' to a number: ", c, string );

    if ( *badchar )
        progerr("Invalid char '%c' found in argument to '-%c %s'", badchar[0], c, string);

    /* Nothing was converted: an empty argument used to be taken as 0 */
    if ( badchar == string )
        progerr(" '-%c' requires a positive integer.", c );

    /* The result is returned as an int, so do not silently truncate a long */
    if ( num > INT_MAX || num < INT_MIN )
        progerr("Value '%s' supplied to '-%c' is out of range.", string, c );

    return (int) num;
}
443
444
445
446
447 /*************************************************************************
448 * Gets the command line parameters, if any, and set values in the CMDPARMAS structure
449 *
450 *
451 * Returns:
452 * void (changes *sw and *params)
453 *
454 * To Do:
455 * This code is horrific. Get a structure to define the parameters, and messages!
456 * Move this into its own module!
457 *
458 * Also, mixes two structres for parameters, SWISH and CMDPARAMS. Not a great setup.
459 *
460 *
461 * I'd like to see a centeral routine for processing switches, and a way for
462 * modules to "register" what config options to parse out by the central routine.
463 *
464 **************************************************************************/
465 static void get_command_line_params(SWISH *sw, char **argv, CMDPARAMS *params )
466 {
467 char c;
468 int lenwordlist = 0;
469 char *w;
470
471 /* not excited about this */
472 params->wordlist = (char *) emalloc((lenwordlist = MAXSTRLEN) + 1);
473 params->wordlist[0] = '\0';
474
475
476
477 params->index_read_only = check_readonly_mode( *argv );
478
479
480
481 if ( !*(argv + 1 ) )
482 progerr("Missing parameter. Use -h for options.", *argv);
483
484 while ( *++argv )
485 {
486
487 if ((*argv)[0] != '-') // every parameter starts with a dash
488 progerr("Missing switch character at '%s'. Use -h for options.", *argv);
489
490 if ( !(c = (*argv)[1] ) ) // get single switch char
491 progerr("Missing switch character at '%s'. Use -h for options.", *argv);
492
493 /* allow joined arguments */
494 if ( (*argv)[2] )
495 {
496 *argv += 2;
497 argv--;
498 }
499
500
501 /* files to index */
502 if (c == 'i')
503 {
504 if ( !is_another_param( argv ) )
505 progerr(" '-i' requires a list of index files.");
506
507 if(params->run_mode != MODE_UPDATE) /* Preserve update mode */
508 params->run_mode = MODE_INDEX;
509
510 while ( (w = next_param( &argv )) )
511 sw->dirlist = addswline(sw->dirlist, w );
512
513 continue;
514 }
515
516
517 /* search words */
518
519 if (c == 'w')
520 {
521 if ( !is_another_param( argv ) )
522 progerr(" '-w' requires list of search words.");
523
524 while ( (w = next_param( &argv )) )
525 {
526 /* don't add blank words */
527 if (w[0] == '\0')
528 continue;
529
530 if ((int)( strlen(params->wordlist) + strlen(" ") + strlen(w) ) >= lenwordlist)
531 {
532 lenwordlist = strlen(params->wordlist) + strlen(" ") + strlen(w) + 200;
533 params->wordlist = (char *) erealloc(params->wordlist, lenwordlist + 1);
534 }
535
536 params->run_mode = MODE_SEARCH;
537 sprintf(params->wordlist, "%s%s%s", params->wordlist, (params->wordlist[0] == '\0') ? "" : " ", w);
538 }
539
540 continue;
541 }
542
543
544
545
546 /* words to dump from index */
547
548 if (c == 'k')
549 {
550 if ( !(w = next_param( &argv )) )
551 progerr(" '-k' requires a character (or '*').");
552
553 if ( strlen( w ) != 1 )
554 progerr(" '-k' requires a character (or '*').");
555
556
557 params->run_mode = MODE_WORDS;
558 params->keychar = w[0];
559
560 continue;
561 }
562
563
564
565 /* Data source */
566
567 else if (c == 'S')
568 {
569 struct _indexing_data_source_def **data_source;
570
571 if ( !(w = next_param( &argv )) )
572 progerr(" '-S' requires a valid data source.");
573
574 for (data_source = data_sources; *data_source != 0; data_source++)
575 if (strcmp(w, (*data_source)->IndexingDataSourceId) == 0)
576 break;
577
578
579 if (!*data_source)
580 progerr("Unknown -S option \"%s\"", w);
581 else
582 IndexingDataSource = *data_source;
583
584 continue;
585 }
586
587
588
589 /* sort properties */
590
591 if (c == 's')
592 {
593 if ( !is_another_param( argv ) )
594 progerr(" '-s' requires list of sort properties.");
595
596 while ( (w = next_param( &argv )) )
597 params->tmpsortprops = addswline(params->tmpsortprops, w);
598
599 continue;
600 }
601
602
603 /* display properties */
604
605 if (c == 'p')
606 {
607 if ( !is_another_param( argv ) )
608 progerr(" '-p' requires list of properties.");
609
610 while ( (w = next_param( &argv )) )
611 addSearchResultDisplayProperty(sw, w);
612
613 continue;
614 }
615
616
617
618 /* Set limit values */
619
620 if (c == 'L')
621 {
622 if ( !( is_another_param( argv ) && is_another_param( argv + 1 ) && is_another_param( argv + 2 )) )
623 progerr("-L requires three parameters <propname> <lorange> <highrange>");
624
625 if ( !SetLimitParameter(sw, argv[1], argv[2], argv[3]) )
626 SwishAbortLastError( sw );
627
628 argv += 3;
629
630 continue;
631 }
632
633
634
635 /* Index file(s) selection */
636
637 if (c == 'f')
638 {
639 if ( !is_another_param( argv ) )
640 progerr(" '-f' requires list of index files.");
641
642 while ( (w = next_param( &argv )) )
643 sw->indexlist = addindexfile(sw->indexlist, w);
644
645 continue;
646 }
647
648
649 /* config file list */
650
651 if (c == 'c')
652 {
653 if ( !is_another_param( argv ) )
654 progerr(" '-c' requires one or more configuration files.");
655
656 if(params->run_mode != MODE_UPDATE) /* Preserve update mode */
657 params->run_mode = MODE_INDEX;
658
659 while ( (w = next_param( &argv )) )
660 params->conflist = addswline(params->conflist, w);
661
662 continue;
663 }
664
665
666
667 /* Follow symbolic links */
668
669 if (c == 'l')
670 {
671 sw->FS->followsymlinks = 1;
672 continue;
673 }
674
675
676 /* Set begin hit location */
677
678 if (c == 'b')
679 {
680 sw->Search->beginhits = get_param_number( &argv, c );
681 continue;
682 }
683
684
685 /* Set max hits */
686
687 if (c == 'm')
688 {
689 sw->Search->maxhits = get_param_number( &argv, c );
690 continue;
691 }
692
693
694
695 /* Save the time for limiting indexing by a file date */
696
697 if (c == 'N')
698 {
699 struct stat stat_buf;
700
701 if ( !(w = next_param( &argv )) )
702 progerr("-N requires a path to a local file");
703
704 if (stat( w, &stat_buf))
705 progerrno("Bad path '%s' specified with -N: ", w );
706
707 sw->mtime_limit = stat_buf.st_mtime;
708
709 continue;
710 }
711
712
713 /* limit by structure */
714
715 if (c == 't')
716 {
717 char * c;
718
719 if ( !(w = next_param( &argv )) )
720 progerr("Specify tag fields (HBtheca).");
721
722
723 params->structure = 0; /* reset to none */
724
725 for ( c = w; *c; c++ )
726 switch ( *c )
727 {
728 case 'H':
729 params->structure |= IN_HEAD;
730 break;
731 case 'B':
732 params->structure |= IN_BODY;
733 break;
734 case 't':
735 params->structure |= IN_TITLE;
736 break;
737 case 'h':
738 params->structure |= IN_HEADER;
739 break;
740 case 'e':
741 params->structure |= IN_EMPHASIZED;
742 break;
743 case 'c':
744 params->structure |= IN_COMMENTS;
745 break;
746 case 'a':
747 params->structure |= IN_ALL;
748 break;
749 default:
750 progerr("-t must only include HBthec. Found '%c'", *c );
751 }
752 continue;
753 }
754
755
756
757
758
759 /* verbose while indexing */
760
761 if (c == 'v')
762 {
763 params->hasverbose = 1;
764 sw->verbose = get_param_number( &argv, c );
765 continue;
766 }
767
768
769
770 /* print the version number */
771
772 if (c == 'V')
773 printversion();
774
775
776
777 /* "z" Huh? */
778
779 if (c == 'z' || c == 'h' || c == '?')
780 usage();
781
782
783
784 /* Merge settings */
785
786 if (c == 'M')
787 {
788 if ( !is_another_param( argv ) )
789 progerr(" '-M' requires an output file name.");
790
791 params->run_mode = MODE_MERGE;
792
793 while ( (w = next_param( &argv )) )
794 {
795 /* Last one listed is the output file */
796 if ( is_another_param( argv ) )
797 sw->indexlist = addindexfile(sw->indexlist, w);
798 else
799 params->merge_out_file = estrdup( w );
800 }
801
802 continue;
803 }
804
805
806
807 /* Debugging options */
808
809 if (c == 'T')
810 {
811 while ( (w = next_param( &argv )) )
812 {
813 unsigned int bit;
814
815 if ((bit = isDebugWord( w, params) ))
816 DEBUG_MASK |= bit;
817 else
818 progerr("Invalid debugging option '%s'. Use '-T help' for help.", w);
819
820 }
821 continue;
822 }
823
824
825
826 /* Set where errors go */
827
828 if (c == 'E')
829 {
830 if ( !is_another_param( argv ) )
831 set_error_handle( stderr ); // -E alone goes to stderr
832
833 else
834 {
835 FILE *f;
836 w = next_param( &argv );
837 f = fopen( w, "a" );
838 if ( !f )
839 progerrno("Failed to open Error file '%s' for appending: ", w );
840
841 set_error_handle( f );
842 }
843
844 continue;
845 }
846
847
848
849 /* Custom Phrase Delimiter - Jose Ruiz 01/00 */
850
851 if (c == 'P')
852 {
853 if ( !(w = next_param( &argv )) )
854 progerr("'-P' requires a phrase delimiter.");
855
856 sw->Search->PhraseDelimiter = (int) w[0];
857 continue;
858 }
859
860
861
862 /* Set the custom delimiter */
863 if (c == 'd')
864 {
865 if ( !(w = next_param( &argv )) )
866 progerr("'-d' requires an output delimiter.");
867
868 sw->ResultOutput->stdResultFieldDelimiter = estrredup(sw->ResultOutput->stdResultFieldDelimiter, w );
869
870 /* This really doesn't work as is probably expected since it's a delimiter and not quoting the fields */
871 if (strcmp(sw->ResultOutput->stdResultFieldDelimiter, "dq") == 0)
872 strcpy( sw->ResultOutput->stdResultFieldDelimiter, "\"" );
873 else
874 {
875 int i,j;
876 int backslash = 0;
877
878 for ( j=0, i=0; i < strlen( w ); i++ )
879 {
880 if ( !backslash )
881 {
882 if ( w[i] == '\\' )
883 {
884 backslash++;
885 continue;
886 }
887 else
888 {
889 sw->ResultOutput->stdResultFieldDelimiter[j++] = w[i];
890 continue;
891 }
892 }
893
894
895 switch ( w[i] )
896 {
897 case 'f':
898 sw->ResultOutput->stdResultFieldDelimiter[j++] = '\f';
899 break;
900 case 'n':
901 sw->ResultOutput->stdResultFieldDelimiter[j++] = '\n';
902 break;
903 case 'r':
904 sw->ResultOutput->stdResultFieldDelimiter[j++] = '\r';
905 break;
906 case 't':
907 sw->ResultOutput->stdResultFieldDelimiter[j++] = '\t';
908 break;
909 case '\\':
910 sw->ResultOutput->stdResultFieldDelimiter[j++] = '\\';
911 sw->ResultOutput->stdResultFieldDelimiter[j++] = '\\';
912 break;
913 default:
914 progerr("Unknown escape sequence '\\%c'. Must be one of \\f \\n \\r \\t \\\\", w[i]);
915 }
916 backslash = 0;
917 }
918 sw->ResultOutput->stdResultFieldDelimiter[j] = '\0';
919 }
920 continue;
921 }
922
923
924
925 /* Econ mode */
926
927 if (c == 'e')
928 {
929 /* Jose Ruiz 09/00 */
930 params->swap_mode = 1; /* "Economic mode": Uses less RAM */
931 /* The proccess is slower: Part of */
932 /* info is preserved in temporal */
933 /* files */
934
935 continue;
936 }
937
938
939 /* $$$ These need better error reporting */
940
941 /* Extended format */
942
943 if (c == 'x')
944 {
945 /* Jose Ruiz 09/00 */
946 /* Search proc will show more info */
947 /* rasc 2001-02 extended -x fmtstr */
948
949 if ( !(w = next_param( &argv )) )
950 progerr("'-x' requires an output format string.");
951
952 {
953 char *s;
954 s = hasResultExtFmtStr(sw, w);
955 sw->ResultOutput->extendedformat = (s) ? s : w;
956 initPrintExtResult(sw, sw->ResultOutput->extendedformat);
957 }
958
959 continue;
960 }
961
962
963
964 /* Search header output control */
965 if (c == 'H')
966 {
967 sw->ResultOutput->headerOutVerbose = get_param_number( &argv, c );
968 continue;
969 }
970
971
972 /* Ignore sorted indexes */
973
974 if (c == 'o')
975 {
976 sw->ResultSort->isPreSorted = 0;
977 continue;
978 }
979
980 /* Update mode jmruiz 2002/03 */
981
982 if (c == 'u')
983 {
984 params->run_mode = MODE_UPDATE;
985 continue;
986 }
987
988 progerr("Unknown switch '-%c'. Use -h for options.", c );
989 }
990 }
991
992
/*************************************************************************
*  Returns true if we think the program is called swish-search
*  offers no real security
*
*  Call with:
*     prog - the name the program was started as (argv[0])
*
*  Returns:
*     1 when prog ends in "swish-search" (compared without regard to case,
*     which is needed for WIN 32), otherwise 0
*
**************************************************************************/
static int check_readonly_mode( char *prog )
{
    static const char   readonly_name[] = "swish-search";
    size_t              name_len = sizeof( readonly_name ) - 1;
    size_t              prog_len = strlen( prog );

    /* Compare lengths rather than pointers: forming a pointer in front of
       the start of prog is undefined behaviour. */
    if ( prog_len < name_len )
        return 0;

    return strcasecmp( prog + (prog_len - name_len), readonly_name ) == 0;
}
1011
1012
1013 /*************************************************************************
1014 * Dumps the index file(s)
1015 *
1016 **************************************************************************/
1017 static void cmd_dump( SWISH *sw, CMDPARAMS *params )
1018 {
1019
1020 /* Set the default index file */
1021 if ( sw->indexlist == NULL )
1022 sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1023
1024 while ( sw->indexlist != NULL )
1025 {
1026
1027 DB_decompress(sw, sw->indexlist);
1028 putchar('\n');
1029
1030 sw->indexlist = sw->indexlist->next;
1031 }
1032 }
1033 /*************************************************************************
1034 * This run the indexing code
1035 *
1036 **************************************************************************/
1037
1038 static void cmd_index( SWISH *sw, CMDPARAMS *params )
1039 {
1040 int hasdir = (sw->dirlist == NULL) ? 0 : 1;
1041 int hasindex = (sw->indexlist == NULL) ? 0 : 1;
1042 double elapsedStart = TimeElapsed();
1043 double cpuStart = TimeCPU();
1044 struct swline *tmpswline;
1045
1046 if ( params->index_read_only )
1047 progerr("Sorry, this program is in readonly mode");
1048
1049
1050 /* Read configuration files */
1051 {
1052 struct swline *tmp = params->conflist;
1053 while ( tmp != NULL)
1054 {
1055 getdefaults(sw, tmp->line, &hasdir, &hasindex, params->hasverbose);
1056 tmp = tmp->next;
1057 }
1058 }
1059
1060
1061 /* Default index file */
1062 if ( sw->indexlist == NULL )
1063 sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1064
1065
1066 if (!hasdir)
1067 progerr("Specify directories or files to index.");
1068
1069
1070 if (sw->verbose < 0)
1071 sw->verbose = 0;
1072
1073 /* Update Economic mode */
1074 sw->Index->swap_locdata = params->swap_mode;
1075
1076
1077 /* Check for UPDATE_MODE jmruiz 2002/03 */
1078 if(params->run_mode == MODE_UPDATE)
1079 {
1080 /* Open the index file for read/write */
1081 sw->indexlist->DB = (void *) DB_Open(sw, sw->indexlist->line,DB_READWRITE);
1082 if ( sw->lasterror )
1083 SwishAbortLastError( sw );
1084
1085
1086 /* Read the header and overwrite the '-c' option and feault values - In other
1087 ** words, the header values are the good ones */
1088 read_header(sw, &sw->indexlist->header, sw->indexlist->DB);
1089 sw->TotalWords = sw->indexlist->header.totalwords;
1090 sw->TotalFiles = sw->indexlist->header.totalfiles;
1091
1092 /* Adjust filenum to totalfiles */
1093 sw->Index->filenum = sw->TotalFiles;
1094
1095 #ifndef USE_BTREE
1096 progerr("Invalid operation mode '%d': Update mode only supported with USE_BTREE feature", (int)params->run_mode);
1097 #endif
1098
1099 }
1100 else
1101 {
1102 /* Create an empty File - before indexing to make sure can write to the index */
1103 sw->indexlist->DB = (void *) DB_Create(sw, sw->indexlist->line);
1104 if ( sw->lasterror )
1105 SwishAbortLastError( sw );
1106 }
1107
1108
1109 /* This should be printed by the module that's reading the source */
1110 if (sw->verbose >= 1)
1111 printf("Indexing Data Source: \"%s\"\n", IndexingDataSource->IndexingDataSourceName);
1112
1113 tmpswline = sw->dirlist;
1114 while (tmpswline != NULL)
1115 {
1116 if (sw->verbose)
1117 {
1118 printf("Indexing \"%s\"\n", tmpswline->line);
1119 fflush(stdout);
1120 }
1121 indexpath(sw, tmpswline->line);
1122 tmpswline = tmpswline->next;
1123 }
1124
1125
1126 Mem_Summary("After indexing", 0);
1127
1128
1129 if (sw->verbose > 1)
1130 putchar('\n');
1131
1132
1133 if (sw->verbose)
1134 printf("Removing very common words...\n");
1135
1136 fflush(stdout);
1137
1138 write_index_file( sw, 1, elapsedStart, cpuStart, 0, params->run_mode == MODE_UPDATE?1:0);
1139 }
1140
1141
1142 /*************************************************************************
1143 * MERGE: prepare index files for merging, and call merge.c
1144 *
1145 * Most of this should probably be in merge.c
1146 *
1147 **************************************************************************/
1148 static void cmd_merge( SWISH *sw_input, CMDPARAMS *params )
1149 {
1150 SWISH *sw_out;
1151 double elapsedStart = TimeElapsed();
1152 double cpuStart = TimeCPU();
1153
1154 if ( params->index_read_only )
1155 progerr("Sorry, this program is in readonly mode");
1156
1157
1158 if (!sw_input->indexlist)
1159 progerr("Failed to list any input files for merging");
1160
1161
1162 /* Open all the index files for reading */
1163 if ( !SwishAttach(sw_input) )
1164 SwishAbortLastError( sw_input );
1165
1166
1167 /* Check output file */
1168 if ( !params->merge_out_file )
1169 progerr("Failed to provide merge output file");
1170
1171 if ( isfile(params->merge_out_file) )
1172 progerr("Merge output file '%s' already exists. Won't overwrite.\n", params->merge_out_file);
1173
1174 /* create output */
1175 sw_out = SwishNew();
1176
1177 sw_out->indexlist = addindexfile(sw_out->indexlist, params->merge_out_file);
1178
1179
1180 /* Update Economic mode */
1181 sw_out->Index->swap_locdata = params->swap_mode;
1182
1183
1184 /* Create an empty File - before indexing to make sure can write to the index */
1185 sw_out->indexlist->DB = (void *) DB_Create(sw_out, params->merge_out_file);
1186 if ( sw_out->lasterror )
1187 SwishAbortLastError( sw_out );
1188
1189
1190 merge_indexes( sw_input, sw_out );
1191
1192 write_index_file( sw_out, 0, elapsedStart, cpuStart, 1, 0);
1193
1194 SwishClose( sw_out );
1195
1196 efree( params->merge_out_file );
1197 }
1198
1199
1200 /*************************************************************************
1201 * Displays all the words staring with params->keychar
1202 *
1203 **************************************************************************/
1204 static void cmd_keywords( SWISH *sw, CMDPARAMS *params )
1205 {
1206 if (!sw->indexlist)
1207 sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1208
1209 OutputKeyChar(sw, (int) (unsigned char) params->keychar);
1210 }
1211
1212
1213 /*************************************************************************
1214 * Runs a swish query
1215 *
1216 **************************************************************************/
1217 static void cmd_search( SWISH *sw, CMDPARAMS *params )
1218 {
1219 int rc = 0;
1220 double elapsedStart = TimeElapsed();
1221 double elapsedSearchStart;
1222 double elapsedEnd;
1223
1224
1225 /* Set default index file, if none specified */
1226 if (!sw->indexlist)
1227 sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);
1228
1229
1230 /* Set the result sort order */
1231
1232 if ( params->tmpsortprops )
1233 {
1234 int sortmode = -1; /* Ascendind by default */
1235 struct swline *tmplist;
1236 char *field;
1237
1238 for (tmplist = params->tmpsortprops; tmplist; tmplist = tmplist->next)
1239 {
1240 field = tmplist->line;
1241 if (tmplist->next)
1242 {
1243 if (!strcasecmp(tmplist->next->line, "asc"))
1244 {
1245 sortmode = -1; /* asc sort */
1246 tmplist = tmplist->next;
1247 }
1248 else if (!strcasecmp(tmplist->next->line, "desc"))
1249 {
1250 sortmode = 1; /* desc sort */
1251 tmplist = tmplist->next;
1252 }
1253 }
1254 addSearchResultSortProperty(sw, field, sortmode);
1255 }
1256 }
1257
1258
1259 if (sw->Search->maxhits <= 0)
1260 sw->Search->maxhits = -1;
1261
1262 if ( !SwishAttach(sw) )
1263 SwishAbortLastError( sw );
1264
1265
1266 resultHeaderOut(sw, 1, "%s\n", INDEXHEADER);
1267
1268 /* print out "original" search words */
1269 resultHeaderOut(sw, 1, "# Search words: %s\n", params->wordlist);
1270
1271
1272
1273 /* Get starting time */
1274 elapsedSearchStart = TimeElapsed();
1275
1276 rc = search(sw, params->wordlist, params->structure);
1277
1278 if ( rc < 0 )
1279 SwishAbortLastError( sw );
1280
1281 resultHeaderOut(sw, 2, "#\n");
1282
1283 if (rc > 0)
1284 {
1285 resultHeaderOut(sw, 1, "# Number of hits: %d\n", rc);
1286
1287 elapsedEnd = TimeElapsed();
1288 resultHeaderOut(sw, 1, "# Search time: %0.3f seconds\n", elapsedEnd - elapsedSearchStart);
1289 resultHeaderOut(sw, 1, "# Run time: %0.3f seconds\n", elapsedEnd - elapsedStart);
1290 printSortedResults(sw);
1291 resultHeaderOut(sw, 1, ".\n");
1292 }
1293 else if (!rc )
1294 resultHeaderOut(sw, 1, "err: no results\n.\n");
1295
1296
1297 }
1298
1299
1300
1301 /*************************************************************************
1302 * write_index_file -- used for both merge and for indexing
1303 *
1304 **************************************************************************/
1305
1306 static void write_index_file( SWISH *sw, int process_stopwords, double elapsedStart, double cpuStart, int merge, int is_update)
1307 {
1308 int totalfiles = getfilecount(sw->indexlist);
1309 int stopwords = 0;
1310
1311 /* Coalesce all remaining locations */
1312 coalesce_all_word_locations(sw, sw->indexlist);
1313
1314 if ( process_stopwords )
1315 {
1316
1317 /* Proccess IgnoreLimit option */
1318 getPositionsFromIgnoreLimitWords(sw);
1319
1320 stopwords = getNumberOfIgnoreLimitWords(sw);
1321
1322
1323 if (sw->verbose )
1324 {
1325 if (stopwords)
1326 {
1327 int pos;
1328
1329 /* 05/00 Jose Ruiz Adjust totalwords for IgnoreLimit ONLY */
1330 /* 2002-07 jmruiz
1331 **This is already done in getPositionsFromIgnoreLimitWords
1332 ** sw->indexlist->header.totalwords -= stopwords;
1333 */
1334
1335 if (sw->indexlist->header.totalwords < 0)
1336 sw->indexlist->header.totalwords = 0;
1337
1338 /* Same as "stopwords" */
1339 printf("%d words removed by IgnoreLimit:\n", sw->indexlist->header.stopPos);
1340
1341 for (pos = 0; pos < sw->indexlist->header.stopPos; pos++)
1342 printf("%s, ", sw->indexlist->header.stopList[pos]);
1343
1344 printf("\n");
1345 }
1346 else
1347 printf("no words removed.\n");
1348
1349 }
1350 }
1351
1352 if (sw->verbose)
1353 printf("Writing main index...\n");
1354
1355 if ( !sw->indexlist->header.totalwords )
1356 /* Would be better to flag so db_native would know not to rename the (empty) index file */
1357 // printf("No unique words indexed!\n");
1358 progerr("No unique words indexed!");
1359
1360 else
1361 {
1362
1363
1364 if (sw->verbose)
1365 printf("Sorting words ...\n");
1366
1367 sort_words(sw, sw->indexlist);
1368
1369
1370
1371 if (sw->verbose)
1372 printf("Writing header ...\n");
1373 fflush(stdout);
1374
1375 write_header(sw, &sw->indexlist->header, sw->indexlist->DB, sw->indexlist->line, sw->indexlist->header.totalwords, totalfiles, merge);
1376
1377 fflush(stdout);
1378
1379 if (sw->verbose)
1380 printf("Writing index entries ...\n");
1381
1382
1383 write_index(sw, sw->indexlist);
1384
1385
1386 if (sw->verbose)
1387 printf("%d unique word%s indexed.\n", sw->indexlist->header.totalwords, (sw->indexlist->header.totalwords == 1) ? "" : "s");
1388
1389
1390 /* Sort properties -> Better search performance */
1391
1392 /* First reopen the property file in read only mode for seek speed */
1393 DB_Reopen_PropertiesForRead( sw, sw->indexlist->DB );
1394 if ( sw->lasterror )
1395 SwishAbortLastError( sw );
1396
1397 /* This does it all */
1398 sortFileProperties(sw,sw->indexlist);
1399 }
1400
1401
1402
1403
1404 if (sw->verbose)
1405 {
1406 if (totalfiles)
1407 printf("%d file%s indexed. %lu total bytes. %lu total words.\n",
1408 totalfiles, (totalfiles == 1) ? "" : "s", sw->indexlist->total_bytes, sw->indexlist->total_word_positions);
1409 else
1410 printf("no files indexed.\n");
1411
1412 printf("Elapsed time: ");
1413 printTime(TimeElapsed() - elapsedStart);
1414 printf(" CPU time: ");
1415 printTime(TimeCPU() - cpuStart);
1416 printf("\n");
1417
1418 printf("Indexing done!\n");
1419 }
1420
1421
1422 #ifdef INDEXPERMS
1423 chmod(sw->indexlist->line, INDEXPERMS);
1424 #endif
1425 }
1426

  ViewVC Help
Powered by ViewVC 1.1.22