1 |
/* |
2 |
$Id: swish.c,v 1.98 2002/08/22 22:58:39 whmoseley Exp $ |
3 |
** |
4 |
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
5 |
** |
6 |
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
7 |
** |
8 |
** This program and library is free software; you can redistribute it and/or |
9 |
** modify it under the terms of the GNU (Library) General Public License |
10 |
** as published by the Free Software Foundation; either version 2 |
11 |
** of the License, or any later version. |
12 |
** |
13 |
** This program is distributed in the hope that it will be useful, |
14 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
** GNU (Library) General Public License for more details. |
17 |
** |
18 |
** You should have received a copy of the GNU (Library) General Public License |
19 |
** along with this program; if not, write to the Free Software |
20 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
21 |
**--------------------------------------------------------- |
22 |
*/ |
23 |
|
24 |
|
25 |
#include <limits.h> // for ULONG_MAX |
26 |
#include "swish.h" |
27 |
#include "mem.h" |
28 |
#include "string.h" |
29 |
#include "error.h" |
30 |
#include "list.h" |
31 |
#include "search.h" |
32 |
#include "index.h" |
33 |
#include "file.h" |
34 |
#include "http.h" |
35 |
#include "merge.h" |
36 |
#include "docprop.h" |
37 |
#include "metanames.h" |
38 |
#include "parse_conffile.h" |
39 |
#include "result_output.h" |
40 |
#include "result_sort.h" |
41 |
#include "keychar_out.h" |
42 |
#include "date_time.h" |
43 |
#include "db.h" |
44 |
#include "fs.h" |
45 |
#include "dump.h" |
46 |
|
47 |
#include "proplimit.h" |
48 |
|
49 |
|
50 |
/*
** Table of pointers to every indexing data source definition.
** The table itself is defined in another translation unit.  In this
** file main() takes entry 0 as the default source, and the -S switch
** walks the table until it reaches a null entry.
*/
extern struct _indexing_data_source_def *data_sources[];
55 |
|
56 |
|
57 |
|
58 |
|
59 |
|
60 |
/*
** One row of the -T debug option table: the keyword typed on the
** command line, the flag value it selects, and its help text.
*/
typedef struct
{
    char           *name;           /* option keyword, compared without regard to case */
    unsigned int    bit;            /* value returned by isDebugWord() for this keyword */
    char           *description;    /* text printed next to the keyword by '-T help' */
}
DEBUG_MAP;
67 |
|
68 |
static DEBUG_MAP debug_map[] = { |
69 |
/* These dump data from the index file */ |
70 |
{"INDEX_HEADER", DEBUG_INDEX_HEADER, "Show the headers from the index"}, |
71 |
{"INDEX_WORDS", DEBUG_INDEX_WORDS, "List words stored in index"}, |
72 |
{"INDEX_WORDS_ONLY", DEBUG_INDEX_WORDS_ONLY, "List only words, one per line, stored in index"}, |
73 |
{"INDEX_WORDS_META", DEBUG_INDEX_WORDS_META, "List only words and associated metaID separated by a tab"}, |
74 |
{"INDEX_WORDS_FULL", DEBUG_INDEX_WORDS_FULL, "List words stored in index (more verbose)"}, |
75 |
{"INDEX_STOPWORDS", DEBUG_INDEX_STOPWORDS, "List stopwords stored in index"}, |
76 |
{"INDEX_FILES", DEBUG_INDEX_FILES, "List file data stored in index"}, |
77 |
{"INDEX_METANAMES", DEBUG_INDEX_METANAMES, "List metaname table stored in index"}, |
78 |
{"INDEX_ALL", DEBUG_INDEX_ALL, "Dump data ALL above data from index file\n\n-- indexing --\n"}, |
79 |
|
80 |
/* These trace indexing */ |
81 |
{"INDEXED_WORDS", DEBUG_WORDS, "Display words as they are indexed"}, |
82 |
{"PARSED_WORDS", DEBUG_PARSED_WORDS, "Display words as they are parsed from source"}, |
83 |
{"PROPERTIES", DEBUG_PROPERTIES, "Display properties associted with each file as they are indexed"}, |
84 |
{"REGEX", DEBUG_REGEX, "Debug regular expression processing"}, |
85 |
{"PARSED_TAGS", DEBUG_PARSED_TAGS, "Show meta tags as they are found"}, |
86 |
{"PARSED_TEXT", DEBUG_PARSED_TEXT, "Show text as it's parsed"}, |
87 |
}; |
88 |
|
89 |
|
90 |
/* Run modes the command line can select; main() dispatches on this value */
typedef enum {
    MODE_SEARCH,        /* the default, and what -w selects */
    MODE_INDEX,         /* selected by -i or -c (unless -u was already seen) */
    MODE_DUMP,          /* selected by any -T INDEX_* keyword */
    MODE_WORDS,         /* selected by -k */
    MODE_MERGE,         /* selected by -M */
    MODE_UPDATE         /* selected by -u */
}
CMD_MODE;
100 |
|
101 |
|
102 |
/* Parameters read from the command line, that are not stored in *SWISH */ |
103 |
typedef struct |
104 |
{ |
105 |
CMD_MODE run_mode; /* selected run mode */ |
106 |
char *wordlist; /* list of -w words */ |
107 |
char keychar; /* for dumping words */ |
108 |
|
109 |
struct swline *tmpsortprops; /* sort properties */ |
110 |
struct swline *conflist; /* Configuration file list */ |
111 |
|
112 |
int hasverbose; /* flag if -v was used */ |
113 |
|
114 |
int index_read_only; /* flag to not allow indexing or merging */ |
115 |
int swap_mode; |
116 |
int structure; /* where in file to search */ |
117 |
|
118 |
char *merge_out_file; /* the output file for merge */ |
119 |
|
120 |
} |
121 |
CMDPARAMS; |
122 |
|
123 |
|
124 |
/************* TOC ***************************************/ |
125 |
static CMDPARAMS *new_swish_params(); |
126 |
static void printTime(double time); |
127 |
static void get_command_line_params(SWISH *sw, char **argv, CMDPARAMS *params ); |
128 |
static void free_command_line_params( CMDPARAMS *params ); |
129 |
static unsigned int isDebugWord(char *word, CMDPARAMS *params ); |
130 |
static void printversion(); |
131 |
static void usage(); |
132 |
static int check_readonly_mode( char * ); |
133 |
|
134 |
static void cmd_dump( SWISH *sw, CMDPARAMS *params ); |
135 |
static void cmd_index( SWISH *sw, CMDPARAMS *params ); |
136 |
static void cmd_merge( SWISH *sw, CMDPARAMS *params ); |
137 |
static void cmd_search( SWISH *sw, CMDPARAMS *params ); |
138 |
static void cmd_keywords( SWISH *sw, CMDPARAMS *params ); |
139 |
static void write_index_file( SWISH *sw, int process_stopwords, double elapsedStart, double cpuStart, int merge, int is_update); |
140 |
/************* TOC ***************************************/ |
141 |
|
142 |
|
143 |
int main(int argc, char **argv) |
144 |
{ |
145 |
SWISH *sw; |
146 |
CMDPARAMS *params; |
147 |
|
148 |
setlocale(LC_CTYPE, ""); |
149 |
|
150 |
|
151 |
|
152 |
/* Start a session */ |
153 |
sw = SwishNew(); /* Get swish handle */ |
154 |
|
155 |
|
156 |
|
157 |
/* By default we are set up to use the first data source in the list */ |
158 |
/* I don't like this. modules.c would fix this */ |
159 |
IndexingDataSource = data_sources[0]; |
160 |
|
161 |
|
162 |
|
163 |
|
164 |
params = new_swish_params(); |
165 |
get_command_line_params(sw, argv, params ); |
166 |
|
167 |
switch( params->run_mode ) |
168 |
{ |
169 |
case MODE_DUMP: |
170 |
cmd_dump( sw, params ); /* first so will override */ |
171 |
break; |
172 |
|
173 |
case MODE_MERGE: |
174 |
cmd_merge( sw, params ); |
175 |
break; |
176 |
|
177 |
case MODE_INDEX: |
178 |
case MODE_UPDATE: |
179 |
cmd_index( sw, params ); |
180 |
break; |
181 |
|
182 |
case MODE_SEARCH: |
183 |
cmd_search( sw, params ); |
184 |
break; |
185 |
|
186 |
case MODE_WORDS: |
187 |
cmd_keywords( sw ,params ); /* -k setting */ |
188 |
break; |
189 |
|
190 |
|
191 |
default: |
192 |
progerr("Invalid operation mode '%d'", (int)params->run_mode); |
193 |
} |
194 |
|
195 |
free_command_line_params( params ); |
196 |
|
197 |
SwishClose(sw); |
198 |
|
199 |
Mem_Summary("At end of program", 1); |
200 |
|
201 |
exit(0); |
202 |
|
203 |
return 0; |
204 |
} |
205 |
|
206 |
/*
** Writes a duration to stdout as HH:MM:SS (no trailing newline).
** The value, in seconds, is rounded to the nearest whole second by
** adding 0.5 and truncating.
*/
static void printTime(double time)
{
    const int total_seconds = (int) (time + 0.5);
    const int total_minutes = total_seconds / 60;

    printf("%02d:%02d:%02d",
           total_minutes / 60,      /* hours   */
           total_minutes % 60,      /* minutes */
           total_seconds % 60);     /* seconds */
}
225 |
|
226 |
/* Prints the SWISH usage. |
227 |
*/ |
228 |
|
229 |
static void usage() |
230 |
{ |
231 |
const char *defaultIndexingSystem = ""; |
232 |
|
233 |
printf(" usage:\n"); |
234 |
printf(" swish [-e] [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)]\n"); |
235 |
printf(" swish -w word1 word2 ... [-f file1 file2 ...] \\\n"); |
236 |
printf(" [-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \\\n"); |
237 |
printf(" [-m num] [-t str] [-d delim] [-H (num)] [-x output_format]\n"); |
238 |
printf(" swish -k (char|*) [-f file1 file2 ...]\n"); |
239 |
printf(" swish -M index1 index2 ... outputfile\n"); |
240 |
printf(" swish -N /path/to/compare/file\n"); |
241 |
printf(" swish -V\n"); |
242 |
putchar('\n'); |
243 |
printf("options: defaults are in brackets\n"); |
244 |
|
245 |
|
246 |
printf(" -S : specify which indexing system to use.\n"); |
247 |
printf(" Valid options are:\n"); |
248 |
#ifdef ALLOW_FILESYSTEM_INDEXING_DATA_SOURCE |
249 |
printf(" \"fs\" - index local files in your File System\n"); |
250 |
if (!*defaultIndexingSystem) |
251 |
defaultIndexingSystem = "fs"; |
252 |
#endif |
253 |
|
254 |
#ifdef ALLOW_HTTP_INDEXING_DATA_SOURCE |
255 |
printf(" \"http\" - index web site files using a web crawler\n"); |
256 |
if (!*defaultIndexingSystem) |
257 |
defaultIndexingSystem = "http"; |
258 |
#endif |
259 |
|
260 |
#ifdef ALLOW_EXTERNAL_PROGRAM_DATA_SOURCE |
261 |
printf(" \"prog\" - index files supplied by an external program\n"); |
262 |
|
263 |
if (!*defaultIndexingSystem) |
264 |
defaultIndexingSystem = "http"; |
265 |
#endif |
266 |
|
267 |
printf(" The default value is: \"%s\"\n", defaultIndexingSystem); |
268 |
|
269 |
printf(" -i : create an index from the specified files\n"); |
270 |
printf(" -w : search for words \"word1 word2 ...\"\n"); |
271 |
printf(" -t : tags to search in - specify as a string\n"); |
272 |
printf(" \"HBthec\" - in Head|Body|title|header|emphasized|comments\n"); |
273 |
printf(" -f : index file to create or file(s) to search from [%s]\n", INDEXFILE); |
274 |
printf(" -c : configuration file(s) to use for indexing\n"); |
275 |
printf(" -v : indexing verbosity level (0 to 3) [-v %d]\n", VERBOSE); |
276 |
printf(" -T : Trace options ('-T help' for info\n"); |
277 |
printf(" -l : follow symbolic links when indexing\n"); |
278 |
printf(" -b : begin results at this number\n"); |
279 |
printf(" -m : the maximum number of results to return [defaults to all results]\n"); |
280 |
printf(" -M : merges index files\n"); |
281 |
printf(" -N : index only files with a modification date newer than path supplied\n"); |
282 |
printf(" -p : include these document properties in the output \"prop1 prop2 ...\"\n"); |
283 |
printf(" -s : sort by these document properties in the output \"prop1 prop2 ...\"\n"); |
284 |
printf(" -d : next param is delimiter.\n"); |
285 |
printf(" -P : next param is Phrase delimiter.\n"); |
286 |
printf(" -V : prints the current version\n"); |
287 |
printf(" -e : \"Economic Mode\": The index proccess uses less RAM.\n"); |
288 |
printf(" -x : \"Extended Output Format\": Specify the output format.\n"); |
289 |
printf(" -H : \"Result Header Output\": verbosity (0 to 9) [1].\n"); |
290 |
printf(" -k : Print words starting with a given char.\n"); |
291 |
printf(" -E : Append errors to file specified, or stderr if file not specified.\n"); |
292 |
printf("\n"); |
293 |
printf("version: %s docs: http://swish-e.org\n", SWISH_VERSION); |
294 |
exit(1); |
295 |
} |
296 |
|
297 |
static void printversion() |
298 |
{ |
299 |
printf("SWISH-E %s\n", SWISH_VERSION ); |
300 |
exit(0); |
301 |
} |
302 |
|
303 |
|
304 |
/************************************************************************* |
305 |
* Deal with -T debug options |
306 |
* |
307 |
* |
308 |
**************************************************************************/ |
309 |
|
310 |
static unsigned int isDebugWord(char *word, CMDPARAMS *params) |
311 |
{ |
312 |
int i, |
313 |
help; |
314 |
|
315 |
help = strcasecmp(word, "help") == 0; |
316 |
|
317 |
if (help) |
318 |
printf("\nAvailable debugging options for swish-e:\n"); |
319 |
|
320 |
for (i = 0; i < sizeof(debug_map) / sizeof(debug_map[0]); i++) |
321 |
if (help) |
322 |
printf(" %20s => %s\n", debug_map[i].name, debug_map[i].description); |
323 |
else if (strcasecmp(debug_map[i].name, word) == 0) |
324 |
{ |
325 |
if (strncasecmp(word, "INDEX_", 6) == 0) |
326 |
params->run_mode = MODE_DUMP; |
327 |
|
328 |
return debug_map[i].bit; |
329 |
} |
330 |
|
331 |
if (help) |
332 |
exit(1); |
333 |
|
334 |
return 0; |
335 |
} |
336 |
|
337 |
/************************************************************************* |
338 |
* Initialize the swish command parameters |
339 |
* |
340 |
* Call with: |
341 |
* void |
342 |
* |
343 |
* Returns: |
344 |
* pointer to CMDPARAMS |
345 |
* |
346 |
* To Do: |
347 |
* The swish parameters probably should be groupped by switches and |
348 |
* by config file (and maybe someday also by directory or path or |
349 |
* content-type) and then merged. |
350 |
* |
351 |
**************************************************************************/ |
352 |
|
353 |
static CMDPARAMS *new_swish_params() |
354 |
{ |
355 |
CMDPARAMS *params = (CMDPARAMS *)emalloc( sizeof( CMDPARAMS ) ); |
356 |
memset( params, 0, sizeof( CMDPARAMS ) ); |
357 |
|
358 |
params->run_mode = MODE_SEARCH; /* default run mode */ |
359 |
params->structure = IN_FILE; /* look in the file, by default */ |
360 |
|
361 |
return params; |
362 |
} |
363 |
|
364 |
|
365 |
|
366 |
/************************************************************************* |
367 |
* Free the swish command parameters |
368 |
* |
369 |
* Call with: |
370 |
* *CMDPARAMS |
371 |
* |
372 |
* Returns: |
373 |
* void |
374 |
* |
375 |
* To Do: |
376 |
* The swish parameters probably should be groupped by switches and |
377 |
* by config file (and maybe someday also by directory or path or |
378 |
* content-type) and then merged. |
379 |
* |
380 |
**************************************************************************/ |
381 |
|
382 |
static void free_command_line_params( CMDPARAMS *params ) |
383 |
{ |
384 |
if ( params->wordlist ) |
385 |
efree( params->wordlist ); |
386 |
|
387 |
if ( params->tmpsortprops ) |
388 |
freeswline( params->tmpsortprops ); |
389 |
|
390 |
if ( params->conflist ) |
391 |
freeswline( params->conflist ); |
392 |
|
393 |
efree( params ); |
394 |
} |
395 |
|
396 |
|
397 |
/*************************************************************************
*   Command line helper: peek at the argument after the current one
*
*   Call with:
*       argv - pointer to the current argument slot
*
*   Returns:
*       the following argument when there is one and it does not begin
*       with '-', otherwise NULL.  argv is not advanced.
*
*   First of three small helpers - to be replaced by better command
*   parsing soon...
**************************************************************************/

static char *is_another_param( char **argv )
{
    char *candidate = argv[1];

    /* End of the argument vector */
    if ( candidate == NULL )
        return NULL;

    /* The next argument is a switch, not a value */
    if ( candidate[0] == '-' )
        return NULL;

    return candidate;
}
408 |
|
409 |
/*
** Consume the next argument when it is a value rather than a switch.
**
** When is_another_param() accepts the following argument, *argv is
** advanced onto it and the argument is returned; otherwise *argv is
** left untouched and NULL is returned.
*/
static char *next_param( char ***argv )
{
    char *value = is_another_param( *argv );

    if ( value != NULL )
        ++*argv;

    return value;
}
421 |
|
422 |
|
423 |
/*
** Consume the next argument and convert it to an int.
**
** Call with:
**      argv - address of the caller's argument pointer; advanced past
**             the number when one is present
**      c    - the switch character, used only in error messages
**
** Returns:
**      the decimal value of the argument.
**
** Reports an error through progerr()/progerrno() when the argument is
** missing, empty, contains a non-digit, or does not fit in an int.  The
** code relies on those calls not returning.
*/
static int get_param_number(char ***argv, char c )
{
    char   *badchar;
    long    num;
    char   *string = next_param( argv );

    if ( !string )
        progerr(" '-%c' requires a positive integer.", c );

    num = strtol( string, &badchar, 10 ); // would base zero be more flexible?

    if ( num == LONG_MAX || num == LONG_MIN )
        progerrno("Failed to convert '-%c %s' to a number: ", c, string );

    if ( *badchar )
        progerr("Invalid char '%c' found in argument to '-%c %s'", badchar[0], c, string);

    /* Nothing was converted: the argument is an empty string, not a zero */
    if ( badchar == string )
        progerr(" '-%c' requires a positive integer.", c );

    /* Refuse values the cast below would silently truncate */
    if ( num > INT_MAX || num < INT_MIN )
        progerr("Value '%s' given to '-%c' is out of range.", string, c );

    return (int) num;
}
443 |
|
444 |
|
445 |
|
446 |
|
447 |
/************************************************************************* |
448 |
* Gets the command line parameters, if any, and set values in the CMDPARMAS structure |
449 |
* |
450 |
* |
451 |
* Returns: |
452 |
* void (changes *sw and *params) |
453 |
* |
454 |
* To Do: |
455 |
* This code is horrific. Get a structure to define the parameters, and messages! |
456 |
* Move this into its own module! |
457 |
* |
458 |
* Also, mixes two structres for parameters, SWISH and CMDPARAMS. Not a great setup. |
459 |
* |
460 |
* |
461 |
* I'd like to see a centeral routine for processing switches, and a way for |
462 |
* modules to "register" what config options to parse out by the central routine. |
463 |
* |
464 |
**************************************************************************/ |
465 |
static void get_command_line_params(SWISH *sw, char **argv, CMDPARAMS *params ) |
466 |
{ |
467 |
char c; |
468 |
int lenwordlist = 0; |
469 |
char *w; |
470 |
|
471 |
/* not excited about this */ |
472 |
params->wordlist = (char *) emalloc((lenwordlist = MAXSTRLEN) + 1); |
473 |
params->wordlist[0] = '\0'; |
474 |
|
475 |
|
476 |
|
477 |
params->index_read_only = check_readonly_mode( *argv ); |
478 |
|
479 |
|
480 |
|
481 |
if ( !*(argv + 1 ) ) |
482 |
progerr("Missing parameter. Use -h for options.", *argv); |
483 |
|
484 |
while ( *++argv ) |
485 |
{ |
486 |
|
487 |
if ((*argv)[0] != '-') // every parameter starts with a dash |
488 |
progerr("Missing switch character at '%s'. Use -h for options.", *argv); |
489 |
|
490 |
if ( !(c = (*argv)[1] ) ) // get single switch char |
491 |
progerr("Missing switch character at '%s'. Use -h for options.", *argv); |
492 |
|
493 |
/* allow joined arguments */ |
494 |
if ( (*argv)[2] ) |
495 |
{ |
496 |
*argv += 2; |
497 |
argv--; |
498 |
} |
499 |
|
500 |
|
501 |
/* files to index */ |
502 |
if (c == 'i') |
503 |
{ |
504 |
if ( !is_another_param( argv ) ) |
505 |
progerr(" '-i' requires a list of index files."); |
506 |
|
507 |
if(params->run_mode != MODE_UPDATE) /* Preserve update mode */ |
508 |
params->run_mode = MODE_INDEX; |
509 |
|
510 |
while ( (w = next_param( &argv )) ) |
511 |
sw->dirlist = addswline(sw->dirlist, w ); |
512 |
|
513 |
continue; |
514 |
} |
515 |
|
516 |
|
517 |
/* search words */ |
518 |
|
519 |
if (c == 'w') |
520 |
{ |
521 |
if ( !is_another_param( argv ) ) |
522 |
progerr(" '-w' requires list of search words."); |
523 |
|
524 |
while ( (w = next_param( &argv )) ) |
525 |
{ |
526 |
/* don't add blank words */ |
527 |
if (w[0] == '\0') |
528 |
continue; |
529 |
|
530 |
if ((int)( strlen(params->wordlist) + strlen(" ") + strlen(w) ) >= lenwordlist) |
531 |
{ |
532 |
lenwordlist = strlen(params->wordlist) + strlen(" ") + strlen(w) + 200; |
533 |
params->wordlist = (char *) erealloc(params->wordlist, lenwordlist + 1); |
534 |
} |
535 |
|
536 |
params->run_mode = MODE_SEARCH; |
537 |
sprintf(params->wordlist, "%s%s%s", params->wordlist, (params->wordlist[0] == '\0') ? "" : " ", w); |
538 |
} |
539 |
|
540 |
continue; |
541 |
} |
542 |
|
543 |
|
544 |
|
545 |
|
546 |
/* words to dump from index */ |
547 |
|
548 |
if (c == 'k') |
549 |
{ |
550 |
if ( !(w = next_param( &argv )) ) |
551 |
progerr(" '-k' requires a character (or '*')."); |
552 |
|
553 |
if ( strlen( w ) != 1 ) |
554 |
progerr(" '-k' requires a character (or '*')."); |
555 |
|
556 |
|
557 |
params->run_mode = MODE_WORDS; |
558 |
params->keychar = w[0]; |
559 |
|
560 |
continue; |
561 |
} |
562 |
|
563 |
|
564 |
|
565 |
/* Data source */ |
566 |
|
567 |
else if (c == 'S') |
568 |
{ |
569 |
struct _indexing_data_source_def **data_source; |
570 |
|
571 |
if ( !(w = next_param( &argv )) ) |
572 |
progerr(" '-S' requires a valid data source."); |
573 |
|
574 |
for (data_source = data_sources; *data_source != 0; data_source++) |
575 |
if (strcmp(w, (*data_source)->IndexingDataSourceId) == 0) |
576 |
break; |
577 |
|
578 |
|
579 |
if (!*data_source) |
580 |
progerr("Unknown -S option \"%s\"", w); |
581 |
else |
582 |
IndexingDataSource = *data_source; |
583 |
|
584 |
continue; |
585 |
} |
586 |
|
587 |
|
588 |
|
589 |
/* sort properties */ |
590 |
|
591 |
if (c == 's') |
592 |
{ |
593 |
if ( !is_another_param( argv ) ) |
594 |
progerr(" '-s' requires list of sort properties."); |
595 |
|
596 |
while ( (w = next_param( &argv )) ) |
597 |
params->tmpsortprops = addswline(params->tmpsortprops, w); |
598 |
|
599 |
continue; |
600 |
} |
601 |
|
602 |
|
603 |
/* display properties */ |
604 |
|
605 |
if (c == 'p') |
606 |
{ |
607 |
if ( !is_another_param( argv ) ) |
608 |
progerr(" '-p' requires list of properties."); |
609 |
|
610 |
while ( (w = next_param( &argv )) ) |
611 |
addSearchResultDisplayProperty(sw, w); |
612 |
|
613 |
continue; |
614 |
} |
615 |
|
616 |
|
617 |
|
618 |
/* Set limit values */ |
619 |
|
620 |
if (c == 'L') |
621 |
{ |
622 |
if ( !( is_another_param( argv ) && is_another_param( argv + 1 ) && is_another_param( argv + 2 )) ) |
623 |
progerr("-L requires three parameters <propname> <lorange> <highrange>"); |
624 |
|
625 |
if ( !SetLimitParameter(sw, argv[1], argv[2], argv[3]) ) |
626 |
SwishAbortLastError( sw ); |
627 |
|
628 |
argv += 3; |
629 |
|
630 |
continue; |
631 |
} |
632 |
|
633 |
|
634 |
|
635 |
/* Index file(s) selection */ |
636 |
|
637 |
if (c == 'f') |
638 |
{ |
639 |
if ( !is_another_param( argv ) ) |
640 |
progerr(" '-f' requires list of index files."); |
641 |
|
642 |
while ( (w = next_param( &argv )) ) |
643 |
sw->indexlist = addindexfile(sw->indexlist, w); |
644 |
|
645 |
continue; |
646 |
} |
647 |
|
648 |
|
649 |
/* config file list */ |
650 |
|
651 |
if (c == 'c') |
652 |
{ |
653 |
if ( !is_another_param( argv ) ) |
654 |
progerr(" '-c' requires one or more configuration files."); |
655 |
|
656 |
if(params->run_mode != MODE_UPDATE) /* Preserve update mode */ |
657 |
params->run_mode = MODE_INDEX; |
658 |
|
659 |
while ( (w = next_param( &argv )) ) |
660 |
params->conflist = addswline(params->conflist, w); |
661 |
|
662 |
continue; |
663 |
} |
664 |
|
665 |
|
666 |
|
667 |
/* Follow symbolic links */ |
668 |
|
669 |
if (c == 'l') |
670 |
{ |
671 |
sw->FS->followsymlinks = 1; |
672 |
continue; |
673 |
} |
674 |
|
675 |
|
676 |
/* Set begin hit location */ |
677 |
|
678 |
if (c == 'b') |
679 |
{ |
680 |
sw->Search->beginhits = get_param_number( &argv, c ); |
681 |
continue; |
682 |
} |
683 |
|
684 |
|
685 |
/* Set max hits */ |
686 |
|
687 |
if (c == 'm') |
688 |
{ |
689 |
sw->Search->maxhits = get_param_number( &argv, c ); |
690 |
continue; |
691 |
} |
692 |
|
693 |
|
694 |
|
695 |
/* Save the time for limiting indexing by a file date */ |
696 |
|
697 |
if (c == 'N') |
698 |
{ |
699 |
struct stat stat_buf; |
700 |
|
701 |
if ( !(w = next_param( &argv )) ) |
702 |
progerr("-N requires a path to a local file"); |
703 |
|
704 |
if (stat( w, &stat_buf)) |
705 |
progerrno("Bad path '%s' specified with -N: ", w ); |
706 |
|
707 |
sw->mtime_limit = stat_buf.st_mtime; |
708 |
|
709 |
continue; |
710 |
} |
711 |
|
712 |
|
713 |
/* limit by structure */ |
714 |
|
715 |
if (c == 't') |
716 |
{ |
717 |
char * c; |
718 |
|
719 |
if ( !(w = next_param( &argv )) ) |
720 |
progerr("Specify tag fields (HBtheca)."); |
721 |
|
722 |
|
723 |
params->structure = 0; /* reset to none */ |
724 |
|
725 |
for ( c = w; *c; c++ ) |
726 |
switch ( *c ) |
727 |
{ |
728 |
case 'H': |
729 |
params->structure |= IN_HEAD; |
730 |
break; |
731 |
case 'B': |
732 |
params->structure |= IN_BODY; |
733 |
break; |
734 |
case 't': |
735 |
params->structure |= IN_TITLE; |
736 |
break; |
737 |
case 'h': |
738 |
params->structure |= IN_HEADER; |
739 |
break; |
740 |
case 'e': |
741 |
params->structure |= IN_EMPHASIZED; |
742 |
break; |
743 |
case 'c': |
744 |
params->structure |= IN_COMMENTS; |
745 |
break; |
746 |
case 'a': |
747 |
params->structure |= IN_ALL; |
748 |
break; |
749 |
default: |
750 |
progerr("-t must only include HBthec. Found '%c'", *c ); |
751 |
} |
752 |
continue; |
753 |
} |
754 |
|
755 |
|
756 |
|
757 |
|
758 |
|
759 |
/* verbose while indexing */ |
760 |
|
761 |
if (c == 'v') |
762 |
{ |
763 |
params->hasverbose = 1; |
764 |
sw->verbose = get_param_number( &argv, c ); |
765 |
continue; |
766 |
} |
767 |
|
768 |
|
769 |
|
770 |
/* print the version number */ |
771 |
|
772 |
if (c == 'V') |
773 |
printversion(); |
774 |
|
775 |
|
776 |
|
777 |
/* "z" Huh? */ |
778 |
|
779 |
if (c == 'z' || c == 'h' || c == '?') |
780 |
usage(); |
781 |
|
782 |
|
783 |
|
784 |
/* Merge settings */ |
785 |
|
786 |
if (c == 'M') |
787 |
{ |
788 |
if ( !is_another_param( argv ) ) |
789 |
progerr(" '-M' requires an output file name."); |
790 |
|
791 |
params->run_mode = MODE_MERGE; |
792 |
|
793 |
while ( (w = next_param( &argv )) ) |
794 |
{ |
795 |
/* Last one listed is the output file */ |
796 |
if ( is_another_param( argv ) ) |
797 |
sw->indexlist = addindexfile(sw->indexlist, w); |
798 |
else |
799 |
params->merge_out_file = estrdup( w ); |
800 |
} |
801 |
|
802 |
continue; |
803 |
} |
804 |
|
805 |
|
806 |
|
807 |
/* Debugging options */ |
808 |
|
809 |
if (c == 'T') |
810 |
{ |
811 |
while ( (w = next_param( &argv )) ) |
812 |
{ |
813 |
unsigned int bit; |
814 |
|
815 |
if ((bit = isDebugWord( w, params) )) |
816 |
DEBUG_MASK |= bit; |
817 |
else |
818 |
progerr("Invalid debugging option '%s'. Use '-T help' for help.", w); |
819 |
|
820 |
} |
821 |
continue; |
822 |
} |
823 |
|
824 |
|
825 |
|
826 |
/* Set where errors go */ |
827 |
|
828 |
if (c == 'E') |
829 |
{ |
830 |
if ( !is_another_param( argv ) ) |
831 |
set_error_handle( stderr ); // -E alone goes to stderr |
832 |
|
833 |
else |
834 |
{ |
835 |
FILE *f; |
836 |
w = next_param( &argv ); |
837 |
f = fopen( w, "a" ); |
838 |
if ( !f ) |
839 |
progerrno("Failed to open Error file '%s' for appending: ", w ); |
840 |
|
841 |
set_error_handle( f ); |
842 |
} |
843 |
|
844 |
continue; |
845 |
} |
846 |
|
847 |
|
848 |
|
849 |
/* Custom Phrase Delimiter - Jose Ruiz 01/00 */ |
850 |
|
851 |
if (c == 'P') |
852 |
{ |
853 |
if ( !(w = next_param( &argv )) ) |
854 |
progerr("'-P' requires a phrase delimiter."); |
855 |
|
856 |
sw->Search->PhraseDelimiter = (int) w[0]; |
857 |
continue; |
858 |
} |
859 |
|
860 |
|
861 |
|
862 |
/* Set the custom delimiter */ |
863 |
if (c == 'd') |
864 |
{ |
865 |
if ( !(w = next_param( &argv )) ) |
866 |
progerr("'-d' requires an output delimiter."); |
867 |
|
868 |
sw->ResultOutput->stdResultFieldDelimiter = estrredup(sw->ResultOutput->stdResultFieldDelimiter, w ); |
869 |
|
870 |
/* This really doesn't work as is probably expected since it's a delimiter and not quoting the fields */ |
871 |
if (strcmp(sw->ResultOutput->stdResultFieldDelimiter, "dq") == 0) |
872 |
strcpy( sw->ResultOutput->stdResultFieldDelimiter, "\"" ); |
873 |
else |
874 |
{ |
875 |
int i,j; |
876 |
int backslash = 0; |
877 |
|
878 |
for ( j=0, i=0; i < strlen( w ); i++ ) |
879 |
{ |
880 |
if ( !backslash ) |
881 |
{ |
882 |
if ( w[i] == '\\' ) |
883 |
{ |
884 |
backslash++; |
885 |
continue; |
886 |
} |
887 |
else |
888 |
{ |
889 |
sw->ResultOutput->stdResultFieldDelimiter[j++] = w[i]; |
890 |
continue; |
891 |
} |
892 |
} |
893 |
|
894 |
|
895 |
switch ( w[i] ) |
896 |
{ |
897 |
case 'f': |
898 |
sw->ResultOutput->stdResultFieldDelimiter[j++] = '\f'; |
899 |
break; |
900 |
case 'n': |
901 |
sw->ResultOutput->stdResultFieldDelimiter[j++] = '\n'; |
902 |
break; |
903 |
case 'r': |
904 |
sw->ResultOutput->stdResultFieldDelimiter[j++] = '\r'; |
905 |
break; |
906 |
case 't': |
907 |
sw->ResultOutput->stdResultFieldDelimiter[j++] = '\t'; |
908 |
break; |
909 |
case '\\': |
910 |
sw->ResultOutput->stdResultFieldDelimiter[j++] = '\\'; |
911 |
sw->ResultOutput->stdResultFieldDelimiter[j++] = '\\'; |
912 |
break; |
913 |
default: |
914 |
progerr("Unknown escape sequence '\\%c'. Must be one of \\f \\n \\r \\t \\\\", w[i]); |
915 |
} |
916 |
backslash = 0; |
917 |
} |
918 |
sw->ResultOutput->stdResultFieldDelimiter[j] = '\0'; |
919 |
} |
920 |
continue; |
921 |
} |
922 |
|
923 |
|
924 |
|
925 |
/* Econ mode */ |
926 |
|
927 |
if (c == 'e') |
928 |
{ |
929 |
/* Jose Ruiz 09/00 */ |
930 |
params->swap_mode = 1; /* "Economic mode": Uses less RAM */ |
931 |
/* The proccess is slower: Part of */ |
932 |
/* info is preserved in temporal */ |
933 |
/* files */ |
934 |
|
935 |
continue; |
936 |
} |
937 |
|
938 |
|
939 |
/* $$$ These need better error reporting */ |
940 |
|
941 |
/* Extended format */ |
942 |
|
943 |
if (c == 'x') |
944 |
{ |
945 |
/* Jose Ruiz 09/00 */ |
946 |
/* Search proc will show more info */ |
947 |
/* rasc 2001-02 extended -x fmtstr */ |
948 |
|
949 |
if ( !(w = next_param( &argv )) ) |
950 |
progerr("'-x' requires an output format string."); |
951 |
|
952 |
{ |
953 |
char *s; |
954 |
s = hasResultExtFmtStr(sw, w); |
955 |
sw->ResultOutput->extendedformat = (s) ? s : w; |
956 |
initPrintExtResult(sw, sw->ResultOutput->extendedformat); |
957 |
} |
958 |
|
959 |
continue; |
960 |
} |
961 |
|
962 |
|
963 |
|
964 |
/* Search header output control */ |
965 |
if (c == 'H') |
966 |
{ |
967 |
sw->ResultOutput->headerOutVerbose = get_param_number( &argv, c ); |
968 |
continue; |
969 |
} |
970 |
|
971 |
|
972 |
/* Ignore sorted indexes */ |
973 |
|
974 |
if (c == 'o') |
975 |
{ |
976 |
sw->ResultSort->isPreSorted = 0; |
977 |
continue; |
978 |
} |
979 |
|
980 |
/* Update mode jmruiz 2002/03 */ |
981 |
|
982 |
if (c == 'u') |
983 |
{ |
984 |
params->run_mode = MODE_UPDATE; |
985 |
continue; |
986 |
} |
987 |
|
988 |
progerr("Unknown switch '-%c'. Use -h for options.", c ); |
989 |
} |
990 |
} |
991 |
|
992 |
|
993 |
/*************************************************************************
*   Returns true if we think the program is called swish-search
*   offers no real security
*
*   Call with:
*       prog - the name the program was invoked as (argv[0])
*
*   Returns:
*       1 when prog ends with "swish-search" (compared without regard
*       to case), otherwise 0.  A NULL or too-short name returns 0.
*
**************************************************************************/
static int check_readonly_mode( char *prog )
{
    static const char   readonly_name[] = "swish-search";
    const size_t        name_len = sizeof(readonly_name) - 1;
    size_t              prog_len;

    if ( prog == NULL )
        return 0;

    prog_len = strlen( prog );

    /* Compare lengths rather than pointers: a pointer to before the */
    /* start of prog must never be formed. */
    if ( prog_len < name_len )
        return 0;

    /* We must ignore case for WIN 32 */
    return strcasecmp( prog + (prog_len - name_len), readonly_name ) == 0;
}
1011 |
|
1012 |
|
1013 |
/************************************************************************* |
1014 |
* Dumps the index file(s) |
1015 |
* |
1016 |
**************************************************************************/ |
1017 |
static void cmd_dump( SWISH *sw, CMDPARAMS *params ) |
1018 |
{ |
1019 |
|
1020 |
/* Set the default index file */ |
1021 |
if ( sw->indexlist == NULL ) |
1022 |
sw->indexlist = addindexfile(sw->indexlist, INDEXFILE); |
1023 |
|
1024 |
while ( sw->indexlist != NULL ) |
1025 |
{ |
1026 |
|
1027 |
DB_decompress(sw, sw->indexlist); |
1028 |
putchar('\n'); |
1029 |
|
1030 |
sw->indexlist = sw->indexlist->next; |
1031 |
} |
1032 |
} |
1033 |
/************************************************************************* |
1034 |
* This run the indexing code |
1035 |
* |
1036 |
**************************************************************************/ |
1037 |
|
1038 |
static void cmd_index( SWISH *sw, CMDPARAMS *params )
{
    /*
    ** Index every path in sw->dirlist into the first index of sw->indexlist.
    **
    **   sw     - indexing handle; its dirlist/indexlist may be filled in by
    **            the configuration files named in params->conflist.
    **   params - parsed command line (read-only flag, config list,
    **            swap mode, run mode).
    **
    ** Failures are reported through progerr() or SwishAbortLastError().
    ** NOTE(review): both are assumed not to return - confirm.
    */
    int     hasdir = (sw->dirlist == NULL) ? 0 : 1;
    int     hasindex = (sw->indexlist == NULL) ? 0 : 1;
    double  elapsedStart = TimeElapsed();
    double  cpuStart = TimeCPU();
    struct swline *conf;
    struct swline *path;

    if ( params->index_read_only )
        progerr("Sorry, this program is in readonly mode");


    /* Read configuration files; getdefaults() may update hasdir/hasindex */
    for ( conf = params->conflist; conf != NULL; conf = conf->next )
        getdefaults(sw, conf->line, &hasdir, &hasindex, params->hasverbose);


    /* Default index file */
    if ( sw->indexlist == NULL )
        sw->indexlist = addindexfile(sw->indexlist, INDEXFILE);


    if ( !hasdir )
        progerr("Specify directories or files to index.");


    if ( sw->verbose < 0 )
        sw->verbose = 0;

    /* Update Economic mode */
    sw->Index->swap_locdata = params->swap_mode;


    /* Check for UPDATE_MODE jmruiz 2002/03 */
    if ( params->run_mode == MODE_UPDATE )
    {
#ifndef USE_BTREE
        /* Reject the unsupported mode before the existing index is opened
        ** for writing, so a build without USE_BTREE never touches it. */
        progerr("Invalid operation mode '%d': Update mode only supported with USE_BTREE feature", (int)params->run_mode);
#endif

        /* Open the index file for read/write */
        sw->indexlist->DB = (void *) DB_Open(sw, sw->indexlist->line,DB_READWRITE);
        if ( sw->lasterror )
            SwishAbortLastError( sw );


        /* Read the header and overwrite the '-c' option and default values - In other
        ** words, the header values are the good ones */
        read_header(sw, &sw->indexlist->header, sw->indexlist->DB);
        sw->TotalWords = sw->indexlist->header.totalwords;
        sw->TotalFiles = sw->indexlist->header.totalfiles;

        /* Adjust filenum to totalfiles */
        sw->Index->filenum = sw->TotalFiles;
    }
    else
    {
        /* Create an empty File - before indexing to make sure can write to the index */
        sw->indexlist->DB = (void *) DB_Create(sw, sw->indexlist->line);
        if ( sw->lasterror )
            SwishAbortLastError( sw );
    }


    /* This should be printed by the module that's reading the source */
    if ( sw->verbose >= 1 )
        printf("Indexing Data Source: \"%s\"\n", IndexingDataSource->IndexingDataSourceName);

    for ( path = sw->dirlist; path != NULL; path = path->next )
    {
        if ( sw->verbose )
        {
            printf("Indexing \"%s\"\n", path->line);
            fflush(stdout);
        }
        indexpath(sw, path->line);
    }


    Mem_Summary("After indexing", 0);


    if ( sw->verbose > 1 )
        putchar('\n');


    if ( sw->verbose )
        printf("Removing very common words...\n");

    fflush(stdout);

    /* process_stopwords = 1, merge = 0, is_update set only in update mode */
    write_index_file( sw, 1, elapsedStart, cpuStart, 0, params->run_mode == MODE_UPDATE?1:0);
}
1140 |
|
1141 |
|
1142 |
/************************************************************************* |
1143 |
* MERGE: prepare index files for merging, and call merge.c |
1144 |
* |
1145 |
* Most of this should probably be in merge.c |
1146 |
* |
1147 |
**************************************************************************/ |
1148 |
static void cmd_merge( SWISH *sw_input, CMDPARAMS *params ) |
1149 |
{ |
1150 |
SWISH *sw_out; |
1151 |
double elapsedStart = TimeElapsed(); |
1152 |
double cpuStart = TimeCPU(); |
1153 |
|
1154 |
if ( params->index_read_only ) |
1155 |
progerr("Sorry, this program is in readonly mode"); |
1156 |
|
1157 |
|
1158 |
if (!sw_input->indexlist) |
1159 |
progerr("Failed to list any input files for merging"); |
1160 |
|
1161 |
|
1162 |
/* Open all the index files for reading */ |
1163 |
if ( !SwishAttach(sw_input) ) |
1164 |
SwishAbortLastError( sw_input ); |
1165 |
|
1166 |
|
1167 |
/* Check output file */ |
1168 |
if ( !params->merge_out_file ) |
1169 |
progerr("Failed to provide merge output file"); |
1170 |
|
1171 |
if ( isfile(params->merge_out_file) ) |
1172 |
progerr("Merge output file '%s' already exists. Won't overwrite.\n", params->merge_out_file); |
1173 |
|
1174 |
/* create output */ |
1175 |
sw_out = SwishNew(); |
1176 |
|
1177 |
sw_out->indexlist = addindexfile(sw_out->indexlist, params->merge_out_file); |
1178 |
|
1179 |
|
1180 |
/* Update Economic mode */ |
1181 |
sw_out->Index->swap_locdata = params->swap_mode; |
1182 |
|
1183 |
|
1184 |
/* Create an empty File - before indexing to make sure can write to the index */ |
1185 |
sw_out->indexlist->DB = (void *) DB_Create(sw_out, params->merge_out_file); |
1186 |
if ( sw_out->lasterror ) |
1187 |
SwishAbortLastError( sw_out ); |
1188 |
|
1189 |
|
1190 |
merge_indexes( sw_input, sw_out ); |
1191 |
|
1192 |
write_index_file( sw_out, 0, elapsedStart, cpuStart, 1, 0); |
1193 |
|
1194 |
SwishClose( sw_out ); |
1195 |
|
1196 |
efree( params->merge_out_file ); |
1197 |
} |
1198 |
|
1199 |
|
1200 |
/************************************************************************* |
1201 |
* Displays all the words staring with params->keychar |
1202 |
* |
1203 |
**************************************************************************/ |
1204 |
static void cmd_keywords( SWISH *sw, CMDPARAMS *params ) |
1205 |
{ |
1206 |
if (!sw->indexlist) |
1207 |
sw->indexlist = addindexfile(sw->indexlist, INDEXFILE); |
1208 |
|
1209 |
OutputKeyChar(sw, (int) (unsigned char) params->keychar); |
1210 |
} |
1211 |
|
1212 |
|
1213 |
/************************************************************************* |
1214 |
* Runs a swish query |
1215 |
* |
1216 |
**************************************************************************/ |
1217 |
static void cmd_search( SWISH *sw, CMDPARAMS *params ) |
1218 |
{ |
1219 |
int rc = 0; |
1220 |
double elapsedStart = TimeElapsed(); |
1221 |
double elapsedSearchStart; |
1222 |
double elapsedEnd; |
1223 |
|
1224 |
|
1225 |
/* Set default index file, if none specified */ |
1226 |
if (!sw->indexlist) |
1227 |
sw->indexlist = addindexfile(sw->indexlist, INDEXFILE); |
1228 |
|
1229 |
|
1230 |
/* Set the result sort order */ |
1231 |
|
1232 |
if ( params->tmpsortprops ) |
1233 |
{ |
1234 |
int sortmode = -1; /* Ascendind by default */ |
1235 |
struct swline *tmplist; |
1236 |
char *field; |
1237 |
|
1238 |
for (tmplist = params->tmpsortprops; tmplist; tmplist = tmplist->next) |
1239 |
{ |
1240 |
field = tmplist->line; |
1241 |
if (tmplist->next) |
1242 |
{ |
1243 |
if (!strcasecmp(tmplist->next->line, "asc")) |
1244 |
{ |
1245 |
sortmode = -1; /* asc sort */ |
1246 |
tmplist = tmplist->next; |
1247 |
} |
1248 |
else if (!strcasecmp(tmplist->next->line, "desc")) |
1249 |
{ |
1250 |
sortmode = 1; /* desc sort */ |
1251 |
tmplist = tmplist->next; |
1252 |
} |
1253 |
} |
1254 |
addSearchResultSortProperty(sw, field, sortmode); |
1255 |
} |
1256 |
} |
1257 |
|
1258 |
|
1259 |
if (sw->Search->maxhits <= 0) |
1260 |
sw->Search->maxhits = -1; |
1261 |
|
1262 |
if ( !SwishAttach(sw) ) |
1263 |
SwishAbortLastError( sw ); |
1264 |
|
1265 |
|
1266 |
resultHeaderOut(sw, 1, "%s\n", INDEXHEADER); |
1267 |
|
1268 |
/* print out "original" search words */ |
1269 |
resultHeaderOut(sw, 1, "# Search words: %s\n", params->wordlist); |
1270 |
|
1271 |
|
1272 |
|
1273 |
/* Get starting time */ |
1274 |
elapsedSearchStart = TimeElapsed(); |
1275 |
|
1276 |
rc = search(sw, params->wordlist, params->structure); |
1277 |
|
1278 |
if ( rc < 0 ) |
1279 |
SwishAbortLastError( sw ); |
1280 |
|
1281 |
resultHeaderOut(sw, 2, "#\n"); |
1282 |
|
1283 |
if (rc > 0) |
1284 |
{ |
1285 |
resultHeaderOut(sw, 1, "# Number of hits: %d\n", rc); |
1286 |
|
1287 |
elapsedEnd = TimeElapsed(); |
1288 |
resultHeaderOut(sw, 1, "# Search time: %0.3f seconds\n", elapsedEnd - elapsedSearchStart); |
1289 |
resultHeaderOut(sw, 1, "# Run time: %0.3f seconds\n", elapsedEnd - elapsedStart); |
1290 |
printSortedResults(sw); |
1291 |
resultHeaderOut(sw, 1, ".\n"); |
1292 |
} |
1293 |
else if (!rc ) |
1294 |
resultHeaderOut(sw, 1, "err: no results\n.\n"); |
1295 |
|
1296 |
|
1297 |
} |
1298 |
|
1299 |
|
1300 |
|
1301 |
/************************************************************************* |
1302 |
* write_index_file -- used for both merge and for indexing |
1303 |
* |
1304 |
**************************************************************************/ |
1305 |
|
1306 |
static void write_index_file( SWISH *sw, int process_stopwords, double elapsedStart, double cpuStart, int merge, int is_update) |
1307 |
{ |
1308 |
int totalfiles = getfilecount(sw->indexlist); |
1309 |
int stopwords = 0; |
1310 |
|
1311 |
/* Coalesce all remaining locations */ |
1312 |
coalesce_all_word_locations(sw, sw->indexlist); |
1313 |
|
1314 |
if ( process_stopwords ) |
1315 |
{ |
1316 |
|
1317 |
/* Proccess IgnoreLimit option */ |
1318 |
getPositionsFromIgnoreLimitWords(sw); |
1319 |
|
1320 |
stopwords = getNumberOfIgnoreLimitWords(sw); |
1321 |
|
1322 |
|
1323 |
if (sw->verbose ) |
1324 |
{ |
1325 |
if (stopwords) |
1326 |
{ |
1327 |
int pos; |
1328 |
|
1329 |
/* 05/00 Jose Ruiz Adjust totalwords for IgnoreLimit ONLY */ |
1330 |
/* 2002-07 jmruiz |
1331 |
**This is already done in getPositionsFromIgnoreLimitWords |
1332 |
** sw->indexlist->header.totalwords -= stopwords; |
1333 |
*/ |
1334 |
|
1335 |
if (sw->indexlist->header.totalwords < 0) |
1336 |
sw->indexlist->header.totalwords = 0; |
1337 |
|
1338 |
/* Same as "stopwords" */ |
1339 |
printf("%d words removed by IgnoreLimit:\n", sw->indexlist->header.stopPos); |
1340 |
|
1341 |
for (pos = 0; pos < sw->indexlist->header.stopPos; pos++) |
1342 |
printf("%s, ", sw->indexlist->header.stopList[pos]); |
1343 |
|
1344 |
printf("\n"); |
1345 |
} |
1346 |
else |
1347 |
printf("no words removed.\n"); |
1348 |
|
1349 |
} |
1350 |
} |
1351 |
|
1352 |
if (sw->verbose) |
1353 |
printf("Writing main index...\n"); |
1354 |
|
1355 |
if ( !sw->indexlist->header.totalwords ) |
1356 |
/* Would be better to flag so db_native would know not to rename the (empty) index file */ |
1357 |
// printf("No unique words indexed!\n"); |
1358 |
progerr("No unique words indexed!"); |
1359 |
|
1360 |
else |
1361 |
{ |
1362 |
|
1363 |
|
1364 |
if (sw->verbose) |
1365 |
printf("Sorting words ...\n"); |
1366 |
|
1367 |
sort_words(sw, sw->indexlist); |
1368 |
|
1369 |
|
1370 |
|
1371 |
if (sw->verbose) |
1372 |
printf("Writing header ...\n"); |
1373 |
fflush(stdout); |
1374 |
|
1375 |
write_header(sw, &sw->indexlist->header, sw->indexlist->DB, sw->indexlist->line, sw->indexlist->header.totalwords, totalfiles, merge); |
1376 |
|
1377 |
fflush(stdout); |
1378 |
|
1379 |
if (sw->verbose) |
1380 |
printf("Writing index entries ...\n"); |
1381 |
|
1382 |
|
1383 |
write_index(sw, sw->indexlist); |
1384 |
|
1385 |
|
1386 |
if (sw->verbose) |
1387 |
printf("%d unique word%s indexed.\n", sw->indexlist->header.totalwords, (sw->indexlist->header.totalwords == 1) ? "" : "s"); |
1388 |
|
1389 |
|
1390 |
/* Sort properties -> Better search performance */ |
1391 |
|
1392 |
/* First reopen the property file in read only mode for seek speed */ |
1393 |
DB_Reopen_PropertiesForRead( sw, sw->indexlist->DB ); |
1394 |
if ( sw->lasterror ) |
1395 |
SwishAbortLastError( sw ); |
1396 |
|
1397 |
/* This does it all */ |
1398 |
sortFileProperties(sw,sw->indexlist); |
1399 |
} |
1400 |
|
1401 |
|
1402 |
|
1403 |
|
1404 |
if (sw->verbose) |
1405 |
{ |
1406 |
if (totalfiles) |
1407 |
printf("%d file%s indexed. %lu total bytes. %lu total words.\n", |
1408 |
totalfiles, (totalfiles == 1) ? "" : "s", sw->indexlist->total_bytes, sw->indexlist->total_word_positions); |
1409 |
else |
1410 |
printf("no files indexed.\n"); |
1411 |
|
1412 |
printf("Elapsed time: "); |
1413 |
printTime(TimeElapsed() - elapsedStart); |
1414 |
printf(" CPU time: "); |
1415 |
printTime(TimeCPU() - cpuStart); |
1416 |
printf("\n"); |
1417 |
|
1418 |
printf("Indexing done!\n"); |
1419 |
} |
1420 |
|
1421 |
|
1422 |
#ifdef INDEXPERMS |
1423 |
chmod(sw->indexlist->line, INDEXPERMS); |
1424 |
#endif |
1425 |
} |
1426 |
|