1 |
/* |
2 |
$Id: swish_words.c,v 1.15 2002/08/22 22:58:39 whmoseley Exp $ |
3 |
** |
4 |
** This program and library is free software; you can redistribute it and/or |
5 |
** modify it under the terms of the GNU (Library) General Public License |
6 |
** as published by the Free Software Foundation; either version 2 |
7 |
** of the License, or any later version. |
8 |
** |
9 |
** This program is distributed in the hope that it will be useful, |
10 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 |
** GNU (Library) General Public License for more details. |
13 |
** |
14 |
** You should have received a copy of the GNU (Library) General Public License |
15 |
** along with this program; if not, write to the Free Software |
16 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 |
** |
18 |
** |
19 |
** 2001-05-23 moseley created - replaced parser in search.c |
20 |
** |
21 |
** 2001-12-11 moseley, updated to deal with swish operators inside of phrases |
22 |
** Still broken with regard to double-quotes inside of phrases |
23 |
** Very unlikely someone would want to search for a single double quote |
24 |
** within a phrase. It currently works if the double-quotes doesn't have |
25 |
** white space around. Really should tag the words as being operators, or |
26 |
** or "swish words", or let the backslash stay in the query until searching. |
27 |
** |
28 |
*/ |
29 |
|
30 |
#include "swish.h" |
31 |
#include "mem.h" |
32 |
#include "string.h" |
33 |
#include "search.h" |
34 |
#include "index.h" |
35 |
#include "file.h" |
36 |
#include "list.h" |
37 |
#include "hash.h" |
38 |
#include "stemmer.h" |
39 |
#include "soundex.h" |
40 |
#include "double_metaphone.h" |
41 |
#include "error.h" |
42 |
#include "metanames.h" |
43 |
#include "config.h" // for _AND_WORD... |
44 |
#include "search_alt.h" // for AND_WORD... humm maybe needs better organization |
45 |
|
46 |
struct MOD_Swish_Words |
47 |
{ |
48 |
char *word; |
49 |
int lenword; |
50 |
}; |
51 |
|
52 |
/* |
53 |
-- init structures for this module |
54 |
*/ |
55 |
|
56 |
void initModule_Swish_Words (SWISH *sw) |
57 |
{ |
58 |
struct MOD_Swish_Words *self; |
59 |
|
60 |
self = (struct MOD_Swish_Words *) emalloc(sizeof(struct MOD_Swish_Words)); |
61 |
sw->SwishWords = self; |
62 |
|
63 |
/* initialize buffers used by indexstring */ |
64 |
self->word = (char *) emalloc((self->lenword = MAXWORDLEN) + 1); |
65 |
|
66 |
return; |
67 |
} |
68 |
|
69 |
void freeModule_Swish_Words (SWISH *sw) |
70 |
{ |
71 |
struct MOD_Swish_Words *self = sw->SwishWords; |
72 |
|
73 |
efree( self->word ); |
74 |
efree ( self ); |
75 |
sw->SwishWords = NULL; |
76 |
|
77 |
return; |
78 |
} |
79 |
|
80 |
|
81 |
|
82 |
|
83 |
|
84 |
/* Returns true if the character is a search operator */ |
85 |
/* this could be a macro, but gcc is probably smart enough */ |
86 |
|
87 |
static int isSearchOperatorChar( int c, int phrase_delimiter, int inphrase ) |
88 |
{ |
89 |
return inphrase |
90 |
? ( '*' == c || c == phrase_delimiter ) |
91 |
: ( '(' == c || ')' == c || '=' == c || '*' == c || c == phrase_delimiter ); |
92 |
} |
93 |
|
94 |
|
95 |
/* This simply tokenizes by whitespace and by the special characters "()=" */ |
96 |
/* If within a phrase, then just splits by whitespace */ |
97 |
|
98 |
/* Funny how argv was joined into a string just to be split again... */ |
99 |
|
100 |
static int next_token( char **buf, char **word, int *lenword, int phrase_delimiter, int inphrase ) |
101 |
{ |
102 |
int i; |
103 |
int backslash; |
104 |
|
105 |
**word = '\0'; |
106 |
|
107 |
/* skip any leading whitespace */ |
108 |
while ( **buf && isspace( (unsigned char) **buf) ) |
109 |
(*buf)++; |
110 |
|
111 |
|
112 |
/* extract out word */ |
113 |
i = 0; |
114 |
backslash = 0; |
115 |
|
116 |
while ( **buf && !isspace( (unsigned char) **buf) ) |
117 |
{ |
118 |
|
119 |
// This should be looking at swish words, not raw input |
120 |
//if ( i > max_size + 4 ) /* leave a little room for operators */ |
121 |
// progerr( "Search word exceeded maxwordlimit setting." ); |
122 |
|
123 |
|
124 |
/* reallocate buffer, if needed -- only if maxwordlimit was set larger than MAXWORDLEN (1000) */ |
125 |
if ( i == *lenword ) |
126 |
{ |
127 |
*lenword *= 2; |
128 |
*word = erealloc(*word, *lenword + 1); |
129 |
} |
130 |
|
131 |
|
132 |
/* backslash says take next char as-is */ |
133 |
/* note that you cannot backslash whitespace */ |
134 |
if ( '\\' == **buf && ! backslash++ ) |
135 |
{ |
136 |
(*buf)++; |
137 |
continue; |
138 |
} |
139 |
|
140 |
|
141 |
if ( backslash || !isSearchOperatorChar( (unsigned char) **buf, phrase_delimiter, inphrase ) ) |
142 |
{ |
143 |
backslash = 0; |
144 |
|
145 |
(*word)[i++] = **buf; /* can't this be done in one line? */ |
146 |
(*buf)++; |
147 |
} |
148 |
|
149 |
else /* this is a search operator char */ |
150 |
{ |
151 |
if ( **word ) /* break if characters already found - end of this token */ |
152 |
break; |
153 |
|
154 |
(*word)[i++] = **buf; /* save the search operator char as it's own token, and end. */ |
155 |
(*buf)++; |
156 |
break; |
157 |
} |
158 |
|
159 |
} |
160 |
|
161 |
|
162 |
/* flag if we found a token */ |
163 |
if ( i ) |
164 |
{ |
165 |
(*word)[i] = '\0'; |
166 |
return 1; |
167 |
} |
168 |
|
169 |
return 0; |
170 |
} |
171 |
|
172 |
|
173 |
static int next_swish_word(INDEXDATAHEADER *header, char **buf, char **word, int *lenword ) |
174 |
{ |
175 |
int i; |
176 |
|
177 |
/* skip non-wordchars */ |
178 |
while ( **buf && !header->wordcharslookuptable[tolower((unsigned char)(**buf))] ) |
179 |
(*buf)++; |
180 |
|
181 |
i = 0; |
182 |
while ( **buf && header->wordcharslookuptable[tolower((unsigned char)(**buf))] ) |
183 |
{ |
184 |
/* reallocate buffer, if needed */ |
185 |
if ( i + 1 == *lenword ) |
186 |
{ |
187 |
*lenword *= 2; |
188 |
*word = erealloc(*word, *lenword + 1); |
189 |
} |
190 |
|
191 |
(*word)[i++] = **buf; |
192 |
(*word)[i] = '\0'; |
193 |
(*buf)++; |
194 |
} |
195 |
|
196 |
|
197 |
if ( i ) |
198 |
{ |
199 |
stripIgnoreLastChars( header, *word); |
200 |
stripIgnoreFirstChars(header, *word); |
201 |
|
202 |
|
203 |
return **word ? 1 : 0; |
204 |
} |
205 |
|
206 |
return 0; |
207 |
} |
208 |
|
209 |
|
210 |
/* Convert a word into swish words */ |
211 |
|
212 |
static struct swline *parse_swish_words( SWISH *sw, INDEXDATAHEADER *header, char *word, int max_size ) |
213 |
{ |
214 |
struct swline *swish_words = NULL; |
215 |
char *curpos; |
216 |
struct MOD_Swish_Words *self = sw->SwishWords; |
217 |
|
218 |
|
219 |
|
220 |
/* Some initial adjusting of the word */ |
221 |
|
222 |
|
223 |
TranslateChars(header->translatecharslookuptable, (unsigned char *)word); |
224 |
|
225 |
|
226 |
|
227 |
curpos = word; |
228 |
while( next_swish_word( header, &curpos, &self->word, &self->lenword ) ) |
229 |
{ |
230 |
/* Check Begin & EndCharacters */ |
231 |
if (!header->begincharslookuptable[(int) ((unsigned char) self->word[0])]) |
232 |
continue; |
233 |
|
234 |
|
235 |
if (!header->endcharslookuptable[(int) ((unsigned char) self->word[strlen(self->word) - 1])]) |
236 |
continue; |
237 |
|
238 |
|
239 |
/* limit by stopwords, min/max length, max number of digits, ... */ |
240 |
/* ------- processed elsewhere for search --------- |
241 |
if (!isokword(sw, self->word, indexf)) |
242 |
continue; |
243 |
- stopwords are processed in search.c because removing them may have side effects |
244 |
- maxwordlen is checked when first tokenizing for security reasons |
245 |
- limit by vowels, consonants and digits is not needed since search will just fail |
246 |
----------- */ |
247 |
if ( strlen( self->word ) > max_size ) |
248 |
{ |
249 |
sw->lasterror = SEARCH_WORD_TOO_BIG; |
250 |
return NULL; |
251 |
} |
252 |
|
253 |
if (!*self->word) |
254 |
continue; |
255 |
|
256 |
switch ( header->fuzzy_mode ) |
257 |
{ |
258 |
case FUZZY_NONE: |
259 |
swish_words = (struct swline *) addswline( swish_words, self->word ); |
260 |
break; |
261 |
|
262 |
case FUZZY_STEMMING: |
263 |
Stem(&self->word, &self->lenword); |
264 |
if ( *self->word ) // should not happen |
265 |
swish_words = (struct swline *) addswline( swish_words, self->word ); |
266 |
break; |
267 |
|
268 |
|
269 |
case FUZZY_SOUNDEX: |
270 |
soundex(self->word); |
271 |
if ( *self->word ) |
272 |
swish_words = (struct swline *) addswline( swish_words, self->word ); |
273 |
break; |
274 |
|
275 |
case FUZZY_METAPHONE: |
276 |
case FUZZY_DOUBLE_METAPHONE: |
277 |
{ |
278 |
char *codes[2]; |
279 |
DoubleMetaphone(self->word, codes); |
280 |
|
281 |
if ( !(*codes[0]) ) |
282 |
{ |
283 |
efree( codes[0] ); |
284 |
efree( codes[1] ); |
285 |
swish_words = (struct swline *) addswline( swish_words, self->word ); |
286 |
break; |
287 |
} |
288 |
|
289 |
|
290 |
/* check if just METAPHONE or only one word returned (e.g. they are the same) */ |
291 |
|
292 |
if ( header->fuzzy_mode == FUZZY_METAPHONE || !(*codes[1]) || !strcmp(codes[0], codes[1]) ) |
293 |
{ |
294 |
swish_words = (struct swline *) addswline( swish_words, codes[0] ); |
295 |
} |
296 |
else |
297 |
{ |
298 |
/* yuck! */ |
299 |
swish_words = (struct swline *) addswline( swish_words, "(" ); |
300 |
swish_words = (struct swline *) addswline( swish_words, codes[0] ); |
301 |
swish_words = (struct swline *) addswline( swish_words, "or" ); |
302 |
swish_words = (struct swline *) addswline( swish_words, codes[1] ); |
303 |
swish_words = (struct swline *) addswline( swish_words, ")" ); |
304 |
} |
305 |
|
306 |
efree( codes[0] ); |
307 |
efree( codes[1] ); |
308 |
} |
309 |
break; |
310 |
|
311 |
|
312 |
default: |
313 |
progerr("Invalid FuzzyMode '%d'", (int)header->fuzzy_mode ); |
314 |
} |
315 |
} |
316 |
|
317 |
return swish_words; |
318 |
|
319 |
|
320 |
} |
321 |
|
322 |
/* This is really dumb. swline needs a ->prev entry, really search needs its own linked list */ |
323 |
/* Replaces a given node with another node (or nodes) */ |
324 |
|
325 |
static void replace_swline( struct swline **original, struct swline *entry, struct swline *new_words ) |
326 |
{ |
327 |
struct swline *temp; |
328 |
|
329 |
|
330 |
temp = *original; |
331 |
|
332 |
|
333 |
/* check for case of first one */ |
334 |
if ( temp == entry ) |
335 |
{ |
336 |
if ( new_words ) |
337 |
{ |
338 |
new_words->nodep->next = temp->next; |
339 |
new_words->nodep = temp->nodep; |
340 |
*original = new_words; |
341 |
} |
342 |
else /* just delete first node */ |
343 |
{ |
344 |
if ( entry->next ) |
345 |
entry->next->nodep = entry->nodep; /* point next one to last one */ |
346 |
*original = entry->next; |
347 |
} |
348 |
} |
349 |
|
350 |
|
351 |
|
352 |
else /* not first node */ |
353 |
{ |
354 |
/* search for the preceeding node */ |
355 |
for ( temp = *original; temp && temp->next != entry; temp = temp->next ); |
356 |
|
357 |
if ( !temp ) |
358 |
progerr("Fatal Error: Failed to find insert point in replace_swline"); |
359 |
|
360 |
if ( new_words ) |
361 |
{ |
362 |
/* set the previous record to point to the start of the new entry (or entries) */ |
363 |
temp->next = new_words; |
364 |
|
365 |
/* set the end of the new string to point to the next entry */ |
366 |
new_words->nodep->next = entry->next; |
367 |
} |
368 |
else /* delete the entry */ |
369 |
temp->next = temp->next->next; |
370 |
} |
371 |
|
372 |
|
373 |
/* now free the removed item */ |
374 |
efree( entry->line ); |
375 |
efree( entry ); |
376 |
|
377 |
|
378 |
} |
379 |
|
380 |
|
381 |
static int checkbuzzword(INDEXDATAHEADER *header, char *word ) |
382 |
{ |
383 |
if ( !header->buzzwords_used_flag ) |
384 |
return 0; |
385 |
|
386 |
|
387 |
/* only strip when buzzwords are being used since stripped again as a "swish word" */ |
388 |
stripIgnoreLastChars( header, word ); |
389 |
stripIgnoreFirstChars( header, word ); |
390 |
|
391 |
if ( !*word ) /* stripped clean? */ |
392 |
return 0; |
393 |
|
394 |
|
395 |
return isbuzzword( header, word); |
396 |
} |
397 |
|
398 |
/* I hope this doesn't live too long */ |
399 |
|
400 |
static void fudge_wildcard( struct swline **original, struct swline *entry ) |
401 |
{ |
402 |
char *tmp; |
403 |
struct swline *wild_card; |
404 |
|
405 |
wild_card = entry->next; |
406 |
|
407 |
/* reallocate a string */ |
408 |
tmp = entry->line; |
409 |
entry->line = emalloc( strlen( entry->line ) + 2 ); |
410 |
strcpy( entry->line, tmp); |
411 |
efree( tmp ); |
412 |
strcat( entry->line, "*"); |
413 |
|
414 |
efree( wild_card->line ); |
415 |
|
416 |
|
417 |
/* removing last entry - set pointer to new end */ |
418 |
if ( (*original)->nodep == wild_card ) |
419 |
(*original)->nodep = entry; |
420 |
|
421 |
/* and point next to the one after next */ |
422 |
entry->next = wild_card->next; |
423 |
|
424 |
|
425 |
efree( wild_card ); |
426 |
} |
427 |
|
428 |
|
429 |
|
430 |
/******************** Public Functions *********************************/ |
431 |
|
432 |
char *isBooleanOperatorWord( char * word ) |
433 |
{ |
434 |
/* don't need strcasecmp here, since word should alrady be lowercase -- need to check alt-search first */ |
435 |
if (!strcasecmp( word, _AND_WORD)) |
436 |
return AND_WORD; |
437 |
|
438 |
if (!strcasecmp( word, _OR_WORD)) |
439 |
return OR_WORD; |
440 |
|
441 |
if (!strcasecmp( word, _NOT_WORD)) |
442 |
return NOT_WORD; |
443 |
|
444 |
return (char *)NULL; |
445 |
} |
446 |
|
447 |
|
448 |
|
449 |
struct swline *tokenize_query_string( SWISH *sw, char *words, INDEXDATAHEADER *header ) |
450 |
{ |
451 |
char *curpos; /* current position in the words string */ |
452 |
struct swline *tokens = NULL; |
453 |
struct swline *temp; |
454 |
struct swline *swish_words; |
455 |
struct swline *next_node; |
456 |
struct MOD_Swish_Words *self = sw->SwishWords; |
457 |
struct MOD_Search *srch = sw->Search; |
458 |
unsigned char PhraseDelimiter; |
459 |
int max_size; |
460 |
int inphrase = 0; |
461 |
|
462 |
|
463 |
/* Probably won't get to this point */ |
464 |
if ( !words || !*words ) |
465 |
{ |
466 |
sw->lasterror = NO_WORDS_IN_SEARCH; |
467 |
return NULL; |
468 |
} |
469 |
|
470 |
|
471 |
PhraseDelimiter = (unsigned char) srch->PhraseDelimiter; |
472 |
max_size = header->maxwordlimit; |
473 |
|
474 |
curpos = words; |
475 |
|
476 |
/* split into words by whitespace and by the swish operator characters */ |
477 |
|
478 |
while ( next_token( &curpos, &self->word, &self->lenword, PhraseDelimiter, inphrase ) ) |
479 |
{ |
480 |
tokens = (struct swline *) addswline( tokens, self->word ); |
481 |
|
482 |
|
483 |
if ( self->word[0] == PhraseDelimiter && !self->word[1] ) |
484 |
inphrase = !inphrase; |
485 |
} |
486 |
|
487 |
|
488 |
/* no search words found */ |
489 |
if ( !tokens ) |
490 |
return tokens; |
491 |
|
492 |
|
493 |
inphrase = 0; |
494 |
|
495 |
temp = tokens; |
496 |
while ( temp ) |
497 |
{ |
498 |
|
499 |
/* do look-ahead processing first -- metanames */ |
500 |
|
501 |
if ( !inphrase && isMetaNameOpNext(temp->next) ) |
502 |
{ |
503 |
|
504 |
if( !getMetaNameByName( header, temp->line ) ) |
505 |
{ |
506 |
set_progerr( UNKNOWN_METANAME, sw, "'%s'", temp->line ); |
507 |
freeswline( tokens ); |
508 |
return NULL; |
509 |
} |
510 |
|
511 |
|
512 |
/* this might be an option with XML */ |
513 |
strtolower( temp->line ); |
514 |
|
515 |
temp = temp->next; |
516 |
continue; |
517 |
} |
518 |
|
519 |
|
520 |
/* skip operators */ |
521 |
if ( strlen( temp->line ) == 1 && isSearchOperatorChar( (unsigned char) temp->line[0], PhraseDelimiter, inphrase ) ) |
522 |
{ |
523 |
|
524 |
if ( temp->line[0] == PhraseDelimiter && !temp->line[1] ) |
525 |
inphrase = !inphrase; |
526 |
|
527 |
temp = temp->next; |
528 |
continue; |
529 |
} |
530 |
|
531 |
/* this might be an option if case sensitive searches are used */ |
532 |
strtolower( temp->line ); |
533 |
|
534 |
|
535 |
/* check Boolean operators -- currently doesn't change it (search.c does) */ |
536 |
if ( !inphrase ) |
537 |
{ |
538 |
char *operator, *nextoperator; |
539 |
|
540 |
if ( (operator = isBooleanOperatorWord( temp->line )) ) |
541 |
{ |
542 |
/* replace the common "and not" with simply not" */ |
543 |
/* probably not the best place to do this level of processing */ |
544 |
/* since should also check for things like "and this" and "and and and not this" */ |
545 |
/* should probably be moved to end and recursively check for these (to catch "and and not") */ |
546 |
if ( |
547 |
temp->next |
548 |
&& ( strcmp( operator, AND_WORD ) == 0) |
549 |
&& ( (nextoperator = isBooleanOperatorWord( temp->next->line))) |
550 |
&& ( strcmp( nextoperator, NOT_WORD ) == 0) |
551 |
) { |
552 |
struct swline *andword = temp; /* save position */ |
553 |
|
554 |
temp = temp->next; /* now point to "not" word */ |
555 |
replace_swline( &tokens, andword, (struct swline *)NULL ); /* cut it out */ |
556 |
continue; |
557 |
} |
558 |
|
559 |
temp = temp->next; |
560 |
continue; |
561 |
} |
562 |
} |
563 |
|
564 |
|
565 |
/* buzzwords */ |
566 |
if ( checkbuzzword( header, temp->line ) ) |
567 |
{ |
568 |
temp = temp->next; |
569 |
continue; |
570 |
} |
571 |
|
572 |
|
573 |
|
574 |
/* query words left. Turn into "swish_words" */ |
575 |
swish_words = NULL; |
576 |
swish_words = parse_swish_words( sw, header, temp->line, max_size); |
577 |
|
578 |
if ( sw->lasterror ) |
579 |
return NULL; |
580 |
|
581 |
|
582 |
next_node = temp->next; |
583 |
|
584 |
/* move into list.c at some point */ |
585 |
replace_swline( &tokens, temp, swish_words ); |
586 |
temp = next_node; |
587 |
|
588 |
} |
589 |
|
590 |
/* fudge wild cards back onto preceeding word */ |
591 |
for ( temp = tokens ; temp; temp = temp->next ) |
592 |
if ( temp->next && strcmp( temp->next->line, "*") == 0 ) |
593 |
fudge_wildcard( &tokens, temp ); |
594 |
|
595 |
|
596 |
return tokens; |
597 |
} |
598 |
|