1 |
adcroft |
1.1 |
/* |
2 |
|
|
$Id: swish_words.c,v 1.15 2002/08/22 22:58:39 whmoseley Exp $ |
3 |
|
|
** |
4 |
|
|
** This program and library is free software; you can redistribute it and/or |
5 |
|
|
** modify it under the terms of the GNU (Library) General Public License |
6 |
|
|
** as published by the Free Software Foundation; either version 2 |
7 |
|
|
** of the License, or any later version. |
8 |
|
|
** |
9 |
|
|
** This program is distributed in the hope that it will be useful, |
10 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 |
|
|
** GNU (Library) General Public License for more details. |
13 |
|
|
** |
14 |
|
|
** You should have received a copy of the GNU (Library) General Public License |
15 |
|
|
** along with this program; if not, write to the Free Software |
16 |
|
|
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 |
|
|
** |
18 |
|
|
** |
19 |
|
|
** 2001-05-23 moseley created - replaced parser in search.c |
20 |
|
|
** |
21 |
|
|
** 2001-12-11 moseley, updated to deal with swish operators inside of phrases |
22 |
|
|
** Still broken with regard to double-quotes inside of phrases |
23 |
|
|
** Very unlikely someone would want to search for a single double quote |
24 |
|
|
** within a phrase. It currently works if the double-quotes doesn't have |
25 |
|
|
** white space around. Really should tag the words as being operators, or |
26 |
|
|
** or "swish words", or let the backslash stay in the query until searching. |
27 |
|
|
** |
28 |
|
|
*/ |
29 |
|
|
|
30 |
|
|
#include "swish.h" |
31 |
|
|
#include "mem.h" |
32 |
|
|
#include "string.h" |
33 |
|
|
#include "search.h" |
34 |
|
|
#include "index.h" |
35 |
|
|
#include "file.h" |
36 |
|
|
#include "list.h" |
37 |
|
|
#include "hash.h" |
38 |
|
|
#include "stemmer.h" |
39 |
|
|
#include "soundex.h" |
40 |
|
|
#include "double_metaphone.h" |
41 |
|
|
#include "error.h" |
42 |
|
|
#include "metanames.h" |
43 |
|
|
#include "config.h" // for _AND_WORD... |
44 |
|
|
#include "search_alt.h" // for AND_WORD... humm maybe needs better organization |
45 |
|
|
|
46 |
|
|
struct MOD_Swish_Words |
47 |
|
|
{ |
48 |
|
|
char *word; |
49 |
|
|
int lenword; |
50 |
|
|
}; |
51 |
|
|
|
52 |
|
|
/* |
53 |
|
|
-- init structures for this module |
54 |
|
|
*/ |
55 |
|
|
|
56 |
|
|
void initModule_Swish_Words (SWISH *sw) |
57 |
|
|
{ |
58 |
|
|
struct MOD_Swish_Words *self; |
59 |
|
|
|
60 |
|
|
self = (struct MOD_Swish_Words *) emalloc(sizeof(struct MOD_Swish_Words)); |
61 |
|
|
sw->SwishWords = self; |
62 |
|
|
|
63 |
|
|
/* initialize buffers used by indexstring */ |
64 |
|
|
self->word = (char *) emalloc((self->lenword = MAXWORDLEN) + 1); |
65 |
|
|
|
66 |
|
|
return; |
67 |
|
|
} |
68 |
|
|
|
69 |
|
|
void freeModule_Swish_Words (SWISH *sw) |
70 |
|
|
{ |
71 |
|
|
struct MOD_Swish_Words *self = sw->SwishWords; |
72 |
|
|
|
73 |
|
|
efree( self->word ); |
74 |
|
|
efree ( self ); |
75 |
|
|
sw->SwishWords = NULL; |
76 |
|
|
|
77 |
|
|
return; |
78 |
|
|
} |
79 |
|
|
|
80 |
|
|
|
81 |
|
|
|
82 |
|
|
|
83 |
|
|
|
84 |
|
|
/* Returns true if the character is a search operator */ |
85 |
|
|
/* this could be a macro, but gcc is probably smart enough */ |
86 |
|
|
|
87 |
|
|
static int isSearchOperatorChar( int c, int phrase_delimiter, int inphrase ) |
88 |
|
|
{ |
89 |
|
|
return inphrase |
90 |
|
|
? ( '*' == c || c == phrase_delimiter ) |
91 |
|
|
: ( '(' == c || ')' == c || '=' == c || '*' == c || c == phrase_delimiter ); |
92 |
|
|
} |
93 |
|
|
|
94 |
|
|
|
95 |
|
|
/* This simply tokenizes by whitespace and by the special characters "()=" */ |
96 |
|
|
/* If within a phrase, then just splits by whitespace */ |
97 |
|
|
|
98 |
|
|
/* Funny how argv was joined into a string just to be split again... */ |
99 |
|
|
|
100 |
|
|
static int next_token( char **buf, char **word, int *lenword, int phrase_delimiter, int inphrase ) |
101 |
|
|
{ |
102 |
|
|
int i; |
103 |
|
|
int backslash; |
104 |
|
|
|
105 |
|
|
**word = '\0'; |
106 |
|
|
|
107 |
|
|
/* skip any leading whitespace */ |
108 |
|
|
while ( **buf && isspace( (unsigned char) **buf) ) |
109 |
|
|
(*buf)++; |
110 |
|
|
|
111 |
|
|
|
112 |
|
|
/* extract out word */ |
113 |
|
|
i = 0; |
114 |
|
|
backslash = 0; |
115 |
|
|
|
116 |
|
|
while ( **buf && !isspace( (unsigned char) **buf) ) |
117 |
|
|
{ |
118 |
|
|
|
119 |
|
|
// This should be looking at swish words, not raw input |
120 |
|
|
//if ( i > max_size + 4 ) /* leave a little room for operators */ |
121 |
|
|
// progerr( "Search word exceeded maxwordlimit setting." ); |
122 |
|
|
|
123 |
|
|
|
124 |
|
|
/* reallocate buffer, if needed -- only if maxwordlimit was set larger than MAXWORDLEN (1000) */ |
125 |
|
|
if ( i == *lenword ) |
126 |
|
|
{ |
127 |
|
|
*lenword *= 2; |
128 |
|
|
*word = erealloc(*word, *lenword + 1); |
129 |
|
|
} |
130 |
|
|
|
131 |
|
|
|
132 |
|
|
/* backslash says take next char as-is */ |
133 |
|
|
/* note that you cannot backslash whitespace */ |
134 |
|
|
if ( '\\' == **buf && ! backslash++ ) |
135 |
|
|
{ |
136 |
|
|
(*buf)++; |
137 |
|
|
continue; |
138 |
|
|
} |
139 |
|
|
|
140 |
|
|
|
141 |
|
|
if ( backslash || !isSearchOperatorChar( (unsigned char) **buf, phrase_delimiter, inphrase ) ) |
142 |
|
|
{ |
143 |
|
|
backslash = 0; |
144 |
|
|
|
145 |
|
|
(*word)[i++] = **buf; /* can't this be done in one line? */ |
146 |
|
|
(*buf)++; |
147 |
|
|
} |
148 |
|
|
|
149 |
|
|
else /* this is a search operator char */ |
150 |
|
|
{ |
151 |
|
|
if ( **word ) /* break if characters already found - end of this token */ |
152 |
|
|
break; |
153 |
|
|
|
154 |
|
|
(*word)[i++] = **buf; /* save the search operator char as it's own token, and end. */ |
155 |
|
|
(*buf)++; |
156 |
|
|
break; |
157 |
|
|
} |
158 |
|
|
|
159 |
|
|
} |
160 |
|
|
|
161 |
|
|
|
162 |
|
|
/* flag if we found a token */ |
163 |
|
|
if ( i ) |
164 |
|
|
{ |
165 |
|
|
(*word)[i] = '\0'; |
166 |
|
|
return 1; |
167 |
|
|
} |
168 |
|
|
|
169 |
|
|
return 0; |
170 |
|
|
} |
171 |
|
|
|
172 |
|
|
|
173 |
|
|
static int next_swish_word(INDEXDATAHEADER *header, char **buf, char **word, int *lenword ) |
174 |
|
|
{ |
175 |
|
|
int i; |
176 |
|
|
|
177 |
|
|
/* skip non-wordchars */ |
178 |
|
|
while ( **buf && !header->wordcharslookuptable[tolower((unsigned char)(**buf))] ) |
179 |
|
|
(*buf)++; |
180 |
|
|
|
181 |
|
|
i = 0; |
182 |
|
|
while ( **buf && header->wordcharslookuptable[tolower((unsigned char)(**buf))] ) |
183 |
|
|
{ |
184 |
|
|
/* reallocate buffer, if needed */ |
185 |
|
|
if ( i + 1 == *lenword ) |
186 |
|
|
{ |
187 |
|
|
*lenword *= 2; |
188 |
|
|
*word = erealloc(*word, *lenword + 1); |
189 |
|
|
} |
190 |
|
|
|
191 |
|
|
(*word)[i++] = **buf; |
192 |
|
|
(*word)[i] = '\0'; |
193 |
|
|
(*buf)++; |
194 |
|
|
} |
195 |
|
|
|
196 |
|
|
|
197 |
|
|
if ( i ) |
198 |
|
|
{ |
199 |
|
|
stripIgnoreLastChars( header, *word); |
200 |
|
|
stripIgnoreFirstChars(header, *word); |
201 |
|
|
|
202 |
|
|
|
203 |
|
|
return **word ? 1 : 0; |
204 |
|
|
} |
205 |
|
|
|
206 |
|
|
return 0; |
207 |
|
|
} |
208 |
|
|
|
209 |
|
|
|
210 |
|
|
/* Convert a word into swish words */ |
211 |
|
|
|
212 |
|
|
static struct swline *parse_swish_words( SWISH *sw, INDEXDATAHEADER *header, char *word, int max_size ) |
213 |
|
|
{ |
214 |
|
|
struct swline *swish_words = NULL; |
215 |
|
|
char *curpos; |
216 |
|
|
struct MOD_Swish_Words *self = sw->SwishWords; |
217 |
|
|
|
218 |
|
|
|
219 |
|
|
|
220 |
|
|
/* Some initial adjusting of the word */ |
221 |
|
|
|
222 |
|
|
|
223 |
|
|
TranslateChars(header->translatecharslookuptable, (unsigned char *)word); |
224 |
|
|
|
225 |
|
|
|
226 |
|
|
|
227 |
|
|
curpos = word; |
228 |
|
|
while( next_swish_word( header, &curpos, &self->word, &self->lenword ) ) |
229 |
|
|
{ |
230 |
|
|
/* Check Begin & EndCharacters */ |
231 |
|
|
if (!header->begincharslookuptable[(int) ((unsigned char) self->word[0])]) |
232 |
|
|
continue; |
233 |
|
|
|
234 |
|
|
|
235 |
|
|
if (!header->endcharslookuptable[(int) ((unsigned char) self->word[strlen(self->word) - 1])]) |
236 |
|
|
continue; |
237 |
|
|
|
238 |
|
|
|
239 |
|
|
/* limit by stopwords, min/max length, max number of digits, ... */ |
240 |
|
|
/* ------- processed elsewhere for search --------- |
241 |
|
|
if (!isokword(sw, self->word, indexf)) |
242 |
|
|
continue; |
243 |
|
|
- stopwords are processed in search.c because removing them may have side effects |
244 |
|
|
- maxwordlen is checked when first tokenizing for security reasons |
245 |
|
|
- limit by vowels, consonants and digits is not needed since search will just fail |
246 |
|
|
----------- */ |
247 |
|
|
if ( strlen( self->word ) > max_size ) |
248 |
|
|
{ |
249 |
|
|
sw->lasterror = SEARCH_WORD_TOO_BIG; |
250 |
|
|
return NULL; |
251 |
|
|
} |
252 |
|
|
|
253 |
|
|
if (!*self->word) |
254 |
|
|
continue; |
255 |
|
|
|
256 |
|
|
switch ( header->fuzzy_mode ) |
257 |
|
|
{ |
258 |
|
|
case FUZZY_NONE: |
259 |
|
|
swish_words = (struct swline *) addswline( swish_words, self->word ); |
260 |
|
|
break; |
261 |
|
|
|
262 |
|
|
case FUZZY_STEMMING: |
263 |
|
|
Stem(&self->word, &self->lenword); |
264 |
|
|
if ( *self->word ) // should not happen |
265 |
|
|
swish_words = (struct swline *) addswline( swish_words, self->word ); |
266 |
|
|
break; |
267 |
|
|
|
268 |
|
|
|
269 |
|
|
case FUZZY_SOUNDEX: |
270 |
|
|
soundex(self->word); |
271 |
|
|
if ( *self->word ) |
272 |
|
|
swish_words = (struct swline *) addswline( swish_words, self->word ); |
273 |
|
|
break; |
274 |
|
|
|
275 |
|
|
case FUZZY_METAPHONE: |
276 |
|
|
case FUZZY_DOUBLE_METAPHONE: |
277 |
|
|
{ |
278 |
|
|
char *codes[2]; |
279 |
|
|
DoubleMetaphone(self->word, codes); |
280 |
|
|
|
281 |
|
|
if ( !(*codes[0]) ) |
282 |
|
|
{ |
283 |
|
|
efree( codes[0] ); |
284 |
|
|
efree( codes[1] ); |
285 |
|
|
swish_words = (struct swline *) addswline( swish_words, self->word ); |
286 |
|
|
break; |
287 |
|
|
} |
288 |
|
|
|
289 |
|
|
|
290 |
|
|
/* check if just METAPHONE or only one word returned (e.g. they are the same) */ |
291 |
|
|
|
292 |
|
|
if ( header->fuzzy_mode == FUZZY_METAPHONE || !(*codes[1]) || !strcmp(codes[0], codes[1]) ) |
293 |
|
|
{ |
294 |
|
|
swish_words = (struct swline *) addswline( swish_words, codes[0] ); |
295 |
|
|
} |
296 |
|
|
else |
297 |
|
|
{ |
298 |
|
|
/* yuck! */ |
299 |
|
|
swish_words = (struct swline *) addswline( swish_words, "(" ); |
300 |
|
|
swish_words = (struct swline *) addswline( swish_words, codes[0] ); |
301 |
|
|
swish_words = (struct swline *) addswline( swish_words, "or" ); |
302 |
|
|
swish_words = (struct swline *) addswline( swish_words, codes[1] ); |
303 |
|
|
swish_words = (struct swline *) addswline( swish_words, ")" ); |
304 |
|
|
} |
305 |
|
|
|
306 |
|
|
efree( codes[0] ); |
307 |
|
|
efree( codes[1] ); |
308 |
|
|
} |
309 |
|
|
break; |
310 |
|
|
|
311 |
|
|
|
312 |
|
|
default: |
313 |
|
|
progerr("Invalid FuzzyMode '%d'", (int)header->fuzzy_mode ); |
314 |
|
|
} |
315 |
|
|
} |
316 |
|
|
|
317 |
|
|
return swish_words; |
318 |
|
|
|
319 |
|
|
|
320 |
|
|
} |
321 |
|
|
|
322 |
|
|
/* This is really dumb. swline needs a ->prev entry, really search needs its own linked list */ |
323 |
|
|
/* Replaces a given node with another node (or nodes) */ |
324 |
|
|
|
325 |
|
|
static void replace_swline( struct swline **original, struct swline *entry, struct swline *new_words ) |
326 |
|
|
{ |
327 |
|
|
struct swline *temp; |
328 |
|
|
|
329 |
|
|
|
330 |
|
|
temp = *original; |
331 |
|
|
|
332 |
|
|
|
333 |
|
|
/* check for case of first one */ |
334 |
|
|
if ( temp == entry ) |
335 |
|
|
{ |
336 |
|
|
if ( new_words ) |
337 |
|
|
{ |
338 |
|
|
new_words->nodep->next = temp->next; |
339 |
|
|
new_words->nodep = temp->nodep; |
340 |
|
|
*original = new_words; |
341 |
|
|
} |
342 |
|
|
else /* just delete first node */ |
343 |
|
|
{ |
344 |
|
|
if ( entry->next ) |
345 |
|
|
entry->next->nodep = entry->nodep; /* point next one to last one */ |
346 |
|
|
*original = entry->next; |
347 |
|
|
} |
348 |
|
|
} |
349 |
|
|
|
350 |
|
|
|
351 |
|
|
|
352 |
|
|
else /* not first node */ |
353 |
|
|
{ |
354 |
|
|
/* search for the preceeding node */ |
355 |
|
|
for ( temp = *original; temp && temp->next != entry; temp = temp->next ); |
356 |
|
|
|
357 |
|
|
if ( !temp ) |
358 |
|
|
progerr("Fatal Error: Failed to find insert point in replace_swline"); |
359 |
|
|
|
360 |
|
|
if ( new_words ) |
361 |
|
|
{ |
362 |
|
|
/* set the previous record to point to the start of the new entry (or entries) */ |
363 |
|
|
temp->next = new_words; |
364 |
|
|
|
365 |
|
|
/* set the end of the new string to point to the next entry */ |
366 |
|
|
new_words->nodep->next = entry->next; |
367 |
|
|
} |
368 |
|
|
else /* delete the entry */ |
369 |
|
|
temp->next = temp->next->next; |
370 |
|
|
} |
371 |
|
|
|
372 |
|
|
|
373 |
|
|
/* now free the removed item */ |
374 |
|
|
efree( entry->line ); |
375 |
|
|
efree( entry ); |
376 |
|
|
|
377 |
|
|
|
378 |
|
|
} |
379 |
|
|
|
380 |
|
|
|
381 |
|
|
static int checkbuzzword(INDEXDATAHEADER *header, char *word ) |
382 |
|
|
{ |
383 |
|
|
if ( !header->buzzwords_used_flag ) |
384 |
|
|
return 0; |
385 |
|
|
|
386 |
|
|
|
387 |
|
|
/* only strip when buzzwords are being used since stripped again as a "swish word" */ |
388 |
|
|
stripIgnoreLastChars( header, word ); |
389 |
|
|
stripIgnoreFirstChars( header, word ); |
390 |
|
|
|
391 |
|
|
if ( !*word ) /* stripped clean? */ |
392 |
|
|
return 0; |
393 |
|
|
|
394 |
|
|
|
395 |
|
|
return isbuzzword( header, word); |
396 |
|
|
} |
397 |
|
|
|
398 |
|
|
/* I hope this doesn't live too long */ |
399 |
|
|
|
400 |
|
|
static void fudge_wildcard( struct swline **original, struct swline *entry ) |
401 |
|
|
{ |
402 |
|
|
char *tmp; |
403 |
|
|
struct swline *wild_card; |
404 |
|
|
|
405 |
|
|
wild_card = entry->next; |
406 |
|
|
|
407 |
|
|
/* reallocate a string */ |
408 |
|
|
tmp = entry->line; |
409 |
|
|
entry->line = emalloc( strlen( entry->line ) + 2 ); |
410 |
|
|
strcpy( entry->line, tmp); |
411 |
|
|
efree( tmp ); |
412 |
|
|
strcat( entry->line, "*"); |
413 |
|
|
|
414 |
|
|
efree( wild_card->line ); |
415 |
|
|
|
416 |
|
|
|
417 |
|
|
/* removing last entry - set pointer to new end */ |
418 |
|
|
if ( (*original)->nodep == wild_card ) |
419 |
|
|
(*original)->nodep = entry; |
420 |
|
|
|
421 |
|
|
/* and point next to the one after next */ |
422 |
|
|
entry->next = wild_card->next; |
423 |
|
|
|
424 |
|
|
|
425 |
|
|
efree( wild_card ); |
426 |
|
|
} |
427 |
|
|
|
428 |
|
|
|
429 |
|
|
|
430 |
|
|
/******************** Public Functions *********************************/ |
431 |
|
|
|
432 |
|
|
char *isBooleanOperatorWord( char * word ) |
433 |
|
|
{ |
434 |
|
|
/* don't need strcasecmp here, since word should alrady be lowercase -- need to check alt-search first */ |
435 |
|
|
if (!strcasecmp( word, _AND_WORD)) |
436 |
|
|
return AND_WORD; |
437 |
|
|
|
438 |
|
|
if (!strcasecmp( word, _OR_WORD)) |
439 |
|
|
return OR_WORD; |
440 |
|
|
|
441 |
|
|
if (!strcasecmp( word, _NOT_WORD)) |
442 |
|
|
return NOT_WORD; |
443 |
|
|
|
444 |
|
|
return (char *)NULL; |
445 |
|
|
} |
446 |
|
|
|
447 |
|
|
|
448 |
|
|
|
449 |
|
|
struct swline *tokenize_query_string( SWISH *sw, char *words, INDEXDATAHEADER *header ) |
450 |
|
|
{ |
451 |
|
|
char *curpos; /* current position in the words string */ |
452 |
|
|
struct swline *tokens = NULL; |
453 |
|
|
struct swline *temp; |
454 |
|
|
struct swline *swish_words; |
455 |
|
|
struct swline *next_node; |
456 |
|
|
struct MOD_Swish_Words *self = sw->SwishWords; |
457 |
|
|
struct MOD_Search *srch = sw->Search; |
458 |
|
|
unsigned char PhraseDelimiter; |
459 |
|
|
int max_size; |
460 |
|
|
int inphrase = 0; |
461 |
|
|
|
462 |
|
|
|
463 |
|
|
/* Probably won't get to this point */ |
464 |
|
|
if ( !words || !*words ) |
465 |
|
|
{ |
466 |
|
|
sw->lasterror = NO_WORDS_IN_SEARCH; |
467 |
|
|
return NULL; |
468 |
|
|
} |
469 |
|
|
|
470 |
|
|
|
471 |
|
|
PhraseDelimiter = (unsigned char) srch->PhraseDelimiter; |
472 |
|
|
max_size = header->maxwordlimit; |
473 |
|
|
|
474 |
|
|
curpos = words; |
475 |
|
|
|
476 |
|
|
/* split into words by whitespace and by the swish operator characters */ |
477 |
|
|
|
478 |
|
|
while ( next_token( &curpos, &self->word, &self->lenword, PhraseDelimiter, inphrase ) ) |
479 |
|
|
{ |
480 |
|
|
tokens = (struct swline *) addswline( tokens, self->word ); |
481 |
|
|
|
482 |
|
|
|
483 |
|
|
if ( self->word[0] == PhraseDelimiter && !self->word[1] ) |
484 |
|
|
inphrase = !inphrase; |
485 |
|
|
} |
486 |
|
|
|
487 |
|
|
|
488 |
|
|
/* no search words found */ |
489 |
|
|
if ( !tokens ) |
490 |
|
|
return tokens; |
491 |
|
|
|
492 |
|
|
|
493 |
|
|
inphrase = 0; |
494 |
|
|
|
495 |
|
|
temp = tokens; |
496 |
|
|
while ( temp ) |
497 |
|
|
{ |
498 |
|
|
|
499 |
|
|
/* do look-ahead processing first -- metanames */ |
500 |
|
|
|
501 |
|
|
if ( !inphrase && isMetaNameOpNext(temp->next) ) |
502 |
|
|
{ |
503 |
|
|
|
504 |
|
|
if( !getMetaNameByName( header, temp->line ) ) |
505 |
|
|
{ |
506 |
|
|
set_progerr( UNKNOWN_METANAME, sw, "'%s'", temp->line ); |
507 |
|
|
freeswline( tokens ); |
508 |
|
|
return NULL; |
509 |
|
|
} |
510 |
|
|
|
511 |
|
|
|
512 |
|
|
/* this might be an option with XML */ |
513 |
|
|
strtolower( temp->line ); |
514 |
|
|
|
515 |
|
|
temp = temp->next; |
516 |
|
|
continue; |
517 |
|
|
} |
518 |
|
|
|
519 |
|
|
|
520 |
|
|
/* skip operators */ |
521 |
|
|
if ( strlen( temp->line ) == 1 && isSearchOperatorChar( (unsigned char) temp->line[0], PhraseDelimiter, inphrase ) ) |
522 |
|
|
{ |
523 |
|
|
|
524 |
|
|
if ( temp->line[0] == PhraseDelimiter && !temp->line[1] ) |
525 |
|
|
inphrase = !inphrase; |
526 |
|
|
|
527 |
|
|
temp = temp->next; |
528 |
|
|
continue; |
529 |
|
|
} |
530 |
|
|
|
531 |
|
|
/* this might be an option if case sensitive searches are used */ |
532 |
|
|
strtolower( temp->line ); |
533 |
|
|
|
534 |
|
|
|
535 |
|
|
/* check Boolean operators -- currently doesn't change it (search.c does) */ |
536 |
|
|
if ( !inphrase ) |
537 |
|
|
{ |
538 |
|
|
char *operator, *nextoperator; |
539 |
|
|
|
540 |
|
|
if ( (operator = isBooleanOperatorWord( temp->line )) ) |
541 |
|
|
{ |
542 |
|
|
/* replace the common "and not" with simply not" */ |
543 |
|
|
/* probably not the best place to do this level of processing */ |
544 |
|
|
/* since should also check for things like "and this" and "and and and not this" */ |
545 |
|
|
/* should probably be moved to end and recursively check for these (to catch "and and not") */ |
546 |
|
|
if ( |
547 |
|
|
temp->next |
548 |
|
|
&& ( strcmp( operator, AND_WORD ) == 0) |
549 |
|
|
&& ( (nextoperator = isBooleanOperatorWord( temp->next->line))) |
550 |
|
|
&& ( strcmp( nextoperator, NOT_WORD ) == 0) |
551 |
|
|
) { |
552 |
|
|
struct swline *andword = temp; /* save position */ |
553 |
|
|
|
554 |
|
|
temp = temp->next; /* now point to "not" word */ |
555 |
|
|
replace_swline( &tokens, andword, (struct swline *)NULL ); /* cut it out */ |
556 |
|
|
continue; |
557 |
|
|
} |
558 |
|
|
|
559 |
|
|
temp = temp->next; |
560 |
|
|
continue; |
561 |
|
|
} |
562 |
|
|
} |
563 |
|
|
|
564 |
|
|
|
565 |
|
|
/* buzzwords */ |
566 |
|
|
if ( checkbuzzword( header, temp->line ) ) |
567 |
|
|
{ |
568 |
|
|
temp = temp->next; |
569 |
|
|
continue; |
570 |
|
|
} |
571 |
|
|
|
572 |
|
|
|
573 |
|
|
|
574 |
|
|
/* query words left. Turn into "swish_words" */ |
575 |
|
|
swish_words = NULL; |
576 |
|
|
swish_words = parse_swish_words( sw, header, temp->line, max_size); |
577 |
|
|
|
578 |
|
|
if ( sw->lasterror ) |
579 |
|
|
return NULL; |
580 |
|
|
|
581 |
|
|
|
582 |
|
|
next_node = temp->next; |
583 |
|
|
|
584 |
|
|
/* move into list.c at some point */ |
585 |
|
|
replace_swline( &tokens, temp, swish_words ); |
586 |
|
|
temp = next_node; |
587 |
|
|
|
588 |
|
|
} |
589 |
|
|
|
590 |
|
|
/* fudge wild cards back onto preceeding word */ |
591 |
|
|
for ( temp = tokens ; temp; temp = temp->next ) |
592 |
|
|
if ( temp->next && strcmp( temp->next->line, "*") == 0 ) |
593 |
|
|
fudge_wildcard( &tokens, temp ); |
594 |
|
|
|
595 |
|
|
|
596 |
|
|
return tokens; |
597 |
|
|
} |
598 |
|
|
|