/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swish_words.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swish_words.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 $Id: swish_words.c,v 1.15 2002/08/22 22:58:39 whmoseley Exp $
3 **
4 ** This program and library is free software; you can redistribute it and/or
5 ** modify it under the terms of the GNU (Library) General Public License
6 ** as published by the Free Software Foundation; either version 2
7 ** of the License, or any later version.
8 **
9 ** This program is distributed in the hope that it will be useful,
10 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
11 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 ** GNU (Library) General Public License for more details.
13 **
14 ** You should have received a copy of the GNU (Library) General Public License
15 ** along with this program; if not, write to the Free Software
16 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 **
18 **
19 ** 2001-05-23 moseley created - replaced parser in search.c
20 **
21 ** 2001-12-11 moseley, updated to deal with swish operators inside of phrases
22 ** Still broken with regard to double-quotes inside of phrases
23 ** Very unlikely someone would want to search for a single double quote
24 ** within a phrase. It currently works if the double-quote doesn't have
25 ** white space around it. Really should tag the words as being operators
26 ** or "swish words", or let the backslash stay in the query until searching.
27 **
28 */
29
30 #include "swish.h"
31 #include "mem.h"
32 #include "string.h"
33 #include "search.h"
34 #include "index.h"
35 #include "file.h"
36 #include "list.h"
37 #include "hash.h"
38 #include "stemmer.h"
39 #include "soundex.h"
40 #include "double_metaphone.h"
41 #include "error.h"
42 #include "metanames.h"
43 #include "config.h" // for _AND_WORD...
44 #include "search_alt.h" // for AND_WORD... humm maybe needs better organization
45
/* Per-SWISH-handle scratch state used by the query tokenizers in this file */
struct MOD_Swish_Words
{
    char   *word;       /* reusable, growable buffer holding the token being built */
    int     lenword;    /* characters the buffer can hold; it is allocated as lenword + 1 bytes */
};
51
52 /*
53 -- init structures for this module
54 */
55
56 void initModule_Swish_Words (SWISH *sw)
57 {
58 struct MOD_Swish_Words *self;
59
60 self = (struct MOD_Swish_Words *) emalloc(sizeof(struct MOD_Swish_Words));
61 sw->SwishWords = self;
62
63 /* initialize buffers used by indexstring */
64 self->word = (char *) emalloc((self->lenword = MAXWORDLEN) + 1);
65
66 return;
67 }
68
69 void freeModule_Swish_Words (SWISH *sw)
70 {
71 struct MOD_Swish_Words *self = sw->SwishWords;
72
73 efree( self->word );
74 efree ( self );
75 sw->SwishWords = NULL;
76
77 return;
78 }
79
80
81
82
83
/*
 * Returns 1 if c is a search operator character, 0 otherwise.
 *
 * Inside a phrase only the wildcard '*' and the phrase delimiter are
 * operators; outside a phrase '(', ')' and '=' are operators as well.
 */
static int isSearchOperatorChar( int c, int phrase_delimiter, int inphrase )
{
    /* the wildcard and the phrase delimiter count everywhere */
    if ( c == '*' || c == phrase_delimiter )
        return 1;

    /* the remaining operators are plain text inside a phrase */
    if ( inphrase )
        return 0;

    return c == '(' || c == ')' || c == '=';
}
93
94
/*
 * Extract the next raw token from the query string.
 *
 * Tokens are separated by whitespace.  Outside of a phrase the characters
 * "()=", the wildcard '*' and the phrase delimiter each form a one-character
 * token of their own; inside a phrase only '*' and the phrase delimiter do
 * (see isSearchOperatorChar).  A backslash makes the following character
 * part of the word as-is; whitespace cannot be escaped.
 *
 *   buf              - in/out: current read position, advanced past the token
 *   word, lenword    - in/out: growable output buffer and its capacity
 *                      (the buffer holds *lenword characters plus the NUL)
 *   phrase_delimiter - the phrase quoting character
 *   inphrase         - non-zero if the read position is inside a phrase
 *
 * Returns 1 with the token in *word, or 0 once the input is exhausted.
 *
 * A backslash that is followed by whitespace or by the end of the input
 * escapes nothing and produces no characters.  That used to be reported as
 * "no more tokens", which made the caller drop the rest of the query; such
 * a backslash is now skipped and scanning continues.
 */
static int next_token( char **buf, char **word, int *lenword, int phrase_delimiter, int inphrase )
{
    **word = '\0';

    while ( **buf )
    {
        int len     = 0;    /* characters stored in *word so far */
        int escaped = 0;    /* previous character was an escaping backslash */

        /* skip any leading whitespace */
        while ( **buf && isspace( (unsigned char) **buf ) )
            (*buf)++;

        while ( **buf && !isspace( (unsigned char) **buf ) )
        {
            /* grow the buffer if needed -- only happens for very long input */
            if ( len == *lenword )
            {
                *lenword *= 2;
                *word = erealloc( *word, *lenword + 1 );
            }

            /* an unescaped backslash says: take the next character as-is */
            if ( '\\' == **buf && !escaped )
            {
                escaped = 1;
                (*buf)++;
                continue;
            }

            if ( !escaped && isSearchOperatorChar( (unsigned char) **buf, phrase_delimiter, inphrase ) )
            {
                /* An operator ends a word in progress (and is left for the */
                /* next call); otherwise it is a token all by itself.       */
                if ( len == 0 )
                {
                    (*word)[len++] = **buf;
                    (*buf)++;
                }
                break;
            }

            /* ordinary (or escaped) character */
            escaped = 0;
            (*word)[len++] = **buf;
            (*buf)++;
        }

        if ( len )
        {
            (*word)[len] = '\0';
            return 1;
        }

        /* nothing stored: only a dangling backslash was consumed -- keep going */
    }

    return 0;
}
171
172
173 static int next_swish_word(INDEXDATAHEADER *header, char **buf, char **word, int *lenword )
174 {
175 int i;
176
177 /* skip non-wordchars */
178 while ( **buf && !header->wordcharslookuptable[tolower((unsigned char)(**buf))] )
179 (*buf)++;
180
181 i = 0;
182 while ( **buf && header->wordcharslookuptable[tolower((unsigned char)(**buf))] )
183 {
184 /* reallocate buffer, if needed */
185 if ( i + 1 == *lenword )
186 {
187 *lenword *= 2;
188 *word = erealloc(*word, *lenword + 1);
189 }
190
191 (*word)[i++] = **buf;
192 (*word)[i] = '\0';
193 (*buf)++;
194 }
195
196
197 if ( i )
198 {
199 stripIgnoreLastChars( header, *word);
200 stripIgnoreFirstChars(header, *word);
201
202
203 return **word ? 1 : 0;
204 }
205
206 return 0;
207 }
208
209
210 /* Convert a word into swish words */
211
212 static struct swline *parse_swish_words( SWISH *sw, INDEXDATAHEADER *header, char *word, int max_size )
213 {
214 struct swline *swish_words = NULL;
215 char *curpos;
216 struct MOD_Swish_Words *self = sw->SwishWords;
217
218
219
220 /* Some initial adjusting of the word */
221
222
223 TranslateChars(header->translatecharslookuptable, (unsigned char *)word);
224
225
226
227 curpos = word;
228 while( next_swish_word( header, &curpos, &self->word, &self->lenword ) )
229 {
230 /* Check Begin & EndCharacters */
231 if (!header->begincharslookuptable[(int) ((unsigned char) self->word[0])])
232 continue;
233
234
235 if (!header->endcharslookuptable[(int) ((unsigned char) self->word[strlen(self->word) - 1])])
236 continue;
237
238
239 /* limit by stopwords, min/max length, max number of digits, ... */
240 /* ------- processed elsewhere for search ---------
241 if (!isokword(sw, self->word, indexf))
242 continue;
243 - stopwords are processed in search.c because removing them may have side effects
244 - maxwordlen is checked when first tokenizing for security reasons
245 - limit by vowels, consonants and digits is not needed since search will just fail
246 ----------- */
247 if ( strlen( self->word ) > max_size )
248 {
249 sw->lasterror = SEARCH_WORD_TOO_BIG;
250 return NULL;
251 }
252
253 if (!*self->word)
254 continue;
255
256 switch ( header->fuzzy_mode )
257 {
258 case FUZZY_NONE:
259 swish_words = (struct swline *) addswline( swish_words, self->word );
260 break;
261
262 case FUZZY_STEMMING:
263 Stem(&self->word, &self->lenword);
264 if ( *self->word ) // should not happen
265 swish_words = (struct swline *) addswline( swish_words, self->word );
266 break;
267
268
269 case FUZZY_SOUNDEX:
270 soundex(self->word);
271 if ( *self->word )
272 swish_words = (struct swline *) addswline( swish_words, self->word );
273 break;
274
275 case FUZZY_METAPHONE:
276 case FUZZY_DOUBLE_METAPHONE:
277 {
278 char *codes[2];
279 DoubleMetaphone(self->word, codes);
280
281 if ( !(*codes[0]) )
282 {
283 efree( codes[0] );
284 efree( codes[1] );
285 swish_words = (struct swline *) addswline( swish_words, self->word );
286 break;
287 }
288
289
290 /* check if just METAPHONE or only one word returned (e.g. they are the same) */
291
292 if ( header->fuzzy_mode == FUZZY_METAPHONE || !(*codes[1]) || !strcmp(codes[0], codes[1]) )
293 {
294 swish_words = (struct swline *) addswline( swish_words, codes[0] );
295 }
296 else
297 {
298 /* yuck! */
299 swish_words = (struct swline *) addswline( swish_words, "(" );
300 swish_words = (struct swline *) addswline( swish_words, codes[0] );
301 swish_words = (struct swline *) addswline( swish_words, "or" );
302 swish_words = (struct swline *) addswline( swish_words, codes[1] );
303 swish_words = (struct swline *) addswline( swish_words, ")" );
304 }
305
306 efree( codes[0] );
307 efree( codes[1] );
308 }
309 break;
310
311
312 default:
313 progerr("Invalid FuzzyMode '%d'", (int)header->fuzzy_mode );
314 }
315 }
316
317 return swish_words;
318
319
320 }
321
322 /* This is really dumb. swline needs a ->prev entry, really search needs its own linked list */
323 /* Replaces a given node with another node (or nodes) */
324
325 static void replace_swline( struct swline **original, struct swline *entry, struct swline *new_words )
326 {
327 struct swline *temp;
328
329
330 temp = *original;
331
332
333 /* check for case of first one */
334 if ( temp == entry )
335 {
336 if ( new_words )
337 {
338 new_words->nodep->next = temp->next;
339 new_words->nodep = temp->nodep;
340 *original = new_words;
341 }
342 else /* just delete first node */
343 {
344 if ( entry->next )
345 entry->next->nodep = entry->nodep; /* point next one to last one */
346 *original = entry->next;
347 }
348 }
349
350
351
352 else /* not first node */
353 {
354 /* search for the preceeding node */
355 for ( temp = *original; temp && temp->next != entry; temp = temp->next );
356
357 if ( !temp )
358 progerr("Fatal Error: Failed to find insert point in replace_swline");
359
360 if ( new_words )
361 {
362 /* set the previous record to point to the start of the new entry (or entries) */
363 temp->next = new_words;
364
365 /* set the end of the new string to point to the next entry */
366 new_words->nodep->next = entry->next;
367 }
368 else /* delete the entry */
369 temp->next = temp->next->next;
370 }
371
372
373 /* now free the removed item */
374 efree( entry->line );
375 efree( entry );
376
377
378 }
379
380
381 static int checkbuzzword(INDEXDATAHEADER *header, char *word )
382 {
383 if ( !header->buzzwords_used_flag )
384 return 0;
385
386
387 /* only strip when buzzwords are being used since stripped again as a "swish word" */
388 stripIgnoreLastChars( header, word );
389 stripIgnoreFirstChars( header, word );
390
391 if ( !*word ) /* stripped clean? */
392 return 0;
393
394
395 return isbuzzword( header, word);
396 }
397
398 /* I hope this doesn't live too long */
399
400 static void fudge_wildcard( struct swline **original, struct swline *entry )
401 {
402 char *tmp;
403 struct swline *wild_card;
404
405 wild_card = entry->next;
406
407 /* reallocate a string */
408 tmp = entry->line;
409 entry->line = emalloc( strlen( entry->line ) + 2 );
410 strcpy( entry->line, tmp);
411 efree( tmp );
412 strcat( entry->line, "*");
413
414 efree( wild_card->line );
415
416
417 /* removing last entry - set pointer to new end */
418 if ( (*original)->nodep == wild_card )
419 (*original)->nodep = entry;
420
421 /* and point next to the one after next */
422 entry->next = wild_card->next;
423
424
425 efree( wild_card );
426 }
427
428
429
430 /******************** Public Functions *********************************/
431
432 char *isBooleanOperatorWord( char * word )
433 {
434 /* don't need strcasecmp here, since word should alrady be lowercase -- need to check alt-search first */
435 if (!strcasecmp( word, _AND_WORD))
436 return AND_WORD;
437
438 if (!strcasecmp( word, _OR_WORD))
439 return OR_WORD;
440
441 if (!strcasecmp( word, _NOT_WORD))
442 return NOT_WORD;
443
444 return (char *)NULL;
445 }
446
447
448
449 struct swline *tokenize_query_string( SWISH *sw, char *words, INDEXDATAHEADER *header )
450 {
451 char *curpos; /* current position in the words string */
452 struct swline *tokens = NULL;
453 struct swline *temp;
454 struct swline *swish_words;
455 struct swline *next_node;
456 struct MOD_Swish_Words *self = sw->SwishWords;
457 struct MOD_Search *srch = sw->Search;
458 unsigned char PhraseDelimiter;
459 int max_size;
460 int inphrase = 0;
461
462
463 /* Probably won't get to this point */
464 if ( !words || !*words )
465 {
466 sw->lasterror = NO_WORDS_IN_SEARCH;
467 return NULL;
468 }
469
470
471 PhraseDelimiter = (unsigned char) srch->PhraseDelimiter;
472 max_size = header->maxwordlimit;
473
474 curpos = words;
475
476 /* split into words by whitespace and by the swish operator characters */
477
478 while ( next_token( &curpos, &self->word, &self->lenword, PhraseDelimiter, inphrase ) )
479 {
480 tokens = (struct swline *) addswline( tokens, self->word );
481
482
483 if ( self->word[0] == PhraseDelimiter && !self->word[1] )
484 inphrase = !inphrase;
485 }
486
487
488 /* no search words found */
489 if ( !tokens )
490 return tokens;
491
492
493 inphrase = 0;
494
495 temp = tokens;
496 while ( temp )
497 {
498
499 /* do look-ahead processing first -- metanames */
500
501 if ( !inphrase && isMetaNameOpNext(temp->next) )
502 {
503
504 if( !getMetaNameByName( header, temp->line ) )
505 {
506 set_progerr( UNKNOWN_METANAME, sw, "'%s'", temp->line );
507 freeswline( tokens );
508 return NULL;
509 }
510
511
512 /* this might be an option with XML */
513 strtolower( temp->line );
514
515 temp = temp->next;
516 continue;
517 }
518
519
520 /* skip operators */
521 if ( strlen( temp->line ) == 1 && isSearchOperatorChar( (unsigned char) temp->line[0], PhraseDelimiter, inphrase ) )
522 {
523
524 if ( temp->line[0] == PhraseDelimiter && !temp->line[1] )
525 inphrase = !inphrase;
526
527 temp = temp->next;
528 continue;
529 }
530
531 /* this might be an option if case sensitive searches are used */
532 strtolower( temp->line );
533
534
535 /* check Boolean operators -- currently doesn't change it (search.c does) */
536 if ( !inphrase )
537 {
538 char *operator, *nextoperator;
539
540 if ( (operator = isBooleanOperatorWord( temp->line )) )
541 {
542 /* replace the common "and not" with simply not" */
543 /* probably not the best place to do this level of processing */
544 /* since should also check for things like "and this" and "and and and not this" */
545 /* should probably be moved to end and recursively check for these (to catch "and and not") */
546 if (
547 temp->next
548 && ( strcmp( operator, AND_WORD ) == 0)
549 && ( (nextoperator = isBooleanOperatorWord( temp->next->line)))
550 && ( strcmp( nextoperator, NOT_WORD ) == 0)
551 ) {
552 struct swline *andword = temp; /* save position */
553
554 temp = temp->next; /* now point to "not" word */
555 replace_swline( &tokens, andword, (struct swline *)NULL ); /* cut it out */
556 continue;
557 }
558
559 temp = temp->next;
560 continue;
561 }
562 }
563
564
565 /* buzzwords */
566 if ( checkbuzzword( header, temp->line ) )
567 {
568 temp = temp->next;
569 continue;
570 }
571
572
573
574 /* query words left. Turn into "swish_words" */
575 swish_words = NULL;
576 swish_words = parse_swish_words( sw, header, temp->line, max_size);
577
578 if ( sw->lasterror )
579 return NULL;
580
581
582 next_node = temp->next;
583
584 /* move into list.c at some point */
585 replace_swline( &tokens, temp, swish_words );
586 temp = next_node;
587
588 }
589
590 /* fudge wild cards back onto preceeding word */
591 for ( temp = tokens ; temp; temp = temp->next )
592 if ( temp->next && strcmp( temp->next->line, "*") == 0 )
593 fudge_wildcard( &tokens, temp );
594
595
596 return tokens;
597 }
598

  ViewVC Help
Powered by ViewVC 1.1.22