/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/swish_words.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/swish_words.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 adcroft 1.1 /*
2     $Id: swish_words.c,v 1.15 2002/08/22 22:58:39 whmoseley Exp $
3     **
4     ** This program and library is free software; you can redistribute it and/or
5     ** modify it under the terms of the GNU (Library) General Public License
6     ** as published by the Free Software Foundation; either version 2
7     ** of the License, or any later version.
8     **
9     ** This program is distributed in the hope that it will be useful,
10     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
11     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12     ** GNU (Library) General Public License for more details.
13     **
14     ** You should have received a copy of the GNU (Library) General Public License
15     ** along with this program; if not, write to the Free Software
16     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17     **
18     **
19     ** 2001-05-23 moseley created - replaced parser in search.c
20     **
21     ** 2001-12-11 moseley, updated to deal with swish operators inside of phrases
22     ** Still broken with regard to double-quotes inside of phrases
23     ** Very unlikely someone would want to search for a single double quote
24     ** within a phrase. It currently works if the double-quote doesn't have
25     ** white space around it. Really should tag the words as being operators
26     ** or "swish words", or let the backslash stay in the query until searching.
27     **
28     */
29    
30     #include "swish.h"
31     #include "mem.h"
32     #include "string.h"
33     #include "search.h"
34     #include "index.h"
35     #include "file.h"
36     #include "list.h"
37     #include "hash.h"
38     #include "stemmer.h"
39     #include "soundex.h"
40     #include "double_metaphone.h"
41     #include "error.h"
42     #include "metanames.h"
43     #include "config.h" // for _AND_WORD...
44     #include "search_alt.h" // for AND_WORD... humm maybe needs better organization
45    
/*
** Per-handle state for this module: one reusable, growable scratch buffer
** that the tokenizers in this file write each extracted word into.
*/
struct MOD_Swish_Words
{
    char   *word;       /* scratch buffer, allocated with lenword + 1 bytes */
    int     lenword;    /* capacity of word, not counting the terminating NUL */
};
51    
52     /*
53     -- init structures for this module
54     */
55    
56     void initModule_Swish_Words (SWISH *sw)
57     {
58     struct MOD_Swish_Words *self;
59    
60     self = (struct MOD_Swish_Words *) emalloc(sizeof(struct MOD_Swish_Words));
61     sw->SwishWords = self;
62    
63     /* initialize buffers used by indexstring */
64     self->word = (char *) emalloc((self->lenword = MAXWORDLEN) + 1);
65    
66     return;
67     }
68    
69     void freeModule_Swish_Words (SWISH *sw)
70     {
71     struct MOD_Swish_Words *self = sw->SwishWords;
72    
73     efree( self->word );
74     efree ( self );
75     sw->SwishWords = NULL;
76    
77     return;
78     }
79    
80    
81    
82    
83    
/*
** Returns 1 if c is a search operator character, 0 otherwise.
**
** '*' and the phrase delimiter are operators everywhere.
** '(' ')' and '=' are operators only outside of a phrase.
*/
static int isSearchOperatorChar( int c, int phrase_delimiter, int inphrase )
{
    if ( c == phrase_delimiter || c == '*' )
        return 1;

    if ( inphrase )
        return 0;

    return c == '(' || c == ')' || c == '=';
}
93    
94    
/*
** Extract the next raw token from the query string.
**
** Tokens are split on whitespace and on the search operator characters
** (see isSearchOperatorChar); an operator character is returned as a
** one-character token of its own.  Inside a phrase only '*' and the phrase
** delimiter split tokens.
**
** A backslash makes the following character literal (it is copied even if
** it is an operator character, and "\\" yields one backslash).  Whitespace
** cannot be escaped.
**
**   buf               in/out: current read position; advanced past what was consumed
**   word, lenword     in/out: output buffer and its capacity; the buffer is
**                     grown with erealloc when the token does not fit
**   phrase_delimiter  the character that opens/closes a phrase
**   inphrase          non-zero when the caller is currently inside a phrase
**
** Returns 1 when a token was stored in *word, 0 when the input is exhausted.
**
** Funny how argv was joined into a string just to be split again...
*/
static int next_token( char **buf, char **word, int *lenword, int phrase_delimiter, int inphrase )
{
    int     i;
    int     backslash;

    **word = '\0';

    for (;;)
    {
        /* skip any leading whitespace */
        while ( **buf && isspace( (unsigned char) **buf ) )
            (*buf)++;

        if ( !**buf )
            return 0;               /* nothing left to tokenize */

        /* extract out word */
        i = 0;
        backslash = 0;

        while ( **buf && !isspace( (unsigned char) **buf ) )
        {
            // This should be looking at swish words, not raw input
            //if ( i > max_size + 4 ) /* leave a little room for operators */
            //    progerr( "Search word exceeded maxwordlimit setting." );

            /* grow the buffer when full; one extra byte is always kept for the NUL */
            if ( i == *lenword )
            {
                *lenword *= 2;
                *word = erealloc( *word, *lenword + 1 );
            }

            /* an unescaped backslash says take the next char as-is */
            if ( '\\' == **buf && !backslash )
            {
                backslash = 1;
                (*buf)++;
                continue;
            }

            /* ordinary (or escaped) character: append it */
            if ( backslash || !isSearchOperatorChar( (unsigned char) **buf, phrase_delimiter, inphrase ) )
            {
                backslash = 0;
                (*word)[i++] = *(*buf)++;
                continue;
            }

            /* Unescaped operator character.  If a word is already in     */
            /* progress it ends here and the operator is left for the     */
            /* next call; otherwise the operator is the whole token.      */
            if ( i == 0 )
                (*word)[i++] = *(*buf)++;

            break;
        }

        if ( i )
        {
            (*word)[i] = '\0';
            return 1;
        }

        /* Only reachable when a lone backslash was followed by whitespace  */
        /* or end of input: nothing was stored.  Keep scanning rather than  */
        /* reporting "no more tokens", which would drop the rest of the     */
        /* query.                                                           */
    }
}
171    
172    
173     static int next_swish_word(INDEXDATAHEADER *header, char **buf, char **word, int *lenword )
174     {
175     int i;
176    
177     /* skip non-wordchars */
178     while ( **buf && !header->wordcharslookuptable[tolower((unsigned char)(**buf))] )
179     (*buf)++;
180    
181     i = 0;
182     while ( **buf && header->wordcharslookuptable[tolower((unsigned char)(**buf))] )
183     {
184     /* reallocate buffer, if needed */
185     if ( i + 1 == *lenword )
186     {
187     *lenword *= 2;
188     *word = erealloc(*word, *lenword + 1);
189     }
190    
191     (*word)[i++] = **buf;
192     (*word)[i] = '\0';
193     (*buf)++;
194     }
195    
196    
197     if ( i )
198     {
199     stripIgnoreLastChars( header, *word);
200     stripIgnoreFirstChars(header, *word);
201    
202    
203     return **word ? 1 : 0;
204     }
205    
206     return 0;
207     }
208    
209    
210     /* Convert a word into swish words */
211    
212     static struct swline *parse_swish_words( SWISH *sw, INDEXDATAHEADER *header, char *word, int max_size )
213     {
214     struct swline *swish_words = NULL;
215     char *curpos;
216     struct MOD_Swish_Words *self = sw->SwishWords;
217    
218    
219    
220     /* Some initial adjusting of the word */
221    
222    
223     TranslateChars(header->translatecharslookuptable, (unsigned char *)word);
224    
225    
226    
227     curpos = word;
228     while( next_swish_word( header, &curpos, &self->word, &self->lenword ) )
229     {
230     /* Check Begin & EndCharacters */
231     if (!header->begincharslookuptable[(int) ((unsigned char) self->word[0])])
232     continue;
233    
234    
235     if (!header->endcharslookuptable[(int) ((unsigned char) self->word[strlen(self->word) - 1])])
236     continue;
237    
238    
239     /* limit by stopwords, min/max length, max number of digits, ... */
240     /* ------- processed elsewhere for search ---------
241     if (!isokword(sw, self->word, indexf))
242     continue;
243     - stopwords are processed in search.c because removing them may have side effects
244     - maxwordlen is checked when first tokenizing for security reasons
245     - limit by vowels, consonants and digits is not needed since search will just fail
246     ----------- */
247     if ( strlen( self->word ) > max_size )
248     {
249     sw->lasterror = SEARCH_WORD_TOO_BIG;
250     return NULL;
251     }
252    
253     if (!*self->word)
254     continue;
255    
256     switch ( header->fuzzy_mode )
257     {
258     case FUZZY_NONE:
259     swish_words = (struct swline *) addswline( swish_words, self->word );
260     break;
261    
262     case FUZZY_STEMMING:
263     Stem(&self->word, &self->lenword);
264     if ( *self->word ) // should not happen
265     swish_words = (struct swline *) addswline( swish_words, self->word );
266     break;
267    
268    
269     case FUZZY_SOUNDEX:
270     soundex(self->word);
271     if ( *self->word )
272     swish_words = (struct swline *) addswline( swish_words, self->word );
273     break;
274    
275     case FUZZY_METAPHONE:
276     case FUZZY_DOUBLE_METAPHONE:
277     {
278     char *codes[2];
279     DoubleMetaphone(self->word, codes);
280    
281     if ( !(*codes[0]) )
282     {
283     efree( codes[0] );
284     efree( codes[1] );
285     swish_words = (struct swline *) addswline( swish_words, self->word );
286     break;
287     }
288    
289    
290     /* check if just METAPHONE or only one word returned (e.g. they are the same) */
291    
292     if ( header->fuzzy_mode == FUZZY_METAPHONE || !(*codes[1]) || !strcmp(codes[0], codes[1]) )
293     {
294     swish_words = (struct swline *) addswline( swish_words, codes[0] );
295     }
296     else
297     {
298     /* yuck! */
299     swish_words = (struct swline *) addswline( swish_words, "(" );
300     swish_words = (struct swline *) addswline( swish_words, codes[0] );
301     swish_words = (struct swline *) addswline( swish_words, "or" );
302     swish_words = (struct swline *) addswline( swish_words, codes[1] );
303     swish_words = (struct swline *) addswline( swish_words, ")" );
304     }
305    
306     efree( codes[0] );
307     efree( codes[1] );
308     }
309     break;
310    
311    
312     default:
313     progerr("Invalid FuzzyMode '%d'", (int)header->fuzzy_mode );
314     }
315     }
316    
317     return swish_words;
318    
319    
320     }
321    
322     /* This is really dumb. swline needs a ->prev entry, really search needs its own linked list */
323     /* Replaces a given node with another node (or nodes) */
324    
325     static void replace_swline( struct swline **original, struct swline *entry, struct swline *new_words )
326     {
327     struct swline *temp;
328    
329    
330     temp = *original;
331    
332    
333     /* check for case of first one */
334     if ( temp == entry )
335     {
336     if ( new_words )
337     {
338     new_words->nodep->next = temp->next;
339     new_words->nodep = temp->nodep;
340     *original = new_words;
341     }
342     else /* just delete first node */
343     {
344     if ( entry->next )
345     entry->next->nodep = entry->nodep; /* point next one to last one */
346     *original = entry->next;
347     }
348     }
349    
350    
351    
352     else /* not first node */
353     {
354     /* search for the preceeding node */
355     for ( temp = *original; temp && temp->next != entry; temp = temp->next );
356    
357     if ( !temp )
358     progerr("Fatal Error: Failed to find insert point in replace_swline");
359    
360     if ( new_words )
361     {
362     /* set the previous record to point to the start of the new entry (or entries) */
363     temp->next = new_words;
364    
365     /* set the end of the new string to point to the next entry */
366     new_words->nodep->next = entry->next;
367     }
368     else /* delete the entry */
369     temp->next = temp->next->next;
370     }
371    
372    
373     /* now free the removed item */
374     efree( entry->line );
375     efree( entry );
376    
377    
378     }
379    
380    
381     static int checkbuzzword(INDEXDATAHEADER *header, char *word )
382     {
383     if ( !header->buzzwords_used_flag )
384     return 0;
385    
386    
387     /* only strip when buzzwords are being used since stripped again as a "swish word" */
388     stripIgnoreLastChars( header, word );
389     stripIgnoreFirstChars( header, word );
390    
391     if ( !*word ) /* stripped clean? */
392     return 0;
393    
394    
395     return isbuzzword( header, word);
396     }
397    
398     /* I hope this doesn't live too long */
399    
400     static void fudge_wildcard( struct swline **original, struct swline *entry )
401     {
402     char *tmp;
403     struct swline *wild_card;
404    
405     wild_card = entry->next;
406    
407     /* reallocate a string */
408     tmp = entry->line;
409     entry->line = emalloc( strlen( entry->line ) + 2 );
410     strcpy( entry->line, tmp);
411     efree( tmp );
412     strcat( entry->line, "*");
413    
414     efree( wild_card->line );
415    
416    
417     /* removing last entry - set pointer to new end */
418     if ( (*original)->nodep == wild_card )
419     (*original)->nodep = entry;
420    
421     /* and point next to the one after next */
422     entry->next = wild_card->next;
423    
424    
425     efree( wild_card );
426     }
427    
428    
429    
430     /******************** Public Functions *********************************/
431    
432     char *isBooleanOperatorWord( char * word )
433     {
434     /* don't need strcasecmp here, since word should alrady be lowercase -- need to check alt-search first */
435     if (!strcasecmp( word, _AND_WORD))
436     return AND_WORD;
437    
438     if (!strcasecmp( word, _OR_WORD))
439     return OR_WORD;
440    
441     if (!strcasecmp( word, _NOT_WORD))
442     return NOT_WORD;
443    
444     return (char *)NULL;
445     }
446    
447    
448    
449     struct swline *tokenize_query_string( SWISH *sw, char *words, INDEXDATAHEADER *header )
450     {
451     char *curpos; /* current position in the words string */
452     struct swline *tokens = NULL;
453     struct swline *temp;
454     struct swline *swish_words;
455     struct swline *next_node;
456     struct MOD_Swish_Words *self = sw->SwishWords;
457     struct MOD_Search *srch = sw->Search;
458     unsigned char PhraseDelimiter;
459     int max_size;
460     int inphrase = 0;
461    
462    
463     /* Probably won't get to this point */
464     if ( !words || !*words )
465     {
466     sw->lasterror = NO_WORDS_IN_SEARCH;
467     return NULL;
468     }
469    
470    
471     PhraseDelimiter = (unsigned char) srch->PhraseDelimiter;
472     max_size = header->maxwordlimit;
473    
474     curpos = words;
475    
476     /* split into words by whitespace and by the swish operator characters */
477    
478     while ( next_token( &curpos, &self->word, &self->lenword, PhraseDelimiter, inphrase ) )
479     {
480     tokens = (struct swline *) addswline( tokens, self->word );
481    
482    
483     if ( self->word[0] == PhraseDelimiter && !self->word[1] )
484     inphrase = !inphrase;
485     }
486    
487    
488     /* no search words found */
489     if ( !tokens )
490     return tokens;
491    
492    
493     inphrase = 0;
494    
495     temp = tokens;
496     while ( temp )
497     {
498    
499     /* do look-ahead processing first -- metanames */
500    
501     if ( !inphrase && isMetaNameOpNext(temp->next) )
502     {
503    
504     if( !getMetaNameByName( header, temp->line ) )
505     {
506     set_progerr( UNKNOWN_METANAME, sw, "'%s'", temp->line );
507     freeswline( tokens );
508     return NULL;
509     }
510    
511    
512     /* this might be an option with XML */
513     strtolower( temp->line );
514    
515     temp = temp->next;
516     continue;
517     }
518    
519    
520     /* skip operators */
521     if ( strlen( temp->line ) == 1 && isSearchOperatorChar( (unsigned char) temp->line[0], PhraseDelimiter, inphrase ) )
522     {
523    
524     if ( temp->line[0] == PhraseDelimiter && !temp->line[1] )
525     inphrase = !inphrase;
526    
527     temp = temp->next;
528     continue;
529     }
530    
531     /* this might be an option if case sensitive searches are used */
532     strtolower( temp->line );
533    
534    
535     /* check Boolean operators -- currently doesn't change it (search.c does) */
536     if ( !inphrase )
537     {
538     char *operator, *nextoperator;
539    
540     if ( (operator = isBooleanOperatorWord( temp->line )) )
541     {
542     /* replace the common "and not" with simply not" */
543     /* probably not the best place to do this level of processing */
544     /* since should also check for things like "and this" and "and and and not this" */
545     /* should probably be moved to end and recursively check for these (to catch "and and not") */
546     if (
547     temp->next
548     && ( strcmp( operator, AND_WORD ) == 0)
549     && ( (nextoperator = isBooleanOperatorWord( temp->next->line)))
550     && ( strcmp( nextoperator, NOT_WORD ) == 0)
551     ) {
552     struct swline *andword = temp; /* save position */
553    
554     temp = temp->next; /* now point to "not" word */
555     replace_swline( &tokens, andword, (struct swline *)NULL ); /* cut it out */
556     continue;
557     }
558    
559     temp = temp->next;
560     continue;
561     }
562     }
563    
564    
565     /* buzzwords */
566     if ( checkbuzzword( header, temp->line ) )
567     {
568     temp = temp->next;
569     continue;
570     }
571    
572    
573    
574     /* query words left. Turn into "swish_words" */
575     swish_words = NULL;
576     swish_words = parse_swish_words( sw, header, temp->line, max_size);
577    
578     if ( sw->lasterror )
579     return NULL;
580    
581    
582     next_node = temp->next;
583    
584     /* move into list.c at some point */
585     replace_swline( &tokens, temp, swish_words );
586     temp = next_node;
587    
588     }
589    
590     /* fudge wild cards back onto preceeding word */
591     for ( temp = tokens ; temp; temp = temp->next )
592     if ( temp->next && strcmp( temp->next->line, "*") == 0 )
593     fudge_wildcard( &tokens, temp );
594    
595    
596     return tokens;
597     }
598    

  ViewVC Help
Powered by ViewVC 1.1.22