swish-e/src/swish_words.c

/*
$Id: swish_words.c,v 1.15 2002/08/22 22:58:39 whmoseley Exp $
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**
**
** 2001-05-23  moseley  created - replaced parser in search.c
**
** 2001-12-11  moseley, updated to deal with swish operators inside of phrases
**                      Still broken with regard to double-quotes inside of phrases
**                      Very unlikely someone would want to search for a single double quote
**                      within a phrase.  It currently works if the double-quote doesn't have
**                      white space around it.  Really should tag the words as being operators
**                      or "swish words", or let the backslash stay in the query until searching.
**
*/

#include "swish.h"
#include "mem.h"
#include "string.h"
#include "search.h"
#include "index.h"
#include "file.h"
#include "list.h"
#include "hash.h"
#include "stemmer.h"
#include "soundex.h"
#include "double_metaphone.h"
#include "error.h"
#include "metanames.h"
#include "config.h"         // for _AND_WORD...
#include "search_alt.h"     // for AND_WORD... humm maybe needs better organization

/* Private data of this module, hung off the SWISH handle (sw->SwishWords). */
/* It is a reusable scratch buffer that the tokenizer functions fill with */
/* the word currently being extracted and grow on demand. */
struct MOD_Swish_Words
{
    char   *word;       /* heap buffer for the current word; allocated with lenword + 1 bytes */
    int     lenword;    /* capacity of word in characters, not counting the terminating NUL */
};

/*
  -- Set up this module: allocate its private data, attach it to the
  -- SWISH handle and give it an initial scratch word buffer.
*/

void initModule_Swish_Words (SWISH  *sw)
{
    struct MOD_Swish_Words *module = (struct MOD_Swish_Words *) emalloc( sizeof *module );

    sw->SwishWords = module;

    /* the word buffer starts out holding MAXWORDLEN characters plus the NUL */
    module->lenword = MAXWORDLEN;
    module->word    = (char *) emalloc( module->lenword + 1 );
}

/*
  -- Tear down this module: release the scratch word buffer and the
  -- module data itself, then clear the pointer on the SWISH handle.
*/

void freeModule_Swish_Words (SWISH *sw)
{
    struct MOD_Swish_Words *module = sw->SwishWords;

    /* the buffer goes first, then the structure that owns it */
    efree( module->word );
    efree( module );

    /* do not leave a stale pointer behind on the handle */
    sw->SwishWords = NULL;
}


/* Tells whether a character acts as a search operator at this point of the query. */
/* Outside of a phrase the operators are ( ) = * and the phrase delimiter; */
/* inside of a phrase only * and the phrase delimiter are recognized. */
/* Returns 1 for an operator character and 0 for anything else. */

static int isSearchOperatorChar( int c, int phrase_delimiter, int inphrase )
{
    /* these two are operators no matter where we are */
    if ( c == '*' || c == phrase_delimiter )
        return 1;

    /* everything else is plain text while inside of a phrase */
    if ( inphrase )
        return 0;

    switch ( c )
    {
        case '(':
        case ')':
        case '=':
            return 1;

        default:
            return 0;
    }
}


/* Pulls the next raw token out of the query string. */
/* Tokens are separated by whitespace, and every unescaped search operator */
/* character (see isSearchOperatorChar) becomes a one-character token of its own. */
/* Within a phrase only '*' and the phrase delimiter still split a token. */
/* A backslash makes the following character literal; whitespace cannot be escaped. */
/* */
/*   buf              - in/out: read position in the query; advanced past what was consumed */
/*   word, lenword    - in/out: output buffer and its capacity; the buffer is reallocated */
/*                      (capacity doubled) when a token does not fit */
/*   phrase_delimiter - the character that opens and closes a phrase */
/*   inphrase         - non-zero when the read position is inside of a phrase */
/* */
/* Returns 1 with the NUL-terminated token in *word, or 0 when the input is exhausted. */

/* Funny how argv was joined into a string just to be split again... */

static int next_token( char **buf, char **word, int *lenword, int phrase_delimiter, int inphrase )
{
    int     len;
    int     escaped;

    **word = '\0';

    for ( ;; )
    {
        /* skip any leading whitespace */
        while ( **buf && isspace( (unsigned char) **buf ) )
            (*buf)++;

        /* extract out word */
        len = 0;
        escaped = 0;

        while ( **buf && !isspace( (unsigned char) **buf ) )
        {
            /* reallocate buffer, if needed -- only if maxwordlimit was set larger than MAXWORDLEN (1000) */
            if ( len == *lenword )
            {
                *lenword *= 2;
                *word = erealloc( *word, *lenword + 1 );
            }

            /* an unescaped backslash says take next char as-is */
            /* note that you cannot backslash whitespace */
            if ( '\\' == **buf && !escaped )
            {
                escaped = 1;
                (*buf)++;
                continue;
            }

            /* ordinary (or escaped) character: append it to the token */
            if ( escaped || !isSearchOperatorChar( (unsigned char) **buf, phrase_delimiter, inphrase ) )
            {
                escaped = 0;
                (*word)[len++] = *(*buf)++;
                continue;
            }

            /* Unescaped search operator char.  If characters were already */
            /* collected it ends this token and is left for the next call; */
            /* otherwise it is returned as a token of its own. */
            if ( !len )
                (*word)[len++] = *(*buf)++;

            break;
        }

        /* flag if we found a token */
        if ( len )
        {
            (*word)[len] = '\0';
            return 1;
        }

        /* nothing captured and nothing left to read: end of input */
        if ( !**buf )
            return 0;

        /* Nothing captured but input remains: a lone backslash in front of */
        /* whitespace was swallowed.  Returning 0 here would tell the caller */
        /* that the query has ended and silently drop the remaining words, */
        /* so go around again and look for the next real token. */
    }
}


/* Copies the next run of word characters from *buf into *word. */
/* Characters are classified with header->wordcharslookuptable (indexed by the */
/* lowercased character); the stored characters keep their original case. */
/* The word then has its ignore-last and ignore-first characters stripped. */
/* *buf is advanced past everything consumed and *word / *lenword grow as needed. */
/* Returns 1 if a non-empty word remains after stripping, otherwise 0. */

static int next_swish_word(INDEXDATAHEADER *header, char **buf, char **word, int *lenword )
{
    int     len = 0;

    /* skip non-wordchars */
    for ( ; **buf; (*buf)++ )
        if ( header->wordcharslookuptable[tolower((unsigned char)(**buf))] )
            break;

    /* collect wordchars, keeping the buffer terminated as we go */
    for ( ; **buf && header->wordcharslookuptable[tolower((unsigned char)(**buf))]; (*buf)++ )
    {
        /* reallocate buffer, if needed */
        if ( len + 1 == *lenword )
        {
            *lenword *= 2;
            *word = erealloc(*word, *lenword + 1);
        }

        (*word)[len++] = **buf;
        (*word)[len] = '\0';
    }

    /* ran out of input without finding any word characters */
    if ( !len )
        return 0;

    stripIgnoreLastChars( header, *word );
    stripIgnoreFirstChars( header, *word );

    /* stripping may have emptied the word */
    return **word != '\0';
}


/* Convert a query token into a list of "swish words". */
/* */
/* The token is first run through the header's translate-characters table */
/* (in place) and then split with next_swish_word().  Words that do not start */
/* or end with a permitted begin/end character are dropped, and the remaining */
/* ones are passed through the fuzzy mode configured in the header before */
/* being appended to the result list. */
/* */
/*   sw       - search handle; supplies the scratch word buffer and receives lasterror */
/*   header   - header supplying the character tables and the fuzzy mode */
/*   word     - token to convert (modified in place by TranslateChars) */
/*   max_size - maximum length allowed for a single swish word */
/* */
/* Returns the list of words, or NULL if the token produced none. */
/* When a word is longer than max_size, sw->lasterror is set to */
/* SEARCH_WORD_TOO_BIG, any words already collected are freed, and NULL is returned. */

static struct swline *parse_swish_words( SWISH *sw, INDEXDATAHEADER *header, char *word, int max_size )
{
    struct  swline  *swish_words = NULL;
    char   *curpos;
    struct  MOD_Swish_Words *self = sw->SwishWords;


    /* Some initial adjusting of the word */
    TranslateChars(header->translatecharslookuptable, (unsigned char *)word);


    curpos = word;
    while( next_swish_word( header, &curpos, &self->word, &self->lenword ) )
    {
        size_t  wordlen = strlen( self->word );

        /* next_swish_word() only reports non-empty words; this guard keeps */
        /* the [wordlen - 1] index below safe should that ever change */
        if ( !wordlen )
            continue;

        /* Check Begin & EndCharacters */
        if (!header->begincharslookuptable[(int) ((unsigned char) self->word[0])])
            continue;

        if (!header->endcharslookuptable[(int) ((unsigned char) self->word[wordlen - 1])])
            continue;


        /* limit by stopwords, min/max length, max number of digits, ... */
        /* ------- processed elsewhere for search ---------
        if (!isokword(sw, self->word, indexf))
            continue;
        - stopwords are processed in search.c because removing them may have side effects
        - maxwordlen is checked when first tokenizing for security reasons
        - limit by vowels, consonants and digits is not needed since search will just fail
        ----------- */
        if ( wordlen > (size_t) max_size )
        {
            /* do not leak the words already collected from this token */
            if ( swish_words )
                freeswline( swish_words );

            sw->lasterror = SEARCH_WORD_TOO_BIG;
            return NULL;
        }

        switch ( header->fuzzy_mode )
        {
            case FUZZY_NONE:
                swish_words = (struct swline *) addswline( swish_words, self->word );
                break;

            case FUZZY_STEMMING:
                Stem(&self->word, &self->lenword);
                if ( *self->word ) // should not happen
                    swish_words = (struct swline *) addswline( swish_words, self->word );
                break;


            case FUZZY_SOUNDEX:
                soundex(self->word);
                if ( *self->word )
                    swish_words = (struct swline *) addswline( swish_words, self->word );
                break;

            case FUZZY_METAPHONE:
            case FUZZY_DOUBLE_METAPHONE:
                {
                    char *codes[2];
                    DoubleMetaphone(self->word, codes);

                    /* no code produced: fall back to the word as typed */
                    if ( !(*codes[0]) )
                    {
                        efree( codes[0] );
                        efree( codes[1] );
                        swish_words = (struct swline *) addswline( swish_words, self->word );
                        break;
                    }


                    /* check if just METAPHONE or only one word returned (e.g. they are the same) */

                    if ( header->fuzzy_mode == FUZZY_METAPHONE || !(*codes[1]) || !strcmp(codes[0], codes[1]) )
                    {
                        swish_words = (struct swline *) addswline( swish_words, codes[0] );
                    }
                    else
                    {
                        /* yuck! -- two different codes become "( code1 or code2 )" */
                        swish_words = (struct swline *) addswline( swish_words, "(" );
                        swish_words = (struct swline *) addswline( swish_words, codes[0] );
                        swish_words = (struct swline *) addswline( swish_words, "or" );
                        swish_words = (struct swline *) addswline( swish_words, codes[1] );
                        swish_words = (struct swline *) addswline( swish_words, ")" );
                    }

                    efree( codes[0] );
                    efree( codes[1] );
                }
                break;


            default:
                progerr("Invalid FuzzyMode '%d'", (int)header->fuzzy_mode );
        }
    }

    return swish_words;
}

/* This is really dumb.  swline needs a ->prev entry, really search needs its own linked list */
/* */
/* Replaces one node of a list with another list of nodes, or just removes it. */
/* */
/*   original  - in/out: head of the list; updated when the first node changes */
/*   entry     - the node to take out; the node and its line are freed */
/*   new_words - list to splice in where entry was, or NULL to only delete entry */
/* */
/* The head node's nodep field points at the last node of the list.  It is */
/* kept valid here, also when entry was the last (or the only) node, so it */
/* never ends up pointing at the node that is freed below. */

static void  replace_swline( struct swline **original, struct swline *entry, struct swline *new_words )
{
    struct swline *head = *original;
    struct swline *prev;
    struct swline *new_tail = new_words ? new_words->nodep : NULL;  /* last node of the spliced-in list */


    /* check for case of first one */
    if ( head == entry )
    {
        if ( new_words )
        {
            new_tail->next = entry->next;

            /* When nodes follow entry, the list still ends where it did. */
            /* When entry was the only node, new_words->nodep is already */
            /* the right tail and must not be overwritten with entry. */
            if ( entry->next )
                new_words->nodep = entry->nodep;

            *original = new_words;
        }
        else /* just delete first node */
        {
            if ( entry->next )
                entry->next->nodep = entry->nodep; /* point next one to last one */
            *original = entry->next;
        }
    }


    else /* not first node */
    {
        /* search for the preceding node */
        for ( prev = head; prev && prev->next != entry; prev = prev->next )
            ;

        if ( !prev )
            progerr("Fatal Error: Failed to find insert point in replace_swline");

        if ( new_words )
        {
            /* set the previous record to point to the start of the new entry (or entries) */
            prev->next = new_words;

            /* set the end of the new string to point to the next entry */
            new_tail->next = entry->next;
        }
        else /* delete the entry */
        {
            prev->next = entry->next;
            new_tail = prev;
        }

        /* entry was the last node: move the head's tail pointer off of it */
        if ( !entry->next )
            head->nodep = new_tail;
    }


    /* now free the removed item */
    efree( entry->line );
    efree( entry );
}


/* Checks a query token against the buzzword list of the header. */
/* When buzzwords are in use the token is stripped in place of its */
/* ignore-last and ignore-first characters before the lookup. */
/* Returns 0 when buzzwords are not in use or nothing is left after */
/* stripping, otherwise whatever isbuzzword() returns. */

static int checkbuzzword(INDEXDATAHEADER *header, char *word )
{
    if ( header->buzzwords_used_flag )
    {
        /* only strip when buzzwords are being used since stripped again as a "swish word" */
        stripIgnoreLastChars( header, word );
        stripIgnoreFirstChars( header, word );

        /* anything left after stripping? */
        if ( *word )
            return isbuzzword( header, word );
    }

    return 0;
}

/* I hope this doesn't live too long */
/* */
/* Glues a "*" token back onto the word in front of it: entry->line gets a */
/* '*' appended and the node following entry (the wildcard) is unlinked and freed. */
/* */
/*   original - head of the list; its tail pointer is fixed up if the wildcard was last */
/*   entry    - node in front of the wildcard; entry->next must be the wildcard node */

static void fudge_wildcard( struct swline **original, struct swline *entry )
{
    struct swline *star = entry->next;
    char   *old_line = entry->line;
    size_t  len = strlen( old_line );
    char   *joined;

    /* build "<word>*" in a fresh string: old text + '*' + NUL */
    joined = emalloc( len + 2 );
    strcpy( joined, old_line );
    joined[len] = '*';
    joined[len + 1] = '\0';

    entry->line = joined;
    efree( old_line );

    /* removing last entry - set pointer to new end */
    if ( (*original)->nodep == star )
        (*original)->nodep = entry;

    /* and point next to the one after next */
    entry->next = star->next;

    /* the wildcard node is no longer part of the list */
    efree( star->line );
    efree( star );
}

    
/******************** Public Functions *********************************/

/* Maps a query word onto the matching Boolean operator word. */
/* The word is compared, ignoring case, with _AND_WORD, _OR_WORD and _NOT_WORD. */
/* Returns AND_WORD, OR_WORD or NOT_WORD respectively, or NULL if the word */
/* is not a Boolean operator. */

char *isBooleanOperatorWord( char * word )
{
    char   *op = (char *)NULL;

    /* strcasecmp isn't really needed here, since word should already be lowercase -- need to check alt-search first */
    if ( strcasecmp( word, _AND_WORD ) == 0 )
        op = AND_WORD;

    else if ( strcasecmp( word, _OR_WORD ) == 0 )
        op = OR_WORD;

    else if ( strcasecmp( word, _NOT_WORD ) == 0 )
        op = NOT_WORD;

    return op;
}


/* Turns a query string into a list of tokens ready for the search code. */
/* */
/* Pass 1 splits the string with next_token() into raw tokens. */
/* Pass 2 walks the tokens: */
/*   - a token followed by the metaname operator must be a known metaname and is lowercased */
/*   - single operator characters are left alone (the phrase delimiter toggles phrase state) */
/*   - Boolean operator words outside of phrases are left alone, except that */
/*     the "and" of an "and not" pair is removed */
/*   - buzzwords are left alone */
/*   - every other token is replaced by the swish words it parses into */
/* Pass 3 joins each "*" token onto the word in front of it. */
/* */
/*   sw     - search handle; receives lasterror on failure */
/*   words  - the query string */
/*   header - header supplying word limits, character tables and metanames */
/* */
/* Returns the token list; NULL if there are no tokens or on error.  On the */
/* error paths the partially built list is freed before returning. */

struct swline *tokenize_query_string( SWISH *sw, char *words, INDEXDATAHEADER *header )
{
    char   *curpos;               /* current position in the words string */
    struct  swline *tokens = NULL;
    struct  swline *temp;
    struct  swline *swish_words;
    struct  swline *next_node;
    struct  MOD_Swish_Words *self = sw->SwishWords;
    struct  MOD_Search *srch = sw->Search;
    unsigned char PhraseDelimiter;
    int     max_size;
    int     inphrase = 0;


    /* Probably won't get to this point */
    if ( !words || !*words )
    {
        sw->lasterror = NO_WORDS_IN_SEARCH;
        return NULL;
    }


    PhraseDelimiter = (unsigned char) srch->PhraseDelimiter;
    max_size        = header->maxwordlimit;

    curpos = words;

    /* split into words by whitespace and by the swish operator characters */

    while ( next_token( &curpos, &self->word, &self->lenword, PhraseDelimiter, inphrase ) )
    {
        tokens = (struct swline *) addswline( tokens, self->word );

        /* a token that is just the phrase delimiter opens or closes a phrase */
        if ( self->word[0] == PhraseDelimiter && !self->word[1] )
            inphrase = !inphrase;
    }


    /* no search words found */
    if ( !tokens )
        return tokens;


    inphrase = 0;

    temp = tokens;
    while ( temp )
    {

        /* do look-ahead processing first -- metanames */

        if ( !inphrase && isMetaNameOpNext(temp->next) )
        {

            if( !getMetaNameByName( header, temp->line ) )
            {
                set_progerr( UNKNOWN_METANAME, sw, "'%s'", temp->line );
                freeswline( tokens );
                return NULL;
            }


            /* this might be an option with XML */
            strtolower( temp->line );

            temp = temp->next;
            continue;
        }


        /* skip operators */
        if ( strlen( temp->line ) == 1 && isSearchOperatorChar( (unsigned char) temp->line[0], PhraseDelimiter, inphrase ) )
        {

            if ( temp->line[0] == PhraseDelimiter && !temp->line[1] )
                inphrase = !inphrase;

            temp = temp->next;
            continue;
        }

        /* this might be an option if case sensitive searches are used */
        strtolower( temp->line );


        /* check Boolean operators -- currently doesn't change it (search.c does) */
        if ( !inphrase )
        {
            char *op, *nextop;

            if ( (op = isBooleanOperatorWord( temp->line )) )
            {
                /* replace the common "and not" with simply "not" */
                /* probably not the best place to do this level of processing */
                /* since should also check for things like "and this" and "and and and not this" */
                /* should probably be moved to end and recursively check for these (to catch "and and not") */
                if (
                    temp->next
                    && ( strcmp( op, AND_WORD ) == 0)
                    && ( (nextop = isBooleanOperatorWord( temp->next->line)))
                    && ( strcmp( nextop, NOT_WORD ) == 0)
                ) {
                    struct swline *andword = temp; /* save position */

                    temp = temp->next;  /* now point to "not" word */
                    replace_swline( &tokens, andword, (struct swline *)NULL ); /* cut it out */
                    continue;
                }

                temp = temp->next;
                continue;
            }
        }


        /* buzzwords */
        if ( checkbuzzword( header, temp->line ) )
        {
            temp = temp->next;
            continue;
        }


        /* query words left.  Turn into "swish_words" */
        swish_words = parse_swish_words( sw, header, temp->line, max_size);

        if ( sw->lasterror )
        {
            /* Same clean-up as the unknown-metaname path above: the caller */
            /* only gets NULL back, so everything built so far has to be */
            /* released here or it is lost. */
            if ( swish_words )
                freeswline( swish_words );

            freeswline( tokens );
            return NULL;
        }


        /* remember the successor now -- replace_swline() frees temp */
        next_node = temp->next;

        /* move into list.c at some point */
        replace_swline( &tokens, temp, swish_words );
        temp = next_node;

    }

    /* fudge wild cards back onto preceding word */
    for ( temp = tokens ; temp; temp = temp->next )
        if ( temp->next && strcmp( temp->next->line, "*") == 0 )
            fudge_wildcard( &tokens, temp );


    return tokens;
}

1	adcroft	1.1	/*
2			$Id: swish_words.c,v 1.15 2002/08/22 22:58:39 whmoseley Exp $
3			**
4			** This program and library is free software; you can redistribute it and/or
5			** modify it under the terms of the GNU (Library) General Public License
6			** as published by the Free Software Foundation; either version 2
7			** of the License, or any later version.
8			**
9			** This program is distributed in the hope that it will be useful,
10			** but WITHOUT ANY WARRANTY; without even the implied warranty of
11			** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12			** GNU (Library) General Public License for more details.
13			**
14			** You should have received a copy of the GNU (Library) General Public License
15			** along with this program; if not, write to the Free Software
16			** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17			**
18			**
19			** 2001-05-23 moseley created - replaced parser in search.c
20			**
21			** 2001-12-11 moseley, updated to deal with swish operators inside of phrases
22			** Still broken with regard to double-quotes inside of phrases
23			** Very unlikely someone would want to search for a single double quote
24			** within a phrase. It currently works if the double-quotes doesn't have
25			** white space around. Really should tag the words as being operators, or
26			** or "swish words", or let the backslash stay in the query until searching.
27			**
28			*/
29
30			#include "swish.h"
31			#include "mem.h"
32			#include "string.h"
33			#include "search.h"
34			#include "index.h"
35			#include "file.h"
36			#include "list.h"
37			#include "hash.h"
38			#include "stemmer.h"
39			#include "soundex.h"
40			#include "double_metaphone.h"
41			#include "error.h"
42			#include "metanames.h"
43			#include "config.h" // for _AND_WORD...
44			#include "search_alt.h" // for AND_WORD... humm maybe needs better organization
45
46			struct MOD_Swish_Words
47			{
48			char *word;
49			int lenword;
50			};
51
52			/*
53			-- init structures for this module
54			*/
55
56			void initModule_Swish_Words (SWISH *sw)
57			{
58			struct MOD_Swish_Words *self;
59
60			self = (struct MOD_Swish_Words *) emalloc(sizeof(struct MOD_Swish_Words));
61			sw->SwishWords = self;
62
63			/* initialize buffers used by indexstring */
64			self->word = (char *) emalloc((self->lenword = MAXWORDLEN) + 1);
65
66			return;
67			}
68
69			void freeModule_Swish_Words (SWISH *sw)
70			{
71			struct MOD_Swish_Words *self = sw->SwishWords;
72
73			efree( self->word );
74			efree ( self );
75			sw->SwishWords = NULL;
76
77			return;
78			}
79
80
81
82
83
84			/* Returns true if the character is a search operator */
85			/* this could be a macro, but gcc is probably smart enough */
86
87			static int isSearchOperatorChar( int c, int phrase_delimiter, int inphrase )
88			{
89			return inphrase
90			? ( '*' == c \|\| c == phrase_delimiter )
91			: ( '(' == c \|\| ')' == c \|\| '=' == c \|\| '*' == c \|\| c == phrase_delimiter );
92			}
93
94
95			/* This simply tokenizes by whitespace and by the special characters "()=" */
96			/* If within a phrase, then just splits by whitespace */
97
98			/* Funny how argv was joined into a string just to be split again... */
99
100			static int next_token( char buf, char word, int *lenword, int phrase_delimiter, int inphrase )
101			{
102			int i;
103			int backslash;
104
105			**word = '\0';
106
107			/* skip any leading whitespace */
108			while ( buf && isspace( (unsigned char) buf) )
109			(*buf)++;
110
111
112			/* extract out word */
113			i = 0;
114			backslash = 0;
115
116			while ( buf && !isspace( (unsigned char) buf) )
117			{
118
119			// This should be looking at swish words, not raw input
120			//if ( i > max_size + 4 ) /* leave a little room for operators */
121			// progerr( "Search word exceeded maxwordlimit setting." );
122
123
124			/* reallocate buffer, if needed -- only if maxwordlimit was set larger than MAXWORDLEN (1000) */
125			if ( i == *lenword )
126			{
127			lenword = 2;
128			word = erealloc(word, *lenword + 1);
129			}
130
131
132			/* backslash says take next char as-is */
133			/* note that you cannot backslash whitespace */
134			if ( '\\' == **buf && ! backslash++ )
135			{
136			(*buf)++;
137			continue;
138			}
139
140
141			if ( backslash \|\| !isSearchOperatorChar( (unsigned char) **buf, phrase_delimiter, inphrase ) )
142			{
143			backslash = 0;
144
145			(word)[i++] = buf; / can't this be done in one line? */
146			(*buf)++;
147			}
148
149			else /* this is a search operator char */
150			{
151			if ( *word ) / break if characters already found - end of this token */
152			break;
153
154			(word)[i++] = buf; / save the search operator char as it's own token, and end. */
155			(*buf)++;
156			break;
157			}
158
159			}
160
161
162			/* flag if we found a token */
163			if ( i )
164			{
165			(*word)[i] = '\0';
166			return 1;
167			}
168
169			return 0;
170			}
171
172
173			static int next_swish_word(INDEXDATAHEADER header, char buf, char word, int lenword )
174			{
175			int i;
176
177			/* skip non-wordchars */
178			while ( buf && !header->wordcharslookuptable[tolower((unsigned char)(buf))] )
179			(*buf)++;
180
181			i = 0;
182			while ( buf && header->wordcharslookuptable[tolower((unsigned char)(buf))] )
183			{
184			/* reallocate buffer, if needed */
185			if ( i + 1 == *lenword )
186			{
187			lenword = 2;
188			word = erealloc(word, *lenword + 1);
189			}
190
191			(word)[i++] = *buf;
192			(*word)[i] = '\0';
193			(*buf)++;
194			}
195
196
197			if ( i )
198			{
199			stripIgnoreLastChars( header, *word);
200			stripIgnoreFirstChars(header, *word);
201
202
203			return **word ? 1 : 0;
204			}
205
206			return 0;
207			}
208
209
210			/* Convert a word into swish words */
211
212			static struct swline parse_swish_words( SWISH sw, INDEXDATAHEADER header, char word, int max_size )
213			{
214			struct swline *swish_words = NULL;
215			char *curpos;
216			struct MOD_Swish_Words *self = sw->SwishWords;
217
218
219
220			/* Some initial adjusting of the word */
221
222
223			TranslateChars(header->translatecharslookuptable, (unsigned char *)word);
224
225
226
227			curpos = word;
228			while( next_swish_word( header, &curpos, &self->word, &self->lenword ) )
229			{
230			/* Check Begin & EndCharacters */
231			if (!header->begincharslookuptable[(int) ((unsigned char) self->word[0])])
232			continue;
233
234
235			if (!header->endcharslookuptable[(int) ((unsigned char) self->word[strlen(self->word) - 1])])
236			continue;
237
238
239			/* limit by stopwords, min/max length, max number of digits, ... */
240			/* ------- processed elsewhere for search ---------
241			if (!isokword(sw, self->word, indexf))
242			continue;
243			- stopwords are processed in search.c because removing them may have side effects
244			- maxwordlen is checked when first tokenizing for security reasons
245			- limit by vowels, consonants and digits is not needed since search will just fail
246			----------- */
247			if ( strlen( self->word ) > max_size )
248			{
249			sw->lasterror = SEARCH_WORD_TOO_BIG;
250			return NULL;
251			}
252
253			if (!*self->word)
254			continue;
255
256			switch ( header->fuzzy_mode )
257			{
258			case FUZZY_NONE:
259			swish_words = (struct swline *) addswline( swish_words, self->word );
260			break;
261
262			case FUZZY_STEMMING:
263			Stem(&self->word, &self->lenword);
264			if ( *self->word ) // should not happen
265			swish_words = (struct swline *) addswline( swish_words, self->word );
266			break;
267
268
269			case FUZZY_SOUNDEX:
270			soundex(self->word);
271			if ( *self->word )
272			swish_words = (struct swline *) addswline( swish_words, self->word );
273			break;
274
275			case FUZZY_METAPHONE:
276			case FUZZY_DOUBLE_METAPHONE:
277			{
278			char *codes[2];
279			DoubleMetaphone(self->word, codes);
280
281			if ( !(*codes[0]) )
282			{
283			efree( codes[0] );
284			efree( codes[1] );
285			swish_words = (struct swline *) addswline( swish_words, self->word );
286			break;
287			}
288
289
290			/* check if just METAPHONE or only one word returned (e.g. they are the same) */
291
292			if ( header->fuzzy_mode == FUZZY_METAPHONE \|\| !(*codes[1]) \|\| !strcmp(codes[0], codes[1]) )
293			{
294			swish_words = (struct swline *) addswline( swish_words, codes[0] );
295			}
296			else
297			{
298			/* yuck! */
299			swish_words = (struct swline *) addswline( swish_words, "(" );
300			swish_words = (struct swline *) addswline( swish_words, codes[0] );
301			swish_words = (struct swline *) addswline( swish_words, "or" );
302			swish_words = (struct swline *) addswline( swish_words, codes[1] );
303			swish_words = (struct swline *) addswline( swish_words, ")" );
304			}
305
306			efree( codes[0] );
307			efree( codes[1] );
308			}
309			break;
310
311
312			default:
313			progerr("Invalid FuzzyMode '%d'", (int)header->fuzzy_mode );
314			}
315			}
316
317			return swish_words;
318
319
320			}
321
322			/* This is really dumb. swline needs a ->prev entry, really search needs its own linked list */
323			/* Replaces a given node with another node (or nodes) */
324
325			static void replace_swline( struct swline *original, struct swline entry, struct swline *new_words )
326			{
327			struct swline *temp;
328
329
330			temp = *original;
331
332
333			/* check for case of first one */
334			if ( temp == entry )
335			{
336			if ( new_words )
337			{
338			new_words->nodep->next = temp->next;
339			new_words->nodep = temp->nodep;
340			*original = new_words;
341			}
342			else /* just delete first node */
343			{
344			if ( entry->next )
345			entry->next->nodep = entry->nodep; /* point next one to last one */
346			*original = entry->next;
347			}
348			}
349
350
351
352			else /* not first node */
353			{
354			/* search for the preceeding node */
355			for ( temp = *original; temp && temp->next != entry; temp = temp->next );
356
357			if ( !temp )
358			progerr("Fatal Error: Failed to find insert point in replace_swline");
359
360			if ( new_words )
361			{
362			/* set the previous record to point to the start of the new entry (or entries) */
363			temp->next = new_words;
364
365			/* set the end of the new string to point to the next entry */
366			new_words->nodep->next = entry->next;
367			}
368			else /* delete the entry */
369			temp->next = temp->next->next;
370			}
371
372
373			/* now free the removed item */
374			efree( entry->line );
375			efree( entry );
376
377
378			}
379
380
381			static int checkbuzzword(INDEXDATAHEADER header, char word )
382			{
383			if ( !header->buzzwords_used_flag )
384			return 0;
385
386
387			/* only strip when buzzwords are being used since stripped again as a "swish word" */
388			stripIgnoreLastChars( header, word );
389			stripIgnoreFirstChars( header, word );
390
391			if ( !word ) / stripped clean? */
392			return 0;
393
394
395			return isbuzzword( header, word);
396			}
397
398			/* I hope this doesn't live too long */
399
400			static void fudge_wildcard( struct swline *original, struct swline entry )
401			{
402			char *tmp;
403			struct swline *wild_card;
404
405			wild_card = entry->next;
406
407			/* reallocate a string */
408			tmp = entry->line;
409			entry->line = emalloc( strlen( entry->line ) + 2 );
410			strcpy( entry->line, tmp);
411			efree( tmp );
412			strcat( entry->line, "*");
413
414			efree( wild_card->line );
415
416
417			/* removing last entry - set pointer to new end */
418			if ( (*original)->nodep == wild_card )
419			(*original)->nodep = entry;
420
421			/* and point next to the one after next */
422			entry->next = wild_card->next;
423
424
425			efree( wild_card );
426			}
427
428
429
430			/****************** Public Functions *******************************/
431
432			char isBooleanOperatorWord( char word )
433			{
434			/* don't need strcasecmp here, since word should alrady be lowercase -- need to check alt-search first */
435			if (!strcasecmp( word, _AND_WORD))
436			return AND_WORD;
437
438			if (!strcasecmp( word, _OR_WORD))
439			return OR_WORD;
440
441			if (!strcasecmp( word, _NOT_WORD))
442			return NOT_WORD;
443
444			return (char *)NULL;
445			}
446
447
448
449			struct swline tokenize_query_string( SWISH sw, char words, INDEXDATAHEADER header )
450			{
451			char curpos; / current position in the words string */
452			struct swline *tokens = NULL;
453			struct swline *temp;
454			struct swline *swish_words;
455			struct swline *next_node;
456			struct MOD_Swish_Words *self = sw->SwishWords;
457			struct MOD_Search *srch = sw->Search;
458			unsigned char PhraseDelimiter;
459			int max_size;
460			int inphrase = 0;
461
462
463			/* Probably won't get to this point */
464			if ( !words \|\| !*words )
465			{
466			sw->lasterror = NO_WORDS_IN_SEARCH;
467			return NULL;
468			}
469
470
471			PhraseDelimiter = (unsigned char) srch->PhraseDelimiter;
472			max_size = header->maxwordlimit;
473
474			curpos = words;
475
476			/* split into words by whitespace and by the swish operator characters */
477
478			while ( next_token( &curpos, &self->word, &self->lenword, PhraseDelimiter, inphrase ) )
479			{
480			tokens = (struct swline *) addswline( tokens, self->word );
481
482
483			if ( self->word[0] == PhraseDelimiter && !self->word[1] )
484			inphrase = !inphrase;
485			}
486
487
488			/* no search words found */
489			if ( !tokens )
490			return tokens;
491
492
493			inphrase = 0;
494
495			temp = tokens;
496			while ( temp )
497			{
498
499			/* do look-ahead processing first -- metanames */
500
501			if ( !inphrase && isMetaNameOpNext(temp->next) )
502			{
503
504			if( !getMetaNameByName( header, temp->line ) )
505			{
506			set_progerr( UNKNOWN_METANAME, sw, "'%s'", temp->line );
507			freeswline( tokens );
508			return NULL;
509			}
510
511
512			/* this might be an option with XML */
513			strtolower( temp->line );
514
515			temp = temp->next;
516			continue;
517			}
518
519
520			/* skip operators */
521			if ( strlen( temp->line ) == 1 && isSearchOperatorChar( (unsigned char) temp->line[0], PhraseDelimiter, inphrase ) )
522			{
523
524			if ( temp->line[0] == PhraseDelimiter && !temp->line[1] )
525			inphrase = !inphrase;
526
527			temp = temp->next;
528			continue;
529			}
530
531			/* this might be an option if case sensitive searches are used */
532			strtolower( temp->line );
533
534
535			/* check Boolean operators -- currently doesn't change it (search.c does) */
536			if ( !inphrase )
537			{
538			char operator, nextoperator;
539
540			if ( (operator = isBooleanOperatorWord( temp->line )) )
541			{
542			/* replace the common "and not" with simply not" */
543			/* probably not the best place to do this level of processing */
544			/* since should also check for things like "and this" and "and and and not this" */
545			/* should probably be moved to end and recursively check for these (to catch "and and not") */
546			if (
547			temp->next
548			&& ( strcmp( operator, AND_WORD ) == 0)
549			&& ( (nextoperator = isBooleanOperatorWord( temp->next->line)))
550			&& ( strcmp( nextoperator, NOT_WORD ) == 0)
551			) {
552			struct swline andword = temp; / save position */
553
554			temp = temp->next; /* now point to "not" word */
555			replace_swline( &tokens, andword, (struct swline )NULL ); / cut it out */
556			continue;
557			}
558
559			temp = temp->next;
560			continue;
561			}
562			}
563
564
565			/* buzzwords */
566			if ( checkbuzzword( header, temp->line ) )
567			{
568			temp = temp->next;
569			continue;
570			}
571
572
573
574			/* query words left. Turn into "swish_words" */
575			swish_words = NULL;
576			swish_words = parse_swish_words( sw, header, temp->line, max_size);
577
578			if ( sw->lasterror )
579			return NULL;
580
581
582			next_node = temp->next;
583
584			/* move into list.c at some point */
585			replace_swline( &tokens, temp, swish_words );
586			temp = next_node;
587
588			}
589
590			/* fudge wild cards back onto preceeding word */
591			for ( temp = tokens ; temp; temp = temp->next )
592			if ( temp->next && strcmp( temp->next->line, "*") == 0 )
593			fudge_wildcard( &tokens, temp );
594
595
596			return tokens;
597			}
598