/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/search.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/search.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     $Id: search.c,v 1.104 2002/08/22 22:58:39 whmoseley Exp $
3     **
4     ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5     ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6     **
7     ** This program and library is free software; you can redistribute it and/or
8     ** modify it under the terms of the GNU (Library) General Public License
9     ** as published by the Free Software Foundation; either version 2
10     ** of the License, or any later version.
11     **
12     ** This program is distributed in the hope that it will be useful,
13     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15     ** GNU (Library) General Public License for more details.
16     **
17     ** You should have received a copy of the GNU (Library) General Public License
18     ** along with this program; if not, write to the Free Software
19     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20     **-----------------------------------------------------------------
21     ** Changes in expandstar and parseterm to fix the wildcard * problem.
22     ** G. Hill, ghill@library.berkeley.edu 3/11/97
23     **
24     ** Changes in notresultlist, parseterm, and fixnot to fix the NOT problem
25     ** G. Hill, ghill@library.berkeley.edu 3/13/97
26     **
27     ** Changes in search, parseterm, fixnot, operate, getfileinfo
28     ** to support METADATA
29     ** G. Hill 3/18/97 ghill@library.berkeley.edu
30     **
31     ** Change in search to allow for search with a list including
32     ** also some empty indexes.
33     ** G. Hill after a suggestion by J. Winstead 12/18/97
34     **
35     ** Created countResults for number of hits in search
36     ** G. Hill 12/18/97
37     **
38     **
39     ** Change in search to allow maxhits to return N number
40     ** of results for each index specified
41     ** D. Norris after suggestion by D. Chrisment 08/29/99
42     **
43     ** Created resultmaxhits as a global, renewable maxhits
44     ** D. Norris 08/29/99
45     **
46     ** added word length arg to Stem() call for strcat overflow checking in stemmer.c
47     ** added safestrcpy() macro to avoid corruption from strcpy overflow
48     ** SRE 11/17/99
49     **
50     ** 10/10/99 & 11/23/99 - Bill Moseley (merged by SRE)
51     ** - Changed to stem words *before* expanding with expandstar
52     ** so can find words in the index
53     ** - Moved META tag check before expandstar so META names don't get
54     ** expanded!
55     **
56     ** fixed cast to int problems pointed out by "gcc -Wall"
57     ** SRE 2/22/00
58     **
59     ** fixed search() for case where stopword is followed by rule:
60     ** stopword was removed, rule was left, no matches ever found
61     ** added "# Stopwords removed:" to output header so caller can
62     ** trap actions of IGNORE_STOPWORDS_IN_QUERY
63     ** SRE 2/25/00
64     **
65     ** 04/00 - Jose Ruiz
66     ** Added code for phrase search
67     ** - New function phraseresultlists
68     ** - New function expandphrase
69     **
70     ** 04/00 - Jose Ruiz
71     ** Added freeresult function for freing results memory
72     ** Also added changes to orresultlists andresultlists notresultlist
73     ** for freing memory
74     **
75     ** 04/00 - Jose Ruiz
76     ** Now use bighash instead of hash for better performance in
77     ** orresultlist (a* or b*). Also changed hash.c
78     **
79     ** 04/00 - Jose Ruiz
80     ** Function getfileinfo rewrite
81     ** - Now use a hash approach for faster searching
82     ** - Solves the long timed searches (a* or b* or c*)
83     **
84     ** 04/00 - Jose Ruiz
85     ** Ordering of result rewrite
86     ** Now builtin C function qsort is used for faster ordering of results
87     ** This is useful when lots of results are found
88     ** For example: not (axf) -> This gives you all the documents!!
89     **
90     ** 06/00 - Jose Ruiz
91     ** Rewrite of andresultlits and phraseresultlists for better permonace
92     ** New function notresultlits for better performance
93     **
94     ** 07/00 and 08/00 - Jose Ruiz
95     ** Many modifications to make all search functions thread safe
96     **
97     ** 08/00 - Added ascending and descending capabilities in results sorting
98     **
99     ** 2001-02-xx rasc search call changed, tolower changed...
100     ** 2001-03-03 rasc altavista search, translatechar in headers
101     ** 2001-03-13 rasc definable logical operators via sw->SearchAlt->srch_op.[and|or|nor]
102     ** bugfix in parse_search_string handling...
103     ** 2001-03-14 rasc resultHeaderOutput -H <n>
104     **
105     ** 2001-05-23 moseley - replace parse_search_string with new parser
106     **
107     */
108    
109     #include "swish.h"
110     #include "mem.h"
111     #include "string.h"
112     #include "search.h"
113     #include "index.h"
114     #include "file.h"
115     #include "list.h"
116     #include "merge.h"
117     #include "hash.h"
118     #include "docprop.h"
119     #include "error.h"
120     #include "compress.h"
121     /* removed stuff
122     #include "deflate.h"
123     */
124     #include "metanames.h"
125     #include "result_sort.h"
126     #include "result_output.h"
127     #include "search_alt.h"
128     #include "db.h"
129     #include "swish_words.h"
130     #include "swish_qsort.h"
131    
132     #include "proplimit.h"
133    
134     #include "rank.h"
135    
136    
137     /*** ??? $$$ fix this ****/
138     #define TOTAL_WORDS_FIX 1;
139     /*
140     -- init structures for this module
141     */
142    
143     void initModule_Search (SWISH *sw)
144    
145     {
146     struct MOD_Search *srch;
147    
148     srch = (struct MOD_Search *) emalloc(sizeof(struct MOD_Search));
149     sw->Search = srch;
150    
151    
152     /* Default variables for search */
153     srch->maxhits = -1;
154     srch->beginhits = 0;
155    
156     srch->currentMaxPropertiesToDisplay = 0;
157     srch->numPropertiesToDisplay = 0;
158    
159     srch->db_results = NULL;
160    
161     srch->propNameToDisplay =NULL;
162    
163     srch->PhraseDelimiter=PHRASE_DELIMITER_CHAR;
164    
165     srch->bigrank = 0;
166    
167     srch->resultSearchZone = Mem_ZoneCreate("resultSearch Zone", 0, 0);
168    
169     return;
170     }
171    
172     /* Resets memory of vars used by ResultSortt properties configuration */
173     void resetModule_Search (SWISH *sw)
174    
175     {
176     struct DB_RESULTS *tmp;
177     struct MOD_Search *srch = sw->Search;
178     int i;
179     IndexFILE *tmpindexlist;
180    
181    
182    
183     /* Default variables for search */
184     srch->maxhits = -1;
185     srch->beginhits = 0;
186    
187     /* Free results from search if they exists */
188    
189     while ( srch->db_results )
190     {
191     tmp = srch->db_results->next;
192     freeresultlist(sw,srch->db_results);
193     efree(srch->db_results);
194     srch->db_results = tmp;
195     }
196    
197    
198    
199    
200    
201     Mem_ZoneReset(srch->resultSearchZone);
202    
203    
204     /* Free display props arrays */
205     /* First the common part to all the index files */
206     if (srch->propNameToDisplay)
207     {
208     for(i=0;i<srch->numPropertiesToDisplay;i++)
209     efree(srch->propNameToDisplay[i]);
210    
211     efree(srch->propNameToDisplay);
212     }
213    
214     srch->propNameToDisplay=NULL;
215     srch->numPropertiesToDisplay=0;
216     srch->currentMaxPropertiesToDisplay=0;
217    
218     /* Now the IDs of each index file */
219     for(tmpindexlist=sw->indexlist;tmpindexlist;tmpindexlist=tmpindexlist->next)
220     {
221     if (tmpindexlist->propIDToDisplay)
222     efree(tmpindexlist->propIDToDisplay);
223    
224     tmpindexlist->propIDToDisplay=NULL;
225     }
226     }
227    
228     /*
229     -- release all wired memory for this module
230     */
231    
232     void freeModule_Search (SWISH *sw)
233    
234     {
235     struct MOD_Search *srch = sw->Search;
236    
237     /* free module data */
238     Mem_ZoneFree(&srch->resultSearchZone);
239    
240     efree (srch);
241     sw->Search = NULL;
242    
243     return;
244     }
245    
246     // #define DUMP_RESULTS 1
247    
248    
249     #ifdef DUMP_RESULTS
250    
251     void dump_result_lists( SWISH *sw, char *message )
252    
253     {
254     struct DB_RESULTS *db_results = sw->Search->db_results;
255     int cnt = 0;
256    
257     printf("\nDump Results: (%s)\n", message );
258    
259     while ( db_results )
260     {
261     RESULT *result;
262     printf("Looking at index\n");
263     if ( !db_results->resultlist )
264     {
265     printf("no resultlist\n");
266     continue;
267     }
268     result = db_results->resultlist->head;
269    
270     if ( !result )
271     {
272     printf("resultlist, but head is null\n");
273     continue;
274     }
275     while ( result )
276     {
277     printf("Result (%2d): filenum '%d' from index file '%s'\n", ++cnt, result->filenum, result->indexf->line );
278     result = result->next;
279     }
280    
281     printf("end of results for index\n");
282    
283     db_results = db_results->next;
284     }
285     printf("end of results\n\n");
286     }
287     #endif
288    
289    
290    
291    
292     /*
293     ** ----------------------------------------------
294     **
295     ** Module config code starts here
296     **
297     ** ----------------------------------------------
298     */
299    
300    
301     /*
302     -- Config Directives
303     -- Configuration directives for this Module
304     -- return: 0/1 = none/config applied
305     */
306    
307     int configModule_Search (SWISH *sw, StringList *sl)
308    
309     {
310     //struct MOD_Search *srch = sw->Search;
311     //char *w0 = sl->word[0];
312     int retval = 0;
313    
314     return retval;
315     }
316    
317    
318     /* Compare two integers */
319     /* This routine is used by qsort */
320     int icomp(const void *s1, const void *s2)
321     {
322     return (*(int *) s1 - *(int *) s2);
323     }
324    
325     /* Compare two positions as stored in posdata */
326     /* This routine is used by qsort */
327     int icomp_posdata(const void *s1, const void *s2)
328     {
329     return (GET_POSITION(*(int *) s1) - GET_POSITION(*(int *) s2));
330     }
331    
332     /* 01/2001 Jose Ruiz */
333     /* Compare RESULTS using RANK */
334     /* This routine is used by qsort */
335     int compResultsByFileNum(const void *s1, const void *s2)
336     {
337     return ((*(RESULT * const *) s1)->filenum - (*(RESULT * const *) s2)->filenum);
338     }
339    
340     /* 04/00 Jose Ruiz */
341     /* Simple routing for comparing pointers to integers in order to
342     get an ascending sort with qsort */
343     /* Identical to previous one but use two integers per array */
344     int icomp2(const void *s1, const void *s2)
345     {
346     int rc,
347     *p1,
348     *p2;
349    
350     rc = (*(int *) s1 - *(int *) s2);
351     if (rc)
352     return (rc);
353     else
354     {
355     p1 = (int *) s1;
356     p2 = (int *) s2;
357     return (*(++p1) - *(++p2));
358     }
359     }
360    
361    
362    
363     /*
364     -- Search Swish
365     -- Check if AltaVista/Lycos/Web.de like search string has to be converted
366     -- and call swish search...
367     -- 2001-03-02 rasc
368     */
369    
370     int search(SWISH * sw, char *words, int structure)
371     {
372     char *sw_srch_str;
373     int ret;
374    
375    
376     if (sw->SearchAlt->enableAltSearchSyntax)
377     { /* AltaVista like search enabled? */
378     sw_srch_str = convAltSearch2SwishStr(words);
379     ret = search_2(sw, sw_srch_str, structure);
380     efree(sw_srch_str);
381     }
382     else
383     {
384     ret = search_2(sw, words, structure);
385     }
386     return ret;
387     }
388    
389    
390     static void limit_result_list( SWISH *sw )
391     {
392     RESULT *rtmp;
393     RESULT *rp;
394     RESULT *last;
395     struct DB_RESULTS *db_results = sw->Search->db_results;
396    
397    
398     /* Process each index */
399     while ( db_results )
400     {
401     if(db_results->resultlist)
402     rp = db_results->resultlist->head;
403     else
404     rp = NULL;
405    
406     last = NULL;
407    
408     while (rp)
409     {
410     rtmp = rp->next;
411    
412     if ( LimitByProperty( sw, rp->indexf, rp->filenum ) )
413     {
414     freeresult( sw, rp );
415    
416     if ( !last ) /* if first in list */
417     db_results->resultlist->head = rtmp;
418     else
419     last->next = rtmp;
420     }
421     else
422     last = rp; /* move the last pointer to current one */
423    
424     rp = rtmp;
425     }
426    
427     db_results = db_results->next;
428    
429     }
430     }
431    
432    
433    
434    
435     /********************************************************************************
436     * Returns a negative number on error, zero or positive = number of results
437     *
438     ********************************************************************************/
439    
440     int search_2(SWISH * sw, char *words, int structure)
441     {
442     int j,
443     k,
444     hassearch,
445     indexYes,
446     totalResults;
447     struct swline *searchwordlist,
448     *tmplist2;
449     IndexFILE *indexlist;
450     int rc = 0;
451     unsigned char PhraseDelimiter;
452     struct DB_RESULTS *db_results,
453     *db_tmp;
454     struct MOD_Search *srch = sw->Search;
455    
456    
457     /* If not words - do nothing */
458     if (!words || !*words)
459     return (sw->lasterror = NO_WORDS_IN_SEARCH);
460    
461     PhraseDelimiter = (unsigned char) srch->PhraseDelimiter;
462    
463     indexlist = sw->indexlist;
464     sw->Search->db_results = NULL;
465    
466    
467     j = 0;
468     searchwordlist = NULL;
469     indexYes = 0;
470     totalResults = 0;
471     hassearch = 0;
472    
473     sw->lasterror = RC_OK;
474     sw->commonerror = RC_OK;
475    
476     if ((rc = initSearchResultProperties(sw)))
477     return rc;
478     if ((rc = initSortResultProperties(sw)))
479     return rc;
480    
481    
482     /* This returns false when no files found within the limit */
483     if ( !Prepare_PropLookup( sw ) )
484     return sw->lasterror; /* normally RC_OK (no results), but could be an error */
485    
486     while (indexlist != NULL)
487     {
488     /* moved these inside the loop Nov 24,2001. Need to verify */
489     sw->lasterror = RC_OK;
490     sw->commonerror = RC_OK;
491    
492    
493     /* Clean up from previous loop */
494     if (searchwordlist)
495     {
496     freeswline(searchwordlist);
497     searchwordlist = NULL;
498     }
499    
500    
501    
502     /* tokenize the query into swish words based on this current index */
503    
504     if (!(searchwordlist = tokenize_query_string(sw, words, &indexlist->header)))
505     {
506     indexlist = indexlist->next;
507     if ( sw->lasterror )
508     return sw->lasterror;
509     continue;
510     }
511    
512    
513     hassearch = 1; // Flag that we found some words to search form
514    
515     srch->bigrank = 0;
516    
517    
518     /* Is there an open index? */
519     if (!indexlist->DB)
520     {
521     if (searchwordlist)
522     freeswline(searchwordlist); /* 2001-03-13 rasc */
523     return sw->lasterror;
524     }
525    
526    
527     /* Any files in the index? */
528    
529     if (!indexlist->header.totalfiles)
530     {
531     indexlist = indexlist->next;
532     continue;
533     }
534     else
535     {
536     indexYes = 1; /*There is a non-empty index */
537     }
538    
539    
540     resultHeaderOut(sw, 2, "#\n# Index File: %s\n", indexlist->line);
541    
542     searchwordlist = (struct swline *) ignore_words_in_query(sw, indexlist, searchwordlist, PhraseDelimiter);
543     if ( sw->lasterror )
544     {
545     if (searchwordlist)
546     freeswline(searchwordlist);
547     return sw->lasterror;
548     }
549    
550    
551     /* Echo index file, fixed search, stopwords */
552    
553     /* Result Header Output (2001-03-14 rasc, rewritten) */
554    
555     resultPrintHeader(sw, 2, &indexlist->header, indexlist->header.savedasheader, 0);
556    
557    
558     resultHeaderOut(sw, 3, "# StopWords:");
559     for (k = 0; k < indexlist->header.stopPos; k++)
560     resultHeaderOut(sw, 3, " %s", indexlist->header.stopList[k]);
561    
562     resultHeaderOut(sw, 3, "\n");
563    
564    
565    
566     printheaderbuzzwords(sw,indexlist);
567    
568    
569     resultHeaderOut(sw, 2, "# Search Words: %s\n", words);
570    
571     tmplist2 = searchwordlist;
572    
573     if ( !tmplist2 )
574     sw->commonerror++; // Flag at the query (with this index) did no have any search words
575     else
576     {
577     resultHeaderOut(sw, 2, "# Parsed Words: ");
578    
579     while (tmplist2)
580     {
581     resultHeaderOut(sw, 2, "%s ", tmplist2->line);
582     tmplist2 = tmplist2->next;
583     }
584    
585     resultHeaderOut(sw, 2, "\n");
586     }
587    
588    
589     /* Expand phrase search: "kim harlow" becomes (kim PHRASE_WORD harlow) */
590     searchwordlist = (struct swline *) expandphrase(searchwordlist, PhraseDelimiter);
591     searchwordlist = (struct swline *) fixmetanames(searchwordlist);
592     searchwordlist = (struct swline *) fixnot1(searchwordlist);
593     searchwordlist = (struct swline *) fixnot2(searchwordlist);
594    
595     /* Allocate memory for the result list structure */
596     db_results = (struct DB_RESULTS *) emalloc(sizeof(struct DB_RESULTS));
597     memset( db_results, 0, sizeof(struct DB_RESULTS));
598    
599    
600     /* Now do the search */
601    
602     if (searchwordlist)
603     {
604     int metaID = 1; /* Default meta ID to search */
605    
606     tmplist2 = searchwordlist;
607     db_results->resultlist = (RESULT_LIST *) parseterm(sw, 0, metaID, structure, indexlist, &tmplist2);
608     }
609    
610    
611     /* add db_results to the list of results */
612    
613     if ( !sw->Search->db_results)
614     sw->Search->db_results = db_results;
615     else
616     {
617     db_tmp = sw->Search->db_results;
618     while (db_tmp)
619     {
620     if (!db_tmp->next)
621     {
622     db_tmp->next = db_results;
623     break;
624     }
625     db_tmp = db_tmp->next;
626     }
627    
628     }
629    
630    
631     indexlist = indexlist->next;
632    
633     } /* end process each index file */
634    
635    
636     /* Did any of the indexes have a query? */
637    
638     if (!hassearch)
639     return (sw->lasterror = NO_WORDS_IN_SEARCH);
640    
641    
642    
643     if (searchwordlist)
644     {
645     freeswline(searchwordlist);
646     searchwordlist = NULL;
647     }
648    
649    
650     /* Limit result list by -L parameter */
651     // note -- this used to be in addtoresultlist, but that was checking every word
652     // placing here means that it only checks for each file, instead of each word.
653    
654     if ( is_prop_limit_used( sw ) )
655     limit_result_list( sw );
656    
657     #ifdef DUMP_RESULTS
658     dump_result_lists( sw, "After search" );
659     #endif
660    
661    
662     /* 04/00 Jose Ruiz - Sort results by rank or by properties */
663    
664     totalResults = sortresults(sw, structure);
665    
666    
667     if (!totalResults && sw->commonerror)
668     return (sw->lasterror = WORDS_TOO_COMMON);
669    
670     if (!totalResults && !indexYes)
671     return (sw->lasterror = INDEX_FILE_IS_EMPTY);
672    
673     return totalResults;
674     }
675    
676     /* 2001-09 jmruiz - Rewriting
677     ** This puts parentheses in the right places around meta searches
678     ** to avoid problems whith them. Basically "metaname = bla"
679     ** becomes "(metanames = bla)" */
680    
681     struct swline *fixmetanames(struct swline *sp)
682     {
683     int metapar;
684     struct swline *tmpp,
685     *newp;
686    
687     tmpp = sp;
688     newp = NULL;
689    
690     /* Fix metanames with parenthesys eg: metaname = bla => (metanames = bla) */
691     while (tmpp != NULL)
692     {
693     if (isMetaNameOpNext(tmpp->next))
694     {
695     /* If it is a metaName add the name and = and skip to next */
696     newp = (struct swline *) addswline(newp, "(");
697     newp = (struct swline *) addswline(newp, tmpp->line);
698     newp = (struct swline *) addswline(newp, "=");
699     tmpp = tmpp->next;
700     tmpp = tmpp->next;
701     if ( !tmpp )
702     return NULL;
703    
704     /* 06/00 Jose Ruiz
705     ** Fix to consider parenthesys in the
706     ** content of a MetaName */
707     if (tmpp->line[0] == '(')
708     {
709     metapar = 1;
710     newp = (struct swline *) addswline(newp, tmpp->line);
711     tmpp = tmpp->next;
712     while (metapar && tmpp)
713     {
714     if (tmpp->line[0] == '(')
715     metapar++;
716     else if (tmpp->line[0] == ')')
717     metapar--;
718     newp = (struct swline *) addswline(newp, tmpp->line);
719     if (metapar)
720     tmpp = tmpp->next;
721     }
722     if (!tmpp)
723     return (newp);
724     }
725     else
726     newp = (struct swline *) addswline(newp, tmpp->line);
727     newp = (struct swline *) addswline(newp, ")");
728     }
729     else
730     newp = (struct swline *) addswline(newp, tmpp->line);
731     /* next one */
732     tmpp = tmpp->next;
733     }
734    
735     freeswline(sp);
736     return newp;
737     }
738    
739     /* 2001 -09 jmruiz Rewritten
740     ** This optimizes some NOT operator to be faster.
741     **
742     ** "word1 not word" is changed by "word1 and_not word2"
743     **
744     ** In the old way the previous query was...
745     ** get results if word1
746     ** get results of word2
747     ** not results of word2 (If we have 100000 docs and word2 is in
748     ** just 3 docs, this means read 99997
749     ** results)
750     ** intersect both list of results
751     **
752     ** The "new way"
753     ** get results if word1
754     ** get results of word2
755     ** intersect (and_not_rule) both lists of results
756     **
757     */
758    
759     struct swline *fixnot1(struct swline *sp)
760     {
761     struct swline *tmpp,
762     *prev;
763    
764     if (!sp)
765     return NULL;
766     /* 06/00 Jose Ruiz - Check if first word is NOT_RULE */
767     /* Change remaining NOT by AND_NOT_RULE */
768     for (tmpp = sp, prev = NULL; tmpp; prev = tmpp, tmpp = tmpp->next)
769     {
770     if (tmpp->line[0] == '(')
771     continue;
772     else if (isnotrule(tmpp->line))
773     {
774     if(prev && prev->line[0]!='=' && prev->line[0]!='(')
775     {
776     efree(tmpp->line);
777     tmpp->line = estrdup(AND_NOT_WORD);
778     }
779     }
780     }
781     return sp;
782     }
783    
784     /* 2001 -09 jmruiz - Totally new - Fix the meta=(not ahsg) bug
785     ** Add parentheses to avoid the way operator NOT confuse complex queries */
786     struct swline *fixnot2(struct swline *sp)
787     {
788     int openparen, found;
789     struct swline *tmpp, *newp;
790     char *magic = "<__not__>"; /* magic avoids parsing the
791     ** "not" operator twice
792     ** and put the code in an
793     ** endless loop */
794    
795     found = 1;
796     while(found)
797     {
798     openparen = 0;
799     found = 0;
800    
801     for (tmpp = sp , newp = NULL; tmpp ; tmpp = tmpp->next)
802     {
803     if (isnotrule(tmpp->line))
804     {
805     found = 1;
806     /* Add parentheses */
807     newp = (struct swline *) addswline(newp, "(");
808     /* Change "NOT" by magic to avoid find it in next iteration */
809     newp = (struct swline *) addswline(newp, magic);
810     for(tmpp = tmpp->next; tmpp; tmpp = tmpp->next)
811     {
812     if ((tmpp->line)[0] == '(')
813     openparen++;
814     else if(!openparen)
815     {
816     newp = (struct swline *) addswline(newp, tmpp->line);
817     /* Add parentheses */
818     newp = (struct swline *) addswline(newp, ")");
819     break;
820     }
821     else if ((tmpp->line)[0] == ')')
822     openparen--;
823     newp = (struct swline *) addswline(newp, tmpp->line);
824     }
825     if(!tmpp)
826     break;
827     }
828     else
829     newp = (struct swline *) addswline(newp, tmpp->line);
830     }
831     freeswline(sp);
832     sp = newp;
833     }
834    
835     /* remove magic and put the "real" NOT in place */
836     for(tmpp = newp; tmpp ; tmpp = tmpp->next)
837     {
838     if(!strcmp(tmpp->line,magic))
839     {
840     efree(tmpp->line);
841     tmpp->line = estrdup(NOT_WORD);
842     }
843     }
844    
845     return newp;
846     }
847    
848     /* expandstar removed - Jose Ruiz 04/00 */
849    
850     /* Expands phrase search. Berkeley University becomes Berkeley PHRASE_WORD University */
851     /* It also fixes the and, not or problem when they appeared inside a phrase */
852     struct swline *expandphrase(struct swline *sp, char delimiter)
853     {
854     struct swline *tmp,
855     *newp;
856     int inphrase;
857    
858     if (!sp)
859     return NULL;
860     inphrase = 0;
861     newp = NULL;
862     tmp = sp;
863     while (tmp != NULL)
864     {
865     if ((tmp->line)[0] == delimiter)
866     {
867     if (inphrase)
868     {
869     inphrase = 0;
870     newp = (struct swline *) addswline(newp, ")");
871     }
872     else
873     {
874     inphrase++;
875     newp = (struct swline *) addswline(newp, "(");
876     }
877     }
878     else
879     {
880     if (inphrase)
881     {
882     if (inphrase > 1)
883     newp = (struct swline *) addswline(newp, PHRASE_WORD);
884     inphrase++;
885     newp = (struct swline *) addswline(newp, tmp->line);
886     }
887     else
888     {
889     char *operator;
890    
891     if ( ( operator = isBooleanOperatorWord( tmp->line )) )
892     newp = (struct swline *) addswline(newp, operator);
893     else
894     newp = (struct swline *) addswline(newp, tmp->line);
895     }
896     }
897     tmp = tmp->next;
898     }
899     freeswline(sp);
900     return newp;
901     }
902    
903    
904    
905     /* Print the buzzwords */
906     void printheaderbuzzwords(SWISH *sw, IndexFILE * indexf)
907     {
908     int hashval;
909     struct swline *sp = NULL;
910    
911     resultHeaderOut(sw, 3, "# BuzzWords:");
912    
913     for (hashval = 0; hashval < HASHSIZE; hashval++)
914     {
915     sp = indexf->header.hashbuzzwordlist[hashval];
916     while (sp != NULL)
917     {
918     resultHeaderOut(sw, 3, " %s", sp->line);
919     sp = sp->next;
920     }
921     }
922     resultHeaderOut(sw, 3, "\n");
923     }
924    
925    
926    
927    
928    
929    
930     /* The recursive parsing function.
931     ** This was a headache to make but ended up being surprisingly easy. :)
932     ** parseone tells the function to only operate on one word or term.
933     ** parseone is needed so that with metaA=foo bar "bar" is searched
934     ** with the default metaname.
935     */
936    
937     RESULT_LIST *parseterm(SWISH * sw, int parseone, int metaID, int structure, IndexFILE * indexf, struct swline **searchwordlist)
938     {
939     int rulenum;
940     char *word;
941     int lenword;
942     RESULT_LIST *l_rp,
943     *new_l_rp;
944    
945     /*
946     * The andLevel is used to help keep the ranking function honest
947     * when it ANDs the results of the latest search term with
948     * the results so far (rp). The idea is that if you AND three
949     * words together you ultimately want to resulting rank to
950     * be the average of all three individual work ranks. By keeping
951     * a running total of the number of terms already ANDed, the
952     * next AND operation can properly scale the average-rank-so-far
953     * and recompute the new average properly (see andresultlists()).
954     * This implementation is a little weak in that it will not average
955     * across terms that are in parenthesis. (It treats an () expression
956     * as one term, and weights it as "one".)
957     */
958     int andLevel = 0; /* number of terms ANDed so far */
959    
960     word = NULL;
961     lenword = 0;
962    
963     l_rp = NULL;
964    
965     rulenum = OR_RULE;
966     while (*searchwordlist)
967     {
968    
969     word = SafeStrCopy(word, (*searchwordlist)->line, &lenword);
970    
971    
972     if (rulenum == NO_RULE)
973     rulenum = sw->SearchAlt->srch_op.defaultrule;
974    
975    
976     if (isunaryrule(word)) /* is it a NOT? */
977     {
978     *searchwordlist = (*searchwordlist)->next;
979     l_rp = (RESULT_LIST *) parseterm(sw, 1, metaID, structure, indexf, searchwordlist);
980     l_rp = (RESULT_LIST *) notresultlist(sw, l_rp, indexf);
981    
982     /* Wild goose chase */
983     rulenum = NO_RULE;
984     continue;
985     }
986    
987    
988     /* If it's an operator, set the current rulenum, and continue */
989     else if (isbooleanrule(word))
990     {
991     rulenum = getrulenum(word);
992     *searchwordlist = (*searchwordlist)->next;
993     continue;
994     }
995    
996    
997     /* Bump up the count of AND terms for this level */
998    
999     if (rulenum != AND_RULE)
1000     andLevel = 0; /* reset */
1001     else if (rulenum == AND_RULE)
1002     andLevel++;
1003    
1004    
1005    
1006     /* Is this the start of a sub-query? */
1007    
1008     if (word[0] == '(')
1009     {
1010    
1011    
1012     /* Recurse */
1013     *searchwordlist = (*searchwordlist)->next;
1014     new_l_rp = (RESULT_LIST *) parseterm(sw, 0, metaID, structure, indexf, searchwordlist);
1015    
1016    
1017     if (rulenum == AND_RULE)
1018     l_rp = (RESULT_LIST *) andresultlists(sw, l_rp, new_l_rp, andLevel);
1019    
1020     else if (rulenum == OR_RULE)
1021     l_rp = (RESULT_LIST *) orresultlists(sw, l_rp, new_l_rp);
1022    
1023     else if (rulenum == PHRASE_RULE)
1024     l_rp = (RESULT_LIST *) phraseresultlists(sw, l_rp, new_l_rp, 1);
1025    
1026     else if (rulenum == AND_NOT_RULE)
1027     l_rp = (RESULT_LIST *) notresultlists(sw, l_rp, new_l_rp);
1028    
1029     if (!*searchwordlist)
1030     break;
1031    
1032     rulenum = NO_RULE;
1033     continue;
1034    
1035     }
1036     else if (word[0] == ')')
1037     {
1038     *searchwordlist = (*searchwordlist)->next;
1039     break;
1040     }
1041    
1042    
1043     /* Now down to checking for metanames and actual search words */
1044    
1045    
1046     /* Check if the next word is '=' */
1047     if (isMetaNameOpNext((*searchwordlist)->next))
1048     {
1049     struct metaEntry *m = getMetaNameByName(&indexf->header, word);
1050    
1051     /* shouldn't happen since already checked */
1052     if ( !m )
1053     progerr("Unknown metaname '%s' -- swish_words failed to find.", word );
1054    
1055     metaID = m->metaID;
1056    
1057    
1058     /* Skip both the metaName end the '=' */
1059     *searchwordlist = (*searchwordlist)->next->next;
1060    
1061    
1062     if ((*searchwordlist) && ((*searchwordlist)->line[0] == '('))
1063     {
1064     *searchwordlist = (*searchwordlist)->next;
1065     parseone = 0;
1066     }
1067     else
1068     parseone = 1;
1069    
1070     /* Now recursively process the next terms */
1071    
1072     new_l_rp = (RESULT_LIST *) parseterm(sw, parseone, metaID, structure, indexf, searchwordlist);
1073     if (rulenum == AND_RULE)
1074     l_rp = (RESULT_LIST *) andresultlists(sw, l_rp, new_l_rp, andLevel);
1075    
1076     else if (rulenum == OR_RULE)
1077     l_rp = (RESULT_LIST *) orresultlists(sw, l_rp, new_l_rp);
1078    
1079     else if (rulenum == PHRASE_RULE)
1080     l_rp = (RESULT_LIST *) phraseresultlists(sw, l_rp, new_l_rp, 1);
1081    
1082     else if (rulenum == AND_NOT_RULE)
1083     l_rp = (RESULT_LIST *) notresultlists(sw, l_rp, new_l_rp);
1084    
1085     if (!*searchwordlist)
1086     break;
1087    
1088     rulenum = NO_RULE;
1089     metaID = 1;
1090     continue;
1091     }
1092    
1093    
1094     /* Finally, look up a word, and merge with previous results. */
1095    
1096     l_rp = (RESULT_LIST *) operate(sw, l_rp, rulenum, word, indexf->DB, metaID, structure, andLevel, indexf);
1097    
1098     if (parseone)
1099     {
1100     *searchwordlist = (*searchwordlist)->next;
1101     break;
1102     }
1103     rulenum = NO_RULE;
1104    
1105     *searchwordlist = (*searchwordlist)->next;
1106     }
1107    
1108     if (lenword)
1109     efree(word);
1110    
1111     return l_rp;
1112     }
1113    
1114     /* Looks up a word in the index file -
1115     ** it calls getfileinfo(), which does the real searching.
1116     */
1117    
1118     RESULT_LIST *operate(SWISH * sw, RESULT_LIST * l_rp, int rulenum, char *wordin, void *DB, int metaID, int structure, int andLevel, IndexFILE * indexf)
1119     {
1120     RESULT_LIST *new_l_rp,
1121     *return_l_rp;
1122     char *word;
1123     int lenword;
1124    
1125     word = estrdup(wordin);
1126     lenword = strlen(word);
1127    
1128     new_l_rp = return_l_rp = NULL;
1129    
1130    
1131     /* ??? Stopwords have already been removed at this point. */
1132     /*****
1133     if (isstopword(&indexf->header, word) && !isrule(word))
1134     {
1135     if (rulenum == OR_RULE && l_rp != NULL)
1136     return l_rp;
1137     else
1138     sw->commonerror = 1;
1139     }
1140     ******/
1141    
1142    
1143     /* ??? Ever a time when rulenum is not one of these? */
1144    
1145     if (rulenum == AND_RULE)
1146     {
1147     new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1148     return_l_rp = (RESULT_LIST *) andresultlists(sw, l_rp, new_l_rp, andLevel);
1149     }
1150    
1151    
1152     else if (rulenum == OR_RULE)
1153     {
1154     new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1155     return_l_rp = (RESULT_LIST *) orresultlists(sw, l_rp, new_l_rp);
1156     }
1157    
1158    
1159     else if (rulenum == NOT_RULE)
1160     {
1161     new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1162     return_l_rp = (RESULT_LIST *) notresultlist(sw, new_l_rp, indexf);
1163     }
1164    
1165    
1166     else if (rulenum == PHRASE_RULE)
1167     {
1168     new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1169     return_l_rp = (RESULT_LIST *) phraseresultlists(sw, l_rp, new_l_rp, 1);
1170     }
1171    
1172    
1173     else if (rulenum == AND_NOT_RULE)
1174     {
1175     new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1176     return_l_rp = (RESULT_LIST *) notresultlists(sw, l_rp, new_l_rp);
1177     }
1178    
1179    
1180     efree(word);
1181     return return_l_rp;
1182     }
1183    
1184    
1185     static RESULT_LIST *newResultsList(SWISH *sw)
1186     {
1187     RESULT_LIST *l = (RESULT_LIST *)Mem_ZoneAlloc(sw->Search->resultSearchZone, sizeof(RESULT_LIST));
1188     l->head = l->tail = NULL;
1189     l->sw = (struct SWISH *)sw;
1190    
1191     return l;
1192     }
1193    
1194     void addResultToList(RESULT_LIST *l_r, RESULT *r)
1195     {
1196     r->next = NULL;
1197    
1198     r->reslist = l_r;
1199    
1200     if(!l_r->head)
1201     l_r->head = r;
1202     if(l_r->tail)
1203     l_r->tail->next = r;
1204     l_r->tail = r;
1205    
1206     }
1207    
1208    
1209     /* Routine to test structure in a result */
1210     /* Also removes posdata that do not fit with structure field */
1211     int test_structure(int structure, int frequency, int *posdata)
1212     {
1213     int i, j; /* i -> counter upto frequency, j -> new frequency */
1214     int *p,*q; /* Use pointers to ints instead of arrays for
1215     ** faster proccess */
1216    
1217     for(i = j = 0, p = q = posdata; i < frequency; i++, p++)
1218     {
1219     if(GET_STRUCTURE(*p) & structure)
1220     {
1221     if(p - q)
1222     {
1223     *q = *p;
1224     }
1225     j++;
1226     q++;
1227     }
1228     }
1229     return j; /* return new frequency */
1230     }
1231    
1232    
1233    
1234     /* Finds a word and returns its corresponding file and rank information list.
1235     ** If not found, NULL is returned.
1236     */
1237     /* Jose Ruiz
1238     ** New implmentation based on Hashing for direct access. Faster!!
1239     ** Also solves stars. Faster!! It can even found "and", "or"
1240     ** when looking for "an*" or "o*" if they are not stop words
1241     */
1242    
1243     #define MAX_POSDATA_STACK 256
1244    
1245     RESULT_LIST *getfileinfo(SWISH * sw, char *word, IndexFILE * indexf, int metaID, int structure)
1246     {
1247     int j,
1248     x,
1249     filenum,
1250     frequency,
1251     len,
1252     curmetaID,
1253     index_structure,
1254     index_structfreq,
1255     tmpval;
1256     RESULT_LIST *l_rp, *l_rp2;
1257     long wordID;
1258     unsigned long nextposmetaname;
1259     char *p;
1260     int tfrequency = 0;
1261     unsigned char *s, *buffer;
1262     int sz_buffer;
1263     unsigned char flag;
1264     int stack_posdata[MAX_POSDATA_STACK]; /* stack buffer for posdata */
1265     int *posdata;
1266    
1267     x = j = filenum = frequency = len = curmetaID = index_structure = index_structfreq = 0;
1268     nextposmetaname = 0L;
1269    
1270    
1271     l_rp = l_rp2 = NULL;
1272    
1273    
1274     /* First: Look for star at the end of the word */
1275     if ((p = strrchr(word, '*')))
1276     {
1277     if (p != word && *(p - 1) == '\\') /* Check for an escaped * */
1278     {
1279     p = NULL; /* If escaped it is not a wildcard */
1280     }
1281     else
1282     {
1283     /* Check if it is at the end of the word */
1284     if (p == (word + strlen(word) - 1))
1285     {
1286     word[strlen(word) - 1] = '\0';
1287     /* Remove the wildcard - p remains not NULL */
1288     }
1289     else
1290     {
1291     p = NULL; /* Not at the end - Ignore */
1292     }
1293     }
1294     }
1295    
1296    
1297    
1298    
1299     DB_InitReadWords(sw, indexf->DB);
1300     if (!p) /* No wildcard -> Direct hash search */
1301     {
1302     DB_ReadWordHash(sw, word, &wordID, indexf->DB);
1303    
1304     if(!wordID)
1305     {
1306     DB_EndReadWords(sw, indexf->DB);
1307     sw->lasterror = WORD_NOT_FOUND;
1308     return NULL;
1309     }
1310     }
1311    
1312     else /* There is a star. So use the sequential approach */
1313     {
1314     char *resultword;
1315    
1316     if (*word == '*')
1317     {
1318     sw->lasterror = UNIQUE_WILDCARD_NOT_ALLOWED_IN_WORD;
1319     return NULL;
1320     }
1321    
1322    
1323     DB_ReadFirstWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);
1324    
1325     if (!wordID)
1326     {
1327     DB_EndReadWords(sw, indexf->DB);
1328     sw->lasterror = WORD_NOT_FOUND;
1329     return NULL;
1330     }
1331     efree(resultword); /* Do not need it */
1332     }
1333    
1334    
1335     /* If code is here we have found the word !! */
1336    
1337     do
1338     {
1339     DB_ReadWordData(sw, wordID, &buffer, &sz_buffer, indexf->DB);
1340     s = buffer;
1341    
1342     // buffer structure = <tfreq><metaID><delta to next meta>
1343    
1344     /* Get the data of the word */
1345     tfrequency = uncompress2(&s); /* tfrequency - number of files with this word */
1346    
1347    
1348     /* Now look for a correct Metaname */
1349     curmetaID = uncompress2(&s);
1350    
1351     while (curmetaID)
1352     {
1353     nextposmetaname = UNPACKLONG2(s);
1354     s += sizeof(long);
1355    
1356    
1357     if (curmetaID >= metaID)
1358     break;
1359    
1360     s = buffer + nextposmetaname;
1361     if(nextposmetaname == (unsigned long)sz_buffer)
1362     break; // if no more meta data
1363    
1364     curmetaID = uncompress2(&s);
1365     }
1366    
1367     if (curmetaID == metaID) /* found a matching meta value */
1368     {
1369     filenum = 0;
1370     do
1371     {
1372     /* Read on all items */
1373     uncompress_location_values(&s,&flag,&tmpval,&frequency);
1374     filenum += tmpval;
1375    
1376     /* stack_posdata is just to avoid calling emalloc */
1377     /* it should be enough for most cases */
1378     if(frequency > MAX_POSDATA_STACK)
1379     posdata = (int *)emalloc(frequency * sizeof(int));
1380     else
1381     posdata = stack_posdata;
1382    
1383     /* read positions */
1384     uncompress_location_positions(&s,flag,frequency,posdata);
1385    
1386     /* test structure and adjust frequency */
1387     frequency = test_structure(structure, frequency, posdata);
1388    
1389     /* Store -1 in rank - In this way, we can delay its computation */
1390     /* This stuff has been removed */
1391    
1392     /* Store result */
1393     if(frequency)
1394     {
1395     /* This is very useful if we sorted by other property */
1396     if(!l_rp)
1397     l_rp = newResultsList(sw);
1398    
1399     // tfrequency = number of files with this word
1400     // frequency = number of times this words is in this document for this metaID
1401    
1402     addtoresultlist(l_rp, filenum, -1, tfrequency, frequency, indexf, sw);
1403    
1404     /* Copy positions */
1405     memcpy((unsigned char *)l_rp->tail->posdata,(unsigned char *)posdata,frequency * sizeof(int));
1406    
1407     // Temp fix
1408     {
1409     struct RESULT *r1 = l_rp->tail;
1410     r1->rank = r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1411     }
1412     }
1413     if(posdata != stack_posdata)
1414     efree(posdata);
1415    
1416    
1417     } while ((unsigned long)(s - buffer) != nextposmetaname);
1418    
1419    
1420     }
1421    
1422     efree(buffer);
1423    
1424    
1425     if (!p)
1426     break; /* direct access (no wild card) -> break */
1427    
1428     else
1429     {
1430     char *resultword;
1431    
1432     /* Jump to next word */
1433     /* No more data for this word but we
1434     are in sequential search because of
1435     the star (p is not null) */
1436     /* So, go for next word */
1437     DB_ReadNextWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);
1438     if (! wordID)
1439     break; /* no more data */
1440    
1441     efree(resultword); /* Do not need it (although might be useful for highlighting some day) */
1442     }
1443    
1444     } while(1); /* continue on in loop for wildcard search */
1445    
1446    
1447    
1448     if (p)
1449     {
1450     /* Finally, if we are in an sequential search merge all results */
1451     l_rp = mergeresulthashlist(sw, l_rp);
1452     }
1453    
1454     DB_EndReadWords(sw, indexf->DB);
1455     return l_rp;
1456     }
1457    
1458    
1459     /*
1460     -- Rules checking
1461     -- u_is... = user rules (and, or, ...)
1462     -- is... = internal rules checking
1463     */
1464    
1465    
1466     /* Is a word a rule?
1467     */
1468    
1469     int isrule(char *word)
1470     {
1471     int i;
1472     static char *ops[] = { /* internal ops... */
1473     AND_WORD, OR_WORD, NOT_WORD, PHRASE_WORD, AND_NOT_WORD,
1474     NULL
1475     };
1476    
1477     for (i = 0; ops[i]; i++)
1478     {
1479     if (!strcmp(word, ops[i]))
1480     return 1;
1481     }
1482     return 0;
1483     }
1484    
1485     int u_isrule(SWISH * sw, char *word)
1486     {
1487     LOGICAL_OP *op;
1488    
1489     op = &(sw->SearchAlt->srch_op);
1490     if (!strcmp(word, op->and) || !strcmp(word, op->or) || !strcmp(word, op->not))
1491     return 1;
1492     else
1493     return 0;
1494     }
1495    
1496     int isnotrule(char *word)
1497     {
1498     if (!strcmp(word, NOT_WORD))
1499     return 1;
1500     else
1501     return 0;
1502     }
1503    
1504     int u_isnotrule(SWISH * sw, char *word)
1505     {
1506     if (!strcmp(word, sw->SearchAlt->srch_op.not))
1507     return 1;
1508     else
1509     return 0;
1510     }
1511    
1512    
1513     /* Is a word a boolean rule?
1514     */
1515    
1516     int isbooleanrule(char *word)
1517     {
1518     if (!strcmp(word, AND_WORD) || !strcmp(word, OR_WORD) || !strcmp(word, PHRASE_WORD) || !strcmp(word, AND_NOT_WORD))
1519     return 1;
1520     else
1521     return 0;
1522     }
1523    
1524     /* Is a word a unary rule?
1525     */
1526    
1527     int isunaryrule(char *word)
1528     {
1529     if (!strcmp(word, NOT_WORD))
1530     return 1;
1531     else
1532     return 0;
1533     }
1534    
1535     /* Return the number for a rule.
1536     */
1537    
1538     int getrulenum(char *word)
1539     {
1540     if (!strcmp(word, AND_WORD))
1541     return AND_RULE;
1542     else if (!strcmp(word, OR_WORD))
1543     return OR_RULE;
1544     else if (!strcmp(word, NOT_WORD))
1545     return NOT_RULE;
1546     else if (!strcmp(word, PHRASE_WORD))
1547     return PHRASE_RULE;
1548     else if (!strcmp(word, AND_NOT_WORD))
1549     return AND_NOT_RULE;
1550     return NO_RULE;
1551     }
1552    
1553    
1554     /*
1555     -- Return selected RuleNumber for default rule.
1556     -- defined via current Swish Search Boolean OP Settings for user.
1557     */
1558    
1559     int u_SelectDefaultRulenum(SWISH * sw, char *word)
1560     {
1561     if (!strcasecmp(word, sw->SearchAlt->srch_op.and))
1562     return AND_RULE;
1563     else if (!strcasecmp(word, sw->SearchAlt->srch_op.or))
1564     return OR_RULE;
1565     return NO_RULE;
1566     }
1567    
1568    
1569    
1570     static void make_db_res_and_free(SWISH *sw,RESULT_LIST *l_res) {
1571     struct DB_RESULTS tmp;
1572     memset (&tmp,0,sizeof(struct DB_RESULTS));
1573     tmp.resultlist = l_res;
1574     freeresultlist(sw,&tmp);
1575     }
1576    
1577    
1578    
1579     /* Takes two lists of results from searches and ANDs them together.
1580     ** On input, both result lists r1 and r2 must be sorted by filenum
1581     ** On output, the new result list remains sorted
1582     */
1583    
1584     RESULT_LIST *andresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2, int andLevel)
1585     {
1586     RESULT_LIST *new_results_list = NULL;
1587     RESULT *r1;
1588     RESULT *r2;
1589     int res = 0;
1590    
1591     /* patch provided by Mukund Srinivasan */
1592     if (l_r1 == NULL || l_r2 == NULL)
1593     {
1594     make_db_res_and_free(sw,l_r1);
1595     make_db_res_and_free(sw,l_r2);
1596     return NULL;
1597     }
1598    
1599     if (andLevel < 1)
1600     andLevel = 1;
1601    
1602     for (r1 = l_r1->head, r2 = l_r2->head; r1 && r2;)
1603     {
1604     res = r1->filenum - r2->filenum;
1605     if (!res)
1606     {
1607     /*
1608     * Computing the new rank is interesting because
1609     * we want to weight each of the words that was
1610     * previously ANDed equally along with the new word.
1611     * We compute a running average using andLevel and
1612     * simply scale up the old average (in r1->rank)
1613     * and recompute a new, equally weighted average.
1614     */
1615     int newRank = 0;
1616    
1617     if(r1->rank == -1)
1618     r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1619    
1620     if(r2->rank == -1)
1621     r2->rank = getrank( sw, r2->frequency, r1->tfrequency, r2->posdata, r2->indexf, r2->filenum );
1622    
1623     newRank = ((r1->rank * andLevel) + r2->rank) / (andLevel + 1);
1624    
1625    
1626     if(!new_results_list)
1627     new_results_list = newResultsList(sw);
1628    
1629     addtoresultlist(new_results_list, r1->filenum, newRank, 0, r1->frequency + r2->frequency, r1->indexf, sw);
1630    
1631    
1632     /* Storing all positions could be useful in the future */
1633    
1634     CopyPositions(new_results_list->tail->posdata, 0, r1->posdata, 0, r1->frequency);
1635     CopyPositions(new_results_list->tail->posdata, r1->frequency, r2->posdata, 0, r2->frequency);
1636    
1637    
1638     r1 = r1->next;
1639     r2 = r2->next;
1640     }
1641    
1642     else if (res > 0)
1643     {
1644     r2 = r2->next;
1645     }
1646     else
1647     {
1648     r1 = r1->next;
1649     }
1650     }
1651    
1652     return new_results_list;
1653     }
1654    
1655     /* Takes two lists of results from searches and ORs them together.
1656     2001-11 jmruiz Completely rewritten. Older one was really
1657     slow when the lists are very long
1658     On input, both result lists r1 and r2 must be sorted by filenum
1659     On output, the new result list remains sorted
1660    
1661     rank is combined for matching files. That is,
1662     "foo OR bar" will rank files with both higher.
1663    
1664     */
1665    
1666    
1667     RESULT_LIST *orresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2)
1668     {
1669     int rc;
1670     RESULT *r1;
1671     RESULT *r2;
1672     RESULT *rp,
1673     *tmp;
1674     RESULT_LIST *new_results_list = NULL;
1675    
1676    
1677     /* If either list is empty, just return the other */
1678     if (l_r1 == NULL)
1679     return l_r2;
1680    
1681     else if (l_r2 == NULL)
1682     return l_r1;
1683    
1684     /* Look for files that have both words, and add up the ranks */
1685    
1686     r1 = l_r1->head;
1687     r2 = l_r2->head;
1688    
1689     while(r1 && r2)
1690     {
1691     rc = r1->filenum - r2->filenum;
1692     if(rc < 0)
1693     {
1694     rp = r1;
1695     r1 = r1->next;
1696     }
1697     else if(rc > 0)
1698     {
1699     rp = r2;
1700     r2 = r2->next;
1701     }
1702    
1703     else /* Matching file number */
1704     {
1705     int result_size;
1706    
1707     /* Compute rank if not yet computed */
1708     if(r1->rank == -1)
1709     r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1710    
1711     if(r2->rank == -1)
1712     r2->rank = getrank( sw, r2->frequency, r2->tfrequency, r2->posdata, r2->indexf, r2->filenum );
1713    
1714    
1715     /* Create a new RESULT - Should be a function to creeate this, I'd think */
1716    
1717     result_size = sizeof(RESULT) + ( (r1->frequency + r2->frequency - 1) * sizeof(int) );
1718     rp = (RESULT *) Mem_ZoneAlloc(sw->Search->resultSearchZone, result_size );
1719     memset( rp, 0, result_size );
1720    
1721    
1722     rp->fi.filenum = rp->filenum = r1->filenum;
1723    
1724     rp->rank = ( r1->rank + r2->rank) * 2; // bump up the or terms
1725     rp->tfrequency = 0;
1726     rp->frequency = r1->frequency + r2->frequency;
1727     rp->indexf = r1->indexf;
1728    
1729    
1730     /* save the combined position data in the new result. (Would freq ever be zero?) */
1731     if (r1->frequency)
1732     CopyPositions(rp->posdata, 0, r1->posdata, 0, r1->frequency);
1733    
1734     if (r2->frequency)
1735     CopyPositions(rp->posdata, r1->frequency, r2->posdata, 0, r2->frequency);
1736    
1737     r1 = r1->next;
1738     r2 = r2->next;
1739     }
1740    
1741    
1742     /* Now add the result to the output list */
1743    
1744     if(!new_results_list)
1745     new_results_list = newResultsList(sw);
1746    
1747     addResultToList(new_results_list,rp);
1748     }
1749    
1750    
1751     /* Add the remaining results */
1752    
1753     tmp = r1 ? r1 : r2;
1754    
1755     while(tmp)
1756     {
1757     rp = tmp;
1758     tmp = tmp->next;
1759     if(!new_results_list)
1760     new_results_list = newResultsList(sw);
1761    
1762     addResultToList(new_results_list,rp);
1763     }
1764    
1765     return new_results_list;
1766     }
1767    
1768    
1769     /* 2001-10 jmruiz - This code was originally at merge.c
1770     ** Also made it thread safe
1771     */
1772     /* These three routines are only used by notresultlist */
1773    
1774     struct markentry
1775     {
1776     struct markentry *next;
1777     int num;
1778     };
1779    
1780     /* This marks a number as having been printed.
1781     */
1782    
1783     void marknum(SWISH *sw, struct markentry **markentrylist, int num)
1784     {
1785     unsigned hashval;
1786     struct markentry *mp;
1787    
1788     mp = (struct markentry *) Mem_ZoneAlloc(sw->Search->resultSearchZone, sizeof(struct markentry));
1789    
1790     mp->num = num;
1791    
1792     hashval = bignumhash(num);
1793     mp->next = markentrylist[hashval];
1794     markentrylist[hashval] = mp;
1795     }
1796    
1797    
1798     /* Has a number been printed?
1799     */
1800    
1801     int ismarked(struct markentry **markentrylist, int num)
1802     {
1803     unsigned hashval;
1804     struct markentry *mp;
1805    
1806     hashval = bignumhash(num);
1807     mp = markentrylist[hashval];
1808    
1809     while (mp != NULL)
1810     {
1811     if (mp->num == num)
1812     return 1;
1813     mp = mp->next;
1814     }
1815     return 0;
1816     }
1817    
1818     /* Initialize the marking list.
1819     */
1820    
1821     void initmarkentrylist(struct markentry **markentrylist)
1822     {
1823     int i;
1824    
1825     for (i = 0; i < BIGHASHSIZE; i++)
1826     markentrylist[i] = NULL;
1827     }
1828    
1829     void freemarkentrylist(struct markentry **markentrylist)
1830     {
1831     int i;
1832    
1833     for (i = 0; i < BIGHASHSIZE; i++)
1834     {
1835     markentrylist[i] = NULL;
1836     }
1837     }
1838    
1839     /* This performs the NOT unary operation on a result list.
1840     ** NOTed files are marked with a default rank of 1000.
1841     **
1842     ** Basically it returns all the files that have not been
1843     ** marked (GH)
1844     */
1845    
1846     RESULT_LIST *notresultlist(SWISH * sw, RESULT_LIST * l_rp, IndexFILE * indexf)
1847     {
1848     int i,
1849     filenums;
1850     RESULT *rp;
1851     RESULT_LIST *new_results_list = NULL;
1852     struct markentry *markentrylist[BIGHASHSIZE];
1853    
1854     if(!l_rp)
1855     rp = NULL;
1856     else
1857     rp = l_rp->head;
1858    
1859     initmarkentrylist(markentrylist);
1860     while (rp != NULL)
1861     {
1862     marknum(sw, markentrylist, rp->filenum);
1863     rp = rp->next;
1864     }
1865    
1866     filenums = indexf->header.totalfiles;
1867    
1868     for (i = 1; i <= filenums; i++)
1869     {
1870     if (!ismarked(markentrylist, i))
1871     {
1872     if(!new_results_list)
1873     new_results_list = newResultsList(sw);
1874     addtoresultlist(new_results_list, i, 1000, 0, 0, indexf, sw);
1875     }
1876     }
1877    
1878     freemarkentrylist(markentrylist);
1879    
1880     new_results_list = sortresultsbyfilenum(new_results_list);
1881    
1882     return new_results_list;
1883     }
1884    
1885     /* Phrase result routine - see distance parameter. For phrase search this
1886     ** value must be 1 (consecutive words)
1887     **
1888     ** On input, both result lists r1 abd r2 must be sorted by filenum
1889     ** On output, the new result list remains sorted
1890     */
1891     RESULT_LIST *phraseresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2, int distance)
1892     {
1893     int i,
1894     j,
1895     found,
1896     newRank,
1897     *allpositions;
1898     int res = 0;
1899     RESULT_LIST *new_results_list = NULL;
1900     RESULT *r1, *r2;
1901    
1902     if (l_r1 == NULL || l_r2 == NULL)
1903     return NULL;
1904    
1905     for (r1 = l_r1->head, r2 = l_r2->head; r1 && r2;)
1906     {
1907     res = r1->filenum - r2->filenum;
1908     if (!res)
1909     {
1910     found = 0;
1911     allpositions = NULL;
1912     for (i = 0; i < r1->frequency; i++)
1913     {
1914     for (j = 0; j < r2->frequency; j++)
1915     {
1916     if ((GET_POSITION(r1->posdata[i]) + distance) == GET_POSITION(r2->posdata[j]))
1917     {
1918     found++;
1919     if (allpositions)
1920     allpositions = (int *) erealloc(allpositions, found * sizeof(int));
1921    
1922     else
1923     allpositions = (int *) emalloc(found * sizeof(int));
1924    
1925     allpositions[found - 1] = r2->posdata[j];
1926     break;
1927     }
1928     }
1929     }
1930     if (found)
1931     {
1932     /* Compute newrank */
1933     if(r1->rank == -1)
1934     r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1935     if(r2->rank == -1)
1936     r2->rank = getrank( sw, r2->frequency, r1->tfrequency, r2->posdata, r2->indexf, r2->filenum );
1937    
1938     newRank = (r1->rank + r2->rank) / 2;
1939     /*
1940     * Storing positions is neccesary for further
1941     * operations
1942     */
1943     if(!new_results_list)
1944     new_results_list = newResultsList(sw);
1945    
1946     addtoresultlist(new_results_list, r1->filenum, newRank, 0, found, r1->indexf, sw);
1947    
1948     CopyPositions(new_results_list->tail->posdata, 0, allpositions, 0, found);
1949     efree(allpositions);
1950     }
1951     r1 = r1->next;
1952     r2 = r2->next;
1953     }
1954     else if (res > 0)
1955     {
1956     r2 = r2->next;
1957     }
1958     else
1959     {
1960     r1 = r1->next;
1961     }
1962    
1963     }
1964    
1965     return new_results_list;
1966     }
1967    
1968     /* Adds a file number and rank to a list of results.
1969     */
1970    
1971    
1972     void addtoresultlist(RESULT_LIST * l_rp, int filenum, int rank, int tfrequency, int frequency, IndexFILE * indexf, SWISH * sw)
1973     {
1974     RESULT *newnode;
1975     int result_size;
1976    
1977     result_size = sizeof(RESULT) + ((frequency - 1) * sizeof(int));
1978     newnode = (RESULT *) Mem_ZoneAlloc(sw->Search->resultSearchZone, result_size );
1979     memset( newnode, 0, result_size );
1980    
1981     newnode->fi.filenum = newnode->filenum = filenum;
1982    
1983     newnode->rank = rank;
1984     newnode->tfrequency = tfrequency;
1985     newnode->frequency = frequency;
1986     newnode->indexf = indexf;
1987    
1988     addResultToList(l_rp, newnode);
1989     }
1990    
1991    
1992    
1993    
1994     RESULT *SwishNext(SWISH * sw)
1995     {
1996     struct MOD_Search *srch = sw->Search;
1997     RESULT *res = NULL;
1998     RESULT *res2 = NULL;
1999     int rc;
2000     struct DB_RESULTS *db_results = NULL,
2001     *db_results_winner = NULL;
2002     int num;
2003    
2004     if (srch->bigrank)
2005     num = 10000000 / srch->bigrank;
2006     else
2007     num = 10000;
2008    
2009    
2010     /* Seems like we should error here if there are no results */
2011     if ( !sw->Search->db_results )
2012     {
2013     set_progerr(SWISH_LISTRESULTS_EOF, sw, "Attempted to read results before searching");
2014     return NULL;
2015     }
2016    
2017     /* Check for a unique index file */
2018     if (!sw->Search->db_results->next)
2019     {
2020     if ((res = sw->Search->db_results->currentresult))
2021     {
2022     /* Increase Pointer */
2023     sw->Search->db_results->currentresult = res->next;
2024     /* If rank was delayed, compute it now */
2025     if(res->rank == -1)
2026     res->rank = getrank( sw, res->frequency, res->tfrequency, res->posdata, res->indexf, res->filenum );
2027    
2028     }
2029     }
2030    
2031    
2032     else /* tape merge to find the next one from all the index files */
2033    
2034     {
2035     /* We have more than one index file - can't use pre-sorted index */
2036     /* Get the lower value */
2037     db_results_winner = sw->Search->db_results;
2038    
2039     if ((res = db_results_winner->currentresult))
2040     {
2041     /* If rank was delayed, compute it now */
2042     if(res->rank == -1)
2043     res->rank = getrank( sw, res->frequency, res->tfrequency, res->posdata, res->indexf, res->filenum );
2044    
2045     if ( !res->PropSort )
2046     res->PropSort = getResultSortProperties(sw, res);
2047     }
2048    
2049    
2050     for (db_results = sw->Search->db_results->next; db_results; db_results = db_results->next)
2051     {
2052     /* Any more results for this index? */
2053     if (!(res2 = db_results->currentresult))
2054     continue;
2055    
2056     else
2057     {
2058     /* If rank was delayed, compute it now */
2059     if(res2->rank == -1)
2060     res2->rank = getrank( sw, res2->frequency, res2->tfrequency, res2->posdata, res2->indexf, res2->filenum );
2061    
2062     if ( !res2->PropSort )
2063     res2->PropSort = getResultSortProperties(sw, res2);
2064     }
2065    
2066     if (!res)
2067     {
2068     res = res2;
2069     db_results_winner = db_results;
2070     continue;
2071     }
2072    
2073     rc = (int) compResultsByNonSortedProps(&res, &res2);
2074     if (rc < 0)
2075     {
2076     res = res2;
2077     db_results_winner = db_results;
2078     }
2079     }
2080    
2081     if ((res = db_results_winner->currentresult))
2082     db_results_winner->currentresult = res->next;
2083     }
2084    
2085    
2086    
2087     /* Normalize rank */
2088     if (res)
2089     {
2090     res->rank = (int) (res->rank * num)/10000;
2091     if (res->rank >= 999)
2092     res->rank = 1000;
2093     else if (res->rank < 1)
2094     res->rank = 1;
2095     }
2096     else
2097     {
2098     // it's expected to just return null on end of list.
2099     // sw->lasterror = SWISH_LISTRESULTS_EOF;
2100     }
2101    
2102    
2103     return res;
2104    
2105     }
2106    
2107    
2108    
2109    
2110    
2111     /* Checks if the next word is "="
2112     */
2113    
2114     int isMetaNameOpNext(struct swline *searchWord)
2115     {
2116     if (searchWord == NULL)
2117     return 0;
2118     if (!strcmp(searchWord->line, "="))
2119     return 1;
2120     return 0;
2121     }
2122    
2123     /* funtion to free all memory of a list of results */
2124     void freeresultlist(SWISH * sw, struct DB_RESULTS *dbres)
2125     {
2126     RESULT *rp;
2127     RESULT *tmp;
2128    
2129     #ifdef DUMP_RESULTS
2130     dump_result_lists( sw, "In freeresultlist()" );
2131     #endif
2132    
2133     if(dbres->resultlist)
2134     rp = dbres->resultlist->head;
2135     else
2136     rp = NULL;
2137    
2138     while (rp)
2139     {
2140     tmp = rp->next;
2141     freeresult(sw, rp);
2142     rp = tmp;
2143     }
2144     dbres->resultlist = NULL;
2145     dbres->currentresult = NULL;
2146     dbres->sortresultlist = NULL;
2147     }
2148    
2149     /* funtion to free the memory of one result */
2150     void freeresult(SWISH * sw, RESULT * rp)
2151     {
2152     int i;
2153    
2154     if (rp)
2155     {
2156     freefileinfo( &rp->fi ); // may have already been freed
2157    
2158     if (sw->ResultSort->numPropertiesToSort && rp->PropSort)
2159     {
2160     for (i = 0; i < sw->ResultSort->numPropertiesToSort; i++)
2161     efree(rp->PropSort[i]);
2162    
2163     /* For better performance with thousands of results this is stored in a MemZone
2164     in result_sort.c module
2165     efree(rp->PropSort);
2166     */
2167    
2168     }
2169     /* For better performance with thousands of results this is stored in a MemZone
2170     in result_sort.c module
2171    
2172     if (sw->ResultSort->numPropertiesToSort && rp->iPropSort)
2173     {
2174     efree(rp->iPropSort);
2175     }
2176     */
2177     /* For better performance store results in a MemZone
2178    
2179     efree(rp);
2180     */
2181     }
2182     }
2183    
2184    
2185    
2186     /*
2187     06/00 Jose Ruiz - Sort results by filenum
2188     Uses an array and qsort for better performance
2189     Used for faster "and" and "phrase" of results
2190     */
2191     RESULT_LIST *sortresultsbyfilenum(RESULT_LIST * l_rp)
2192     {
2193     int i,
2194     j;
2195     RESULT **ptmp;
2196     RESULT *rp;
2197    
2198     /* Very trivial case */
2199     if (!l_rp)
2200     return NULL;
2201    
2202    
2203     /* Compute results */
2204     for (i = 0, rp = l_rp->head; rp; rp = rp->next, i++);
2205     /* Another very trivial case */
2206     if (i == 1)
2207     return l_rp;
2208     /* Compute array size */
2209     ptmp = (void *) emalloc(i * sizeof(RESULT *));
2210     /* Build an array with the elements to compare
2211     and pointers to data */
2212     for (j = 0, rp = l_rp->head; rp; rp = rp->next)
2213     ptmp[j++] = rp;
2214     /* Sort them */
2215     swish_qsort(ptmp, i, sizeof(RESULT *), &compResultsByFileNum);
2216     /* Build the list */
2217     for (j = 0, rp = NULL; j < i; j++)
2218     {
2219     if (!rp)
2220     l_rp->head = ptmp[j];
2221     else
2222     rp->next = ptmp[j];
2223     rp = ptmp[j];
2224     }
2225     rp->next = NULL;
2226     l_rp->tail = rp;
2227    
2228     /* Free the memory of the array */
2229     efree(ptmp);
2230    
2231     return l_rp;
2232     }
2233    
2234    
2235     /* 06/00 Jose Ruiz
2236     ** returns all results in r1 that not contains r2
2237     **
2238     ** On input, both result lists r1 and r2 must be sorted by filenum
2239     ** On output, the new result list remains sorted
2240     */
2241     RESULT_LIST *notresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2)
2242     {
2243     RESULT *rp, *r1, *r2;
2244     RESULT_LIST *new_results_list = NULL;
2245     int res = 0;
2246    
2247     if (!l_r1)
2248     return NULL;
2249     if (l_r1 && !l_r2)
2250     return l_r1;
2251    
2252     for (r1 = l_r1->head, r2 = l_r2->head; r1 && r2;)
2253     {
2254     res = r1->filenum - r2->filenum;
2255     if (res < 0)
2256     {
2257     /*
2258     * Storing all positions could be useful
2259     * in the future
2260     */
2261    
2262     rp = r1;
2263     r1 = r1->next;
2264     if(!new_results_list)
2265     new_results_list = newResultsList(sw);
2266     addResultToList(new_results_list,rp);
2267     }
2268     else if (res > 0)
2269     {
2270     r2 = r2->next;
2271     }
2272     else
2273     {
2274     r1 = r1->next;
2275     r2 = r2->next;
2276     }
2277     }
2278     /* Add remaining results */
2279     while (r1)
2280     {
2281     rp = r1;
2282     r1 = r1->next;
2283     if(!new_results_list)
2284     new_results_list = newResultsList(sw);
2285     addResultToList(new_results_list,rp);
2286     }
2287    
2288     return new_results_list;
2289     }
2290    
2291    
2292     /******************************************************************************
2293     * Remove the stop words from the tokenized query
2294     * rewritten Nov 24, 2001 - moseley
2295     * Still horrible! Need a real parse tree.
2296     *******************************************************************************/
2297    
2298    
2299     struct swline *ignore_words_in_query(SWISH * sw, IndexFILE * indexf, struct swline *searchwordlist, unsigned char phrase_delimiter)
2300     {
2301     struct swline *cur_token = searchwordlist;
2302     struct swline *prev_token = NULL;
2303     struct swline *prev_prev_token = NULL; // for removing two things
2304     int in_phrase = 0;
2305     int word_count = 0; /* number of search words found */
2306     int paren_count = 0;
2307     struct MOD_Search *srch = sw->Search;
2308    
2309     srch->removed_stopwords = 0;
2310    
2311    
2312    
2313     while ( cur_token )
2314     {
2315     int remove = 0;
2316     char first_char = cur_token->line[0];
2317    
2318     if ( cur_token == searchwordlist )
2319     {
2320     prev_token = prev_prev_token = NULL;
2321     word_count = 0;
2322     paren_count = 0;
2323     in_phrase = 0;
2324     }
2325    
2326     while ( 1 ) // so we can use break.
2327     {
2328    
2329     /* Can't backslash here -- (because this code should really be include in swish_words.c) */
2330    
2331     if ( first_char == phrase_delimiter )
2332     {
2333     in_phrase = !in_phrase;
2334    
2335     if ( !in_phrase && prev_token && prev_token->line[0] == phrase_delimiter )
2336     remove = 2;
2337    
2338     break;
2339     }
2340    
2341    
2342     /* leave everything alone inside a pharse */
2343    
2344     if ( in_phrase )
2345     {
2346     if (isstopword(&indexf->header, cur_token->line))
2347     {
2348     srch->removed_stopwords++;
2349     resultHeaderOut(sw, 1, "# Removed stopword: %s\n", cur_token->line);
2350     remove = 1;
2351     }
2352     else
2353     word_count++;
2354    
2355     break;
2356     }
2357    
2358    
2359     /* Allow operators */
2360    
2361     if ( first_char == '=' )
2362     break;
2363    
2364     if ( first_char == '(' )
2365     {
2366     paren_count++;
2367     break;
2368     }
2369    
2370     if ( first_char == ')' )
2371     {
2372     paren_count--;
2373    
2374     if ( prev_token && prev_token->line[0] == '(' )
2375     remove = 2;
2376    
2377     break;
2378     }
2379    
2380    
2381    
2382     /* Allow all metanames */
2383    
2384     if ( isMetaNameOpNext(cur_token->next) )
2385     break;
2386    
2387    
2388    
2389     /* Look for AND OR NOT - remove AND OR at start, and remove second of doubles */
2390    
2391     if ( u_isrule(sw, cur_token->line) )
2392     {
2393     if ( prev_token )
2394     {
2395     /* remove double tokens */
2396     if ( u_isrule( sw, prev_token->line ) )
2397     remove = 1;
2398     }
2399     /* allow NOT at the start */
2400     else if ( !u_isnotrule(sw, cur_token->line) )
2401     remove = 1;
2402    
2403     break;
2404     }
2405    
2406    
2407    
2408     /* Finally, is it a stop word? */
2409    
2410     if ( isstopword(&indexf->header, cur_token->line) )
2411     {
2412     srch->removed_stopwords++;
2413     resultHeaderOut(sw, 1, "# Removed stopword: %s\n", cur_token->line);
2414     remove = 1;
2415     }
2416     else
2417     word_count++;
2418    
2419    
2420    
2421     break;
2422     }
2423    
2424    
2425     /* Catch dangling metanames */
2426     if ( !remove && !cur_token->next && isMetaNameOpNext( cur_token ) )
2427     remove = 2;
2428    
2429    
2430     if ( remove )
2431     {
2432     struct swline *tmp = cur_token;
2433    
2434     if ( cur_token == searchwordlist ) // we are removing first token
2435     searchwordlist = cur_token->next;
2436     else
2437     {
2438     prev_token->next = cur_token->next; // remove one in the middle
2439     cur_token = prev_token; // save if remove == 2
2440     }
2441    
2442    
2443     efree( tmp->line );
2444     efree( tmp );
2445    
2446     if ( remove == 2 )
2447     {
2448     tmp = cur_token;
2449    
2450     if ( cur_token == searchwordlist ) // we are removing first token
2451     searchwordlist = cur_token->next;
2452     else
2453     prev_prev_token->next = cur_token->next; // remove one in the middle
2454    
2455     efree( tmp->line );
2456     efree( tmp );
2457     }
2458    
2459    
2460     /* start at the beginning again */
2461    
2462     cur_token = searchwordlist;
2463     continue;
2464     }
2465    
2466     if ( prev_token )
2467     prev_prev_token = prev_token;
2468    
2469     prev_token = cur_token;
2470     cur_token = cur_token->next;
2471     }
2472    
2473    
2474     if ( in_phrase || paren_count )
2475     sw->lasterror = QUERY_SYNTAX_ERROR;
2476    
2477     else if ( !word_count )
2478     sw->lasterror = srch->removed_stopwords ? WORDS_TOO_COMMON : NO_WORDS_IN_SEARCH;
2479    
2480     return searchwordlist;
2481    
2482     }
2483    
2484    
2485    
2486     /* Adds a file number to a hash table of results.
2487     ** If the entry's already there, add the ranks,
2488     ** else make a new entry.
2489     **
2490     ** Jose Ruiz 04/00
2491     ** For better performance in large "or"
2492     ** keep the lists sorted by filenum
2493     **
2494     ** Jose Ruiz 2001/11 Rewritten to get better performance
2495     */
2496     RESULT_LIST *mergeresulthashlist(SWISH *sw, RESULT_LIST *l_r)
2497     {
2498     unsigned hashval;
2499     RESULT *r,
2500     *rp,
2501     *tmp,
2502     *next,
2503     *start,
2504     *newnode = NULL;
2505     RESULT_LIST *new_results_list = NULL;
2506     int i,
2507     tot_frequency,
2508     pos_off,
2509     filenum;
2510    
2511     if(!l_r)
2512     return NULL;
2513    
2514     if(!l_r->head)
2515     return NULL;
2516    
2517     /* Init hash table */
2518     for (i = 0; i < BIGHASHSIZE; i++)
2519     sw->Search->resulthashlist[i] = NULL;
2520    
2521     for(r = l_r->head, next = NULL; r; r =next)
2522     {
2523     next = r->next;
2524    
2525     tmp = NULL;
2526     hashval = bignumhash(r->filenum);
2527    
2528     rp = sw->Search->resulthashlist[hashval];
2529     for(tmp = NULL; rp; )
2530     {
2531     if (r->filenum <= rp->filenum)
2532     {
2533     break;
2534     }
2535     tmp = rp;
2536     rp = rp->next;
2537     }
2538     if (tmp)
2539     {
2540     tmp->next = r;
2541     }
2542     else
2543     {
2544     sw->Search->resulthashlist[hashval] = r;
2545     }
2546     r->next = rp;
2547     }
2548    
2549     /* Now coalesce reptitive filenums */
2550     for (i = 0; i < BIGHASHSIZE; i++)
2551     {
2552     rp = sw->Search->resulthashlist[i];
2553     for (filenum = 0, start = NULL; ; )
2554     {
2555     if(rp)
2556     next = rp->next;
2557     if(!rp || rp->filenum != filenum)
2558     {
2559     /* Start of new block, coalesce previous results */
2560     if(filenum)
2561     {
2562     int result_size;
2563    
2564     for(tmp = start, tot_frequency = 0; tmp!=rp; tmp = tmp->next)
2565     {
2566     tot_frequency += tmp->frequency;
2567     }
2568    
2569     result_size = sizeof(RESULT) + ((tot_frequency - 1) * sizeof(int));
2570     newnode = (RESULT *) Mem_ZoneAlloc(sw->Search->resultSearchZone, result_size );
2571     memset( newnode, 0, result_size );
2572    
2573     newnode->fi.filenum = newnode->filenum = filenum;
2574     newnode->rank = 0;
2575     newnode->tfrequency = 0;
2576     newnode->frequency = tot_frequency;
2577     newnode->indexf = start->indexf;
2578    
2579     for(tmp = start, pos_off = 0; tmp!=rp; tmp = tmp->next)
2580     {
2581     /* Compute rank if not yet computed */
2582     if(tmp->rank == -1)
2583     tmp->rank = getrank( sw, tmp->frequency, tmp->tfrequency, tmp->posdata, tmp->indexf, tmp->filenum );
2584    
2585     newnode->rank += tmp->rank;
2586     if (tmp->frequency)
2587     {
2588     CopyPositions(newnode->posdata, pos_off, tmp->posdata, 0, tmp->frequency);
2589     pos_off += tmp->frequency;
2590     }
2591    
2592     }
2593     /* Add at the end of new_results_list */
2594     if(!new_results_list)
2595     {
2596     new_results_list = newResultsList(sw);
2597     }
2598     addResultToList(new_results_list,newnode);
2599     /* Sort positions */
2600     swish_qsort(newnode->posdata,newnode->frequency,sizeof(int),&icomp_posdata);
2601     }
2602     if(rp)
2603     filenum = rp->filenum;
2604     start = rp;
2605     }
2606     if(!rp)
2607     break;
2608     rp = next;
2609     }
2610     }
2611     /* Sort results by filenum and return */
2612     return sortresultsbyfilenum(new_results_list);
2613     }
2614    

  ViewVC Help
Powered by ViewVC 1.1.22