/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/search.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/search.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 $Id: search.c,v 1.104 2002/08/22 22:58:39 whmoseley Exp $
3 **
4 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
5 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
6 **
7 ** This program and library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU (Library) General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU (Library) General Public License for more details.
16 **
17 ** You should have received a copy of the GNU (Library) General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 **-----------------------------------------------------------------
21 ** Changes in expandstar and parseterm to fix the wildcard * problem.
22 ** G. Hill, ghill@library.berkeley.edu 3/11/97
23 **
24 ** Changes in notresultlist, parseterm, and fixnot to fix the NOT problem
25 ** G. Hill, ghill@library.berkeley.edu 3/13/97
26 **
27 ** Changes in search, parseterm, fixnot, operate, getfileinfo
28 ** to support METADATA
29 ** G. Hill 3/18/97 ghill@library.berkeley.edu
30 **
31 ** Change in search to allow for search with a list including
32 ** also some empty indexes.
33 ** G. Hill after a suggestion by J. Winstead 12/18/97
34 **
35 ** Created countResults for number of hits in search
36 ** G. Hill 12/18/97
37 **
38 **
39 ** Change in search to allow maxhits to return N number
40 ** of results for each index specified
41 ** D. Norris after suggestion by D. Chrisment 08/29/99
42 **
43 ** Created resultmaxhits as a global, renewable maxhits
44 ** D. Norris 08/29/99
45 **
46 ** added word length arg to Stem() call for strcat overflow checking in stemmer.c
47 ** added safestrcpy() macro to avoid corruption from strcpy overflow
48 ** SRE 11/17/99
49 **
50 ** 10/10/99 & 11/23/99 - Bill Moseley (merged by SRE)
51 ** - Changed to stem words *before* expanding with expandstar
52 ** so can find words in the index
53 ** - Moved META tag check before expandstar so META names don't get
54 ** expanded!
55 **
56 ** fixed cast to int problems pointed out by "gcc -Wall"
57 ** SRE 2/22/00
58 **
59 ** fixed search() for case where stopword is followed by rule:
60 ** stopword was removed, rule was left, no matches ever found
61 ** added "# Stopwords removed:" to output header so caller can
62 ** trap actions of IGNORE_STOPWORDS_IN_QUERY
63 ** SRE 2/25/00
64 **
65 ** 04/00 - Jose Ruiz
66 ** Added code for phrase search
67 ** - New function phraseresultlists
68 ** - New function expandphrase
69 **
70 ** 04/00 - Jose Ruiz
71 ** Added freeresult function for freing results memory
72 ** Also added changes to orresultlists andresultlists notresultlist
73 ** for freing memory
74 **
75 ** 04/00 - Jose Ruiz
76 ** Now use bighash instead of hash for better performance in
77 ** orresultlist (a* or b*). Also changed hash.c
78 **
79 ** 04/00 - Jose Ruiz
80 ** Function getfileinfo rewrite
81 ** - Now use a hash approach for faster searching
82 ** - Solves the long timed searches (a* or b* or c*)
83 **
84 ** 04/00 - Jose Ruiz
85 ** Ordering of result rewrite
86 ** Now builtin C function qsort is used for faster ordering of results
87 ** This is useful when lots of results are found
88 ** For example: not (axf) -> This gives you all the documents!!
89 **
90 ** 06/00 - Jose Ruiz
91 ** Rewrite of andresultlits and phraseresultlists for better permonace
92 ** New function notresultlits for better performance
93 **
94 ** 07/00 and 08/00 - Jose Ruiz
95 ** Many modifications to make all search functions thread safe
96 **
97 ** 08/00 - Added ascending and descending capabilities in results sorting
98 **
99 ** 2001-02-xx rasc search call changed, tolower changed...
100 ** 2001-03-03 rasc altavista search, translatechar in headers
101 ** 2001-03-13 rasc definable logical operators via sw->SearchAlt->srch_op.[and|or|nor]
102 ** bugfix in parse_search_string handling...
103 ** 2001-03-14 rasc resultHeaderOutput -H <n>
104 **
105 ** 2001-05-23 moseley - replace parse_search_string with new parser
106 **
107 */
108
109 #include "swish.h"
110 #include "mem.h"
111 #include "string.h"
112 #include "search.h"
113 #include "index.h"
114 #include "file.h"
115 #include "list.h"
116 #include "merge.h"
117 #include "hash.h"
118 #include "docprop.h"
119 #include "error.h"
120 #include "compress.h"
121 /* removed stuff
122 #include "deflate.h"
123 */
124 #include "metanames.h"
125 #include "result_sort.h"
126 #include "result_output.h"
127 #include "search_alt.h"
128 #include "db.h"
129 #include "swish_words.h"
130 #include "swish_qsort.h"
131
132 #include "proplimit.h"
133
134 #include "rank.h"
135
136
137 /*** ??? $$$ fix this ****/
138 #define TOTAL_WORDS_FIX 1;
139 /*
140 -- init structures for this module
141 */
142
143 void initModule_Search (SWISH *sw)
144
145 {
146 struct MOD_Search *srch;
147
148 srch = (struct MOD_Search *) emalloc(sizeof(struct MOD_Search));
149 sw->Search = srch;
150
151
152 /* Default variables for search */
153 srch->maxhits = -1;
154 srch->beginhits = 0;
155
156 srch->currentMaxPropertiesToDisplay = 0;
157 srch->numPropertiesToDisplay = 0;
158
159 srch->db_results = NULL;
160
161 srch->propNameToDisplay =NULL;
162
163 srch->PhraseDelimiter=PHRASE_DELIMITER_CHAR;
164
165 srch->bigrank = 0;
166
167 srch->resultSearchZone = Mem_ZoneCreate("resultSearch Zone", 0, 0);
168
169 return;
170 }
171
172 /* Resets memory of vars used by ResultSortt properties configuration */
173 void resetModule_Search (SWISH *sw)
174
175 {
176 struct DB_RESULTS *tmp;
177 struct MOD_Search *srch = sw->Search;
178 int i;
179 IndexFILE *tmpindexlist;
180
181
182
183 /* Default variables for search */
184 srch->maxhits = -1;
185 srch->beginhits = 0;
186
187 /* Free results from search if they exists */
188
189 while ( srch->db_results )
190 {
191 tmp = srch->db_results->next;
192 freeresultlist(sw,srch->db_results);
193 efree(srch->db_results);
194 srch->db_results = tmp;
195 }
196
197
198
199
200
201 Mem_ZoneReset(srch->resultSearchZone);
202
203
204 /* Free display props arrays */
205 /* First the common part to all the index files */
206 if (srch->propNameToDisplay)
207 {
208 for(i=0;i<srch->numPropertiesToDisplay;i++)
209 efree(srch->propNameToDisplay[i]);
210
211 efree(srch->propNameToDisplay);
212 }
213
214 srch->propNameToDisplay=NULL;
215 srch->numPropertiesToDisplay=0;
216 srch->currentMaxPropertiesToDisplay=0;
217
218 /* Now the IDs of each index file */
219 for(tmpindexlist=sw->indexlist;tmpindexlist;tmpindexlist=tmpindexlist->next)
220 {
221 if (tmpindexlist->propIDToDisplay)
222 efree(tmpindexlist->propIDToDisplay);
223
224 tmpindexlist->propIDToDisplay=NULL;
225 }
226 }
227
228 /*
229 -- release all wired memory for this module
230 */
231
232 void freeModule_Search (SWISH *sw)
233
234 {
235 struct MOD_Search *srch = sw->Search;
236
237 /* free module data */
238 Mem_ZoneFree(&srch->resultSearchZone);
239
240 efree (srch);
241 sw->Search = NULL;
242
243 return;
244 }
245
246 // #define DUMP_RESULTS 1
247
248
249 #ifdef DUMP_RESULTS
250
251 void dump_result_lists( SWISH *sw, char *message )
252
253 {
254 struct DB_RESULTS *db_results = sw->Search->db_results;
255 int cnt = 0;
256
257 printf("\nDump Results: (%s)\n", message );
258
259 while ( db_results )
260 {
261 RESULT *result;
262 printf("Looking at index\n");
263 if ( !db_results->resultlist )
264 {
265 printf("no resultlist\n");
266 continue;
267 }
268 result = db_results->resultlist->head;
269
270 if ( !result )
271 {
272 printf("resultlist, but head is null\n");
273 continue;
274 }
275 while ( result )
276 {
277 printf("Result (%2d): filenum '%d' from index file '%s'\n", ++cnt, result->filenum, result->indexf->line );
278 result = result->next;
279 }
280
281 printf("end of results for index\n");
282
283 db_results = db_results->next;
284 }
285 printf("end of results\n\n");
286 }
287 #endif
288
289
290
291
292 /*
293 ** ----------------------------------------------
294 **
295 ** Module config code starts here
296 **
297 ** ----------------------------------------------
298 */
299
300
301 /*
302 -- Config Directives
303 -- Configuration directives for this Module
304 -- return: 0/1 = none/config applied
305 */
306
307 int configModule_Search (SWISH *sw, StringList *sl)
308
309 {
310 //struct MOD_Search *srch = sw->Search;
311 //char *w0 = sl->word[0];
312 int retval = 0;
313
314 return retval;
315 }
316
317
318 /* Compare two integers */
319 /* This routine is used by qsort */
320 int icomp(const void *s1, const void *s2)
321 {
322 return (*(int *) s1 - *(int *) s2);
323 }
324
325 /* Compare two positions as stored in posdata */
326 /* This routine is used by qsort */
327 int icomp_posdata(const void *s1, const void *s2)
328 {
329 return (GET_POSITION(*(int *) s1) - GET_POSITION(*(int *) s2));
330 }
331
332 /* 01/2001 Jose Ruiz */
333 /* Compare RESULTS using RANK */
334 /* This routine is used by qsort */
335 int compResultsByFileNum(const void *s1, const void *s2)
336 {
337 return ((*(RESULT * const *) s1)->filenum - (*(RESULT * const *) s2)->filenum);
338 }
339
340 /* 04/00 Jose Ruiz */
341 /* Simple routing for comparing pointers to integers in order to
342 get an ascending sort with qsort */
343 /* Identical to previous one but use two integers per array */
344 int icomp2(const void *s1, const void *s2)
345 {
346 int rc,
347 *p1,
348 *p2;
349
350 rc = (*(int *) s1 - *(int *) s2);
351 if (rc)
352 return (rc);
353 else
354 {
355 p1 = (int *) s1;
356 p2 = (int *) s2;
357 return (*(++p1) - *(++p2));
358 }
359 }
360
361
362
363 /*
364 -- Search Swish
365 -- Check if AltaVista/Lycos/Web.de like search string has to be converted
366 -- and call swish search...
367 -- 2001-03-02 rasc
368 */
369
370 int search(SWISH * sw, char *words, int structure)
371 {
372 char *sw_srch_str;
373 int ret;
374
375
376 if (sw->SearchAlt->enableAltSearchSyntax)
377 { /* AltaVista like search enabled? */
378 sw_srch_str = convAltSearch2SwishStr(words);
379 ret = search_2(sw, sw_srch_str, structure);
380 efree(sw_srch_str);
381 }
382 else
383 {
384 ret = search_2(sw, words, structure);
385 }
386 return ret;
387 }
388
389
390 static void limit_result_list( SWISH *sw )
391 {
392 RESULT *rtmp;
393 RESULT *rp;
394 RESULT *last;
395 struct DB_RESULTS *db_results = sw->Search->db_results;
396
397
398 /* Process each index */
399 while ( db_results )
400 {
401 if(db_results->resultlist)
402 rp = db_results->resultlist->head;
403 else
404 rp = NULL;
405
406 last = NULL;
407
408 while (rp)
409 {
410 rtmp = rp->next;
411
412 if ( LimitByProperty( sw, rp->indexf, rp->filenum ) )
413 {
414 freeresult( sw, rp );
415
416 if ( !last ) /* if first in list */
417 db_results->resultlist->head = rtmp;
418 else
419 last->next = rtmp;
420 }
421 else
422 last = rp; /* move the last pointer to current one */
423
424 rp = rtmp;
425 }
426
427 db_results = db_results->next;
428
429 }
430 }
431
432
433
434
435 /********************************************************************************
436 * Returns a negative number on error, zero or positive = number of results
437 *
438 ********************************************************************************/
439
440 int search_2(SWISH * sw, char *words, int structure)
441 {
442 int j,
443 k,
444 hassearch,
445 indexYes,
446 totalResults;
447 struct swline *searchwordlist,
448 *tmplist2;
449 IndexFILE *indexlist;
450 int rc = 0;
451 unsigned char PhraseDelimiter;
452 struct DB_RESULTS *db_results,
453 *db_tmp;
454 struct MOD_Search *srch = sw->Search;
455
456
457 /* If not words - do nothing */
458 if (!words || !*words)
459 return (sw->lasterror = NO_WORDS_IN_SEARCH);
460
461 PhraseDelimiter = (unsigned char) srch->PhraseDelimiter;
462
463 indexlist = sw->indexlist;
464 sw->Search->db_results = NULL;
465
466
467 j = 0;
468 searchwordlist = NULL;
469 indexYes = 0;
470 totalResults = 0;
471 hassearch = 0;
472
473 sw->lasterror = RC_OK;
474 sw->commonerror = RC_OK;
475
476 if ((rc = initSearchResultProperties(sw)))
477 return rc;
478 if ((rc = initSortResultProperties(sw)))
479 return rc;
480
481
482 /* This returns false when no files found within the limit */
483 if ( !Prepare_PropLookup( sw ) )
484 return sw->lasterror; /* normally RC_OK (no results), but could be an error */
485
486 while (indexlist != NULL)
487 {
488 /* moved these inside the loop Nov 24,2001. Need to verify */
489 sw->lasterror = RC_OK;
490 sw->commonerror = RC_OK;
491
492
493 /* Clean up from previous loop */
494 if (searchwordlist)
495 {
496 freeswline(searchwordlist);
497 searchwordlist = NULL;
498 }
499
500
501
502 /* tokenize the query into swish words based on this current index */
503
504 if (!(searchwordlist = tokenize_query_string(sw, words, &indexlist->header)))
505 {
506 indexlist = indexlist->next;
507 if ( sw->lasterror )
508 return sw->lasterror;
509 continue;
510 }
511
512
513 hassearch = 1; // Flag that we found some words to search form
514
515 srch->bigrank = 0;
516
517
518 /* Is there an open index? */
519 if (!indexlist->DB)
520 {
521 if (searchwordlist)
522 freeswline(searchwordlist); /* 2001-03-13 rasc */
523 return sw->lasterror;
524 }
525
526
527 /* Any files in the index? */
528
529 if (!indexlist->header.totalfiles)
530 {
531 indexlist = indexlist->next;
532 continue;
533 }
534 else
535 {
536 indexYes = 1; /*There is a non-empty index */
537 }
538
539
540 resultHeaderOut(sw, 2, "#\n# Index File: %s\n", indexlist->line);
541
542 searchwordlist = (struct swline *) ignore_words_in_query(sw, indexlist, searchwordlist, PhraseDelimiter);
543 if ( sw->lasterror )
544 {
545 if (searchwordlist)
546 freeswline(searchwordlist);
547 return sw->lasterror;
548 }
549
550
551 /* Echo index file, fixed search, stopwords */
552
553 /* Result Header Output (2001-03-14 rasc, rewritten) */
554
555 resultPrintHeader(sw, 2, &indexlist->header, indexlist->header.savedasheader, 0);
556
557
558 resultHeaderOut(sw, 3, "# StopWords:");
559 for (k = 0; k < indexlist->header.stopPos; k++)
560 resultHeaderOut(sw, 3, " %s", indexlist->header.stopList[k]);
561
562 resultHeaderOut(sw, 3, "\n");
563
564
565
566 printheaderbuzzwords(sw,indexlist);
567
568
569 resultHeaderOut(sw, 2, "# Search Words: %s\n", words);
570
571 tmplist2 = searchwordlist;
572
573 if ( !tmplist2 )
574 sw->commonerror++; // Flag at the query (with this index) did no have any search words
575 else
576 {
577 resultHeaderOut(sw, 2, "# Parsed Words: ");
578
579 while (tmplist2)
580 {
581 resultHeaderOut(sw, 2, "%s ", tmplist2->line);
582 tmplist2 = tmplist2->next;
583 }
584
585 resultHeaderOut(sw, 2, "\n");
586 }
587
588
589 /* Expand phrase search: "kim harlow" becomes (kim PHRASE_WORD harlow) */
590 searchwordlist = (struct swline *) expandphrase(searchwordlist, PhraseDelimiter);
591 searchwordlist = (struct swline *) fixmetanames(searchwordlist);
592 searchwordlist = (struct swline *) fixnot1(searchwordlist);
593 searchwordlist = (struct swline *) fixnot2(searchwordlist);
594
595 /* Allocate memory for the result list structure */
596 db_results = (struct DB_RESULTS *) emalloc(sizeof(struct DB_RESULTS));
597 memset( db_results, 0, sizeof(struct DB_RESULTS));
598
599
600 /* Now do the search */
601
602 if (searchwordlist)
603 {
604 int metaID = 1; /* Default meta ID to search */
605
606 tmplist2 = searchwordlist;
607 db_results->resultlist = (RESULT_LIST *) parseterm(sw, 0, metaID, structure, indexlist, &tmplist2);
608 }
609
610
611 /* add db_results to the list of results */
612
613 if ( !sw->Search->db_results)
614 sw->Search->db_results = db_results;
615 else
616 {
617 db_tmp = sw->Search->db_results;
618 while (db_tmp)
619 {
620 if (!db_tmp->next)
621 {
622 db_tmp->next = db_results;
623 break;
624 }
625 db_tmp = db_tmp->next;
626 }
627
628 }
629
630
631 indexlist = indexlist->next;
632
633 } /* end process each index file */
634
635
636 /* Did any of the indexes have a query? */
637
638 if (!hassearch)
639 return (sw->lasterror = NO_WORDS_IN_SEARCH);
640
641
642
643 if (searchwordlist)
644 {
645 freeswline(searchwordlist);
646 searchwordlist = NULL;
647 }
648
649
650 /* Limit result list by -L parameter */
651 // note -- this used to be in addtoresultlist, but that was checking every word
652 // placing here means that it only checks for each file, instead of each word.
653
654 if ( is_prop_limit_used( sw ) )
655 limit_result_list( sw );
656
657 #ifdef DUMP_RESULTS
658 dump_result_lists( sw, "After search" );
659 #endif
660
661
662 /* 04/00 Jose Ruiz - Sort results by rank or by properties */
663
664 totalResults = sortresults(sw, structure);
665
666
667 if (!totalResults && sw->commonerror)
668 return (sw->lasterror = WORDS_TOO_COMMON);
669
670 if (!totalResults && !indexYes)
671 return (sw->lasterror = INDEX_FILE_IS_EMPTY);
672
673 return totalResults;
674 }
675
676 /* 2001-09 jmruiz - Rewriting
677 ** This puts parentheses in the right places around meta searches
678 ** to avoid problems whith them. Basically "metaname = bla"
679 ** becomes "(metanames = bla)" */
680
681 struct swline *fixmetanames(struct swline *sp)
682 {
683 int metapar;
684 struct swline *tmpp,
685 *newp;
686
687 tmpp = sp;
688 newp = NULL;
689
690 /* Fix metanames with parenthesys eg: metaname = bla => (metanames = bla) */
691 while (tmpp != NULL)
692 {
693 if (isMetaNameOpNext(tmpp->next))
694 {
695 /* If it is a metaName add the name and = and skip to next */
696 newp = (struct swline *) addswline(newp, "(");
697 newp = (struct swline *) addswline(newp, tmpp->line);
698 newp = (struct swline *) addswline(newp, "=");
699 tmpp = tmpp->next;
700 tmpp = tmpp->next;
701 if ( !tmpp )
702 return NULL;
703
704 /* 06/00 Jose Ruiz
705 ** Fix to consider parenthesys in the
706 ** content of a MetaName */
707 if (tmpp->line[0] == '(')
708 {
709 metapar = 1;
710 newp = (struct swline *) addswline(newp, tmpp->line);
711 tmpp = tmpp->next;
712 while (metapar && tmpp)
713 {
714 if (tmpp->line[0] == '(')
715 metapar++;
716 else if (tmpp->line[0] == ')')
717 metapar--;
718 newp = (struct swline *) addswline(newp, tmpp->line);
719 if (metapar)
720 tmpp = tmpp->next;
721 }
722 if (!tmpp)
723 return (newp);
724 }
725 else
726 newp = (struct swline *) addswline(newp, tmpp->line);
727 newp = (struct swline *) addswline(newp, ")");
728 }
729 else
730 newp = (struct swline *) addswline(newp, tmpp->line);
731 /* next one */
732 tmpp = tmpp->next;
733 }
734
735 freeswline(sp);
736 return newp;
737 }
738
739 /* 2001 -09 jmruiz Rewritten
740 ** This optimizes some NOT operator to be faster.
741 **
742 ** "word1 not word" is changed by "word1 and_not word2"
743 **
744 ** In the old way the previous query was...
745 ** get results if word1
746 ** get results of word2
747 ** not results of word2 (If we have 100000 docs and word2 is in
748 ** just 3 docs, this means read 99997
749 ** results)
750 ** intersect both list of results
751 **
752 ** The "new way"
753 ** get results if word1
754 ** get results of word2
755 ** intersect (and_not_rule) both lists of results
756 **
757 */
758
759 struct swline *fixnot1(struct swline *sp)
760 {
761 struct swline *tmpp,
762 *prev;
763
764 if (!sp)
765 return NULL;
766 /* 06/00 Jose Ruiz - Check if first word is NOT_RULE */
767 /* Change remaining NOT by AND_NOT_RULE */
768 for (tmpp = sp, prev = NULL; tmpp; prev = tmpp, tmpp = tmpp->next)
769 {
770 if (tmpp->line[0] == '(')
771 continue;
772 else if (isnotrule(tmpp->line))
773 {
774 if(prev && prev->line[0]!='=' && prev->line[0]!='(')
775 {
776 efree(tmpp->line);
777 tmpp->line = estrdup(AND_NOT_WORD);
778 }
779 }
780 }
781 return sp;
782 }
783
784 /* 2001 -09 jmruiz - Totally new - Fix the meta=(not ahsg) bug
785 ** Add parentheses to avoid the way operator NOT confuse complex queries */
786 struct swline *fixnot2(struct swline *sp)
787 {
788 int openparen, found;
789 struct swline *tmpp, *newp;
790 char *magic = "<__not__>"; /* magic avoids parsing the
791 ** "not" operator twice
792 ** and put the code in an
793 ** endless loop */
794
795 found = 1;
796 while(found)
797 {
798 openparen = 0;
799 found = 0;
800
801 for (tmpp = sp , newp = NULL; tmpp ; tmpp = tmpp->next)
802 {
803 if (isnotrule(tmpp->line))
804 {
805 found = 1;
806 /* Add parentheses */
807 newp = (struct swline *) addswline(newp, "(");
808 /* Change "NOT" by magic to avoid find it in next iteration */
809 newp = (struct swline *) addswline(newp, magic);
810 for(tmpp = tmpp->next; tmpp; tmpp = tmpp->next)
811 {
812 if ((tmpp->line)[0] == '(')
813 openparen++;
814 else if(!openparen)
815 {
816 newp = (struct swline *) addswline(newp, tmpp->line);
817 /* Add parentheses */
818 newp = (struct swline *) addswline(newp, ")");
819 break;
820 }
821 else if ((tmpp->line)[0] == ')')
822 openparen--;
823 newp = (struct swline *) addswline(newp, tmpp->line);
824 }
825 if(!tmpp)
826 break;
827 }
828 else
829 newp = (struct swline *) addswline(newp, tmpp->line);
830 }
831 freeswline(sp);
832 sp = newp;
833 }
834
835 /* remove magic and put the "real" NOT in place */
836 for(tmpp = newp; tmpp ; tmpp = tmpp->next)
837 {
838 if(!strcmp(tmpp->line,magic))
839 {
840 efree(tmpp->line);
841 tmpp->line = estrdup(NOT_WORD);
842 }
843 }
844
845 return newp;
846 }
847
848 /* expandstar removed - Jose Ruiz 04/00 */
849
850 /* Expands phrase search. Berkeley University becomes Berkeley PHRASE_WORD University */
851 /* It also fixes the and, not or problem when they appeared inside a phrase */
852 struct swline *expandphrase(struct swline *sp, char delimiter)
853 {
854 struct swline *tmp,
855 *newp;
856 int inphrase;
857
858 if (!sp)
859 return NULL;
860 inphrase = 0;
861 newp = NULL;
862 tmp = sp;
863 while (tmp != NULL)
864 {
865 if ((tmp->line)[0] == delimiter)
866 {
867 if (inphrase)
868 {
869 inphrase = 0;
870 newp = (struct swline *) addswline(newp, ")");
871 }
872 else
873 {
874 inphrase++;
875 newp = (struct swline *) addswline(newp, "(");
876 }
877 }
878 else
879 {
880 if (inphrase)
881 {
882 if (inphrase > 1)
883 newp = (struct swline *) addswline(newp, PHRASE_WORD);
884 inphrase++;
885 newp = (struct swline *) addswline(newp, tmp->line);
886 }
887 else
888 {
889 char *operator;
890
891 if ( ( operator = isBooleanOperatorWord( tmp->line )) )
892 newp = (struct swline *) addswline(newp, operator);
893 else
894 newp = (struct swline *) addswline(newp, tmp->line);
895 }
896 }
897 tmp = tmp->next;
898 }
899 freeswline(sp);
900 return newp;
901 }
902
903
904
905 /* Print the buzzwords */
906 void printheaderbuzzwords(SWISH *sw, IndexFILE * indexf)
907 {
908 int hashval;
909 struct swline *sp = NULL;
910
911 resultHeaderOut(sw, 3, "# BuzzWords:");
912
913 for (hashval = 0; hashval < HASHSIZE; hashval++)
914 {
915 sp = indexf->header.hashbuzzwordlist[hashval];
916 while (sp != NULL)
917 {
918 resultHeaderOut(sw, 3, " %s", sp->line);
919 sp = sp->next;
920 }
921 }
922 resultHeaderOut(sw, 3, "\n");
923 }
924
925
926
927
928
929
930 /* The recursive parsing function.
931 ** This was a headache to make but ended up being surprisingly easy. :)
932 ** parseone tells the function to only operate on one word or term.
933 ** parseone is needed so that with metaA=foo bar "bar" is searched
934 ** with the default metaname.
935 */
936
937 RESULT_LIST *parseterm(SWISH * sw, int parseone, int metaID, int structure, IndexFILE * indexf, struct swline **searchwordlist)
938 {
939 int rulenum;
940 char *word;
941 int lenword;
942 RESULT_LIST *l_rp,
943 *new_l_rp;
944
945 /*
946 * The andLevel is used to help keep the ranking function honest
947 * when it ANDs the results of the latest search term with
948 * the results so far (rp). The idea is that if you AND three
949 * words together you ultimately want to resulting rank to
950 * be the average of all three individual work ranks. By keeping
951 * a running total of the number of terms already ANDed, the
952 * next AND operation can properly scale the average-rank-so-far
953 * and recompute the new average properly (see andresultlists()).
954 * This implementation is a little weak in that it will not average
955 * across terms that are in parenthesis. (It treats an () expression
956 * as one term, and weights it as "one".)
957 */
958 int andLevel = 0; /* number of terms ANDed so far */
959
960 word = NULL;
961 lenword = 0;
962
963 l_rp = NULL;
964
965 rulenum = OR_RULE;
966 while (*searchwordlist)
967 {
968
969 word = SafeStrCopy(word, (*searchwordlist)->line, &lenword);
970
971
972 if (rulenum == NO_RULE)
973 rulenum = sw->SearchAlt->srch_op.defaultrule;
974
975
976 if (isunaryrule(word)) /* is it a NOT? */
977 {
978 *searchwordlist = (*searchwordlist)->next;
979 l_rp = (RESULT_LIST *) parseterm(sw, 1, metaID, structure, indexf, searchwordlist);
980 l_rp = (RESULT_LIST *) notresultlist(sw, l_rp, indexf);
981
982 /* Wild goose chase */
983 rulenum = NO_RULE;
984 continue;
985 }
986
987
988 /* If it's an operator, set the current rulenum, and continue */
989 else if (isbooleanrule(word))
990 {
991 rulenum = getrulenum(word);
992 *searchwordlist = (*searchwordlist)->next;
993 continue;
994 }
995
996
997 /* Bump up the count of AND terms for this level */
998
999 if (rulenum != AND_RULE)
1000 andLevel = 0; /* reset */
1001 else if (rulenum == AND_RULE)
1002 andLevel++;
1003
1004
1005
1006 /* Is this the start of a sub-query? */
1007
1008 if (word[0] == '(')
1009 {
1010
1011
1012 /* Recurse */
1013 *searchwordlist = (*searchwordlist)->next;
1014 new_l_rp = (RESULT_LIST *) parseterm(sw, 0, metaID, structure, indexf, searchwordlist);
1015
1016
1017 if (rulenum == AND_RULE)
1018 l_rp = (RESULT_LIST *) andresultlists(sw, l_rp, new_l_rp, andLevel);
1019
1020 else if (rulenum == OR_RULE)
1021 l_rp = (RESULT_LIST *) orresultlists(sw, l_rp, new_l_rp);
1022
1023 else if (rulenum == PHRASE_RULE)
1024 l_rp = (RESULT_LIST *) phraseresultlists(sw, l_rp, new_l_rp, 1);
1025
1026 else if (rulenum == AND_NOT_RULE)
1027 l_rp = (RESULT_LIST *) notresultlists(sw, l_rp, new_l_rp);
1028
1029 if (!*searchwordlist)
1030 break;
1031
1032 rulenum = NO_RULE;
1033 continue;
1034
1035 }
1036 else if (word[0] == ')')
1037 {
1038 *searchwordlist = (*searchwordlist)->next;
1039 break;
1040 }
1041
1042
1043 /* Now down to checking for metanames and actual search words */
1044
1045
1046 /* Check if the next word is '=' */
1047 if (isMetaNameOpNext((*searchwordlist)->next))
1048 {
1049 struct metaEntry *m = getMetaNameByName(&indexf->header, word);
1050
1051 /* shouldn't happen since already checked */
1052 if ( !m )
1053 progerr("Unknown metaname '%s' -- swish_words failed to find.", word );
1054
1055 metaID = m->metaID;
1056
1057
1058 /* Skip both the metaName end the '=' */
1059 *searchwordlist = (*searchwordlist)->next->next;
1060
1061
1062 if ((*searchwordlist) && ((*searchwordlist)->line[0] == '('))
1063 {
1064 *searchwordlist = (*searchwordlist)->next;
1065 parseone = 0;
1066 }
1067 else
1068 parseone = 1;
1069
1070 /* Now recursively process the next terms */
1071
1072 new_l_rp = (RESULT_LIST *) parseterm(sw, parseone, metaID, structure, indexf, searchwordlist);
1073 if (rulenum == AND_RULE)
1074 l_rp = (RESULT_LIST *) andresultlists(sw, l_rp, new_l_rp, andLevel);
1075
1076 else if (rulenum == OR_RULE)
1077 l_rp = (RESULT_LIST *) orresultlists(sw, l_rp, new_l_rp);
1078
1079 else if (rulenum == PHRASE_RULE)
1080 l_rp = (RESULT_LIST *) phraseresultlists(sw, l_rp, new_l_rp, 1);
1081
1082 else if (rulenum == AND_NOT_RULE)
1083 l_rp = (RESULT_LIST *) notresultlists(sw, l_rp, new_l_rp);
1084
1085 if (!*searchwordlist)
1086 break;
1087
1088 rulenum = NO_RULE;
1089 metaID = 1;
1090 continue;
1091 }
1092
1093
1094 /* Finally, look up a word, and merge with previous results. */
1095
1096 l_rp = (RESULT_LIST *) operate(sw, l_rp, rulenum, word, indexf->DB, metaID, structure, andLevel, indexf);
1097
1098 if (parseone)
1099 {
1100 *searchwordlist = (*searchwordlist)->next;
1101 break;
1102 }
1103 rulenum = NO_RULE;
1104
1105 *searchwordlist = (*searchwordlist)->next;
1106 }
1107
1108 if (lenword)
1109 efree(word);
1110
1111 return l_rp;
1112 }
1113
1114 /* Looks up a word in the index file -
1115 ** it calls getfileinfo(), which does the real searching.
1116 */
1117
1118 RESULT_LIST *operate(SWISH * sw, RESULT_LIST * l_rp, int rulenum, char *wordin, void *DB, int metaID, int structure, int andLevel, IndexFILE * indexf)
1119 {
1120 RESULT_LIST *new_l_rp,
1121 *return_l_rp;
1122 char *word;
1123 int lenword;
1124
1125 word = estrdup(wordin);
1126 lenword = strlen(word);
1127
1128 new_l_rp = return_l_rp = NULL;
1129
1130
1131 /* ??? Stopwords have already been removed at this point. */
1132 /*****
1133 if (isstopword(&indexf->header, word) && !isrule(word))
1134 {
1135 if (rulenum == OR_RULE && l_rp != NULL)
1136 return l_rp;
1137 else
1138 sw->commonerror = 1;
1139 }
1140 ******/
1141
1142
1143 /* ??? Ever a time when rulenum is not one of these? */
1144
1145 if (rulenum == AND_RULE)
1146 {
1147 new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1148 return_l_rp = (RESULT_LIST *) andresultlists(sw, l_rp, new_l_rp, andLevel);
1149 }
1150
1151
1152 else if (rulenum == OR_RULE)
1153 {
1154 new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1155 return_l_rp = (RESULT_LIST *) orresultlists(sw, l_rp, new_l_rp);
1156 }
1157
1158
1159 else if (rulenum == NOT_RULE)
1160 {
1161 new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1162 return_l_rp = (RESULT_LIST *) notresultlist(sw, new_l_rp, indexf);
1163 }
1164
1165
1166 else if (rulenum == PHRASE_RULE)
1167 {
1168 new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1169 return_l_rp = (RESULT_LIST *) phraseresultlists(sw, l_rp, new_l_rp, 1);
1170 }
1171
1172
1173 else if (rulenum == AND_NOT_RULE)
1174 {
1175 new_l_rp = (RESULT_LIST *) getfileinfo(sw, word, indexf, metaID, structure);
1176 return_l_rp = (RESULT_LIST *) notresultlists(sw, l_rp, new_l_rp);
1177 }
1178
1179
1180 efree(word);
1181 return return_l_rp;
1182 }
1183
1184
1185 static RESULT_LIST *newResultsList(SWISH *sw)
1186 {
1187 RESULT_LIST *l = (RESULT_LIST *)Mem_ZoneAlloc(sw->Search->resultSearchZone, sizeof(RESULT_LIST));
1188 l->head = l->tail = NULL;
1189 l->sw = (struct SWISH *)sw;
1190
1191 return l;
1192 }
1193
1194 void addResultToList(RESULT_LIST *l_r, RESULT *r)
1195 {
1196 r->next = NULL;
1197
1198 r->reslist = l_r;
1199
1200 if(!l_r->head)
1201 l_r->head = r;
1202 if(l_r->tail)
1203 l_r->tail->next = r;
1204 l_r->tail = r;
1205
1206 }
1207
1208
1209 /* Routine to test structure in a result */
1210 /* Also removes posdata that do not fit with structure field */
1211 int test_structure(int structure, int frequency, int *posdata)
1212 {
1213 int i, j; /* i -> counter upto frequency, j -> new frequency */
1214 int *p,*q; /* Use pointers to ints instead of arrays for
1215 ** faster proccess */
1216
1217 for(i = j = 0, p = q = posdata; i < frequency; i++, p++)
1218 {
1219 if(GET_STRUCTURE(*p) & structure)
1220 {
1221 if(p - q)
1222 {
1223 *q = *p;
1224 }
1225 j++;
1226 q++;
1227 }
1228 }
1229 return j; /* return new frequency */
1230 }
1231
1232
1233
1234 /* Finds a word and returns its corresponding file and rank information list.
1235 ** If not found, NULL is returned.
1236 */
1237 /* Jose Ruiz
1238 ** New implmentation based on Hashing for direct access. Faster!!
1239 ** Also solves stars. Faster!! It can even found "and", "or"
1240 ** when looking for "an*" or "o*" if they are not stop words
1241 */
1242
1243 #define MAX_POSDATA_STACK 256
1244
1245 RESULT_LIST *getfileinfo(SWISH * sw, char *word, IndexFILE * indexf, int metaID, int structure)
1246 {
1247 int j,
1248 x,
1249 filenum,
1250 frequency,
1251 len,
1252 curmetaID,
1253 index_structure,
1254 index_structfreq,
1255 tmpval;
1256 RESULT_LIST *l_rp, *l_rp2;
1257 long wordID;
1258 unsigned long nextposmetaname;
1259 char *p;
1260 int tfrequency = 0;
1261 unsigned char *s, *buffer;
1262 int sz_buffer;
1263 unsigned char flag;
1264 int stack_posdata[MAX_POSDATA_STACK]; /* stack buffer for posdata */
1265 int *posdata;
1266
1267 x = j = filenum = frequency = len = curmetaID = index_structure = index_structfreq = 0;
1268 nextposmetaname = 0L;
1269
1270
1271 l_rp = l_rp2 = NULL;
1272
1273
1274 /* First: Look for star at the end of the word */
1275 if ((p = strrchr(word, '*')))
1276 {
1277 if (p != word && *(p - 1) == '\\') /* Check for an escaped * */
1278 {
1279 p = NULL; /* If escaped it is not a wildcard */
1280 }
1281 else
1282 {
1283 /* Check if it is at the end of the word */
1284 if (p == (word + strlen(word) - 1))
1285 {
1286 word[strlen(word) - 1] = '\0';
1287 /* Remove the wildcard - p remains not NULL */
1288 }
1289 else
1290 {
1291 p = NULL; /* Not at the end - Ignore */
1292 }
1293 }
1294 }
1295
1296
1297
1298
1299 DB_InitReadWords(sw, indexf->DB);
1300 if (!p) /* No wildcard -> Direct hash search */
1301 {
1302 DB_ReadWordHash(sw, word, &wordID, indexf->DB);
1303
1304 if(!wordID)
1305 {
1306 DB_EndReadWords(sw, indexf->DB);
1307 sw->lasterror = WORD_NOT_FOUND;
1308 return NULL;
1309 }
1310 }
1311
1312 else /* There is a star. So use the sequential approach */
1313 {
1314 char *resultword;
1315
1316 if (*word == '*')
1317 {
1318 sw->lasterror = UNIQUE_WILDCARD_NOT_ALLOWED_IN_WORD;
1319 return NULL;
1320 }
1321
1322
1323 DB_ReadFirstWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);
1324
1325 if (!wordID)
1326 {
1327 DB_EndReadWords(sw, indexf->DB);
1328 sw->lasterror = WORD_NOT_FOUND;
1329 return NULL;
1330 }
1331 efree(resultword); /* Do not need it */
1332 }
1333
1334
1335 /* If code is here we have found the word !! */
1336
1337 do
1338 {
1339 DB_ReadWordData(sw, wordID, &buffer, &sz_buffer, indexf->DB);
1340 s = buffer;
1341
1342 // buffer structure = <tfreq><metaID><delta to next meta>
1343
1344 /* Get the data of the word */
1345 tfrequency = uncompress2(&s); /* tfrequency - number of files with this word */
1346
1347
1348 /* Now look for a correct Metaname */
1349 curmetaID = uncompress2(&s);
1350
1351 while (curmetaID)
1352 {
1353 nextposmetaname = UNPACKLONG2(s);
1354 s += sizeof(long);
1355
1356
1357 if (curmetaID >= metaID)
1358 break;
1359
1360 s = buffer + nextposmetaname;
1361 if(nextposmetaname == (unsigned long)sz_buffer)
1362 break; // if no more meta data
1363
1364 curmetaID = uncompress2(&s);
1365 }
1366
1367 if (curmetaID == metaID) /* found a matching meta value */
1368 {
1369 filenum = 0;
1370 do
1371 {
1372 /* Read on all items */
1373 uncompress_location_values(&s,&flag,&tmpval,&frequency);
1374 filenum += tmpval;
1375
1376 /* stack_posdata is just to avoid calling emalloc */
1377 /* it should be enough for most cases */
1378 if(frequency > MAX_POSDATA_STACK)
1379 posdata = (int *)emalloc(frequency * sizeof(int));
1380 else
1381 posdata = stack_posdata;
1382
1383 /* read positions */
1384 uncompress_location_positions(&s,flag,frequency,posdata);
1385
1386 /* test structure and adjust frequency */
1387 frequency = test_structure(structure, frequency, posdata);
1388
1389 /* Store -1 in rank - In this way, we can delay its computation */
1390 /* This stuff has been removed */
1391
1392 /* Store result */
1393 if(frequency)
1394 {
1395 /* This is very useful if we sorted by other property */
1396 if(!l_rp)
1397 l_rp = newResultsList(sw);
1398
1399 // tfrequency = number of files with this word
1400 // frequency = number of times this words is in this document for this metaID
1401
1402 addtoresultlist(l_rp, filenum, -1, tfrequency, frequency, indexf, sw);
1403
1404 /* Copy positions */
1405 memcpy((unsigned char *)l_rp->tail->posdata,(unsigned char *)posdata,frequency * sizeof(int));
1406
1407 // Temp fix
1408 {
1409 struct RESULT *r1 = l_rp->tail;
1410 r1->rank = r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1411 }
1412 }
1413 if(posdata != stack_posdata)
1414 efree(posdata);
1415
1416
1417 } while ((unsigned long)(s - buffer) != nextposmetaname);
1418
1419
1420 }
1421
1422 efree(buffer);
1423
1424
1425 if (!p)
1426 break; /* direct access (no wild card) -> break */
1427
1428 else
1429 {
1430 char *resultword;
1431
1432 /* Jump to next word */
1433 /* No more data for this word but we
1434 are in sequential search because of
1435 the star (p is not null) */
1436 /* So, go for next word */
1437 DB_ReadNextWordInvertedIndex(sw, word, &resultword, &wordID, indexf->DB);
1438 if (! wordID)
1439 break; /* no more data */
1440
1441 efree(resultword); /* Do not need it (although might be useful for highlighting some day) */
1442 }
1443
1444 } while(1); /* continue on in loop for wildcard search */
1445
1446
1447
1448 if (p)
1449 {
1450 /* Finally, if we are in an sequential search merge all results */
1451 l_rp = mergeresulthashlist(sw, l_rp);
1452 }
1453
1454 DB_EndReadWords(sw, indexf->DB);
1455 return l_rp;
1456 }
1457
1458
1459 /*
1460 -- Rules checking
1461 -- u_is... = user rules (and, or, ...)
1462 -- is... = internal rules checking
1463 */
1464
1465
1466 /* Is a word a rule?
1467 */
1468
1469 int isrule(char *word)
1470 {
1471 int i;
1472 static char *ops[] = { /* internal ops... */
1473 AND_WORD, OR_WORD, NOT_WORD, PHRASE_WORD, AND_NOT_WORD,
1474 NULL
1475 };
1476
1477 for (i = 0; ops[i]; i++)
1478 {
1479 if (!strcmp(word, ops[i]))
1480 return 1;
1481 }
1482 return 0;
1483 }
1484
1485 int u_isrule(SWISH * sw, char *word)
1486 {
1487 LOGICAL_OP *op;
1488
1489 op = &(sw->SearchAlt->srch_op);
1490 if (!strcmp(word, op->and) || !strcmp(word, op->or) || !strcmp(word, op->not))
1491 return 1;
1492 else
1493 return 0;
1494 }
1495
1496 int isnotrule(char *word)
1497 {
1498 if (!strcmp(word, NOT_WORD))
1499 return 1;
1500 else
1501 return 0;
1502 }
1503
1504 int u_isnotrule(SWISH * sw, char *word)
1505 {
1506 if (!strcmp(word, sw->SearchAlt->srch_op.not))
1507 return 1;
1508 else
1509 return 0;
1510 }
1511
1512
1513 /* Is a word a boolean rule?
1514 */
1515
1516 int isbooleanrule(char *word)
1517 {
1518 if (!strcmp(word, AND_WORD) || !strcmp(word, OR_WORD) || !strcmp(word, PHRASE_WORD) || !strcmp(word, AND_NOT_WORD))
1519 return 1;
1520 else
1521 return 0;
1522 }
1523
1524 /* Is a word a unary rule?
1525 */
1526
1527 int isunaryrule(char *word)
1528 {
1529 if (!strcmp(word, NOT_WORD))
1530 return 1;
1531 else
1532 return 0;
1533 }
1534
1535 /* Return the number for a rule.
1536 */
1537
1538 int getrulenum(char *word)
1539 {
1540 if (!strcmp(word, AND_WORD))
1541 return AND_RULE;
1542 else if (!strcmp(word, OR_WORD))
1543 return OR_RULE;
1544 else if (!strcmp(word, NOT_WORD))
1545 return NOT_RULE;
1546 else if (!strcmp(word, PHRASE_WORD))
1547 return PHRASE_RULE;
1548 else if (!strcmp(word, AND_NOT_WORD))
1549 return AND_NOT_RULE;
1550 return NO_RULE;
1551 }
1552
1553
1554 /*
1555 -- Return selected RuleNumber for default rule.
1556 -- defined via current Swish Search Boolean OP Settings for user.
1557 */
1558
1559 int u_SelectDefaultRulenum(SWISH * sw, char *word)
1560 {
1561 if (!strcasecmp(word, sw->SearchAlt->srch_op.and))
1562 return AND_RULE;
1563 else if (!strcasecmp(word, sw->SearchAlt->srch_op.or))
1564 return OR_RULE;
1565 return NO_RULE;
1566 }
1567
1568
1569
1570 static void make_db_res_and_free(SWISH *sw,RESULT_LIST *l_res) {
1571 struct DB_RESULTS tmp;
1572 memset (&tmp,0,sizeof(struct DB_RESULTS));
1573 tmp.resultlist = l_res;
1574 freeresultlist(sw,&tmp);
1575 }
1576
1577
1578
1579 /* Takes two lists of results from searches and ANDs them together.
1580 ** On input, both result lists r1 and r2 must be sorted by filenum
1581 ** On output, the new result list remains sorted
1582 */
1583
1584 RESULT_LIST *andresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2, int andLevel)
1585 {
1586 RESULT_LIST *new_results_list = NULL;
1587 RESULT *r1;
1588 RESULT *r2;
1589 int res = 0;
1590
1591 /* patch provided by Mukund Srinivasan */
1592 if (l_r1 == NULL || l_r2 == NULL)
1593 {
1594 make_db_res_and_free(sw,l_r1);
1595 make_db_res_and_free(sw,l_r2);
1596 return NULL;
1597 }
1598
1599 if (andLevel < 1)
1600 andLevel = 1;
1601
1602 for (r1 = l_r1->head, r2 = l_r2->head; r1 && r2;)
1603 {
1604 res = r1->filenum - r2->filenum;
1605 if (!res)
1606 {
1607 /*
1608 * Computing the new rank is interesting because
1609 * we want to weight each of the words that was
1610 * previously ANDed equally along with the new word.
1611 * We compute a running average using andLevel and
1612 * simply scale up the old average (in r1->rank)
1613 * and recompute a new, equally weighted average.
1614 */
1615 int newRank = 0;
1616
1617 if(r1->rank == -1)
1618 r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1619
1620 if(r2->rank == -1)
1621 r2->rank = getrank( sw, r2->frequency, r1->tfrequency, r2->posdata, r2->indexf, r2->filenum );
1622
1623 newRank = ((r1->rank * andLevel) + r2->rank) / (andLevel + 1);
1624
1625
1626 if(!new_results_list)
1627 new_results_list = newResultsList(sw);
1628
1629 addtoresultlist(new_results_list, r1->filenum, newRank, 0, r1->frequency + r2->frequency, r1->indexf, sw);
1630
1631
1632 /* Storing all positions could be useful in the future */
1633
1634 CopyPositions(new_results_list->tail->posdata, 0, r1->posdata, 0, r1->frequency);
1635 CopyPositions(new_results_list->tail->posdata, r1->frequency, r2->posdata, 0, r2->frequency);
1636
1637
1638 r1 = r1->next;
1639 r2 = r2->next;
1640 }
1641
1642 else if (res > 0)
1643 {
1644 r2 = r2->next;
1645 }
1646 else
1647 {
1648 r1 = r1->next;
1649 }
1650 }
1651
1652 return new_results_list;
1653 }
1654
1655 /* Takes two lists of results from searches and ORs them together.
1656 2001-11 jmruiz Completely rewritten. Older one was really
1657 slow when the lists are very long
1658 On input, both result lists r1 and r2 must be sorted by filenum
1659 On output, the new result list remains sorted
1660
1661 rank is combined for matching files. That is,
1662 "foo OR bar" will rank files with both higher.
1663
1664 */
1665
1666
1667 RESULT_LIST *orresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2)
1668 {
1669 int rc;
1670 RESULT *r1;
1671 RESULT *r2;
1672 RESULT *rp,
1673 *tmp;
1674 RESULT_LIST *new_results_list = NULL;
1675
1676
1677 /* If either list is empty, just return the other */
1678 if (l_r1 == NULL)
1679 return l_r2;
1680
1681 else if (l_r2 == NULL)
1682 return l_r1;
1683
1684 /* Look for files that have both words, and add up the ranks */
1685
1686 r1 = l_r1->head;
1687 r2 = l_r2->head;
1688
1689 while(r1 && r2)
1690 {
1691 rc = r1->filenum - r2->filenum;
1692 if(rc < 0)
1693 {
1694 rp = r1;
1695 r1 = r1->next;
1696 }
1697 else if(rc > 0)
1698 {
1699 rp = r2;
1700 r2 = r2->next;
1701 }
1702
1703 else /* Matching file number */
1704 {
1705 int result_size;
1706
1707 /* Compute rank if not yet computed */
1708 if(r1->rank == -1)
1709 r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1710
1711 if(r2->rank == -1)
1712 r2->rank = getrank( sw, r2->frequency, r2->tfrequency, r2->posdata, r2->indexf, r2->filenum );
1713
1714
1715 /* Create a new RESULT - Should be a function to creeate this, I'd think */
1716
1717 result_size = sizeof(RESULT) + ( (r1->frequency + r2->frequency - 1) * sizeof(int) );
1718 rp = (RESULT *) Mem_ZoneAlloc(sw->Search->resultSearchZone, result_size );
1719 memset( rp, 0, result_size );
1720
1721
1722 rp->fi.filenum = rp->filenum = r1->filenum;
1723
1724 rp->rank = ( r1->rank + r2->rank) * 2; // bump up the or terms
1725 rp->tfrequency = 0;
1726 rp->frequency = r1->frequency + r2->frequency;
1727 rp->indexf = r1->indexf;
1728
1729
1730 /* save the combined position data in the new result. (Would freq ever be zero?) */
1731 if (r1->frequency)
1732 CopyPositions(rp->posdata, 0, r1->posdata, 0, r1->frequency);
1733
1734 if (r2->frequency)
1735 CopyPositions(rp->posdata, r1->frequency, r2->posdata, 0, r2->frequency);
1736
1737 r1 = r1->next;
1738 r2 = r2->next;
1739 }
1740
1741
1742 /* Now add the result to the output list */
1743
1744 if(!new_results_list)
1745 new_results_list = newResultsList(sw);
1746
1747 addResultToList(new_results_list,rp);
1748 }
1749
1750
1751 /* Add the remaining results */
1752
1753 tmp = r1 ? r1 : r2;
1754
1755 while(tmp)
1756 {
1757 rp = tmp;
1758 tmp = tmp->next;
1759 if(!new_results_list)
1760 new_results_list = newResultsList(sw);
1761
1762 addResultToList(new_results_list,rp);
1763 }
1764
1765 return new_results_list;
1766 }
1767
1768
1769 /* 2001-10 jmruiz - This code was originally at merge.c
1770 ** Also made it thread safe
1771 */
1772 /* These three routines are only used by notresultlist */
1773
1774 struct markentry
1775 {
1776 struct markentry *next;
1777 int num;
1778 };
1779
1780 /* This marks a number as having been printed.
1781 */
1782
1783 void marknum(SWISH *sw, struct markentry **markentrylist, int num)
1784 {
1785 unsigned hashval;
1786 struct markentry *mp;
1787
1788 mp = (struct markentry *) Mem_ZoneAlloc(sw->Search->resultSearchZone, sizeof(struct markentry));
1789
1790 mp->num = num;
1791
1792 hashval = bignumhash(num);
1793 mp->next = markentrylist[hashval];
1794 markentrylist[hashval] = mp;
1795 }
1796
1797
1798 /* Has a number been printed?
1799 */
1800
1801 int ismarked(struct markentry **markentrylist, int num)
1802 {
1803 unsigned hashval;
1804 struct markentry *mp;
1805
1806 hashval = bignumhash(num);
1807 mp = markentrylist[hashval];
1808
1809 while (mp != NULL)
1810 {
1811 if (mp->num == num)
1812 return 1;
1813 mp = mp->next;
1814 }
1815 return 0;
1816 }
1817
1818 /* Initialize the marking list.
1819 */
1820
1821 void initmarkentrylist(struct markentry **markentrylist)
1822 {
1823 int i;
1824
1825 for (i = 0; i < BIGHASHSIZE; i++)
1826 markentrylist[i] = NULL;
1827 }
1828
1829 void freemarkentrylist(struct markentry **markentrylist)
1830 {
1831 int i;
1832
1833 for (i = 0; i < BIGHASHSIZE; i++)
1834 {
1835 markentrylist[i] = NULL;
1836 }
1837 }
1838
1839 /* This performs the NOT unary operation on a result list.
1840 ** NOTed files are marked with a default rank of 1000.
1841 **
1842 ** Basically it returns all the files that have not been
1843 ** marked (GH)
1844 */
1845
1846 RESULT_LIST *notresultlist(SWISH * sw, RESULT_LIST * l_rp, IndexFILE * indexf)
1847 {
1848 int i,
1849 filenums;
1850 RESULT *rp;
1851 RESULT_LIST *new_results_list = NULL;
1852 struct markentry *markentrylist[BIGHASHSIZE];
1853
1854 if(!l_rp)
1855 rp = NULL;
1856 else
1857 rp = l_rp->head;
1858
1859 initmarkentrylist(markentrylist);
1860 while (rp != NULL)
1861 {
1862 marknum(sw, markentrylist, rp->filenum);
1863 rp = rp->next;
1864 }
1865
1866 filenums = indexf->header.totalfiles;
1867
1868 for (i = 1; i <= filenums; i++)
1869 {
1870 if (!ismarked(markentrylist, i))
1871 {
1872 if(!new_results_list)
1873 new_results_list = newResultsList(sw);
1874 addtoresultlist(new_results_list, i, 1000, 0, 0, indexf, sw);
1875 }
1876 }
1877
1878 freemarkentrylist(markentrylist);
1879
1880 new_results_list = sortresultsbyfilenum(new_results_list);
1881
1882 return new_results_list;
1883 }
1884
1885 /* Phrase result routine - see distance parameter. For phrase search this
1886 ** value must be 1 (consecutive words)
1887 **
1888 ** On input, both result lists r1 abd r2 must be sorted by filenum
1889 ** On output, the new result list remains sorted
1890 */
1891 RESULT_LIST *phraseresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2, int distance)
1892 {
1893 int i,
1894 j,
1895 found,
1896 newRank,
1897 *allpositions;
1898 int res = 0;
1899 RESULT_LIST *new_results_list = NULL;
1900 RESULT *r1, *r2;
1901
1902 if (l_r1 == NULL || l_r2 == NULL)
1903 return NULL;
1904
1905 for (r1 = l_r1->head, r2 = l_r2->head; r1 && r2;)
1906 {
1907 res = r1->filenum - r2->filenum;
1908 if (!res)
1909 {
1910 found = 0;
1911 allpositions = NULL;
1912 for (i = 0; i < r1->frequency; i++)
1913 {
1914 for (j = 0; j < r2->frequency; j++)
1915 {
1916 if ((GET_POSITION(r1->posdata[i]) + distance) == GET_POSITION(r2->posdata[j]))
1917 {
1918 found++;
1919 if (allpositions)
1920 allpositions = (int *) erealloc(allpositions, found * sizeof(int));
1921
1922 else
1923 allpositions = (int *) emalloc(found * sizeof(int));
1924
1925 allpositions[found - 1] = r2->posdata[j];
1926 break;
1927 }
1928 }
1929 }
1930 if (found)
1931 {
1932 /* Compute newrank */
1933 if(r1->rank == -1)
1934 r1->rank = getrank( sw, r1->frequency, r1->tfrequency, r1->posdata, r1->indexf, r1->filenum );
1935 if(r2->rank == -1)
1936 r2->rank = getrank( sw, r2->frequency, r1->tfrequency, r2->posdata, r2->indexf, r2->filenum );
1937
1938 newRank = (r1->rank + r2->rank) / 2;
1939 /*
1940 * Storing positions is neccesary for further
1941 * operations
1942 */
1943 if(!new_results_list)
1944 new_results_list = newResultsList(sw);
1945
1946 addtoresultlist(new_results_list, r1->filenum, newRank, 0, found, r1->indexf, sw);
1947
1948 CopyPositions(new_results_list->tail->posdata, 0, allpositions, 0, found);
1949 efree(allpositions);
1950 }
1951 r1 = r1->next;
1952 r2 = r2->next;
1953 }
1954 else if (res > 0)
1955 {
1956 r2 = r2->next;
1957 }
1958 else
1959 {
1960 r1 = r1->next;
1961 }
1962
1963 }
1964
1965 return new_results_list;
1966 }
1967
1968 /* Adds a file number and rank to a list of results.
1969 */
1970
1971
1972 void addtoresultlist(RESULT_LIST * l_rp, int filenum, int rank, int tfrequency, int frequency, IndexFILE * indexf, SWISH * sw)
1973 {
1974 RESULT *newnode;
1975 int result_size;
1976
1977 result_size = sizeof(RESULT) + ((frequency - 1) * sizeof(int));
1978 newnode = (RESULT *) Mem_ZoneAlloc(sw->Search->resultSearchZone, result_size );
1979 memset( newnode, 0, result_size );
1980
1981 newnode->fi.filenum = newnode->filenum = filenum;
1982
1983 newnode->rank = rank;
1984 newnode->tfrequency = tfrequency;
1985 newnode->frequency = frequency;
1986 newnode->indexf = indexf;
1987
1988 addResultToList(l_rp, newnode);
1989 }
1990
1991
1992
1993
1994 RESULT *SwishNext(SWISH * sw)
1995 {
1996 struct MOD_Search *srch = sw->Search;
1997 RESULT *res = NULL;
1998 RESULT *res2 = NULL;
1999 int rc;
2000 struct DB_RESULTS *db_results = NULL,
2001 *db_results_winner = NULL;
2002 int num;
2003
2004 if (srch->bigrank)
2005 num = 10000000 / srch->bigrank;
2006 else
2007 num = 10000;
2008
2009
2010 /* Seems like we should error here if there are no results */
2011 if ( !sw->Search->db_results )
2012 {
2013 set_progerr(SWISH_LISTRESULTS_EOF, sw, "Attempted to read results before searching");
2014 return NULL;
2015 }
2016
2017 /* Check for a unique index file */
2018 if (!sw->Search->db_results->next)
2019 {
2020 if ((res = sw->Search->db_results->currentresult))
2021 {
2022 /* Increase Pointer */
2023 sw->Search->db_results->currentresult = res->next;
2024 /* If rank was delayed, compute it now */
2025 if(res->rank == -1)
2026 res->rank = getrank( sw, res->frequency, res->tfrequency, res->posdata, res->indexf, res->filenum );
2027
2028 }
2029 }
2030
2031
2032 else /* tape merge to find the next one from all the index files */
2033
2034 {
2035 /* We have more than one index file - can't use pre-sorted index */
2036 /* Get the lower value */
2037 db_results_winner = sw->Search->db_results;
2038
2039 if ((res = db_results_winner->currentresult))
2040 {
2041 /* If rank was delayed, compute it now */
2042 if(res->rank == -1)
2043 res->rank = getrank( sw, res->frequency, res->tfrequency, res->posdata, res->indexf, res->filenum );
2044
2045 if ( !res->PropSort )
2046 res->PropSort = getResultSortProperties(sw, res);
2047 }
2048
2049
2050 for (db_results = sw->Search->db_results->next; db_results; db_results = db_results->next)
2051 {
2052 /* Any more results for this index? */
2053 if (!(res2 = db_results->currentresult))
2054 continue;
2055
2056 else
2057 {
2058 /* If rank was delayed, compute it now */
2059 if(res2->rank == -1)
2060 res2->rank = getrank( sw, res2->frequency, res2->tfrequency, res2->posdata, res2->indexf, res2->filenum );
2061
2062 if ( !res2->PropSort )
2063 res2->PropSort = getResultSortProperties(sw, res2);
2064 }
2065
2066 if (!res)
2067 {
2068 res = res2;
2069 db_results_winner = db_results;
2070 continue;
2071 }
2072
2073 rc = (int) compResultsByNonSortedProps(&res, &res2);
2074 if (rc < 0)
2075 {
2076 res = res2;
2077 db_results_winner = db_results;
2078 }
2079 }
2080
2081 if ((res = db_results_winner->currentresult))
2082 db_results_winner->currentresult = res->next;
2083 }
2084
2085
2086
2087 /* Normalize rank */
2088 if (res)
2089 {
2090 res->rank = (int) (res->rank * num)/10000;
2091 if (res->rank >= 999)
2092 res->rank = 1000;
2093 else if (res->rank < 1)
2094 res->rank = 1;
2095 }
2096 else
2097 {
2098 // it's expected to just return null on end of list.
2099 // sw->lasterror = SWISH_LISTRESULTS_EOF;
2100 }
2101
2102
2103 return res;
2104
2105 }
2106
2107
2108
2109
2110
2111 /* Checks if the next word is "="
2112 */
2113
2114 int isMetaNameOpNext(struct swline *searchWord)
2115 {
2116 if (searchWord == NULL)
2117 return 0;
2118 if (!strcmp(searchWord->line, "="))
2119 return 1;
2120 return 0;
2121 }
2122
2123 /* funtion to free all memory of a list of results */
2124 void freeresultlist(SWISH * sw, struct DB_RESULTS *dbres)
2125 {
2126 RESULT *rp;
2127 RESULT *tmp;
2128
2129 #ifdef DUMP_RESULTS
2130 dump_result_lists( sw, "In freeresultlist()" );
2131 #endif
2132
2133 if(dbres->resultlist)
2134 rp = dbres->resultlist->head;
2135 else
2136 rp = NULL;
2137
2138 while (rp)
2139 {
2140 tmp = rp->next;
2141 freeresult(sw, rp);
2142 rp = tmp;
2143 }
2144 dbres->resultlist = NULL;
2145 dbres->currentresult = NULL;
2146 dbres->sortresultlist = NULL;
2147 }
2148
2149 /* funtion to free the memory of one result */
2150 void freeresult(SWISH * sw, RESULT * rp)
2151 {
2152 int i;
2153
2154 if (rp)
2155 {
2156 freefileinfo( &rp->fi ); // may have already been freed
2157
2158 if (sw->ResultSort->numPropertiesToSort && rp->PropSort)
2159 {
2160 for (i = 0; i < sw->ResultSort->numPropertiesToSort; i++)
2161 efree(rp->PropSort[i]);
2162
2163 /* For better performance with thousands of results this is stored in a MemZone
2164 in result_sort.c module
2165 efree(rp->PropSort);
2166 */
2167
2168 }
2169 /* For better performance with thousands of results this is stored in a MemZone
2170 in result_sort.c module
2171
2172 if (sw->ResultSort->numPropertiesToSort && rp->iPropSort)
2173 {
2174 efree(rp->iPropSort);
2175 }
2176 */
2177 /* For better performance store results in a MemZone
2178
2179 efree(rp);
2180 */
2181 }
2182 }
2183
2184
2185
2186 /*
2187 06/00 Jose Ruiz - Sort results by filenum
2188 Uses an array and qsort for better performance
2189 Used for faster "and" and "phrase" of results
2190 */
2191 RESULT_LIST *sortresultsbyfilenum(RESULT_LIST * l_rp)
2192 {
2193 int i,
2194 j;
2195 RESULT **ptmp;
2196 RESULT *rp;
2197
2198 /* Very trivial case */
2199 if (!l_rp)
2200 return NULL;
2201
2202
2203 /* Compute results */
2204 for (i = 0, rp = l_rp->head; rp; rp = rp->next, i++);
2205 /* Another very trivial case */
2206 if (i == 1)
2207 return l_rp;
2208 /* Compute array size */
2209 ptmp = (void *) emalloc(i * sizeof(RESULT *));
2210 /* Build an array with the elements to compare
2211 and pointers to data */
2212 for (j = 0, rp = l_rp->head; rp; rp = rp->next)
2213 ptmp[j++] = rp;
2214 /* Sort them */
2215 swish_qsort(ptmp, i, sizeof(RESULT *), &compResultsByFileNum);
2216 /* Build the list */
2217 for (j = 0, rp = NULL; j < i; j++)
2218 {
2219 if (!rp)
2220 l_rp->head = ptmp[j];
2221 else
2222 rp->next = ptmp[j];
2223 rp = ptmp[j];
2224 }
2225 rp->next = NULL;
2226 l_rp->tail = rp;
2227
2228 /* Free the memory of the array */
2229 efree(ptmp);
2230
2231 return l_rp;
2232 }
2233
2234
2235 /* 06/00 Jose Ruiz
2236 ** returns all results in r1 that not contains r2
2237 **
2238 ** On input, both result lists r1 and r2 must be sorted by filenum
2239 ** On output, the new result list remains sorted
2240 */
2241 RESULT_LIST *notresultlists(SWISH * sw, RESULT_LIST * l_r1, RESULT_LIST * l_r2)
2242 {
2243 RESULT *rp, *r1, *r2;
2244 RESULT_LIST *new_results_list = NULL;
2245 int res = 0;
2246
2247 if (!l_r1)
2248 return NULL;
2249 if (l_r1 && !l_r2)
2250 return l_r1;
2251
2252 for (r1 = l_r1->head, r2 = l_r2->head; r1 && r2;)
2253 {
2254 res = r1->filenum - r2->filenum;
2255 if (res < 0)
2256 {
2257 /*
2258 * Storing all positions could be useful
2259 * in the future
2260 */
2261
2262 rp = r1;
2263 r1 = r1->next;
2264 if(!new_results_list)
2265 new_results_list = newResultsList(sw);
2266 addResultToList(new_results_list,rp);
2267 }
2268 else if (res > 0)
2269 {
2270 r2 = r2->next;
2271 }
2272 else
2273 {
2274 r1 = r1->next;
2275 r2 = r2->next;
2276 }
2277 }
2278 /* Add remaining results */
2279 while (r1)
2280 {
2281 rp = r1;
2282 r1 = r1->next;
2283 if(!new_results_list)
2284 new_results_list = newResultsList(sw);
2285 addResultToList(new_results_list,rp);
2286 }
2287
2288 return new_results_list;
2289 }
2290
2291
2292 /******************************************************************************
2293 * Remove the stop words from the tokenized query
2294 * rewritten Nov 24, 2001 - moseley
2295 * Still horrible! Need a real parse tree.
2296 *******************************************************************************/
2297
2298
2299 struct swline *ignore_words_in_query(SWISH * sw, IndexFILE * indexf, struct swline *searchwordlist, unsigned char phrase_delimiter)
2300 {
2301 struct swline *cur_token = searchwordlist;
2302 struct swline *prev_token = NULL;
2303 struct swline *prev_prev_token = NULL; // for removing two things
2304 int in_phrase = 0;
2305 int word_count = 0; /* number of search words found */
2306 int paren_count = 0;
2307 struct MOD_Search *srch = sw->Search;
2308
2309 srch->removed_stopwords = 0;
2310
2311
2312
2313 while ( cur_token )
2314 {
2315 int remove = 0;
2316 char first_char = cur_token->line[0];
2317
2318 if ( cur_token == searchwordlist )
2319 {
2320 prev_token = prev_prev_token = NULL;
2321 word_count = 0;
2322 paren_count = 0;
2323 in_phrase = 0;
2324 }
2325
2326 while ( 1 ) // so we can use break.
2327 {
2328
2329 /* Can't backslash here -- (because this code should really be include in swish_words.c) */
2330
2331 if ( first_char == phrase_delimiter )
2332 {
2333 in_phrase = !in_phrase;
2334
2335 if ( !in_phrase && prev_token && prev_token->line[0] == phrase_delimiter )
2336 remove = 2;
2337
2338 break;
2339 }
2340
2341
2342 /* leave everything alone inside a pharse */
2343
2344 if ( in_phrase )
2345 {
2346 if (isstopword(&indexf->header, cur_token->line))
2347 {
2348 srch->removed_stopwords++;
2349 resultHeaderOut(sw, 1, "# Removed stopword: %s\n", cur_token->line);
2350 remove = 1;
2351 }
2352 else
2353 word_count++;
2354
2355 break;
2356 }
2357
2358
2359 /* Allow operators */
2360
2361 if ( first_char == '=' )
2362 break;
2363
2364 if ( first_char == '(' )
2365 {
2366 paren_count++;
2367 break;
2368 }
2369
2370 if ( first_char == ')' )
2371 {
2372 paren_count--;
2373
2374 if ( prev_token && prev_token->line[0] == '(' )
2375 remove = 2;
2376
2377 break;
2378 }
2379
2380
2381
2382 /* Allow all metanames */
2383
2384 if ( isMetaNameOpNext(cur_token->next) )
2385 break;
2386
2387
2388
2389 /* Look for AND OR NOT - remove AND OR at start, and remove second of doubles */
2390
2391 if ( u_isrule(sw, cur_token->line) )
2392 {
2393 if ( prev_token )
2394 {
2395 /* remove double tokens */
2396 if ( u_isrule( sw, prev_token->line ) )
2397 remove = 1;
2398 }
2399 /* allow NOT at the start */
2400 else if ( !u_isnotrule(sw, cur_token->line) )
2401 remove = 1;
2402
2403 break;
2404 }
2405
2406
2407
2408 /* Finally, is it a stop word? */
2409
2410 if ( isstopword(&indexf->header, cur_token->line) )
2411 {
2412 srch->removed_stopwords++;
2413 resultHeaderOut(sw, 1, "# Removed stopword: %s\n", cur_token->line);
2414 remove = 1;
2415 }
2416 else
2417 word_count++;
2418
2419
2420
2421 break;
2422 }
2423
2424
2425 /* Catch dangling metanames */
2426 if ( !remove && !cur_token->next && isMetaNameOpNext( cur_token ) )
2427 remove = 2;
2428
2429
2430 if ( remove )
2431 {
2432 struct swline *tmp = cur_token;
2433
2434 if ( cur_token == searchwordlist ) // we are removing first token
2435 searchwordlist = cur_token->next;
2436 else
2437 {
2438 prev_token->next = cur_token->next; // remove one in the middle
2439 cur_token = prev_token; // save if remove == 2
2440 }
2441
2442
2443 efree( tmp->line );
2444 efree( tmp );
2445
2446 if ( remove == 2 )
2447 {
2448 tmp = cur_token;
2449
2450 if ( cur_token == searchwordlist ) // we are removing first token
2451 searchwordlist = cur_token->next;
2452 else
2453 prev_prev_token->next = cur_token->next; // remove one in the middle
2454
2455 efree( tmp->line );
2456 efree( tmp );
2457 }
2458
2459
2460 /* start at the beginning again */
2461
2462 cur_token = searchwordlist;
2463 continue;
2464 }
2465
2466 if ( prev_token )
2467 prev_prev_token = prev_token;
2468
2469 prev_token = cur_token;
2470 cur_token = cur_token->next;
2471 }
2472
2473
2474 if ( in_phrase || paren_count )
2475 sw->lasterror = QUERY_SYNTAX_ERROR;
2476
2477 else if ( !word_count )
2478 sw->lasterror = srch->removed_stopwords ? WORDS_TOO_COMMON : NO_WORDS_IN_SEARCH;
2479
2480 return searchwordlist;
2481
2482 }
2483
2484
2485
2486 /* Adds a file number to a hash table of results.
2487 ** If the entry's already there, add the ranks,
2488 ** else make a new entry.
2489 **
2490 ** Jose Ruiz 04/00
2491 ** For better performance in large "or"
2492 ** keep the lists sorted by filenum
2493 **
2494 ** Jose Ruiz 2001/11 Rewritten to get better performance
2495 */
2496 RESULT_LIST *mergeresulthashlist(SWISH *sw, RESULT_LIST *l_r)
2497 {
2498 unsigned hashval;
2499 RESULT *r,
2500 *rp,
2501 *tmp,
2502 *next,
2503 *start,
2504 *newnode = NULL;
2505 RESULT_LIST *new_results_list = NULL;
2506 int i,
2507 tot_frequency,
2508 pos_off,
2509 filenum;
2510
2511 if(!l_r)
2512 return NULL;
2513
2514 if(!l_r->head)
2515 return NULL;
2516
2517 /* Init hash table */
2518 for (i = 0; i < BIGHASHSIZE; i++)
2519 sw->Search->resulthashlist[i] = NULL;
2520
2521 for(r = l_r->head, next = NULL; r; r =next)
2522 {
2523 next = r->next;
2524
2525 tmp = NULL;
2526 hashval = bignumhash(r->filenum);
2527
2528 rp = sw->Search->resulthashlist[hashval];
2529 for(tmp = NULL; rp; )
2530 {
2531 if (r->filenum <= rp->filenum)
2532 {
2533 break;
2534 }
2535 tmp = rp;
2536 rp = rp->next;
2537 }
2538 if (tmp)
2539 {
2540 tmp->next = r;
2541 }
2542 else
2543 {
2544 sw->Search->resulthashlist[hashval] = r;
2545 }
2546 r->next = rp;
2547 }
2548
2549 /* Now coalesce reptitive filenums */
2550 for (i = 0; i < BIGHASHSIZE; i++)
2551 {
2552 rp = sw->Search->resulthashlist[i];
2553 for (filenum = 0, start = NULL; ; )
2554 {
2555 if(rp)
2556 next = rp->next;
2557 if(!rp || rp->filenum != filenum)
2558 {
2559 /* Start of new block, coalesce previous results */
2560 if(filenum)
2561 {
2562 int result_size;
2563
2564 for(tmp = start, tot_frequency = 0; tmp!=rp; tmp = tmp->next)
2565 {
2566 tot_frequency += tmp->frequency;
2567 }
2568
2569 result_size = sizeof(RESULT) + ((tot_frequency - 1) * sizeof(int));
2570 newnode = (RESULT *) Mem_ZoneAlloc(sw->Search->resultSearchZone, result_size );
2571 memset( newnode, 0, result_size );
2572
2573 newnode->fi.filenum = newnode->filenum = filenum;
2574 newnode->rank = 0;
2575 newnode->tfrequency = 0;
2576 newnode->frequency = tot_frequency;
2577 newnode->indexf = start->indexf;
2578
2579 for(tmp = start, pos_off = 0; tmp!=rp; tmp = tmp->next)
2580 {
2581 /* Compute rank if not yet computed */
2582 if(tmp->rank == -1)
2583 tmp->rank = getrank( sw, tmp->frequency, tmp->tfrequency, tmp->posdata, tmp->indexf, tmp->filenum );
2584
2585 newnode->rank += tmp->rank;
2586 if (tmp->frequency)
2587 {
2588 CopyPositions(newnode->posdata, pos_off, tmp->posdata, 0, tmp->frequency);
2589 pos_off += tmp->frequency;
2590 }
2591
2592 }
2593 /* Add at the end of new_results_list */
2594 if(!new_results_list)
2595 {
2596 new_results_list = newResultsList(sw);
2597 }
2598 addResultToList(new_results_list,newnode);
2599 /* Sort positions */
2600 swish_qsort(newnode->posdata,newnode->frequency,sizeof(int),&icomp_posdata);
2601 }
2602 if(rp)
2603 filenum = rp->filenum;
2604 start = rp;
2605 }
2606 if(!rp)
2607 break;
2608 rp = next;
2609 }
2610 }
2611 /* Sort results by filenum and return */
2612 return sortresultsbyfilenum(new_results_list);
2613 }
2614

  ViewVC Help
Powered by ViewVC 1.1.22