1 |
adcroft |
1.1 |
/* |
2 |
|
|
$Id: index.c,v 1.194 2002/08/29 13:59:48 jmruiz Exp $ |
3 |
|
|
** |
4 |
|
|
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
5 |
|
|
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
6 |
|
|
** |
7 |
|
|
** This program and library is free software; you can redistribute it and/or |
** modify it under the terms of the GNU (Library) General Public License |
8 |
|
|
** as published by the Free Software Foundation; either version 2 |
9 |
|
|
** of the License, or any later version. |
10 |
|
|
** |
11 |
|
|
** This program is distributed in the hope that it will be useful, |
12 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 |
|
|
** GNU (Library) General Public License for more details. |
15 |
|
|
** |
16 |
|
|
** You should have received a copy of the GNU (Library) General Public License |
17 |
|
|
** along with this program; if not, write to the Free Software |
18 |
|
|
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
19 |
|
|
**-------------------------------------------------------------------- |
20 |
|
|
** ** ** PATCHED 5/13/96, CJC |
21 |
|
|
** |
22 |
|
|
** Added code to countwords and countwordstr to disregard the last char |
23 |
|
|
** if required by the config.h |
24 |
|
|
** G. Hill 3/12/97 ghill@library.berkeley.edu |
25 |
|
|
** |
26 |
|
|
** Changed addentry, countwords, countwordstr, parsecomment, rintindex |
27 |
|
|
** added createMetaEntryList, getMeta, parseMetaData |
28 |
|
|
** to support METADATA |
29 |
|
|
** G. Hill 3/18/97 ghill@library.berkeley.edu |
30 |
|
|
** |
31 |
|
|
** Changed removestops to support printing of stop words |
32 |
|
|
** G. Hill 4/7/97 |
33 |
|
|
** |
34 |
|
|
** Changed countwords, countwordstr, and parseMetaData to disregard the |
35 |
|
|
** first char if required by the config.h |
36 |
|
|
** G.Hill 10/16/97 ghill@library.berkeley.edu |
37 |
|
|
** |
38 |
|
|
** Added stripIgnoreLastChars and isIgnoreLastChar routines which iteratively |
39 |
|
|
** remove all ignore characters from the end of each word. |
40 |
|
|
** P. Bergner 10/5/97 bergner@lcse.umn.edu |
41 |
|
|
** |
42 |
|
|
** Added stripIgnoreFirstChars and isIgnoreFirstChar to make stripping of |
43 |
|
|
** the ignore first chars iterative. |
44 |
|
|
** G. Hill 11/19/97 ghill@library.berkeley.edu |
45 |
|
|
** |
46 |
|
|
** Added possibility of use of quotes and brackets in meta CONTENT countwords, parsemetadata |
47 |
|
|
** G. Hill 1/14/98 |
48 |
|
|
** |
49 |
|
|
** Added regex for replace rule G.Hill 1/98 |
50 |
|
|
** |
51 |
|
|
** REQMETANAME - don't index meta tags not specified in MetaNames |
52 |
|
|
** 10/11/99 - Bill Moseley |
53 |
|
|
** |
54 |
|
|
** change sprintf to snprintf to avoid corruption, use MAXPROPLEN instead of literal "20", |
55 |
|
|
** added include of merge.h - missing declaration caused compile error in prototypes, |
56 |
|
|
** added word length arg to Stem() call for strcat overflow checking in stemmer.c |
57 |
|
|
** added safestrcpy() macro to avoid corruption from strcpy overflow |
58 |
|
|
** SRE 11/17/99 |
59 |
|
|
** |
60 |
|
|
** fixed misc problems pointed out by "gcc -Wall" |
61 |
|
|
** SRE 2/22/00 |
62 |
|
|
** |
63 |
|
|
** Added code for storing word positions in index file |
64 |
|
|
** Jose Ruiz 3/00 jmruiz@boe.es |
65 |
|
|
** |
66 |
|
|
** 04/00 - Jose Ruiz |
67 |
|
|
** Added code for a hash table in index file for searching words |
68 |
|
|
** via getfileinfo in search.c (Lots of addons). Better performance |
69 |
|
|
** with big databases and/or heavy searches (a* or b* or c*) |
70 |
|
|
** |
71 |
|
|
** 04/00 - Jose Ruiz |
72 |
|
|
** Improved number compression function (compress) |
73 |
|
|
** New number decompress function |
74 |
|
|
** Both converted into macros for better performance |
75 |
|
|
** |
76 |
|
|
** 07/00 and 08/00 - Jose Ruiz |
77 |
|
|
** Many modifications to make some functions thread safe |
78 |
|
|
** |
79 |
|
|
** 08/00 - Jose Ruiz |
80 |
|
|
** New function indexstring. Up to now there were 4 functions doing almost |
81 |
|
|
** the same thing: countwords, countwordstr, parseMetaData and parsecomment |
82 |
|
|
** From now on, these 4 functions call indexstring, which is the common part |
83 |
|
|
** to all of them. In fact, countwordstr, parseMetaData and parsecomment |
84 |
|
|
** are now simple frontends to indexstring |
85 |
|
|
** |
86 |
|
|
** 2000-11 - rasc |
87 |
|
|
** some redesign, place common index code into a common routine |
88 |
|
|
** FileProp structures, routines |
89 |
|
|
** |
90 |
|
|
** -- |
91 |
|
|
** TODO |
92 |
|
|
** $$ there is still some redesign to be done. |
93 |
|
|
** $$ swish-e was originally designed to index html only. So the routines |
94 |
|
|
** $$ are for historical reasons scattered |
95 |
|
|
** $$ (e.g. isoktitle (), is ishtml() etc.) |
96 |
|
|
** |
97 |
|
|
** 2000-12 Jose Ruiz |
98 |
|
|
** obsolete routine ishtml removed |
99 |
|
|
** isoktitle moved to html.c |
100 |
|
|
** |
101 |
|
|
** 2001-03-02 rasc Header: write translatecharacters |
102 |
|
|
** 2001-03-14 rasc resultHeaderOutput -H n |
103 |
|
|
** 2001-03-24 rasc timeroutines rearranged |
104 |
|
|
** 2001-06-08 wsm Store word after ENTRY to save memory |
105 |
|
|
** 2001-08 jmruiz All locations stuff rewritten to save memory |
106 |
|
|
** |
107 |
|
|
*/ |
108 |
|
|
|
109 |
|
|
#include "swish.h" |
110 |
|
|
#include "mem.h" |
111 |
|
|
#include "string.h" |
112 |
|
|
#include "index.h" |
113 |
|
|
#include "hash.h" |
114 |
|
|
#include "check.h" |
115 |
|
|
#include "search.h" |
116 |
|
|
#include "merge.h" |
117 |
|
|
#include "docprop.h" |
118 |
|
|
#include "stemmer.h" |
119 |
|
|
#include "soundex.h" |
120 |
|
|
#include "double_metaphone.h" |
121 |
|
|
#include "error.h" |
122 |
|
|
#include "file.h" |
123 |
|
|
#include "compress.h" |
124 |
|
|
/* Removed due to problems with patents |
125 |
|
|
#include "deflate.h" |
126 |
|
|
*/ |
127 |
|
|
#include "html.h" |
128 |
|
|
#include "xml.h" |
129 |
|
|
#include "parser.h" |
130 |
|
|
#include "txt.h" |
131 |
|
|
#include "metanames.h" |
132 |
|
|
#include "result_sort.h" |
133 |
|
|
#include "result_output.h" |
134 |
|
|
#include "filter.h" |
135 |
|
|
#include "date_time.h" |
136 |
|
|
#include "db.h" |
137 |
|
|
#include "dump.h" |
138 |
|
|
#include "swish_qsort.h" |
139 |
|
|
|
140 |
|
|
static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties ); |
141 |
|
|
|
142 |
|
|
|
143 |
|
|
|
144 |
|
|
/* |
145 |
|
|
-- init structures for this module |
146 |
|
|
*/ |
147 |
|
|
|
148 |
|
|
|
149 |
|
|
void initModule_Index (SWISH *sw) |
150 |
|
|
{ |
151 |
|
|
int i; |
152 |
|
|
struct MOD_Index *idx; |
153 |
|
|
|
154 |
|
|
idx = (struct MOD_Index *) emalloc(sizeof(struct MOD_Index)); |
155 |
|
|
memset( idx, 0, sizeof( struct MOD_Index ) ); |
156 |
|
|
sw->Index = idx; |
157 |
|
|
|
158 |
|
|
idx->filenum = 0; |
159 |
|
|
idx->entryArray = NULL; |
160 |
|
|
|
161 |
|
|
idx->len_compression_buffer = MAXSTRLEN; /* For example */ |
162 |
|
|
idx->compression_buffer=(unsigned char *)emalloc(idx->len_compression_buffer); |
163 |
|
|
|
164 |
|
|
idx->len_worddata_buffer = MAXSTRLEN; /* For example */ |
165 |
|
|
idx->worddata_buffer=(unsigned char *)emalloc(idx->len_worddata_buffer); |
166 |
|
|
idx->sz_worddata_buffer = 0; |
167 |
|
|
|
168 |
|
|
/* Init entries hash table */ |
169 |
|
|
for (i=0; i<VERYBIGHASHSIZE; i++) |
170 |
|
|
{ |
171 |
|
|
idx->hashentries[i] = NULL; |
172 |
|
|
idx->hashentriesdirty[i] = 0; |
173 |
|
|
} |
174 |
|
|
|
175 |
|
|
|
176 |
|
|
/* Economic flag and temp files*/ |
177 |
|
|
idx->swap_locdata = SWAP_LOC_DEFAULT; |
178 |
|
|
|
179 |
|
|
|
180 |
|
|
for(i=0;i<BIGHASHSIZE;i++) idx->inode_hash[i]=NULL; |
181 |
|
|
|
182 |
|
|
/* initialize buffers used by indexstring */ |
183 |
|
|
idx->word = (char *) emalloc((idx->lenword = MAXWORDLEN) + 1); |
184 |
|
|
idx->swishword = (char *) emalloc((idx->lenswishword = MAXWORDLEN) + 1); |
185 |
|
|
|
186 |
|
|
idx->plimit=PLIMIT; |
187 |
|
|
idx->flimit=FLIMIT; |
188 |
|
|
idx->nIgnoreLimitWords = 0; |
189 |
|
|
idx->IgnoreLimitPositionsArray = NULL; |
190 |
|
|
|
191 |
|
|
/* Swapping access file functions */ |
192 |
|
|
idx->swap_tell = ftell; |
193 |
|
|
idx->swap_write = fwrite; |
194 |
|
|
idx->swap_close = fclose; |
195 |
|
|
idx->swap_seek = fseek; |
196 |
|
|
idx->swap_read = fread; |
197 |
|
|
idx->swap_getc = fgetc; |
198 |
|
|
idx->swap_putc = fputc; |
199 |
|
|
|
200 |
|
|
for( i = 0; i <MAX_LOC_SWAP_FILES ; i++) |
201 |
|
|
{ |
202 |
|
|
idx->swap_location_name[i] = NULL; |
203 |
|
|
idx->fp_loc_write[i] = NULL; |
204 |
|
|
idx->fp_loc_read[i] = NULL; |
205 |
|
|
} |
206 |
|
|
/* Index in blocks of chunk_size documents */ |
207 |
|
|
idx->chunk_size = INDEX_DEFAULT_CHUNK_SIZE; |
208 |
|
|
|
209 |
|
|
/* Use this value to avoid using big zones just as a temporary location storage */ |
210 |
|
|
idx->optimalChunkLocZoneSize = INDEX_DEFAULT_OPTIMAL_CHUNK_ZONE_SIZE_FOR_LOCATIONS; |
211 |
|
|
|
212 |
|
|
idx->freeLocMemChain = NULL; |
213 |
|
|
|
214 |
|
|
/* memory zones for common structures */ |
215 |
|
|
idx->perDocTmpZone = Mem_ZoneCreate("Per Doc Temporal Zone", 0, 0); |
216 |
|
|
idx->currentChunkLocZone = Mem_ZoneCreate("Current Chunk Locators", 0, 0); |
217 |
|
|
idx->totalLocZone = Mem_ZoneCreate("All Locators", 0, 0); |
218 |
|
|
idx->entryZone = Mem_ZoneCreate("struct ENTRY", 0, 0); |
219 |
|
|
|
220 |
|
|
/* table for storing which metaIDs to index */ |
221 |
|
|
idx->metaIDtable.max = 200; /* totally random guess */ |
222 |
|
|
idx->metaIDtable.num = 0; |
223 |
|
|
idx->metaIDtable.array = (int *)emalloc( idx->metaIDtable.max * sizeof(int) ); |
224 |
|
|
idx->metaIDtable.defaultID = -1; |
225 |
|
|
|
226 |
|
|
|
227 |
|
|
/* $$$ this is only a fix while http.c and httpserver.c still exist */ |
228 |
|
|
idx->tmpdir = estrdup("."); |
229 |
|
|
|
230 |
|
|
return; |
231 |
|
|
} |
232 |
|
|
|
233 |
|
|
|
234 |
|
|
/* |
235 |
|
|
-- release all wired memory for this module |
236 |
|
|
-- 2001-04-11 rasc |
237 |
|
|
*/ |
238 |
|
|
|
239 |
|
|
void freeModule_Index (SWISH *sw) |
240 |
|
|
{ |
241 |
|
|
struct MOD_Index *idx = sw->Index; |
242 |
|
|
int i; |
243 |
|
|
|
244 |
|
|
/* we need to call the real free here */ |
245 |
|
|
|
246 |
|
|
for( i = 0; i < MAX_LOC_SWAP_FILES ; i++) |
247 |
|
|
{ |
248 |
|
|
if (idx->swap_location_name[i] && isfile(idx->swap_location_name[i])) |
249 |
|
|
{ |
250 |
|
|
if (idx->fp_loc_read[i]) |
251 |
|
|
idx->swap_close(idx->fp_loc_read[i]); |
252 |
|
|
|
253 |
|
|
if (idx->fp_loc_write[i]) |
254 |
|
|
idx->swap_close(idx->fp_loc_write[i]); |
255 |
|
|
|
256 |
|
|
remove(idx->swap_location_name[i]); |
257 |
|
|
} |
258 |
|
|
|
259 |
|
|
|
260 |
|
|
if (idx->swap_location_name[i]) |
261 |
|
|
efree(idx->swap_location_name[i]); |
262 |
|
|
} |
263 |
|
|
|
264 |
|
|
if(idx->tmpdir) efree(idx->tmpdir); |
265 |
|
|
|
266 |
|
|
/* Free compression buffer */ |
267 |
|
|
efree(idx->compression_buffer); |
268 |
|
|
/* free worddata buffer */ |
269 |
|
|
efree(idx->worddata_buffer); |
270 |
|
|
|
271 |
|
|
/* free word buffers used by indexstring */ |
272 |
|
|
efree(idx->word); |
273 |
|
|
efree(idx->swishword); |
274 |
|
|
|
275 |
|
|
/* free IgnoreLimit stuff */ |
276 |
|
|
if(idx->IgnoreLimitPositionsArray) |
277 |
|
|
{ |
278 |
|
|
for(i=0; i<sw->indexlist->header.totalfiles; i++) |
279 |
|
|
{ |
280 |
|
|
if(idx->IgnoreLimitPositionsArray[i]) |
281 |
|
|
{ |
282 |
|
|
efree(idx->IgnoreLimitPositionsArray[i]->pos); |
283 |
|
|
efree(idx->IgnoreLimitPositionsArray[i]); |
284 |
|
|
} |
285 |
|
|
} |
286 |
|
|
efree(idx->IgnoreLimitPositionsArray); |
287 |
|
|
} |
288 |
|
|
|
289 |
|
|
/* should be free by now!!! But just in case... */ |
290 |
|
|
if (idx->entryZone) |
291 |
|
|
Mem_ZoneFree(&idx->entryZone); |
292 |
|
|
|
293 |
|
|
if (idx->totalLocZone) |
294 |
|
|
Mem_ZoneFree(&idx->totalLocZone); |
295 |
|
|
if (idx->currentChunkLocZone) |
296 |
|
|
Mem_ZoneFree(&idx->currentChunkLocZone); |
297 |
|
|
if (idx->perDocTmpZone) |
298 |
|
|
Mem_ZoneFree(&idx->perDocTmpZone); |
299 |
|
|
|
300 |
|
|
|
301 |
|
|
if ( idx->entryArray ) |
302 |
|
|
efree( idx->entryArray); |
303 |
|
|
|
304 |
|
|
|
305 |
|
|
efree( idx->metaIDtable.array ); |
306 |
|
|
|
307 |
|
|
/* free module data */ |
308 |
|
|
efree (idx); |
309 |
|
|
sw->Index = NULL; |
310 |
|
|
|
311 |
|
|
|
312 |
|
|
return; |
313 |
|
|
} |
314 |
|
|
|
315 |
|
|
|
316 |
|
|
/* |
317 |
|
|
** ---------------------------------------------- |
318 |
|
|
** |
319 |
|
|
** Module config code starts here |
320 |
|
|
** |
321 |
|
|
** ---------------------------------------------- |
322 |
|
|
*/ |
323 |
|
|
|
324 |
|
|
|
325 |
|
|
/* |
326 |
|
|
-- Config Directives |
327 |
|
|
-- Configuration directives for this Module |
328 |
|
|
-- return: 0/1 = none/config applied |
329 |
|
|
*/ |
330 |
|
|
|
331 |
|
|
int configModule_Index (SWISH *sw, StringList *sl) |
332 |
|
|
|
333 |
|
|
{ |
334 |
|
|
struct MOD_Index *idx = sw->Index; |
335 |
|
|
char *w0 = sl->word[0]; |
336 |
|
|
int retval = 1; |
337 |
|
|
char *env_tmp = NULL; |
338 |
|
|
|
339 |
|
|
if (strcasecmp(w0, "tmpdir") == 0) |
340 |
|
|
{ |
341 |
|
|
if (sl->n == 2) |
342 |
|
|
{ |
343 |
|
|
idx->tmpdir = erealloc( idx->tmpdir, strlen( sl->word[1] ) + 1 ); |
344 |
|
|
strcpy( idx->tmpdir, sl->word[1] ); |
345 |
|
|
normalize_path( idx->tmpdir ); |
346 |
|
|
|
347 |
|
|
if (!isdirectory(idx->tmpdir)) |
348 |
|
|
progerr("%s: %s is not a directory", w0, idx->tmpdir); |
349 |
|
|
|
350 |
|
|
if ( !( env_tmp = getenv("TMPDIR")) ) |
351 |
|
|
if ( !(env_tmp = getenv("TMP")) ) |
352 |
|
|
env_tmp = getenv("TEMP"); |
353 |
|
|
|
354 |
|
|
if ( env_tmp ) |
355 |
|
|
progwarn("Configuration setting for TmpDir '%s' will be overridden by environment setting '%s'", idx->tmpdir, env_tmp ); |
356 |
|
|
|
357 |
|
|
|
358 |
|
|
} |
359 |
|
|
else |
360 |
|
|
progerr("%s: requires one value", w0); |
361 |
|
|
} |
362 |
|
|
else if (strcasecmp(w0, "IgnoreLimit") == 0) |
363 |
|
|
{ |
364 |
|
|
if (sl->n == 3) |
365 |
|
|
{ |
366 |
|
|
idx->plimit = atol(sl->word[1]); |
367 |
|
|
idx->flimit = atol(sl->word[2]); |
368 |
|
|
} |
369 |
|
|
else |
370 |
|
|
progerr("%s: requires two values", w0); |
371 |
|
|
} |
372 |
|
|
else |
373 |
|
|
{ |
374 |
|
|
retval = 0; /* not a module directive */ |
375 |
|
|
} |
376 |
|
|
return retval; |
377 |
|
|
} |
378 |
|
|
|
379 |
|
|
/************************************************************************** |
380 |
|
|
* Remove a file from the index. Used when the parser aborts |
381 |
|
|
* while indexing. Typically because of FileRules. |
382 |
|
|
* |
383 |
|
|
**************************************************************************/ |
384 |
|
|
|
385 |
|
|
|
386 |
|
|
static void remove_last_file_from_list(SWISH * sw, IndexFILE * indexf) |
387 |
|
|
{ |
388 |
|
|
struct MOD_Index *idx = sw->Index; |
389 |
|
|
int i; |
390 |
|
|
ENTRY *ep, *prev_ep; |
391 |
|
|
LOCATION *l; |
392 |
|
|
|
393 |
|
|
/* Decrease filenum */ |
394 |
|
|
idx->filenum--; |
395 |
|
|
indexf->header.totalfiles--; |
396 |
|
|
|
397 |
|
|
/* Should be removed */ |
398 |
|
|
if(idx->filenum < 0 || indexf->header.totalfiles < 0) |
399 |
|
|
progerr("Internal error in remove_last_file_from_list"); |
400 |
|
|
|
401 |
|
|
|
402 |
|
|
/* walk the hash list to remove words */ |
403 |
|
|
for (i = 0; i < VERYBIGHASHSIZE; i++) |
404 |
|
|
{ |
405 |
|
|
if (idx->hashentriesdirty[i]) |
406 |
|
|
{ |
407 |
|
|
idx->hashentriesdirty[i] = 0; |
408 |
|
|
for (ep = idx->hashentries[i], prev_ep =NULL; ep; ep = ep->next) |
409 |
|
|
{ |
410 |
|
|
if(ep->currentChunkLocationList) |
411 |
|
|
{ |
412 |
|
|
/* First of all - Adjust tfrequency */ |
413 |
|
|
for(l = ep->currentChunkLocationList; l; l = l->next) |
414 |
|
|
{ |
415 |
|
|
ep->tfrequency--; |
416 |
|
|
} |
417 |
|
|
/* Remove locations */ |
418 |
|
|
/* Do not use efree, locations uses a MemZone (currentChunkLocZone) */ |
419 |
|
|
/* Will be freed later */ |
420 |
|
|
ep->currentChunkLocationList = NULL; |
421 |
|
|
ep->currentlocation = NULL; |
422 |
|
|
/* If there is no locations we must also remove the word */ |
423 |
|
|
/* Do not call efree to remove the entry, entries use |
424 |
|
|
** a MemZone (perDocTmpZone) - Will be freed later */ |
425 |
|
|
if(!ep->allLocationList) |
426 |
|
|
{ |
427 |
|
|
if(!prev_ep) |
428 |
|
|
{ |
429 |
|
|
idx->hashentries[i] = ep->next; |
430 |
|
|
} |
431 |
|
|
else |
432 |
|
|
{ |
433 |
|
|
prev_ep->next = ep->next; |
434 |
|
|
} |
435 |
|
|
/* Adjust word counters */ |
436 |
|
|
idx->entryArray->numWords--; |
437 |
|
|
indexf->header.totalwords--; |
438 |
|
|
} |
439 |
|
|
} |
440 |
|
|
else |
441 |
|
|
{ |
442 |
|
|
prev_ep = ep; |
443 |
|
|
} |
444 |
|
|
} |
445 |
|
|
} |
446 |
|
|
} |
447 |
|
|
} |
448 |
|
|
|
449 |
|
|
|
450 |
|
|
|
451 |
|
|
/************************************************************************** |
452 |
|
|
* Index just the file name (or the title) for NoContents files |
453 |
|
|
* $$$ this can be removed if libxml2 is used full time |
454 |
|
|
**************************************************************************/ |
455 |
|
|
static int index_no_content(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer) |
456 |
|
|
{ |
457 |
|
|
struct MOD_Index *idx = sw->Index; |
458 |
|
|
char *title = ""; |
459 |
|
|
int n; |
460 |
|
|
int position = 1; /* Position of word */ |
461 |
|
|
int metaID = 1; /* THIS ASSUMES that that's the default ID number */ |
462 |
|
|
|
463 |
|
|
|
464 |
|
|
/* Look for title if HTML document */ |
465 |
|
|
|
466 |
|
|
if (fprop->doctype == HTML) |
467 |
|
|
{ |
468 |
|
|
title = parseHTMLtitle( sw , buffer ); |
469 |
|
|
|
470 |
|
|
if (!isoktitle(sw, title)) |
471 |
|
|
return -2; /* skipped because of title */ |
472 |
|
|
} |
473 |
|
|
|
474 |
|
|
|
475 |
|
|
#ifdef HAVE_LIBXML2 |
476 |
|
|
if (fprop->doctype == HTML2) |
477 |
|
|
return parse_HTML( sw, fprop, fi, buffer ); |
478 |
|
|
#endif |
479 |
|
|
|
480 |
|
|
|
481 |
|
|
addCommonProperties( sw, fprop, fi, title, NULL, 0 ); |
482 |
|
|
|
483 |
|
|
|
484 |
|
|
n = indexstring( sw, *title == '\0' ? fprop->real_path : title , idx->filenum, IN_FILE, 1, &metaID, &position); |
485 |
|
|
|
486 |
|
|
|
487 |
|
|
/** ??? $$$ doesn't look right -- check this ***/ |
488 |
|
|
if ( *title != '\0' ) |
489 |
|
|
efree( title ); |
490 |
|
|
|
491 |
|
|
return n; |
492 |
|
|
} |
493 |
|
|
|
494 |
|
|
|
495 |
|
|
/********************************************************************* |
496 |
|
|
** 2001-08 jmruiz - A couple of specialized routines to be used with |
497 |
|
|
** locations and MemZones. The main goal is avoid malloc/realloc/free |
498 |
|
|
** which produces a lot of fragmentation |
499 |
|
|
** |
500 |
|
|
** The memory will be allocated in blocks of 64 bytes inside a zone. |
501 |
|
|
** (I have tried both 32 and 64. 32 looks fine.) |
502 |
|
|
** In this way, there is some overhead because when a new block is |
503 |
|
|
** requested from the MemZone, the space is not recovered. But this |
504 |
|
|
** is only true for the current document because the MemZone is reset |
505 |
|
|
** once the document is processed. Then, the space is recovered |
506 |
|
|
** after a MemZoneReset is issued |
507 |
|
|
** |
508 |
|
|
** 2001-09 jmruiz Improved. Now unused space is recovered when asking |
509 |
|
|
** for space. Free blocks are maintained using a linked list |
510 |
|
|
********************************************************************/ |
511 |
|
|
|
512 |
|
|
/* Granularity, in bytes, of the blocks handed out for LOCATIONs.
** Must be greater than sizeof(LOCATION) and a power of 2. */
#define LOC_BLOCK_SIZE 32

/* sizeof(LOCATION) rounded up to a multiple of LOC_BLOCK_SIZE */
#define LOC_MIN_SIZE ((sizeof(LOCATION) + LOC_BLOCK_SIZE - 1) & (~(LOC_BLOCK_SIZE - 1)))

/* Header written over a location block while it sits unused on the
** free chain (idx->freeLocMemChain). */
struct loc_chain {
    struct loc_chain *next;     /* next block on the free chain, NULL at the end */
    int     size;               /* size of this free block in bytes */
};
519 |
|
|
|
520 |
|
|
/******************************************************************** |
521 |
|
|
** 2001-08 jmruiz |
522 |
|
|
** Routine to allocate memory inside a zone for a plain LOCATION |
523 |
|
|
** (frequency is 1). Since we are asking for LOC_BLOCK_SIZE bytes, we |
524 |
|
|
** are loosing some of the space. |
525 |
|
|
** The advantage is that we do not need to call realloc so often. In |
526 |
|
|
** fact, most realloc function work this way. They asks for more memory |
527 |
|
|
** to avoid the overhead of the sequence malloc, memcpy, free. |
528 |
|
|
********************************************************************/ |
529 |
|
|
|
530 |
|
|
LOCATION *alloc_location(struct MOD_Index *idx,int size) |
531 |
|
|
{ |
532 |
|
|
struct loc_chain *tmp = (struct loc_chain *) idx->freeLocMemChain; |
533 |
|
|
struct loc_chain *big = NULL; |
534 |
|
|
LOCATION *tmp2 = NULL; |
535 |
|
|
int avail = 0; |
536 |
|
|
struct loc_chain *p_avail = NULL; |
537 |
|
|
|
538 |
|
|
/* Search for a previously freed location of the same size */ |
539 |
|
|
while(tmp) |
540 |
|
|
{ |
541 |
|
|
if(tmp->size == size) |
542 |
|
|
{ |
543 |
|
|
if(!tmp2) |
544 |
|
|
idx->freeLocMemChain = (LOCATION *)tmp->next; |
545 |
|
|
else |
546 |
|
|
tmp2->next = (LOCATION *)tmp->next; |
547 |
|
|
return (LOCATION *)tmp; |
548 |
|
|
} |
549 |
|
|
else if(tmp->size > size) |
550 |
|
|
{ |
551 |
|
|
/* Just reserve it to be used if we do not find a match */ |
552 |
|
|
big = tmp; |
553 |
|
|
} |
554 |
|
|
else |
555 |
|
|
{ |
556 |
|
|
p_avail = tmp; |
557 |
|
|
avail = tmp->size; |
558 |
|
|
/* Check consecutive for consecutive blocks */ |
559 |
|
|
while(((unsigned char *)tmp + tmp->size) == (unsigned char *)tmp->next) |
560 |
|
|
{ |
561 |
|
|
avail += tmp->next->size; |
562 |
|
|
if(avail == size) |
563 |
|
|
{ |
564 |
|
|
if(!tmp2) |
565 |
|
|
idx->freeLocMemChain = (LOCATION *)tmp->next->next; |
566 |
|
|
else |
567 |
|
|
tmp2->next = (LOCATION *)tmp->next->next; |
568 |
|
|
return (LOCATION *)p_avail; |
569 |
|
|
} |
570 |
|
|
else if(avail > size) |
571 |
|
|
{ |
572 |
|
|
break; |
573 |
|
|
} |
574 |
|
|
else |
575 |
|
|
{ |
576 |
|
|
tmp = tmp->next; |
577 |
|
|
} |
578 |
|
|
} |
579 |
|
|
} |
580 |
|
|
tmp2 = (LOCATION *)tmp; |
581 |
|
|
tmp = tmp->next; |
582 |
|
|
} |
583 |
|
|
/* Perhaps we have a block with greater size */ |
584 |
|
|
if(big) |
585 |
|
|
{ |
586 |
|
|
/* Split it */ |
587 |
|
|
while(big->size > size) |
588 |
|
|
{ |
589 |
|
|
big->size >>= 1; |
590 |
|
|
tmp = (struct loc_chain *) ((unsigned char *)big + big->size); |
591 |
|
|
tmp->next = big->next; |
592 |
|
|
tmp->size = big->size; |
593 |
|
|
if(tmp->size == size) |
594 |
|
|
return (LOCATION *)tmp; |
595 |
|
|
big->next = tmp; |
596 |
|
|
big = tmp; |
597 |
|
|
} |
598 |
|
|
} |
599 |
|
|
/* NO memory in free chain of the same size - Asks for size */ |
600 |
|
|
return (LOCATION *)Mem_ZoneAlloc(idx->currentChunkLocZone, size); |
601 |
|
|
} |
602 |
|
|
|
603 |
|
|
|
604 |
|
|
LOCATION *new_location(struct MOD_Index *idx) |
605 |
|
|
{ |
606 |
|
|
return (LOCATION *)alloc_location(idx, LOC_MIN_SIZE); |
607 |
|
|
} |
608 |
|
|
|
609 |
|
|
|
610 |
|
|
int is_location_full(int size) |
611 |
|
|
{ |
612 |
|
|
int i; |
613 |
|
|
|
614 |
|
|
/* Fast test. Since LOC_BLOCK_SIZE is the minimum size ... */ |
615 |
|
|
if(size % LOC_BLOCK_SIZE) |
616 |
|
|
return 0; /* it is not a power of two */ |
617 |
|
|
/* Check if size is a power of 2 (32,64,128,256,...) in binary ..000100... */ |
618 |
|
|
for(i=LOC_BLOCK_SIZE;;i <<= 1) |
619 |
|
|
{ |
620 |
|
|
if(size>i) |
621 |
|
|
{ |
622 |
|
|
continue; |
623 |
|
|
} |
624 |
|
|
if((size & i) == size) |
625 |
|
|
{ |
626 |
|
|
return 1; |
627 |
|
|
} |
628 |
|
|
else |
629 |
|
|
{ |
630 |
|
|
break; |
631 |
|
|
} |
632 |
|
|
} |
633 |
|
|
return 0; |
634 |
|
|
} |
635 |
|
|
|
636 |
|
|
/******************************************************************** |
637 |
|
|
** 2001-08 jmruiz |
638 |
|
|
** Routine to reallocate memory inside a zone for a previous allocated |
639 |
|
|
** LOCATION (frequency > 1). |
640 |
|
|
** A new block is allocated only if the previous becomes full |
641 |
|
|
********************************************************************/ |
642 |
|
|
LOCATION *add_position_location(void *oldp, struct MOD_Index *idx, int frequency) |
643 |
|
|
{ |
644 |
|
|
LOCATION *newp = NULL; |
645 |
|
|
struct loc_chain *tmp = NULL; |
646 |
|
|
int oldsize; |
647 |
|
|
|
648 |
|
|
oldsize = sizeof(LOCATION) + (frequency - 1) * sizeof(int); |
649 |
|
|
|
650 |
|
|
/* Check for available size in block */ |
651 |
|
|
if(is_location_full(oldsize)) |
652 |
|
|
{ |
653 |
|
|
/* Not enough size - Allocate a new block. Size rounded to LOC_BLOCK_SIZE */ |
654 |
|
|
newp = (LOCATION *)alloc_location(idx,oldsize << 1); |
655 |
|
|
memcpy((void *)newp,(void *)oldp,oldsize); |
656 |
|
|
/* Add old zone to the free chain of blocks */ |
657 |
|
|
tmp = (struct loc_chain *)oldp; |
658 |
|
|
tmp->next = (struct loc_chain *)idx->freeLocMemChain; |
659 |
|
|
tmp->size = oldsize; |
660 |
|
|
idx->freeLocMemChain = (LOCATION *) tmp; |
661 |
|
|
} |
662 |
|
|
else |
663 |
|
|
/* Enough size */ |
664 |
|
|
newp = oldp; |
665 |
|
|
|
666 |
|
|
return newp; |
667 |
|
|
} |
668 |
|
|
|
669 |
|
|
/*********************************************************************** |
670 |
|
|
-- Start the real indexing process for a file. |
671 |
|
|
-- This routine will be called by the different indexing methods |
672 |
|
|
-- (httpd, filesystem, etc.) |
673 |
|
|
-- The indexed file may be the |
674 |
|
|
-- - real file on filesystem |
675 |
|
|
-- - tmpfile or work file (shadow of the real file) |
676 |
|
|
-- Checks if file has to be send thru filter (file stream) |
677 |
|
|
-- 2000-11-19 rasc |
678 |
|
|
***********************************************************************/ |
679 |
|
|
|
680 |
|
|
void do_index_file(SWISH * sw, FileProp * fprop) |
681 |
|
|
{ |
682 |
|
|
int (*countwords)(SWISH *sw,FileProp *fprop, FileRec *fi, char *buffer); |
683 |
|
|
IndexFILE *indexf = sw->indexlist; |
684 |
|
|
int wordcount; |
685 |
|
|
char *rd_buffer = NULL; /* complete file read into buffer */ |
686 |
|
|
struct MOD_Index *idx = sw->Index; |
687 |
|
|
char strType[30]; |
688 |
|
|
int i; |
689 |
|
|
FileRec fi; /* place to hold doc properties */ |
690 |
|
|
|
691 |
|
|
memset( &fi, 0, sizeof( FileRec ) ); |
692 |
|
|
|
693 |
|
|
|
694 |
|
|
wordcount = -1; |
695 |
|
|
|
696 |
|
|
|
697 |
|
|
|
698 |
|
|
/* skip file is the last_mod date is newer than the check date */ |
699 |
|
|
|
700 |
|
|
if (sw->mtime_limit && fprop->mtime < sw->mtime_limit) |
701 |
|
|
{ |
702 |
|
|
if (sw->verbose >= 3) |
703 |
|
|
progwarn("Skipping %s: last_mod date is too old\n", fprop->real_path); |
704 |
|
|
|
705 |
|
|
/* external program must seek past this data (fseek fails) */ |
706 |
|
|
if (fprop->fp) |
707 |
|
|
flush_stream( fprop ); |
708 |
|
|
|
709 |
|
|
return; |
710 |
|
|
} |
711 |
|
|
|
712 |
|
|
|
713 |
|
|
/* Upon entry, if fprop->fp is non-NULL then it's already opened and ready to be read from. |
714 |
|
|
This is the case with "prog" external programs, *except* when a filter is selected for the file type. |
715 |
|
|
If a filter is used with "prog" a temporary file was created (fprop->work_file), and |
716 |
|
|
fprop->fp will be NULL (as is with http and fs access methods). |
717 |
|
|
2001-05-13 moseley |
718 |
|
|
*/ |
719 |
|
|
|
720 |
|
|
|
721 |
|
|
|
722 |
|
|
/* Get input file handle */ |
723 |
|
|
if (fprop->hasfilter) |
724 |
|
|
{ |
725 |
|
|
fprop->fp = FilterOpen(fprop); |
726 |
|
|
|
727 |
|
|
/* This should be checked in filteropen because the popen probably won't fail */ |
728 |
|
|
if ( !fprop->fp ) |
729 |
|
|
progerr("Failed to open filter for file '%s'",fprop->real_path); |
730 |
|
|
} |
731 |
|
|
|
732 |
|
|
else if ( !fprop->fp ) |
733 |
|
|
{ |
734 |
|
|
fprop->fp = fopen(fprop->work_path, F_READ_TEXT ); |
735 |
|
|
|
736 |
|
|
if ( !fprop->fp ) |
737 |
|
|
{ |
738 |
|
|
progwarnno("Failed to open: '%s': ", fprop->work_path); |
739 |
|
|
return; |
740 |
|
|
} |
741 |
|
|
} |
742 |
|
|
else /* Already open - flag to prevent closing the stream used with "prog" */ |
743 |
|
|
fprop->external_program++; |
744 |
|
|
|
745 |
|
|
|
746 |
|
|
|
747 |
|
|
|
748 |
|
|
/** Replace the path for ReplaceRules **/ |
749 |
|
|
|
750 |
|
|
if ( sw->replaceRegexps ) |
751 |
|
|
{ |
752 |
|
|
int matched = 0; |
753 |
|
|
fprop->real_path = process_regex_list( fprop->real_path, sw->replaceRegexps, &matched ); |
754 |
|
|
} |
755 |
|
|
|
756 |
|
|
|
757 |
|
|
|
758 |
|
|
/** Read the buffer, if not a stream parser **/ |
759 |
|
|
#ifdef HAVE_LIBXML2 |
760 |
|
|
if ( fprop->doctype == HTML2 || fprop->doctype == XML2 || fprop->doctype == TXT2 ) |
761 |
|
|
rd_buffer = NULL; |
762 |
|
|
else |
763 |
|
|
#endif |
764 |
|
|
/* -- Read all data (len = 0 if filtered...) */ |
765 |
|
|
rd_buffer = read_stream(sw, fprop->real_path, fprop->fp, (fprop->hasfilter) ? 0 : fprop->fsize, sw->truncateDocSize); |
766 |
|
|
|
767 |
|
|
|
768 |
|
|
/* just for fun so we can show total bytes shown */ |
769 |
|
|
sw->indexlist->total_bytes += fprop->fsize; |
770 |
|
|
|
771 |
|
|
|
772 |
|
|
/* Set which parser to use */ |
773 |
|
|
|
774 |
|
|
switch (fprop->doctype) |
775 |
|
|
{ |
776 |
|
|
|
777 |
|
|
case TXT: |
778 |
|
|
strcpy(strType,"TXT"); |
779 |
|
|
countwords = countwords_TXT; |
780 |
|
|
break; |
781 |
|
|
|
782 |
|
|
case HTML: |
783 |
|
|
strcpy(strType,"HTML"); |
784 |
|
|
countwords = countwords_HTML; |
785 |
|
|
break; |
786 |
|
|
|
787 |
|
|
case XML: |
788 |
|
|
strcpy(strType,"XML"); |
789 |
|
|
countwords = countwords_XML; |
790 |
|
|
break; |
791 |
|
|
|
792 |
|
|
#ifdef HAVE_LIBXML2 |
793 |
|
|
case XML2: |
794 |
|
|
strcpy(strType,"XML2"); |
795 |
|
|
countwords = parse_XML; |
796 |
|
|
break; |
797 |
|
|
|
798 |
|
|
case HTML2: |
799 |
|
|
strcpy(strType,"HTML2"); |
800 |
|
|
countwords = parse_HTML; |
801 |
|
|
break; |
802 |
|
|
|
803 |
|
|
case TXT2: |
804 |
|
|
strcpy(strType,"TXT2"); |
805 |
|
|
countwords = parse_TXT; |
806 |
|
|
break; |
807 |
|
|
#endif |
808 |
|
|
|
809 |
|
|
case WML: |
810 |
|
|
strcpy(strType,"WML"); |
811 |
|
|
countwords = countwords_HTML; |
812 |
|
|
break; |
813 |
|
|
|
814 |
|
|
default: |
815 |
|
|
strcpy(strType,"DEFAULT (HTML)"); |
816 |
|
|
countwords = countwords_HTML; |
817 |
|
|
break; |
818 |
|
|
} |
819 |
|
|
|
820 |
|
|
if (sw->verbose >= 3) |
821 |
|
|
printf(" - Using %s parser - ",strType); |
822 |
|
|
|
823 |
|
|
|
824 |
|
|
/* Check for NoContents flag and just save the path name */ |
825 |
|
|
/* $$$ Note, really need to only read_stream if reading from a pipe. */ |
826 |
|
|
/* $$$ waste of disk IO and memory if reading from file system */ |
827 |
|
|
|
828 |
|
|
if (fprop->index_no_content) |
829 |
|
|
countwords = index_no_content; |
830 |
|
|
|
831 |
|
|
|
832 |
|
|
/* Make sure all meta flags are cleared (incase a parser aborts) */ |
833 |
|
|
ClearInMetaFlags( &indexf->header ); |
834 |
|
|
|
835 |
|
|
|
836 |
|
|
|
837 |
|
|
|
838 |
|
|
/* Now bump the file counter */ |
839 |
|
|
idx->filenum++; |
840 |
|
|
indexf->header.totalfiles++; /* why ??? is this needed */ |
841 |
|
|
fi.filenum = idx->filenum; |
842 |
|
|
|
843 |
|
|
/** PARSE **/ |
844 |
|
|
wordcount = countwords(sw, fprop, &fi, rd_buffer); |
845 |
|
|
|
846 |
|
|
|
847 |
|
|
|
848 |
|
|
|
849 |
|
|
|
850 |
|
|
if (!fprop->external_program) /* external_program is not set if a filter is in use */ |
851 |
|
|
{ |
852 |
|
|
if (fprop->hasfilter) |
853 |
|
|
FilterClose(fprop->fp); /* close filter pipe - should the filter be flushed? */ |
854 |
|
|
else |
855 |
|
|
fclose(fprop->fp); /* close file */ |
856 |
|
|
} |
857 |
|
|
/* Else, it's -S prog so make sure we read all the bytes we are suppose to read! */ |
858 |
|
|
/* Can remove the check for fprop->bytes_read once read_stream is no longer used */ |
859 |
|
|
|
860 |
|
|
else if ( fprop->bytes_read && fprop->bytes_read < fprop->fsize ) |
861 |
|
|
flush_stream( fprop ); |
862 |
|
|
|
863 |
|
|
|
864 |
|
|
if (sw->verbose >= 3) |
865 |
|
|
{ |
866 |
|
|
if (wordcount > 0) |
867 |
|
|
printf(" (%d words)\n", wordcount); |
868 |
|
|
else if (wordcount == 0) |
869 |
|
|
printf(" (no words indexed)\n"); |
870 |
|
|
else if (wordcount == -1) |
871 |
|
|
printf(" (not opened)\n"); |
872 |
|
|
else if (wordcount == -2) |
873 |
|
|
printf(" (Skipped due to 'FileRules title' setting)\n"); |
874 |
|
|
else if (wordcount == -3) |
875 |
|
|
printf(" (Skipped due to Robots Excluion Rule in meta tag)\n"); |
876 |
|
|
fflush(stdout); |
877 |
|
|
} |
878 |
|
|
|
879 |
|
|
|
880 |
|
|
/* If indexing aborted, remove the last file entry */ |
881 |
|
|
if ( wordcount == -3 || wordcount == -2 ) |
882 |
|
|
{ |
883 |
|
|
remove_last_file_from_list( sw, indexf ); |
884 |
|
|
return; |
885 |
|
|
} |
886 |
|
|
|
887 |
|
|
|
888 |
|
|
/* Continue if a file was not indexed */ |
889 |
|
|
if ( wordcount < 0 ) |
890 |
|
|
return; |
891 |
|
|
|
892 |
|
|
|
893 |
|
|
if ( DEBUG_MASK & DEBUG_PROPERTIES ) |
894 |
|
|
dump_file_properties( indexf, &fi ); |
895 |
|
|
|
896 |
|
|
|
897 |
|
|
/* write properties to disk, and release docprop array (and the prop index array) */ |
898 |
|
|
/* Currently this just passes sw, and assumes only one index file when indexing */ |
899 |
|
|
WritePropertiesToDisk( sw , &fi ); |
900 |
|
|
|
901 |
|
|
|
902 |
|
|
/* Save total words per file */ |
903 |
|
|
if ( !indexf->header.ignoreTotalWordCountWhenRanking ) |
904 |
|
|
{ |
905 |
|
|
|
906 |
|
|
setTotalWordsPerFile(sw, indexf, fi.filenum - 1,wordcount); |
907 |
|
|
} |
908 |
|
|
|
909 |
|
|
|
910 |
|
|
|
911 |
|
|
|
912 |
|
|
/* Compress the entries */ |
913 |
|
|
{ |
914 |
|
|
ENTRY *ep; |
915 |
|
|
|
916 |
|
|
/* walk the hash list, and compress entries */ |
917 |
|
|
for (i = 0; i < VERYBIGHASHSIZE; i++) |
918 |
|
|
{ |
919 |
|
|
if (idx->hashentriesdirty[i]) |
920 |
|
|
{ |
921 |
|
|
idx->hashentriesdirty[i] = 0; |
922 |
|
|
for (ep = idx->hashentries[i]; ep; ep = ep->next) |
923 |
|
|
CompressCurrentLocEntry(sw, indexf, ep); |
924 |
|
|
} |
925 |
|
|
} |
926 |
|
|
|
927 |
|
|
/* Coalesce word positions int a more optimal schema to avoid maintain the location data contiguous */ |
928 |
|
|
if(idx->filenum && ((!(idx->filenum % idx->chunk_size)) || (Mem_ZoneSize(idx->currentChunkLocZone) > idx->optimalChunkLocZoneSize))) |
929 |
|
|
{ |
930 |
|
|
for (i = 0; i < VERYBIGHASHSIZE; i++) |
931 |
|
|
for (ep = idx->hashentries[i]; ep; ep = ep->next) |
932 |
|
|
coalesce_word_locations(sw, indexf, ep); |
933 |
|
|
/* Make zone available for reuse */ |
934 |
|
|
Mem_ZoneReset(idx->currentChunkLocZone); |
935 |
|
|
idx->freeLocMemChain = NULL; |
936 |
|
|
|
937 |
|
|
} |
938 |
|
|
} |
939 |
|
|
|
940 |
|
|
|
941 |
|
|
/* Make zone available for reuse */ |
942 |
|
|
Mem_ZoneReset(idx->perDocTmpZone); |
943 |
|
|
|
944 |
|
|
|
945 |
|
|
return; |
946 |
|
|
} |
947 |
|
|
|
948 |
|
|
|
949 |
|
|
/*
** Find the ENTRY for `word` in the in-memory word hash, creating and
** linking a new one when the word has not been seen yet.
**
** Call with:
**   *sw    - supplies the index module data and the index file header
**   *word  - NUL-terminated word to look up
**
** Returns:
**   pointer to the existing or freshly created ENTRY
**
** Side effects:
**   - allocates sw->Index->entryArray on first use
**   - always flags the word's hash bucket as dirty, found or not
**   - for a new word, bumps entryArray->numWords and header.totalwords
*/
ENTRY *getentry(SWISH * sw, char *word)
{
    struct MOD_Index *idx = sw->Index;
    IndexFILE *indexf = sw->indexlist;
    ENTRY  *entry;
    int     bucket;

    /* Lazily create the (still empty) word array record */
    if (idx->entryArray == NULL)
    {
        idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY));
        idx->entryArray->elist = NULL;
        idx->entryArray->numWords = 0;
    }

    bucket = verybighash(word);

    /* Flag the bucket as used by this file so that its locations can be
    ** "compressed" in do_index_file */
    idx->hashentriesdirty[bucket] = 1;

    /* Walk the bucket's chain looking for an exact match */
    entry = idx->hashentries[bucket];
    while (entry != NULL && strcmp(entry->word, word) != 0)
        entry = entry->next;

    if (entry != NULL)
        return entry;

    /* Unknown word: build a new entry.
    ** NOTE(review): no +1 for the terminator here -- presumably ENTRY's
    ** trailing word member already reserves that byte; confirm against
    ** the ENTRY declaration. */
    entry = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word));
    strcpy(entry->word, word);

    entry->tfrequency = 0;
    entry->u1.last_filenum = 0;
    entry->currentlocation = NULL;
    entry->currentChunkLocationList = NULL;
    entry->allLocationList = NULL;

    /* Push it onto the front of the bucket's chain */
    entry->next = idx->hashentries[bucket];
    idx->hashentries[bucket] = entry;

    idx->entryArray->numWords++;
    indexf->header.totalwords++;

    return entry;
}
998 |
|
|
|
999 |
|
|
/* Adds a word to the master index tree. |
1000 |
|
|
*/ |
1001 |
|
|
|
1002 |
|
|
void addentry(SWISH * sw, ENTRY *e, int filenum, int structure, int metaID, int position) |
1003 |
|
|
{ |
1004 |
|
|
int found; |
1005 |
|
|
LOCATION *tp, *newtp, *prevtp; |
1006 |
|
|
IndexFILE *indexf = sw->indexlist; |
1007 |
|
|
struct MOD_Index *idx = sw->Index; |
1008 |
|
|
|
1009 |
|
|
|
1010 |
|
|
indexf->total_word_positions++; |
1011 |
|
|
|
1012 |
|
|
if ( DEBUG_MASK & DEBUG_WORDS ) |
1013 |
|
|
{ |
1014 |
|
|
struct metaEntry *m = getMetaNameByID(&indexf->header, metaID); |
1015 |
|
|
|
1016 |
|
|
printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? m->metaName : "PROP_UNKNOWN", metaID, e->word, position, structure); |
1017 |
|
|
|
1018 |
|
|
if ( structure & IN_EMPHASIZED ) printf(" EM"); |
1019 |
|
|
if ( structure & IN_HEADER ) printf(" HEADING"); |
1020 |
|
|
if ( structure & IN_COMMENTS ) printf(" COMMENT"); |
1021 |
|
|
if ( structure & IN_META ) printf(" META"); |
1022 |
|
|
if ( structure & IN_BODY ) printf(" BODY"); |
1023 |
|
|
if ( structure & IN_HEAD ) printf(" HEAD"); |
1024 |
|
|
if ( structure & IN_TITLE ) printf(" TITLE"); |
1025 |
|
|
if ( structure & IN_FILE ) printf(" FILE"); |
1026 |
|
|
printf(" )\n"); |
1027 |
|
|
} |
1028 |
|
|
|
1029 |
|
|
|
1030 |
|
|
/* Check for first time */ |
1031 |
|
|
if(!e->tfrequency) |
1032 |
|
|
{ |
1033 |
|
|
/* create a location record */ |
1034 |
|
|
tp = (LOCATION *) new_location(idx); |
1035 |
|
|
tp->filenum = filenum; |
1036 |
|
|
tp->frequency = 1; |
1037 |
|
|
tp->metaID = metaID; |
1038 |
|
|
tp->posdata[0] = SET_POSDATA(position,structure); |
1039 |
|
|
tp->next = NULL; |
1040 |
|
|
|
1041 |
|
|
e->currentChunkLocationList = tp; |
1042 |
|
|
e->tfrequency = 1; |
1043 |
|
|
e->u1.last_filenum = filenum; |
1044 |
|
|
|
1045 |
|
|
return; |
1046 |
|
|
} |
1047 |
|
|
|
1048 |
|
|
/* Word found -- look for same metaID and filename */ |
1049 |
|
|
/* $$$ To do it right, should probably compare the structure, too */ |
1050 |
|
|
/* Note: filename not needed due to compress we are only looking at the current file */ |
1051 |
|
|
/* Oct 18, 2001 -- filename is needed since merge adds words in non-filenum order */ |
1052 |
|
|
|
1053 |
|
|
tp = e->currentChunkLocationList; |
1054 |
|
|
found = 0; |
1055 |
|
|
|
1056 |
|
|
while (tp != e->currentlocation) |
1057 |
|
|
{ |
1058 |
|
|
if(tp->metaID == metaID && tp->filenum == filenum ) |
1059 |
|
|
{ |
1060 |
|
|
found =1; |
1061 |
|
|
break; |
1062 |
|
|
} |
1063 |
|
|
tp = tp->next; |
1064 |
|
|
} |
1065 |
|
|
|
1066 |
|
|
/* matching metaID NOT found. So, add a new LOCATION record onto the word */ |
1067 |
|
|
/* This expands the size of the location array for this word by one */ |
1068 |
|
|
|
1069 |
|
|
if(!found) |
1070 |
|
|
{ |
1071 |
|
|
/* create the new LOCATION entry */ |
1072 |
|
|
tp = (LOCATION *) new_location(idx); |
1073 |
|
|
tp->filenum = filenum; |
1074 |
|
|
tp->frequency = 1; /* count of times this word in this file:metaID */ |
1075 |
|
|
tp->metaID = metaID; |
1076 |
|
|
tp->posdata[0] = SET_POSDATA(position,structure); |
1077 |
|
|
|
1078 |
|
|
/* add the new LOCATION onto the array */ |
1079 |
|
|
tp->next = e->currentChunkLocationList; |
1080 |
|
|
e->currentChunkLocationList = tp; |
1081 |
|
|
|
1082 |
|
|
/* Count number of different files that this word is used in */ |
1083 |
|
|
if ( e->u1.last_filenum != filenum ) |
1084 |
|
|
{ |
1085 |
|
|
e->tfrequency++; |
1086 |
|
|
e->u1.last_filenum = filenum; |
1087 |
|
|
} |
1088 |
|
|
|
1089 |
|
|
return; /* all done */ |
1090 |
|
|
} |
1091 |
|
|
|
1092 |
|
|
|
1093 |
|
|
/* Otherwise, found matching LOCATION record (matches filenum and metaID) */ |
1094 |
|
|
/* Just add the position number onto the end by expanding the size of the LOCATION record */ |
1095 |
|
|
|
1096 |
|
|
/* 2001/08 jmruiz - Much better memory usage occurs if we use MemZones */ |
1097 |
|
|
/* MemZone will be reset when the doc is completely proccesed */ |
1098 |
|
|
|
1099 |
|
|
newtp = add_position_location(tp, idx, tp->frequency); |
1100 |
|
|
|
1101 |
|
|
if(newtp != tp) |
1102 |
|
|
{ |
1103 |
|
|
if(e->currentChunkLocationList == tp) |
1104 |
|
|
e->currentChunkLocationList = newtp; |
1105 |
|
|
else |
1106 |
|
|
for(prevtp = e->currentChunkLocationList;;prevtp = prevtp->next) |
1107 |
|
|
{ |
1108 |
|
|
if(prevtp->next == tp) |
1109 |
|
|
{ |
1110 |
|
|
prevtp->next = newtp; |
1111 |
|
|
break; |
1112 |
|
|
} |
1113 |
|
|
} |
1114 |
|
|
tp = newtp; |
1115 |
|
|
} |
1116 |
|
|
|
1117 |
|
|
tp->posdata[tp->frequency++] = SET_POSDATA(position,structure); |
1118 |
|
|
|
1119 |
|
|
} |
1120 |
|
|
|
1121 |
|
|
|
1122 |
|
|
/******************************************************************* |
1123 |
|
|
* Adds common file properties to the last entry in the file array |
1124 |
|
|
* (which should be the current one) |
1125 |
|
|
* |
1126 |
|
|
* |
1127 |
|
|
* Call with: |
1128 |
|
|
* *SWISH - need for indexing words |
1129 |
|
|
* *fprop |
1130 |
|
|
* *fi |
1131 |
|
|
* *summary - document summary (why here?) |
1132 |
|
|
* start - start position of a sub-document |
1133 |
|
|
* size - size in bytes of document |
1134 |
|
|
* |
1135 |
|
|
* Returns: |
1136 |
|
|
* void |
1137 |
|
|
* |
1138 |
|
|
* Note: |
1139 |
|
|
* Uses cached meta entries (created in metanames.c) to save the |
1140 |
|
|
* metaEntry lookup by name costs |
1141 |
|
|
* |
1142 |
|
|
********************************************************************/ |
1143 |
|
|
|
1144 |
|
|
void addCommonProperties( SWISH *sw, FileProp *fprop, FileRec *fi, char *title, char *summary, int start ) |
1145 |
|
|
{ |
1146 |
|
|
struct metaEntry *q; |
1147 |
|
|
docProperties **properties = &fi->docProperties; |
1148 |
|
|
unsigned long tmp; |
1149 |
|
|
int metaID; |
1150 |
|
|
INDEXDATAHEADER *header = &sw->indexlist->header; |
1151 |
|
|
char *filename = fprop->real_path; /* should always have a path */ |
1152 |
|
|
int filenum = fi->filenum; |
1153 |
|
|
|
1154 |
|
|
|
1155 |
|
|
|
1156 |
|
|
/* Check if filename is internal swish metadata -- should be! */ |
1157 |
|
|
|
1158 |
|
|
if ((q = getPropNameByName(header, AUTOPROPERTY_DOCPATH))) |
1159 |
|
|
addDocProperty( properties, q, (unsigned char *)filename, strlen(filename),0); |
1160 |
|
|
|
1161 |
|
|
|
1162 |
|
|
/* Perhaps we want it to be indexed ... */ |
1163 |
|
|
if ((q = getMetaNameByName(header, AUTOPROPERTY_DOCPATH))) |
1164 |
|
|
{ |
1165 |
|
|
int metaID, |
1166 |
|
|
positionMeta; |
1167 |
|
|
|
1168 |
|
|
metaID = q->metaID; |
1169 |
|
|
positionMeta = 1; |
1170 |
|
|
indexstring(sw, filename, filenum, IN_FILE, 1, &metaID, &positionMeta); |
1171 |
|
|
} |
1172 |
|
|
|
1173 |
|
|
|
1174 |
|
|
/* This allows extracting out parts of a path and indexing as a separate meta name */ |
1175 |
|
|
if ( sw->pathExtractList ) |
1176 |
|
|
index_path_parts( sw, fprop->orig_path, sw->pathExtractList, header, properties ); |
1177 |
|
|
|
1178 |
|
|
|
1179 |
|
|
|
1180 |
|
|
/* Check if title is internal swish metadata */ |
1181 |
|
|
if ( title ) |
1182 |
|
|
{ |
1183 |
|
|
if ( (q = getPropNameByName(header, AUTOPROPERTY_TITLE))) |
1184 |
|
|
addDocProperty(properties, q, (unsigned char *)title, strlen(title),0); |
1185 |
|
|
|
1186 |
|
|
|
1187 |
|
|
/* Perhaps we want it to be indexed ... */ |
1188 |
|
|
if ( (q = getMetaNameByName(header, AUTOPROPERTY_TITLE))) |
1189 |
|
|
{ |
1190 |
|
|
int positionMeta; |
1191 |
|
|
|
1192 |
|
|
metaID = q->metaID; |
1193 |
|
|
positionMeta = 1; |
1194 |
|
|
indexstring(sw, title, filenum, IN_FILE, 1, &metaID, &positionMeta); |
1195 |
|
|
} |
1196 |
|
|
} |
1197 |
|
|
|
1198 |
|
|
|
1199 |
|
|
if ( summary ) |
1200 |
|
|
{ |
1201 |
|
|
if ( (q = getPropNameByName(header, AUTOPROPERTY_SUMMARY))) |
1202 |
|
|
addDocProperty(properties, q, (unsigned char *)summary, strlen(summary),0); |
1203 |
|
|
|
1204 |
|
|
|
1205 |
|
|
if ( (q = getMetaNameByName(header, AUTOPROPERTY_SUMMARY))) |
1206 |
|
|
{ |
1207 |
|
|
int metaID, |
1208 |
|
|
positionMeta; |
1209 |
|
|
|
1210 |
|
|
metaID = q->metaID; |
1211 |
|
|
positionMeta = 1; |
1212 |
|
|
indexstring(sw, summary, filenum, IN_FILE, 1, &metaID, &positionMeta); |
1213 |
|
|
} |
1214 |
|
|
} |
1215 |
|
|
|
1216 |
|
|
|
1217 |
|
|
|
1218 |
|
|
/* Currently don't allow indexing by date or size or position */ |
1219 |
|
|
|
1220 |
|
|
/* mtime is a time_t, but we don't have an entry for NOT A TIME. Does anyone care about the first second of 1970? */ |
1221 |
|
|
|
1222 |
|
|
if ( fprop->mtime && (q = getPropNameByName(header, AUTOPROPERTY_LASTMODIFIED))) |
1223 |
|
|
{ |
1224 |
|
|
tmp = (unsigned long) fprop->mtime; |
1225 |
|
|
tmp = PACKLONG(tmp); /* make it portable */ |
1226 |
|
|
addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1); |
1227 |
|
|
} |
1228 |
|
|
|
1229 |
|
|
if ( (q = getPropNameByName(header, AUTOPROPERTY_DOCSIZE))) |
1230 |
|
|
{ |
1231 |
|
|
tmp = (unsigned long) fprop->fsize; |
1232 |
|
|
tmp = PACKLONG(tmp); /* make it portable */ |
1233 |
|
|
addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1); |
1234 |
|
|
} |
1235 |
|
|
|
1236 |
|
|
|
1237 |
|
|
if ( (q = getPropNameByName(header, AUTOPROPERTY_STARTPOS))) |
1238 |
|
|
{ |
1239 |
|
|
tmp = (unsigned long) start; |
1240 |
|
|
tmp = PACKLONG(tmp); /* make it portable */ |
1241 |
|
|
addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1); |
1242 |
|
|
} |
1243 |
|
|
|
1244 |
|
|
} |
1245 |
|
|
|
1246 |
|
|
|
1247 |
|
|
/******************************************************************* |
1248 |
|
|
* extracts out parts from a path name and indexes that part |
1249 |
|
|
* |
1250 |
|
|
********************************************************************/ |
1251 |
|
|
static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties ) |
1252 |
|
|
{ |
1253 |
|
|
int metaID; |
1254 |
|
|
int positionMeta = 1; |
1255 |
|
|
int matched = 0; /* flag if any patterns matched */ |
1256 |
|
|
|
1257 |
|
|
while ( list ) |
1258 |
|
|
{ |
1259 |
|
|
char *str = process_regex_list( estrdup(path), list->regex, &matched ); |
1260 |
|
|
|
1261 |
|
|
if ( !matched ) |
1262 |
|
|
{ |
1263 |
|
|
/* use default? */ |
1264 |
|
|
if ( list->meta_entry->extractpath_default ) |
1265 |
|
|
{ |
1266 |
|
|
metaID = list->meta_entry->metaID; |
1267 |
|
|
indexstring(sw, list->meta_entry->extractpath_default, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta); |
1268 |
|
|
} |
1269 |
|
|
} |
1270 |
|
|
else |
1271 |
|
|
{ |
1272 |
|
|
struct metaEntry *q; |
1273 |
|
|
|
1274 |
|
|
metaID = list->meta_entry->metaID; |
1275 |
|
|
indexstring(sw, str, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta); |
1276 |
|
|
|
1277 |
|
|
if ((q = getPropNameByName(header, list->meta_entry->metaName ))) |
1278 |
|
|
addDocProperty( properties, q, (unsigned char *)str, strlen(str),0); |
1279 |
|
|
|
1280 |
|
|
|
1281 |
|
|
efree( str ); |
1282 |
|
|
} |
1283 |
|
|
|
1284 |
|
|
matched = 0; |
1285 |
|
|
list = list->next; |
1286 |
|
|
} |
1287 |
|
|
} |
1288 |
|
|
|
1289 |
|
|
|
1290 |
|
|
/* Just goes through the master list of files and |
1291 |
|
|
** counts 'em. |
1292 |
|
|
*/ |
1293 |
|
|
|
1294 |
|
|
int getfilecount(IndexFILE * indexf) |
1295 |
|
|
{ |
1296 |
|
|
return indexf->header.totalfiles; |
1297 |
|
|
} |
1298 |
|
|
|
1299 |
|
|
|
1300 |
|
|
|
1301 |
|
|
/* Removes words that occur in over _plimit_ percent of the files and |
1302 |
|
|
** that occur in over _flimit_ files (marks them as stopwords, that is). |
1303 |
|
|
*/ |
1304 |
|
|
/* 05/00 Jose Ruiz |
1305 |
|
|
** Recompute positions when a stopword is removed from lists |
1306 |
|
|
** This piece of code is terrorific because the first goal |
1307 |
|
|
** was getting the best possible performance. So, the code is not |
1308 |
|
|
** very clear. |
1309 |
|
|
** The main problem is to recalculate word positions for all |
1310 |
|
|
** the words after removing the automatic stop words. This means |
1311 |
|
|
** looking at all word's positions for each automatic stop word |
1312 |
|
|
** and decrement its position |
1313 |
|
|
*/ |
1314 |
|
|
/* 2001/02 jmruiz - rewritten - all the process is made in one pass to achieve |
1315 |
|
|
better performance */ |
1316 |
|
|
/* 2001-08 jmruiz - rewritten - adapted to new locations and zone schema */ |
1317 |
|
|
/* 2002-07 jmruiz - rewritten - adapted to new -e schema */ |
1318 |
|
|
|
1319 |
|
|
/* Returns the nIgnoreLimitWords counter stored in the index module data
** (set by getPositionsFromIgnoreLimitWords).
*/
int getNumberOfIgnoreLimitWords(SWISH *sw)
{
    struct MOD_Index *idx = sw->Index;

    return idx->nIgnoreLimitWords;
}
1323 |
|
|
|
1324 |
|
|
/*
** Turns every word that reaches the IgnoreLimit thresholds into a stopword
** and collects, per file, the positions those words occupied.
**
** A word qualifies when it occurs in at least idx->plimit percent of the
** files AND in at least idx->flimit files.  Each qualifying word is added
** to the header's stopword list and stopword hash, and is unlinked from
** the word hash.
**
** Call with:
**   *sw - supplies the index module data and the index file
**
** Returns:
**   void.  Results are left in the index module data:
**     idx->nIgnoreLimitWords          - number of words removed
**     idx->IgnoreLimitPositionsArray  - array indexed by (filenum - 1); each
**         non-NULL slot holds n (metaID, position) pairs in pos[], sorted
**         with icomp2.  NULL when no word was removed.
**
** Notes:
**   - Returns early (both results reset to 0 / NULL) when there are no
**     words, no files, or idx->plimit >= NO_PLIMIT.  The file count is
**     checked because it is the divisor of the percentage computation.
**   - The temporary list of removed entries is released before returning;
**     it used to be leaked on every call.
*/
void getPositionsFromIgnoreLimitWords(SWISH * sw)
{
    int     i,
            j,
            k,
            m,
            percent,
            bytes_size,
            chunk_size,
            metaID,
            frequency,
            tmpval,
            filenum;
    int    *positions;
    int     local_positions[MAX_STACK_POSITIONS];

    LOCATION *l, *next;
    ENTRY  *ep,
           *ep2;
    ENTRY **estop;
    int     estopsz = 0,
            estopmsz = 1;
    IndexFILE *indexf = sw->indexlist;
    int     totalfiles = getfilecount(indexf);
    struct IgnoreLimitPositions **filepos = NULL;
    struct IgnoreLimitPositions *fpos;
    struct MOD_Index *idx = sw->Index;
    unsigned char *p, *q, *compressed_data, flag;
    int     last_loc_swap;

    idx->nIgnoreLimitWords = 0;
    idx->IgnoreLimitPositionsArray = NULL;

    if (!indexf->header.totalwords || totalfiles <= 0 || idx->plimit >= NO_PLIMIT)
        return;

    if (sw->verbose)
    {
        printf("\r Getting IgnoreLimit stopwords: ...");
        fflush(stdout);
    }

    /* estop[] collects the ENTRY's that get removed; it grows by doubling */
    estop = (ENTRY **) emalloc(estopmsz * sizeof(ENTRY *));


    /* this is the easy part: Remove the automatic stopwords from the hash array */
    /* Builds a list estop[] of ENTRY's that need to be removed */

    for (i = 0; i < VERYBIGHASHSIZE; i++)
    {
        /* ep2 trails ep as the last entry that stays in the chain */
        for (ep2 = NULL, ep = idx->hashentries[i]; ep; ep = ep->next)
        {
            percent = (ep->tfrequency * 100) / totalfiles;
            if (percent >= idx->plimit && ep->tfrequency >= idx->flimit)
            {
                addStopList(&indexf->header, ep->word); /* For printing list of words */
                addstophash(&indexf->header, ep->word); /* Lookup hash */

                /* unlink the ENTRY from the hash */
                if (ep2)
                    ep2->next = ep->next;
                else
                    idx->hashentries[i] = ep->next;

                idx->entryArray->numWords--;
                indexf->header.totalwords--;

                /* Reallocate if more space is needed */
                if (estopsz == estopmsz)
                {
                    estopmsz *= 2;
                    estop = (ENTRY **) erealloc(estop, estopmsz * sizeof(ENTRY *));
                }

                /* estop is an array of ENTRY's that need to be removed */
                estop[estopsz++] = ep;
            }
            else
                ep2 = ep;
        }
    }


    /* If we have automatic stopwords we have to recalculate word positions */

    if (estopsz)
    {
        /* Build an array with all the files positions to be removed */
        filepos = (struct IgnoreLimitPositions **) emalloc(totalfiles * sizeof(struct IgnoreLimitPositions *));

        for (i = 0; i < totalfiles; i++)
            filepos[i] = NULL;

        /* Compute bytes required for chunk location size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */
        for(bytes_size = 0, i = COALESCE_BUFFER_MAX_SIZE; i; i >>= 8)
            bytes_size++;

        /* Process each automatic stop word */
        for (i = 0; i < estopsz; i++)
        {
            ep = estop[i];

            if (sw->verbose)
            {
                printf("\r Getting IgnoreLimit stopwords: %25s",ep->word);
                fflush(stdout);
            }

            if(idx->swap_locdata)
            {
                /* jmruiz - Be careful with this lines!!!! If we have a lot of words,
                ** probably this code can be very slow and may be rethought.
                ** Fortunately, only a few words must usually raise a IgnoreLimit option
                */
                last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
                unSwapLocData(sw, last_loc_swap, ep );
            }

            /* Run through location list to get positions */
            for(l=ep->allLocationList;l;)
            {
                compressed_data = (unsigned char *) l;
                /* Preserve next element: the chunk starts with the link to the next chunk */
                next = *(LOCATION **)compressed_data;
                /* Jump pointer to next element */
                p = compressed_data + sizeof(LOCATION *);

                metaID = uncompress2(&p);

                /* Chunk size follows, most significant byte first, in bytes_size bytes */
                for(chunk_size = 0, k = 0, j = bytes_size - 1; k < bytes_size; k++, j--)
                    chunk_size |= p[k] << (j * 8);
                p += bytes_size;

                /* File numbers are stored as increments within a chunk */
                filenum = 0;
                while(chunk_size)
                {                   /* Read on all items */
                    q = p;
                    uncompress_location_values(&p,&flag,&tmpval,&frequency);
                    filenum += tmpval;

                    /* Small position lists are decoded on the stack */
                    if(frequency > MAX_STACK_POSITIONS)
                        positions = (int *) emalloc(frequency * sizeof(int));
                    else
                        positions = local_positions;

                    uncompress_location_positions(&p,flag,frequency,positions);

                    chunk_size -= (p-q);

                    /* Now build the list by filenum of meta/position info */

                    if (!filepos[filenum - 1])
                    {
                        fpos = (struct IgnoreLimitPositions *) emalloc(sizeof(struct IgnoreLimitPositions));
                        fpos->pos = (int *) emalloc(frequency * 2 * sizeof(int));
                        fpos->n = 0;
                        filepos[filenum - 1] = fpos;
                    }
                    else /* file exists in array.  just append the meta and position data */
                    {
                        fpos = filepos[filenum - 1];
                        fpos->pos = (int *) erealloc(fpos->pos, (fpos->n + frequency) * 2 * sizeof(int));
                    }

                    /* Append one (metaID, position) pair per occurrence */
                    for (m = fpos->n * 2, k = 0; k < frequency; k++)
                    {
                        fpos->pos[m++] = metaID;
                        fpos->pos[m++] = GET_POSITION(positions[k]);
                    }

                    fpos->n += frequency;

                    if(positions != local_positions)
                        efree(positions);
                }
                l = next;
            }
            if(idx->swap_locdata)
                Mem_ZoneReset(idx->totalLocZone);
        }

        /* sort each file sort entries by metaname/position */
        for (i = 0; i < totalfiles; i++)
        {
            if (filepos[i])
                swish_qsort(filepos[i]->pos, filepos[i]->n, 2 * sizeof(int), &icomp2);
        }
    }

    idx->nIgnoreLimitWords = estopsz;
    idx->IgnoreLimitPositionsArray = filepos;

    /* The list of removed entries is only needed while collecting positions */
    efree(estop);

    if (sw->verbose)
    {
        printf("\r Getting IgnoreLimit stopwords: Complete \n");
        fflush(stdout);
    }
}
1536 |
|
|
|
1537 |
|
|
/* 2001-08 jmruiz - Adjust positions if there was IgnoreLimit stopwords |
1538 |
|
|
** In all cases, removes null end of chunk marks */ |
1539 |
|
|
void adjustWordPositions(unsigned char *worddata, int *sz_worddata, int n_files, struct IgnoreLimitPositions **ilp) |
1540 |
|
|
{ |
1541 |
|
|
int frequency, |
1542 |
|
|
metaID, |
1543 |
|
|
tmpval, |
1544 |
|
|
r_filenum, |
1545 |
|
|
w_filenum, |
1546 |
|
|
*posdata; |
1547 |
|
|
int i,j,k; |
1548 |
|
|
unsigned long r_nextposmeta; |
1549 |
|
|
unsigned char *w_nextposmeta; |
1550 |
|
|
int local_posdata[MAX_STACK_POSITIONS]; |
1551 |
|
|
unsigned char r_flag, *w_flag; |
1552 |
|
|
unsigned char *p, *q; |
1553 |
|
|
|
1554 |
|
|
p = worddata; |
1555 |
|
|
|
1556 |
|
|
tmpval = uncompress2(&p); /* tfrequency */ |
1557 |
|
|
metaID = uncompress2(&p); /* metaID */ |
1558 |
|
|
r_nextposmeta = UNPACKLONG2(p); |
1559 |
|
|
w_nextposmeta = p; |
1560 |
|
|
p += sizeof(long); |
1561 |
|
|
|
1562 |
|
|
q = p; |
1563 |
|
|
r_filenum = w_filenum = 0; |
1564 |
|
|
while(1) |
1565 |
|
|
{ /* Read on all items */ |
1566 |
|
|
uncompress_location_values(&p,&r_flag,&tmpval,&frequency); |
1567 |
|
|
r_filenum += tmpval; |
1568 |
|
|
|
1569 |
|
|
if(frequency <= MAX_STACK_POSITIONS) |
1570 |
|
|
posdata = local_posdata; |
1571 |
|
|
else |
1572 |
|
|
posdata = (int *) emalloc(frequency * sizeof(int)); |
1573 |
|
|
|
1574 |
|
|
uncompress_location_positions(&p,r_flag,frequency,posdata); |
1575 |
|
|
|
1576 |
|
|
if(n_files && ilp && ilp[r_filenum - 1]) |
1577 |
|
|
{ |
1578 |
|
|
for(i = 0; i < ilp[r_filenum - 1]->n; i++) |
1579 |
|
|
{ |
1580 |
|
|
tmpval = ilp[r_filenum - 1]->pos[2 * i]; |
1581 |
|
|
if( tmpval >= metaID) |
1582 |
|
|
break; |
1583 |
|
|
} |
1584 |
|
|
if(tmpval == metaID) |
1585 |
|
|
{ |
1586 |
|
|
for(j = 0; j < frequency ; j++) |
1587 |
|
|
{ |
1588 |
|
|
for(k = i; k < ilp[r_filenum - 1]->n ; k++) |
1589 |
|
|
{ |
1590 |
|
|
if(ilp[r_filenum - 1]->pos[2 * k] != metaID || |
1591 |
|
|
ilp[r_filenum - 1]->pos[2 * k + 1] > GET_POSITION(posdata[j])) |
1592 |
|
|
break; /* End */ |
1593 |
|
|
} |
1594 |
|
|
posdata[j] = SET_POSDATA(GET_POSITION(posdata[j]) - (k-i), GET_STRUCTURE(posdata[j])); |
1595 |
|
|
} |
1596 |
|
|
} |
1597 |
|
|
} |
1598 |
|
|
/* Store the filenum incrementally to save space */ |
1599 |
|
|
compress_location_values(&q,&w_flag,r_filenum - w_filenum,frequency, posdata); |
1600 |
|
|
w_filenum = r_filenum; |
1601 |
|
|
|
1602 |
|
|
/* store positions */ |
1603 |
|
|
compress_location_positions(&q,w_flag,frequency,posdata); |
1604 |
|
|
|
1605 |
|
|
if(posdata != local_posdata) |
1606 |
|
|
efree(posdata); |
1607 |
|
|
|
1608 |
|
|
if(!p[0]) /* End of chunk mark */ |
1609 |
|
|
{ |
1610 |
|
|
r_filenum = 0; /* reset filenum */ |
1611 |
|
|
p++; |
1612 |
|
|
} |
1613 |
|
|
if ((p - worddata) == *sz_worddata) |
1614 |
|
|
break; /* End of worddata */ |
1615 |
|
|
|
1616 |
|
|
if ((unsigned long)(p - worddata) == r_nextposmeta) |
1617 |
|
|
{ |
1618 |
|
|
if(q != p) |
1619 |
|
|
PACKLONG2(q - worddata, w_nextposmeta); |
1620 |
|
|
|
1621 |
|
|
metaID = uncompress2(&p); |
1622 |
|
|
q = compress3(metaID,q); |
1623 |
|
|
|
1624 |
|
|
r_nextposmeta = UNPACKLONG2(p); |
1625 |
|
|
p += sizeof(long); |
1626 |
|
|
|
1627 |
|
|
w_nextposmeta = q; |
1628 |
|
|
q += sizeof(long); |
1629 |
|
|
|
1630 |
|
|
w_filenum = 0; |
1631 |
|
|
} |
1632 |
|
|
} |
1633 |
|
|
*sz_worddata = q - worddata; |
1634 |
|
|
PACKLONG2(*sz_worddata, w_nextposmeta); |
1635 |
|
|
} |
1636 |
|
|
|
1637 |
|
|
|
1638 |
|
|
|
1639 |
|
|
/* |
1640 |
|
|
** This is an all new ranking algorithm. I can't say it is based on anything, |
1641 |
|
|
** but it does seem to be better than what was used before! |
1642 |
|
|
** 2001/05 wsm |
1643 |
|
|
** |
1644 |
|
|
** Parameters: |
1645 |
|
|
** sw |
1646 |
|
|
** Pointer to SWISH structure |
1647 |
|
|
** |
1648 |
|
|
** freq |
1649 |
|
|
** Number of times this word appeared in this file |
1650 |
|
|
** |
1651 |
|
|
** tfreq |
1652 |
|
|
** Number of files this word appeared in this index (not used for ranking) |
1653 |
|
|
** |
1654 |
|
|
** words |
1655 |
|
|
** Number of words in this file |
1656 |
|
|
** |
1657 |
|
|
** structure |
1658 |
|
|
** Bit mask of context where this word appeared |
1659 |
|
|
** |
1660 |
|
|
** ignoreTotalWordCount |
1661 |
|
|
** Ignore total word count when ranking (config file parameter) |
1662 |
|
|
*/ |
1663 |
|
|
|
1664 |
|
|
|
1665 |
|
|
|
1666 |
|
|
int entrystructcmp(const void *e1, const void *e2) |
1667 |
|
|
{ |
1668 |
|
|
const ENTRY *ep1 = *(ENTRY * const *) e1; |
1669 |
|
|
const ENTRY *ep2 = *(ENTRY * const *) e2; |
1670 |
|
|
|
1671 |
|
|
return (strcmp(ep1->word, ep2->word)); |
1672 |
|
|
} |
1673 |
|
|
|
1674 |
|
|
|
1675 |
|
|
/* Sorts the words */ |
1676 |
|
|
void sort_words(SWISH * sw, IndexFILE * indexf) |
1677 |
|
|
{ |
1678 |
|
|
int i, |
1679 |
|
|
j; |
1680 |
|
|
ENTRY *e; |
1681 |
|
|
|
1682 |
|
|
|
1683 |
|
|
if (!sw->Index->entryArray || !sw->Index->entryArray->numWords) |
1684 |
|
|
return; |
1685 |
|
|
|
1686 |
|
|
|
1687 |
|
|
if (sw->verbose) |
1688 |
|
|
{ |
1689 |
|
|
printf("Sorting %d words alphabetically\n", sw->Index->entryArray->numWords ); |
1690 |
|
|
fflush(stdout); |
1691 |
|
|
} |
1692 |
|
|
|
1693 |
|
|
/* Build the array with the pointers to the entries */ |
1694 |
|
|
sw->Index->entryArray->elist = (ENTRY **) emalloc(sw->Index->entryArray->numWords * sizeof(ENTRY *)); |
1695 |
|
|
|
1696 |
|
|
/* Fill the array with all the entries */ |
1697 |
|
|
for (i = 0, j = 0; i < VERYBIGHASHSIZE; i++) |
1698 |
|
|
for (e = sw->Index->hashentries[i]; e; e = e->next) |
1699 |
|
|
sw->Index->entryArray->elist[j++] = e; |
1700 |
|
|
|
1701 |
|
|
/* Sort them */ |
1702 |
|
|
swish_qsort(sw->Index->entryArray->elist, sw->Index->entryArray->numWords, sizeof(ENTRY *), &entrystructcmp); |
1703 |
|
|
} |
1704 |
|
|
|
1705 |
|
|
|
1706 |
|
|
|
1707 |
|
|
/* Sort chunk locations of entry e by metaID, filenum */ |
1708 |
|
|
void sortChunkLocations(SWISH * sw, IndexFILE * indexf, ENTRY * e) |
1709 |
|
|
{ |
1710 |
|
|
int i, |
1711 |
|
|
j, |
1712 |
|
|
k, |
1713 |
|
|
filenum,metaID,frequency; |
1714 |
|
|
unsigned char flag; |
1715 |
|
|
unsigned char *ptmp, |
1716 |
|
|
*ptmp2, |
1717 |
|
|
*compressed_data; |
1718 |
|
|
int *pi = NULL; |
1719 |
|
|
LOCATION *l, *prev = NULL, **lp; |
1720 |
|
|
|
1721 |
|
|
/* Very trivial case */ |
1722 |
|
|
if (!e) |
1723 |
|
|
return; |
1724 |
|
|
|
1725 |
|
|
if(!e->currentChunkLocationList) |
1726 |
|
|
return; |
1727 |
|
|
|
1728 |
|
|
/* Get the number of locations in chunk */ |
1729 |
|
|
for(i = 0, l = e->currentChunkLocationList; l; i++) |
1730 |
|
|
l=*(LOCATION **)l; /* Get next location */ |
1731 |
|
|
|
1732 |
|
|
/* Compute array wide */ |
1733 |
|
|
j = 2 * sizeof(int) + sizeof(void *); |
1734 |
|
|
|
1735 |
|
|
/* Compute array size */ |
1736 |
|
|
ptmp = (void *) emalloc(j * i); |
1737 |
|
|
|
1738 |
|
|
/* Build an array with the elements to compare |
1739 |
|
|
and pointers to data */ |
1740 |
|
|
|
1741 |
|
|
for(l = e->currentChunkLocationList, ptmp2 = ptmp; l; ) |
1742 |
|
|
{ |
1743 |
|
|
pi = (int *) ptmp2; |
1744 |
|
|
|
1745 |
|
|
compressed_data = (unsigned char *)l; |
1746 |
|
|
/* Jump next offset */ |
1747 |
|
|
compressed_data += sizeof(LOCATION *); |
1748 |
|
|
|
1749 |
|
|
metaID = uncompress2(&compressed_data); |
1750 |
|
|
uncompress_location_values(&compressed_data,&flag,&filenum,&frequency); |
1751 |
|
|
pi[0] = metaID; |
1752 |
|
|
pi[1] = filenum; |
1753 |
|
|
ptmp2 += 2 * sizeof(int); |
1754 |
|
|
|
1755 |
|
|
lp = (LOCATION **)ptmp2; |
1756 |
|
|
*lp = l; |
1757 |
|
|
ptmp2 += sizeof(void *); |
1758 |
|
|
/* Get next location */ |
1759 |
|
|
l=*(LOCATION **)l; /* Get next location */ |
1760 |
|
|
} |
1761 |
|
|
|
1762 |
|
|
/* Sort them */ |
1763 |
|
|
swish_qsort(ptmp, i, j, &icomp2); |
1764 |
|
|
|
1765 |
|
|
/* Store results */ |
1766 |
|
|
for (k = 0, ptmp2 = ptmp; k < i; k++) |
1767 |
|
|
{ |
1768 |
|
|
ptmp2 += 2 * sizeof(int); |
1769 |
|
|
|
1770 |
|
|
l = *(LOCATION **)ptmp2; |
1771 |
|
|
if(!k) |
1772 |
|
|
e->currentChunkLocationList = l; |
1773 |
|
|
else |
1774 |
|
|
prev->next =l; |
1775 |
|
|
ptmp2 += sizeof(void *); |
1776 |
|
|
prev = l; |
1777 |
|
|
} |
1778 |
|
|
l->next =NULL; |
1779 |
|
|
|
1780 |
|
|
/* Free the memory of the array */ |
1781 |
|
|
efree(ptmp); |
1782 |
|
|
} |
1783 |
|
|
|
1784 |
|
|
/*
** Calls coalesce_word_locations() for every ENTRY in the word hash table,
** bucket by bucket and chain by chain.
*/
void    coalesce_all_word_locations(SWISH * sw, IndexFILE * indexf)
{
    ENTRY  *entry;
    int     bucket;

    for (bucket = 0; bucket < VERYBIGHASHSIZE; bucket++)
    {
        for (entry = sw->Index->hashentries[bucket]; entry; entry = entry->next)
            coalesce_word_locations(sw, indexf, entry);
    }
}
1802 |
|
|
|
1803 |
|
|
/* Write the index entries that hold the word, rank, and other information. |
1804 |
|
|
*/ |
1805 |
|
|
|
1806 |
|
|
|
1807 |
|
|
#ifndef USE_BTREE |
1808 |
|
|
/*
** Writes all indexed words to the index database (build without USE_BTREE).
**
** Three passes are made:
**   1. word text - walks the sorted entry array; write_word() is called for
**                  every word that is not a stopword, stopwords get
**                  u1.wordID = -1
**   2. word hash - walks the hash table; DB_WriteWordHash() is called for
**                  every entry whose wordID is > 0
**   3. word data - walks the hash table again; build_worddata() and
**                  write_worddata() are called for every entry whose
**                  wordID is > 0
**
** When sw->Index->swap_locdata is set, pass 3 reloads location data with
** unSwapLocData() each time the computed swap-file number changes and
** sorts each entry's locations with sortSwapLocData().
**
** With sw->verbose set, progress is printed to stdout (percentages only
** when there are more than 10000 words, in steps of DELTA).
**
** On return the per-document, chunk, entry and location memory zones have
** been freed, as has the sorted entry array.
*/
void    write_index(SWISH * sw, IndexFILE * indexf)
{
    int     i;
    ENTRYARRAY *ep;
    ENTRY  *epi;
    int     totalwords;
    int     percent,
            lastPercent,
            n;
    int     last_loc_swap;
    int     loc_swap;

#define DELTA 10


    if ( !(ep = sw->Index->entryArray ))
        return;                 /* nothing to do */


    totalwords = ep->numWords;

    DB_InitWriteWords(sw, indexf->DB);

    if (sw->verbose)
    {
        printf(" Writing word text: ...");
        fflush(stdout);
    }

    /* This is no longer needed, so free it as soon as possible */
    Mem_ZoneFree(&sw->Index->perDocTmpZone);


    /* This is no longer needed, so free it as soon as possible */
    Mem_ZoneFree(&sw->Index->currentChunkLocZone);

    /* If we are swapping locs to file, reset memory zone */
    if (sw->Index->swap_locdata)
        Mem_ZoneReset(sw->Index->totalLocZone);

    /* Pass 1: word text, in sorted order */
    n = lastPercent = 0;
    for (i = 0; i < totalwords; i++)
    {
        if ( sw->verbose && totalwords > 10000 )   /* just some random guess */
        {
            n++;
            percent = (n * 100) / totalwords;
            if (percent - lastPercent >= DELTA )
            {
                printf("\r Writing word text: %3d%%", percent );
                fflush(stdout);
                lastPercent = percent;
            }
        }

        epi = ep->elist[i];

        /* why check for stopwords here?  removestopwords could have removed them */
        if (!isstopword(&indexf->header, epi->word))
        {
            /* Write word to index file */
            write_word(sw, epi, indexf);
        }
        else
            epi->u1.wordID = -1;    /* flag as a stop word */
    }

    if (sw->verbose)
    {
        printf("\r Writing word text: Complete\n" );
        printf(" Writing word hash: ...");
        fflush(stdout);
    }


    /* Pass 2: word hash, in hash-table order */
    n = lastPercent = 0;
    for (i = 0; i < VERYBIGHASHSIZE; i++)
    {
        if ( sw->verbose )
        {
            n++;
            percent = (n * 100) / VERYBIGHASHSIZE;
            if (percent - lastPercent >= DELTA )
            {
                printf("\r Writing word hash: %3d%%", percent );
                fflush(stdout);
                lastPercent = percent;
            }
        }


        if ((epi = sw->Index->hashentries[i]))
        {
            while (epi)
            {
                /* If it is not a stopword write it */
                if (epi->u1.wordID > 0)
                    DB_WriteWordHash(sw, epi->word, epi->u1.wordID, indexf->DB);
                epi = epi->next;
            }
        }
    }

    if (sw->verbose)
    {
        printf("\r Writing word hash: Complete\n" );
        printf(" Writing word data: ...");
        fflush(stdout);
    }


    /* Pass 3: word data.
    ** The progress counters start at zero, like the two passes above.
    ** Only last_loc_swap needs the -1 sentinel, so that the first bucket
    ** always triggers a reload of swapped location data. */
    n = lastPercent = 0;
    last_loc_swap = -1;
    for (i = 0; i < VERYBIGHASHSIZE; i++)
    {
        /* If we are in economic mode -e restore locations */
        if (sw->Index->swap_locdata)
        {
            loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
            if (loc_swap != last_loc_swap)
            {
                /* Free no longer needed memory */
                Mem_ZoneReset(sw->Index->totalLocZone);
                last_loc_swap = loc_swap;
                unSwapLocData(sw, last_loc_swap, NULL );
            }
        }
        if ((epi = sw->Index->hashentries[i]))
        {
            while (epi)
            {
                /* If we are in economic mode -e we must sort locations by metaID, filenum */
                if (sw->Index->swap_locdata)
                {
                    sortSwapLocData(sw, epi);
                }
                if ( sw->verbose && totalwords > 10000 )   /* just some random guess */
                {
                    n++;
                    percent = (n * 100) / totalwords;
                    if (percent - lastPercent >= DELTA )
                    {
                        printf("\r Writing word data: %3d%%", percent );
                        fflush(stdout);
                        lastPercent = percent;
                    }
                }
                if (epi->u1.wordID > 0)     /* Not a stopword */
                {
                    build_worddata(sw, epi, indexf);
                    write_worddata(sw, epi, indexf);
                }
                epi = epi->next;
            }
        }
    }
    if (sw->verbose)
        printf("\r Writing word data: Complete\n" );


    DB_EndWriteWords(sw, indexf->DB);

    /* free all ENTRY structs at once */
    Mem_ZoneFree(&sw->Index->entryZone);

    /* free all location compressed data */
    Mem_ZoneFree(&sw->Index->totalLocZone);

    efree(ep->elist);
}
1974 |
|
|
|
1975 |
|
|
#else |
1976 |
|
|
|
1977 |
|
|
/*
** Writes all indexed words to the index database (USE_BTREE build).
**
** Walks the sorted entry array once.  For every word that is not a
** stopword the word data is built, then:
**   - if read_worddata() reports an existing word id, the stored data is
**     merged in with add_worddata(), the old record is deleted, the merged
**     data is written and the word id is updated;
**   - otherwise the word data and the word itself are written as new.
**
** With sw->verbose set, progress is printed to stdout (percentages only
** when there are more than 10000 words, in steps of DELTA).
**
** On return the per-document, chunk, entry and location memory zones have
** been freed, as has the sorted entry array.
*/
void    write_index(SWISH * sw, IndexFILE * indexf)
{
    ENTRYARRAY *words;
    ENTRY  *entry;
    unsigned char *old_data = NULL;     /* worddata already stored for a word */
    int     old_data_len = 0;
    long    old_wordid;
    int     total,
            idx,
            done,
            pct,
            shown_pct;

#define DELTA 10

    words = sw->Index->entryArray;
    if (!words)
        return;                 /* nothing to do */

    total = words->numWords;

    /* Write words */
    DB_InitWriteWords(sw, indexf->DB);

    if (sw->verbose)
    {
        printf(" Writing word text: ...");
        fflush(stdout);
    }

    /* Neither of these zones is needed any more: release them early */
    Mem_ZoneFree(&sw->Index->perDocTmpZone);
    Mem_ZoneFree(&sw->Index->currentChunkLocZone);

    /* When locations are swapped to file, start from an empty zone */
    if (sw->Index->swap_locdata)
        Mem_ZoneReset(sw->Index->totalLocZone);

    done = 0;
    shown_pct = 0;
    for (idx = 0; idx < total; idx++)
    {
        if (sw->verbose && total > 10000)   /* just some random guess */
        {
            done++;
            pct = (done * 100) / total;
            if (pct - shown_pct >= DELTA)
            {
                printf("\r Writing word text: %3d%%", pct );
                fflush(stdout);
                shown_pct = pct;
            }
        }

        entry = words->elist[idx];

        /* Stopwords are not written */
        if (isstopword(&indexf->header, entry->word))
            continue;

        /* Build worddata buffer, then see if the word is already indexed */
        build_worddata(sw, entry, indexf);
        old_wordid = read_worddata(sw, entry, indexf, &old_data, &old_data_len);

        if (!old_wordid)
        {
            /* New word: write its data and the word itself */
            write_worddata(sw, entry, indexf);
            write_word(sw, entry, indexf);
            continue;
        }

        /* Existing word: append the stored data, replace the old record */
        add_worddata(sw, entry, indexf, old_data, old_data_len);
        efree(old_data);
        old_data = NULL;
        old_data_len = 0;
        delete_worddata(sw, old_wordid, indexf);
        write_worddata(sw, entry, indexf);
        update_wordID(sw, entry, indexf);
    }

    if (sw->verbose)
    {
        printf("\r Writing word text: Complete\n" );
        fflush(stdout);
    }

    DB_EndWriteWords(sw, indexf->DB);

    /* free all ENTRY structs at once */
    Mem_ZoneFree(&sw->Index->entryZone);

    /* free all location compressed data */
    Mem_ZoneFree(&sw->Index->totalLocZone);

    efree(words->elist);
}
2077 |
|
|
|
2078 |
|
|
|
2079 |
|
|
#endif |
2080 |
|
|
|
2081 |
|
|
|
2082 |
|
|
|
2083 |
|
|
|
2084 |
|
|
/* These 2 routines fix the problem when a word ends with mutiple |
2085 |
|
|
** IGNORELASTCHAR's (eg, qwerty'. ). The old code correctly deleted |
2086 |
|
|
** the ".", but didn't check if the new last character ("'") is also |
2087 |
|
|
** an ignore character. |
2088 |
|
|
*/ |
2089 |
|
|
void stripIgnoreLastChars(INDEXDATAHEADER *header, char *word) |
2090 |
|
|
{ |
2091 |
|
|
int k,j,i = strlen(word); |
2092 |
|
|
|
2093 |
|
|
/* Get rid of specified last char's */ |
2094 |
|
|
/* for (i=0; word[i] != '\0'; i++); */ |
2095 |
|
|
/* Iteratively strip off the last character if it's an ignore character */ |
2096 |
|
|
while ((i > 0) && (isIgnoreLastChar(header, word[--i]))) |
2097 |
|
|
{ |
2098 |
|
|
word[i] = '\0'; |
2099 |
|
|
|
2100 |
|
|
/* We must take care of the escaped characeters */ |
2101 |
|
|
/* Things like hello\c hello\\c hello\\\c can appear */ |
2102 |
|
|
for(j=0,k=i-1;k>=0 && word[k]=='\\';k--,j++); |
2103 |
|
|
|
2104 |
|
|
/* j contains the number of \ */ |
2105 |
|
|
if(j%2) /* Remove the escape if even */ |
2106 |
|
|
{ |
2107 |
|
|
word[--i]='\0'; |
2108 |
|
|
} |
2109 |
|
|
} |
2110 |
|
|
} |
2111 |
|
|
|
2112 |
|
|
/*
** Strips leading "ignore first" characters from word, in place.
**
** Escapes are honoured (\chello, \\chello can appear): a backslash at the
** current position is skipped and the character after it is the one that
** is tested.  Scanning stops at the first character that is not an ignore
** character, or at the end of the string; the remainder is then shifted
** to the start of the buffer.
*/
void    stripIgnoreFirstChars(INDEXDATAHEADER *header, char *word)
{
    int     skip = 0;           /* number of leading chars to drop */
    int     probe;              /* index of the char being tested */
    char   *dst;
    const char *src;

    for (;;)
    {
        if (!word[skip])
            break;

        /* Jump over an escape to reach the escaped char */
        probe = (word[skip] == '\\') ? skip + 1 : skip;

        if (!word[probe] || !isIgnoreFirstChar(header, word[probe]))
            break;

        skip = probe + 1;
    }

    /* Nothing to strip */
    if (!skip)
        return;

    /* Slide the rest of the word, terminator included, to the front */
    dst = word;
    src = word + skip;
    while ((*dst++ = *src++) != '\0')
        ;
}
2147 |
|
|
|
2148 |
|
|
|
2149 |
|
|
|
2150 |
|
|
/*
** Adds word to the index once for each of the numMetaNames ids in metaID
** (every entry is added at the same *word_position), then advances
** *word_position by one.
*/
static void addword( char *word, SWISH * sw, int filenum, int structure, int numMetaNames, int *metaID, int *word_position)
{
    int     n = 0;

    /* One entry per nested metaname */
    while (n < numMetaNames)
    {
        (void) addentry(sw, getentry(sw, word), filenum, structure, metaID[n], *word_position);
        n++;
    }

    *word_position += 1;
}
2160 |
|
|
|
2161 |
|
|
|
2162 |
|
|
|
2163 |
|
|
|
2164 |
|
|
/*
** Gets the next white-space delimited word.
**
** buf     - in/out: current read position; left just past the word found
**           (or at the terminating NUL)
** word    - in/out: destination buffer, reallocated with erealloc() when
**           the word does not fit
** lenword - in/out: capacity of *word, not counting the byte reserved for
**           the terminating NUL
**
** Returns 1 when a word was copied into *word (NUL terminated), 0 when
** only white space or the end of the string was left.
*/
int     next_word( char **buf, char **word, int *lenword )
{
    int     i;

    /* skip any whitespace */
    while ( **buf && isspace( (unsigned char) **buf) )
        (*buf)++;

    i = 0;
    while ( **buf && !isspace( (unsigned char) **buf) )
    {
        /* reallocate buffer, if needed */
        if ( i >= *lenword )
        {
            /* Doubling a zero (or negative) capacity would never grow it
            ** and the store below would overrun the buffer, so always end
            ** up with room for at least one more character. */
            *lenword = ( *lenword > 0 ) ? *lenword * 2 : 1;
            *word = erealloc(*word, *lenword + 1);
        }

        (*word)[i++] = **buf;
        (*buf)++;
    }

    if ( i )
    {
        (*word)[i] = '\0';
        return 1;
    }

    return 0;
}
2195 |
|
|
|
2196 |
|
|
/* Gets the next non WordChars delimited word */ |
2197 |
|
|
/* Bumps position if needed */ |
2198 |
|
|
int next_swish_word(SWISH * sw, char **buf, char **word, int *lenword, int *word_position ) |
2199 |
|
|
{ |
2200 |
|
|
int i; |
2201 |
|
|
IndexFILE *indexf = sw->indexlist; |
2202 |
|
|
int bump_flag = 0; |
2203 |
|
|
|
2204 |
|
|
/* skip non-wordchars and check for bump chars */ |
2205 |
|
|
while ( **buf && !iswordchar(indexf->header, **buf ) ) |
2206 |
|
|
{ |
2207 |
|
|
if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf)) |
2208 |
|
|
bump_flag++; |
2209 |
|
|
|
2210 |
|
|
(*buf)++; |
2211 |
|
|
} |
2212 |
|
|
|
2213 |
|
|
i = 0; |
2214 |
|
|
while ( **buf && iswordchar(indexf->header, **buf) ) |
2215 |
|
|
{ |
2216 |
|
|
/* It doesn't really make sense to have a WordChar that's also a bump char */ |
2217 |
|
|
if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf)) |
2218 |
|
|
bump_flag++; |
2219 |
|
|
|
2220 |
|
|
|
2221 |
|
|
/* reallocate buffer, if needed */ |
2222 |
|
|
if ( i == *lenword ) |
2223 |
|
|
{ |
2224 |
|
|
*lenword *= 2; |
2225 |
|
|
*word = erealloc(*word, *lenword + 1); |
2226 |
|
|
} |
2227 |
|
|
|
2228 |
|
|
(*word)[i++] = **buf; |
2229 |
|
|
(*buf)++; |
2230 |
|
|
} |
2231 |
|
|
|
2232 |
|
|
/* If any bump chars were found then bump to prevent phrase matching */ |
2233 |
|
|
if ( bump_flag ) |
2234 |
|
|
(*word_position)++; |
2235 |
|
|
|
2236 |
|
|
if ( i ) |
2237 |
|
|
{ |
2238 |
|
|
(*word)[i] = '\0'; |
2239 |
|
|
stripIgnoreLastChars(&indexf->header, *word); |
2240 |
|
|
stripIgnoreFirstChars(&indexf->header, *word); |
2241 |
|
|
|
2242 |
|
|
return *word ? 1 : 0; |
2243 |
|
|
} |
2244 |
|
|
|
2245 |
|
|
return 0; |
2246 |
|
|
} |
2247 |
|
|
|
2248 |
|
|
/****************************************************************** |
2249 |
|
|
* Build the list of metaIDs that need to be indexed |
2250 |
|
|
* |
2251 |
|
|
* Returns number of IDs found |
2252 |
|
|
* |
2253 |
|
|
* |
2254 |
|
|
******************************************************************/ |
2255 |
|
|
static int build_metaID_list( SWISH *sw ) |
2256 |
|
|
{ |
2257 |
|
|
struct MOD_Index *idx = sw->Index; |
2258 |
|
|
METAIDTABLE *metas = &idx->metaIDtable; |
2259 |
|
|
IndexFILE *indexf = sw->indexlist; |
2260 |
|
|
INDEXDATAHEADER *header = &indexf->header; |
2261 |
|
|
struct metaEntry *m; |
2262 |
|
|
int i; |
2263 |
|
|
|
2264 |
|
|
|
2265 |
|
|
/* cache the default metaID for speed */ |
2266 |
|
|
if ( metas->defaultID == -1 ) |
2267 |
|
|
{ |
2268 |
|
|
m = getMetaNameByName( header, AUTOPROPERTY_DEFAULT ); |
2269 |
|
|
metas->defaultID = m ? m->metaID : 0; |
2270 |
|
|
} |
2271 |
|
|
|
2272 |
|
|
|
2273 |
|
|
metas->num = 0; |
2274 |
|
|
|
2275 |
|
|
|
2276 |
|
|
/* Would be smart to track number of metas flagged so not to loop through all for every lookup */ |
2277 |
|
|
|
2278 |
|
|
for ( i = 0; i < header->metaCounter; i++) |
2279 |
|
|
{ |
2280 |
|
|
m = header->metaEntryArray[i]; |
2281 |
|
|
|
2282 |
|
|
if ( (m->metaType & META_INDEX) && m->in_tag ) |
2283 |
|
|
{ |
2284 |
|
|
if ( ++metas->num > metas->max ) |
2285 |
|
|
metas->array = (int *)erealloc( metas->array, (metas->max = metas->num + 200) ); |
2286 |
|
|
|
2287 |
|
|
metas->array[metas->num - 1] = m->metaID; |
2288 |
|
|
} |
2289 |
|
|
} |
2290 |
|
|
|
2291 |
|
|
/* If no metas found to index, then add default metaID */ |
2292 |
|
|
if ( !metas->num && metas->defaultID ) |
2293 |
|
|
metas->array[metas->num++] = metas->defaultID; |
2294 |
|
|
|
2295 |
|
|
return metas->num; |
2296 |
|
|
} |
2297 |
|
|
|
2298 |
|
|
|
2299 |
|
|
/****************************************************************** |
2300 |
|
|
* Index a string |
2301 |
|
|
* |
2302 |
|
|
* |
2303 |
|
|
******************************************************************/ |
2304 |
|
|
|
2305 |
|
|
/* 05/2001 Jose Ruiz - Changed word and swishword buffers to make this routine ** thread safe */ |
2306 |
|
|
|
2307 |
|
|
|
2308 |
|
|
int indexstring(SWISH * sw, char *s, int filenum, int structure, int numMetaNames, int *metaID, int *position) |
2309 |
|
|
{ |
2310 |
|
|
int wordcount = 0; |
2311 |
|
|
|
2312 |
|
|
IndexFILE *indexf = sw->indexlist; |
2313 |
|
|
|
2314 |
|
|
char *buf_pos; /* pointer to current position */ |
2315 |
|
|
char *cur_pos; /* pointer to position with a word */ |
2316 |
|
|
|
2317 |
|
|
int stem_return; /* return value of stem operation */ |
2318 |
|
|
|
2319 |
|
|
struct MOD_Index *idx = sw->Index; |
2320 |
|
|
|
2321 |
|
|
/* Assign word buffers */ |
2322 |
|
|
char *word = idx->word; |
2323 |
|
|
int lenword = idx->lenword; |
2324 |
|
|
char *swishword = idx->swishword; |
2325 |
|
|
int lenswishword = idx->lenswishword; |
2326 |
|
|
|
2327 |
|
|
|
2328 |
|
|
|
2329 |
|
|
/* Generate list of metaIDs to index unless passed in */ |
2330 |
|
|
if ( !metaID ) |
2331 |
|
|
{ |
2332 |
|
|
if ( !(numMetaNames = build_metaID_list( sw )) ) |
2333 |
|
|
return 0; |
2334 |
|
|
else |
2335 |
|
|
metaID = idx->metaIDtable.array; |
2336 |
|
|
} |
2337 |
|
|
|
2338 |
|
|
/* current pointer into buffer */ |
2339 |
|
|
buf_pos = s; |
2340 |
|
|
|
2341 |
|
|
|
2342 |
|
|
/* get the next word as defined by whitespace */ |
2343 |
|
|
while ( next_word( &buf_pos, &word, &lenword ) ) |
2344 |
|
|
{ |
2345 |
|
|
if ( DEBUG_MASK & DEBUG_PARSED_WORDS ) |
2346 |
|
|
printf("White-space found word '%s'\n", word ); |
2347 |
|
|
|
2348 |
|
|
|
2349 |
|
|
strtolower(word); |
2350 |
|
|
|
2351 |
|
|
/* is this a useful feature? */ |
2352 |
|
|
if ( indexf->header.is_use_words_flag ) |
2353 |
|
|
{ |
2354 |
|
|
if ( isuseword(&indexf->header, word) ) |
2355 |
|
|
{ |
2356 |
|
|
addword(word, sw, filenum, structure, numMetaNames, metaID, position ); |
2357 |
|
|
wordcount++; |
2358 |
|
|
} |
2359 |
|
|
|
2360 |
|
|
continue; |
2361 |
|
|
} |
2362 |
|
|
|
2363 |
|
|
|
2364 |
|
|
/* Check for buzzwords */ |
2365 |
|
|
if ( indexf->header.buzzwords_used_flag ) |
2366 |
|
|
{ |
2367 |
|
|
/* only strip when buzzwords are being used since stripped again as a "swish word" */ |
2368 |
|
|
stripIgnoreLastChars(&indexf->header, word); |
2369 |
|
|
stripIgnoreFirstChars(&indexf->header, word); |
2370 |
|
|
if ( !*word ) /* stripped clean? */ |
2371 |
|
|
continue; |
2372 |
|
|
|
2373 |
|
|
|
2374 |
|
|
if ( isbuzzword(&indexf->header, word) ) |
2375 |
|
|
{ |
2376 |
|
|
addword(word, sw, filenum, structure, numMetaNames, metaID, position ); |
2377 |
|
|
wordcount++; |
2378 |
|
|
continue; |
2379 |
|
|
} |
2380 |
|
|
} |
2381 |
|
|
|
2382 |
|
|
|
2383 |
|
|
|
2384 |
|
|
|
2385 |
|
|
|
2386 |
|
|
/* Translate chars */ |
2387 |
|
|
TranslateChars(indexf->header.translatecharslookuptable, (unsigned char *)word); |
2388 |
|
|
|
2389 |
|
|
cur_pos = word; |
2390 |
|
|
|
2391 |
|
|
|
2392 |
|
|
|
2393 |
|
|
/* Now split the word up into "swish words" */ |
2394 |
|
|
|
2395 |
|
|
while ( next_swish_word( sw, &cur_pos, &swishword, &lenswishword, position ) ) |
2396 |
|
|
{ |
2397 |
|
|
|
2398 |
|
|
/* Weed out Numbers - or anything that's all the listed chars */ |
2399 |
|
|
if ( indexf->header.numberchars_used_flag ) |
2400 |
|
|
{ |
2401 |
|
|
unsigned char *c = (unsigned char *)swishword; |
2402 |
|
|
|
2403 |
|
|
/* look for any char that's NOT in the lookup table */ |
2404 |
|
|
while ( *c ) { |
2405 |
|
|
if ( !indexf->header.numbercharslookuptable[(int) *c ] ) |
2406 |
|
|
break; |
2407 |
|
|
c++; |
2408 |
|
|
} |
2409 |
|
|
|
2410 |
|
|
/* if got all the way through the string then it's only those chars */ |
2411 |
|
|
if ( !*c ) |
2412 |
|
|
continue; /* skip this word */ |
2413 |
|
|
} |
2414 |
|
|
|
2415 |
|
|
|
2416 |
|
|
/* Check Begin & EndCharacters */ |
2417 |
|
|
if (!indexf->header.begincharslookuptable[(int) ((unsigned char) swishword[0])]) |
2418 |
|
|
continue; |
2419 |
|
|
|
2420 |
|
|
if (!indexf->header.endcharslookuptable[(int) ((unsigned char) swishword[strlen(swishword) - 1])]) |
2421 |
|
|
continue; |
2422 |
|
|
|
2423 |
|
|
|
2424 |
|
|
/* limit by stopwords, min/max length, max number of digits, ... */ |
2425 |
|
|
if (!isokword(sw, swishword, indexf)) |
2426 |
|
|
continue; |
2427 |
|
|
|
2428 |
|
|
/* Now translate word if fuzzy mode */ |
2429 |
|
|
|
2430 |
|
|
switch ( indexf->header.fuzzy_mode ) |
2431 |
|
|
{ |
2432 |
|
|
case FUZZY_NONE: |
2433 |
|
|
addword(swishword, sw, filenum, structure, numMetaNames, metaID, position ); |
2434 |
|
|
wordcount++; |
2435 |
|
|
break; |
2436 |
|
|
|
2437 |
|
|
case FUZZY_STEMMING: |
2438 |
|
|
stem_return = Stem(&swishword, &lenswishword); |
2439 |
|
|
|
2440 |
|
|
/* === |
2441 |
|
|
if ( stem_return == STEM_NOT_ALPHA ) printf("Stem: not alpha in '%s'\n", swishword ); |
2442 |
|
|
if ( stem_return == STEM_TOO_SMALL ) printf("Stem: too small in '%s'\n", swishword ); |
2443 |
|
|
if ( stem_return == STEM_WORD_TOO_BIG ) printf("Stem: too big to stem in '%s'\n", swishword ); |
2444 |
|
|
if ( stem_return == STEM_TO_NOTHING ) printf("Stem: stems to nothing '%s'\n", swishword ); |
2445 |
|
|
=== */ |
2446 |
|
|
|
2447 |
|
|
addword(swishword, sw, filenum, structure, numMetaNames, metaID, position ); |
2448 |
|
|
wordcount++; |
2449 |
|
|
break; |
2450 |
|
|
|
2451 |
|
|
|
2452 |
|
|
case FUZZY_SOUNDEX: |
2453 |
|
|
soundex(swishword); |
2454 |
|
|
addword(swishword, sw, filenum, structure, numMetaNames, metaID, position ); |
2455 |
|
|
wordcount++; |
2456 |
|
|
break; |
2457 |
|
|
|
2458 |
|
|
case FUZZY_METAPHONE: |
2459 |
|
|
case FUZZY_DOUBLE_METAPHONE: |
2460 |
|
|
{ |
2461 |
|
|
char *codes[2]; |
2462 |
|
|
DoubleMetaphone(swishword, codes); |
2463 |
|
|
|
2464 |
|
|
if ( !(*codes[0]) ) |
2465 |
|
|
{ |
2466 |
|
|
efree( codes[0] ); |
2467 |
|
|
efree( codes[1] ); |
2468 |
|
|
addword(swishword, sw, filenum, structure, numMetaNames, metaID, position ); |
2469 |
|
|
wordcount++; |
2470 |
|
|
break; |
2471 |
|
|
} |
2472 |
|
|
addword(codes[0], sw, filenum, structure, numMetaNames, metaID, position ); |
2473 |
|
|
wordcount++; |
2474 |
|
|
|
2475 |
|
|
if ( indexf->header.fuzzy_mode == FUZZY_DOUBLE_METAPHONE && *(codes[1]) && strcmp(codes[0], codes[1]) ) |
2476 |
|
|
{ |
2477 |
|
|
(*position)--; /* at same position as first word */ |
2478 |
|
|
addword(codes[1], sw, filenum, structure, numMetaNames, metaID, position ); |
2479 |
|
|
wordcount++; |
2480 |
|
|
} |
2481 |
|
|
|
2482 |
|
|
efree( codes[0] ); |
2483 |
|
|
efree( codes[1] ); |
2484 |
|
|
} |
2485 |
|
|
|
2486 |
|
|
break; |
2487 |
|
|
|
2488 |
|
|
|
2489 |
|
|
default: |
2490 |
|
|
progerr("Invalid FuzzyMode '%d'", (int)indexf->header.fuzzy_mode ); |
2491 |
|
|
} |
2492 |
|
|
} |
2493 |
|
|
} |
2494 |
|
|
|
2495 |
|
|
/* Buffers can be reallocated - So, reasign them */ |
2496 |
|
|
idx->word = word; |
2497 |
|
|
idx->lenword = lenword; |
2498 |
|
|
idx->swishword = swishword; |
2499 |
|
|
idx->lenswishword = lenswishword; |
2500 |
|
|
|
2501 |
|
|
return wordcount; |
2502 |
|
|
} |
2503 |
|
|
|
2504 |
|
|
|
2505 |
|
|
/* Coalesce word current word location into the linked list */ |
2506 |
|
|
void add_coalesced(SWISH *sw, ENTRY *e, unsigned char *coalesced, int sz_coalesced, int metaID) |
2507 |
|
|
{ |
2508 |
|
|
int tmp; |
2509 |
|
|
LOCATION *tloc, *tprev; |
2510 |
|
|
LOCATION **tmploc, **tmploc2; |
2511 |
|
|
unsigned char *tp; |
2512 |
|
|
|
2513 |
|
|
|
2514 |
|
|
/* Check for economic mode (-e) and swap data to disk */ |
2515 |
|
|
if(sw->Index->swap_locdata) |
2516 |
|
|
{ |
2517 |
|
|
tmploc = (LOCATION **)coalesced; |
2518 |
|
|
*tmploc = (LOCATION *)e; /* Preserve e in buffer */ |
2519 |
|
|
/* The cast is for avoiding the warning */ |
2520 |
|
|
SwapLocData(sw, e, coalesced, sz_coalesced); |
2521 |
|
|
return; |
2522 |
|
|
} |
2523 |
|
|
|
2524 |
|
|
/* Add to the linked list keeping the data sorted by metaname, filenum */ |
2525 |
|
|
for(tprev =NULL, tloc = e->allLocationList; tloc; ) |
2526 |
|
|
{ |
2527 |
|
|
tp = (unsigned char *)tloc + sizeof(void *); |
2528 |
|
|
tmp = uncompress2(&tp); /* Read metaID */ |
2529 |
|
|
if(tmp > metaID) |
2530 |
|
|
break; |
2531 |
|
|
tprev = tloc; |
2532 |
|
|
tmploc = (LOCATION **)tloc; |
2533 |
|
|
tloc = *tmploc; |
2534 |
|
|
} |
2535 |
|
|
|
2536 |
|
|
if(! tprev) |
2537 |
|
|
{ |
2538 |
|
|
tmploc = (LOCATION **)coalesced; |
2539 |
|
|
*tmploc = e->allLocationList; |
2540 |
|
|
e->allLocationList = (LOCATION *)coalesced; |
2541 |
|
|
} |
2542 |
|
|
else |
2543 |
|
|
{ |
2544 |
|
|
tmploc = (LOCATION **)coalesced; |
2545 |
|
|
tmploc2 = (LOCATION **)tprev; |
2546 |
|
|
*tmploc = *tmploc2; |
2547 |
|
|
*tmploc2 = (LOCATION *)coalesced; |
2548 |
|
|
} |
2549 |
|
|
} |
2550 |
|
|
|
2551 |
|
|
|
2552 |
|
|
/*
** Close the chunk currently assembled in buf[0 .. end) and attach it to e.
**
** The payload length (the bytes written after the size field) is stored in
** the bytes_size bytes starting at size_p, most significant byte first.
** The whole chunk is then copied into memory obtained from
** sw->Index->totalLocZone and passed to add_coalesced().
*/
static void flush_coalesced_chunk(SWISH * sw, ENTRY * e,
                                  unsigned char *buf, unsigned char *end,
                                  unsigned char *size_p, int bytes_size, int metaID)
{
    unsigned char *coalesced_buffer;
    int     payload_size;
    int     i, j;

    /* Payload is everything that follows the size field */
    payload_size = end - (size_p + bytes_size);

    /* Write the size, high-order byte first */
    for (i = 0, j = bytes_size - 1; i < bytes_size; i++, j--)
        size_p[i] = payload_size >> (j * 8);

    /* The scratch buffer gets reused, so the chunk needs its own storage */
    coalesced_buffer = (unsigned char *) Mem_ZoneAlloc(sw->Index->totalLocZone, end - buf);
    memcpy(coalesced_buffer, buf, end - buf);

    /* Add to the linked list keeping the data sorted by metaname, filenum */
    add_coalesced(sw, e, coalesced_buffer, end - buf, metaID);
}

/*
** Write the header of a new chunk at the start of buf:
**   room for the linked list pointer | compressed metaID | room for the size
** *size_p receives the position of the size field (filled in at flush time).
** Returns the position where the first location entry must be written.
*/
static unsigned char *begin_coalesced_chunk(unsigned char *buf, int metaID,
                                            int bytes_size, unsigned char **size_p)
{
    unsigned char *q;

    q = buf + sizeof(void *);       /* Make room for linked list pointer */
    q = compress3(metaID, q);       /* Add metaID */
    *size_p = q;                    /* Preserve position for size */
    return q + bytes_size;          /* Make room for size */
}

/*
** Coalesce the pending locations of word entry e (e->currentChunkLocationList)
** into compact chunks and hand each chunk to add_coalesced().
**
** The pending locations are first sorted with sortChunkLocations().  They are
** then re-encoded one after another into a scratch buffer; consecutive
** locations sharing a metaID go into the same chunk, and inside a chunk the
** file number is stored as the difference from the previous file number.
** A chunk is flushed when the metaID changes, when the next location might
** not fit in the scratch buffer, and once more at the end of the list.
**
** On return e->currentChunkLocationList and e->currentlocation are NULL.
** When sw->Index->swap_locdata is set, sw->Index->totalLocZone is reset too.
**
** Calls progerr() if a single location cannot fit in an empty chunk.
** The scratch buffer is static, so this routine is not reentrant.
*/
void coalesce_word_locations(SWISH * sw, IndexFILE * indexf, ENTRY *e)
{
    int     curmetaID, metaID,
            curfilenum, filenum,
            frequency,
            num_locs,
            bytes_size,
            worst_case_size;
    int     tmp;
    unsigned char *p, *q, *size_p = NULL;
    unsigned char uflag, *cflag;
    LOCATION *loc, *next;
    static unsigned char buffer[COALESCE_BUFFER_MAX_SIZE];
    int    *posdata;
    int     local_posdata[MAX_STACK_POSITIONS];

    /* Check for new locations in the current chunk */
    if (!e->currentChunkLocationList)
        return;

    /* Compute bytes required for size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */
    for (bytes_size = 0, tmp = COALESCE_BUFFER_MAX_SIZE; tmp; tmp >>= 8)
        bytes_size++;

    /* Sort all pending word locations by metaID, filenum */
    sortChunkLocations(sw, indexf, e);

    /* Init vars */
    /* curmetaID == 0 means "no chunk started yet"; presumably real metaIDs */
    /* are never 0 -- NOTE(review): confirm against the metaname tables */
    curmetaID = 0;
    curfilenum = 0;
    q = buffer;                 /* Destination buffer */
    num_locs = 0;               /* Number of coalesced LOCATIONS */

    /* Run on all locations */
    for (loc = e->currentChunkLocationList; loc; loc = next)
    {
        p = (unsigned char *) loc;

        /* A LOCATION starts with the pointer to the next one in the list */
        next = *(LOCATION **) loc;
        p += sizeof(LOCATION *);

        /* get metaID of LOCATION */
        metaID = uncompress2(&p);

        /* Check for new metaID */
        if (metaID != curmetaID)
        {
            /* If previous data exists, add it to the linked list */
            if (curmetaID)
                flush_coalesced_chunk(sw, e, buffer, q, size_p, bytes_size, curmetaID);

            /* Reset values and start a chunk for the new metaID */
            curfilenum = 0;
            curmetaID = metaID;
            q = begin_coalesced_chunk(buffer, metaID, bytes_size, &size_p);
            num_locs = 0;
        }

        uncompress_location_values(&p, &uflag, &filenum, &frequency);
        worst_case_size = sizeof(unsigned char *) + (3 + frequency) * MAXINTCOMPSIZE;

        /*
         * Flush early if this location might not fit.  The test is done on
         * offsets rather than on "q + worst_case_size" so that no pointer
         * beyond the end of buffer is ever formed.
         */
        while ((size_t) ((q - buffer) + worst_case_size) > sizeof(buffer))
        {
            /* An empty chunk that still cannot hold the location is fatal */
            if (!num_locs)
                progerr("Buffer too short in coalesce_word_locations. Increase COALESCE_BUFFER_MAX_SIZE in config.h and rebuild.");

            flush_coalesced_chunk(sw, e, buffer, q, size_p, bytes_size, curmetaID);

            /* Reset values and continue in a fresh chunk for the same metaID */
            curfilenum = 0;
            curmetaID = metaID;
            q = begin_coalesced_chunk(buffer, metaID, bytes_size, &size_p);
            num_locs = 0;
        }

        /* Positions go on the stack unless there are too many of them */
        if (frequency > MAX_STACK_POSITIONS)
            posdata = emalloc(frequency * sizeof(int));
        else
            posdata = local_posdata;

        uncompress_location_positions(&p, uflag, frequency, posdata);

        /* Store the filenum incrementally to save space */
        compress_location_values(&q, &cflag, filenum - curfilenum, frequency, posdata);

        curfilenum = filenum;

        compress_location_positions(&q, cflag, frequency, posdata);

        if (frequency > MAX_STACK_POSITIONS)
            efree(posdata);

        num_locs++;
    }

    /* Flush whatever is left in the last chunk */
    if (num_locs)
        flush_coalesced_chunk(sw, e, buffer, q, size_p, bytes_size, curmetaID);

    e->currentChunkLocationList = NULL;
    e->currentlocation = NULL;

    /* If we are swapping locs to file, also reset the corresponding memory zone */
    if (sw->Index->swap_locdata)
        Mem_ZoneReset(sw->Index->totalLocZone);
}
2704 |
|
|
|