1 |
/* |
2 |
** |
3 |
** This program and library is free software; you can redistribute it and/or |
4 |
** modify it under the terms of the GNU (Library) General Public License |
5 |
** as published by the Free Software Foundation; either version 2 |
6 |
** of the License, or any later version. |
7 |
** |
8 |
** This program is distributed in the hope that it will be useful, |
9 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 |
** GNU (Library) General Public License for more details. |
12 |
** |
13 |
** You should have received a copy of the GNU (Library) General Public License |
14 |
** along with this program; if not, write to the Free Software |
15 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
16 |
** |
17 |
** |
18 |
** |
19 |
** 2001-05-07 jmruiz init coding |
20 |
** |
21 |
*/ |
22 |
|
23 |
#include "swish.h" |
24 |
#include "mem.h" |
25 |
#include "string.h" |
26 |
#include "index.h" |
27 |
#include "hash.h" |
28 |
#include "date_time.h" |
29 |
#include "compress.h" |
30 |
#include "error.h" |
31 |
#include "metanames.h" |
32 |
#include "db.h" |
33 |
#include "db_native.h" |
34 |
// #include "db_berkeley_db.h" |
35 |
|
36 |
#ifndef min
/* Smaller of two values.
** Both arguments and the whole expansion are parenthesized so the macro
** can be used safely inside a larger expression (e.g. min(a, b) * 2).
** Each argument may be evaluated twice: do not pass expressions with
** side effects. */
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
39 |
|
40 |
/* |
41 |
-- init structures for this module |
42 |
*/ |
43 |
|
44 |
void initModule_DB (SWISH *sw) |
45 |
{ |
46 |
/* Allocate structure */ |
47 |
initModule_DBNative(sw); |
48 |
// initModule_DB_db(sw); |
49 |
return; |
50 |
} |
51 |
|
52 |
|
53 |
/* |
54 |
-- release all wired memory for this module |
55 |
*/ |
56 |
|
57 |
void freeModule_DB (SWISH *sw) |
58 |
{ |
59 |
freeModule_DBNative(sw); |
60 |
// freeModule_DB_db(sw); |
61 |
return; |
62 |
} |
63 |
|
64 |
|
65 |
|
66 |
/* ---------------------------------------------- */ |
67 |
|
68 |
|
69 |
|
70 |
/* |
71 |
-- Config Directives |
72 |
-- Configuration directives for this Module |
73 |
-- return: 0/1 = none/config applied |
74 |
*/ |
75 |
|
76 |
int configModule_DB (SWISH *sw, StringList *sl) |
77 |
{ |
78 |
//struct MOD_DB *DB = sw->Db; |
79 |
// char *w0 = sl->word[0]; |
80 |
int retval = 1; |
81 |
|
82 |
|
83 |
retval = 0; // tmp due to empty routine |
84 |
|
85 |
return retval; |
86 |
} |
87 |
|
88 |
|
89 |
/* General write DB routines - Common to all DB */ |
90 |
|
91 |
/* Header routines */ |
92 |
|
93 |
/*
** write_header_int(sw, id, num, DB)
**   Stores one integer header record: num is copied into an unsigned long,
**   passed through PACKLONG and handed to DB_WriteHeaderData as
**   sizeof(long) bytes.
**
** write_header_int2(sw, id, num1, num2, DB)
**   Same, for a pair of integers written as one record of
**   2 * sizeof(long) bytes (num1 first).
**
** Both are wrapped in do { } while (0) so that an invocation followed by
** ';' is exactly one statement, even in an unbraced if/else.
*/
#define write_header_int(sw,id,num,DB) \
    do { \
        unsigned long itmp = (num); \
        itmp = PACKLONG(itmp); \
        DB_WriteHeaderData((sw), (id), (unsigned char *)&itmp, sizeof(long), (DB)); \
    } while (0)

#define write_header_int2(sw,id,num1,num2,DB) \
    do { \
        unsigned long itmp[2]; \
        itmp[0] = (num1); \
        itmp[1] = (num2); \
        itmp[0] = PACKLONG(itmp[0]); \
        itmp[1] = PACKLONG(itmp[1]); \
        DB_WriteHeaderData((sw), (id), (unsigned char *)itmp, sizeof(long) * 2, (DB)); \
    } while (0)
95 |
|
96 |
|
97 |
/* Writes one NUL-terminated string (terminator included) as a header record */
static void db_write_header_string(SWISH *sw, int id, const char *str, void *DB)
{
    DB_WriteHeaderData(sw, id, (unsigned char *) str, strlen(str) + 1, DB);
}

/*
** Writes the whole index header into DB.
**
**   header      - source of the descriptive strings, word limits, character
**                 sets, lookup table, stop/buzz word hashes and metanames
**   filename    - index file name; only the part after the last '/' is
**                 stored (the full string when there is no '/' or when the
**                 '/' is the last character)
**   totalwords,
**   totalfiles  - stored together as one record
**   merged      - stored as the MERGED_ID record
**
** The records are emitted between DB_InitWriteHeader and DB_EndWriteHeader
** in the order shown below.
*/
void write_header(SWISH *sw, INDEXDATAHEADER * header, void * DB, char *filename, int totalwords, int totalfiles, int merged)
{
    char   *slash = (char *) strrchr(filename, '/');
    char   *savedas;
    char   *indexed_on;

    /* Strip the directory part unless nothing would be left */
    savedas = (slash && slash[1]) ? slash + 1 : filename;

    DB_InitWriteHeader(sw, DB);

    db_write_header_string(sw, INDEXHEADER_ID, INDEXHEADER, DB);
    db_write_header_string(sw, INDEXVERSION_ID, INDEXVERSION, DB);
    write_header_int(sw, MERGED_ID, merged, DB);
    db_write_header_string(sw, NAMEHEADER_ID, header->indexn, DB);
    db_write_header_string(sw, SAVEDASHEADER_ID, savedas, DB);
    write_header_int2(sw, COUNTSHEADER_ID, totalwords, totalfiles, DB);

    /* The date string is obtained here and released once written */
    indexed_on = getTheDateISO();
    db_write_header_string(sw, INDEXEDONHEADER_ID, indexed_on, DB);
    efree(indexed_on);

    db_write_header_string(sw, DESCRIPTIONHEADER_ID, header->indexd, DB);
    db_write_header_string(sw, POINTERHEADER_ID, header->indexp, DB);
    db_write_header_string(sw, MAINTAINEDBYHEADER_ID, header->indexa, DB);
    write_header_int(sw, DOCPROPENHEADER_ID, 1, DB);

    write_header_int(sw, FUZZYMODEHEADER_ID, header->fuzzy_mode, DB);

    write_header_int(sw, IGNORETOTALWORDCOUNTWHENRANKING_ID, header->ignoreTotalWordCountWhenRanking, DB);
    db_write_header_string(sw, WORDCHARSHEADER_ID, header->wordchars, DB);
    write_header_int(sw, MINWORDLIMHEADER_ID, header->minwordlimit, DB);
    write_header_int(sw, MAXWORDLIMHEADER_ID, header->maxwordlimit, DB);
    db_write_header_string(sw, BEGINCHARSHEADER_ID, header->beginchars, DB);
    db_write_header_string(sw, ENDCHARSHEADER_ID, header->endchars, DB);
    db_write_header_string(sw, IGNOREFIRSTCHARHEADER_ID, header->ignorefirstchar, DB);
    db_write_header_string(sw, IGNORELASTCHARHEADER_ID, header->ignorelastchar, DB);
    /* FILEINFOCOMPRESSION_ID is no longer written (removed - patents) */

    /* Translate-characters lookup table */
    write_integer_table_to_header(sw, TRANSLATECHARTABLE_ID, header->translatecharslookuptable, sizeof(header->translatecharslookuptable) / sizeof(int), DB);

    /* Other header stuff: stopwords, metanames, buzzwords */
    write_words_to_header(sw, STOPWORDS_ID, header->hashstoplist, DB);
    write_MetaNames(sw, METANAMES_ID, header, DB);
    write_words_to_header(sw, BUZZWORDS_ID, header->hashbuzzwordlist, DB);

#ifndef USE_BTREE
    /* Total words per file array, only when it is used for ranking */
    if ( !header->ignoreTotalWordCountWhenRanking )
        write_integer_table_to_header(sw, TOTALWORDSPERFILE_ID, header->TotalWordsPerFile, totalfiles, DB);
#endif

    DB_EndWriteHeader(sw, DB);
}
160 |
|
161 |
|
162 |
/* Jose Ruiz 11/00 |
163 |
** Function to write a word to the index DB |
164 |
*/ |
165 |
void write_word(SWISH * sw, ENTRY * ep, IndexFILE * indexf) |
166 |
{ |
167 |
long wordID; |
168 |
|
169 |
wordID = DB_GetWordID(sw, indexf->DB); |
170 |
|
171 |
DB_WriteWord(sw, ep->word,wordID,indexf->DB); |
172 |
/* Store word offset for futher hash computing */ |
173 |
ep->u1.wordID = wordID; |
174 |
|
175 |
} |
176 |
|
177 |
#ifdef USE_BTREE |
178 |
/* 04/2002 jmruiz |
179 |
** Routine to update wordID |
180 |
*/ |
181 |
void update_wordID(SWISH * sw, ENTRY * ep, IndexFILE * indexf) |
182 |
{ |
183 |
long wordID; |
184 |
|
185 |
wordID = DB_GetWordID(sw, indexf->DB); |
186 |
|
187 |
DB_UpdateWordID(sw, ep->word,wordID,indexf->DB); |
188 |
/* Store word offset for futher hash computing */ |
189 |
ep->u1.wordID = wordID; |
190 |
} |
191 |
|
192 |
/* Deletes the word data stored under wordID from the index DB
** (thin wrapper around DB_DeleteWordData).
*/
void delete_worddata(SWISH * sw, long wordID, IndexFILE * indexf)
{
    DB_DeleteWordData(sw, wordID, indexf->DB);
}
196 |
|
197 |
#endif |
198 |
|
199 |
/* Jose Ruiz 11/00 |
200 |
** Function to write all word's data to the index DB |
201 |
*/ |
202 |
|
203 |
|
204 |
void build_worddata(SWISH * sw, ENTRY * ep, IndexFILE * indexf) |
205 |
{ |
206 |
int i, j, |
207 |
curmetaID, |
208 |
sz_worddata; |
209 |
unsigned long tmp, |
210 |
curmetanamepos; |
211 |
int metaID; |
212 |
int bytes_size, |
213 |
chunk_size; |
214 |
unsigned char *compressed_data, |
215 |
*p,*q; |
216 |
LOCATION *l, *next; |
217 |
|
218 |
|
219 |
curmetaID=0; |
220 |
curmetanamepos=0L; |
221 |
q=sw->Index->worddata_buffer; |
222 |
|
223 |
/* Compute bytes required for chunk location size. Eg: 4096 -> 2 bytes, 65535 -> 2 bytes */ |
224 |
for(bytes_size = 0, i = COALESCE_BUFFER_MAX_SIZE; i; i >>= 8) |
225 |
bytes_size++; |
226 |
|
227 |
/* Write tfrequency */ |
228 |
q = compress3(ep->tfrequency,q); |
229 |
|
230 |
/* Write location list */ |
231 |
for(l=ep->allLocationList;l;) |
232 |
{ |
233 |
compressed_data = (unsigned char *) l; |
234 |
/* Get next element */ |
235 |
next = *(LOCATION **)compressed_data; |
236 |
/* Jump pointer to next element */ |
237 |
p = compressed_data + sizeof(LOCATION *); |
238 |
|
239 |
metaID = uncompress2(&p); |
240 |
|
241 |
for(chunk_size = 0, i = 0, j = bytes_size - 1; i < bytes_size; i++, j--) |
242 |
chunk_size |= p[i] << (j * 8); |
243 |
p += bytes_size; |
244 |
|
245 |
if(curmetaID!=metaID) |
246 |
{ |
247 |
if(curmetaID) |
248 |
{ |
249 |
/* Write in previous meta (curmetaID) |
250 |
** file offset to next meta */ |
251 |
tmp=q - sw->Index->worddata_buffer; |
252 |
PACKLONG2(tmp,sw->Index->worddata_buffer+curmetanamepos); |
253 |
} |
254 |
/* Check for enough memory */ |
255 |
/* |
256 |
** MAXINTCOMPSIZE is for the worst case metaID |
257 |
** |
258 |
** sizeof(long) is to leave four bytes to |
259 |
** store the offset of the next metaname |
260 |
** (it will be 0 if no more metanames). |
261 |
** |
262 |
** 1 is for the trailing '\0' |
263 |
*/ |
264 |
|
265 |
tmp=q - sw->Index->worddata_buffer; |
266 |
|
267 |
if((long)(tmp + MAXINTCOMPSIZE + sizeof(long) + 1) >= (long)sw->Index->len_worddata_buffer) |
268 |
{ |
269 |
sw->Index->len_worddata_buffer=sw->Index->len_worddata_buffer*2+MAXINTCOMPSIZE+sizeof(long)+1; |
270 |
sw->Index->worddata_buffer=(unsigned char *) erealloc(sw->Index->worddata_buffer,sw->Index->len_worddata_buffer); |
271 |
q=sw->Index->worddata_buffer+tmp; /* reasign pointer inside buffer */ |
272 |
} |
273 |
|
274 |
/* store metaID in buffer */ |
275 |
curmetaID=metaID; |
276 |
q = compress3(curmetaID,q); |
277 |
|
278 |
/* preserve position for offset to next |
279 |
** metaname. We do not know its size |
280 |
** so store it as a packed long */ |
281 |
curmetanamepos=q - sw->Index->worddata_buffer; |
282 |
|
283 |
/* Store 0 and increase pointer */ |
284 |
tmp=0L; |
285 |
PACKLONG2(tmp,q); |
286 |
|
287 |
q+=sizeof(unsigned long); |
288 |
} |
289 |
/* Store all data for this chunk */ |
290 |
/* First check for enough space |
291 |
** |
292 |
** 1 is for the trailing '\0' |
293 |
*/ |
294 |
|
295 |
tmp=q - sw->Index->worddata_buffer; |
296 |
|
297 |
if((long)(tmp + chunk_size + 1) >= (long)sw->Index->len_worddata_buffer) |
298 |
{ |
299 |
sw->Index->len_worddata_buffer=sw->Index->len_worddata_buffer*2+chunk_size+1; |
300 |
sw->Index->worddata_buffer=(unsigned char *) erealloc(sw->Index->worddata_buffer,sw->Index->len_worddata_buffer); |
301 |
q=sw->Index->worddata_buffer+tmp; /* reasign pointer inside buffer */ |
302 |
} |
303 |
|
304 |
/* Copy it and advance pointer */ |
305 |
memcpy(q,p,chunk_size); |
306 |
q += chunk_size; |
307 |
|
308 |
/* End of chunk mark -> Write trailing '\0' */ |
309 |
*q++ = '\0'; |
310 |
|
311 |
l = next; |
312 |
} |
313 |
|
314 |
/* Write in previous meta (curmetaID) |
315 |
** file offset to end of metas */ |
316 |
tmp=q - sw->Index->worddata_buffer; |
317 |
PACKLONG2(tmp,sw->Index->worddata_buffer+curmetanamepos); |
318 |
|
319 |
sz_worddata = q - sw->Index->worddata_buffer; |
320 |
|
321 |
/* Adjust word positions. |
322 |
** if ignorelimit was set and some new stopwords weee found, positions |
323 |
** are recalculated |
324 |
** Also call it even if we have not set IgnoreLimit to calesce word chunks |
325 |
** and remove trailing 0 from chunks to save some bytes |
326 |
*/ |
327 |
adjustWordPositions(sw->Index->worddata_buffer, &sz_worddata, sw->indexlist->header.totalfiles, sw->Index->IgnoreLimitPositionsArray); |
328 |
|
329 |
sw->Index->sz_worddata_buffer = sz_worddata; |
330 |
} |
331 |
|
332 |
/* 04/2002 jmruiz |
333 |
** New simpler routine to write worddata |
334 |
*/ |
335 |
void write_worddata(SWISH * sw, ENTRY * ep, IndexFILE * indexf ) |
336 |
{ |
337 |
DB_WriteWordData(sw, ep->u1.wordID,sw->Index->worddata_buffer,sw->Index->sz_worddata_buffer,indexf->DB); |
338 |
|
339 |
} |
340 |
|
341 |
|
342 |
/* 04/2002 jmruiz |
343 |
** Function to read all word's data from the index DB |
344 |
*/ |
345 |
|
346 |
|
347 |
long read_worddata(SWISH * sw, ENTRY * ep, IndexFILE * indexf, unsigned char **buffer, int *sz_buffer) |
348 |
{ |
349 |
long wordID; |
350 |
char *word = ep->word; |
351 |
|
352 |
DB_InitReadWords(sw, indexf->DB); |
353 |
DB_ReadWordHash(sw, word, &wordID, indexf->DB); |
354 |
|
355 |
if(!wordID) |
356 |
{ |
357 |
DB_EndReadWords(sw, indexf->DB); |
358 |
sw->lasterror = WORD_NOT_FOUND; |
359 |
*buffer = NULL; |
360 |
*sz_buffer = 0; |
361 |
return 0L; |
362 |
} |
363 |
DB_ReadWordData(sw, wordID, buffer, sz_buffer, indexf->DB); |
364 |
DB_EndReadWords(sw, indexf->DB); |
365 |
return wordID; |
366 |
} |
367 |
|
368 |
/* 04/2002 jmruiz |
369 |
** Routine to merge two buffers of worddata |
370 |
*/ |
371 |
void add_worddata(SWISH *sw, ENTRY *epi, IndexFILE *indexf, unsigned char *olddata, int sz_olddata) |
372 |
{ |
373 |
int maxtotsize; |
374 |
unsigned char stack_buffer[32000]; /* Just to try malloc/free fragmentation */ |
375 |
unsigned char *newdata; |
376 |
int sz_newdata; |
377 |
int tfreq1, tfreq2; |
378 |
unsigned char *p1, *p2, *p; |
379 |
int curmetaID_1,curmetaID_2; |
380 |
unsigned long nextposmetaname_1,nextposmetaname_2, curmetanamepos, curmetanamepos_1, curmetanamepos_2, tmp; |
381 |
int last_filenum, filenum, tmpval, frequency, *posdata; |
382 |
#define POSDATA_STACK 2000 |
383 |
int stack_posdata[POSDATA_STACK]; /* Just to avoid the overhead of malloc/free */ |
384 |
unsigned char r_flag, *w_flag; |
385 |
unsigned char *q; |
386 |
|
387 |
/* First of all, ckeck for size in buffer */ |
388 |
maxtotsize = sw->Index->sz_worddata_buffer + sz_olddata; |
389 |
if(maxtotsize > sw->Index->len_worddata_buffer) |
390 |
{ |
391 |
sw->Index->len_worddata_buffer = maxtotsize + 2000; |
392 |
sw->Index->worddata_buffer = (unsigned char *) erealloc(sw->Index->worddata_buffer,sw->Index->len_worddata_buffer); |
393 |
} |
394 |
/* Preserve new data in a local copy - sw->Index->worddata_buffer is the final destination |
395 |
** of data |
396 |
*/ |
397 |
if(sw->Index->sz_worddata_buffer > sizeof(stack_buffer)) |
398 |
newdata = (unsigned char *) emalloc(sw->Index->sz_worddata_buffer); |
399 |
else |
400 |
newdata = stack_buffer; |
401 |
sz_newdata = sw->Index->sz_worddata_buffer; |
402 |
memcpy(newdata,sw->Index->worddata_buffer, sz_newdata); |
403 |
|
404 |
/* Set pointers to all buffers */ |
405 |
p1 = olddata; |
406 |
p2 = newdata; |
407 |
q = p = sw->Index->worddata_buffer; |
408 |
|
409 |
/* Now read tfrequency */ |
410 |
tfreq1 = uncompress2(&p1); /* tfrequency - number of files with this word */ |
411 |
tfreq2 = uncompress2(&p2); /* tfrequency - number of files with this word */ |
412 |
/* Write tfrequency */ |
413 |
p = compress3(tfreq1 + tfreq2, p); |
414 |
|
415 |
/* Now look for MetaIDs */ |
416 |
curmetaID_1 = uncompress2(&p1); |
417 |
curmetaID_2 = uncompress2(&p2); |
418 |
nextposmetaname_1 = UNPACKLONG2(p1); |
419 |
p1 += sizeof(long); |
420 |
curmetanamepos_1 = p1 - olddata; |
421 |
nextposmetaname_2 = UNPACKLONG2(p2); |
422 |
p2 += sizeof(long); |
423 |
curmetanamepos_2 = p2 - newdata; |
424 |
|
425 |
|
426 |
|
427 |
while(curmetaID_1 && curmetaID_2) |
428 |
{ |
429 |
p = compress3(min(curmetaID_1,curmetaID_2),p); |
430 |
|
431 |
curmetanamepos = p - sw->Index->worddata_buffer; |
432 |
/* Store 0 and increase pointer */ |
433 |
tmp=0L; |
434 |
|
435 |
PACKLONG2(tmp,p); |
436 |
p+=sizeof(unsigned long); |
437 |
if(curmetaID_1 == curmetaID_2) |
438 |
{ |
439 |
/* Both buffers have the same metaID - In this case I have to know |
440 |
the number of the filenum of the last hit of the original buffer to adjust the |
441 |
filenum counter in the second buffer */ |
442 |
last_filenum = 0; |
443 |
do |
444 |
{ |
445 |
/* Read on all items */ |
446 |
uncompress_location_values(&p1,&r_flag,&tmpval,&frequency); |
447 |
last_filenum += tmpval; |
448 |
if(frequency > POSDATA_STACK) |
449 |
posdata = (int *) emalloc(frequency * sizeof(int)); |
450 |
else |
451 |
posdata = stack_posdata; |
452 |
|
453 |
/* Read and discard positions just to advance pointer */ |
454 |
uncompress_location_positions(&p1,r_flag,frequency,posdata); |
455 |
if(posdata!=stack_posdata) |
456 |
efree(posdata); |
457 |
|
458 |
if ((p1 - olddata) == sz_olddata) |
459 |
{ |
460 |
curmetaID_1 = 0; /* No more metaIDs for olddata */ |
461 |
break; /* End of olddata */ |
462 |
} |
463 |
|
464 |
if ((unsigned long)(p1 - olddata) == nextposmetaname_1) |
465 |
{ |
466 |
break; |
467 |
} |
468 |
} while(1); |
469 |
memcpy(p,olddata + curmetanamepos_1, p1 - (olddata + curmetanamepos_1)); |
470 |
p += p1 - (olddata + curmetanamepos_1); |
471 |
/* Values for next metaID if exists */ |
472 |
if(curmetaID_1) |
473 |
{ |
474 |
curmetaID_1 = uncompress2(&p1); /* Next metaID */ |
475 |
nextposmetaname_1 = UNPACKLONG2(p1); |
476 |
p1 += sizeof(long); |
477 |
curmetanamepos_1 = p1 - olddata; |
478 |
} |
479 |
|
480 |
/* Now add the new values adjusting with last_filenum just the first |
481 |
** filenum in olddata*/ |
482 |
/* Read first item */ |
483 |
uncompress_location_values(&p2,&r_flag,&tmpval,&frequency); |
484 |
filenum = tmpval; /* First filenum in chunk */ |
485 |
if(frequency > POSDATA_STACK) |
486 |
posdata = (int *) emalloc(frequency * sizeof(int)); |
487 |
else |
488 |
posdata = stack_posdata; |
489 |
|
490 |
/* Read positions */ |
491 |
uncompress_location_positions(&p2,r_flag,frequency,posdata); |
492 |
|
493 |
compress_location_values(&p,&w_flag,filenum - last_filenum,frequency,posdata); |
494 |
compress_location_positions(&p,w_flag,frequency,posdata); |
495 |
|
496 |
if(posdata!=stack_posdata) |
497 |
efree(posdata); |
498 |
|
499 |
/* Copy rest of data */ |
500 |
memcpy(p,p2,nextposmetaname_2 - (p2 - newdata)); |
501 |
p += nextposmetaname_2 - (p2 - newdata); |
502 |
p2 += nextposmetaname_2 - (p2 - newdata); |
503 |
|
504 |
if ((p2 - newdata) == sz_newdata) |
505 |
{ |
506 |
curmetaID_2 = 0; /* No more metaIDs for newdata */ |
507 |
} |
508 |
/* Values for next metaID if exists */ |
509 |
if(curmetaID_2) |
510 |
{ |
511 |
curmetaID_2 = uncompress2(&p2); /* Next metaID */ |
512 |
nextposmetaname_2 = UNPACKLONG2(p2); |
513 |
p2 += sizeof(long); |
514 |
curmetanamepos_2 = p2 - newdata; |
515 |
} |
516 |
} |
517 |
else if (curmetaID_1 < curmetaID_2) |
518 |
{ |
519 |
memcpy(p,p1,nextposmetaname_1 - (p1 - olddata)); |
520 |
p += nextposmetaname_1 - (p1 - olddata); |
521 |
p1 = olddata + nextposmetaname_1; |
522 |
if ((p1 - olddata) == sz_olddata) |
523 |
{ |
524 |
curmetaID_1 = 0; /* No more metaIDs for newdata */ |
525 |
} |
526 |
else |
527 |
{ |
528 |
curmetaID_1 = uncompress2(&p1); /* Next metaID */ |
529 |
nextposmetaname_1 = UNPACKLONG2(p1); |
530 |
p1 += sizeof(long); |
531 |
curmetanamepos_1 = p1 - olddata; |
532 |
} |
533 |
} |
534 |
else /* curmetaID_1 > curmetaID_2 */ |
535 |
{ |
536 |
memcpy(p,p2,nextposmetaname_2 - (p2 - newdata)); |
537 |
p += nextposmetaname_2 - (p2 - newdata); |
538 |
p2 = newdata + nextposmetaname_2; |
539 |
if ((p2 - newdata) == sz_newdata) |
540 |
{ |
541 |
curmetaID_2 = 0; /* No more metaIDs for newdata */ |
542 |
} |
543 |
else |
544 |
{ |
545 |
curmetaID_2 = uncompress2(&p2); /* Next metaID */ |
546 |
nextposmetaname_2 = UNPACKLONG2(p2); |
547 |
p2 += sizeof(long); |
548 |
curmetanamepos_2 = p2 - newdata; |
549 |
} |
550 |
} |
551 |
/* Put nextmetaname offset */ |
552 |
PACKLONG2(p - sw->Index->worddata_buffer, sw->Index->worddata_buffer + curmetanamepos); |
553 |
|
554 |
} /* while */ |
555 |
|
556 |
/* Add the rest of the data if exists */ |
557 |
while(curmetaID_1) |
558 |
{ |
559 |
p = compress3(curmetaID_1,p); |
560 |
|
561 |
curmetanamepos = p - sw->Index->worddata_buffer; |
562 |
/* Store 0 and increase pointer */ |
563 |
tmp=0L; |
564 |
PACKLONG2(tmp,p); |
565 |
p += sizeof(unsigned long); |
566 |
|
567 |
memcpy(p,p1,nextposmetaname_1 - (p1 - olddata)); |
568 |
p += nextposmetaname_1 - (p1 - olddata); |
569 |
p1 = olddata + nextposmetaname_1; |
570 |
if ((p1 - olddata) == sz_olddata) |
571 |
{ |
572 |
curmetaID_1 = 0; /* No more metaIDs for olddata */ |
573 |
} |
574 |
else |
575 |
{ |
576 |
curmetaID_1 = uncompress2(&p1); /* Next metaID */ |
577 |
nextposmetaname_1 = UNPACKLONG2(p1); |
578 |
p1 += sizeof(long); |
579 |
curmetanamepos_1 = p1 - olddata; |
580 |
} |
581 |
PACKLONG2(p - sw->Index->worddata_buffer, sw->Index->worddata_buffer + curmetanamepos); |
582 |
} |
583 |
|
584 |
|
585 |
while(curmetaID_2) |
586 |
{ |
587 |
p = compress3(curmetaID_2,p); |
588 |
|
589 |
curmetanamepos = p - sw->Index->worddata_buffer; |
590 |
/* Store 0 and increase pointer */ |
591 |
tmp=0L; |
592 |
PACKLONG2(tmp,p); |
593 |
p += sizeof(unsigned long); |
594 |
|
595 |
memcpy(p,p2,nextposmetaname_2 - (p2 - newdata)); |
596 |
p += nextposmetaname_2 - (p2 - newdata); |
597 |
p2 = newdata + nextposmetaname_2; |
598 |
if ((p2 - newdata) == sz_newdata) |
599 |
{ |
600 |
curmetaID_2 = 0; /* No more metaIDs for olddata */ |
601 |
} |
602 |
else |
603 |
{ |
604 |
curmetaID_2 = uncompress2(&p2); /* Next metaID */ |
605 |
nextposmetaname_2 = UNPACKLONG2(p2); |
606 |
p2+= sizeof(long); |
607 |
curmetanamepos_2= p2 - newdata; |
608 |
} |
609 |
} |
610 |
|
611 |
|
612 |
if(newdata != stack_buffer) |
613 |
efree(newdata); |
614 |
|
615 |
/* Save the new size */ |
616 |
sw->Index->sz_worddata_buffer = p - sw->Index->worddata_buffer; |
617 |
} |
618 |
|
619 |
/* Writes the list of metaNames into the DB index |
620 |
*/ |
621 |
|
622 |
void write_MetaNames(SWISH *sw, int id, INDEXDATAHEADER * header, void *DB) |
623 |
{ |
624 |
struct metaEntry *entry = NULL; |
625 |
int i, |
626 |
sz_buffer, |
627 |
len; |
628 |
unsigned char *buffer,*s; |
629 |
int fields; |
630 |
|
631 |
/* Use new metaType schema - see metanames.h */ |
632 |
// Format of metaname is |
633 |
// <len><metaName><metaType><Alias><rank_bias> |
634 |
// len, metaType, alias, and rank_bias are compressed numbers |
635 |
// metaName is the ascii name of the metaname |
636 |
// |
637 |
// The list of metanames is delimited by a 0 |
638 |
|
639 |
fields = 5; // len, metaID, metaType, alias, rank_bias |
640 |
|
641 |
|
642 |
/* Compute buffer size */ |
643 |
for (sz_buffer = 0 , i = 0; i < header->metaCounter; i++) |
644 |
{ |
645 |
entry = header->metaEntryArray[i]; |
646 |
len = strlen(entry->metaName); |
647 |
sz_buffer += len + fields * MAXINTCOMPSIZE; /* compress can use MAXINTCOMPSIZE bytes in worse case, */ |
648 |
} |
649 |
|
650 |
sz_buffer += MAXINTCOMPSIZE; /* Add extra MAXINTCOMPSIZE for the number of metanames */ |
651 |
|
652 |
s = buffer = (unsigned char *) emalloc(sz_buffer); |
653 |
|
654 |
s = compress3(header->metaCounter,s); /* store the number of metanames */ |
655 |
|
656 |
for (i = 0; i < header->metaCounter; i++) |
657 |
{ |
658 |
entry = header->metaEntryArray[i]; |
659 |
len = strlen(entry->metaName); |
660 |
s = compress3(len, s); |
661 |
memcpy(s,entry->metaName,len); |
662 |
s += len; |
663 |
s = compress3(entry->metaID, s); |
664 |
s = compress3(entry->metaType, s); |
665 |
s = compress3(entry->alias+1, s); /* keep zeros away from compress3, I believe */ |
666 |
s = compress3(entry->rank_bias+RANK_BIAS_RANGE+1, s); |
667 |
} |
668 |
DB_WriteHeaderData(sw, id,buffer,s-buffer,DB); |
669 |
efree(buffer); |
670 |
} |
671 |
|
672 |
|
673 |
|
674 |
/* Write a the hashlist of words into the index header file (used by stopwords and buzzwords |
675 |
*/ |
676 |
|
677 |
int write_words_to_header(SWISH *sw, int header_ID, struct swline **hash, void *DB) |
678 |
{ |
679 |
int hashval, |
680 |
len, |
681 |
num_words, |
682 |
sz_buffer; |
683 |
char *buffer, *s; |
684 |
struct swline *sp = NULL; |
685 |
|
686 |
/* Let's count the words */ |
687 |
|
688 |
for (sz_buffer = 0, num_words = 0 , hashval = 0; hashval < HASHSIZE; hashval++) |
689 |
{ |
690 |
sp = hash[hashval]; |
691 |
while (sp != NULL) |
692 |
{ |
693 |
num_words++; |
694 |
sz_buffer += MAXINTCOMPSIZE + strlen(sp->line); |
695 |
sp = sp->next; |
696 |
} |
697 |
} |
698 |
|
699 |
if(num_words) |
700 |
{ |
701 |
sz_buffer += MAXINTCOMPSIZE; /* Add MAXINTCOMPSIZE for the number of words */ |
702 |
|
703 |
s = buffer = (char *)emalloc(sz_buffer); |
704 |
|
705 |
s = (char *)compress3(num_words, (unsigned char *)s); |
706 |
|
707 |
for (hashval = 0; hashval < HASHSIZE; hashval++) |
708 |
{ |
709 |
sp = hash[hashval]; |
710 |
while (sp != NULL) |
711 |
{ |
712 |
len = strlen(sp->line); |
713 |
s = (char *)compress3(len,(unsigned char *)s); |
714 |
memcpy(s,sp->line,len); |
715 |
s +=len; |
716 |
sp = sp->next; |
717 |
} |
718 |
} |
719 |
DB_WriteHeaderData(sw, header_ID, (unsigned char *)buffer, s - buffer, DB); |
720 |
efree(buffer); |
721 |
} |
722 |
return 0; |
723 |
} |
724 |
|
725 |
|
726 |
|
727 |
/* Writes a table of integers into the index header.
**
** Record layout (numbers written with compress3):
**   <table_size> { <table[i] + 1> }...
**
** Every element is stored incremented by one.  Always returns 0.
*/
int write_integer_table_to_header(SWISH *sw, int id, int table[], int table_size, void *DB)
{
    char   *record;
    char   *out;
    int     n;

    /* Worst case: MAXINTCOMPSIZE bytes for the count and for each element */
    record = (char *) emalloc((table_size + 1) * MAXINTCOMPSIZE);

    /* Number of elements first */
    out = (char *) compress3(table_size, (unsigned char *) record);

    /* Then all the elements */
    for (n = 0; n < table_size; n++)
    {
        int     shifted = table[n] + 1;

        out = (char *) compress3(shifted, (unsigned char *) out);
    }

    DB_WriteHeaderData(sw, id, (unsigned char *) record, out - record, DB);

    efree(record);
    return 0;
}
748 |
|
749 |
|
750 |
|
751 |
|
752 |
|
753 |
|
754 |
|
755 |
|
756 |
/* General read DB routines - Common to all DB */ |
757 |
|
758 |
/* Reads the file offset table in the index file. |
759 |
*/ |
760 |
|
761 |
/* Reads and prints the header of an index file. |
762 |
** Also reads the information in the header (wordchars, beginchars, etc) |
763 |
*/ |
764 |
|
765 |
// $$$ to be rewritten as function = smaller code (rasc) |
766 |
|
767 |
/*
** parse_int_from_buffer(num, s)
**   Assigns to num the value obtained with UNPACKLONG2 from s.
**   Remains an expression, now fully parenthesized.
**
** parse_int2_from_buffer(num1, num2, s)
**   Assigns two consecutive values: num1 from s and num2 from
**   s + sizeof(long).  Wrapped in do { } while (0) so that both
**   assignments stay together as one statement (e.g. under an unbraced
**   if), and s is parenthesized before the offset is added.
*/
#define parse_int_from_buffer(num,s) ((num) = UNPACKLONG2((s)))
#define parse_int2_from_buffer(num1,num2,s) \
    do { \
        (num1) = UNPACKLONG2((s)); \
        (num2) = UNPACKLONG2((s) + sizeof(long)); \
    } while (0)
769 |
|
770 |
|
771 |
/*
** Reads the index header from DB into *header.
**
** Records are fetched one by one with DB_ReadHeaderData until a record
** with id 0 is returned.  Each record is dispatched on its id:
**   - character-set strings are copied, sorted, and their lookup tables
**     rebuilt
**   - plain strings are copied with SafeStrCopy
**   - integers are decoded with UNPACKLONG2
**   - tables, stopwords, metanames and buzzwords are handed to their
**     parse_* routines
** An unknown id is reported with progerr.  The buffer of every processed
** record is released with efree.
*/
void read_header(SWISH *sw, INDEXDATAHEADER *header, void *DB)
{
    unsigned char *buffer;
    unsigned long value, first, second;
    int     id;
    int     len;

    DB_InitReadHeader(sw, DB);

    for (;;)
    {
        DB_ReadHeaderData(sw, &id, &buffer, &len, DB);

        /* id 0 ends the header */
        if (!id)
            break;

        switch (id)
        {
        /* Present in the header but not stored anywhere */
        case INDEXHEADER_ID:
        case INDEXVERSION_ID:
        case MERGED_ID:
        case DOCPROPENHEADER_ID:
            break;

        /* Character sets: copy, sort, rebuild lookup table */
        case WORDCHARSHEADER_ID:
            header->wordchars = SafeStrCopy(header->wordchars, (char *) buffer, &header->lenwordchars);
            sortstring(header->wordchars);
            makelookuptable(header->wordchars, header->wordcharslookuptable);
            break;
        case BEGINCHARSHEADER_ID:
            header->beginchars = SafeStrCopy(header->beginchars, (char *) buffer, &header->lenbeginchars);
            sortstring(header->beginchars);
            makelookuptable(header->beginchars, header->begincharslookuptable);
            break;
        case ENDCHARSHEADER_ID:
            header->endchars = SafeStrCopy(header->endchars, (char *) buffer, &header->lenendchars);
            sortstring(header->endchars);
            makelookuptable(header->endchars, header->endcharslookuptable);
            break;
        case IGNOREFIRSTCHARHEADER_ID:
            header->ignorefirstchar = SafeStrCopy(header->ignorefirstchar, (char *) buffer, &header->lenignorefirstchar);
            sortstring(header->ignorefirstchar);
            makelookuptable(header->ignorefirstchar, header->ignorefirstcharlookuptable);
            break;
        case IGNORELASTCHARHEADER_ID:
            header->ignorelastchar = SafeStrCopy(header->ignorelastchar, (char *) buffer, &header->lenignorelastchar);
            sortstring(header->ignorelastchar);
            makelookuptable(header->ignorelastchar, header->ignorelastcharlookuptable);
            break;

        /* STEMMINGHEADER_ID and SOUNDEXHEADER_ID were replaced by
        ** fuzzy_mode (Aug 20, 2002) and are no longer handled */

        /* Single integers */
        case FUZZYMODEHEADER_ID:
            value = UNPACKLONG2(buffer);
            header->fuzzy_mode = value;
            break;

        case IGNORETOTALWORDCOUNTWHENRANKING_ID:
            value = UNPACKLONG2(buffer);
            header->ignoreTotalWordCountWhenRanking = value;
            break;
        case MINWORDLIMHEADER_ID:
            value = UNPACKLONG2(buffer);
            header->minwordlimit = value;
            break;
        case MAXWORDLIMHEADER_ID:
            value = UNPACKLONG2(buffer);
            header->maxwordlimit = value;
            break;

        /* Plain strings */
        case SAVEDASHEADER_ID:
            header->savedasheader = SafeStrCopy(header->savedasheader, (char *) buffer, &header->lensavedasheader);
            break;
        case NAMEHEADER_ID:
            header->indexn = SafeStrCopy(header->indexn, (char *) buffer, &header->lenindexn);
            break;
        case DESCRIPTIONHEADER_ID:
            header->indexd = SafeStrCopy(header->indexd, (char *) buffer, &header->lenindexd);
            break;
        case POINTERHEADER_ID:
            header->indexp = SafeStrCopy(header->indexp, (char *) buffer, &header->lenindexp);
            break;
        case MAINTAINEDBYHEADER_ID:
            header->indexa = SafeStrCopy(header->indexa, (char *) buffer, &header->lenindexa);
            break;
        case INDEXEDONHEADER_ID:
            header->indexedon = SafeStrCopy(header->indexedon, (char *) buffer, &header->lenindexedon);
            break;

        /* Two integers in one record: total words, then total files */
        case COUNTSHEADER_ID:
            first = UNPACKLONG2(buffer);
            second = UNPACKLONG2(buffer + sizeof(long));
            header->totalwords = first;
            header->totalfiles = second;
            break;

        /* FILEINFOCOMPRESSION_ID was removed due to patent problems
        ** and is no longer handled */

        /* Structured records */
        case TRANSLATECHARTABLE_ID:
            parse_integer_table_from_buffer(header->translatecharslookuptable, sizeof(header->translatecharslookuptable) / sizeof(int), (char *) buffer);
            break;
        case STOPWORDS_ID:
            parse_stopwords_from_buffer(header, (char *) buffer);
            break;
        case METANAMES_ID:
            parse_MetaNames_from_buffer(header, (char *) buffer);
            break;
        case BUZZWORDS_ID:
            parse_buzzwords_from_buffer(header, (char *) buffer);
            break;

#ifndef USE_BTREE
        /* The table holds header->totalfiles elements, so this relies on
        ** totalfiles having been set before this record is processed */
        case TOTALWORDSPERFILE_ID:
            if ( !header->ignoreTotalWordCountWhenRanking )
            {
                header->TotalWordsPerFile = emalloc( header->totalfiles * sizeof(int) );
                parse_integer_table_from_buffer(header->TotalWordsPerFile, header->totalfiles, (char *) buffer);
            }
            break;
#endif

        default:
            progerr("Severe index error in header");
            break;
        }

        efree(buffer);
    }

    DB_EndReadHeader(sw, DB);
}
906 |
|
907 |
/* Reads the metaNames from the index |
908 |
*/ |
909 |
|
910 |
void parse_MetaNames_from_buffer(INDEXDATAHEADER *header, char *buffer) |
911 |
{ |
912 |
int len; |
913 |
int num_metanames; |
914 |
int metaType, |
915 |
i, |
916 |
alias, |
917 |
bias, |
918 |
metaID; |
919 |
char *word; |
920 |
unsigned char *s = (unsigned char *)buffer; |
921 |
struct metaEntry *m; |
922 |
|
923 |
|
924 |
/* First clear out the default metanames */ |
925 |
freeMetaEntries( header ); |
926 |
|
927 |
num_metanames = uncompress2(&s); |
928 |
|
929 |
for (i = 0; i < num_metanames; i++) |
930 |
{ |
931 |
len = uncompress2(&s); |
932 |
word = emalloc(len +1); |
933 |
memcpy(word,s,len); s += len; |
934 |
word[len] = '\0'; |
935 |
/* Read metaID */ |
936 |
metaID = uncompress2(&s); |
937 |
/* metaType was saved as metaType+1 */ |
938 |
metaType = uncompress2(&s); |
939 |
|
940 |
alias = uncompress2(&s) - 1; |
941 |
|
942 |
bias = uncompress2(&s) - RANK_BIAS_RANGE - 1; |
943 |
|
944 |
|
945 |
/* add the meta tag */ |
946 |
if ( !(m = addNewMetaEntry(header, word, metaType, metaID))) |
947 |
progerr("failed to add new meta entry '%s:%d'", word, metaID ); |
948 |
|
949 |
m->alias = alias; |
950 |
m->rank_bias = bias; |
951 |
|
952 |
efree(word); |
953 |
} |
954 |
} |
955 |
|
956 |
/* Reads the stopwords in the index file. |
957 |
*/ |
958 |
|
959 |
void parse_stopwords_from_buffer(INDEXDATAHEADER *header, char *buffer) |
960 |
{ |
961 |
int len; |
962 |
int num_words; |
963 |
int i; |
964 |
char *word = NULL; |
965 |
|
966 |
unsigned char *s = (unsigned char *)buffer; |
967 |
|
968 |
num_words = uncompress2(&s); |
969 |
|
970 |
for (i=0; i < num_words ; i++) |
971 |
{ |
972 |
len = uncompress2(&s); |
973 |
word = emalloc(len+1); |
974 |
memcpy(word,s,len); s += len; |
975 |
word[len] = '\0'; |
976 |
addStopList(header, word); |
977 |
addstophash(header, word); |
978 |
efree(word); |
979 |
} |
980 |
} |
981 |
|
982 |
/* read the buzzwords from the index file */ |
983 |
|
984 |
void parse_buzzwords_from_buffer(INDEXDATAHEADER *header, char *buffer) |
985 |
{ |
986 |
int len; |
987 |
int num_words; |
988 |
int i; |
989 |
char *word = NULL; |
990 |
|
991 |
unsigned char *s = (unsigned char *)buffer; |
992 |
|
993 |
num_words = uncompress2(&s); |
994 |
for (i=0; i < num_words ; i++) |
995 |
{ |
996 |
len = uncompress2(&s); |
997 |
word = emalloc(len+1); |
998 |
memcpy(word,s,len); s += len; |
999 |
word[len] = '\0'; |
1000 |
addbuzzwordhash(header, word); |
1001 |
efree(word); |
1002 |
} |
1003 |
} |
1004 |
|
1005 |
|
1006 |
|
1007 |
|
1008 |
/*
** parse_integer_table_from_buffer
**
** Fills table[0 .. table_size-1] from a serialized header record.  The
** first integer in the record is read and discarded; the following
** table_size integers are each stored as (value - 1).
**
** NOTE(review): the leading integer looks like the stored element count,
** but it is not compared against table_size -- confirm the writer always
** emits at least table_size elements.
*/
void parse_integer_table_from_buffer(int table[], int table_size, char *buffer)
{
    unsigned char *cursor = (unsigned char *) buffer;
    int     idx = 0;

    /* Skip the leading integer */
    (void) uncompress2(&cursor);

    /* Get all the elements */
    while (idx < table_size)
    {
        table[idx] = uncompress2(&cursor) - 1;
        idx++;
    }
}
1020 |
|
1021 |
|
1022 |
/* 11/00 Function to read all words starting with a character */ |
1023 |
char *getfilewords(SWISH * sw, int c, IndexFILE * indexf) |
1024 |
{ |
1025 |
int i, |
1026 |
j; |
1027 |
int wordlen; |
1028 |
char *buffer, *resultword; |
1029 |
int bufferpos, |
1030 |
bufferlen; |
1031 |
unsigned char word[2]; |
1032 |
long wordID; |
1033 |
|
1034 |
|
1035 |
|
1036 |
if (!c) |
1037 |
return ""; |
1038 |
/* Check if already read */ |
1039 |
j = (int) ((unsigned char) c); |
1040 |
if (indexf->keywords[j]) |
1041 |
return (indexf->keywords[j]); |
1042 |
|
1043 |
DB_InitReadWords(sw, indexf->DB); |
1044 |
|
1045 |
word[0]=(unsigned char)c; |
1046 |
word[1]='\0'; |
1047 |
|
1048 |
DB_ReadFirstWordInvertedIndex(sw, (char *)word, &resultword, &wordID, indexf->DB); |
1049 |
i = (int) ((unsigned char) c); |
1050 |
if (!wordID) |
1051 |
{ |
1052 |
DB_EndReadWords(sw, indexf->DB); |
1053 |
sw->lasterror = WORD_NOT_FOUND; |
1054 |
return ""; |
1055 |
} |
1056 |
|
1057 |
wordlen = strlen(resultword); |
1058 |
bufferlen = wordlen + MAXSTRLEN * 10; |
1059 |
bufferpos = 0; |
1060 |
buffer = emalloc(bufferlen + 1); |
1061 |
buffer[0] = '\0'; |
1062 |
|
1063 |
|
1064 |
memcpy(buffer, resultword, wordlen); |
1065 |
efree(resultword); |
1066 |
if (c != (int)((unsigned char) buffer[bufferpos])) |
1067 |
{ |
1068 |
buffer[bufferpos] = '\0'; |
1069 |
indexf->keywords[j] = buffer; |
1070 |
return (indexf->keywords[j]); |
1071 |
} |
1072 |
|
1073 |
buffer[bufferpos + wordlen] = '\0'; |
1074 |
bufferpos += wordlen + 1; |
1075 |
|
1076 |
/* Look for occurrences */ |
1077 |
DB_ReadNextWordInvertedIndex(sw, (char *)word, &resultword, &wordID, indexf->DB); |
1078 |
while (wordID) |
1079 |
{ |
1080 |
wordlen = strlen(resultword); |
1081 |
if ((bufferpos + wordlen + 1 + 1) > bufferlen) |
1082 |
{ |
1083 |
bufferlen += MAXSTRLEN + wordlen + 1 + 1; |
1084 |
buffer = (char *) erealloc(buffer, bufferlen + 1); |
1085 |
} |
1086 |
memcpy(buffer + bufferpos, resultword, wordlen); |
1087 |
efree(resultword); |
1088 |
if (c != (int)((unsigned char)buffer[bufferpos])) |
1089 |
{ |
1090 |
buffer[bufferpos] = '\0'; |
1091 |
break; |
1092 |
} |
1093 |
|
1094 |
buffer[bufferpos + wordlen] = '\0'; |
1095 |
bufferpos += wordlen + 1; |
1096 |
DB_ReadNextWordInvertedIndex(sw, (char *)word, &resultword, &wordID, indexf->DB); |
1097 |
} |
1098 |
buffer[bufferpos] = '\0'; |
1099 |
indexf->keywords[j] = buffer; |
1100 |
return (indexf->keywords[j]); |
1101 |
} |
1102 |
|
1103 |
/*
** setTotalWordsPerFile -- record the word count of one file.
**
**   sw         search handle (only used when built with USE_BTREE)
**   indexf     index whose header / database receives the value
**   idx        slot to write
**   wordcount  value to store
**
** With USE_BTREE the value is written through DB_WriteTotalWordsPerFile.
** Otherwise it is stored in header->TotalWordsPerFile[], which is
** allocated on first use and grown in steps of 20000 entries until idx
** fits.  Entries that were never set are left uninitialized.
**
** NOTE(review): idx is assumed to be non-negative -- confirm callers.
*/
void setTotalWordsPerFile(SWISH *sw, IndexFILE *indexf, int idx,int wordcount)
{
#ifdef USE_BTREE
    DB_WriteTotalWordsPerFile(sw, idx, wordcount, indexf->DB);
#else
    INDEXDATAHEADER *header = &indexf->header;

    if ( !header->TotalWordsPerFile || idx >= header->TotalWordsPerFileMax )
    {
        /*
        ** Keep adding chunks until idx is inside the table; a single
        ** step is not enough when idx jumps ahead by more than one chunk.
        */
        do
        {
            header->TotalWordsPerFileMax += 20000;  /* random guess -- could be a config setting */
        }
        while ( idx >= header->TotalWordsPerFileMax );

        if ( !header->TotalWordsPerFile )
            header->TotalWordsPerFile = emalloc( header->TotalWordsPerFileMax * sizeof(int) );
        else
            header->TotalWordsPerFile = erealloc( header->TotalWordsPerFile, header->TotalWordsPerFileMax * sizeof(int) );
    }

    header->TotalWordsPerFile[idx] = wordcount;
#endif
}
1123 |
|
1124 |
|
1125 |
/*
** getTotalWordsPerFile -- fetch the word count recorded for one file.
**
** With USE_BTREE the value is read through DB_ReadTotalWordsPerFile;
** otherwise it is copied from header->TotalWordsPerFile[idx].  No bounds
** or NULL check is made on the in-memory table.
*/
void getTotalWordsPerFile(SWISH *sw, IndexFILE *indexf, int idx,int *wordcount)
{
#ifndef USE_BTREE
    *wordcount = indexf->header.TotalWordsPerFile[idx];
#else
    DB_ReadTotalWordsPerFile(sw, idx, wordcount, indexf->DB);
#endif
}
1134 |
|
1135 |
|
1136 |
/*------------------------------------------------------*/ |
1137 |
/*---------- General entry point of DB module ----------*/ |
1138 |
|
1139 |
/* Calls the DB_Create hook of the backend in sw->Db and returns its result unchanged. */
void *DB_Create(SWISH *sw, char *dbname)
{
    void   *handle = sw->Db->DB_Create(sw, dbname);

    return handle;
}
1143 |
|
1144 |
/* Calls the DB_Open hook of the backend in sw->Db with dbname and mode; returns its result unchanged. */
void *DB_Open(SWISH *sw, char *dbname, int mode)
{
    void   *handle = sw->Db->DB_Open(sw, dbname, mode);

    return handle;
}
1148 |
|
1149 |
/* Calls the backend's DB_Close hook on DB; sw is used only to locate the backend. */
void DB_Close(SWISH *sw, void *DB)
{
    sw->Db->DB_Close(DB);
    return;
}
1153 |
|
1154 |
|
1155 |
/* Calls the backend's DB_Remove hook on DB; sw is used only to locate the backend. */
void DB_Remove(SWISH *sw, void *DB)
{
    sw->Db->DB_Remove(DB);
    return;
}
1159 |
|
1160 |
/* Calls the backend's DB_InitWriteHeader hook on DB and returns its status unchanged. */
int DB_InitWriteHeader(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_InitWriteHeader(DB);

    return rc;
}
1164 |
|
1165 |
/* Passes one header record (id plus len bytes at s) to the backend's DB_WriteHeaderData hook; returns its status. */
int DB_WriteHeaderData(SWISH *sw, int id, unsigned char *s, int len, void *DB)
{
    int     rc = sw->Db->DB_WriteHeaderData(id, s, len, DB);

    return rc;
}
1169 |
|
1170 |
/* Calls the backend's DB_EndWriteHeader hook on DB and returns its status unchanged. */
int DB_EndWriteHeader(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndWriteHeader(DB);

    return rc;
}
1174 |
|
1175 |
|
1176 |
/* Calls the backend's DB_InitReadHeader hook on DB and returns its status unchanged. */
int DB_InitReadHeader(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_InitReadHeader(DB);

    return rc;
}
1180 |
|
1181 |
/* Asks the backend's DB_ReadHeaderData hook for a header record; id, s and len are output arguments filled by the backend. */
int DB_ReadHeaderData(SWISH *sw, int *id, unsigned char **s, int *len, void *DB)
{
    int     rc = sw->Db->DB_ReadHeaderData(id, s, len, DB);

    return rc;
}
1185 |
|
1186 |
/* Calls the backend's DB_EndReadHeader hook on DB and returns its status unchanged. */
int DB_EndReadHeader(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndReadHeader(DB);

    return rc;
}
1190 |
|
1191 |
|
1192 |
/* Calls the backend's DB_InitWriteWords hook on DB and returns its status unchanged. */
int DB_InitWriteWords(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_InitWriteWords(DB);

    return rc;
}
1196 |
|
1197 |
/* Returns the value produced by the backend's DB_GetWordID hook for DB. */
long DB_GetWordID(SWISH *sw, void *DB)
{
    long    id = sw->Db->DB_GetWordID(DB);

    return id;
}
1201 |
|
1202 |
/* Passes word and wordID to the backend's DB_WriteWord hook; returns its status unchanged. */
int DB_WriteWord(SWISH *sw, char *word, long wordID, void *DB)
{
    int     rc = sw->Db->DB_WriteWord(word, wordID, DB);

    return rc;
}
1206 |
|
1207 |
#ifdef USE_BTREE |
1208 |
/* Passes word and wordID to the backend's DB_UpdateWordID hook (USE_BTREE builds only); returns its status. */
int DB_UpdateWordID(SWISH *sw, char *word, long wordID, void *DB)
{
    int     rc = sw->Db->DB_UpdateWordID(word, wordID, DB);

    return rc;
}
1212 |
|
1213 |
/* Passes wordID to the backend's DB_DeleteWordData hook (USE_BTREE builds only); returns its status. */
int DB_DeleteWordData(SWISH *sw, long wordID, void *DB)
{
    int     rc = sw->Db->DB_DeleteWordData(wordID, DB);

    return rc;
}
1217 |
|
1218 |
#endif |
1219 |
|
1220 |
/* Passes word and wordID to the backend's DB_WriteWordHash hook; returns its status unchanged. */
int DB_WriteWordHash(SWISH *sw, char *word, long wordID, void *DB)
{
    int     rc = sw->Db->DB_WriteWordHash(word, wordID, DB);

    return rc;
}
1224 |
|
1225 |
/* Passes lendata bytes at worddata for wordID to the backend's DB_WriteWordData hook; returns its result unchanged. */
long DB_WriteWordData(SWISH *sw, long wordID, unsigned char *worddata, int lendata, void *DB)
{
    long    result = sw->Db->DB_WriteWordData(wordID, worddata, lendata, DB);

    return result;
}
1229 |
|
1230 |
/* Calls the backend's DB_EndWriteWords hook on DB and returns its status unchanged. */
int DB_EndWriteWords(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndWriteWords(DB);

    return rc;
}
1234 |
|
1235 |
|
1236 |
/* Calls the backend's DB_InitReadWords hook on DB and returns its status unchanged. */
int DB_InitReadWords(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_InitReadWords(DB);

    return rc;
}
1240 |
|
1241 |
/* Passes word to the backend's DB_ReadWordHash hook; wordID is an output argument filled by the backend. */
int DB_ReadWordHash(SWISH *sw, char *word, long *wordID, void *DB)
{
    int     rc = sw->Db->DB_ReadWordHash(word, wordID, DB);

    return rc;
}
1245 |
|
1246 |
/* Passes word to the backend's DB_ReadFirstWordInvertedIndex hook; resultword and wordID are output arguments. */
int DB_ReadFirstWordInvertedIndex(SWISH *sw, char *word, char **resultword, long *wordID, void *DB)
{
    int     rc = sw->Db->DB_ReadFirstWordInvertedIndex(word, resultword, wordID, DB);

    return rc;
}
1250 |
|
1251 |
/* Passes word to the backend's DB_ReadNextWordInvertedIndex hook; resultword and wordID are output arguments. */
int DB_ReadNextWordInvertedIndex(SWISH *sw, char *word, char **resultword, long *wordID, void *DB)
{
    int     rc = sw->Db->DB_ReadNextWordInvertedIndex(word, resultword, wordID, DB);

    return rc;
}
1255 |
|
1256 |
/* Passes wordID to the backend's DB_ReadWordData hook; worddata and lendata are output arguments. */
long DB_ReadWordData(SWISH *sw, long wordID, unsigned char **worddata, int *lendata, void *DB)
{
    long    result = sw->Db->DB_ReadWordData(wordID, worddata, lendata, DB);

    return result;
}
1260 |
|
1261 |
/* Calls the backend's DB_EndReadWords hook on DB and returns its status unchanged. */
int DB_EndReadWords(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndReadWords(DB);

    return rc;
}
1265 |
|
1266 |
|
1267 |
|
1268 |
/* Calls the backend's DB_InitWriteFiles hook on DB and returns its status unchanged. */
int DB_InitWriteFiles(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_InitWriteFiles(DB);

    return rc;
}
1272 |
|
1273 |
/* Passes sz_filedata bytes at filedata for file filenum to the backend's DB_WriteFile hook; returns its status. */
int DB_WriteFile(SWISH *sw, int filenum, unsigned char *filedata,int sz_filedata, void *DB)
{
    int     rc = sw->Db->DB_WriteFile(filenum, filedata, sz_filedata, DB);

    return rc;
}
1277 |
|
1278 |
/* Calls the backend's DB_EndWriteFiles hook on DB and returns its status unchanged. */
int DB_EndWriteFiles(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndWriteFiles(DB);

    return rc;
}
1282 |
|
1283 |
|
1284 |
/* Calls the backend's DB_InitReadFiles hook on DB and returns its status unchanged. */
int DB_InitReadFiles(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_InitReadFiles(DB);

    return rc;
}
1288 |
|
1289 |
/* Passes filenum to the backend's DB_ReadFile hook; filedata and sz_filedata are output arguments. */
int DB_ReadFile(SWISH *sw, int filenum, unsigned char **filedata,int *sz_filedata, void *DB)
{
    int     rc = sw->Db->DB_ReadFile(filenum, filedata, sz_filedata, DB);

    return rc;
}
1293 |
|
1294 |
/* Calls the backend's DB_EndReadFiles hook on DB and returns its status unchanged. */
int DB_EndReadFiles(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndReadFiles(DB);

    return rc;
}
1298 |
|
1299 |
#ifdef USE_BTREE |
1300 |
int DB_InitWriteSortedIndex(SWISH *sw, void *DB, int n_props) |
1301 |
{ |
1302 |
return sw->Db->DB_InitWriteSortedIndex(DB, n_props); |
1303 |
} |
1304 |
#else |
1305 |
int DB_InitWriteSortedIndex(SWISH *sw, void *DB) |
1306 |
{ |
1307 |
return sw->Db->DB_InitWriteSortedIndex(DB); |
1308 |
} |
1309 |
#endif |
1310 |
/* Passes sz_data bytes at data for property propID to the backend's DB_WriteSortedIndex hook; returns its status. */
int DB_WriteSortedIndex(SWISH *sw, int propID, unsigned char *data, int sz_data,void *DB)
{
    int     rc = sw->Db->DB_WriteSortedIndex(propID, data, sz_data, DB);

    return rc;
}
1314 |
|
1315 |
/* Calls the backend's DB_EndWriteSortedIndex hook on DB and returns its status unchanged. */
int DB_EndWriteSortedIndex(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndWriteSortedIndex(DB);

    return rc;
}
1319 |
|
1320 |
|
1321 |
/* Calls the backend's DB_InitReadSortedIndex hook on DB and returns its status unchanged. */
int DB_InitReadSortedIndex(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_InitReadSortedIndex(DB);

    return rc;
}
1325 |
|
1326 |
/* Passes propID to the backend's DB_ReadSortedIndex hook; data and sz_data are output arguments. */
int DB_ReadSortedIndex(SWISH *sw, int propID, unsigned char **data, int *sz_data,void *DB)
{
    int     rc = sw->Db->DB_ReadSortedIndex(propID, data, sz_data, DB);

    return rc;
}
1330 |
|
1331 |
/* Passes data and index to the backend's DB_ReadSortedData hook; value is an output argument. */
int DB_ReadSortedData(SWISH *sw, int *data,int index, int *value, void *DB)
{
    int     rc = sw->Db->DB_ReadSortedData(data, index, value, DB);

    return rc;
}
1335 |
|
1336 |
/* Calls the backend's DB_EndReadSortedIndex hook on DB and returns its status unchanged. */
int DB_EndReadSortedIndex(SWISH *sw, void *DB)
{
    int     rc = sw->Db->DB_EndReadSortedIndex(DB);

    return rc;
}
1340 |
|
1341 |
|
1342 |
/* Passes one property buffer (propID, buffer, buf_len, uncompressed_len) for file record fi to the backend's DB_WriteProperty hook. */
void DB_WriteProperty( SWISH *sw, IndexFILE *indexf, FileRec *fi, int propID, char *buffer, int buf_len, int uncompressed_len, void *db)
{
    sw->Db->DB_WriteProperty(indexf, fi, propID, buffer, buf_len, uncompressed_len, db);
    return;
}
1346 |
|
1347 |
/* Calls the backend's DB_WritePropPositions hook for file record fi of indexf. */
void DB_WritePropPositions(SWISH *sw, IndexFILE *indexf, FileRec *fi, void *db)
{
    sw->Db->DB_WritePropPositions(indexf, fi, db);
    return;
}
1351 |
|
1352 |
/* Calls the backend's DB_ReadPropPositions hook for file record fi of indexf. */
void DB_ReadPropPositions(SWISH *sw, IndexFILE *indexf, FileRec *fi, void *db)
{
    sw->Db->DB_ReadPropPositions(indexf, fi, db);
    return;
}
1356 |
|
1357 |
|
1358 |
/* Asks the backend's DB_ReadProperty hook for property propID of file record fi; buf_len and uncompressed_len are output arguments. */
char *DB_ReadProperty(SWISH *sw, IndexFILE *indexf, FileRec *fi, int propID, int *buf_len, int *uncompressed_len, void *db)
{
    char   *prop = sw->Db->DB_ReadProperty(indexf, fi, propID, buf_len, uncompressed_len, db);

    return prop;
}
1362 |
|
1363 |
|
1364 |
/* Calls the backend's DB_Reopen_PropertiesForRead hook on DB; sw is used only to locate the backend. */
void DB_Reopen_PropertiesForRead(SWISH *sw, void *DB )
{
    sw->Db->DB_Reopen_PropertiesForRead(DB);
    return;
}
1368 |
|
1369 |
|
1370 |
#ifdef USE_BTREE |
1371 |
|
1372 |
/* Passes sw, idx and wordcount to the backend's DB_WriteTotalWordsPerFile hook (USE_BTREE builds only); returns its status. */
int DB_WriteTotalWordsPerFile(SWISH *sw, int idx, int wordcount, void *DB)
{
    int     rc = sw->Db->DB_WriteTotalWordsPerFile(sw, idx, wordcount, DB);

    return rc;
}
1376 |
|
1377 |
|
1378 |
/* Passes sw and index to the backend's DB_ReadTotalWordsPerFile hook (USE_BTREE builds only); value is an output argument. */
int DB_ReadTotalWordsPerFile(SWISH *sw, int index, int *value, void *DB)
{
    int     rc = sw->Db->DB_ReadTotalWordsPerFile(sw, index, value, DB);

    return rc;
}
1382 |
|
1383 |
#endif |
1384 |
|
1385 |
|