/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**
** 2001-02-12 rasc errormsg "print" changed...
**
*/
#include "swish.h"
#include "string.h"
#include "compress.h"
#include "mem.h"
#include "error.h"
#include "merge.h"
#include "docprop.h"
#include "index.h"
#include "search.h"
#include "hash.h"
#include "ramdisk.h"
#include "swish_qsort.h"
#include "file.h"
/* 2001-05 jmruiz */
/* Routines for compressing numbers - Macros converted to routines */
/* Compress a number and writes it to a file */
void compress1(int num, FILE * fp, int (*f_putc) (int, FILE *))
{
int _i = 0,
_r = num;
unsigned char _s[MAXINTCOMPSIZE];
/* Trivial case: 0 */
if(!_r)
{
f_putc(0,fp);
return;
}
/* Any other case ... */
while (_r)
{
_s[_i++] = _r & 127;
_r >>= 7;
}
while (--_i >= 0)
f_putc(_s[_i] | (_i ? 128 : 0), fp);
}
/* Compress a number and writes it to a buffer */
/* buffer must be previously allocated */
/* returns the decreased buffer pointer after storing the compressed number in it */
unsigned char *SW_compress2(int num, unsigned char *buffer)
{
int _i = num;
/* Trivial case: 0 */
if(!_i)
{
*buffer-- = 0;
return 0;
}
/* Any other case ... */
while (_i)
{
*buffer = _i & 127;
if (_i != num)
*buffer |= 128;
_i >>= 7;
buffer--;
}
return buffer;
}
/* Compress a number and writes it to a buffer */
/* buffer must be previously allocated */
/* returns the incrmented buffer pointer after storing the compressed number in it */
unsigned char *compress3(int num, unsigned char *buffer)
{
int _i = 0,
_r = num;
unsigned char _s[MAXINTCOMPSIZE];
/* Trivial case: 0 */
if(!_r)
{
*buffer++ = 0;
return buffer;
}
/* Any other case ... */
while (_r)
{
_s[_i++] = _r & 127;
_r >>= 7;
}
while (--_i >= 0)
*buffer++ = (_s[_i] | (_i ? 128 : 0));
return buffer;
}
/* Uncompress a number from a file */
int uncompress1(FILE * fp, int (*f_getc) (FILE *))
{
int _c;
int num = 0;
do
{
_c = (int) f_getc(fp);
num <<= 7;
num |= _c & 127;
if (!num)
break;
}
while (_c & 128);
return num;
}
/* same routine but this works with a memory forward buffer instead of file */
/* it also increases the buffer pointer */
int uncompress2(unsigned char **buffer)
{
int _c;
int num = 0;
unsigned char *p = *buffer;
do
{
_c = (int) ((unsigned char) *p++);
num <<= 7;
num |= _c & 127;
if (!num)
break;
}
while (_c & 128);
*buffer = p;
return num;
}
/* Rourtines to make long integers portable */
unsigned long PACKLONG(unsigned long num)
{
unsigned long _i = 0L;
unsigned char *_s;
if (num)
{
_s = (unsigned char *) &_i;
_s[0] = (unsigned char) ((num >> 24) & 0xFF);
_s[1] = (unsigned char) ((num >> 16) & 0xFF);
_s[2] = (unsigned char) ((num >> 8) & 0xFF);
_s[3] = (unsigned char) (num & 0xFF);
return _i;
}
return num;
}
/* Same routine - Packs long in buffer */
void PACKLONG2(unsigned long num, unsigned char *buffer)
{
buffer[0] = (unsigned char) ((num >> 24) & 0xFF);
buffer[1] = (unsigned char) ((num >> 16) & 0xFF);
buffer[2] = (unsigned char) ((num >> 8) & 0xFF);
buffer[3] = (unsigned char) (num & 0xFF);
}
unsigned long UNPACKLONG(unsigned long num)
{
unsigned char *_s = (unsigned char *) #
return (_s[0] << 24) + (_s[1] << 16) + (_s[2] << 8) + _s[3];
}
/* Same macro - UnPacks long from buffer */
unsigned long UNPACKLONG2(unsigned char *buffer)
{
return (buffer[0] << 24) + (buffer[1] << 16) + (buffer[2] << 8) + buffer[3];
}
/***********************************************************************************
* 09/00 Jose Ruiz
* Function to compress location data in memory
*
* Compresses a LOCATION entry
*
* A single position LOCATION goes from 20 to 3 bytes.
* three positions goes from 28 to 5.
*
************************************************************************************/
#define IS_FLAG 0x80 /* Binary 10000000 */
#define COMMON_STRUCTURE 0x60 /* Binary 01100000 */
#define COMMON_IN_FILE 0x20 /* Binary 00100000 */
#define COMMON_IN_HTML_BODY 0x40 /* Binary 01000000 */
#define POS_4_BIT 0x10 /* Binary 00010000 */
/************************************************************************
From Jose on Feb 13, 2002
IS_FLAG is to indicate that the byte is a flag. As far as I remember, I
needed it to avoid null values.
When COMMON_STRUCTURE is on, this means that all the positions
have the same structure value. This helps a lot with non html
files and can save a lot of space.
When FREQ_AND_POS_EQ_1 is on, this means that freq is 1 and
pos[0]=1. Mmm, I am not sure if this is very useful now. Let me
explain better. This was useful for xml files with fields that contains
just one value. For example:
00001
20001231
But, now, I am not sure if this is useful because long time ago I
changed the position counter to not be reseted after a each field
change.
I need to check this.
POS_4_BIT indicates that all positions are within 16 positions
of each other and can thus be stored as 2 per byte. Position numbers are
stored as a delta from the previous position.
Here's indexing /usr/doc:
23840 files indexed. 177638538 total bytes. 19739102 total words.
Elapsed time: 00:04:42 CPU time: 00:03:09
Indexing done!
4 bit = 843,081 (total length = 10,630,425) 12 bytes/chunk
not 4 bit = 13,052,904 (length 83,811,498) 6 bytes/chunk
I wonder if storing the initial postion would improve that much.
*************************************************************************/
void compress_location_values(unsigned char **buf,unsigned char **flagp,int filenum,int frequency, int *posdata)
{
unsigned char *p = *buf;
unsigned char *flag;
int structure = GET_STRUCTURE(posdata[0]);
int common_structure = COMMON_STRUCTURE;
int i;
/* Make room for flag and init it */
flag = p;
*flagp = p;
p++;
*flag = IS_FLAG;
/* Add file number */
p = compress3(filenum, p);
/* Check for special case frequency == 1 and position[0] < 128 && structure == IN_FILE */
if(frequency == 1 && (GET_POSITION(posdata[0]) < 128) && structure == IN_FILE)
{
/* Remove IS_FLAG and store position in the lower 7 bits */
/* In this way we have 0bbbbbbb in *flag
** where bbbbbbb is the position and the leading 0 bit
** indicates that frequency is 1 and position is < 128 */
*flag = (unsigned char) ((int)(GET_POSITION(posdata[0])));
}
else
{
/* Otherwise IS_FLAG is set */
/* Now, let's see if all positions have the same structure to
** get better compression */
for(i=1;i 0 ; i--)
{
posdata[i] = SET_POSDATA(GET_POSITION(posdata[i]) - GET_POSITION(posdata[i-1]),GET_STRUCTURE(posdata[i]));
if( GET_POSITION(posdata[i]) >= 16)
(*flag) &= ~POS_4_BIT;
}
/* Always write first position "as is" */
p = compress3(GET_POSITION(posdata[0]), p);
/* write the position data starting at 1 */
if((*flag) & POS_4_BIT)
{
for (i = 1, j = 0; i < frequency ; i++, j++)
{
if(j % 2)
p[j/2] |= (unsigned char) GET_POSITION(posdata[i]);
else
p[j/2] = (unsigned char) GET_POSITION(posdata[i]) << 4;
}
p += ((j + 1)/2);
}
else
{
for (i = 1; i < frequency; i++)
p = compress3(GET_POSITION(posdata[i]), p);
}
/* Write out the structure bytes */
if(! (*flag & COMMON_STRUCTURE))
for(i = 0; i < frequency; i++)
*p++ = (unsigned char) GET_STRUCTURE(posdata[i]);
*buf = p;
}
}
unsigned char *compress_location(SWISH * sw, IndexFILE * indexf, LOCATION * l)
{
unsigned char *p,
*q;
int i,
max_size;
unsigned char *flag;
struct MOD_Index *idx = sw->Index;
/* check if the work buffer is long enough */
/* just to avoid bufferoverruns */
/* In the worst case and integer will need MAXINTCOMPSIZE bytes */
/* but fortunatelly this is very uncommon */
/* 2002/01 JMRUIZ
** Added an extra byte (MAXINTCOMPSIZE+1) for each position's structure
*/
max_size = sizeof(unsigned char) + sizeof(LOCATION *) + (((sizeof(LOCATION) / sizeof(int) + 1) + (l->frequency - 1)) * (MAXINTCOMPSIZE + sizeof(unsigned char)));
/* reallocate if needed */
if (max_size > idx->len_compression_buffer)
{
idx->len_compression_buffer = max_size + 200;
idx->compression_buffer = erealloc(idx->compression_buffer, idx->len_compression_buffer);
}
/* Pointer to the buffer */
p = idx->compression_buffer;
/* Add extra bytes for handling linked list */
//***JMRUIZ
memcpy(p,&l->next,sizeof(LOCATION *));
p += sizeof(LOCATION *);
/* Add the metaID */
p = compress3(l->metaID,p);
compress_location_values(&p,&flag,l->filenum,l->frequency, l->posdata);
compress_location_positions(&p,flag,l->frequency,l->posdata);
/* Get the length of all the data */
i = p - idx->compression_buffer;
/* Did we underrun our buffer? */
if (i > idx->len_compression_buffer)
progerr("Internal error in compress_location routine");
q = (unsigned char *) Mem_ZoneAlloc(idx->currentChunkLocZone, i);
memcpy(q, idx->compression_buffer, i);
return (unsigned char *) q;
}
void uncompress_location_values(unsigned char **buf,unsigned char *flag, int *filenum,int *frequency)
{
unsigned char *p = *buf;
*frequency = 0;
*flag = *p++;
if(!((*flag) & IS_FLAG))
{
*frequency = 1;
}
else
(*frequency) |= (*flag) & 15; /* Binary 00001111 */
*filenum = uncompress2(&p);
if(! (*frequency))
*frequency = uncompress2(&p);
*buf = p;
}
unsigned long four_bit_count = 0;
unsigned long four_bit_bytes = 0;
unsigned long not_four = 0;
unsigned long not_four_bytes = 0;
unsigned long four_bit_called = 0;
unsigned long not_four_called;
void uncompress_location_positions(unsigned char **buf, unsigned char flag, int frequency, int *posdata)
{
int i, j, tmp;
unsigned char *p = *buf;
int common_structure = 0;
int structure = 0;
/* Check for special case frequency == 1 and position[0] < 128 and structure == IN_FILE */
if (!(flag & IS_FLAG))
{
structure = IN_FILE;
posdata[0] = SET_POSDATA((int)(flag),structure);
}
else
{
/* Check for common structure */
if ((tmp =(flag & COMMON_STRUCTURE)))
{
common_structure = COMMON_STRUCTURE;
switch(tmp)
{
case COMMON_IN_FILE:
structure = IN_FILE;
break;
case COMMON_IN_HTML_BODY:
structure = IN_FILE | IN_BODY;
break;
default:
structure = (int)((unsigned char) *p++);
break;
}
}
/* First position is always "as is" */
posdata[0] = uncompress2(&p);
/* Check if positions where stored as two values per byte or the old "compress" style */
if(flag & POS_4_BIT)
{
for (i = 1, j = 0; i < frequency; i++, j++)
{
if(j%2)
posdata[i] = p[j/2] & 0x0F;
else
posdata[i] = p[j/2] >> 4;
}
p += ((j + 1)/2);
}
else
{
for (i = 1; i < frequency; i++)
{
tmp = uncompress2(&p);
posdata[i] = tmp;
}
}
/* Position were compressed incrementally. So restore them */
for(i = 1; i < frequency; i++)
posdata[i] += posdata[i-1];
/* Get structure */
for(i = 0; i < frequency; i++)
{
if(!common_structure)
structure = (int)((unsigned char) *p++);
posdata[i] = SET_POSDATA(posdata[i],structure);
}
}
/* Update buffer pointer */
*buf = p;
}
/* 09/00 Jose Ruiz
** Extract compressed location
*/
LOCATION *uncompress_location(SWISH * sw, IndexFILE * indexf, unsigned char *p)
{
LOCATION *loc;
int metaID,
filenum,
frequency;
unsigned char flag;
metaID = uncompress2(&p);
uncompress_location_values(&p,&flag,&filenum,&frequency);
loc = (LOCATION *) emalloc(sizeof(LOCATION) + (frequency - 1) * sizeof(int));
loc->metaID = metaID;
loc->filenum = filenum;
loc->frequency = frequency;
uncompress_location_positions(&p,flag,frequency,loc->posdata);
return loc;
}
/* 09/00 Jose Ruiz
** Compress all non yet compressed location data of an entry
*/
void CompressCurrentLocEntry(SWISH * sw, IndexFILE * indexf, ENTRY * e)
{
LOCATION *l, *prev, *next, *comp;
for(l = e->currentChunkLocationList,prev = NULL ; l != e->currentlocation; )
{
next = l->next;
comp = (LOCATION *) compress_location(sw, indexf, l);
if(l == e->currentChunkLocationList)
e->currentChunkLocationList =comp;
if(prev)
memcpy(prev, &comp, sizeof(LOCATION *)); /* Use memcpy to avoid alignment problems */
prev = comp;
l = next;
}
e->currentlocation = e->currentChunkLocationList;
}
/* 09/00 Jose Ruiz
** Function to swap location data to a temporal file or ramdisk to save
** memory. Unfortunately we cannot use it with IgnoreLimit option
** enabled.
** The data has been compressed previously in memory.
** Returns the pointer to the file.
*/
void SwapLocData(SWISH * sw, ENTRY *e, unsigned char *buf, int lenbuf)
{
int idx_swap_file;
struct MOD_Index *idx = sw->Index;
/* 2002-07 jmruiz - Get de corrsponding swap file */
/* Get the corresponding hash value to this word */
/* IMPORTANT!!!! - The routine being used here to compute the hash */
/* must be the same used to store the words */
/* Then we must get the corresponding swap file index */
/* Since we cannot have so many swap files (VERYBIGHASHSIZE for verybighash */
/* routine) we must scale the hash into SWAP_FILES */
idx_swap_file = (verybighash(e->word) * (MAX_LOC_SWAP_FILES -1))/(VERYBIGHASHSIZE -1);
if (!idx->fp_loc_write[idx_swap_file])
{
idx->fp_loc_write[idx_swap_file] = create_tempfile(sw, F_WRITE_BINARY, "loc", &idx->swap_location_name[idx_swap_file], 0 );
}
compress1(lenbuf, idx->fp_loc_write[idx_swap_file], idx->swap_putc);
if (idx->swap_write(buf, 1, lenbuf, idx->fp_loc_write[idx_swap_file]) != (unsigned int) lenbuf)
{
progerr("Cannot write location to swap file");
}
}
/* 2002-07 jmruiz - New -e schema */
/* Get location data from swap file */
/* If e is null, all data will be restored */
/* If e si not null, only the location for this data will be readed */
void unSwapLocData(SWISH * sw, int idx_swap_file, ENTRY *ep)
{
unsigned char *buf;
int lenbuf;
struct MOD_Index *idx = sw->Index;
ENTRY *e;
LOCATION *l;
FILE *fp;
/* Check if some swap file is being used */
if(!idx->fp_loc_write[idx_swap_file] && !idx->fp_loc_read[idx_swap_file])
return;
/* Check if the file is opened for write and close it */
if(idx->fp_loc_write[idx_swap_file])
{
/* Write a 0 to mark the end of locations */
idx->swap_putc(0,idx->fp_loc_write[idx_swap_file]);
idx->swap_close(idx->fp_loc_write[idx_swap_file]);
idx->fp_loc_write[idx_swap_file] = NULL;
}
/* Reopen in read mode for (for faster reads, I suppose) */
if(!idx->fp_loc_read[idx_swap_file])
{
if (!(idx->fp_loc_read[idx_swap_file] = fopen(idx->swap_location_name[idx_swap_file], F_READ_BINARY)))
progerrno("Could not open temp file %s: ", idx->swap_location_name[idx_swap_file]);
}
else
{
/* File already opened for read -> reset pointer */
fseek(idx->fp_loc_read[idx_swap_file],0,SEEK_SET);
}
fp = idx->fp_loc_read[idx_swap_file];
while((lenbuf = uncompress1(fp, idx->swap_getc)))
{
if(ep == NULL)
{
buf = (unsigned char *) Mem_ZoneAlloc(idx->totalLocZone,lenbuf);
idx->swap_read(buf, lenbuf, 1, fp);
e = *(ENTRY **)buf;
/* Store the locations in reverse order - Faster. They will be
** sorted later */
l = (LOCATION *) buf;
l->next = e->allLocationList;
e->allLocationList = l;
}
else
{
idx->swap_read(&e,sizeof(ENTRY *),1,fp);
if(ep == e)
{
buf = (unsigned char *) Mem_ZoneAlloc(idx->totalLocZone,lenbuf);
memcpy(buf,&e,sizeof(ENTRY *));
idx->swap_read(buf + sizeof(ENTRY *),lenbuf - sizeof(ENTRY *),1,fp);
/* Store the locations in reverse order - Faster. They will be
** sorted later */
l = (LOCATION *) buf;
l->next = e->allLocationList;
e->allLocationList = l;
}
else
{
/* Just advance file pointer */
idx->swap_seek(fp,lenbuf - sizeof(ENTRY *),SEEK_CUR);
}
}
}
}
/* 2002-07 jmruiz - Sorts unswaped location data by metaname, filenum */
void sortSwapLocData(SWISH * sw, ENTRY *e)
{
int i, j, k, metaID;
int *pi = NULL;
unsigned char *ptmp,
*ptmp2,
*compressed_data;
LOCATION **tmploc;
LOCATION *l, *prev=NULL, **lp;
/* Count the locations */
for(i = 0, l = e->allLocationList; l;i++, l = l->next);
/* Very trivial case */
if(i < 2)
return;
/* */
/* Now, let's sort by metanum, offset in file */
/* Compute array wide for sort */
j = 2 * sizeof(int) + sizeof(void *);
/* Compute array size */
ptmp = (void *) emalloc(j * i);
/* Build an array with the elements to compare
and pointers to data */
/* Very important to remind - data was read from the loc
** swap file in reverse order, so, to get data sorted
** by filenum we just need to use a reverse counter (i - k)
** as the other value for sorting (it has the same effect
** as filenum)
*/
for(k=0, ptmp2 = ptmp, l = e->allLocationList ; k < i; k++, l = l->next)
{
pi = (int *) ptmp2;
compressed_data = (unsigned char *)l;
/* Jump fileoffset */
compressed_data += sizeof(LOCATION *);
metaID = uncompress2(&compressed_data);
pi[0] = metaID;
pi[1] = i-k;
ptmp2 += 2 * sizeof(int);
lp = (LOCATION **)ptmp2;
*lp = l;
ptmp2 += sizeof(void *);
}
/* Sort them */
swish_qsort(ptmp, i, j, &icomp2);
/* Store results */
for (k = 0, ptmp2 = ptmp; k < i; k++)
{
ptmp2 += 2 * sizeof(int);
l = *(LOCATION **)ptmp2;
if(!k)
e->allLocationList = l;
else
{
tmploc = (LOCATION **)prev;
*tmploc = l;
}
ptmp2 += sizeof(void *);
prev = l;
}
tmploc = (LOCATION **)l;
*tmploc = NULL;
/* Free the memory of the sorting array */
efree(ptmp);
}