/* swish-e/src/hash.c */

/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**---------------------------------------------------------
** Added addStopList to support printing of common words
** G. Hill 4/7/97  ghill@library.berkeley.edu
**
** change sprintf to snprintf to avoid corruption
** SRE 11/17/99
**
** 04/00 - Jose Ruiz
** use bighash instead of hash in mergeresultlists for better performance
** on big searches (a* or b* or c*)
**
*/

#include "swish.h"
#include "string.h"
#include "hash.h"
#include "mem.h"
#include "search.h"

/* Hashes a NUL-terminated string into a bucket index.
**
** Starting from 0, each byte of s (read as unsigned char) is folded in as
**     hashval = byte + 31 * hashval
** using unsigned arithmetic, which wraps on overflow.
**
** s must not be NULL; it is dereferenced without a check.
** Returns the accumulated value modulo HASHSIZE.
*/

unsigned hash(char *s)
{
    unsigned hashval = 0;

    for (; *s != '\0'; s++)
        hashval = (unsigned char) *s + 31 * hashval;
    return hashval % HASHSIZE;
}

/* Hashes a NUL-terminated string for a larger hash table.
**
** Same byte-folding scheme as hash() (hashval = byte + 31 * hashval,
** bytes read as unsigned char, unsigned wrap-around on overflow), but the
** result is reduced modulo BIGHASHSIZE.
**
** s must not be NULL; it is dereferenced without a check.
*/

unsigned bighash(char *s)
{
    unsigned hashval = 0;

    for (; *s != '\0'; s++)
        hashval = (unsigned char) *s + 31 * hashval;
    return hashval % BIGHASHSIZE;
}

/* Hashes an int into a bucket index (value modulo HASHSIZE).
**
** i is converted to unsigned before the modulo.  With a signed operand a
** negative i yields a negative remainder, which becomes a huge,
** out-of-range index once converted to the unsigned return type.
** Results for non-negative i are the same as with a signed modulo.
*/

unsigned numhash(int i)
{
    return (unsigned) i % HASHSIZE;
}

/* Hashes an int for a larger hash table (value modulo BIGHASHSIZE).
**
** i is converted to unsigned before the modulo.  With a signed operand a
** negative i yields a negative remainder, which becomes a huge,
** out-of-range index once converted to the unsigned return type.
** Results for non-negative i are the same as with a signed modulo.
*/

unsigned bignumhash(int i)
{
    return (unsigned) i % BIGHASHSIZE;
}

/* Hashes a NUL-terminated string for a larger hash table (for search).
**
** Same byte-folding scheme as hash() (hashval = byte + 31 * hashval,
** bytes read as unsigned char, unsigned wrap-around on overflow), but the
** result is reduced modulo VERYBIGHASHSIZE.
**
** s must not be NULL; it is dereferenced without a check.
*/

unsigned verybighash(char *s)
{
    unsigned hashval = 0;

    for (; *s != '\0'; s++)
        hashval = (unsigned char) *s + 31 * hashval;
    return hashval % VERYBIGHASHSIZE;
}


/* Appends a private copy of a stop word to header->stopList, the array of
** removed common words.
**
** Nothing is appended when isstopword() already reports the word; note
** that isstopword() looks in the stop-word hash table, not in this array.
** Whenever the array is full (stopPos == stopMaxSize) its capacity is
** raised by 100 slots before the word is stored.
*/
void    addStopList(INDEXDATAHEADER *header, char *word)
{
    if (isstopword(header, word))
        return;

    /* No free slot left: grow the array, allocating it if it does not exist yet. */
    if (header->stopPos == header->stopMaxSize)
    {
        header->stopMaxSize += 100;
        header->stopList = header->stopList
            ? (char **) erealloc(header->stopList, header->stopMaxSize * sizeof(char *))
            : (char **) emalloc(header->stopMaxSize * sizeof(char *));
    }

    /* The array owns its own duplicate of the word. */
    header->stopList[header->stopPos] = (char *) estrdup(word);
    header->stopPos++;
}


/* Inserts a stop word into header->hashstoplist.
**
** Does nothing if isstopword() already finds the word.  Otherwise a new
** swline node holding a duplicate of the word is linked in at the head of
** the chain for bucket hash(word).
*/

void    addstophash(INDEXDATAHEADER *header, char *word)
{
    struct swline *entry;
    unsigned bucket;

    if (isstopword(header, word))
        return;

    entry = (struct swline *) emalloc(sizeof(struct swline));
    entry->line = (char *) estrdup(word);

    /* Push the new node onto the front of its bucket's chain. */
    bucket = hash(word);
    entry->next = header->hashstoplist[bucket];
    header->hashstoplist[bucket] = entry;
}

/* Tests whether a word is present in the stop-word hash table.
**
** Walks the chain stored in header->hashstoplist[hash(word)] and compares
** each entry with strcmp(), so the match is exact and case-sensitive.
** Returns 1 when the word is found, 0 otherwise.
*/

int     isstopword(INDEXDATAHEADER *header, char *word)
{
    struct swline *entry;

    for (entry = header->hashstoplist[hash(word)]; entry != NULL; entry = entry->next)
    {
        if (strcmp(entry->line, word) == 0)
            return 1;
    }
    return 0;
}


/* Adds a buzzword to header->hashbuzzwordlist.
**
** Buzzwords are stored normalised by strtolower() (defined elsewhere;
** presumably lower-cases the string in place).  The word is therefore
** duplicated and normalised FIRST, and the duplicate check, the hash and
** the stored entry all use that normalised copy.  Checking the word as
** passed would miss entries stored in normalised form, so the same
** buzzword could be inserted several times and buzzwords_used_flag
** over-counted.
**
** When the normalised word is new, buzzwords_used_flag is incremented and
** a node owning the copy is linked at the head of its bucket's chain.
** The caller's string is never modified.
*/

void    addbuzzwordhash(INDEXDATAHEADER *header, char *word)
{
    unsigned hashval;
    struct swline *sp;
    char   *lowered;

    /* should buzzwords be case sensitive? */
    lowered = (char *) estrdup(word);
    strtolower(lowered);

    if (isbuzzword(header, lowered))
    {
        /* already present: drop the temporary copy */
        efree(lowered);
        return;
    }

    header->buzzwords_used_flag++;

    sp = (struct swline *) emalloc(sizeof(struct swline));

    /* the node takes ownership of the normalised copy */
    sp->line = lowered;

    hashval = hash(sp->line);

    sp->next = header->hashbuzzwordlist[hashval];
    header->hashbuzzwordlist[hashval] = sp;
}

/* Releases every entry of the buzzword hash table.
**
** For each of the HASHSIZE buckets of header->hashbuzzwordlist that is not
** empty, frees each node's string and the node itself, then resets the
** bucket to NULL.
*/
void    freebuzzwordhash(INDEXDATAHEADER *header)
{
    struct swline *entry,
           *following;
    int     bucket;

    for (bucket = 0; bucket < HASHSIZE; bucket++)
    {
        entry = (struct swline *) header->hashbuzzwordlist[bucket];
        if (entry == NULL)
            continue;

        do
        {
            /* remember the successor before the node is released */
            following = entry->next;
            efree(entry->line);
            efree(entry);
            entry = following;
        }
        while (entry != NULL);

        header->hashbuzzwordlist[bucket] = NULL;
    }
}


/* Tests whether a word is present in the buzzword hash table.
**
** Walks the chain stored in header->hashbuzzwordlist[hash(word)] and
** compares each entry with strcmp(), so the match is exact.  The word is
** used as given; addbuzzwordhash() stores its entries after strtolower(),
** so callers presumably pass an already lower-cased word.
** Returns 1 when the word is found, 0 otherwise.
*/

int     isbuzzword(INDEXDATAHEADER *header, char *word)
{
    struct swline *entry;

    for (entry = header->hashbuzzwordlist[hash(word)]; entry != NULL; entry = entry->next)
    {
        if (strcmp(entry->line, word) == 0)
            return 1;
    }
    return 0;
}


/* Releases every entry of the stop-word hash table.
**
** For each of the HASHSIZE buckets of header->hashstoplist that is not
** empty, frees each node's string and the node itself, then resets the
** bucket to NULL.
*/
void    freestophash(INDEXDATAHEADER *header)
{
    struct swline *node,
           *after;
    int     slot;

    for (slot = 0; slot < HASHSIZE; slot++)
    {
        if (!header->hashstoplist[slot])
            continue;

        /* the successor is saved before each node is released */
        for (node = (struct swline *) header->hashstoplist[slot]; node; node = after)
        {
            after = node->next;
            efree(node->line);
            efree(node);
        }
        header->hashstoplist[slot] = NULL;
    }
}

/* Releases the array of removed common words.
**
** Frees the first stopPos strings held in header->stopList, then the
** array itself if it exists, and finally resets stopList to NULL and both
** stopPos and stopMaxSize to 0.
*/
void    freeStopList(INDEXDATAHEADER *header)
{
    int     idx = 0;

    while (idx < header->stopPos)
    {
        efree(header->stopList[idx]);
        idx++;
    }

    if (header->stopList != NULL)
        efree(header->stopList);

    header->stopList = NULL;
    header->stopMaxSize = 0;
    header->stopPos = 0;
}

/* Inserts a "use" word into header->hashuselist.
**
** Does nothing if isuseword() already finds the word.  Otherwise a new
** swline node holding a duplicate of the word is linked in at the head of
** the chain for bucket hash(word).
*/

void    addusehash(INDEXDATAHEADER *header, char *word)
{
    struct swline *entry;
    unsigned bucket;

    if (isuseword(header, word))
        return;

    entry = (struct swline *) emalloc(sizeof(struct swline));
    entry->line = (char *) estrdup(word);

    /* Push the new node onto the front of its bucket's chain. */
    bucket = hash(word);
    entry->next = header->hashuselist[bucket];
    header->hashuselist[bucket] = entry;
}

/* Tests whether a word is present in the "use" word hash table.
**
** Walks the chain stored in header->hashuselist[hash(word)] and compares
** each entry with strcmp(), so the match is exact and case-sensitive.
** Returns 1 when the word is found, 0 otherwise.
*/

int     isuseword(INDEXDATAHEADER *header, char *word)
{
    struct swline *entry;

    for (entry = header->hashuselist[hash(word)]; entry != NULL; entry = entry->next)
    {
        if (strcmp(entry->line, word) == 0)
            return 1;
    }
    return 0;
}
1	/*
2	** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
3	** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
4	**
5	** This program and library is free software; you can redistribute it and/or
6	** modify it under the terms of the GNU (Library) General Public License
7	** as published by the Free Software Foundation; either version 2
8	** of the License, or any later version.
9	**
10	** This program is distributed in the hope that it will be useful,
11	** but WITHOUT ANY WARRANTY; without even the implied warranty of
12	** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	** GNU (Library) General Public License for more details.
14	**
15	** You should have received a copy of the GNU (Library) General Public License
16	** along with this program; if not, write to the Free Software
17	** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18	**---------------------------------------------------------
19	** Added addStopList to support printing of common words
20	** G. Hill 4/7/97 ghill@library.berkeley.edu
21	**
22	** change sprintf to snprintf to avoid corruption
23	** SRE 11/17/99
24	**
25	** 04/00 - Jose Ruiz
26	** change hash for bighash in mergeresultlists for better performance
27	** when big searchs (a* or b* or c*)
28	**
29	*/
30
31	#include "swish.h"
32	#include "string.h"
33	#include "hash.h"
34	#include "mem.h"
35	#include "search.h"
36
37	/* Hashes a string.
38	*/
39
40	unsigned hash(s)
41	char *s;
42	{
43	unsigned hashval;
44
45	for (hashval = 0; *s != '\0'; s++)
46	hashval = (int) ((unsigned char) s) + 31 hashval;
47	return hashval % HASHSIZE;
48	}
49
50	/* Hashes a string for a larger hash table.
51	*/
52
53	unsigned bighash(s)
54	char *s;
55	{
56	unsigned hashval;
57
58	for (hashval = 0; *s != '\0'; s++)
59	hashval = (int) ((unsigned char) s) + 31 hashval;
60	return hashval % BIGHASHSIZE;
61	}
62
63	/* Hashes a int.
64	*/
65
66	unsigned numhash(i)
67	int i;
68	{
69	return i % HASHSIZE;
70	}
71
72	/* Hashes a int for a larger hash table.
73	*/
74
75	unsigned bignumhash(i)
76	int i;
77	{
78	return i % BIGHASHSIZE;
79	}
80
81	/* Hashes a string for a larger hash table (for search).
82	*/
83
84	unsigned verybighash(s)
85	char *s;
86	{
87	unsigned hashval;
88
89	for (hashval = 0; *s != '\0'; s++)
90	hashval = (int) ((unsigned char) s) + 31 hashval;
91	return hashval % VERYBIGHASHSIZE;
92	}
93
94
95	/* Adds a stop word to the list of removed common words */
96	void addStopList(INDEXDATAHEADER header, char word)
97	{
98	char *arrayWord;
99
100	if (isstopword(header, word))
101	return;
102
103	/* Another BUG!! Jose Ruiz 04/00
104	The dimension of the array was not checked
105	Fixed */
106	if (header->stopPos == header->stopMaxSize)
107	{
108	header->stopMaxSize += 100;
109	if (!header->stopList)
110	header->stopList = (char *) emalloc(header->stopMaxSize sizeof(char *));
111
112	else
113	header->stopList = (char *) erealloc(header->stopList, header->stopMaxSize sizeof(char *));
114	}
115	arrayWord = (char *) estrdup(word);
116	header->stopList[header->stopPos++] = arrayWord;
117	}
118
119
120	/* Adds a stop word to a hash table.
121	*/
122
123	void addstophash(INDEXDATAHEADER header, char word)
124	{
125	unsigned hashval;
126	struct swline *sp;
127
128	if (isstopword(header, word))
129	return;
130
131	sp = (struct swline *) emalloc(sizeof(struct swline));
132
133	sp->line = (char *) estrdup(word);
134
135	hashval = hash(word);
136	sp->next = header->hashstoplist[hashval];
137	header->hashstoplist[hashval] = sp;
138	}
139
140	/* Sees if a word is a stop word by looking it up in the hash table.
141	*/
142
143	int isstopword(INDEXDATAHEADER header, char word)
144	{
145	unsigned hashval;
146	struct swline *sp;
147
148	hashval = hash(word);
149	sp = header->hashstoplist[hashval];
150
151	while (sp != NULL)
152	{
153	if (!strcmp(sp->line, word))
154	return 1;
155	sp = sp->next;
156	}
157	return 0;
158	}
159
160
161
162	/* Adds a buzzword to a hash table.*/
163
164	void addbuzzwordhash(INDEXDATAHEADER header, char word)
165	{
166	unsigned hashval;
167	struct swline *sp;
168
169	if (isbuzzword(header, word))
170	return;
171
172	header->buzzwords_used_flag++;
173
174	sp = (struct swline *) emalloc(sizeof(struct swline));
175
176	sp->line = (char *) estrdup(word);
177
178
179	/* should buzzwords be case sensitive? */
180	strtolower( sp->line );
181
182	hashval = hash( sp->line );
183
184
185	sp->next = header->hashbuzzwordlist[hashval];
186	header->hashbuzzwordlist[hashval] = sp;
187	}
188
189	void freebuzzwordhash(INDEXDATAHEADER *header)
190	{
191	int i;
192	struct swline *sp,
193	*tmp;
194
195	for (i = 0; i < HASHSIZE; i++)
196	if (header->hashbuzzwordlist[i])
197	{
198	sp = (struct swline *) header->hashbuzzwordlist[i];
199	while (sp)
200	{
201	tmp = sp->next;
202	efree(sp->line);
203	efree(sp);
204	sp = tmp;
205	}
206	header->hashbuzzwordlist[i] = NULL;
207	}
208	}
209
210
211	/* Sees if a word is a buzzword by looking it up in the hash table. */
212
213	int isbuzzword(INDEXDATAHEADER header, char word)
214	{
215	unsigned hashval;
216	struct swline *sp;
217
218	hashval = hash(word);
219	sp = header->hashbuzzwordlist[hashval];
220
221	while (sp != NULL)
222	{
223	if (!strcmp(sp->line, word))
224	return 1;
225	sp = sp->next;
226	}
227	return 0;
228	}
229
230
231
232	void freestophash(INDEXDATAHEADER *header)
233	{
234	int i;
235	struct swline *sp,
236	*tmp;
237
238	for (i = 0; i < HASHSIZE; i++)
239	if (header->hashstoplist[i])
240	{
241	sp = (struct swline *) header->hashstoplist[i];
242	while (sp)
243	{
244	tmp = sp->next;
245	efree(sp->line);
246	efree(sp);
247	sp = tmp;
248	}
249	header->hashstoplist[i] = NULL;
250	}
251	}
252
253	void freeStopList(INDEXDATAHEADER *header)
254	{
255	int i;
256
257	for (i = 0; i < header->stopPos; i++)
258	efree(header->stopList[i]);
259	if (header->stopList)
260	efree(header->stopList);
261	header->stopList = NULL;
262	header->stopPos = header->stopMaxSize = 0;
263	}
264
265	/* Adds a "use" word to a hash table.
266	*/
267
268	void addusehash(INDEXDATAHEADER header, char word)
269	{
270	unsigned hashval;
271	struct swline *sp;
272
273	if (isuseword(header, word))
274	return;
275
276	sp = (struct swline *) emalloc(sizeof(struct swline));
277
278	sp->line = (char *) estrdup(word);
279
280	hashval = hash(word);
281	sp->next = header->hashuselist[hashval];
282	header->hashuselist[hashval] = sp;
283	}
284
285	/* Sees if a word is a "use" word by looking it up in the hash table.
286	*/
287
288	int isuseword(INDEXDATAHEADER header, char word)
289	{
290	unsigned hashval;
291	struct swline *sp;
292
293	hashval = hash(word);
294	sp = header->hashuselist[hashval];
295
296	while (sp != NULL)
297	{
298	if (!strcmp(sp->line, word))
299	return 1;
300	sp = sp->next;
301	}
302	return 0;
303	}