1 |
adcroft |
1.1 |
/* |
2 |
|
|
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
3 |
|
|
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
4 |
|
|
** |
5 |
|
|
** This program and library is free software; you can redistribute it and/or |
6 |
|
|
** modify it under the terms of the GNU (Library) General Public License |
7 |
|
|
** as published by the Free Software Foundation; either version 2 |
8 |
|
|
** of the License, or any later version. |
9 |
|
|
** |
10 |
|
|
** This program is distributed in the hope that it will be useful, |
11 |
|
|
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
|
|
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
|
|
** GNU (Library) General Public License for more details. |
14 |
|
|
** |
15 |
|
|
** You should have received a copy of the GNU (Library) General Public License |
16 |
|
|
** along with this program; if not, write to the Free Software |
17 |
|
|
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
18 |
|
|
** |
19 |
|
|
** fixed non-int subscripting pointed out by "gcc -Wall" |
20 |
|
|
** SRE 2/22/00 |
21 |
|
|
** |
22 |
|
|
** 2001-03-08 rasc rewritten and enhanced suffix routines |
23 |
|
|
** |
24 |
|
|
*/ |
25 |
|
|
|
26 |
|
|
#include "swish.h" |
27 |
|
|
#include "check.h" |
28 |
|
|
#include "hash.h" |
29 |
|
|
#include "string.h" |
30 |
|
|
#include "mem.h" |
31 |
|
|
|
32 |
|
|
/* Check if a file with a particular suffix should be indexed |
33 |
|
|
** according to the settings in the configuration file. |
34 |
|
|
*/ |
35 |
|
|
|
36 |
|
|
/* Should a word be indexed? Consults the stopword hash list |
37 |
|
|
** and checks if the word is of a reasonable length... |
38 |
|
|
** If you have any good rules that can work with most languages, |
39 |
|
|
** please let me know... |
40 |
|
|
*/ |
41 |
|
|
|
42 |
|
|
int isokword(sw, word, indexf) |
43 |
|
|
SWISH *sw; |
44 |
|
|
char *word; |
45 |
|
|
IndexFILE *indexf; |
46 |
|
|
{ |
47 |
|
|
int i, |
48 |
|
|
same, |
49 |
|
|
hasnumber, |
50 |
|
|
hasvowel, |
51 |
|
|
hascons, |
52 |
|
|
numberrow, |
53 |
|
|
vowelrow, |
54 |
|
|
consrow, |
55 |
|
|
wordlen; |
56 |
|
|
char lastchar; |
57 |
|
|
|
58 |
|
|
if (word[0] == '\0') |
59 |
|
|
return 0; |
60 |
|
|
|
61 |
|
|
if (isstopword(&indexf->header, word)) |
62 |
|
|
return 0; |
63 |
|
|
|
64 |
|
|
wordlen = strlen(word); |
65 |
|
|
if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit)) |
66 |
|
|
return 0; |
67 |
|
|
|
68 |
|
|
lastchar = '\0'; |
69 |
|
|
same = 0; |
70 |
|
|
hasnumber = hasvowel = hascons = 0; |
71 |
|
|
numberrow = vowelrow = consrow = 0; |
72 |
|
|
|
73 |
|
|
for (i = 0; word[i] != '\0'; i++) |
74 |
|
|
{ |
75 |
|
|
/* Max number of times a char can repeat in a word */ |
76 |
|
|
if (word[i] == lastchar) |
77 |
|
|
{ |
78 |
|
|
same++; |
79 |
|
|
if (same > IGNORESAME) |
80 |
|
|
return 0; |
81 |
|
|
} |
82 |
|
|
else |
83 |
|
|
same = 0; |
84 |
|
|
|
85 |
|
|
/* Max number of consecutive digits */ |
86 |
|
|
if (isdigit((int) ( (unsigned char) word[i]))) |
87 |
|
|
{ |
88 |
|
|
hasnumber = 1; |
89 |
|
|
numberrow++; |
90 |
|
|
if (numberrow > IGNOREROWN) |
91 |
|
|
return 0; |
92 |
|
|
vowelrow = 0; |
93 |
|
|
consrow = 0; |
94 |
|
|
} |
95 |
|
|
|
96 |
|
|
/* maximum number of consecutive vowels a word can have */ |
97 |
|
|
else if (isvowel(sw, word[i])) |
98 |
|
|
{ |
99 |
|
|
hasvowel = 1; |
100 |
|
|
vowelrow++; |
101 |
|
|
if (vowelrow > IGNOREROWV) |
102 |
|
|
return 0; |
103 |
|
|
numberrow = 0; |
104 |
|
|
consrow = 0; |
105 |
|
|
} |
106 |
|
|
|
107 |
|
|
/* maximum number of consecutive consonants a word can have */ |
108 |
|
|
else if (!ispunct((int) ( (unsigned char) word[i]))) |
109 |
|
|
{ |
110 |
|
|
hascons = 1; |
111 |
|
|
consrow++; |
112 |
|
|
if (consrow > IGNOREROWC) |
113 |
|
|
return 0; |
114 |
|
|
numberrow = 0; |
115 |
|
|
vowelrow = 0; |
116 |
|
|
} |
117 |
|
|
lastchar = word[i]; |
118 |
|
|
} |
119 |
|
|
|
120 |
|
|
/* If IGNOREALLV is 1, words containing all vowels won't be indexed. */ |
121 |
|
|
if (IGNOREALLV) |
122 |
|
|
if (hasvowel && !hascons) |
123 |
|
|
return 0; |
124 |
|
|
|
125 |
|
|
/* If IGNOREALLC is 1, words containing all consonants won't be indexed */ |
126 |
|
|
if (IGNOREALLC) |
127 |
|
|
if (hascons && !hasvowel) |
128 |
|
|
return 0; |
129 |
|
|
|
130 |
|
|
/* If IGNOREALLN is 1, words containing all digits won't be indexed */ |
131 |
|
|
if (IGNOREALLN) |
132 |
|
|
if (hasnumber && !hasvowel && !hascons) |
133 |
|
|
return 0; |
134 |
|
|
|
135 |
|
|
return 1; |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
|
139 |
|
|
/* |
140 |
|
|
-- Determine document type by checking the file extension |
141 |
|
|
-- of the filename |
142 |
|
|
-- Return: doctype |
143 |
|
|
-- 2001-03-08 rasc rewritten (optimize and match also |
144 |
|
|
-- e.g. ".htm", ".htm.de" or ".html.gz") |
145 |
|
|
*/ |
146 |
|
|
|
147 |
|
|
int getdoctype(char *filename, struct IndexContents *indexcontents) |
148 |
|
|
{ |
149 |
|
|
struct swline *swl; |
150 |
|
|
char *s, |
151 |
|
|
*fe; |
152 |
|
|
|
153 |
|
|
|
154 |
|
|
if (!indexcontents) |
155 |
|
|
return NODOCTYPE; |
156 |
|
|
|
157 |
|
|
/* basically do a right to left compare */ |
158 |
|
|
fe = (filename + strlen(filename)); |
159 |
|
|
while (indexcontents) |
160 |
|
|
{ |
161 |
|
|
swl = indexcontents->patt; |
162 |
|
|
|
163 |
|
|
while (swl) |
164 |
|
|
{ |
165 |
|
|
s = fe - strlen(swl->line); |
166 |
|
|
if (s >= filename) |
167 |
|
|
{ /* no negative overflow! */ |
168 |
|
|
if (!strcasecmp(swl->line, s)) |
169 |
|
|
{ |
170 |
|
|
return indexcontents->DocType;; |
171 |
|
|
} |
172 |
|
|
} |
173 |
|
|
swl = swl->next; |
174 |
|
|
} |
175 |
|
|
|
176 |
|
|
indexcontents = indexcontents->next; |
177 |
|
|
} |
178 |
|
|
|
179 |
|
|
return NODOCTYPE; |
180 |
|
|
} |
181 |
|
|
|
182 |
|
|
|
183 |
|
|
|
184 |
|
|
|
185 |
|
|
|
186 |
|
|
struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd) |
187 |
|
|
{ |
188 |
|
|
while (sd) |
189 |
|
|
{ |
190 |
|
|
if (sd->DocType == doctype) |
191 |
|
|
return sd; |
192 |
|
|
sd = sd->next; |
193 |
|
|
} |
194 |
|
|
return NULL; |
195 |
|
|
} |