1 |
/* |
2 |
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company |
3 |
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 |
4 |
** |
5 |
** This program and library is free software; you can redistribute it and/or |
6 |
** modify it under the terms of the GNU (Library) General Public License |
7 |
** as published by the Free Software Foundation; either version 2 |
8 |
** of the License, or any later version. |
9 |
** |
10 |
** This program is distributed in the hope that it will be useful, |
11 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 |
** GNU (Library) General Public License for more details. |
14 |
** |
15 |
** You should have received a copy of the GNU (Library) General Public License |
16 |
** along with this program; if not, write to the Free Software |
17 |
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
18 |
** |
19 |
** fixed non-int subscripting pointed out by "gcc -Wall" |
20 |
** SRE 2/22/00 |
21 |
** |
22 |
** 2001-03-08 rasc rewritten and enhanced suffix routines |
23 |
** |
24 |
*/ |
25 |
|
26 |
#include "swish.h" |
27 |
#include "check.h" |
28 |
#include "hash.h" |
29 |
#include "string.h" |
30 |
#include "mem.h" |
31 |
|
32 |
/* Check if a file with a particular suffix should be indexed |
33 |
** according to the settings in the configuration file. |
34 |
*/ |
35 |
|
36 |
/* Should a word be indexed? Consults the stopword hash list |
37 |
** and checks if the word is of a reasonable length... |
38 |
** If you have any good rules that can work with most languages, |
39 |
** please let me know... |
40 |
*/ |
41 |
|
42 |
int isokword(sw, word, indexf) |
43 |
SWISH *sw; |
44 |
char *word; |
45 |
IndexFILE *indexf; |
46 |
{ |
47 |
int i, |
48 |
same, |
49 |
hasnumber, |
50 |
hasvowel, |
51 |
hascons, |
52 |
numberrow, |
53 |
vowelrow, |
54 |
consrow, |
55 |
wordlen; |
56 |
char lastchar; |
57 |
|
58 |
if (word[0] == '\0') |
59 |
return 0; |
60 |
|
61 |
if (isstopword(&indexf->header, word)) |
62 |
return 0; |
63 |
|
64 |
wordlen = strlen(word); |
65 |
if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit)) |
66 |
return 0; |
67 |
|
68 |
lastchar = '\0'; |
69 |
same = 0; |
70 |
hasnumber = hasvowel = hascons = 0; |
71 |
numberrow = vowelrow = consrow = 0; |
72 |
|
73 |
for (i = 0; word[i] != '\0'; i++) |
74 |
{ |
75 |
/* Max number of times a char can repeat in a word */ |
76 |
if (word[i] == lastchar) |
77 |
{ |
78 |
same++; |
79 |
if (same > IGNORESAME) |
80 |
return 0; |
81 |
} |
82 |
else |
83 |
same = 0; |
84 |
|
85 |
/* Max number of consecutive digits */ |
86 |
if (isdigit((int) ( (unsigned char) word[i]))) |
87 |
{ |
88 |
hasnumber = 1; |
89 |
numberrow++; |
90 |
if (numberrow > IGNOREROWN) |
91 |
return 0; |
92 |
vowelrow = 0; |
93 |
consrow = 0; |
94 |
} |
95 |
|
96 |
/* maximum number of consecutive vowels a word can have */ |
97 |
else if (isvowel(sw, word[i])) |
98 |
{ |
99 |
hasvowel = 1; |
100 |
vowelrow++; |
101 |
if (vowelrow > IGNOREROWV) |
102 |
return 0; |
103 |
numberrow = 0; |
104 |
consrow = 0; |
105 |
} |
106 |
|
107 |
/* maximum number of consecutive consonants a word can have */ |
108 |
else if (!ispunct((int) ( (unsigned char) word[i]))) |
109 |
{ |
110 |
hascons = 1; |
111 |
consrow++; |
112 |
if (consrow > IGNOREROWC) |
113 |
return 0; |
114 |
numberrow = 0; |
115 |
vowelrow = 0; |
116 |
} |
117 |
lastchar = word[i]; |
118 |
} |
119 |
|
120 |
/* If IGNOREALLV is 1, words containing all vowels won't be indexed. */ |
121 |
if (IGNOREALLV) |
122 |
if (hasvowel && !hascons) |
123 |
return 0; |
124 |
|
125 |
/* If IGNOREALLC is 1, words containing all consonants won't be indexed */ |
126 |
if (IGNOREALLC) |
127 |
if (hascons && !hasvowel) |
128 |
return 0; |
129 |
|
130 |
/* If IGNOREALLN is 1, words containing all digits won't be indexed */ |
131 |
if (IGNOREALLN) |
132 |
if (hasnumber && !hasvowel && !hascons) |
133 |
return 0; |
134 |
|
135 |
return 1; |
136 |
} |
137 |
|
138 |
|
139 |
/* |
140 |
-- Determine document type by checking the file extension |
141 |
-- of the filename |
142 |
-- Return: doctype |
143 |
-- 2001-03-08 rasc rewritten (optimize and match also |
144 |
-- e.g. ".htm", ".htm.de" or ".html.gz") |
145 |
*/ |
146 |
|
147 |
int getdoctype(char *filename, struct IndexContents *indexcontents) |
148 |
{ |
149 |
struct swline *swl; |
150 |
char *s, |
151 |
*fe; |
152 |
|
153 |
|
154 |
if (!indexcontents) |
155 |
return NODOCTYPE; |
156 |
|
157 |
/* basically do a right to left compare */ |
158 |
fe = (filename + strlen(filename)); |
159 |
while (indexcontents) |
160 |
{ |
161 |
swl = indexcontents->patt; |
162 |
|
163 |
while (swl) |
164 |
{ |
165 |
s = fe - strlen(swl->line); |
166 |
if (s >= filename) |
167 |
{ /* no negative overflow! */ |
168 |
if (!strcasecmp(swl->line, s)) |
169 |
{ |
170 |
return indexcontents->DocType;; |
171 |
} |
172 |
} |
173 |
swl = swl->next; |
174 |
} |
175 |
|
176 |
indexcontents = indexcontents->next; |
177 |
} |
178 |
|
179 |
return NODOCTYPE; |
180 |
} |
181 |
|
182 |
|
183 |
|
184 |
|
185 |
|
186 |
struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd) |
187 |
{ |
188 |
while (sd) |
189 |
{ |
190 |
if (sd->DocType == doctype) |
191 |
return sd; |
192 |
sd = sd->next; |
193 |
} |
194 |
return NULL; |
195 |
} |