/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/check.c
ViewVC logotype

Contents of /mitgcm.org/devel/buildweb/pkg/swish-e/src/check.c

Parent Directory | Revision Log | View Revision Graph


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 /*
2 ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
3 ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
4 **
5 ** This program and library is free software; you can redistribute it and/or
6 ** modify it under the terms of the GNU (Library) General Public License
7 ** as published by the Free Software Foundation; either version 2
8 ** of the License, or any later version.
9 **
10 ** This program is distributed in the hope that it will be useful,
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ** GNU (Library) General Public License for more details.
14 **
15 ** You should have received a copy of the GNU (Library) General Public License
16 ** along with this program; if not, write to the Free Software
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 **
19 ** fixed non-int subscripting pointed out by "gcc -Wall"
20 ** SRE 2/22/00
21 **
22 ** 2001-03-08 rasc rewritten and enhanced suffix routines
23 **
24 */
25
26 #include "swish.h"
27 #include "check.h"
28 #include "hash.h"
29 #include "string.h"
30 #include "mem.h"
31
32 /* Check if a file with a particular suffix should be indexed
33 ** according to the settings in the configuration file.
34 */
35
36 /* Should a word be indexed? Consults the stopword hash list
37 ** and checks if the word is of a reasonable length...
38 ** If you have any good rules that can work with most languages,
39 ** please let me know...
40 */
41
42 int isokword(sw, word, indexf)
43 SWISH *sw;
44 char *word;
45 IndexFILE *indexf;
46 {
47 int i,
48 same,
49 hasnumber,
50 hasvowel,
51 hascons,
52 numberrow,
53 vowelrow,
54 consrow,
55 wordlen;
56 char lastchar;
57
58 if (word[0] == '\0')
59 return 0;
60
61 if (isstopword(&indexf->header, word))
62 return 0;
63
64 wordlen = strlen(word);
65 if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit))
66 return 0;
67
68 lastchar = '\0';
69 same = 0;
70 hasnumber = hasvowel = hascons = 0;
71 numberrow = vowelrow = consrow = 0;
72
73 for (i = 0; word[i] != '\0'; i++)
74 {
75 /* Max number of times a char can repeat in a word */
76 if (word[i] == lastchar)
77 {
78 same++;
79 if (same > IGNORESAME)
80 return 0;
81 }
82 else
83 same = 0;
84
85 /* Max number of consecutive digits */
86 if (isdigit((int) ( (unsigned char) word[i])))
87 {
88 hasnumber = 1;
89 numberrow++;
90 if (numberrow > IGNOREROWN)
91 return 0;
92 vowelrow = 0;
93 consrow = 0;
94 }
95
96 /* maximum number of consecutive vowels a word can have */
97 else if (isvowel(sw, word[i]))
98 {
99 hasvowel = 1;
100 vowelrow++;
101 if (vowelrow > IGNOREROWV)
102 return 0;
103 numberrow = 0;
104 consrow = 0;
105 }
106
107 /* maximum number of consecutive consonants a word can have */
108 else if (!ispunct((int) ( (unsigned char) word[i])))
109 {
110 hascons = 1;
111 consrow++;
112 if (consrow > IGNOREROWC)
113 return 0;
114 numberrow = 0;
115 vowelrow = 0;
116 }
117 lastchar = word[i];
118 }
119
120 /* If IGNOREALLV is 1, words containing all vowels won't be indexed. */
121 if (IGNOREALLV)
122 if (hasvowel && !hascons)
123 return 0;
124
125 /* If IGNOREALLC is 1, words containing all consonants won't be indexed */
126 if (IGNOREALLC)
127 if (hascons && !hasvowel)
128 return 0;
129
130 /* If IGNOREALLN is 1, words containing all digits won't be indexed */
131 if (IGNOREALLN)
132 if (hasnumber && !hasvowel && !hascons)
133 return 0;
134
135 return 1;
136 }
137
138
139 /*
140 -- Determine document type by checking the file extension
141 -- of the filename
142 -- Return: doctype
143 -- 2001-03-08 rasc rewritten (optimize and match also
144 -- e.g. ".htm", ".htm.de" or ".html.gz")
145 */
146
147 int getdoctype(char *filename, struct IndexContents *indexcontents)
148 {
149 struct swline *swl;
150 char *s,
151 *fe;
152
153
154 if (!indexcontents)
155 return NODOCTYPE;
156
157 /* basically do a right to left compare */
158 fe = (filename + strlen(filename));
159 while (indexcontents)
160 {
161 swl = indexcontents->patt;
162
163 while (swl)
164 {
165 s = fe - strlen(swl->line);
166 if (s >= filename)
167 { /* no negative overflow! */
168 if (!strcasecmp(swl->line, s))
169 {
170 return indexcontents->DocType;;
171 }
172 }
173 swl = swl->next;
174 }
175
176 indexcontents = indexcontents->next;
177 }
178
179 return NODOCTYPE;
180 }
181
182
183
184
185
186 struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd)
187 {
188 while (sd)
189 {
190 if (sd->DocType == doctype)
191 return sd;
192 sd = sd->next;
193 }
194 return NULL;
195 }

  ViewVC Help
Powered by ViewVC 1.1.22