/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/check.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/check.c

Parent Directory | Revision Log | View Revision Graph


Revision 1.1 - (hide annotations) (download)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch point for: Import, MAIN
File MIME type: text/plain
Initial revision

1 adcroft 1.1 /*
2     ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
3     ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
4     **
5     ** This program and library is free software; you can redistribute it and/or
6     ** modify it under the terms of the GNU (Library) General Public License
7     ** as published by the Free Software Foundation; either version 2
8     ** of the License, or any later version.
9     **
10     ** This program is distributed in the hope that it will be useful,
11     ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12     ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13     ** GNU (Library) General Public License for more details.
14     **
15     ** You should have received a copy of the GNU (Library) General Public License
16     ** along with this program; if not, write to the Free Software
17     ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18     **
19     ** fixed non-int subscripting pointed out by "gcc -Wall"
20     ** SRE 2/22/00
21     **
22     ** 2001-03-08 rasc rewritten and enhanced suffix routines
23     **
24     */
25    
26     #include "swish.h"
27     #include "check.h"
28     #include "hash.h"
29     #include "string.h"
30     #include "mem.h"
31    
32     /* Check if a file with a particular suffix should be indexed
33     ** according to the settings in the configuration file.
34     */
35    
36     /* Should a word be indexed? Consults the stopword hash list
37     ** and checks if the word is of a reasonable length...
38     ** If you have any good rules that can work with most languages,
39     ** please let me know...
40     */
41    
42     int isokword(sw, word, indexf)
43     SWISH *sw;
44     char *word;
45     IndexFILE *indexf;
46     {
47     int i,
48     same,
49     hasnumber,
50     hasvowel,
51     hascons,
52     numberrow,
53     vowelrow,
54     consrow,
55     wordlen;
56     char lastchar;
57    
58     if (word[0] == '\0')
59     return 0;
60    
61     if (isstopword(&indexf->header, word))
62     return 0;
63    
64     wordlen = strlen(word);
65     if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit))
66     return 0;
67    
68     lastchar = '\0';
69     same = 0;
70     hasnumber = hasvowel = hascons = 0;
71     numberrow = vowelrow = consrow = 0;
72    
73     for (i = 0; word[i] != '\0'; i++)
74     {
75     /* Max number of times a char can repeat in a word */
76     if (word[i] == lastchar)
77     {
78     same++;
79     if (same > IGNORESAME)
80     return 0;
81     }
82     else
83     same = 0;
84    
85     /* Max number of consecutive digits */
86     if (isdigit((int) ( (unsigned char) word[i])))
87     {
88     hasnumber = 1;
89     numberrow++;
90     if (numberrow > IGNOREROWN)
91     return 0;
92     vowelrow = 0;
93     consrow = 0;
94     }
95    
96     /* maximum number of consecutive vowels a word can have */
97     else if (isvowel(sw, word[i]))
98     {
99     hasvowel = 1;
100     vowelrow++;
101     if (vowelrow > IGNOREROWV)
102     return 0;
103     numberrow = 0;
104     consrow = 0;
105     }
106    
107     /* maximum number of consecutive consonants a word can have */
108     else if (!ispunct((int) ( (unsigned char) word[i])))
109     {
110     hascons = 1;
111     consrow++;
112     if (consrow > IGNOREROWC)
113     return 0;
114     numberrow = 0;
115     vowelrow = 0;
116     }
117     lastchar = word[i];
118     }
119    
120     /* If IGNOREALLV is 1, words containing all vowels won't be indexed. */
121     if (IGNOREALLV)
122     if (hasvowel && !hascons)
123     return 0;
124    
125     /* If IGNOREALLC is 1, words containing all consonants won't be indexed */
126     if (IGNOREALLC)
127     if (hascons && !hasvowel)
128     return 0;
129    
130     /* If IGNOREALLN is 1, words containing all digits won't be indexed */
131     if (IGNOREALLN)
132     if (hasnumber && !hasvowel && !hascons)
133     return 0;
134    
135     return 1;
136     }
137    
138    
139     /*
140     -- Determine document type by checking the file extension
141     -- of the filename
142     -- Return: doctype
143     -- 2001-03-08 rasc rewritten (optimize and match also
144     -- e.g. ".htm", ".htm.de" or ".html.gz")
145     */
146    
147     int getdoctype(char *filename, struct IndexContents *indexcontents)
148     {
149     struct swline *swl;
150     char *s,
151     *fe;
152    
153    
154     if (!indexcontents)
155     return NODOCTYPE;
156    
157     /* basically do a right to left compare */
158     fe = (filename + strlen(filename));
159     while (indexcontents)
160     {
161     swl = indexcontents->patt;
162    
163     while (swl)
164     {
165     s = fe - strlen(swl->line);
166     if (s >= filename)
167     { /* no negative overflow! */
168     if (!strcasecmp(swl->line, s))
169     {
170     return indexcontents->DocType;;
171     }
172     }
173     swl = swl->next;
174     }
175    
176     indexcontents = indexcontents->next;
177     }
178    
179     return NODOCTYPE;
180     }
181    
182    
183    
184    
185    
186     struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd)
187     {
188     while (sd)
189     {
190     if (sd->DocType == doctype)
191     return sd;
192     sd = sd->next;
193     }
194     return NULL;
195     }

  ViewVC Help
Powered by ViewVC 1.1.22