swish-e/src/check.c

/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**
** fixed non-int subscripting pointed out by "gcc -Wall"
** SRE 2/22/00
**
** 2001-03-08 rasc   rewritten and enhanced suffix routines
**
*/

#include "swish.h"
#include "check.h"
#include "hash.h"
#include "string.h"
#include "mem.h"

/* Check if a file with a particular suffix should be indexed
** according to the settings in the configuration file.
*/

/* Should a word be indexed?
**
** Returns 1 if "word" passes every check below, 0 otherwise.
**
**   sw      - passed through to isvowel()
**   word    - NUL-terminated word to test
**   indexf  - index whose header supplies the stopword lookup and the
**             minwordlimit / maxwordlimit bounds
**
** A word is rejected when
**   - it is empty,
**   - isstopword() reports it for this index header,
**   - its length is outside [minwordlimit, maxwordlimit],
**   - a character repeats consecutively more than IGNORESAME extra times,
**   - it has a run of more than IGNOREROWN digits, IGNOREROWV vowels or
**     IGNOREROWC consonants,
**   - it matches one of the enabled IGNOREALLV / IGNOREALLC / IGNOREALLN
**     "all of one kind" rules.
**
** If you have any good rules that can work with most languages,
** please let me know...
*/

int     isokword(SWISH *sw, char *word, IndexFILE *indexf)
{
    int     i;
    int     same = 0;           /* consecutive repeats of the previous char */
    int     hasnumber = 0;
    int     hasvowel = 0;
    int     hascons = 0;
    int     numberrow = 0;      /* length of the current run of digits */
    int     vowelrow = 0;       /* length of the current run of vowels */
    int     consrow = 0;        /* length of the current run of consonants */
    int     wordlen;
    char    lastchar = '\0';

    if (word[0] == '\0')
        return 0;

    if (isstopword(&indexf->header, word))
        return 0;

    wordlen = (int) strlen(word);
    if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit))
        return 0;

    for (i = 0; word[i] != '\0'; i++)
    {
        /* ctype functions need a value representable as unsigned char */
        int     c = (int) ((unsigned char) word[i]);

        /* Max number of times a char can repeat in a word */
        if (word[i] == lastchar)
        {
            same++;
            if (same > IGNORESAME)
                return 0;
        }
        else
            same = 0;

        /* Max number of consecutive digits */
        if (isdigit(c))
        {
            hasnumber = 1;
            numberrow++;
            if (numberrow > IGNOREROWN)
                return 0;
            vowelrow = 0;
            consrow = 0;
        }

        /* maximum number of consecutive vowels a word can have */
        else if (isvowel(sw, word[i]))
        {
            hasvowel = 1;
            vowelrow++;
            if (vowelrow > IGNOREROWV)
                return 0;
            numberrow = 0;
            consrow = 0;
        }

        /* maximum number of consecutive consonants a word can have.
         ** Anything that is not a digit, a vowel or punctuation is counted
         ** as a consonant.  Punctuation itself is neither counted nor does
         ** it reset any of the run counters.
         */
        else if (!ispunct(c))
        {
            hascons = 1;
            consrow++;
            if (consrow > IGNOREROWC)
                return 0;
            numberrow = 0;
            vowelrow = 0;
        }

        lastchar = word[i];
    }

    /* If IGNOREALLV is 1, words containing all vowels won't be indexed. */
    if (IGNOREALLV && hasvowel && !hascons)
        return 0;

    /* If IGNOREALLC is 1, words containing all consonants won't be indexed */
    if (IGNOREALLC && hascons && !hasvowel)
        return 0;

    /* If IGNOREALLN is 1, words containing all digits won't be indexed */
    if (IGNOREALLN && hasnumber && !hasvowel && !hascons)
        return 0;

    return 1;
}


/*
  -- Determine document type by checking the file extension
  -- of the filename
  --
  -- filename      : NUL-terminated file name to classify
  -- indexcontents : list of doctype entries, each carrying a list of
  --                 suffix patterns (patt); may be NULL
  --
  -- Each pattern is compared case-insensitively against the tail of
  -- filename, so multi-part suffixes such as ".htm", ".htm.de" or
  -- ".html.gz" can be matched.
  --
  -- Return: DocType of the first entry with a matching pattern,
  --         NODOCTYPE if the list is NULL or nothing matches
  -- 2001-03-08 rasc   rewritten (optimize and match also
  --                   e.g. ".htm", ".htm.de" or ".html.gz")
*/

int     getdoctype(char *filename, struct IndexContents *indexcontents)
{
    struct swline *swl;
    size_t  filelen,
            pattlen;


    if (!indexcontents)
        return NODOCTYPE;

    filelen = strlen(filename);

    for (; indexcontents; indexcontents = indexcontents->next)
    {
        for (swl = indexcontents->patt; swl; swl = swl->next)
        {
            pattlen = strlen(swl->line);

            /* Compare lengths before doing any pointer arithmetic:
             ** forming a pointer in front of filename (pattern longer
             ** than the name) would be undefined behaviour.
             */
            if (pattlen > filelen)
                continue;

            /* right-aligned, case-insensitive compare of the suffix */
            if (!strcasecmp(swl->line, filename + (filelen - pattlen)))
                return indexcontents->DocType;
        }
    }

    return NODOCTYPE;
}


/*
  -- Look up the StoreDescription entry for a document type.
  -- Walks the list starting at sd and compares each entry's DocType
  -- with doctype.
  -- Return: the first matching entry, or NULL if the list is empty
  --         or holds no entry for doctype
*/
struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd)
{
    struct StoreDescription *entry;

    for (entry = sd; entry != NULL; entry = entry->next)
    {
        if (entry->DocType == doctype)
            return entry;
    }

    return NULL;
}
1	/*
2	** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
3	** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
4	**
5	** This program and library is free software; you can redistribute it and/or
6	** modify it under the terms of the GNU (Library) General Public License
7	** as published by the Free Software Foundation; either version 2
8	** of the License, or any later version.
9	**
10	** This program is distributed in the hope that it will be useful,
11	** but WITHOUT ANY WARRANTY; without even the implied warranty of
12	** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	** GNU (Library) General Public License for more details.
14	**
15	** You should have received a copy of the GNU (Library) General Public License
16	** along with this program; if not, write to the Free Software
17	** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18	**
19	** fixed non-int subscripting pointed out by "gcc -Wall"
20	** SRE 2/22/00
21	**
22	** 2001-03-08 rasc rewritten and enhanced suffix routines
23	**
24	*/
25
26	#include "swish.h"
27	#include "check.h"
28	#include "hash.h"
29	#include "string.h"
30	#include "mem.h"
31
32	/* Check if a file with a particular suffix should be indexed
33	** according to the settings in the configuration file.
34	*/
35
36	/* Should a word be indexed? Consults the stopword hash list
37	** and checks if the word is of a reasonable length...
38	** If you have any good rules that can work with most languages,
39	** please let me know...
40	*/
41
42	int isokword(sw, word, indexf)
43	SWISH *sw;
44	char *word;
45	IndexFILE *indexf;
46	{
47	int i,
48	same,
49	hasnumber,
50	hasvowel,
51	hascons,
52	numberrow,
53	vowelrow,
54	consrow,
55	wordlen;
56	char lastchar;
57
58	if (word[0] == '\0')
59	return 0;
60
61	if (isstopword(&indexf->header, word))
62	return 0;
63
64	wordlen = strlen(word);
65	if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit))
66	return 0;
67
68	lastchar = '\0';
69	same = 0;
70	hasnumber = hasvowel = hascons = 0;
71	numberrow = vowelrow = consrow = 0;
72
73	for (i = 0; word[i] != '\0'; i++)
74	{
75	/* Max number of times a char can repeat in a word */
76	if (word[i] == lastchar)
77	{
78	same++;
79	if (same > IGNORESAME)
80	return 0;
81	}
82	else
83	same = 0;
84
85	/* Max number of consecutive digits */
86	if (isdigit((int) ( (unsigned char) word[i])))
87	{
88	hasnumber = 1;
89	numberrow++;
90	if (numberrow > IGNOREROWN)
91	return 0;
92	vowelrow = 0;
93	consrow = 0;
94	}
95
96	/* maximum number of consecutive vowels a word can have */
97	else if (isvowel(sw, word[i]))
98	{
99	hasvowel = 1;
100	vowelrow++;
101	if (vowelrow > IGNOREROWV)
102	return 0;
103	numberrow = 0;
104	consrow = 0;
105	}
106
107	/* maximum number of consecutive consonants a word can have */
108	else if (!ispunct((int) ( (unsigned char) word[i])))
109	{
110	hascons = 1;
111	consrow++;
112	if (consrow > IGNOREROWC)
113	return 0;
114	numberrow = 0;
115	vowelrow = 0;
116	}
117	lastchar = word[i];
118	}
119
120	/* If IGNOREALLV is 1, words containing all vowels won't be indexed. */
121	if (IGNOREALLV)
122	if (hasvowel && !hascons)
123	return 0;
124
125	/* If IGNOREALLC is 1, words containing all consonants won't be indexed */
126	if (IGNOREALLC)
127	if (hascons && !hasvowel)
128	return 0;
129
130	/* If IGNOREALLN is 1, words containing all digits won't be indexed */
131	if (IGNOREALLN)
132	if (hasnumber && !hasvowel && !hascons)
133	return 0;
134
135	return 1;
136	}
137
138
139	/*
140	-- Determine document type by checking the file extension
141	-- of the filename
142	-- Return: doctype
143	-- 2001-03-08 rasc rewritten (optimize and match also
144	-- e.g. ".htm", ".htm.de" or ".html.gz")
145	*/
146
147	int getdoctype(char *filename, struct IndexContents *indexcontents)
148	{
149	struct swline *swl;
150	char *s,
151	*fe;
152
153
154	if (!indexcontents)
155	return NODOCTYPE;
156
157	/* basically do a right to left compare */
158	fe = (filename + strlen(filename));
159	while (indexcontents)
160	{
161	swl = indexcontents->patt;
162
163	while (swl)
164	{
165	s = fe - strlen(swl->line);
166	if (s >= filename)
167	{ /* no negative overflow! */
168	if (!strcasecmp(swl->line, s))
169	{
170	return indexcontents->DocType;;
171	}
172	}
173	swl = swl->next;
174	}
175
176	indexcontents = indexcontents->next;
177	}
178
179	return NODOCTYPE;
180	}
181
182
183
184
185
186	struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd)
187	{
188	while (sd)
189	{
190	if (sd->DocType == doctype)
191	return sd;
192	sd = sd->next;
193	}
194	return NULL;
195	}