/* ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 ** ** This program and library is free software; you can redistribute it and/or ** modify it under the terms of the GNU (Library) General Public License ** as published by the Free Software Foundation; either version 2 ** of the License, or any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU (Library) General Public License for more details. ** ** You should have received a copy of the GNU (Library) General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ** ** fixed non-int subscripting pointed out by "gcc -Wall" ** SRE 2/22/00 ** ** 2001-03-08 rasc rewritten and enhanced suffix routines ** */ #include "swish.h" #include "check.h" #include "hash.h" #include "string.h" #include "mem.h" /* Check if a file with a particular suffix should be indexed ** according to the settings in the configuration file. */ /* Should a word be indexed? Consults the stopword hash list ** and checks if the word is of a reasonable length... ** If you have any good rules that can work with most languages, ** please let me know... */ int isokword(sw, word, indexf) SWISH *sw; char *word; IndexFILE *indexf; { int i, same, hasnumber, hasvowel, hascons, numberrow, vowelrow, consrow, wordlen; char lastchar; if (word[0] == '\0') return 0; if (isstopword(&indexf->header, word)) return 0; wordlen = strlen(word); if ((wordlen < indexf->header.minwordlimit) || (wordlen > indexf->header.maxwordlimit)) return 0; lastchar = '\0'; same = 0; hasnumber = hasvowel = hascons = 0; numberrow = vowelrow = consrow = 0; for (i = 0; word[i] != '\0'; i++) { /* Max number of times a char can repeat in a word */ if (word[i] == lastchar) { same++; if (same > IGNORESAME) return 0; } else same = 0; /* Max number of consecutive digits */ if (isdigit((int) ( (unsigned char) word[i]))) { hasnumber = 1; numberrow++; if (numberrow > IGNOREROWN) return 0; vowelrow = 0; consrow = 0; } /* maximum number of consecutive vowels a word can have */ else if (isvowel(sw, word[i])) { hasvowel = 1; vowelrow++; if (vowelrow > IGNOREROWV) return 0; numberrow = 0; consrow = 0; } /* maximum number of consecutive consonants a word can have */ else if (!ispunct((int) ( (unsigned char) word[i]))) { hascons = 1; consrow++; if (consrow > IGNOREROWC) return 0; numberrow = 0; vowelrow = 0; } lastchar = word[i]; } /* If IGNOREALLV is 1, words containing all vowels won't be indexed. */ if (IGNOREALLV) if (hasvowel && !hascons) return 0; /* If IGNOREALLC is 1, words containing all consonants won't be indexed */ if (IGNOREALLC) if (hascons && !hasvowel) return 0; /* If IGNOREALLN is 1, words containing all digits won't be indexed */ if (IGNOREALLN) if (hasnumber && !hasvowel && !hascons) return 0; return 1; } /* -- Determine document type by checking the file extension -- of the filename -- Return: doctype -- 2001-03-08 rasc rewritten (optimize and match also -- e.g. ".htm", ".htm.de" or ".html.gz") */ int getdoctype(char *filename, struct IndexContents *indexcontents) { struct swline *swl; char *s, *fe; if (!indexcontents) return NODOCTYPE; /* basically do a right to left compare */ fe = (filename + strlen(filename)); while (indexcontents) { swl = indexcontents->patt; while (swl) { s = fe - strlen(swl->line); if (s >= filename) { /* no negative overflow! */ if (!strcasecmp(swl->line, s)) { return indexcontents->DocType;; } } swl = swl->next; } indexcontents = indexcontents->next; } return NODOCTYPE; } struct StoreDescription *hasdescription(int doctype, struct StoreDescription *sd) { while (sd) { if (sd->DocType == doctype) return sd; sd = sd->next; } return NULL; }