/[MITgcm]/mitgcm.org/devel/buildweb/pkg/swish-e/src/stemmer.c
ViewVC logotype

Annotation of /mitgcm.org/devel/buildweb/pkg/swish-e/src/stemmer.c

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1.1.1 - (hide annotations) (download) (vendor branch)
Fri Sep 20 19:47:29 2002 UTC (22 years, 10 months ago) by adcroft
Branch: Import, MAIN
CVS Tags: baseline, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain
Importing web-site building process.

1 adcroft 1.1 /*
2     ** NOTE: This implementation was originally part of the WAIS system
3     ** The main function, Stem(), was incorporated into Swish-E 1.1
4     ** to provide a stemming function.
5     ** 11/24/98 Mark Gaulin
6     **
7     ** Stem returns original word if words stems to empty string
8     ** Bill Moseley 10/11/99
9     **
10     ** Repeats stemming until word will stem no more
11     ** Bill Moseley 10/17/99
12     **
13     ** function: EndsWithCVC patched a bug. see below. Moseley 10/19/99
14     **
15     ** Added word length arg to ReplaceEnd and Stem to avoid strcat overflow
16     ** 11/17/99 - SRE
17     **
18     ** fixed int cast, missing return value, braces around initializations: problems pointed out by "gcc -Wall"
19     ** SRE 2/22/00
20     **
21     ** Jose Ruiz 18/10/00
22     ** Remove static word from end var and make the code thread safe
23     **
24     ** Bill Moseley 20/05/01
25     ** Rewrote to simplify. No more need to repeat stem (expandstar gone from search.c)
26     ** got rid of most of the reallocation of memory.
27     */
28    
29     /* WIDE AREA INFORMATION SERVER SOFTWARE:
30     No guarantees or restrictions. See the readme file for the full standard
31     disclaimer.
32    
33     francois@welchgate.welch.jhu.edu
34     */
35    
36     /* Copyright (c) CNIDR (see ../COPYRIGHT) */
37    
38    
39    
40     /*
41     * stems a word.
42     *
43     */
44    
45     /* the main functions are:
46     * stemmer
47     *
48     */
49    
50     #include "swish.h"
51     #include <ctype.h>
52     #include <string.h>
53     #include <stdio.h>
54    
55    
56     #include "stemmer.h"
57     #include "mem.h"
58    
59    
60     #define FALSE 0
61     #define TRUE 1
62    
63    
64    
65     /******************************* stem.c ***********************************
66    
67     Purpose: Implementation of the Porter stemming algorithm documented
68     in: Porter, M.F., "An Algorithm For Suffix Stripping,"
69     Program 14 (3), July 1980, pp. 130-137.
70    
71     Provenance: Written by B. Frakes and C. Cox, 1986.
72     Changed by C. Fox, 1990.
73     - made measure function a DFA
74     - restructured structs
75     - renamed functions and variables
76     - restricted function and variable scopes
77     Changed by C. Fox, July, 1991.
78     - added ANSI C declarations
79     - branch tested to 90% coverage
80    
81     Notes: This code will make little sense without the the Porter
82     article. The stemming function converts its input to
83     lower case.
84     **/
85    
86     /************************ Standard Include Files *************************/
87    
88     /*****************************************************************************/
89     /***************** Private Defines and Data Structures *******************/
90    
91     #define IsVowel(c) ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c))
92    
93     typedef struct
94     {
95     int id; /* returned if rule fired */
96     char *old_end; /* suffix replaced */
97     char *new_end; /* suffix replacement */
98     int old_offset; /* from end of word to start of suffix */
99     int new_offset; /* from beginning to end of new suffix */
100     int min_root_size; /* min root word size for replacement */
101     int (*condition) (); /* the replacement test function */
102     }
103     RuleList;
104    
105     #define LAMBDA "" /* the constant empty string */
106    
107     /*****************************************************************************/
108     /******************** Private Function Declarations **********************/
109    
110     #ifdef __STDC__
111    
112     int WordSize(char *word);
113     int ContainsVowel(char *word);
114     int EndsWithCVC(char *word);
115     int AddAnE(char *word);
116     int RemoveAnE(char *word);
117     int ReplaceEnd(char *word, RuleList * rule);
118    
119     #else
120    
121     int WordSize( /* word */ );
122     int ContainsVowel( /* word */ );
123     int EndsWithCVC( /* word */ );
124     int AddAnE( /* word */ );
125     int RemoveAnE( /* word */ );
126     int ReplaceEnd( /* word, rule */ );
127    
128     #endif
129    
130     /******************************************************************************/
131     /***************** Initialized Private Data Structures ********************/
132     /* 2/22/00 - added braces around each line */
133    
134     static RuleList step1a_rules[] = {
135     {101, "sses", "ss", 3, 1, -1, NULL,},
136     {102, "ies", "i", 2, 0, -1, NULL,},
137     {103, "ss", "ss", 1, 1, -1, NULL,},
138     {104, "s", LAMBDA, 0, -1, -1, NULL,},
139     {000, NULL, NULL, 0, 0, 0, NULL,}
140     };
141    
142     static RuleList step1b_rules[] = {
143     {105, "eed", "ee", 2, 1, 0, NULL,},
144     {106, "ed", LAMBDA, 1, -1, -1, ContainsVowel,},
145     {107, "ing", LAMBDA, 2, -1, -1, ContainsVowel,},
146     {000, NULL, NULL, 0, 0, 0, NULL,}
147     };
148    
149     static RuleList step1b1_rules[] = {
150     {108, "at", "ate", 1, 2, -1, NULL,},
151     {109, "bl", "ble", 1, 2, -1, NULL,},
152     {110, "iz", "ize", 1, 2, -1, NULL,},
153     {111, "bb", "b", 1, 0, -1, NULL,},
154     {112, "dd", "d", 1, 0, -1, NULL,},
155     {113, "ff", "f", 1, 0, -1, NULL,},
156     {114, "gg", "g", 1, 0, -1, NULL,},
157     {115, "mm", "m", 1, 0, -1, NULL,},
158     {116, "nn", "n", 1, 0, -1, NULL,},
159     {117, "pp", "p", 1, 0, -1, NULL,},
160     {118, "rr", "r", 1, 0, -1, NULL,},
161     {119, "tt", "t", 1, 0, -1, NULL,},
162     {120, "ww", "w", 1, 0, -1, NULL,},
163     {121, "xx", "x", 1, 0, -1, NULL,},
164     {122, LAMBDA, "e", -1, 0, -1, AddAnE,},
165     {000, NULL, NULL, 0, 0, 0, NULL,}
166     };
167    
168     static RuleList step1c_rules[] = {
169     {123, "y", "i", 0, 0, -1, ContainsVowel,},
170     {000, NULL, NULL, 0, 0, 0, NULL,}
171     };
172    
173     static RuleList step2_rules[] = {
174     {203, "ational", "ate", 6, 2, 0, NULL,},
175     {204, "tional", "tion", 5, 3, 0, NULL,},
176     {205, "enci", "ence", 3, 3, 0, NULL,},
177     {206, "anci", "ance", 3, 3, 0, NULL,},
178     {207, "izer", "ize", 3, 2, 0, NULL,},
179     {208, "abli", "able", 3, 3, 0, NULL,},
180     {209, "alli", "al", 3, 1, 0, NULL,},
181     {210, "entli", "ent", 4, 2, 0, NULL,},
182     {211, "eli", "e", 2, 0, 0, NULL,},
183     {213, "ousli", "ous", 4, 2, 0, NULL,},
184     {214, "ization", "ize", 6, 2, 0, NULL,},
185     {215, "ation", "ate", 4, 2, 0, NULL,},
186     {216, "ator", "ate", 3, 2, 0, NULL,},
187     {217, "alism", "al", 4, 1, 0, NULL,},
188     {218, "iveness", "ive", 6, 2, 0, NULL,},
189     {219, "fulnes", "ful", 5, 2, 0, NULL,},
190     {220, "ousness", "ous", 6, 2, 0, NULL,},
191     {221, "aliti", "al", 4, 1, 0, NULL,},
192     {222, "iviti", "ive", 4, 2, 0, NULL,},
193     {223, "biliti", "ble", 5, 2, 0, NULL,},
194     {000, NULL, NULL, 0, 0, 0, NULL,}
195     };
196    
197     static RuleList step3_rules[] = {
198     {301, "icate", "ic", 4, 1, 0, NULL,},
199     {302, "ative", LAMBDA, 4, -1, 0, NULL,},
200     {303, "alize", "al", 4, 1, 0, NULL,},
201     {304, "iciti", "ic", 4, 1, 0, NULL,},
202     {305, "ical", "ic", 3, 1, 0, NULL,},
203     {308, "ful", LAMBDA, 2, -1, 0, NULL,},
204     {309, "ness", LAMBDA, 3, -1, 0, NULL,},
205     {000, NULL, NULL, 0, 0, 0, NULL,}
206     };
207    
208     static RuleList step4_rules[] = {
209     {401, "al", LAMBDA, 1, -1, 1, NULL,},
210     {402, "ance", LAMBDA, 3, -1, 1, NULL,},
211     {403, "ence", LAMBDA, 3, -1, 1, NULL,},
212     {405, "er", LAMBDA, 1, -1, 1, NULL,},
213     {406, "ic", LAMBDA, 1, -1, 1, NULL,},
214     {407, "able", LAMBDA, 3, -1, 1, NULL,},
215     {408, "ible", LAMBDA, 3, -1, 1, NULL,},
216     {409, "ant", LAMBDA, 2, -1, 1, NULL,},
217     {410, "ement", LAMBDA, 4, -1, 1, NULL,},
218     {411, "ment", LAMBDA, 3, -1, 1, NULL,},
219     {412, "ent", LAMBDA, 2, -1, 1, NULL,},
220     {423, "sion", "s", 3, 0, 1, NULL,},
221     {424, "tion", "t", 3, 0, 1, NULL,},
222     {415, "ou", LAMBDA, 1, -1, 1, NULL,},
223     {416, "ism", LAMBDA, 2, -1, 1, NULL,},
224     {417, "ate", LAMBDA, 2, -1, 1, NULL,},
225     {418, "iti", LAMBDA, 2, -1, 1, NULL,},
226     {419, "ous", LAMBDA, 2, -1, 1, NULL,},
227     {420, "ive", LAMBDA, 2, -1, 1, NULL,},
228     {421, "ize", LAMBDA, 2, -1, 1, NULL,},
229     {000, NULL, NULL, 0, 0, 0, NULL,}
230     };
231    
232     static RuleList step5a_rules[] = {
233     {501, "e", LAMBDA, 0, -1, 1, NULL,},
234     {502, "e", LAMBDA, 0, -1, -1, RemoveAnE,},
235     {000, NULL, NULL, 0, 0, 0, NULL,}
236     };
237    
238     static RuleList step5b_rules[] = {
239     {503, "ll", "l", 1, 0, 1, NULL,},
240     {000, NULL, NULL, 0, 0, 0, NULL,}
241     };
242    
243    
244     static RuleList *all_steps[] = {
245     step1a_rules,
246     step1b_rules,
247     /* step1b1_rules, -- conditionaly called below */
248     step1c_rules,
249     step2_rules,
250     step3_rules,
251     step4_rules,
252     step5a_rules,
253     step5b_rules,
254     };
255    
256    
257    
258    
259     /*****************************************************************************/
260     /******************** Private Function Declarations **********************/
261    
262     /*FN***************************************************************************
263    
264     WordSize( word )
265    
266     Returns: int -- a weird count of word size in adjusted syllables
267    
268     Purpose: Count syllables in a special way: count the number
269     vowel-consonant pairs in a word, disregarding initial
270     consonants and final vowels. The letter "y" counts as a
271     consonant at the beginning of a word and when it has a vowel
272     in front of it; otherwise (when it follows a consonant) it
273     is treated as a vowel. For example, the WordSize of "cat"
274     is 1, of "any" is 1, of "amount" is 2, of "anything" is 3.
275    
276     Plan: Run a DFA to compute the word size
277    
278     Notes: The easiest and fastest way to compute this funny measure is
279     with a finite state machine. The initial state 0 checks
280     the first letter. If it is a vowel, then the machine changes
281     to state 1, which is the "last letter was a vowel" state.
282     If the first letter is a consonant or y, then it changes
283     to state 2, the "last letter was a consonant state". In
284     state 1, a y is treated as a consonant (since it follows
285     a vowel), but in state 2, y is treated as a vowel (since
286     it follows a consonant. The result counter is incremented
287     on the transition from state 1 to state 2, since this
288     transition only occurs after a vowel-consonant pair, which
289     is what we are counting.
290     **/
291    
292     int WordSize(char *word)
293     {
294     register int result; /* WordSize of the word */
295     register int state; /* current state in machine */
296    
297     result = 0;
298     state = 0;
299    
300     /* Run a DFA to compute the word size */
301     while ( *word )
302     {
303     switch (state)
304     {
305     case 0:
306     state = (IsVowel(*word)) ? 1 : 2;
307     break;
308     case 1:
309     state = (IsVowel(*word)) ? 1 : 2;
310     if (2 == state)
311     result++;
312     break;
313     case 2:
314     state = (IsVowel(*word) || ('y' == *word)) ? 1 : 2;
315     break;
316     }
317     word++;
318     }
319    
320     return (result);
321    
322     } /* WordSize */
323    
324     /*FN**************************************************************************
325    
326     ContainsVowel( word, end )
327    
328     Returns: int -- TRUE (1) if the word parameter contains a vowel,
329     FALSE (0) otherwise.
330    
331     Purpose: Some of the rewrite rules apply only to a root containing
332     a vowel, where a vowel is one of "aeiou" or y with a
333     consonant in front of it.
334    
335     Plan: Obviously, under the definition of a vowel, a word contains
336     a vowel iff either its first letter is one of "aeiou", or
337     any of its other letters are "aeiouy". The plan is to
338     test this condition.
339    
340     Notes: None
341     **/
342    
343     int ContainsVowel(char *word)
344     {
345    
346     /* This isn't needed, right? */
347     if ( !*word )
348     return FALSE;
349    
350     return (IsVowel(*word) || (NULL != strpbrk(word + 1, "aeiouy")));
351    
352    
353     } /* ContainsVowel */
354    
355     /*FN**************************************************************************
356    
357     EndsWithCVC( word )
358    
359     Returns: int -- TRUE (1) if the current word ends with a
360     consonant-vowel-consonant combination, and the second
361     consonant is not w, x, or y, FALSE (0) otherwise.
362    
363     Purpose: Some of the rewrite rules apply only to a root with
364     this characteristic.
365    
366     Plan: Look at the last three characters.
367    
368     Notes: None
369     **/
370    
371     int EndsWithCVC(char *word)
372     {
373     int length; /* for finding the last three characters */
374    
375     if ((length = (int) strlen(word)) < 3) /* This was < 2 in original - Moseley 10/19/99 */
376     return (FALSE);
377    
378    
379     return (
380     (NULL == strchr("aeiouwxy", (int)word[--length])) /* consonant */
381     && (NULL != strchr("aeiouy", (int)word[--length])) /* vowel */
382     && (NULL == strchr("aeiou", (int)word[--length]))
383     ); /* consonant */
384    
385     } /* EndsWithCVC */
386    
387     /*FN**************************************************************************
388    
389     AddAnE( word )
390    
391     Returns: int -- TRUE (1) if the current word meets special conditions
392     for adding an e.
393    
394     Purpose: Rule 122 applies only to a root with this characteristic.
395    
396     Plan: Check for size of 1 and a consonant-vowel-consonant ending.
397    
398     Notes: None
399     **/
400    
401     int AddAnE(char *word)
402     {
403    
404     return ((1 == WordSize(word)) && EndsWithCVC(word));
405    
406     } /* AddAnE */
407    
408     /*FN**************************************************************************
409    
410     RemoveAnE( word )
411    
412     Returns: int -- TRUE (1) if the current word meets special conditions
413     for removing an e.
414    
415     Purpose: Rule 502 applies only to a root with this characteristic.
416    
417     Plan: Check for size of 1 and no consonant-vowel-consonant ending.
418    
419     Notes: None
420     **/
421    
422     int RemoveAnE(char *word)
423     {
424    
425     return ((1 == WordSize(word)) && !EndsWithCVC(word));
426    
427     } /* RemoveAnE */
428    
429     /*FN**************************************************************************
430    
431     ReplaceEnd( word, rule, end)
432    
433     Returns: int -- the id for the rule fired, 0 is none is fired
434    
435     Purpose: Apply a set of rules to replace the suffix of a word
436    
437     Plan: Loop through the rule set until a match meeting all conditions
438     is found. If a rule fires, return its id, otherwise return 0.
439     Connditions on the length of the root are checked as part of this
440     function's processing because this check is so often made.
441    
442     Notes: This is the main routine driving the stemmer. It goes through
443     a set of suffix replacement rules looking for a match on the
444     current suffix. When it finds one, if the root of the word
445     is long enough, and it meets whatever other conditions are
446     required, then the suffix is replaced, and the function returns.
447     **/
448    
449     int ReplaceEnd(char *word, RuleList *rule)
450     {
451     char *ending; /* set to start of possible stemmed suffix */
452     char tmp_ch; /* save replaced character when testing */
453     char *end; /* pointer to last char of string */
454    
455     end = word + strlen( word ) - 1;
456    
457     while ( rule->id )
458     {
459     /* point ending to the start of the test sufix */
460     ending = end - rule->old_offset;
461    
462     /* is word long enough to contain suffix, and does it exist? */
463     if ( (word <= ending ) && (0 == strcmp(ending, rule->old_end)) )
464     {
465     tmp_ch = *ending; /* in case we change our mind */
466     *ending = '\0';
467    
468     if ( rule->min_root_size < WordSize(word) )
469     if (!rule->condition || (*rule->condition) ( word ))
470     {
471     /* replace the ending */
472     if ( (strlen( word ) + rule->new_offset + 1 ) >= MAXWORDLEN )
473     return STEM_WORD_TOO_BIG;
474     strcat( word, rule->new_end );
475    
476     return rule->id;
477     }
478     *ending = tmp_ch; /* nope, put it back */
479     }
480    
481     rule++;
482     }
483    
484     return 0;
485    
486     } /* ReplaceEnd */
487    
488     /*****************************************************************************/
489     /********************* Public Function Declarations **********************/
490    
491     /*FN***************************************************************************
492    
493     Stem( word )
494    
495     Returns: int -- FALSE (0) if the word contains non-alphabetic characters
496     and hence is not stemmed, TRUE (1) otherwise
497    
498     Purpose: Stem a word
499    
500     Plan: Part 1: Check to ensure the word is all alphabetic
501     Part 2: Run through the Porter algorithm
502     Part 3: Return an indication of successful stemming
503    
504     Notes: This function implements the Porter stemming algorithm, with
505     a few additions here and there. See:
506    
507     Porter, M.F., "An Algorithm For Suffix Stripping,"
508     Program 14 (3), July 1980, pp. 130-137.
509    
510     Porter's algorithm is an ad hoc set of rewrite rules with
511     various conditions on rule firing. The terminology of
512     "step 1a" and so on, is taken directly from Porter's
513     article, which unfortunately gives almost no justification
514     for the various steps. Thus this function more or less
515     faithfully refects the opaque presentation in the article.
516     Changes from the article amount to a few additions to the
517     rewrite rules; these are marked in the RuleList data
518     structures with comments.
519     **/
520    
521     int Stem(char **inword, int *lenword)
522     {
523     char *end; /* pointer to the end of the word */
524     char word[MAXWORDLEN+1];
525     int length;
526     int rule_result; /* which rule is fired in replacing an end */
527     // RuleList **rule;
528     int i;
529    
530    
531    
532     /* Make sure the word is not too large from the start. */
533     if ( strlen( *inword ) >= MAXWORDLEN )
534     return STEM_WORD_TOO_BIG;
535    
536    
537     /* make working copy */
538     strcpy( word, *inword );
539    
540    
541    
542     /* Part 1: Check to ensure the word is all alphabetic */
543     /* no longer converts to lower case -- word should be lower before calling */
544    
545     for ( end = word; *end; end++ )
546     if ( !isalpha( (unsigned int) *end ) )
547     return STEM_NOT_ALPHA;
548    
549    
550    
551     /* Part 2: Run through the Porter algorithm */
552    
553    
554     for (i = 0; i < sizeof(all_steps)/sizeof(all_steps[0]); i++)
555     {
556     rule_result = ReplaceEnd(word, all_steps[i]);
557    
558     if ((rule_result == 106) || (rule_result == 107))
559     rule_result = ReplaceEnd(word, step1b1_rules);
560    
561     if ( rule_result == STEM_WORD_TOO_BIG )
562     return rule_result;
563    
564     }
565    
566    
567    
568     length = strlen( word );
569    
570     /* Stem must be two chars or more in length */
571     if ( length <= 1 )
572     return STEM_TO_NOTHING;
573    
574     /* reallocate memory if need more room */
575    
576     if ( length >= *lenword )
577     {
578     efree( *inword );
579    
580     *lenword = length;
581     *inword = emalloc( *lenword + 1 );
582     }
583    
584     strcpy( *inword, word );
585     return STEM_OK;
586     }
587    

  ViewVC Help
Powered by ViewVC 1.1.22