1 |
/* |
2 |
$Id: double_metaphone.c,v 1.1 2002/08/20 23:20:21 whmoseley Exp $ |
3 |
** |
4 |
** |
5 |
** August 20, 2002 moseley - first added to swish-e |
6 |
** |
7 |
** this is a very slightly modified version of the double_metaphone.c code |
8 |
** from the Perl module Text::DoubleMetaphone by Maurice Aubrey, and based |
9 |
** on the work of Lawrence Philips. |
10 |
** See http://aspell.sourceforge.net/metaphone |
11 |
** |
12 |
** From the Text::DoubleMetaphone README file: |
13 |
|
14 |
DESCRIPTION |
15 |
|
16 |
This module implements a "sounds like" algorithm developed |
17 |
by Lawrence Philips which he published in the June, 2000 issue |
18 |
of C/C++ Users Journal. Double Metaphone is an improved |
19 |
version of Philips' original Metaphone algorithm. |
20 |
|
21 |
COPYRIGHT |
22 |
|
23 |
Copyright 2000, Maurice Aubrey <maurice@hevanet.com>. |
24 |
All rights reserved. |
25 |
|
26 |
This code is based heavily on the C++ implementation by |
27 |
Lawrence Philips and incorporates several bug fixes courtesy |
28 |
of Kevin Atkinson <kevina@users.sourceforge.net>. |
29 |
|
30 |
This module is free software; you may redistribute it and/or |
31 |
modify it under the same terms as Perl itself. |
32 |
|
33 |
|
34 |
** |
35 |
** |
36 |
*/ |
37 |
|
38 |
|
39 |
|
40 |
#include <stdio.h> |
41 |
#include <ctype.h> |
42 |
#include <stdlib.h> |
43 |
#include <string.h> |
44 |
#include <stdarg.h> |
45 |
#include <assert.h> |
46 |
#include "double_metaphone.h" |
47 |
#include "mem.h" |
48 |
|
49 |
|
50 |
/*
 * Allocation helpers built on emalloc/erealloc/efree (presumably the
 * project allocator declared in mem.h -- confirm there for failure
 * semantics).
 *
 *   META_MALLOC(v, n, t)   assign to lvalue v a new buffer of n objects of type t
 *   META_REALLOC(v, n, t)  resize the buffer held in v to n objects of type t
 *   META_FREE(x)           release a buffer obtained from the macros above
 */
#define META_MALLOC(v, n, t) \
    ((v) = (t *) emalloc((n) * sizeof(t)))

#define META_REALLOC(v, n, t) \
    ((v) = (t *) erealloc((v), (n) * sizeof(t)))

#define META_FREE(x) \
    efree((x))
55 |
|
56 |
|
57 |
metastring * |
58 |
NewMetaString(char *init_str) |
59 |
{ |
60 |
metastring *s; |
61 |
char empty_string[] = ""; |
62 |
|
63 |
META_MALLOC(s, 1, metastring); |
64 |
assert( s != NULL ); |
65 |
|
66 |
if (init_str == NULL) |
67 |
init_str = empty_string; |
68 |
s->length = strlen(init_str); |
69 |
/* preallocate a bit more for potential growth */ |
70 |
s->bufsize = s->length + 7; |
71 |
|
72 |
META_MALLOC(s->str, s->bufsize, char); |
73 |
assert( s->str != NULL ); |
74 |
|
75 |
strncpy(s->str, init_str, s->length + 1); |
76 |
s->free_string_on_destroy = 1; |
77 |
|
78 |
return s; |
79 |
} |
80 |
|
81 |
|
82 |
void |
83 |
DestroyMetaString(metastring * s) |
84 |
{ |
85 |
if (s == NULL) |
86 |
return; |
87 |
|
88 |
if (s->free_string_on_destroy && (s->str != NULL)) |
89 |
META_FREE(s->str); |
90 |
|
91 |
META_FREE(s); |
92 |
} |
93 |
|
94 |
|
95 |
void |
96 |
IncreaseBuffer(metastring * s, int chars_needed) |
97 |
{ |
98 |
META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char); |
99 |
assert( s->str != NULL ); |
100 |
s->bufsize = s->bufsize + chars_needed + 10; |
101 |
} |
102 |
|
103 |
|
104 |
void |
105 |
MakeUpper(metastring * s) |
106 |
{ |
107 |
char *i; |
108 |
|
109 |
for (i = s->str; *i; i++) |
110 |
{ |
111 |
*i = toupper(*i); |
112 |
} |
113 |
} |
114 |
|
115 |
|
116 |
int |
117 |
IsVowel(metastring * s, int pos) |
118 |
{ |
119 |
char c; |
120 |
|
121 |
if ((pos < 0) || (pos >= s->length)) |
122 |
return 0; |
123 |
|
124 |
c = *(s->str + pos); |
125 |
if ((c == 'A') || (c == 'E') || (c == 'I') || (c =='O') || |
126 |
(c =='U') || (c == 'Y')) |
127 |
return 1; |
128 |
|
129 |
return 0; |
130 |
} |
131 |
|
132 |
|
133 |
int |
134 |
SlavoGermanic(metastring * s) |
135 |
{ |
136 |
if ((char *) strstr(s->str, "W")) |
137 |
return 1; |
138 |
else if ((char *) strstr(s->str, "K")) |
139 |
return 1; |
140 |
else if ((char *) strstr(s->str, "CZ")) |
141 |
return 1; |
142 |
else if ((char *) strstr(s->str, "WITZ")) |
143 |
return 1; |
144 |
else |
145 |
return 0; |
146 |
} |
147 |
|
148 |
|
149 |
int |
150 |
GetLength(metastring * s) |
151 |
{ |
152 |
return s->length; |
153 |
} |
154 |
|
155 |
|
156 |
char |
157 |
GetAt(metastring * s, int pos) |
158 |
{ |
159 |
if ((pos < 0) || (pos >= s->length)) |
160 |
return '\0'; |
161 |
|
162 |
return ((char) *(s->str + pos)); |
163 |
} |
164 |
|
165 |
|
166 |
void |
167 |
SetAt(metastring * s, int pos, char c) |
168 |
{ |
169 |
if ((pos < 0) || (pos >= s->length)) |
170 |
return; |
171 |
|
172 |
*(s->str + pos) = c; |
173 |
} |
174 |
|
175 |
|
176 |
/* |
177 |
Caveats: the START value is 0 based |
178 |
*/ |
179 |
int |
180 |
StringAt(metastring * s, int start, int length, ...) |
181 |
{ |
182 |
char *test; |
183 |
char *pos; |
184 |
va_list ap; |
185 |
|
186 |
if ((start < 0) || (start >= s->length)) |
187 |
return 0; |
188 |
|
189 |
pos = (s->str + start); |
190 |
va_start(ap, length); |
191 |
|
192 |
do |
193 |
{ |
194 |
test = va_arg(ap, char *); |
195 |
if (*test && (strncmp(pos, test, length) == 0)) |
196 |
return 1; |
197 |
} |
198 |
while (strcmp(test, "")); |
199 |
|
200 |
va_end(ap); |
201 |
|
202 |
return 0; |
203 |
} |
204 |
|
205 |
|
206 |
void |
207 |
MetaphAdd(metastring * s, char *new_str) |
208 |
{ |
209 |
int add_length; |
210 |
|
211 |
if (new_str == NULL) |
212 |
return; |
213 |
|
214 |
add_length = strlen(new_str); |
215 |
if ((s->length + add_length) > (s->bufsize - 1)) |
216 |
{ |
217 |
IncreaseBuffer(s, add_length); |
218 |
} |
219 |
|
220 |
strcat(s->str, new_str); |
221 |
s->length += add_length; |
222 |
} |
223 |
|
224 |
|
225 |
void |
226 |
DoubleMetaphone(char *str, char **codes) |
227 |
{ |
228 |
int length; |
229 |
metastring *original; |
230 |
metastring *primary; |
231 |
metastring *secondary; |
232 |
int current; |
233 |
int last; |
234 |
|
235 |
current = 0; |
236 |
/* we need the real length and last prior to padding */ |
237 |
length = strlen(str); |
238 |
last = length - 1; |
239 |
original = NewMetaString(str); |
240 |
/* Pad original so we can index beyond end */ |
241 |
MetaphAdd(original, " "); |
242 |
|
243 |
primary = NewMetaString(""); |
244 |
secondary = NewMetaString(""); |
245 |
primary->free_string_on_destroy = 0; |
246 |
secondary->free_string_on_destroy = 0; |
247 |
|
248 |
MakeUpper(original); |
249 |
|
250 |
/* skip these when at start of word */ |
251 |
if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", "")) |
252 |
current += 1; |
253 |
|
254 |
/* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */ |
255 |
if (GetAt(original, 0) == 'X') |
256 |
{ |
257 |
MetaphAdd(primary, "S"); /* 'Z' maps to 'S' */ |
258 |
MetaphAdd(secondary, "S"); |
259 |
current += 1; |
260 |
} |
261 |
|
262 |
/* main loop */ |
263 |
while ((primary->length < 4) || (secondary->length < 4)) |
264 |
{ |
265 |
if (current >= length) |
266 |
break; |
267 |
|
268 |
switch (GetAt(original, current)) |
269 |
{ |
270 |
case 'A': |
271 |
case 'E': |
272 |
case 'I': |
273 |
case 'O': |
274 |
case 'U': |
275 |
case 'Y': |
276 |
if (current == 0) |
277 |
{ |
278 |
/* all init vowels now map to 'A' */ |
279 |
MetaphAdd(primary, "A"); |
280 |
MetaphAdd(secondary, "A"); |
281 |
} |
282 |
current += 1; |
283 |
break; |
284 |
|
285 |
case 'B': |
286 |
|
287 |
/* "-mb", e.g", "dumb", already skipped over... */ |
288 |
MetaphAdd(primary, "P"); |
289 |
MetaphAdd(secondary, "P"); |
290 |
|
291 |
if (GetAt(original, current + 1) == 'B') |
292 |
current += 2; |
293 |
else |
294 |
current += 1; |
295 |
break; |
296 |
|
297 |
case 'Ç': |
298 |
MetaphAdd(primary, "S"); |
299 |
MetaphAdd(secondary, "S"); |
300 |
current += 1; |
301 |
break; |
302 |
|
303 |
case 'C': |
304 |
/* various germanic */ |
305 |
if ((current > 1) |
306 |
&& !IsVowel(original, current - 2) |
307 |
&& StringAt(original, (current - 1), 3, "ACH", "") |
308 |
&& ((GetAt(original, current + 2) != 'I') |
309 |
&& ((GetAt(original, current + 2) != 'E') |
310 |
|| StringAt(original, (current - 2), 6, "BACHER", |
311 |
"MACHER", "")))) |
312 |
{ |
313 |
MetaphAdd(primary, "K"); |
314 |
MetaphAdd(secondary, "K"); |
315 |
current += 2; |
316 |
break; |
317 |
} |
318 |
|
319 |
/* special case 'caesar' */ |
320 |
if ((current == 0) |
321 |
&& StringAt(original, current, 6, "CAESAR", "")) |
322 |
{ |
323 |
MetaphAdd(primary, "S"); |
324 |
MetaphAdd(secondary, "S"); |
325 |
current += 2; |
326 |
break; |
327 |
} |
328 |
|
329 |
/* italian 'chianti' */ |
330 |
if (StringAt(original, current, 4, "CHIA", "")) |
331 |
{ |
332 |
MetaphAdd(primary, "K"); |
333 |
MetaphAdd(secondary, "K"); |
334 |
current += 2; |
335 |
break; |
336 |
} |
337 |
|
338 |
if (StringAt(original, current, 2, "CH", "")) |
339 |
{ |
340 |
/* find 'michael' */ |
341 |
if ((current > 0) |
342 |
&& StringAt(original, current, 4, "CHAE", "")) |
343 |
{ |
344 |
MetaphAdd(primary, "K"); |
345 |
MetaphAdd(secondary, "X"); |
346 |
current += 2; |
347 |
break; |
348 |
} |
349 |
|
350 |
/* greek roots e.g. 'chemistry', 'chorus' */ |
351 |
if ((current == 0) |
352 |
&& (StringAt(original, (current + 1), 5, "HARAC", "HARIS", "") |
353 |
|| StringAt(original, (current + 1), 3, "HOR", |
354 |
"HYM", "HIA", "HEM", "")) |
355 |
&& !StringAt(original, 0, 5, "CHORE", "")) |
356 |
{ |
357 |
MetaphAdd(primary, "K"); |
358 |
MetaphAdd(secondary, "K"); |
359 |
current += 2; |
360 |
break; |
361 |
} |
362 |
|
363 |
/* germanic, greek, or otherwise 'ch' for 'kh' sound */ |
364 |
if ( |
365 |
(StringAt(original, 0, 4, "VAN ", "VON ", "") |
366 |
|| StringAt(original, 0, 3, "SCH", "")) |
367 |
/* 'architect but not 'arch', 'orchestra', 'orchid' */ |
368 |
|| StringAt(original, (current - 2), 6, "ORCHES", |
369 |
"ARCHIT", "ORCHID", "") |
370 |
|| StringAt(original, (current + 2), 1, "T", "S", |
371 |
"") |
372 |
|| ((StringAt(original, (current - 1), 1, "A", "O", "U", "E", "") |
373 |
|| (current == 0)) |
374 |
/* e.g., 'wachtler', 'wechsler', but not 'tichner' */ |
375 |
&& StringAt(original, (current + 2), 1, "L", "R", |
376 |
"N", "M", "B", "H", "F", "V", "W", " ", ""))) |
377 |
{ |
378 |
MetaphAdd(primary, "K"); |
379 |
MetaphAdd(secondary, "K"); |
380 |
} |
381 |
else |
382 |
{ |
383 |
if (current > 0) |
384 |
{ |
385 |
if (StringAt(original, 0, 2, "MC", "")) |
386 |
{ |
387 |
/* e.g., "McHugh" */ |
388 |
MetaphAdd(primary, "K"); |
389 |
MetaphAdd(secondary, "K"); |
390 |
} |
391 |
else |
392 |
{ |
393 |
MetaphAdd(primary, "X"); |
394 |
MetaphAdd(secondary, "K"); |
395 |
} |
396 |
} |
397 |
else |
398 |
{ |
399 |
MetaphAdd(primary, "X"); |
400 |
MetaphAdd(secondary, "X"); |
401 |
} |
402 |
} |
403 |
current += 2; |
404 |
break; |
405 |
} |
406 |
/* e.g, 'czerny' */ |
407 |
if (StringAt(original, current, 2, "CZ", "") |
408 |
&& !StringAt(original, (current - 2), 4, "WICZ", "")) |
409 |
{ |
410 |
MetaphAdd(primary, "S"); |
411 |
MetaphAdd(secondary, "X"); |
412 |
current += 2; |
413 |
break; |
414 |
} |
415 |
|
416 |
/* e.g., 'focaccia' */ |
417 |
if (StringAt(original, (current + 1), 3, "CIA", "")) |
418 |
{ |
419 |
MetaphAdd(primary, "X"); |
420 |
MetaphAdd(secondary, "X"); |
421 |
current += 3; |
422 |
break; |
423 |
} |
424 |
|
425 |
/* double 'C', but not if e.g. 'McClellan' */ |
426 |
if (StringAt(original, current, 2, "CC", "") |
427 |
&& !((current == 1) && (GetAt(original, 0) == 'M'))) |
428 |
{ |
429 |
/* 'bellocchio' but not 'bacchus' */ |
430 |
if (StringAt(original, (current + 2), 1, "I", "E", "H", "") |
431 |
&& !StringAt(original, (current + 2), 2, "HU", "")) |
432 |
{ |
433 |
/* 'accident', 'accede' 'succeed' */ |
434 |
if ( |
435 |
((current == 1) |
436 |
&& (GetAt(original, current - 1) == 'A')) |
437 |
|| StringAt(original, (current - 1), 5, "UCCEE", |
438 |
"UCCES", "")) |
439 |
{ |
440 |
MetaphAdd(primary, "KS"); |
441 |
MetaphAdd(secondary, "KS"); |
442 |
/* 'bacci', 'bertucci', other italian */ |
443 |
} |
444 |
else |
445 |
{ |
446 |
MetaphAdd(primary, "X"); |
447 |
MetaphAdd(secondary, "X"); |
448 |
} |
449 |
current += 3; |
450 |
break; |
451 |
} |
452 |
else |
453 |
{ /* Pierce's rule */ |
454 |
MetaphAdd(primary, "K"); |
455 |
MetaphAdd(secondary, "K"); |
456 |
current += 2; |
457 |
break; |
458 |
} |
459 |
} |
460 |
|
461 |
if (StringAt(original, current, 2, "CK", "CG", "CQ", "")) |
462 |
{ |
463 |
MetaphAdd(primary, "K"); |
464 |
MetaphAdd(secondary, "K"); |
465 |
current += 2; |
466 |
break; |
467 |
} |
468 |
|
469 |
if (StringAt(original, current, 2, "CI", "CE", "CY", "")) |
470 |
{ |
471 |
/* italian vs. english */ |
472 |
if (StringAt |
473 |
(original, current, 3, "CIO", "CIE", "CIA", "")) |
474 |
{ |
475 |
MetaphAdd(primary, "S"); |
476 |
MetaphAdd(secondary, "X"); |
477 |
} |
478 |
else |
479 |
{ |
480 |
MetaphAdd(primary, "S"); |
481 |
MetaphAdd(secondary, "S"); |
482 |
} |
483 |
current += 2; |
484 |
break; |
485 |
} |
486 |
|
487 |
/* else */ |
488 |
MetaphAdd(primary, "K"); |
489 |
MetaphAdd(secondary, "K"); |
490 |
|
491 |
/* name sent in 'mac caffrey', 'mac gregor */ |
492 |
if (StringAt(original, (current + 1), 2, " C", " Q", " G", "")) |
493 |
current += 3; |
494 |
else |
495 |
if (StringAt(original, (current + 1), 1, "C", "K", "Q", "") |
496 |
&& !StringAt(original, (current + 1), 2, "CE", "CI", "")) |
497 |
current += 2; |
498 |
else |
499 |
current += 1; |
500 |
break; |
501 |
|
502 |
case 'D': |
503 |
if (StringAt(original, current, 2, "DG", "")) |
504 |
{ |
505 |
if (StringAt(original, (current + 2), 1, "I", "E", "Y", "")) |
506 |
{ |
507 |
/* e.g. 'edge' */ |
508 |
MetaphAdd(primary, "J"); |
509 |
MetaphAdd(secondary, "J"); |
510 |
current += 3; |
511 |
break; |
512 |
} |
513 |
else |
514 |
{ |
515 |
/* e.g. 'edgar' */ |
516 |
MetaphAdd(primary, "TK"); |
517 |
MetaphAdd(secondary, "TK"); |
518 |
current += 2; |
519 |
break; |
520 |
} |
521 |
} |
522 |
|
523 |
if (StringAt(original, current, 2, "DT", "DD", "")) |
524 |
{ |
525 |
MetaphAdd(primary, "T"); |
526 |
MetaphAdd(secondary, "T"); |
527 |
current += 2; |
528 |
break; |
529 |
} |
530 |
|
531 |
/* else */ |
532 |
MetaphAdd(primary, "T"); |
533 |
MetaphAdd(secondary, "T"); |
534 |
current += 1; |
535 |
break; |
536 |
|
537 |
case 'F': |
538 |
if (GetAt(original, current + 1) == 'F') |
539 |
current += 2; |
540 |
else |
541 |
current += 1; |
542 |
MetaphAdd(primary, "F"); |
543 |
MetaphAdd(secondary, "F"); |
544 |
break; |
545 |
|
546 |
case 'G': |
547 |
if (GetAt(original, current + 1) == 'H') |
548 |
{ |
549 |
if ((current > 0) && !IsVowel(original, current - 1)) |
550 |
{ |
551 |
MetaphAdd(primary, "K"); |
552 |
MetaphAdd(secondary, "K"); |
553 |
current += 2; |
554 |
break; |
555 |
} |
556 |
|
557 |
if (current < 3) |
558 |
{ |
559 |
/* 'ghislane', ghiradelli */ |
560 |
if (current == 0) |
561 |
{ |
562 |
if (GetAt(original, current + 2) == 'I') |
563 |
{ |
564 |
MetaphAdd(primary, "J"); |
565 |
MetaphAdd(secondary, "J"); |
566 |
} |
567 |
else |
568 |
{ |
569 |
MetaphAdd(primary, "K"); |
570 |
MetaphAdd(secondary, "K"); |
571 |
} |
572 |
current += 2; |
573 |
break; |
574 |
} |
575 |
} |
576 |
/* Parker's rule (with some further refinements) - e.g., 'hugh' */ |
577 |
if ( |
578 |
((current > 1) |
579 |
&& StringAt(original, (current - 2), 1, "B", "H", "D", "")) |
580 |
/* e.g., 'bough' */ |
581 |
|| ((current > 2) |
582 |
&& StringAt(original, (current - 3), 1, "B", "H", "D", "")) |
583 |
/* e.g., 'broughton' */ |
584 |
|| ((current > 3) |
585 |
&& StringAt(original, (current - 4), 1, "B", "H", ""))) |
586 |
{ |
587 |
current += 2; |
588 |
break; |
589 |
} |
590 |
else |
591 |
{ |
592 |
/* e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' */ |
593 |
if ((current > 2) |
594 |
&& (GetAt(original, current - 1) == 'U') |
595 |
&& StringAt(original, (current - 3), 1, "C", |
596 |
"G", "L", "R", "T", "")) |
597 |
{ |
598 |
MetaphAdd(primary, "F"); |
599 |
MetaphAdd(secondary, "F"); |
600 |
} |
601 |
else if ((current > 0) |
602 |
&& GetAt(original, current - 1) != 'I') |
603 |
{ |
604 |
|
605 |
|
606 |
MetaphAdd(primary, "K"); |
607 |
MetaphAdd(secondary, "K"); |
608 |
} |
609 |
|
610 |
current += 2; |
611 |
break; |
612 |
} |
613 |
} |
614 |
|
615 |
if (GetAt(original, current + 1) == 'N') |
616 |
{ |
617 |
if ((current == 1) && IsVowel(original, 0) |
618 |
&& !SlavoGermanic(original)) |
619 |
{ |
620 |
MetaphAdd(primary, "KN"); |
621 |
MetaphAdd(secondary, "N"); |
622 |
} |
623 |
else |
624 |
/* not e.g. 'cagney' */ |
625 |
if (!StringAt(original, (current + 2), 2, "EY", "") |
626 |
&& (GetAt(original, current + 1) != 'Y') |
627 |
&& !SlavoGermanic(original)) |
628 |
{ |
629 |
MetaphAdd(primary, "N"); |
630 |
MetaphAdd(secondary, "KN"); |
631 |
} |
632 |
else |
633 |
{ |
634 |
MetaphAdd(primary, "KN"); |
635 |
MetaphAdd(secondary, "KN"); |
636 |
} |
637 |
current += 2; |
638 |
break; |
639 |
} |
640 |
|
641 |
/* 'tagliaro' */ |
642 |
if (StringAt(original, (current + 1), 2, "LI", "") |
643 |
&& !SlavoGermanic(original)) |
644 |
{ |
645 |
MetaphAdd(primary, "KL"); |
646 |
MetaphAdd(secondary, "L"); |
647 |
current += 2; |
648 |
break; |
649 |
} |
650 |
|
651 |
/* -ges-,-gep-,-gel-, -gie- at beginning */ |
652 |
if ((current == 0) |
653 |
&& ((GetAt(original, current + 1) == 'Y') |
654 |
|| StringAt(original, (current + 1), 2, "ES", "EP", |
655 |
"EB", "EL", "EY", "IB", "IL", "IN", "IE", |
656 |
"EI", "ER", ""))) |
657 |
{ |
658 |
MetaphAdd(primary, "K"); |
659 |
MetaphAdd(secondary, "J"); |
660 |
current += 2; |
661 |
break; |
662 |
} |
663 |
|
664 |
/* -ger-, -gy- */ |
665 |
if ( |
666 |
(StringAt(original, (current + 1), 2, "ER", "") |
667 |
|| (GetAt(original, current + 1) == 'Y')) |
668 |
&& !StringAt(original, 0, 6, "DANGER", "RANGER", "MANGER", "") |
669 |
&& !StringAt(original, (current - 1), 1, "E", "I", "") |
670 |
&& !StringAt(original, (current - 1), 3, "RGY", "OGY", |
671 |
"")) |
672 |
{ |
673 |
MetaphAdd(primary, "K"); |
674 |
MetaphAdd(secondary, "J"); |
675 |
current += 2; |
676 |
break; |
677 |
} |
678 |
|
679 |
/* italian e.g, 'biaggi' */ |
680 |
if (StringAt(original, (current + 1), 1, "E", "I", "Y", "") |
681 |
|| StringAt(original, (current - 1), 4, "AGGI", "OGGI", "")) |
682 |
{ |
683 |
/* obvious germanic */ |
684 |
if ( |
685 |
(StringAt(original, 0, 4, "VAN ", "VON ", "") |
686 |
|| StringAt(original, 0, 3, "SCH", "")) |
687 |
|| StringAt(original, (current + 1), 2, "ET", "")) |
688 |
{ |
689 |
MetaphAdd(primary, "K"); |
690 |
MetaphAdd(secondary, "K"); |
691 |
} |
692 |
else |
693 |
{ |
694 |
/* always soft if french ending */ |
695 |
if (StringAt |
696 |
(original, (current + 1), 4, "IER ", "")) |
697 |
{ |
698 |
MetaphAdd(primary, "J"); |
699 |
MetaphAdd(secondary, "J"); |
700 |
} |
701 |
else |
702 |
{ |
703 |
MetaphAdd(primary, "J"); |
704 |
MetaphAdd(secondary, "K"); |
705 |
} |
706 |
} |
707 |
current += 2; |
708 |
break; |
709 |
} |
710 |
|
711 |
if (GetAt(original, current + 1) == 'G') |
712 |
current += 2; |
713 |
else |
714 |
current += 1; |
715 |
MetaphAdd(primary, "K"); |
716 |
MetaphAdd(secondary, "K"); |
717 |
break; |
718 |
|
719 |
case 'H': |
720 |
/* only keep if first & before vowel or btw. 2 vowels */ |
721 |
if (((current == 0) || IsVowel(original, current - 1)) |
722 |
&& IsVowel(original, current + 1)) |
723 |
{ |
724 |
MetaphAdd(primary, "H"); |
725 |
MetaphAdd(secondary, "H"); |
726 |
current += 2; |
727 |
} |
728 |
else /* also takes care of 'HH' */ |
729 |
current += 1; |
730 |
break; |
731 |
|
732 |
case 'J': |
733 |
/* obvious spanish, 'jose', 'san jacinto' */ |
734 |
if (StringAt(original, current, 4, "JOSE", "") |
735 |
|| StringAt(original, 0, 4, "SAN ", "")) |
736 |
{ |
737 |
if (((current == 0) |
738 |
&& (GetAt(original, current + 4) == ' ')) |
739 |
|| StringAt(original, 0, 4, "SAN ", "")) |
740 |
{ |
741 |
MetaphAdd(primary, "H"); |
742 |
MetaphAdd(secondary, "H"); |
743 |
} |
744 |
else |
745 |
{ |
746 |
MetaphAdd(primary, "J"); |
747 |
MetaphAdd(secondary, "H"); |
748 |
} |
749 |
current += 1; |
750 |
break; |
751 |
} |
752 |
|
753 |
if ((current == 0) |
754 |
&& !StringAt(original, current, 4, "JOSE", "")) |
755 |
{ |
756 |
MetaphAdd(primary, "J"); /* Yankelovich/Jankelowicz */ |
757 |
MetaphAdd(secondary, "A"); |
758 |
} |
759 |
else |
760 |
{ |
761 |
/* spanish pron. of e.g. 'bajador' */ |
762 |
if (IsVowel(original, current - 1) |
763 |
&& !SlavoGermanic(original) |
764 |
&& ((GetAt(original, current + 1) == 'A') |
765 |
|| (GetAt(original, current + 1) == 'O'))) |
766 |
{ |
767 |
MetaphAdd(primary, "J"); |
768 |
MetaphAdd(secondary, "H"); |
769 |
} |
770 |
else |
771 |
{ |
772 |
if (current == last) |
773 |
{ |
774 |
MetaphAdd(primary, "J"); |
775 |
MetaphAdd(secondary, ""); |
776 |
} |
777 |
else |
778 |
{ |
779 |
if (!StringAt(original, (current + 1), 1, "L", "T", |
780 |
"K", "S", "N", "M", "B", "Z", "") |
781 |
&& !StringAt(original, (current - 1), 1, |
782 |
"S", "K", "L", "")) |
783 |
{ |
784 |
MetaphAdd(primary, "J"); |
785 |
MetaphAdd(secondary, "J"); |
786 |
} |
787 |
} |
788 |
} |
789 |
} |
790 |
|
791 |
if (GetAt(original, current + 1) == 'J') /* it could happen! */ |
792 |
current += 2; |
793 |
else |
794 |
current += 1; |
795 |
break; |
796 |
|
797 |
case 'K': |
798 |
if (GetAt(original, current + 1) == 'K') |
799 |
current += 2; |
800 |
else |
801 |
current += 1; |
802 |
MetaphAdd(primary, "K"); |
803 |
MetaphAdd(secondary, "K"); |
804 |
break; |
805 |
|
806 |
case 'L': |
807 |
if (GetAt(original, current + 1) == 'L') |
808 |
{ |
809 |
/* spanish e.g. 'cabrillo', 'gallegos' */ |
810 |
if (((current == (length - 3)) |
811 |
&& StringAt(original, (current - 1), 4, "ILLO", |
812 |
"ILLA", "ALLE", "")) |
813 |
|| ((StringAt(original, (last - 1), 2, "AS", "OS", "") |
814 |
|| StringAt(original, last, 1, "A", "O", "")) |
815 |
&& StringAt(original, (current - 1), 4, "ALLE", ""))) |
816 |
{ |
817 |
MetaphAdd(primary, "L"); |
818 |
MetaphAdd(secondary, ""); |
819 |
current += 2; |
820 |
break; |
821 |
} |
822 |
current += 2; |
823 |
} |
824 |
else |
825 |
current += 1; |
826 |
MetaphAdd(primary, "L"); |
827 |
MetaphAdd(secondary, "L"); |
828 |
break; |
829 |
|
830 |
case 'M': |
831 |
if ((StringAt(original, (current - 1), 3, "UMB", "") |
832 |
&& (((current + 1) == last) |
833 |
|| StringAt(original, (current + 2), 2, "ER", ""))) |
834 |
/* 'dumb','thumb' */ |
835 |
|| (GetAt(original, current + 1) == 'M')) |
836 |
current += 2; |
837 |
else |
838 |
current += 1; |
839 |
MetaphAdd(primary, "M"); |
840 |
MetaphAdd(secondary, "M"); |
841 |
break; |
842 |
|
843 |
case 'N': |
844 |
if (GetAt(original, current + 1) == 'N') |
845 |
current += 2; |
846 |
else |
847 |
current += 1; |
848 |
MetaphAdd(primary, "N"); |
849 |
MetaphAdd(secondary, "N"); |
850 |
break; |
851 |
|
852 |
case 'Ñ': |
853 |
current += 1; |
854 |
MetaphAdd(primary, "N"); |
855 |
MetaphAdd(secondary, "N"); |
856 |
break; |
857 |
|
858 |
case 'P': |
859 |
if (GetAt(original, current + 1) == 'H') |
860 |
{ |
861 |
MetaphAdd(primary, "F"); |
862 |
MetaphAdd(secondary, "F"); |
863 |
current += 2; |
864 |
break; |
865 |
} |
866 |
|
867 |
/* also account for "campbell", "raspberry" */ |
868 |
if (StringAt(original, (current + 1), 1, "P", "B", "")) |
869 |
current += 2; |
870 |
else |
871 |
current += 1; |
872 |
MetaphAdd(primary, "P"); |
873 |
MetaphAdd(secondary, "P"); |
874 |
break; |
875 |
|
876 |
case 'Q': |
877 |
if (GetAt(original, current + 1) == 'Q') |
878 |
current += 2; |
879 |
else |
880 |
current += 1; |
881 |
MetaphAdd(primary, "K"); |
882 |
MetaphAdd(secondary, "K"); |
883 |
break; |
884 |
|
885 |
case 'R': |
886 |
/* french e.g. 'rogier', but exclude 'hochmeier' */ |
887 |
if ((current == last) |
888 |
&& !SlavoGermanic(original) |
889 |
&& StringAt(original, (current - 2), 2, "IE", "") |
890 |
&& !StringAt(original, (current - 4), 2, "ME", "MA", "")) |
891 |
{ |
892 |
MetaphAdd(primary, ""); |
893 |
MetaphAdd(secondary, "R"); |
894 |
} |
895 |
else |
896 |
{ |
897 |
MetaphAdd(primary, "R"); |
898 |
MetaphAdd(secondary, "R"); |
899 |
} |
900 |
|
901 |
if (GetAt(original, current + 1) == 'R') |
902 |
current += 2; |
903 |
else |
904 |
current += 1; |
905 |
break; |
906 |
|
907 |
case 'S': |
908 |
/* special cases 'island', 'isle', 'carlisle', 'carlysle' */ |
909 |
if (StringAt(original, (current - 1), 3, "ISL", "YSL", "")) |
910 |
{ |
911 |
current += 1; |
912 |
break; |
913 |
} |
914 |
|
915 |
/* special case 'sugar-' */ |
916 |
if ((current == 0) |
917 |
&& StringAt(original, current, 5, "SUGAR", "")) |
918 |
{ |
919 |
MetaphAdd(primary, "X"); |
920 |
MetaphAdd(secondary, "S"); |
921 |
current += 1; |
922 |
break; |
923 |
} |
924 |
|
925 |
if (StringAt(original, current, 2, "SH", "")) |
926 |
{ |
927 |
/* germanic */ |
928 |
if (StringAt |
929 |
(original, (current + 1), 4, "HEIM", "HOEK", "HOLM", |
930 |
"HOLZ", "")) |
931 |
{ |
932 |
MetaphAdd(primary, "S"); |
933 |
MetaphAdd(secondary, "S"); |
934 |
} |
935 |
else |
936 |
{ |
937 |
MetaphAdd(primary, "X"); |
938 |
MetaphAdd(secondary, "X"); |
939 |
} |
940 |
current += 2; |
941 |
break; |
942 |
} |
943 |
|
944 |
/* italian & armenian */ |
945 |
if (StringAt(original, current, 3, "SIO", "SIA", "") |
946 |
|| StringAt(original, current, 4, "SIAN", "")) |
947 |
{ |
948 |
if (!SlavoGermanic(original)) |
949 |
{ |
950 |
MetaphAdd(primary, "S"); |
951 |
MetaphAdd(secondary, "X"); |
952 |
} |
953 |
else |
954 |
{ |
955 |
MetaphAdd(primary, "S"); |
956 |
MetaphAdd(secondary, "S"); |
957 |
} |
958 |
current += 3; |
959 |
break; |
960 |
} |
961 |
|
962 |
/* german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' |
963 |
also, -sz- in slavic language altho in hungarian it is pronounced 's' */ |
964 |
if (((current == 0) |
965 |
&& StringAt(original, (current + 1), 1, "M", "N", "L", "W", "")) |
966 |
|| StringAt(original, (current + 1), 1, "Z", "")) |
967 |
{ |
968 |
MetaphAdd(primary, "S"); |
969 |
MetaphAdd(secondary, "X"); |
970 |
if (StringAt(original, (current + 1), 1, "Z", "")) |
971 |
current += 2; |
972 |
else |
973 |
current += 1; |
974 |
break; |
975 |
} |
976 |
|
977 |
if (StringAt(original, current, 2, "SC", "")) |
978 |
{ |
979 |
/* Schlesinger's rule */ |
980 |
if (GetAt(original, current + 2) == 'H') |
981 |
{ |
982 |
/* dutch origin, e.g. 'school', 'schooner' */ |
983 |
if (StringAt(original, (current + 3), 2, "OO", "ER", "EN", |
984 |
"UY", "ED", "EM", "")) |
985 |
{ |
986 |
/* 'schermerhorn', 'schenker' */ |
987 |
if (StringAt(original, (current + 3), 2, "ER", "EN", "")) |
988 |
{ |
989 |
MetaphAdd(primary, "X"); |
990 |
MetaphAdd(secondary, "SK"); |
991 |
} |
992 |
else |
993 |
{ |
994 |
MetaphAdd(primary, "SK"); |
995 |
MetaphAdd(secondary, "SK"); |
996 |
} |
997 |
current += 3; |
998 |
break; |
999 |
} |
1000 |
else |
1001 |
{ |
1002 |
if ((current == 0) && !IsVowel(original, 3) |
1003 |
&& (GetAt(original, 3) != 'W')) |
1004 |
{ |
1005 |
MetaphAdd(primary, "X"); |
1006 |
MetaphAdd(secondary, "S"); |
1007 |
} |
1008 |
else |
1009 |
{ |
1010 |
MetaphAdd(primary, "X"); |
1011 |
MetaphAdd(secondary, "X"); |
1012 |
} |
1013 |
current += 3; |
1014 |
break; |
1015 |
} |
1016 |
|
1017 |
if (StringAt(original, (current + 2), 1, "I", "E", "Y", "")) |
1018 |
{ |
1019 |
MetaphAdd(primary, "S"); |
1020 |
MetaphAdd(secondary, "S"); |
1021 |
current += 3; |
1022 |
break; |
1023 |
} |
1024 |
/* else */ |
1025 |
MetaphAdd(primary, "SK"); |
1026 |
MetaphAdd(secondary, "SK"); |
1027 |
current += 3; |
1028 |
break; |
1029 |
} |
1030 |
} |
1031 |
|
1032 |
/* french e.g. 'resnais', 'artois' */ |
1033 |
if ((current == last) |
1034 |
&& StringAt(original, (current - 2), 2, "AI", "OI", "")) |
1035 |
{ |
1036 |
MetaphAdd(primary, ""); |
1037 |
MetaphAdd(secondary, "S"); |
1038 |
} |
1039 |
else |
1040 |
{ |
1041 |
MetaphAdd(primary, "S"); |
1042 |
MetaphAdd(secondary, "S"); |
1043 |
} |
1044 |
|
1045 |
if (StringAt(original, (current + 1), 1, "S", "Z", "")) |
1046 |
current += 2; |
1047 |
else |
1048 |
current += 1; |
1049 |
break; |
1050 |
|
1051 |
case 'T': |
1052 |
if (StringAt(original, current, 4, "TION", "")) |
1053 |
{ |
1054 |
MetaphAdd(primary, "X"); |
1055 |
MetaphAdd(secondary, "X"); |
1056 |
current += 3; |
1057 |
break; |
1058 |
} |
1059 |
|
1060 |
if (StringAt(original, current, 3, "TIA", "TCH", "")) |
1061 |
{ |
1062 |
MetaphAdd(primary, "X"); |
1063 |
MetaphAdd(secondary, "X"); |
1064 |
current += 3; |
1065 |
break; |
1066 |
} |
1067 |
|
1068 |
if (StringAt(original, current, 2, "TH", "") |
1069 |
|| StringAt(original, current, 3, "TTH", "")) |
1070 |
{ |
1071 |
/* special case 'thomas', 'thames' or germanic */ |
1072 |
if (StringAt(original, (current + 2), 2, "OM", "AM", "") |
1073 |
|| StringAt(original, 0, 4, "VAN ", "VON ", "") |
1074 |
|| StringAt(original, 0, 3, "SCH", "")) |
1075 |
{ |
1076 |
MetaphAdd(primary, "T"); |
1077 |
MetaphAdd(secondary, "T"); |
1078 |
} |
1079 |
else |
1080 |
{ |
1081 |
MetaphAdd(primary, "0"); |
1082 |
MetaphAdd(secondary, "T"); |
1083 |
} |
1084 |
current += 2; |
1085 |
break; |
1086 |
} |
1087 |
|
1088 |
if (StringAt(original, (current + 1), 1, "T", "D", "")) |
1089 |
current += 2; |
1090 |
else |
1091 |
current += 1; |
1092 |
MetaphAdd(primary, "T"); |
1093 |
MetaphAdd(secondary, "T"); |
1094 |
break; |
1095 |
|
1096 |
case 'V': |
1097 |
if (GetAt(original, current + 1) == 'V') |
1098 |
current += 2; |
1099 |
else |
1100 |
current += 1; |
1101 |
MetaphAdd(primary, "F"); |
1102 |
MetaphAdd(secondary, "F"); |
1103 |
break; |
1104 |
|
1105 |
case 'W': |
1106 |
/* can also be in middle of word */ |
1107 |
if (StringAt(original, current, 2, "WR", "")) |
1108 |
{ |
1109 |
MetaphAdd(primary, "R"); |
1110 |
MetaphAdd(secondary, "R"); |
1111 |
current += 2; |
1112 |
break; |
1113 |
} |
1114 |
|
1115 |
if ((current == 0) |
1116 |
&& (IsVowel(original, current + 1) |
1117 |
|| StringAt(original, current, 2, "WH", ""))) |
1118 |
{ |
1119 |
/* Wasserman should match Vasserman */ |
1120 |
if (IsVowel(original, current + 1)) |
1121 |
{ |
1122 |
MetaphAdd(primary, "A"); |
1123 |
MetaphAdd(secondary, "F"); |
1124 |
} |
1125 |
else |
1126 |
{ |
1127 |
/* need Uomo to match Womo */ |
1128 |
MetaphAdd(primary, "A"); |
1129 |
MetaphAdd(secondary, "A"); |
1130 |
} |
1131 |
} |
1132 |
|
1133 |
/* Arnow should match Arnoff */ |
1134 |
if (((current == last) && IsVowel(original, current - 1)) |
1135 |
|| StringAt(original, (current - 1), 5, "EWSKI", "EWSKY", |
1136 |
"OWSKI", "OWSKY", "") |
1137 |
|| StringAt(original, 0, 3, "SCH", "")) |
1138 |
{ |
1139 |
MetaphAdd(primary, ""); |
1140 |
MetaphAdd(secondary, "F"); |
1141 |
current += 1; |
1142 |
break; |
1143 |
} |
1144 |
|
1145 |
/* polish e.g. 'filipowicz' */ |
1146 |
if (StringAt(original, current, 4, "WICZ", "WITZ", "")) |
1147 |
{ |
1148 |
MetaphAdd(primary, "TS"); |
1149 |
MetaphAdd(secondary, "FX"); |
1150 |
current += 4; |
1151 |
break; |
1152 |
} |
1153 |
|
1154 |
/* else skip it */ |
1155 |
current += 1; |
1156 |
break; |
1157 |
|
1158 |
case 'X': |
1159 |
/* french e.g. breaux */ |
1160 |
if (!((current == last) |
1161 |
&& (StringAt(original, (current - 3), 3, "IAU", "EAU", "") |
1162 |
|| StringAt(original, (current - 2), 2, "AU", "OU", "")))) |
1163 |
{ |
1164 |
MetaphAdd(primary, "KS"); |
1165 |
MetaphAdd(secondary, "KS"); |
1166 |
} |
1167 |
|
1168 |
|
1169 |
if (StringAt(original, (current + 1), 1, "C", "X", "")) |
1170 |
current += 2; |
1171 |
else |
1172 |
current += 1; |
1173 |
break; |
1174 |
|
1175 |
case 'Z': |
1176 |
/* chinese pinyin e.g. 'zhao' */ |
1177 |
if (GetAt(original, current + 1) == 'H') |
1178 |
{ |
1179 |
MetaphAdd(primary, "J"); |
1180 |
MetaphAdd(secondary, "J"); |
1181 |
current += 2; |
1182 |
break; |
1183 |
} |
1184 |
else if (StringAt(original, (current + 1), 2, "ZO", "ZI", "ZA", "") |
1185 |
|| (SlavoGermanic(original) |
1186 |
&& ((current > 0) |
1187 |
&& GetAt(original, current - 1) != 'T'))) |
1188 |
{ |
1189 |
MetaphAdd(primary, "S"); |
1190 |
MetaphAdd(secondary, "TS"); |
1191 |
} |
1192 |
else |
1193 |
{ |
1194 |
MetaphAdd(primary, "S"); |
1195 |
MetaphAdd(secondary, "S"); |
1196 |
} |
1197 |
|
1198 |
if (GetAt(original, current + 1) == 'Z') |
1199 |
current += 2; |
1200 |
else |
1201 |
current += 1; |
1202 |
break; |
1203 |
|
1204 |
default: |
1205 |
current += 1; |
1206 |
} |
1207 |
/* printf("PRIMARY: %s\n", primary->str); |
1208 |
printf("SECONDARY: %s\n", secondary->str); */ |
1209 |
} |
1210 |
|
1211 |
|
1212 |
if (primary->length > 4) |
1213 |
SetAt(primary, 4, '\0'); |
1214 |
|
1215 |
if (secondary->length > 4) |
1216 |
SetAt(secondary, 4, '\0'); |
1217 |
|
1218 |
*codes = primary->str; |
1219 |
*++codes = secondary->str; |
1220 |
|
1221 |
DestroyMetaString(original); |
1222 |
DestroyMetaString(primary); |
1223 |
DestroyMetaString(secondary); |
1224 |
} |
1225 |
|