diff options
Diffstat (limited to 'contrib/fuzzystrmatch/fuzzystrmatch.c')
-rw-r--r-- | contrib/fuzzystrmatch/fuzzystrmatch.c | 499 |
1 files changed, 265 insertions, 234 deletions
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c index d6ec0f5c30a..94b9e9de779 100644 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ b/contrib/fuzzystrmatch/fuzzystrmatch.c @@ -24,13 +24,13 @@ * documentation for any purpose, without fee, and without a written agreement * is hereby granted, provided that the above copyright notice and this * paragraph and the following two paragraphs appear in all copies. - * + * * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. - * + * * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS @@ -50,23 +50,22 @@ PG_FUNCTION_INFO_V1(levenshtein); Datum levenshtein(PG_FUNCTION_ARGS) { - char *str_s; - char *str_s0; - char *str_t; - int cols = 0; - int rows = 0; - int *u_cells; - int *l_cells; - int *tmp; - int i; - int j; + char *str_s; + char *str_s0; + char *str_t; + int cols = 0; + int rows = 0; + int *u_cells; + int *l_cells; + int *tmp; + int i; + int j; /* - * Fetch the arguments. - * str_s is referred to as the "source" - * cols = length of source + 1 to allow for the initialization column - * str_t is referred to as the "target", rows = length of target + 1 - * rows = length of target + 1 to allow for the initialization row + * Fetch the arguments. str_s is referred to as the "source" cols = + * length of source + 1 to allow for the initialization column str_t + * is referred to as the "target", rows = length of target + 1 rows = + * length of target + 1 to allow for the initialization row */ str_s = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0)))); str_t = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1)))); @@ -75,18 +74,19 @@ levenshtein(PG_FUNCTION_ARGS) rows = strlen(str_t) + 1; /* - * Restrict the length of the strings being compared to something reasonable - * because we will have to perform rows * cols calcualtions. If longer strings need to be - * compared, increase MAX_LEVENSHTEIN_STRLEN to suit (but within your tolerance for - * speed and memory usage). + * Restrict the length of the strings being compared to something + * reasonable because we will have to perform rows * cols + * calcualtions. If longer strings need to be compared, increase + * MAX_LEVENSHTEIN_STRLEN to suit (but within your tolerance for speed + * and memory usage). */ if ((cols > MAX_LEVENSHTEIN_STRLEN + 1) || (rows > MAX_LEVENSHTEIN_STRLEN + 1)) elog(ERROR, "levenshtein: Arguments may not exceed %d characters in length", MAX_LEVENSHTEIN_STRLEN); /* - * If either rows or cols is 0, the answer is the other value. - * This makes sense since it would take that many insertions - * the build a matching string + * If either rows or cols is 0, the answer is the other value. This + * makes sense since it would take that many insertions the build a + * matching string */ if (cols == 0) @@ -96,8 +96,9 @@ levenshtein(PG_FUNCTION_ARGS) PG_RETURN_INT32(cols); /* - * Allocate two vectors of integers. One will be used for the "upper" row, - * the other for the "lower" row. Initialize the "upper" row to 0..cols + * Allocate two vectors of integers. One will be used for the "upper" + * row, the other for the "lower" row. Initialize the "upper" row to + * 0..cols */ u_cells = palloc(sizeof(int) * cols); for (i = 0; i < cols; i++) @@ -106,33 +107,35 @@ levenshtein(PG_FUNCTION_ARGS) l_cells = palloc(sizeof(int) * cols); /* - * Use str_s0 to "rewind" the pointer to str_s in the nested for loop below + * Use str_s0 to "rewind" the pointer to str_s in the nested for loop + * below */ str_s0 = str_s; /* - * Loop throught the rows, starting at row 1. Row 0 is used for the initial - * "upper" row. + * Loop throught the rows, starting at row 1. Row 0 is used for the + * initial "upper" row. */ for (j = 1; j < rows; j++) { /* - * We'll always start with col 1, - * and initialize lower row col 0 to j + * We'll always start with col 1, and initialize lower row col 0 + * to j */ l_cells[0] = j; for (i = 1; i < cols; i++) { - int c = 0; - int c1 = 0; - int c2 = 0; - int c3 = 0; + int c = 0; + int c1 = 0; + int c2 = 0; + int c3 = 0; /* - * The "cost" value is 0 if the character at the current col position - * in the source string, matches the character at the current row position - * in the target string; cost is 1 otherwise. + * The "cost" value is 0 if the character at the current col + * position in the source string, matches the character at the + * current row position in the target string; cost is 1 + * otherwise. */ c = ((CHAREQ(str_s, str_t)) ? 0 : 1); @@ -152,8 +155,7 @@ levenshtein(PG_FUNCTION_ARGS) c3 = u_cells[i - 1] + c; /* - * The lower right cell is set to the minimum - * of c1, c2, c3 + * The lower right cell is set to the minimum of c1, c2, c3 */ l_cells[i] = (c1 < c2 ? c1 : c2) < c3 ? (c1 < c2 ? c1 : c2) : c3; @@ -164,8 +166,8 @@ levenshtein(PG_FUNCTION_ARGS) } /* - * Lower row now becomes the upper row, and the upper row - * gets reused as the new lower row. + * Lower row now becomes the upper row, and the upper row gets + * reused as the new lower row. */ tmp = u_cells; u_cells = l_cells; @@ -183,8 +185,8 @@ levenshtein(PG_FUNCTION_ARGS) } /* - * Because the final value (at position row, col) was swapped from - * the lower row to the upper row, that's where we'll find it. + * Because the final value (at position row, col) was swapped from the + * lower row to the upper row, that's where we'll find it. */ PG_RETURN_INT32(u_cells[cols - 1]); } @@ -199,10 +201,10 @@ Datum metaphone(PG_FUNCTION_ARGS) { int reqlen; - char *str_i; + char *str_i; size_t str_i_len; - char *metaph; - text *result_text; + char *metaph; + text *result_text; int retval; str_i = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0)))); @@ -240,7 +242,7 @@ metaphone(PG_FUNCTION_ARGS) } -/* +/* * Original code by Michael G Schwern starts here. * Code slightly modified for use as PostgreSQL * function (palloc, etc). Original includes @@ -255,26 +257,28 @@ metaphone(PG_FUNCTION_ARGS) /* Look at the current letter in the word */ #define Curr_Letter (toupper(word[w_idx])) /* Go N letters back. */ -#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0') +#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0') /* Previous letter. I dunno, should this return null on failure? */ #define Prev_Letter (Look_Back_Letter(1)) /* Look two letters down. It makes sure you don't walk off the string. */ #define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \ - : '\0') + : '\0') #define Look_Ahead_Letter(n) (toupper(Lookahead(word+w_idx, n))) - + /* Allows us to safely look ahead an arbitrary # of letters */ /* I probably could have just used strlen... */ -char Lookahead(char * word, int how_far) { - char letter_ahead = '\0'; /* null by default */ - int idx; - for(idx = 0; word[idx] != '\0' && idx < how_far; idx++); - /* Edge forward in the string... */ - - letter_ahead = word[idx]; /* idx will be either == to how_far or - * at the end of the string - */ +char +Lookahead(char *word, int how_far) +{ + char letter_ahead = '\0'; /* null by default */ + int idx; + + for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); + /* Edge forward in the string... */ + + letter_ahead = word[idx]; /* idx will be either == to how_far or at + * the end of the string */ return letter_ahead; } @@ -282,29 +286,30 @@ char Lookahead(char * word, int how_far) { /* phonize one letter */ #define Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0) /* Slap a null character on the end of the phoned word */ -#define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0) +#define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0) /* How long is the phoned word? */ #define Phone_Len (p_idx) /* Note is a letter is a 'break' in the word */ -#define Isbreak(c) (!isalpha(c)) - - -int _metaphone ( - /* IN */ - char * word, - int max_phonemes, - /* OUT */ - char ** phoned_word -) { - int w_idx = 0; /* point in the phonization we're at. */ - int p_idx = 0; /* end of the phoned phrase */ - +#define Isbreak(c) (!isalpha(c)) + + +int +_metaphone( + /* IN */ + char *word, + int max_phonemes, + /* OUT */ + char **phoned_word +) +{ + int w_idx = 0; /* point in the phonization we're at. */ + int p_idx = 0; /* end of the phoned phrase */ + /*-- Parameter checks --*/ /* - * Shouldn't be necessary, but left these here anyway - * jec Aug 3, 2001 + * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001 */ /* Negative phoneme length is meaningless */ @@ -314,13 +319,16 @@ int _metaphone ( /* Empty/null string is meaningless */ if ((word == NULL) || !(strlen(word) > 0)) elog(ERROR, "metaphone: Input string length must be > 0"); - + /*-- Allocate memory for our phoned_phrase --*/ - if (max_phonemes == 0) { /* Assume largest possible */ - *phoned_word = palloc(sizeof(char) * strlen(word) + 1); + if (max_phonemes == 0) + { /* Assume largest possible */ + *phoned_word = palloc(sizeof(char) * strlen(word) +1); if (!*phoned_word) return META_ERROR; - } else { + } + else + { *phoned_word = palloc(sizeof(char) * max_phonemes + 1); if (!*phoned_word) return META_ERROR; @@ -328,62 +336,70 @@ int _metaphone ( /*-- The first phoneme has to be processed specially. --*/ /* Find our first letter */ - for( ; !isalpha(Curr_Letter); w_idx++ ) { + for (; !isalpha(Curr_Letter); w_idx++) + { /* On the off chance we were given nothing but crap... */ - if( Curr_Letter == '\0' ) { + if (Curr_Letter == '\0') + { End_Phoned_Word - return META_SUCCESS; /* For testing */ + return META_SUCCESS; /* For testing */ } - } - - switch (Curr_Letter) { - /* AE becomes E */ + } + + switch (Curr_Letter) + { + /* AE becomes E */ case 'A': - if( Next_Letter == 'E' ) { + if (Next_Letter == 'E') + { Phonize('E'); - w_idx+=2; + w_idx += 2; } /* Remember, preserve vowels at the beginning */ - else { + else + { Phonize('A'); w_idx++; } break; - /* [GKP]N becomes N */ + /* [GKP]N becomes N */ case 'G': case 'K': case 'P': - if( Next_Letter == 'N' ) { + if (Next_Letter == 'N') + { Phonize('N'); - w_idx+=2; + w_idx += 2; } break; - /* WH becomes H, - WR becomes R - W if followed by a vowel */ + + /* + * WH becomes H, WR becomes R W if followed by a vowel + */ case 'W': - if( Next_Letter == 'H' || - Next_Letter == 'R' ) + if (Next_Letter == 'H' || + Next_Letter == 'R') { - Phonize(Next_Letter); - w_idx+=2; + Phonize(Next_Letter); + w_idx += 2; } - else if ( isvowel(Next_Letter) ) { - Phonize('W'); - w_idx+=2; + else if (isvowel(Next_Letter)) + { + Phonize('W'); + w_idx += 2; } /* else ignore */ break; - /* X becomes S */ + /* X becomes S */ case 'X': Phonize('S'); w_idx++; break; - /* Vowels are kept */ - /* We did A already - case 'A': - case 'a': - */ + /* Vowels are kept */ + + /* + * We did A already case 'A': case 'a': + */ case 'E': case 'I': case 'O': @@ -395,220 +411,235 @@ int _metaphone ( /* do nothing */ break; } - - - + + + /* On to the metaphoning */ - for(; Curr_Letter != '\0' && - (max_phonemes == 0 || Phone_Len < max_phonemes); - w_idx++) { - /* How many letters to skip because an eariler encoding handled - * multiple letters */ - unsigned short int skip_letter = 0; - - - /* THOUGHT: It would be nice if, rather than having things like... - * well, SCI. For SCI you encode the S, then have to remember - * to skip the C. So the phonome SCI invades both S and C. It would - * be better, IMHO, to skip the C from the S part of the encoding. - * Hell, I'm trying it. + for (; Curr_Letter != '\0' && + (max_phonemes == 0 || Phone_Len < max_phonemes); + w_idx++) + { + /* + * How many letters to skip because an eariler encoding handled + * multiple letters */ - + unsigned short int skip_letter = 0; + + + /* + * THOUGHT: It would be nice if, rather than having things + * like... well, SCI. For SCI you encode the S, then have to + * remember to skip the C. So the phonome SCI invades both S and + * C. It would be better, IMHO, to skip the C from the S part of + * the encoding. Hell, I'm trying it. + */ + /* Ignore non-alphas */ - if( !isalpha(Curr_Letter) ) + if (!isalpha(Curr_Letter)) continue; - + /* Drop duplicates, except CC */ - if( Curr_Letter == Prev_Letter && - Curr_Letter != 'C' ) + if (Curr_Letter == Prev_Letter && + Curr_Letter != 'C') continue; - - switch (Curr_Letter) { - /* B -> B unless in MB */ + + switch (Curr_Letter) + { + /* B -> B unless in MB */ case 'B': - if( Prev_Letter != 'M' ) + if (Prev_Letter != 'M') Phonize('B'); break; - /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. - * (SCHW is handled in S) - * S if -CI-, -CE- or -CY- - * dropped if -SCI-, SCE-, -SCY- (handed in S) - * else K - */ + + /* + * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW + * is handled in S) S if -CI-, -CE- or -CY- dropped if + * -SCI-, SCE-, -SCY- (handed in S) else K + */ case 'C': - if( MAKESOFT(Next_Letter) ) { /* C[IEY] */ - if( After_Next_Letter == 'A' && - Next_Letter == 'I' ) { /* CIA */ + if (MAKESOFT(Next_Letter)) + { /* C[IEY] */ + if (After_Next_Letter == 'A' && + Next_Letter == 'I') + { /* CIA */ Phonize(SH); } /* SC[IEY] */ - else if ( Prev_Letter == 'S' ) { - /* Dropped */ - } - else { - Phonize('S'); + else if (Prev_Letter == 'S') + { + /* Dropped */ } + else + Phonize('S'); } - else if ( Next_Letter == 'H' ) { + else if (Next_Letter == 'H') + { #ifndef USE_TRADITIONAL_METAPHONE - if( After_Next_Letter == 'R' || - Prev_Letter == 'S' ) { /* Christ, School */ + if (After_Next_Letter == 'R' || + Prev_Letter == 'S') + { /* Christ, School */ Phonize('K'); } - else { + else Phonize(SH); - } #else Phonize(SH); #endif skip_letter++; } - else { + else Phonize('K'); - } break; - /* J if in -DGE-, -DGI- or -DGY- - * else T - */ + + /* + * J if in -DGE-, -DGI- or -DGY- else T + */ case 'D': - if( Next_Letter == 'G' && - MAKESOFT(After_Next_Letter) ) { + if (Next_Letter == 'G' && + MAKESOFT(After_Next_Letter)) + { Phonize('J'); skip_letter++; } else Phonize('T'); break; - /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH - * else dropped if -GNED, -GN, - * else dropped if -DGE-, -DGI- or -DGY- (handled in D) - * else J if in -GE-, -GI, -GY and not GG - * else K - */ + + /* + * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else + * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or + * -DGY- (handled in D) else J if in -GE-, -GI, -GY and + * not GG else K + */ case 'G': - if( Next_Letter == 'H' ) { - if( !( NOGHTOF(Look_Back_Letter(3)) || - Look_Back_Letter(4) == 'H' ) ) { + if (Next_Letter == 'H') + { + if (!(NOGHTOF(Look_Back_Letter(3)) || + Look_Back_Letter(4) == 'H')) + { Phonize('F'); skip_letter++; } - else { + else + { /* silent */ } } - else if( Next_Letter == 'N' ) { - if( Isbreak(After_Next_Letter) || - ( After_Next_Letter == 'E' && - Look_Ahead_Letter(3) == 'D' ) ) { + else if (Next_Letter == 'N') + { + if (Isbreak(After_Next_Letter) || + (After_Next_Letter == 'E' && + Look_Ahead_Letter(3) == 'D')) + { /* dropped */ } else Phonize('K'); } - else if( MAKESOFT(Next_Letter) && - Prev_Letter != 'G' ) { + else if (MAKESOFT(Next_Letter) && + Prev_Letter != 'G') Phonize('J'); - } - else { + else Phonize('K'); - } break; - /* H if before a vowel and not after C,G,P,S,T */ + /* H if before a vowel and not after C,G,P,S,T */ case 'H': - if( isvowel(Next_Letter) && - !AFFECTH(Prev_Letter) ) + if (isvowel(Next_Letter) && + !AFFECTH(Prev_Letter)) Phonize('H'); break; - /* dropped if after C - * else K - */ + + /* + * dropped if after C else K + */ case 'K': - if( Prev_Letter != 'C' ) + if (Prev_Letter != 'C') Phonize('K'); break; - /* F if before H - * else P - */ + + /* + * F if before H else P + */ case 'P': - if( Next_Letter == 'H' ) { + if (Next_Letter == 'H') Phonize('F'); - } - else { + else Phonize('P'); - } break; - /* K - */ + + /* + * K + */ case 'Q': Phonize('K'); break; - /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- - * else S - */ + + /* + * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S + */ case 'S': - if( Next_Letter == 'I' && - ( After_Next_Letter == 'O' || - After_Next_Letter == 'A' ) ) { + if (Next_Letter == 'I' && + (After_Next_Letter == 'O' || + After_Next_Letter == 'A')) Phonize(SH); - } - else if ( Next_Letter == 'H' ) { + else if (Next_Letter == 'H') + { Phonize(SH); skip_letter++; } #ifndef USE_TRADITIONAL_METAPHONE - else if ( Next_Letter == 'C' && - Look_Ahead_Letter(2) == 'H' && - Look_Ahead_Letter(3) == 'W' ) { + else if (Next_Letter == 'C' && + Look_Ahead_Letter(2) == 'H' && + Look_Ahead_Letter(3) == 'W') + { Phonize(SH); skip_letter += 2; } #endif - else { + else Phonize('S'); - } break; - /* 'sh' in -TIA- or -TIO- - * else 'th' before H - * else T - */ + + /* + * 'sh' in -TIA- or -TIO- else 'th' before H else T + */ case 'T': - if( Next_Letter == 'I' && - ( After_Next_Letter == 'O' || - After_Next_Letter == 'A' ) ) { + if (Next_Letter == 'I' && + (After_Next_Letter == 'O' || + After_Next_Letter == 'A')) Phonize(SH); - } - else if ( Next_Letter == 'H' ) { + else if (Next_Letter == 'H') + { Phonize(TH); skip_letter++; } - else { + else Phonize('T'); - } break; - /* F */ + /* F */ case 'V': Phonize('F'); break; - /* W before a vowel, else dropped */ + /* W before a vowel, else dropped */ case 'W': - if( isvowel(Next_Letter) ) + if (isvowel(Next_Letter)) Phonize('W'); break; - /* KS */ + /* KS */ case 'X': Phonize('K'); Phonize('S'); break; - /* Y if followed by a vowel */ + /* Y if followed by a vowel */ case 'Y': - if( isvowel(Next_Letter) ) + if (isvowel(Next_Letter)) Phonize('Y'); break; - /* S */ + /* S */ case 'Z': Phonize('S'); break; - /* No transformation */ + /* No transformation */ case 'F': case 'J': case 'L': @@ -620,15 +651,15 @@ int _metaphone ( default: /* nothing */ break; - } /* END SWITCH */ - + } /* END SWITCH */ + w_idx += skip_letter; - } /* END FOR */ - + } /* END FOR */ + End_Phoned_Word; - - return(META_SUCCESS); -} /* END metaphone */ + + return (META_SUCCESS); +} /* END metaphone */ /* |