/*-------------------------------------------------------------------------
 *
 * wparser_def.c
 *		Standard word parser
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "commands/defrem.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"


/*
 * Token type codes.
 *
 * Remember: LASTNUM must stay equal to the highest token type code below,
 * and lex_descr[] / tok_alias[] carry one entry per code (plus an empty
 * entry at index 0).
 */
#define LASTNUM			23

#define LATWORD			1
#define CYRWORD			2
#define UWORD			3
#define EMAIL			4
#define FURL			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define PARTHYPHENWORD	9
#define CYRPARTHYPHENWORD	10
#define LATPARTHYPHENWORD	11
#define SPACE			12
#define TAG				13
#define PROTOCOL		14
#define HYPHENWORD		15
#define LATHYPHENWORD	16
#define CYRHYPHENWORD	17
#define URI				18
#define FILEPATH		19
#define DECIMAL			20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define HTMLENTITY		23

/* Human-readable description of each token type (same order as the codes) */
static const char *lex_descr[] = {
	"",
	"Latin word",
	"Non-latin word",
	"Word",
	"Email",
	"URL",
	"Host",
	"Scientific notation",
	"VERSION",
	"Part of hyphenated word",
	"Non-latin part of hyphenated word",
	"Latin part of hyphenated word",
	"Space symbols",
	"HTML Tag",
	"Protocol head",
	"Hyphenated word",
	"Latin hyphenated word",
	"Non-latin hyphenated word",
	"URI",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"HTML Entity"
};

/* Short alias of each token type (same order as the codes) */
static const char *tok_alias[] = {
	"",
	"lword",
	"nlword",
	"word",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"part_hword",
	"nlpart_hword",
	"lpart_hword",
	"blank",
	"tag",
	"protocol",
	"hword",
	"lhword",
	"nlhword",
	"uri",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};

/* States of the parser's state machine */
typedef enum
{
	TPS_Base = 0,
	TPS_InUWord,
	TPS_InLatWord,
	TPS_InCyrWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InHTMLEntityFirst,
	TPS_InHTMLEntity,
	TPS_InHTMLEntityNumFirst,
	TPS_InHTMLEntityNum,
	TPS_InHTMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURIFirst,
	TPS_InURIStart,
	TPS_InURI,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenLatWordFirst,
	TPS_InHyphenLatWord,
	TPS_InHyphenCyrWordFirst,
	TPS_InHyphenCyrWord,
	TPS_InHyphenUWordFirst,
	TPS_InHyphenUWord,
	TPS_InHyphenValueFirst,
	TPS_InHyphenValue,
	TPS_InHyphenValueExact,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenCyrWordPart,
	TPS_InHyphenLatWordPart,
	TPS_InHyphenUWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_InHDecimalPartFirst,
	TPS_InHDecimalPart,
	TPS_InHVersionPartFirst,
	TPS_InHVersionPart,
	TPS_Null					/* last state (fake value) */
} TParserState;

/* forward declaration */
struct TParser;

/* any p_is* function except p_iseq (which takes an extra argument) */
typedef int (*TParserCharTest) (struct TParser *);

/* special handler for special cases... */
typedef void (*TParserSpecial) (struct TParser *);

/*
 * One entry of a state's action list.  The tables that use this type are
 * not part of this section; field meanings below are inferred from names
 * and types -- NOTE(review): confirm against the action tables.
 */
typedef struct
{
	TParserCharTest isclass;	/* character test for this entry */
	char		c;				/* character operand, presumably for
								 * p_iseqC/p_isneC via prs->c */
	uint16		flags;
	TParserState tostate;		/* state to move to */
	int			type;			/* token type code */
	TParserSpecial special;		/* optional special handler */
} TParserStateActionItem;

/* Binds a parser state to its list of action items */
typedef struct
{
	TParserState state;
	TParserStateActionItem *action;
} TParserStateAction;

/*
 * A parser position.  Positions are chained through 'prev'; see
 * newTParserPosition() and TParserClose().
 */
typedef struct TParserPosition
{
	int			posbyte;		/* position of parser in bytes */
	int			poschar;		/* position of parser in characters */
	int			charlen;		/* length of current char */
	int			lenbytelexeme;
	int			lencharlexeme;
	TParserState state;
	struct TParserPosition *prev;
	int			flags;
	TParserStateActionItem *pushedAtAction;
} TParserPosition;

typedef struct TParser
{
	/* string and position information */
	char	   *str;			/* multibyte string */
	int			lenstr;			/* length of mbstring */
#ifdef TS_USE_WIDE
	wchar_t    *wstr;			/* wide character string */
	int			lenwstr;		/* length of wide string */
#endif

	/* State of parse */
	int			charmaxlen;
	bool		usewide;
	TParserPosition *state;
	bool		ignore;
	bool		wanthost;

	/* silly char */
	char		c;

	/* out */
	char	   *lexeme;
	int			lenbytelexeme;
	int			lencharlexeme;
	int			type;
} TParser;

/*
 * Allocate a new parser position.
 *
 * If 'prev' is given the new position starts as a copy of it, otherwise it
 * is zero-filled.  In both cases the new position is linked to 'prev' and
 * its pushedAtAction is reset to NULL.
 */
static TParserPosition *
newTParserPosition(TParserPosition * prev)
{
	TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));

	if (prev)
		memcpy(res, prev, sizeof(TParserPosition));
	else
		memset(res, 0, sizeof(TParserPosition));

	res->prev = prev;

	res->pushedAtAction = NULL;

	return res;
}

/*
 * Create a parser over the first 'len' bytes of 'str'.
 *
 * The string is referenced, not copied.  When TS_USE_WIDE is defined and
 * the database encoding is multibyte, a wide-character copy of the string
 * is built as well and usewide is set.  The parser starts in TPS_Base.
 */
static TParser *
TParserInit(char *str, int len)
{
	TParser    *prs = (TParser *) palloc0(sizeof(TParser));

	prs->charmaxlen = pg_database_encoding_max_length();
	prs->str = str;
	prs->lenstr = len;

#ifdef TS_USE_WIDE

	/*
	 * Use wide char code only when max encoding length > 1.
	 */
	if (prs->charmaxlen > 1)
	{
		prs->usewide = true;
		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
	}
	else
#endif
		prs->usewide = false;

	prs->state = newTParserPosition(NULL);
	prs->state->state = TPS_Base;

	return prs;
}

static bool TParserGet(TParser * prs);

/*
 * Release a parser: every position on the state chain, the wide-character
 * copy of the string (if one was built) and the parser itself.  The input
 * string passed to TParserInit() is not freed.
 */
static void
TParserClose(TParser * prs)
{
	while (prs->state)
	{
		TParserPosition *ptr = prs->state->prev;

		pfree(prs->state);
		prs->state = ptr;
	}

#ifdef TS_USE_WIDE
	if (prs->wstr)
		pfree(prs->wstr);
#endif

	pfree(prs);
}

/*
 * Character-class support functions, equivalent to the is* macros, but
 * working with any possible encoding and locale.  Note that with a
 * multibyte encoding and C locale the isw* functions may fail or give
 * wrong results.  Note 2: multibyte encoding with C locale is often used
 * for Asian languages.
 */

#ifdef TS_USE_WIDE

/*
 * p_iswhat(type) defines p_is<type>() and p_isnot<type>() for the current
 * character of the parser.
 *
 * In the wide/C-locale case only ASCII code points are handed to the
 * single-byte is<type>() function; anything above 0x7f is reported as not
 * belonging to the class.  (Masking the wide character down to its low
 * byte would classify it by an unrelated ASCII character.)  The classes
 * that must treat non-ASCII characters as members, alnum and alpha, are
 * written out by hand below.
 *
 * In the wide/non-C-locale case the character is converted to wint_t by
 * value; the wchar_t storage must not be re-read through a wint_t pointer
 * because the two types need not have the same size.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	if ( prs->usewide ) \
	{ \
		if ( lc_ctype_is_c() ) \
		{ \
			unsigned int c = *( prs->wstr + prs->state->poschar ); \
 \
			if ( c > 0x7f ) \
				return 0; \
			return is##type( c ); \
		} \
 \
		return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) ); \
	} \
 \
	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}

/* Is the current character alphanumeric? */
static int
p_isalnum(TParser * prs)
{
	Assert(prs->state);

	if (prs->usewide)
	{
		if (lc_ctype_is_c())
		{
			unsigned int c = *(prs->wstr + prs->state->poschar);

			/*
			 * any non-ascii symbol with multibyte encoding with C-locale is
			 * an alpha character
			 */
			if (c > 0x7f)
				return 1;

			return isalnum(0xff & c);
		}

		return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
	}

	return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
}

static int
p_isnotalnum(TParser * prs)
{
	return !p_isalnum(prs);
}

/* Is the current character alphabetic? */
static int
p_isalpha(TParser * prs)
{
	Assert(prs->state);

	if (prs->usewide)
	{
		if (lc_ctype_is_c())
		{
			unsigned int c = *(prs->wstr + prs->state->poschar);

			/*
			 * any non-ascii symbol with multibyte encoding with C-locale is
			 * an alpha character
			 */
			if (c > 0x7f)
				return 1;

			return isalpha(0xff & c);
		}

		return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
	}

	return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
}

static int
p_isnotalpha(TParser * prs)
{
	return !p_isalpha(prs);
}

/* p_iseq should be used only for ascii symbols */

static int
p_iseq(TParser * prs, char c)
{
	Assert(prs->state);
	/* a multibyte character can never equal a single ASCII byte */
	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
}

#else							/* TS_USE_WIDE */

/* Single-byte-only variant: classify the byte at the current position */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}

static int
p_iseq(TParser * prs, char c)
{
	Assert(prs->state);
	return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}

p_iswhat(alnum)
p_iswhat(alpha)

#endif   /* TS_USE_WIDE */

p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
p_iswhat(punct)
p_iswhat(space)
p_iswhat(upper)
p_iswhat(xdigit)

/* End of input: all bytes consumed, or the current character has no length */
static int
p_isEOF(TParser * prs)
{
	Assert(prs->state);
	return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
}

/* Does the current character equal the parser's operand character prs->c? */
static int
p_iseqC(TParser * prs)
{
	return p_iseq(prs, prs->c);
}

/* Does the current character differ from prs->c? */
static int
p_isneC(TParser * prs)
{
	return !p_iseq(prs, prs->c);
}

/* Is the current character a single-byte ASCII character? */
static int
p_isascii(TParser * prs)
{
	return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
}

/* Alphabetic and ASCII */
static int
p_islatin(TParser * prs)
{
	return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0;
}

/* Alphabetic but not ASCII */
static int
p_isnonlatin(TParser * prs)
{
	return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0;
}

/*
 * References every generated p_is* function, presumably so the compiler
 * does not warn about unused static functions.
 *
 * Must never actually be called: each call passes a NULL parser, which the
 * callee dereferences.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	p_isalnum(NULL);
	p_isnotalnum(NULL);
	p_isalpha(NULL);
	p_isnotalpha(NULL);
	p_isdigit(NULL);
	p_isnotdigit(NULL);
	p_islower(NULL);
	p_isnotlower(NULL);
	p_isprint(NULL);
	p_isnotprint(NULL);
	p_ispunct(NULL);
	p_isnotpunct(NULL);
	p_isspace(NULL);
	p_isnotspace(NULL);
	p_isupper(NULL);
	p_isnotupper(NULL);
	p_isxdigit(NULL);
	p_isnotxdigit(NULL);
	p_isEOF(NULL);
	p_iseqC(NULL);
	p_isneC(NULL);
}


static void
SpecialTags(TParser * prs)
{
	switch (prs->state->lencharlexeme)
	{
		case 8: /* lexeme, "ignore = false; break; case 7: /*