diff options
Diffstat (limited to 'src/backend/regex/regc_locale.c')
-rw-r--r-- | src/backend/regex/regc_locale.c | 615 |
1 files changed, 615 insertions, 0 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c new file mode 100644 index 00000000000..41ea9fe1f29 --- /dev/null +++ b/src/backend/regex/regc_locale.c @@ -0,0 +1,615 @@ +/* + * regc_locale.c -- + * + * This file contains locale-specific regexp routines. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * This software is copyrighted by the Regents of the University of + * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + * Corporation and other parties. The following terms apply to all files + * associated with the software unless explicitly disclaimed in + * individual files. + * + * The authors hereby grant permission to use, copy, modify, distribute, + * and license this software and its documentation for any purpose, provided + * that existing copyright notices are retained in all copies and that this + * notice is included verbatim in any distributions. No written agreement, + * license, or royalty fee is required for any of the authorized uses. + * Modifications to this software may be copyrighted by their authors + * and need not follow the licensing terms described here, provided that + * the new terms are clearly indicated on the first page of each file where + * they apply. + * + * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + * MODIFICATIONS. + * + * GOVERNMENT USE: If you are acquiring this software on behalf of the + * U.S. government, the Government shall have only "Restricted Rights" + * in the software and related documentation as defined in the Federal + * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + * are acquiring the software on behalf of the Department of Defense, the + * software shall be classified as "Commercial Computer Software" and the + * Government shall have only "Restricted Rights" as defined in Clause + * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + * authors grant the U.S. Government and others acting in its behalf + * permission to use and distribute the software in accordance with the + * terms specified in this license. + * + * $Header: /cvsroot/pgsql/src/backend/regex/regc_locale.c,v 1.1 2003/02/05 17:41:32 tgl Exp $ + */ + +/* ASCII character-name table */ + +static struct cname { + char *name; + char code; +} cnames[] = { + {"NUL", '\0'}, + {"SOH", '\001'}, + {"STX", '\002'}, + {"ETX", '\003'}, + {"EOT", '\004'}, + {"ENQ", '\005'}, + {"ACK", '\006'}, + {"BEL", '\007'}, + {"alert", '\007'}, + {"BS", '\010'}, + {"backspace", '\b'}, + {"HT", '\011'}, + {"tab", '\t'}, + {"LF", '\012'}, + {"newline", '\n'}, + {"VT", '\013'}, + {"vertical-tab", '\v'}, + {"FF", '\014'}, + {"form-feed", '\f'}, + {"CR", '\015'}, + {"carriage-return", '\r'}, + {"SO", '\016'}, + {"SI", '\017'}, + {"DLE", '\020'}, + {"DC1", '\021'}, + {"DC2", '\022'}, + {"DC3", '\023'}, + {"DC4", '\024'}, + {"NAK", '\025'}, + {"SYN", '\026'}, + {"ETB", '\027'}, + {"CAN", '\030'}, + {"EM", '\031'}, + {"SUB", '\032'}, + {"ESC", '\033'}, + {"IS4", '\034'}, + {"FS", '\034'}, + {"IS3", '\035'}, + {"GS", '\035'}, + {"IS2", '\036'}, + {"RS", '\036'}, + {"IS1", '\037'}, + {"US", '\037'}, + {"space", ' '}, + {"exclamation-mark",'!'}, + {"quotation-mark", '"'}, + {"number-sign", '#'}, + {"dollar-sign", '$'}, + {"percent-sign", '%'}, + {"ampersand", '&'}, + {"apostrophe", '\''}, + {"left-parenthesis",'('}, + {"right-parenthesis", ')'}, + {"asterisk", '*'}, + {"plus-sign", '+'}, + {"comma", ','}, + {"hyphen", '-'}, + {"hyphen-minus", '-'}, + {"period", '.'}, + {"full-stop", '.'}, + {"slash", '/'}, + {"solidus", '/'}, + {"zero", '0'}, + {"one", '1'}, + {"two", '2'}, + {"three", '3'}, + {"four", '4'}, + {"five", '5'}, + {"six", '6'}, + {"seven", '7'}, + {"eight", '8'}, + {"nine", '9'}, + {"colon", ':'}, + {"semicolon", ';'}, + {"less-than-sign", '<'}, + {"equals-sign", '='}, + {"greater-than-sign", '>'}, + {"question-mark", '?'}, + {"commercial-at", '@'}, + {"left-square-bracket", '['}, + {"backslash", '\\'}, + {"reverse-solidus", '\\'}, + {"right-square-bracket", ']'}, + {"circumflex", '^'}, + {"circumflex-accent", '^'}, + {"underscore", '_'}, + {"low-line", '_'}, + {"grave-accent", '`'}, + {"left-brace", '{'}, + {"left-curly-bracket", '{'}, + {"vertical-line", '|'}, + {"right-brace", '}'}, + {"right-curly-bracket", '}'}, + {"tilde", '~'}, + {"DEL", '\177'}, + {NULL, 0} +}; + +/* + * some ctype functions with non-ascii-char guard + */ +static int +pg_isdigit(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c)); +} + +static int +pg_isalpha(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c)); +} + +static int +pg_isalnum(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c)); +} + +static int +pg_isupper(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c)); +} + +static int +pg_islower(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c)); +} + +static int +pg_isgraph(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c)); +} + +static int +pg_ispunct(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c)); +} + +static int +pg_isspace(pg_wchar c) +{ + return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c)); +} + +static pg_wchar +pg_toupper(pg_wchar c) +{ + if (c >= 0 && c <= UCHAR_MAX) + return toupper((unsigned char) c); + return c; +} + +static pg_wchar +pg_tolower(pg_wchar c) +{ + if (c >= 0 && c <= UCHAR_MAX) + return tolower((unsigned char) c); + return c; +} + + +/* + * nmcces - how many distinct MCCEs are there? + */ +static int +nmcces(struct vars *v) +{ + /* + * No multi-character collating elements defined at the moment. + */ + return 0; +} + +/* + * nleaders - how many chrs can be first chrs of MCCEs? + */ +static int +nleaders(struct vars *v) +{ + return 0; +} + +/* + * allmcces - return a cvec with all the MCCEs of the locale + */ +static struct cvec * +allmcces(struct vars *v, /* context */ + struct cvec *cv) /* this is supposed to have enough room */ +{ + return clearcvec(cv); +} + +/* + * element - map collating-element name to celt + */ +static celt +element(struct vars *v, /* context */ + chr *startp, /* points to start of name */ + chr *endp) /* points just past end of name */ +{ + struct cname *cn; + size_t len; + + /* generic: one-chr names stand for themselves */ + assert(startp < endp); + len = endp - startp; + if (len == 1) { + return *startp; + } + + NOTE(REG_ULOCALE); + + /* search table */ + for (cn=cnames; cn->name!=NULL; cn++) { + if (strlen(cn->name)==len && + pg_char_and_wchar_strncmp(cn->name, startp, len)==0) { + break; /* NOTE BREAK OUT */ + } + } + if (cn->name != NULL) { + return CHR(cn->code); + } + + /* couldn't find it */ + ERR(REG_ECOLLATE); + return 0; +} + +/* + * range - supply cvec for a range, including legality check + */ +static struct cvec * +range(struct vars *v, /* context */ + celt a, /* range start */ + celt b, /* range end, might equal a */ + int cases) /* case-independent? */ +{ + int nchrs; + struct cvec *cv; + celt c, lc, uc; + + if (a != b && !before(a, b)) { + ERR(REG_ERANGE); + return NULL; + } + + if (!cases) { /* easy version */ + cv = getcvec(v, 0, 1, 0); + NOERRN(); + addrange(cv, a, b); + return cv; + } + + /* + * When case-independent, it's hard to decide when cvec ranges are + * usable, so for now at least, we won't try. We allocate enough + * space for two case variants plus a little extra for the two + * title case variants. + */ + + nchrs = (b - a + 1)*2 + 4; + + cv = getcvec(v, nchrs, 0, 0); + NOERRN(); + + for (c=a; c<=b; c++) { + addchr(cv, c); + lc = pg_tolower((chr)c); + if (c != lc) { + addchr(cv, lc); + } + uc = pg_toupper((chr)c); + if (c != uc) { + addchr(cv, uc); + } + } + + return cv; +} + +/* + * before - is celt x before celt y, for purposes of range legality? + */ +static int /* predicate */ +before(celt x, celt y) +{ + /* trivial because no MCCEs */ + if (x < y) { + return 1; + } + return 0; +} + +/* + * eclass - supply cvec for an equivalence class + * Must include case counterparts on request. + */ +static struct cvec * +eclass(struct vars *v, /* context */ + celt c, /* Collating element representing + * the equivalence class. */ + int cases) /* all cases? */ +{ + struct cvec *cv; + + /* crude fake equivalence class for testing */ + if ((v->cflags®_FAKE) && c == 'x') { + cv = getcvec(v, 4, 0, 0); + addchr(cv, (chr)'x'); + addchr(cv, (chr)'y'); + if (cases) { + addchr(cv, (chr)'X'); + addchr(cv, (chr)'Y'); + } + return cv; + } + + /* otherwise, none */ + if (cases) { + return allcases(v, c); + } + cv = getcvec(v, 1, 0, 0); + assert(cv != NULL); + addchr(cv, (chr)c); + return cv; +} + +/* + * cclass - supply cvec for a character class + * + * Must include case counterparts on request. + */ +static struct cvec * +cclass(struct vars *v, /* context */ + chr *startp, /* where the name starts */ + chr *endp, /* just past the end of the name */ + int cases) /* case-independent? */ +{ + size_t len; + struct cvec *cv = NULL; + char **namePtr; + int i, index; + + /* + * The following arrays define the valid character class names. + */ + + static char *classNames[] = { + "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "xdigit", NULL + }; + + enum classes { + CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, + CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT + }; + + /* + * Map the name to the corresponding enumerated value. + */ + len = endp - startp; + index = -1; + for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) { + if (strlen(*namePtr) == len && + pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) { + index = i; + break; + } + } + if (index == -1) { + ERR(REG_ECTYPE); + return NULL; + } + + /* + * Remap lower and upper to alpha if the match is case insensitive. + */ + + if (cases && + ((enum classes) index == CC_LOWER || + (enum classes) index == CC_UPPER)) + index = (int) CC_ALPHA; + + /* + * Now compute the character class contents. + * + * For the moment, assume that only char codes < 256 can be in these + * classes. + */ + + switch((enum classes) index) { + case CC_PRINT: + case CC_ALNUM: + cv = getcvec(v, UCHAR_MAX, 1, 0); + if (cv) { + for (i=0 ; i<= UCHAR_MAX ; i++) { + if (pg_isalpha((chr) i)) + addchr(cv, (chr) i); + } + addrange(cv, (chr) '0', (chr) '9'); + } + break; + case CC_ALPHA: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) { + for (i=0 ; i<= UCHAR_MAX ; i++) { + if (pg_isalpha((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_ASCII: + cv = getcvec(v, 0, 1, 0); + if (cv) { + addrange(cv, 0, 0x7f); + } + break; + case CC_BLANK: + cv = getcvec(v, 2, 0, 0); + addchr(cv, '\t'); + addchr(cv, ' '); + break; + case CC_CNTRL: + cv = getcvec(v, 0, 2, 0); + addrange(cv, 0x0, 0x1f); + addrange(cv, 0x7f, 0x9f); + break; + case CC_DIGIT: + cv = getcvec(v, 0, 1, 0); + if (cv) { + addrange(cv, (chr) '0', (chr) '9'); + } + break; + case CC_PUNCT: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) { + for (i=0 ; i<= UCHAR_MAX ; i++) { + if (pg_ispunct((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_XDIGIT: + cv = getcvec(v, 0, 3, 0); + if (cv) { + addrange(cv, '0', '9'); + addrange(cv, 'a', 'f'); + addrange(cv, 'A', 'F'); + } + break; + case CC_SPACE: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) { + for (i=0 ; i<= UCHAR_MAX ; i++) { + if (pg_isspace((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_LOWER: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) { + for (i=0 ; i<= UCHAR_MAX ; i++) { + if (pg_islower((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_UPPER: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) { + for (i=0 ; i<= UCHAR_MAX ; i++) { + if (pg_isupper((chr) i)) + addchr(cv, (chr) i); + } + } + break; + case CC_GRAPH: + cv = getcvec(v, UCHAR_MAX, 0, 0); + if (cv) { + for (i=0 ; i<= UCHAR_MAX ; i++) { + if (pg_isgraph((chr) i)) + addchr(cv, (chr) i); + } + } + break; + } + if (cv == NULL) { + ERR(REG_ESPACE); + } + return cv; +} + +/* + * allcases - supply cvec for all case counterparts of a chr (including itself) + * + * This is a shortcut, preferably an efficient one, for simple characters; + * messy cases are done via range(). + */ +static struct cvec * +allcases(struct vars *v, /* context */ + chr pc) /* character to get case equivs of */ +{ + struct cvec *cv; + chr c = (chr)pc; + chr lc, uc; + + lc = pg_tolower((chr)c); + uc = pg_toupper((chr)c); + + cv = getcvec(v, 2, 0, 0); + addchr(cv, lc); + if (lc != uc) { + addchr(cv, uc); + } + return cv; +} + +/* + * cmp - chr-substring compare + * + * Backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +cmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + return memcmp(VS(x), VS(y), len*sizeof(chr)); +} + +/* + * casecmp - case-independent chr-substring compare + * + * REG_ICASE backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +casecmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + for (; len > 0; len--, x++, y++) { + if ((*x!=*y) && (pg_tolower(*x) != pg_tolower(*y))) { + return 1; + } + } + return 0; +} |