1 files changed, 615 insertions, 0 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
new file mode 100644
index 00000000000..41ea9fe1f29
--- /dev/null
+++ b/src/backend/regex/regc_locale.c
@@ -0,0 +1,615 @@
+/* 
+ * regc_locale.c --
+ *
+ *	This file contains locale-specific regexp routines.
+ *	This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ *
+ * This software is copyrighted by the Regents of the University of
+ * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+ * Corporation and other parties.  The following terms apply to all files
+ * associated with the software unless explicitly disclaimed in
+ * individual files.
+ * 
+ * The authors hereby grant permission to use, copy, modify, distribute,
+ * and license this software and its documentation for any purpose, provided
+ * that existing copyright notices are retained in all copies and that this
+ * notice is included verbatim in any distributions. No written agreement,
+ * license, or royalty fee is required for any of the authorized uses.
+ * Modifications to this software may be copyrighted by their authors
+ * and need not follow the licensing terms described here, provided that
+ * the new terms are clearly indicated on the first page of each file where
+ * they apply.
+ * 
+ * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+ * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+ * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * 
+ * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+ * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+ * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ * MODIFICATIONS.
+ * 
+ * GOVERNMENT USE: If you are acquiring this software on behalf of the
+ * U.S. government, the Government shall have only "Restricted Rights"
+ * in the software and related documentation as defined in the Federal 
+ * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+ * are acquiring the software on behalf of the Department of Defense, the
+ * software shall be classified as "Commercial Computer Software" and the
+ * Government shall have only "Restricted Rights" as defined in Clause
+ * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+ * authors grant the U.S. Government and others acting in its behalf
+ * permission to use and distribute the software in accordance with the
+ * terms specified in this license. 
+ *
+ * $Header: /cvsroot/pgsql/src/backend/regex/regc_locale.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ */
+
+/* ASCII character-name table: the names element() accepts for collating */
+/* elements.  Names sharing a code are aliases; a NULL name ends the table. */
+static struct cname {
+    char *name;			/* collating-element name */
+    char code;			/* character the name stands for */
+} cnames[] = {
+    {"NUL",		'\0'},
+    {"SOH",		'\001'},
+    {"STX",		'\002'},
+    {"ETX",		'\003'},
+    {"EOT",		'\004'},
+    {"ENQ",		'\005'},
+    {"ACK",		'\006'},
+    {"BEL",		'\007'},
+    {"alert",		'\007'},
+    {"BS",		'\010'},
+    {"backspace",	'\b'},
+    {"HT",		'\011'},
+    {"tab",		'\t'},
+    {"LF",		'\012'},
+    {"newline",		'\n'},
+    {"VT",		'\013'},
+    {"vertical-tab",	'\v'},
+    {"FF",		'\014'},
+    {"form-feed",	'\f'},
+    {"CR",		'\015'},
+    {"carriage-return",	'\r'},
+    {"SO",		'\016'},
+    {"SI",		'\017'},
+    {"DLE",		'\020'},
+    {"DC1",		'\021'},
+    {"DC2",		'\022'},
+    {"DC3",		'\023'},
+    {"DC4",		'\024'},
+    {"NAK",		'\025'},
+    {"SYN",		'\026'},
+    {"ETB",		'\027'},
+    {"CAN",		'\030'},
+    {"EM",		'\031'},
+    {"SUB",		'\032'},
+    {"ESC",		'\033'},
+    {"IS4",		'\034'},
+    {"FS",		'\034'},
+    {"IS3",		'\035'},
+    {"GS",		'\035'},
+    {"IS2",		'\036'},
+    {"RS",		'\036'},
+    {"IS1",		'\037'},
+    {"US",		'\037'},
+    {"space",		' '},
+    {"exclamation-mark",'!'},
+    {"quotation-mark",	'"'},
+    {"number-sign",	'#'},
+    {"dollar-sign",	'$'},
+    {"percent-sign",	'%'},
+    {"ampersand",	'&'},
+    {"apostrophe",	'\''},
+    {"left-parenthesis",'('},
+    {"right-parenthesis", ')'},
+    {"asterisk",	'*'},
+    {"plus-sign",	'+'},
+    {"comma",		','},
+    {"hyphen",		'-'},
+    {"hyphen-minus",	'-'},
+    {"period",		'.'},
+    {"full-stop",	'.'},
+    {"slash",		'/'},
+    {"solidus",		'/'},
+    {"zero",		'0'},
+    {"one",		'1'},
+    {"two",		'2'},
+    {"three",		'3'},
+    {"four",		'4'},
+    {"five",		'5'},
+    {"six",		'6'},
+    {"seven",		'7'},
+    {"eight",		'8'},
+    {"nine",		'9'},
+    {"colon",		':'},
+    {"semicolon",	';'},
+    {"less-than-sign",	'<'},
+    {"equals-sign",	'='},
+    {"greater-than-sign", '>'},
+    {"question-mark",	'?'},
+    {"commercial-at",	'@'},
+    {"left-square-bracket", '['},
+    {"backslash",	'\\'},
+    {"reverse-solidus",	'\\'},
+    {"right-square-bracket", ']'},
+    {"circumflex",	'^'},
+    {"circumflex-accent", '^'},
+    {"underscore",	'_'},
+    {"low-line",	'_'},
+    {"grave-accent",	'`'},
+    {"left-brace",	'{'},
+    {"left-curly-bracket", '{'},
+    {"vertical-line",	'|'},
+    {"right-brace",	'}'},
+    {"right-curly-bracket", '}'},
+    {"tilde",		'~'},
+    {"DEL",		'\177'},
+    {NULL,		0}		/* end marker: element() stops at a NULL name */
+};
+
+/*
+ * ctype wrappers that reject chr values outside the range 0..UCHAR_MAX
+ */
+static int				/* 1 if c is in 0..UCHAR_MAX and isdigit() accepts it, else 0 */
+pg_isdigit(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (isdigit((unsigned char) c) != 0);
+}
+
+static int				/* 1 if c is in 0..UCHAR_MAX and isalpha() accepts it, else 0 */
+pg_isalpha(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (isalpha((unsigned char) c) != 0);
+}
+
+static int				/* 1 if c is in 0..UCHAR_MAX and isalnum() accepts it, else 0 */
+pg_isalnum(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (isalnum((unsigned char) c) != 0);
+}
+
+static int				/* 1 if c is in 0..UCHAR_MAX and isupper() accepts it, else 0 */
+pg_isupper(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (isupper((unsigned char) c) != 0);
+}
+
+static int				/* 1 if c is in 0..UCHAR_MAX and islower() accepts it, else 0 */
+pg_islower(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (islower((unsigned char) c) != 0);
+}
+
+static int				/* 1 if c is in 0..UCHAR_MAX and isgraph() accepts it, else 0 */
+pg_isgraph(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (isgraph((unsigned char) c) != 0);
+}
+
+static int				/* 1 if c is in 0..UCHAR_MAX and ispunct() accepts it, else 0 */
+pg_ispunct(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (ispunct((unsigned char) c) != 0);
+}
+
+static int				/* 1 if c is in 0..UCHAR_MAX and isspace() accepts it, else 0 */
+pg_isspace(pg_wchar c)
+{
+	return (c < 0 || c > UCHAR_MAX) ? 0 : (isspace((unsigned char) c) != 0);
+}
+
+static pg_wchar			/* toupper() of c, or c itself if outside 0..UCHAR_MAX */
+pg_toupper(pg_wchar c)
+{
+	if (c < 0 || c > UCHAR_MAX)
+		return c;				/* out of toupper()'s domain: pass through */
+	return (pg_wchar) toupper((unsigned char) c);
+}
+
+static pg_wchar			/* tolower() of c, or c itself if outside 0..UCHAR_MAX */
+pg_tolower(pg_wchar c)
+{
+	if (c < 0 || c > UCHAR_MAX)
+		return c;				/* out of tolower()'s domain: pass through */
+	return (pg_wchar) tolower((unsigned char) c);
+}
+
+
+/*
+ * nmcces - how many distinct MCCEs are there?
+ *
+ * Always 0: no multi-character collating elements are defined at the moment.
+ */
+static int
+nmcces(struct vars *v)
+{
+	(void) v;					/* the context is not consulted */
+	return 0;
+}
+
+/*
+ * nleaders - how many chrs can be first chrs of MCCEs?
+ */
+static int				/* always 0 */
+nleaders(struct vars *v)
+{
+	return 0;					/* v is not consulted */
+}
+
+/*
+ * allmcces - return a cvec with all the MCCEs of the locale
+ */
+static struct cvec *		/* the result of clearcvec(cv) */
+allmcces(struct vars *v,			/* context */
+		 struct cvec *cv)			/* this is supposed to have enough room */
+{
+	return clearcvec(cv);		/* nothing is added to the vector here */
+}
+
+/*
+ * element - map collating-element name to celt
+ *
+ * A one-chr name stands for itself; longer names are looked up in cnames[].
+ * An unknown name reports REG_ECOLLATE and maps to 0.
+ */
+static celt
+element(struct vars *v,			/* context */
+		chr *startp,			/* points to start of name */
+		chr *endp)				/* points just past end of name */
+{
+	struct cname *entry;
+	size_t		namelen;
+
+	assert(startp < endp);
+	namelen = endp - startp;
+
+	/* generic:  one-chr names stand for themselves */
+	if (namelen == 1)
+		return *startp;
+
+	NOTE(REG_ULOCALE);
+
+	/* walk the table until a name of equal length and content turns up */
+	for (entry = cnames; entry->name != NULL; entry++) {
+		if (strlen(entry->name) != namelen)
+			continue;
+		if (pg_char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
+			return CHR(entry->code);
+	}
+
+	/* couldn't find it */
+	ERR(REG_ECOLLATE);
+	return 0;
+}
+
+/*
+ * range - supply cvec for a range, including legality check
+ *
+ * Reports REG_ERANGE and returns NULL unless a == b or before(a, b) holds.
+ */
+static struct cvec *
+range(struct vars *v,			/* context */
+	  celt a,				/* range start */
+	  celt b,				/* range end, might equal a */
+	  int cases)				/* case-independent? */
+{
+	struct cvec *result;
+	celt		cur;
+	int			space;
+
+	if (a != b && !before(a, b)) {
+		ERR(REG_ERANGE);
+		return NULL;
+	}
+
+	/* easy version: one cvec range covers a..b */
+	if (!cases) {
+		result = getcvec(v, 0, 1, 0);
+		NOERRN();
+		addrange(result, a, b);
+		return result;
+	}
+
+	/*
+	 * When case-independent, it's hard to decide when cvec ranges are
+	 * usable, so for now at least, we won't try.  We allocate enough
+	 * space for two case variants plus a little extra for the two
+	 * title case variants.
+	 */
+	space = (b - a + 1) * 2 + 4;
+	result = getcvec(v, space, 0, 0);
+	NOERRN();
+
+	for (cur = a; cur <= b; cur++) {
+		celt		lower = pg_tolower((chr) cur);
+		celt		upper = pg_toupper((chr) cur);
+
+		/* the chr itself first, then whichever case forms differ from it */
+		addchr(result, cur);
+		if (lower != cur)
+			addchr(result, lower);
+		if (upper != cur)
+			addchr(result, upper);
+	}
+	return result;
+}
+
+/*
+ * before - is celt x before celt y, for purposes of range legality?
+ *
+ * Returns 1 when x is strictly less than y and 0 otherwise, so a celt is
+ * never "before" itself.
+ */
+static int				/* predicate */
+before(celt x, celt y)
+{
+	/* trivial because no MCCEs */
+	return (x < y) ? 1 : 0;
+}
+
+/*
+ * eclass - supply cvec for an equivalence class
+ * No real classes are defined: the result is c alone, or c together with
+ * its case counterparts when "cases" is set.
+ */
+static struct cvec *
+eclass(struct vars *v,			/* context */
+	   celt c,					/* Collating element representing
+								 * the equivalence class. */
+	   int cases)				/* all cases? */
+{
+	struct cvec *cv;
+
+	if ((v->cflags & REG_FAKE) && c == 'x') {	/* crude fake class for testing */
+		cv = getcvec(v, 4, 0, 0);
+		NOERRN();				/* as in range(): leave before cv is touched */
+		addchr(cv, (chr) 'x');
+		addchr(cv, (chr) 'y');
+		if (cases) {
+			addchr(cv, (chr) 'X');
+			addchr(cv, (chr) 'Y');
+		}
+		return cv;
+	}
+
+	/* otherwise, none */
+	if (cases)
+		return allcases(v, c);
+	cv = getcvec(v, 1, 0, 0);
+	NOERRN();					/* bail out on failure rather than assert */
+	addchr(cv, (chr) c);
+	return cv;
+}
+
+/*
+ * cclass - supply cvec for a character class
+ *
+ * startp and endp delimit the class name.  A name not listed in classNames[]
+ * reports REG_ECTYPE and yields NULL.  When "cases" is set, "lower" and
+ * "upper" are widened to "alpha" so that case counterparts are included.
+ *
+ * Returns the filled cvec, or NULL after reporting REG_ESPACE when
+ * getcvec() handed back NULL.
+ */
+static struct cvec *
+cclass(struct vars *v,			/* context */
+	   chr *startp,			/* where the name starts */
+	   chr *endp,				/* just past the end of the name */
+	   int cases)				/* case-independent? */
+{
+	size_t		len;
+	struct cvec *cv = NULL;
+	char	  **namePtr;
+	int			i, index;
+
+	/* Valid class names; their order must match enum classes below. */
+	static char *classNames[] = {
+		"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
+		"lower", "print", "punct", "space", "upper", "xdigit", NULL
+	};
+
+	enum classes {
+		CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
+		CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
+	};
+
+	/* Map the name to the corresponding enumerated value. */
+	len = endp - startp;
+	index = -1;
+	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) {
+		if (strlen(*namePtr) == len &&
+			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) {
+			index = i;
+			break;
+		}
+	}
+	if (index == -1) {
+		ERR(REG_ECTYPE);
+		return NULL;
+	}
+
+	/* Remap lower and upper to alpha if the match is case insensitive. */
+	if (cases &&
+		((enum classes) index == CC_LOWER ||
+		 (enum classes) index == CC_UPPER))
+		index = (int) CC_ALPHA;
+
+	/*
+	 * Now compute the character class contents.  For the moment, assume
+	 * that only char codes < 256 can be in these classes.  Every branch
+	 * leaves a NULL cv alone; it is reported after the switch.
+	 */
+	switch ((enum classes) index) {
+	case CC_PRINT:
+		/* printable chrs include punctuation and space, so ask isprint() */
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (isprint((unsigned char) i))
+					addchr(cv, (chr) i);
+		}
+		break;
+	case CC_ALNUM:
+		cv = getcvec(v, UCHAR_MAX, 1, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (pg_isalpha((chr) i))
+					addchr(cv, (chr) i);
+			addrange(cv, (chr) '0', (chr) '9');
+		}
+		break;
+	case CC_ALPHA:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (pg_isalpha((chr) i))
+					addchr(cv, (chr) i);
+		}
+		break;
+	case CC_ASCII:
+		cv = getcvec(v, 0, 1, 0);
+		if (cv) {
+			addrange(cv, 0, 0x7f);
+		}
+		break;
+	case CC_BLANK:
+		cv = getcvec(v, 2, 0, 0);
+		if (cv) {				/* getcvec() may hand back NULL */
+			addchr(cv, '\t');
+			addchr(cv, ' ');
+		}
+		break;
+	case CC_CNTRL:
+		cv = getcvec(v, 0, 2, 0);
+		if (cv) {				/* getcvec() may hand back NULL */
+			addrange(cv, 0x0, 0x1f);
+			addrange(cv, 0x7f, 0x9f);
+		}
+		break;
+	case CC_DIGIT:
+		cv = getcvec(v, 0, 1, 0);
+		if (cv) {
+			addrange(cv, (chr) '0', (chr) '9');
+		}
+		break;
+	case CC_PUNCT:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (pg_ispunct((chr) i))
+					addchr(cv, (chr) i);
+		}
+		break;
+	case CC_XDIGIT:
+		cv = getcvec(v, 0, 3, 0);
+		if (cv) {
+			addrange(cv, '0', '9');
+			addrange(cv, 'a', 'f');
+			addrange(cv, 'A', 'F');
+		}
+		break;
+	case CC_SPACE:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (pg_isspace((chr) i))
+					addchr(cv, (chr) i);
+		}
+		break;
+	case CC_LOWER:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (pg_islower((chr) i))
+					addchr(cv, (chr) i);
+		}
+		break;
+	case CC_UPPER:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (pg_isupper((chr) i))
+					addchr(cv, (chr) i);
+		}
+		break;
+	case CC_GRAPH:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i = 0; i <= UCHAR_MAX; i++)
+				if (pg_isgraph((chr) i))
+					addchr(cv, (chr) i);
+		}
+		break;
+	}
+	if (cv == NULL) {
+		ERR(REG_ESPACE);
+	}
+	return cv;
+}
+
+/*
+ * allcases - supply cvec for all case counterparts of a chr (including itself)
+ *
+ * This is a shortcut, preferably an efficient one, for simple characters;
+ * messy cases are done via range().  The cvec receives the lower-case form
+ * of pc, followed by the upper-case form when the two differ.  If getcvec()
+ * fails, NOERRN() returns before the cvec is used.
+ */
+static struct cvec *
+allcases(struct vars *v,			/* context */
+		 chr pc)				/* character to get case equivs of */
+{
+	struct cvec *cv;
+	chr			lc = pg_tolower(pc);
+	chr			uc = pg_toupper(pc);
+
+	cv = getcvec(v, 2, 0, 0);
+	NOERRN();				/* as in range(): leave before cv is touched */
+	addchr(cv, lc);
+	if (lc != uc) {
+		addchr(cv, uc);
+	}
+	return cv;
+}
+
+/*
+ * cmp - chr-substring compare
+ *
+ * Backrefs need this.  Only equal/unequal is reported, and exactly len
+ * chrs are compared: an embedded NUL does not end the comparison.
+ */
+static int				/* 0 for equal, nonzero for unequal */
+cmp(const chr *x, const chr *y,			/* strings to compare */
+    size_t len)				/* exact length of comparison */
+{
+	size_t		nbytes = len * sizeof(chr);
+
+	return memcmp(VS(x), VS(y), nbytes);
+}
+
+/*
+ * casecmp - case-independent chr-substring compare
+ *
+ * REG_ICASE backrefs need this.  Only equal/unequal is reported, and
+ * exactly len chrs are compared: embedded NULs do not stop the scan.
+ */
+static int				/* 0 for equal, nonzero for unequal */
+casecmp(const chr *x, const chr *y,			/* strings to compare */
+		size_t len)				/* exact length of comparison */
+{
+	size_t		i;
+
+	for (i = 0; i < len; i++) {
+		/* identical chrs need no folding; otherwise compare lower-case forms */
+		if (x[i] != y[i] && pg_tolower(x[i]) != pg_tolower(y[i]))
+			return 1;
+	}
+	return 0;
+}