diff options
author | Marc G. Fournier <scrappy@hub.org> | 1998-03-15 07:39:04 +0000 |
---|---|---|
committer | Marc G. Fournier <scrappy@hub.org> | 1998-03-15 07:39:04 +0000 |
commit | 661ecf3c48e16a9add216287eb969d7615e47968 (patch) | |
tree | 91b54d5905aa2e22bd0ae9ea8c6b0f3cab75d3f4 /src/backend/regex/regcomp.c | |
parent | 31a925c4d07675bc098a742ee9ca642ec79a40ee (diff) | |
download | postgresql-661ecf3c48e16a9add216287eb969d7615e47968.tar.gz postgresql-661ecf3c48e16a9add216287eb969d7615e47968.zip |
From: t-ishii@sra.co.jp
Included are patches intended for allowing PostgreSQL to handle
multi-byte character sets such as EUC (Extended Unix Code), Unicode and
Mule internal code. With the MB patch you can use multi-byte character
sets in regexp and LIKE. The encoding system is chosen at
compile time.
To enable the MB extension, you need to define a variable "MB" in
Makefile.global or in Makefile.custom. For further information please
take a look at README.mb under the doc directory.
(Note that, unlike the "jp patch", I no longer use a modified GNU regexp.
Instead, I changed the Henry Spencer regexp that comes with PostgreSQL.)
Diffstat (limited to 'src/backend/regex/regcomp.c')
-rw-r--r-- | src/backend/regex/regcomp.c | 178 |
1 files changed, 142 insertions, 36 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index e31f8654049..6b7c472f1b9 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -62,8 +62,8 @@ static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; */ struct parse { - char *next; /* next character in RE */ - char *end; /* end of string (-> NUL normally) */ + pg_wchar *next; /* next character in RE */ + pg_wchar *end; /* end of string (-> NUL normally) */ int error; /* has an error been seen? */ sop *strip; /* malloced strip */ sopno ssize; /* malloced strip size (allocated) */ @@ -93,7 +93,7 @@ extern "C" static void p_b_term(struct parse * p, cset *cs); static void p_b_cclass(struct parse * p, cset *cs); static void p_b_eclass(struct parse * p, cset *cs); - static char p_b_symbol(struct parse * p); + static pg_wchar p_b_symbol(struct parse * p); static char p_b_coll_elem(struct parse * p, int endc); static char othercase(int ch); static void bothcases(struct parse * p, int ch); @@ -120,6 +120,10 @@ extern "C" static void stripsnug(struct parse * p, struct re_guts * g); static void findmust(struct parse * p, struct re_guts * g); static sopno pluscount(struct parse * p, struct re_guts * g); + static int pg_isdigit(int c); + static int pg_isalpha(int c); + static int pg_isupper(int c); + static int pg_islower(int c); #ifdef __cplusplus } @@ -127,7 +131,7 @@ extern "C" #endif /* ========= end header generated by ./mkh ========= */ -static char nuls[10]; /* place to point scanner in event of +static pg_wchar nuls[10]; /* place to point scanner in event of * error */ /* @@ -190,6 +194,9 @@ int cflags; struct parse *p = &pa; int i; size_t len; +#ifdef MB + pg_wchar *wcp; +#endif #ifdef REDEBUG #define GOODFLAGS(f) (f) @@ -203,12 +210,31 @@ int cflags; if (cflags & REG_PEND) { +#ifdef MB + wcp = preg->patsave; + if (preg->re_endp < wcp) + return (REG_INVARG); + len = preg->re_endp - wcp; +#else if (preg->re_endp < pattern) return (REG_INVARG); len = preg->re_endp 
- pattern; +#endif + } + else { +#ifdef MB + wcp = (pg_wchar *)malloc((strlen(pattern)+1) * sizeof(pg_wchar)); + if (wcp == NULL) { + return (REG_ESPACE); + } + preg->patsave = wcp; + (void)pg_mb2wchar((unsigned char *)pattern,wcp); + len = pg_wchar_strlen(wcp); +#else + + len = strlen((char *) pattern); +#endif } - else - len = strlen((char *) pattern); /* do the mallocs early so failure handling is easy */ g = (struct re_guts *) malloc(sizeof(struct re_guts) + @@ -227,7 +253,11 @@ int cflags; /* set things up */ p->g = g; - p->next = (char *) pattern; /* convenience; we do not modify it */ +#ifdef MB + p->next = wcp; +#else + p->next = pattern; /* convenience; we do not modify it */ +#endif p->end = p->next + len; p->error = 0; p->ncsalloc = 0; @@ -342,7 +372,7 @@ static void p_ere_exp(p) struct parse *p; { - char c; + pg_wchar c; sopno pos; int count; int count2; @@ -420,7 +450,7 @@ struct parse *p; break; case '{': /* okay as ordinary except if digit * follows */ - REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); + REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT); /* FALLTHROUGH */ default: ordinary(p, c); @@ -432,7 +462,7 @@ struct parse *p; c = PEEK(); /* we call { a repetition if followed by a digit */ if (!(c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit(PEEK2())))) + (c == '{' && MORE2() && pg_isdigit(PEEK2())))) return; /* no repetition, we're done */ NEXT(); @@ -463,7 +493,7 @@ struct parse *p; count = p_count(p); if (EAT(',')) { - if (isdigit(PEEK())) + if (pg_isdigit(PEEK())) { count2 = p_count(p); REQUIRE(count <= count2, REG_BADBR); @@ -490,7 +520,7 @@ struct parse *p; return; c = PEEK(); if (!(c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit(PEEK2())))) + (c == '{' && MORE2() && pg_isdigit(PEEK2())))) return; SETERROR(REG_BADRPT); } @@ -568,7 +598,7 @@ int starordinary; /* is a leading * an ordinary character? 
*/ int i; sopno subno; -#define BACKSL (1<<CHAR_BIT) +#define BACKSL (1<<24) pos = HERE(); /* repetion op, if any, covers from here */ @@ -577,7 +607,11 @@ int starordinary; /* is a leading * an ordinary character? */ if (c == '\\') { REQUIRE(MORE(), REG_EESCAPE); +#ifdef MB + c = BACKSL | (pg_wchar) GETNEXT(); +#else c = BACKSL | (unsigned char) GETNEXT(); +#endif } switch (c) { @@ -660,7 +694,7 @@ int starordinary; /* is a leading * an ordinary character? */ count = p_count(p); if (EAT(',')) { - if (MORE() && isdigit(PEEK())) + if (MORE() && pg_isdigit(PEEK())) { count2 = p_count(p); REQUIRE(count <= count2, REG_BADBR); @@ -698,7 +732,7 @@ struct parse *p; int count = 0; int ndigits = 0; - while (MORE() && isdigit(PEEK()) && count <= DUPMAX) + while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX) { count = count * 10 + (GETNEXT() - '0'); ndigits++; @@ -721,15 +755,27 @@ struct parse *p; { cset *cs = allocset(p); int invert = 0; +#ifdef MB + pg_wchar sp1[] = {'[', ':', '<', ':', ']', ']'}; + pg_wchar sp2[] = {'[', ':', '>', ':', ']', ']'}; +#endif /* Dept of Truly Sickening Special-Case Kludges */ +#ifdef MB + if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0) +#else if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) +#endif { EMIT(OBOW, 0); NEXTn(6); return; } +#ifdef MB + if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0) +#else if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) +#endif { EMIT(OEOW, 0); NEXTn(6); @@ -757,7 +803,7 @@ struct parse *p; int ci; for (i = p->g->csetsize - 1; i >= 0; i--) - if (CHIN(cs, i) && isalpha(i)) + if (CHIN(cs, i) && pg_isalpha(i)) { ci = othercase(i); if (ci != i) @@ -801,8 +847,8 @@ p_b_term(p, cs) struct parse *p; cset *cs; { - char c; - char start, + pg_wchar c; + pg_wchar start, finish; int i; @@ -857,6 +903,11 @@ cset *cs; finish = start; /* xxx what about signed chars here... 
*/ REQUIRE(start <= finish, REG_ERANGE); +#ifdef MB + if (CHlc(start) != CHlc(finish)) { + SETERROR(REG_ERANGE); + } +#endif for (i = start; i <= finish; i++) CHadd(cs, i); break; @@ -872,17 +923,21 @@ p_b_cclass(p, cs) struct parse *p; cset *cs; { - char *sp = p->next; + pg_wchar *sp = p->next; struct cclass *cp; size_t len; char *u; char c; - while (MORE() && isalpha(PEEK())) + while (MORE() && pg_isalpha(PEEK())) NEXT(); len = p->next - sp; for (cp = cclasses; cp->name != NULL; cp++) +#ifdef MB + if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#else if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#endif break; if (cp->name == NULL) { @@ -919,11 +974,11 @@ cset *cs; - p_b_symbol - parse a character or [..]ed multicharacter collating symbol == static char p_b_symbol(struct parse *p); */ -static char /* value of symbol */ +static pg_wchar /* value of symbol */ p_b_symbol(p) struct parse *p; { - char value; + pg_wchar value; REQUIRE(MORE(), REG_EBRACK); if (!EATTWO('[', '.')) @@ -944,7 +999,7 @@ p_b_coll_elem(p, endc) struct parse *p; int endc; /* name ended by endc,']' */ { - char *sp = p->next; + pg_wchar *sp = p->next; struct cname *cp; int len; @@ -957,7 +1012,11 @@ int endc; /* name ended by endc,']' */ } len = p->next - sp; for (cp = cnames; cp->name != NULL; cp++) +#ifdef MB + if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#else if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#endif return (cp->code); /* known name */ if (len == 1) return (*sp); /* single character */ @@ -973,10 +1032,10 @@ static char /* if no counterpart, return ch */ othercase(ch) int ch; { - assert(isalpha(ch)); - if (isupper(ch)) + assert(pg_isalpha(ch)); + if (pg_isupper(ch)) return (tolower(ch)); - else if (islower(ch)) + else if (pg_islower(ch)) return (toupper(ch)); else /* peculiar, but could happen */ @@ -994,9 +1053,9 @@ bothcases(p, ch) struct parse *p; int ch; { - char *oldnext = 
p->next; - char *oldend = p->end; - char bracket[3]; + pg_wchar *oldnext = p->next; + pg_wchar *oldend = p->end; + pg_wchar bracket[3]; assert(othercase(ch) != ch);/* p_bracket() would recurse */ p->next = bracket; @@ -1021,12 +1080,16 @@ int ch; { cat_t *cap = p->g->categories; - if ((p->g->cflags & REG_ICASE) && isalpha(ch) && othercase(ch) != ch) + if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch) bothcases(p, ch); else { +#ifdef MB + EMIT(OCHAR, (pg_wchar) ch); +#else EMIT(OCHAR, (unsigned char) ch); - if (cap[ch] == 0) +#endif + if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0) cap[ch] = p->g->ncategories++; } } @@ -1041,9 +1104,9 @@ static void nonnewline(p) struct parse *p; { - char *oldnext = p->next; - char *oldend = p->end; - char bracket[4]; + pg_wchar *oldnext = p->next; + pg_wchar *oldend = p->end; + pg_wchar bracket[4]; p->next = bracket; p->end = bracket + 3; @@ -1674,7 +1737,7 @@ struct re_guts *g; sop *newstart = 0; sopno newlen; sop s; - char *cp; + pg_wchar *cp; sopno i; /* avoid making error situations worse */ @@ -1729,7 +1792,11 @@ struct re_guts *g; return; /* turn it into a character string */ +#ifdef MB + g->must = (pg_wchar *)malloc((size_t) (g->mlen + 1)*sizeof(pg_wchar)); +#else g->must = malloc((size_t) g->mlen + 1); +#endif if (g->must == NULL) { /* argh; just forget it */ g->mlen = 0; @@ -1742,7 +1809,7 @@ struct re_guts *g; while (OP(s = *scan++) != OCHAR) continue; assert(cp < g->must + g->mlen); - *cp++ = (char) OPND(s); + *cp++ = (pg_wchar) OPND(s); } assert(cp == g->must + g->mlen); *cp++ = '\0'; /* just on general principles */ @@ -1785,3 +1852,42 @@ struct re_guts *g; g->iflags |= BAD; return (maxnest); } + +/* + * some ctype functions with none-ascii-char guard + */ +static int pg_isdigit(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && isdigit(c)); +#else + return(isdigit(c)); +#endif +} + +static int pg_isalpha(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && isalpha(c)); +#else + 
return(isalpha(c)); +#endif +} + +static int pg_isupper(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && isupper(c)); +#else + return(isupper(c)); +#endif +} + +static int pg_islower(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && islower(c)); +#else + return(islower(c)); +#endif +} |