aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regcomp.c
diff options
context:
space:
mode:
authorMarc G. Fournier <scrappy@hub.org>1998-03-15 07:39:04 +0000
committerMarc G. Fournier <scrappy@hub.org>1998-03-15 07:39:04 +0000
commit661ecf3c48e16a9add216287eb969d7615e47968 (patch)
tree91b54d5905aa2e22bd0ae9ea8c6b0f3cab75d3f4 /src/backend/regex/regcomp.c
parent31a925c4d07675bc098a742ee9ca642ec79a40ee (diff)
downloadpostgresql-661ecf3c48e16a9add216287eb969d7615e47968.tar.gz
postgresql-661ecf3c48e16a9add216287eb969d7615e47968.zip
From: t-ishii@sra.co.jp
Included are patches intended to allow PostgreSQL to handle multi-byte character sets such as EUC (Extended Unix Code), Unicode and Mule internal code. With the MB patch you can use multi-byte character sets in regexp and LIKE. The encoding system is chosen at compile time. To enable the MB extension, you need to define a variable "MB" in Makefile.global or in Makefile.custom. For further information please take a look at README.mb under the doc directory. (Note that, unlike the "jp patch", I no longer use a modified GNU regexp. Instead I changed the Henry Spencer regexp that comes with PostgreSQL.)
Diffstat (limited to 'src/backend/regex/regcomp.c')
-rw-r--r--src/backend/regex/regcomp.c178
1 files changed, 142 insertions, 36 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index e31f8654049..6b7c472f1b9 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -62,8 +62,8 @@ static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
*/
struct parse
{
- char *next; /* next character in RE */
- char *end; /* end of string (-> NUL normally) */
+ pg_wchar *next; /* next character in RE */
+ pg_wchar *end; /* end of string (-> NUL normally) */
int error; /* has an error been seen? */
sop *strip; /* malloced strip */
sopno ssize; /* malloced strip size (allocated) */
@@ -93,7 +93,7 @@ extern "C"
static void p_b_term(struct parse * p, cset *cs);
static void p_b_cclass(struct parse * p, cset *cs);
static void p_b_eclass(struct parse * p, cset *cs);
- static char p_b_symbol(struct parse * p);
+ static pg_wchar p_b_symbol(struct parse * p);
static char p_b_coll_elem(struct parse * p, int endc);
static char othercase(int ch);
static void bothcases(struct parse * p, int ch);
@@ -120,6 +120,10 @@ extern "C"
static void stripsnug(struct parse * p, struct re_guts * g);
static void findmust(struct parse * p, struct re_guts * g);
static sopno pluscount(struct parse * p, struct re_guts * g);
+ static int pg_isdigit(int c);
+ static int pg_isalpha(int c);
+ static int pg_isupper(int c);
+ static int pg_islower(int c);
#ifdef __cplusplus
}
@@ -127,7 +131,7 @@ extern "C"
#endif
/* ========= end header generated by ./mkh ========= */
-static char nuls[10]; /* place to point scanner in event of
+static pg_wchar nuls[10]; /* place to point scanner in event of
* error */
/*
@@ -190,6 +194,9 @@ int cflags;
struct parse *p = &pa;
int i;
size_t len;
+#ifdef MB
+ pg_wchar *wcp;
+#endif
#ifdef REDEBUG
#define GOODFLAGS(f) (f)
@@ -203,12 +210,31 @@ int cflags;
if (cflags & REG_PEND)
{
+#ifdef MB
+ wcp = preg->patsave;
+ if (preg->re_endp < wcp)
+ return (REG_INVARG);
+ len = preg->re_endp - wcp;
+#else
if (preg->re_endp < pattern)
return (REG_INVARG);
len = preg->re_endp - pattern;
+#endif
+ }
+ else {
+#ifdef MB
+ wcp = (pg_wchar *)malloc((strlen(pattern)+1) * sizeof(pg_wchar));
+ if (wcp == NULL) {
+ return (REG_ESPACE);
+ }
+ preg->patsave = wcp;
+ (void)pg_mb2wchar((unsigned char *)pattern,wcp);
+ len = pg_wchar_strlen(wcp);
+#else
+
+ len = strlen((char *) pattern);
+#endif
}
- else
- len = strlen((char *) pattern);
/* do the mallocs early so failure handling is easy */
g = (struct re_guts *) malloc(sizeof(struct re_guts) +
@@ -227,7 +253,11 @@ int cflags;
/* set things up */
p->g = g;
- p->next = (char *) pattern; /* convenience; we do not modify it */
+#ifdef MB
+ p->next = wcp;
+#else
+ p->next = pattern; /* convenience; we do not modify it */
+#endif
p->end = p->next + len;
p->error = 0;
p->ncsalloc = 0;
@@ -342,7 +372,7 @@ static void
p_ere_exp(p)
struct parse *p;
{
- char c;
+ pg_wchar c;
sopno pos;
int count;
int count2;
@@ -420,7 +450,7 @@ struct parse *p;
break;
case '{': /* okay as ordinary except if digit
* follows */
- REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
+ REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT);
/* FALLTHROUGH */
default:
ordinary(p, c);
@@ -432,7 +462,7 @@ struct parse *p;
c = PEEK();
/* we call { a repetition if followed by a digit */
if (!(c == '*' || c == '+' || c == '?' ||
- (c == '{' && MORE2() && isdigit(PEEK2()))))
+ (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
return; /* no repetition, we're done */
NEXT();
@@ -463,7 +493,7 @@ struct parse *p;
count = p_count(p);
if (EAT(','))
{
- if (isdigit(PEEK()))
+ if (pg_isdigit(PEEK()))
{
count2 = p_count(p);
REQUIRE(count <= count2, REG_BADBR);
@@ -490,7 +520,7 @@ struct parse *p;
return;
c = PEEK();
if (!(c == '*' || c == '+' || c == '?' ||
- (c == '{' && MORE2() && isdigit(PEEK2()))))
+ (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
return;
SETERROR(REG_BADRPT);
}
@@ -568,7 +598,7 @@ int starordinary; /* is a leading * an ordinary character? */
int i;
sopno subno;
-#define BACKSL (1<<CHAR_BIT)
+#define BACKSL (1<<24)
pos = HERE(); /* repetion op, if any, covers from here */
@@ -577,7 +607,11 @@ int starordinary; /* is a leading * an ordinary character? */
if (c == '\\')
{
REQUIRE(MORE(), REG_EESCAPE);
+#ifdef MB
+ c = BACKSL | (pg_wchar) GETNEXT();
+#else
c = BACKSL | (unsigned char) GETNEXT();
+#endif
}
switch (c)
{
@@ -660,7 +694,7 @@ int starordinary; /* is a leading * an ordinary character? */
count = p_count(p);
if (EAT(','))
{
- if (MORE() && isdigit(PEEK()))
+ if (MORE() && pg_isdigit(PEEK()))
{
count2 = p_count(p);
REQUIRE(count <= count2, REG_BADBR);
@@ -698,7 +732,7 @@ struct parse *p;
int count = 0;
int ndigits = 0;
- while (MORE() && isdigit(PEEK()) && count <= DUPMAX)
+ while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX)
{
count = count * 10 + (GETNEXT() - '0');
ndigits++;
@@ -721,15 +755,27 @@ struct parse *p;
{
cset *cs = allocset(p);
int invert = 0;
+#ifdef MB
+ pg_wchar sp1[] = {'[', ':', '<', ':', ']', ']'};
+ pg_wchar sp2[] = {'[', ':', '>', ':', ']', ']'};
+#endif
/* Dept of Truly Sickening Special-Case Kludges */
+#ifdef MB
+ if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0)
+#else
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0)
+#endif
{
EMIT(OBOW, 0);
NEXTn(6);
return;
}
+#ifdef MB
+ if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0)
+#else
if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0)
+#endif
{
EMIT(OEOW, 0);
NEXTn(6);
@@ -757,7 +803,7 @@ struct parse *p;
int ci;
for (i = p->g->csetsize - 1; i >= 0; i--)
- if (CHIN(cs, i) && isalpha(i))
+ if (CHIN(cs, i) && pg_isalpha(i))
{
ci = othercase(i);
if (ci != i)
@@ -801,8 +847,8 @@ p_b_term(p, cs)
struct parse *p;
cset *cs;
{
- char c;
- char start,
+ pg_wchar c;
+ pg_wchar start,
finish;
int i;
@@ -857,6 +903,11 @@ cset *cs;
finish = start;
/* xxx what about signed chars here... */
REQUIRE(start <= finish, REG_ERANGE);
+#ifdef MB
+ if (CHlc(start) != CHlc(finish)) {
+ SETERROR(REG_ERANGE);
+ }
+#endif
for (i = start; i <= finish; i++)
CHadd(cs, i);
break;
@@ -872,17 +923,21 @@ p_b_cclass(p, cs)
struct parse *p;
cset *cs;
{
- char *sp = p->next;
+ pg_wchar *sp = p->next;
struct cclass *cp;
size_t len;
char *u;
char c;
- while (MORE() && isalpha(PEEK()))
+ while (MORE() && pg_isalpha(PEEK()))
NEXT();
len = p->next - sp;
for (cp = cclasses; cp->name != NULL; cp++)
+#ifdef MB
+ if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#else
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#endif
break;
if (cp->name == NULL)
{
@@ -919,11 +974,11 @@ cset *cs;
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
== static char p_b_symbol(struct parse *p);
*/
-static char /* value of symbol */
+static pg_wchar /* value of symbol */
p_b_symbol(p)
struct parse *p;
{
- char value;
+ pg_wchar value;
REQUIRE(MORE(), REG_EBRACK);
if (!EATTWO('[', '.'))
@@ -944,7 +999,7 @@ p_b_coll_elem(p, endc)
struct parse *p;
int endc; /* name ended by endc,']' */
{
- char *sp = p->next;
+ pg_wchar *sp = p->next;
struct cname *cp;
int len;
@@ -957,7 +1012,11 @@ int endc; /* name ended by endc,']' */
}
len = p->next - sp;
for (cp = cnames; cp->name != NULL; cp++)
+#ifdef MB
+ if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#else
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#endif
return (cp->code); /* known name */
if (len == 1)
return (*sp); /* single character */
@@ -973,10 +1032,10 @@ static char /* if no counterpart, return ch */
othercase(ch)
int ch;
{
- assert(isalpha(ch));
- if (isupper(ch))
+ assert(pg_isalpha(ch));
+ if (pg_isupper(ch))
return (tolower(ch));
- else if (islower(ch))
+ else if (pg_islower(ch))
return (toupper(ch));
else
/* peculiar, but could happen */
@@ -994,9 +1053,9 @@ bothcases(p, ch)
struct parse *p;
int ch;
{
- char *oldnext = p->next;
- char *oldend = p->end;
- char bracket[3];
+ pg_wchar *oldnext = p->next;
+ pg_wchar *oldend = p->end;
+ pg_wchar bracket[3];
assert(othercase(ch) != ch);/* p_bracket() would recurse */
p->next = bracket;
@@ -1021,12 +1080,16 @@ int ch;
{
cat_t *cap = p->g->categories;
- if ((p->g->cflags & REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
+ if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
bothcases(p, ch);
else
{
+#ifdef MB
+ EMIT(OCHAR, (pg_wchar) ch);
+#else
EMIT(OCHAR, (unsigned char) ch);
- if (cap[ch] == 0)
+#endif
+ if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0)
cap[ch] = p->g->ncategories++;
}
}
@@ -1041,9 +1104,9 @@ static void
nonnewline(p)
struct parse *p;
{
- char *oldnext = p->next;
- char *oldend = p->end;
- char bracket[4];
+ pg_wchar *oldnext = p->next;
+ pg_wchar *oldend = p->end;
+ pg_wchar bracket[4];
p->next = bracket;
p->end = bracket + 3;
@@ -1674,7 +1737,7 @@ struct re_guts *g;
sop *newstart = 0;
sopno newlen;
sop s;
- char *cp;
+ pg_wchar *cp;
sopno i;
/* avoid making error situations worse */
@@ -1729,7 +1792,11 @@ struct re_guts *g;
return;
/* turn it into a character string */
+#ifdef MB
+ g->must = (pg_wchar *)malloc((size_t) (g->mlen + 1)*sizeof(pg_wchar));
+#else
g->must = malloc((size_t) g->mlen + 1);
+#endif
if (g->must == NULL)
{ /* argh; just forget it */
g->mlen = 0;
@@ -1742,7 +1809,7 @@ struct re_guts *g;
while (OP(s = *scan++) != OCHAR)
continue;
assert(cp < g->must + g->mlen);
- *cp++ = (char) OPND(s);
+ *cp++ = (pg_wchar) OPND(s);
}
assert(cp == g->must + g->mlen);
*cp++ = '\0'; /* just on general principles */
@@ -1785,3 +1852,42 @@ struct re_guts *g;
g->iflags |= BAD;
return (maxnest);
}
+
+/*
+ * some ctype functions with a non-ASCII-character guard
+ */
+static int pg_isdigit(int c) /* isdigit() wrapper taking an int character code; returns nonzero when c is a digit */
+{
+#ifdef MB
+ return(c >= 0 && c <= UCHAR_MAX && isdigit(c)); /* MB build: codes outside 0..UCHAR_MAX yield 0 and, by short-circuit, never reach isdigit() */
+#else
+ return(isdigit(c)); /* non-MB build: c is handed to isdigit() unchanged */
+#endif
+}
+
+static int pg_isalpha(int c) /* isalpha() wrapper taking an int character code; returns nonzero when c is alphabetic */
+{
+#ifdef MB
+ return(c >= 0 && c <= UCHAR_MAX && isalpha(c)); /* MB build: codes outside 0..UCHAR_MAX yield 0 and, by short-circuit, never reach isalpha() */
+#else
+ return(isalpha(c)); /* non-MB build: c is handed to isalpha() unchanged */
+#endif
+}
+
+static int pg_isupper(int c) /* isupper() wrapper taking an int character code; returns nonzero when c is upper case */
+{
+#ifdef MB
+ return(c >= 0 && c <= UCHAR_MAX && isupper(c)); /* MB build: codes outside 0..UCHAR_MAX yield 0 and, by short-circuit, never reach isupper() */
+#else
+ return(isupper(c)); /* non-MB build: c is handed to isupper() unchanged */
+#endif
+}
+
+static int pg_islower(int c) /* islower() wrapper taking an int character code; returns nonzero when c is lower case */
+{
+#ifdef MB
+ return(c >= 0 && c <= UCHAR_MAX && islower(c)); /* MB build: codes outside 0..UCHAR_MAX yield 0 and, by short-circuit, never reach islower() */
+#else
+ return(islower(c)); /* non-MB build: c is handed to islower() unchanged */
+#endif
+}