diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/regex/Makefile | 2
-rw-r--r-- | src/backend/regex/README | 9
-rw-r--r-- | src/backend/regex/regexport.c | 292
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 12
-rw-r--r-- | src/include/mb/pg_wchar.h | 5
-rw-r--r-- | src/include/regex/regexport.h | 57
6 files changed, 373 insertions, 4 deletions
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile index 74a4c0c89d8..a6100ad35db 100644 --- a/src/backend/regex/Makefile +++ b/src/backend/regex/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/regex top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o +OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o regexport.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/regex/README b/src/backend/regex/README index c5d21e8c99d..29521c6636f 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -7,8 +7,8 @@ So this file is an attempt to reverse-engineer some docs. General source-file layout -------------------------- -There are five separately-compilable source files, each exposing exactly -one exported function: +There are six separately-compilable source files, five of which expose +exactly one exported function apiece: regcomp.c: pg_regcomp regexec.c: pg_regexec regerror.c: pg_regerror @@ -19,6 +19,9 @@ library version from any similar one that might be present on a particular system. They'd need to be removed or replaced in any standalone version of the library.) +The sixth file, regexport.c, exposes multiple functions that allow extraction +of info about a compiled regex (see regexport.h). + There are additional source files regc_*.c that are #include'd in regcomp, and similarly additional source files rege_*.c that are #include'd in regexec. 
This was done to avoid exposing internal symbols globally; @@ -45,6 +48,7 @@ regexec.c Top-level regex execution code rege_dfa.c DFA creation and execution regerror.c pg_regerror: generate text for a regex error code regfree.c pg_regfree: API to free a no-longer-needed regex_t +regexport.c Functions for extracting info from a regex_t regprefix.c Code for extracting a common prefix from a regex_t The locale-specific code is concerned primarily with case-folding and with @@ -56,6 +60,7 @@ The header files for the library are in src/include/regex/: regcustom.h Customizes library for particular application regerrs.h Error message list regex.h Exported API +regexport.h Exported API for regexport.c regguts.h Internals declarations diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c new file mode 100644 index 00000000000..419565cf936 --- /dev/null +++ b/src/backend/regex/regexport.c @@ -0,0 +1,292 @@ +/*------------------------------------------------------------------------- + * + * regexport.c + * Functions for exporting info about a regex's NFA + * + * In this implementation, the NFA defines a necessary but not sufficient + * condition for a string to match the regex: that is, there can be strings + * that match the NFA but don't match the full regex, but not vice versa. + * Thus, for example, it is okay for the functions below to ignore lookahead + * constraints, which merely constrain the string some more. + * + * Notice that these functions return info into caller-provided arrays + * rather than doing their own malloc's. This simplifies the APIs by + * eliminating a class of error conditions, and in the case of colors + * allows the caller to decide how big is too big to bother with. 
+ * + * + * Portions Copyright (c) 2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regexport.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + +#include "regex/regexport.h" + +static void scancolormap(struct colormap * cm, int co, + union tree * t, int level, chr partial, + pg_wchar **chars, int *chars_len); + + +/* + * Get total number of NFA states. + */ +int +pg_reg_getnumstates(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->nstates; +} + +/* + * Get initial state of NFA. + */ +int +pg_reg_getinitialstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->pre; +} + +/* + * Get final state of NFA. + */ +int +pg_reg_getfinalstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->post; +} + +/* + * Get number of outgoing NFA arcs of state number "st". + * + * Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs(). + */ +int +pg_reg_getnumoutarcs(const regex_t *regex, int st) +{ + struct cnfa *cnfa; + struct carc *ca; + int count; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates) + return 0; + count = 0; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + count++; + } + return count; +} + +/* + * Write array of outgoing NFA arcs of state number "st" into arcs[], + * whose length arcs_len must be at least as long as indicated by + * pg_reg_getnumoutarcs(), else not all arcs will be returned. 
+ */ +void +pg_reg_getoutarcs(const regex_t *regex, int st, + regex_arc_t *arcs, int arcs_len) +{ + struct cnfa *cnfa; + struct carc *ca; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates || arcs_len <= 0) + return; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + { + arcs->co = ca->co; + arcs->to = ca->to; + arcs++; + if (--arcs_len == 0) + break; + } + } +} + +/* + * Get total number of colors. + */ +int +pg_reg_getnumcolors(const regex_t *regex) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + return cm->max + 1; +} + +/* + * Check if color is beginning of line/string. + * + * (We might at some point need to offer more refined handling of pseudocolors, + * but this will do for now.) + */ +int +pg_reg_colorisbegin(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->bos[0] || co == cnfa->bos[1]) + return true; + else + return false; +} + +/* + * Check if color is end of line/string. + */ +int +pg_reg_colorisend(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->eos[0] || co == cnfa->eos[1]) + return true; + else + return false; +} + +/* + * Get number of member chrs of color number "co". + * + * Note: we return -1 if the color number is invalid, or if it is a special + * color (WHITE or a pseudocolor), or if the number of members is uncertain. + * The latter case cannot arise right now but is specified to allow for future + * improvements (see musings about run-time handling of higher character codes + * in regex/README). 
Callers should not try to extract the members if -1 is + * returned. + */ +int +pg_reg_getnumcharacters(const regex_t *regex, int co) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */ + return -1; + if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */ + return -1; + + return cm->cd[co].nchrs; +} + +/* + * Write array of member chrs of color number "co" into chars[], + * whose length chars_len must be at least as long as indicated by + * pg_reg_getnumcharacters(), else not all chars will be returned. + * + * Fetching the members of WHITE or a pseudocolor is not supported. + * + * Caution: this is a relatively expensive operation. + */ +void +pg_reg_getcharacters(const regex_t *regex, int co, + pg_wchar *chars, int chars_len) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max || chars_len <= 0) + return; + if (cm->cd[co].flags & PSEUDO) + return; + + /* Recursively search the colormap tree */ + scancolormap(cm, co, cm->tree, 0, 0, &chars, &chars_len); +} + +/* + * Recursively scan the colormap tree to find chrs belonging to color "co". + * See regex/README for info about the tree structure. + * + * t: tree block to scan + * level: level (from 0) of t + * partial: partial chr code for chrs within t + * chars, chars_len: output area + */ +static void +scancolormap(struct colormap * cm, int co, + union tree * t, int level, chr partial, + pg_wchar **chars, int *chars_len) +{ + int i; + + if (level < NBYTS - 1) + { + /* non-leaf node */ + for (i = 0; i < BYTTAB; i++) + { + /* + * We do not support search for chrs of color 0 (WHITE), so + * all-white subtrees need not be searched. These can be + * recognized because they are represented by the fill blocks in + * the colormap struct. 
This typically allows us to avoid + * scanning large regions of higher-numbered chrs. + */ + if (t->tptr[i] == &cm->tree[level + 1]) + continue; + + /* Recursively scan next level down */ + scancolormap(cm, co, + t->tptr[i], level + 1, + (partial | (chr) i) << BYTBITS, + chars, chars_len); + } + } + else + { + /* leaf node */ + for (i = 0; i < BYTTAB; i++) + { + if (t->tcolor[i] == co) + { + if (*chars_len > 0) + { + **chars = partial | (chr) i; + (*chars)++; + (*chars_len)--; + } + } + } + } +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 72c2c30ad40..656d03b69aa 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6746,6 +6746,7 @@ gincost_pattern(IndexOptInfo *index, int indexcol, GinQualCounts *counts) { Oid extractProcOid; + Oid collation; int strategy_op; Oid lefttype, righttype; @@ -6783,7 +6784,16 @@ gincost_pattern(IndexOptInfo *index, int indexcol, get_rel_name(index->indexoid)); } - OidFunctionCall7(extractProcOid, + /* + * Choose collation to pass to extractProc (should match initGinState). 
+ */ + if (OidIsValid(index->indexcollations[indexcol])) + collation = index->indexcollations[indexcol]; + else + collation = DEFAULT_COLLATION_OID; + + OidFunctionCall7Coll(extractProcOid, + collation, query, PointerGetDatum(&nentries), UInt16GetDatum(strategy_op), diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 78ddaf92e33..92299c6f94c 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -25,6 +25,11 @@ typedef unsigned int pg_wchar; /* + * Maximum byte length of multibyte characters in any backend encoding + */ +#define MAX_MULTIBYTE_CHAR_LEN 4 + +/* * various definitions for EUC */ #define SS2 0x8e /* single shift 2 (JIS0201) */ diff --git a/src/include/regex/regexport.h b/src/include/regex/regexport.h new file mode 100644 index 00000000000..35889da294b --- /dev/null +++ b/src/include/regex/regexport.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * regexport.h + * Declarations for exporting info about a regex's NFA (nondeterministic + * finite automaton) + * + * The functions declared here provide accessors to extract the NFA state + * graph and color character sets of a successfully-compiled regex. + * + * An NFA contains one or more states, numbered 0..N-1. There is an initial + * state, as well as a final state --- reaching the final state denotes + * successful matching of an input string. Each state except the final one + * has some out-arcs that lead to successor states, each arc being labeled + * with a color that represents one or more concrete character codes. + * (The colors of a state's out-arcs need not be distinct, since this is an + * NFA not a DFA.) There are also "pseudocolors" representing start/end of + * line and start/end of string. Colors are numbered 0..C-1, but note that + * color 0 is "white" (all unused characters) and can generally be ignored. 
+ * + * Portions Copyright (c) 2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/include/regex/regexport.h + * + *------------------------------------------------------------------------- + */ +#ifndef _REGEXPORT_H_ +#define _REGEXPORT_H_ + +#include "regex/regex.h" + +/* information about one arc of a regex's NFA */ +typedef struct +{ + int co; /* label (character-set color) of arc */ + int to; /* next state number */ +} regex_arc_t; + + +/* Functions for gathering information about NFA states and arcs */ +extern int pg_reg_getnumstates(const regex_t *regex); +extern int pg_reg_getinitialstate(const regex_t *regex); +extern int pg_reg_getfinalstate(const regex_t *regex); +extern int pg_reg_getnumoutarcs(const regex_t *regex, int st); +extern void pg_reg_getoutarcs(const regex_t *regex, int st, + regex_arc_t *arcs, int arcs_len); + +/* Functions for gathering information about colors */ +extern int pg_reg_getnumcolors(const regex_t *regex); +extern int pg_reg_colorisbegin(const regex_t *regex, int co); +extern int pg_reg_colorisend(const regex_t *regex, int co); +extern int pg_reg_getnumcharacters(const regex_t *regex, int co); +extern void pg_reg_getcharacters(const regex_t *regex, int co, + pg_wchar *chars, int chars_len); + +#endif /* _REGEXPORT_H_ */ |