aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/regex/Makefile2
-rw-r--r--src/backend/regex/README9
-rw-r--r--src/backend/regex/regexport.c292
-rw-r--r--src/backend/utils/adt/selfuncs.c12
-rw-r--r--src/include/mb/pg_wchar.h5
-rw-r--r--src/include/regex/regexport.h57
6 files changed, 373 insertions, 4 deletions
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile
index 74a4c0c89d8..a6100ad35db 100644
--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/regex
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
+OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o regexport.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/regex/README b/src/backend/regex/README
index c5d21e8c99d..29521c6636f 100644
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -7,8 +7,8 @@ So this file is an attempt to reverse-engineer some docs.
General source-file layout
--------------------------
-There are five separately-compilable source files, each exposing exactly
-one exported function:
+There are six separately-compilable source files, five of which expose
+exactly one exported function apiece:
regcomp.c: pg_regcomp
regexec.c: pg_regexec
regerror.c: pg_regerror
@@ -19,6 +19,9 @@ library version from any similar one that might be present on a particular
system. They'd need to be removed or replaced in any standalone version
of the library.)
+The sixth file, regexport.c, exposes multiple functions that allow extraction
+of info about a compiled regex (see regexport.h).
+
There are additional source files regc_*.c that are #include'd in regcomp,
and similarly additional source files rege_*.c that are #include'd in
regexec. This was done to avoid exposing internal symbols globally;
@@ -45,6 +48,7 @@ regexec.c Top-level regex execution code
rege_dfa.c DFA creation and execution
regerror.c pg_regerror: generate text for a regex error code
regfree.c pg_regfree: API to free a no-longer-needed regex_t
+regexport.c Functions for extracting info from a regex_t
regprefix.c Code for extracting a common prefix from a regex_t
The locale-specific code is concerned primarily with case-folding and with
@@ -56,6 +60,7 @@ The header files for the library are in src/include/regex/:
regcustom.h Customizes library for particular application
regerrs.h Error message list
regex.h Exported API
+regexport.h Exported API for regexport.c
regguts.h Internals declarations
diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c
new file mode 100644
index 00000000000..419565cf936
--- /dev/null
+++ b/src/backend/regex/regexport.c
@@ -0,0 +1,292 @@
+/*-------------------------------------------------------------------------
+ *
+ * regexport.c
+ * Functions for exporting info about a regex's NFA
+ *
+ * In this implementation, the NFA defines a necessary but not sufficient
+ * condition for a string to match the regex: that is, there can be strings
+ * that match the NFA but don't match the full regex, but not vice versa.
+ * Thus, for example, it is okay for the functions below to ignore lookahead
+ * constraints, which merely constrain the string some more.
+ *
+ * Notice that these functions return info into caller-provided arrays
+ * rather than doing their own malloc's. This simplifies the APIs by
+ * eliminating a class of error conditions, and in the case of colors
+ * allows the caller to decide how big is too big to bother with.
+ *
+ *
+ * Portions Copyright (c) 2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regexport.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+#include "regex/regexport.h"
+
+static void scancolormap(struct colormap * cm, int co,
+ union tree * t, int level, chr partial,
+ pg_wchar **chars, int *chars_len);
+
+
+/*
+ * Get total number of NFA states.
+ *
+ * regex: a compiled regex; it is only checked (via assert) to be non-NULL
+ * and to carry the REMAGIC magic number.
+ *
+ * Returns the nstates field of the "search" cnfa kept in the regex's guts.
+ * Nothing is modified.
+ */
+int
+pg_reg_getnumstates(const regex_t *regex)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search; /* same cnfa used by all state/arc accessors here */
+
+ return cnfa->nstates;
+}
+
+/*
+ * Get initial state of NFA.
+ *
+ * regex: a compiled regex; it is only checked (via assert) to be non-NULL
+ * and to carry the REMAGIC magic number.
+ *
+ * Returns the "pre" field of the "search" cnfa kept in the regex's guts,
+ * i.e. a state number.
+ */
+int
+pg_reg_getinitialstate(const regex_t *regex)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ return cnfa->pre;
+}
+
+/*
+ * Get final state of NFA.
+ *
+ * regex: a compiled regex; it is only checked (via assert) to be non-NULL
+ * and to carry the REMAGIC magic number.
+ *
+ * Returns the "post" field of the "search" cnfa kept in the regex's guts,
+ * i.e. a state number.
+ */
+int
+pg_reg_getfinalstate(const regex_t *regex)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ return cnfa->post;
+}
+
+/*
+ * Get number of outgoing NFA arcs of state number "st".
+ *
+ * Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs().
+ *
+ * regex: a compiled regex (checked only via assert).
+ * st: state number; any value outside 0 .. nstates-1 yields a result of 0
+ * rather than an error.
+ *
+ * The arc list of a state is walked until an entry whose color is COLORLESS
+ * is found; only arcs whose color is less than cnfa->ncolors are counted.
+ * (Presumably the arcs with color >= ncolors are exactly the LACON arcs
+ * mentioned above --- confirm against regguts.h.)
+ */
+int
+pg_reg_getnumoutarcs(const regex_t *regex, int st)
+{
+ struct cnfa *cnfa;
+ struct carc *ca;
+ int count;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (st < 0 || st >= cnfa->nstates)
+ return 0; /* invalid state number: report no arcs */
+ count = 0;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co < cnfa->ncolors) /* must match the filter in pg_reg_getoutarcs() */
+ count++;
+ }
+ return count;
+}
+
+/*
+ * Write array of outgoing NFA arcs of state number "st" into arcs[].
+ * The array's capacity, arcs_len, should be at least the count returned by
+ * pg_reg_getnumoutarcs(); if it is smaller, only the first arcs_len arcs
+ * are returned.
+ *
+ * regex: a compiled regex (checked only via assert).
+ * st: state number; if it is outside 0 .. nstates-1 nothing is written.
+ * arcs: caller-provided output array.
+ * arcs_len: number of entries available in arcs[]; if <= 0 nothing is
+ * written.
+ *
+ * No count is returned, and entries of arcs[] beyond those written are left
+ * untouched, so the caller must rely on pg_reg_getnumoutarcs() to know how
+ * many entries are valid. The same arc filter (color < cnfa->ncolors) is
+ * applied as in that function.
+ */
+void
+pg_reg_getoutarcs(const regex_t *regex, int st,
+ regex_arc_t *arcs, int arcs_len)
+{
+ struct cnfa *cnfa;
+ struct carc *ca;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
+ return;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co < cnfa->ncolors)
+ {
+ arcs->co = ca->co;
+ arcs->to = ca->to;
+ arcs++;
+ if (--arcs_len == 0) /* output array is full: stop early */
+ break;
+ }
+ }
+}
+
+/*
+ * Get total number of colors.
+ *
+ * regex: a compiled regex (checked only via assert).
+ *
+ * Returns cm->max + 1, where cm is the colormap kept in the regex's guts.
+ * The other functions in this file accept color numbers up to cm->max, so
+ * the result is one more than the highest color number they accept.
+ */
+int
+pg_reg_getnumcolors(const regex_t *regex)
+{
+ struct colormap *cm;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cm = &((struct guts *) regex->re_guts)->cmap;
+
+ return cm->max + 1;
+}
+
+/*
+ * Check if color is beginning of line/string.
+ *
+ * (We might at some point need to offer more refined handling of pseudocolors,
+ * but this will do for now.)
+ *
+ * regex: a compiled regex (checked only via assert).
+ * co: color number to test; it is not range-checked.
+ *
+ * Returns true if co equals either entry of the search cnfa's bos[] array,
+ * else false. The result does not say which of the two entries matched.
+ * (Presumably the two entries are the beginning-of-string and
+ * beginning-of-line pseudocolors --- confirm against regguts.h.)
+ */
+int
+pg_reg_colorisbegin(const regex_t *regex, int co)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (co == cnfa->bos[0] || co == cnfa->bos[1])
+ return true;
+ else
+ return false;
+}
+
+/*
+ * Check if color is end of line/string.
+ *
+ * regex: a compiled regex (checked only via assert).
+ * co: color number to test; it is not range-checked.
+ *
+ * Returns true if co equals either entry of the search cnfa's eos[] array,
+ * else false. The result does not say which of the two entries matched.
+ * (Presumably the two entries are the end-of-string and end-of-line
+ * pseudocolors --- confirm against regguts.h.)
+ */
+int
+pg_reg_colorisend(const regex_t *regex, int co)
+{
+ struct cnfa *cnfa;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cnfa = &((struct guts *) regex->re_guts)->search;
+
+ if (co == cnfa->eos[0] || co == cnfa->eos[1])
+ return true;
+ else
+ return false;
+}
+
+/*
+ * Get number of member chrs of color number "co".
+ *
+ * Note: we return -1 if the color number is invalid, or if it is a special
+ * color (WHITE or a pseudocolor), or if the number of members is uncertain.
+ * The last case cannot arise right now but is specified to allow for future
+ * improvements (see musings about run-time handling of higher character codes
+ * in regex/README). Callers should not try to extract the members if -1 is
+ * returned.
+ *
+ * regex: a compiled regex (checked only via assert).
+ * co: color number; valid values here are 1 .. cm->max.
+ *
+ * On success the result is the nchrs field of the color's descriptor, which
+ * is the array length a caller should pass to pg_reg_getcharacters().
+ */
+int
+pg_reg_getnumcharacters(const regex_t *regex, int co)
+{
+ struct colormap *cm;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cm = &((struct guts *) regex->re_guts)->cmap;
+
+ if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */
+ return -1;
+ if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */
+ return -1;
+
+ return cm->cd[co].nchrs;
+}
+
+/*
+ * Write array of member chrs of color number "co" into chars[].
+ * The array's capacity, chars_len, should be at least the count returned by
+ * pg_reg_getnumcharacters(); if it is smaller, only the first chars_len
+ * members found are returned.
+ *
+ * Fetching the members of WHITE or a pseudocolor is not supported.
+ *
+ * Caution: this is a relatively expensive operation.
+ *
+ * regex: a compiled regex (checked only via assert).
+ * co: color number; if it is <= 0, greater than cm->max, or flagged PSEUDO,
+ * the function returns without writing anything.
+ * chars: caller-provided output array.
+ * chars_len: number of entries available in chars[]; if <= 0 nothing is
+ * written.
+ *
+ * No count is returned and there is no error indication; the rejection
+ * tests mirror those for which pg_reg_getnumcharacters() returns -1.
+ * Members are stored in the order scancolormap() encounters them.
+ */
+void
+pg_reg_getcharacters(const regex_t *regex, int co,
+ pg_wchar *chars, int chars_len)
+{
+ struct colormap *cm;
+
+ assert(regex != NULL && regex->re_magic == REMAGIC);
+ cm = &((struct guts *) regex->re_guts)->cmap;
+
+ if (co <= 0 || co > cm->max || chars_len <= 0)
+ return;
+ if (cm->cd[co].flags & PSEUDO)
+ return;
+
+ /* Recursively search the colormap tree */
+ scancolormap(cm, co, cm->tree, 0, 0, &chars, &chars_len); /* start at level 0 with empty partial code */
+}
+
+/*
+ * Recursively scan the colormap tree to find chrs belonging to color "co".
+ * See regex/README for info about the tree structure.
+ *
+ * cm: colormap being scanned (used to recognize its fill blocks)
+ * co: color number whose member chrs are wanted
+ * t: tree block to scan
+ * level: level (from 0) of t
+ * partial: partial chr code for chrs within t
+ * chars, chars_len: output area
+ *
+ * chars and chars_len are passed by reference: each chr stored advances
+ * *chars by one entry and decrements *chars_len, so the caller's cursor and
+ * remaining-space count are shared across all recursion levels.
+ *
+ * Blocks at levels below NBYTS - 1 are treated as pointer blocks, and blocks
+ * at level NBYTS - 1 as leaf blocks holding colors. On the way down, each
+ * block index is OR'ed into the partial code, which is then shifted left by
+ * BYTBITS; at the leaf the index is OR'ed in without a further shift, which
+ * yields the complete chr code.
+ *
+ * Note that once the output area is full, matching chrs are silently
+ * skipped; the scan itself is not cut short.
+ */
+static void
+scancolormap(struct colormap * cm, int co,
+ union tree * t, int level, chr partial,
+ pg_wchar **chars, int *chars_len)
+{
+ int i;
+
+ if (level < NBYTS - 1)
+ {
+ /* non-leaf node */
+ for (i = 0; i < BYTTAB; i++)
+ {
+ /*
+ * We do not support search for chrs of color 0 (WHITE), so
+ * all-white subtrees need not be searched. These can be
+ * recognized because they are represented by the fill blocks in
+ * the colormap struct. This typically allows us to avoid
+ * scanning large regions of higher-numbered chrs.
+ */
+ if (t->tptr[i] == &cm->tree[level + 1]) /* pointer identity with the next level's fill block */
+ continue;
+
+ /* Recursively scan next level down */
+ scancolormap(cm, co,
+ t->tptr[i], level + 1,
+ (partial | (chr) i) << BYTBITS,
+ chars, chars_len);
+ }
+ }
+ else
+ {
+ /* leaf node */
+ for (i = 0; i < BYTTAB; i++)
+ {
+ if (t->tcolor[i] == co)
+ {
+ if (*chars_len > 0) /* store only while room remains */
+ {
+ **chars = partial | (chr) i;
+ (*chars)++;
+ (*chars_len)--;
+ }
+ }
+ }
+ }
+}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 72c2c30ad40..656d03b69aa 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6746,6 +6746,7 @@ gincost_pattern(IndexOptInfo *index, int indexcol,
GinQualCounts *counts)
{
Oid extractProcOid;
+ Oid collation;
int strategy_op;
Oid lefttype,
righttype;
@@ -6783,7 +6784,16 @@ gincost_pattern(IndexOptInfo *index, int indexcol,
get_rel_name(index->indexoid));
}
- OidFunctionCall7(extractProcOid,
+ /*
+ * Choose collation to pass to extractProc (should match initGinState).
+ */
+ if (OidIsValid(index->indexcollations[indexcol]))
+ collation = index->indexcollations[indexcol];
+ else
+ collation = DEFAULT_COLLATION_OID;
+
+ OidFunctionCall7Coll(extractProcOid,
+ collation,
query,
PointerGetDatum(&nentries),
UInt16GetDatum(strategy_op),
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 78ddaf92e33..92299c6f94c 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -25,6 +25,11 @@
typedef unsigned int pg_wchar;
/*
+ * Maximum byte length of multibyte characters in any backend encoding
+ */
+#define MAX_MULTIBYTE_CHAR_LEN 4
+
+/*
* various definitions for EUC
*/
#define SS2 0x8e /* single shift 2 (JIS0201) */
diff --git a/src/include/regex/regexport.h b/src/include/regex/regexport.h
new file mode 100644
index 00000000000..35889da294b
--- /dev/null
+++ b/src/include/regex/regexport.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * regexport.h
+ * Declarations for exporting info about a regex's NFA (nondeterministic
+ * finite automaton)
+ *
+ * The functions declared here provide accessors to extract the NFA state
+ * graph and color character sets of a successfully-compiled regex.
+ *
+ * An NFA contains one or more states, numbered 0..N-1. There is an initial
+ * state, as well as a final state --- reaching the final state denotes
+ * successful matching of an input string. Each state except the final one
+ * has some out-arcs that lead to successor states, each arc being labeled
+ * with a color that represents one or more concrete character codes.
+ * (The colors of a state's out-arcs need not be distinct, since this is an
+ * NFA not a DFA.) There are also "pseudocolors" representing start/end of
+ * line and start/end of string. Colors are numbered 0..C-1, but note that
+ * color 0 is "white" (all unused characters) and can generally be ignored.
+ *
+ * Portions Copyright (c) 2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/include/regex/regexport.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _REGEXPORT_H_
+#define _REGEXPORT_H_
+
+#include "regex/regex.h"
+
+/*
+ * Information about one arc of a regex's NFA. Arrays of these are filled
+ * in by pg_reg_getoutarcs(), one entry per out-arc of the requested state.
+ */
+typedef struct
+{
+ int co; /* label (character-set color) of arc */
+ int to; /* next state number */
+} regex_arc_t;
+
+
+/* Functions for gathering information about NFA states and arcs */
+extern int pg_reg_getnumstates(const regex_t *regex);
+extern int pg_reg_getinitialstate(const regex_t *regex);
+extern int pg_reg_getfinalstate(const regex_t *regex);
+extern int pg_reg_getnumoutarcs(const regex_t *regex, int st);
+extern void pg_reg_getoutarcs(const regex_t *regex, int st,
+ regex_arc_t *arcs, int arcs_len);
+
+/* Functions for gathering information about colors */
+extern int pg_reg_getnumcolors(const regex_t *regex);
+extern int pg_reg_colorisbegin(const regex_t *regex, int co);
+extern int pg_reg_colorisend(const regex_t *regex, int co);
+extern int pg_reg_getnumcharacters(const regex_t *regex, int co);
+extern void pg_reg_getcharacters(const regex_t *regex, int co,
+ pg_wchar *chars, int chars_len);
+
+#endif /* _REGEXPORT_H_ */