aboutsummaryrefslogtreecommitdiff
path: root/src/include/regex/regguts.h
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2016-09-05 17:06:29 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2016-09-05 17:06:29 -0400
commitc54159d44ceaba26ceda9fea1804f0de122a8f30 (patch)
treeeaab96027f054cf5ff864d0745e446e8b8e13544 /src/include/regex/regguts.h
parentf80049f76a32858601510eaaef19ab8160e4c9b3 (diff)
downloadpostgresql-c54159d44ceaba26ceda9fea1804f0de122a8f30.tar.gz
postgresql-c54159d44ceaba26ceda9fea1804f0de122a8f30.zip
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as being members of locale-dependent character classes such as [[:alpha:]]. (Actually, the same problem occurs for large pg_wchar values in any multibyte encoding, but UTF8 is the only case people have actually complained about.) It's impractical to get Spencer's original code to handle character classes or ranges containing many thousands of characters, because it insists on considering each member character individually at regex compile time, whether or not the character will ever be of interest at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which we process characters individually as before, and deal with entire ranges or classes as single entities above that. We can actually make things cheaper than before for chars below the cutoff, because the color map can now be a simple linear array for those chars, rather than the multilevel tree structure Spencer designed. It's more expensive than before for chars above the cutoff, because we must do a binary search in a list of high chars and char ranges used in the regex pattern, plus call iswalpha() and friends for each locale-dependent character class used in the pattern. However, multibyte encodings are normally designed to give smaller codes to popular characters, so that we can expect that the slow path will be taken relatively infrequently. In any case, the speed penalty appears minor except when we have to apply iswalpha() etc. to high character codes at runtime --- and the previous coding gave wrong answers for those cases, so whether it was faster is moot. Tom Lane, reviewed by Heikki Linnakangas Discussion: <15563.1471913698@sss.pgh.pa.us>
Diffstat (limited to 'src/include/regex/regguts.h')
-rw-r--r--src/include/regex/regguts.h141
1 files changed, 66 insertions, 75 deletions
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index b0aa641cc4f..69816f1449a 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -127,23 +127,6 @@
#define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
-
-/*
- * We dissect a chr into byts for colormap table indexing. Here we define
- * a byt, which will be the same as a byte on most machines... The exact
- * size of a byt is not critical, but about 8 bits is good, and extraction
- * of 8-bit chunks is sometimes especially fast.
- */
-#ifndef BYTBITS
-#define BYTBITS 8 /* bits in a byt */
-#endif
-#define BYTTAB (1<<BYTBITS) /* size of table with one entry per byt value */
-#define BYTMASK (BYTTAB-1) /* bit mask for byt */
-#define NBYTS ((CHRBITS+BYTBITS-1)/BYTBITS)
-/* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */
-
-
-
/*
* As soon as possible, we map chrs into equivalence classes -- "colors" --
* which are of much more manageable number.
@@ -153,43 +136,10 @@ typedef short color; /* colors of characters */
#define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */
#define COLORLESS (-1) /* impossible color */
#define WHITE 0 /* default color, parent of all others */
-
+/* Note: various places in the code know that WHITE is zero */
/*
- * A colormap is a tree -- more precisely, a DAG -- indexed at each level
- * by a byt of the chr, to map the chr to a color efficiently. Because
- * lower sections of the tree can be shared, it can exploit the usual
- * sparseness of such a mapping table. The tree is always NBYTS levels
- * deep (in the past it was shallower during construction but was "filled"
- * to full depth at the end of that); areas that are unaltered as yet point
- * to "fill blocks" which are entirely WHITE in color.
- *
- * Leaf-level tree blocks are of type "struct colors", while upper-level
- * blocks are of type "struct ptrs". Pointers into the tree are generally
- * declared as "union tree *" to be agnostic about what level they point to.
- */
-
-/* the tree itself */
-struct colors
-{
- color ccolor[BYTTAB];
-};
-struct ptrs
-{
- union tree *pptr[BYTTAB];
-};
-union tree
-{
- struct colors colors;
- struct ptrs ptrs;
-};
-
-/* use these pseudo-field names when dereferencing a "union tree" pointer */
-#define tcolor colors.ccolor
-#define tptr ptrs.pptr
-
-/*
* Per-color data structure for the compile-time color machinery
*
* If "sub" is not NOSUB then it is the number of the color's current
@@ -203,26 +153,56 @@ union tree
*/
struct colordesc
{
- uchr nchrs; /* number of chars of this color */
+ int nschrs; /* number of simple chars of this color */
+ int nuchrs; /* number of upper map entries of this color */
color sub; /* open subcolor, if any; or free-chain ptr */
#define NOSUB COLORLESS /* value of "sub" when no open subcolor */
struct arc *arcs; /* chain of all arcs of this color */
- chr firstchr; /* char first assigned to this color */
+ chr firstchr; /* simple char first assigned to this color */
int flags; /* bit values defined next */
#define FREECOL 01 /* currently free */
#define PSEUDO 02 /* pseudocolor, no real chars */
#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
- union tree *block; /* block of solid color, if any */
};
/*
* The color map itself
*
- * Much of the data in the colormap struct is only used at compile time.
- * However, the bulk of the space usage is in the "tree" structure, so it's
- * not clear that there's much point in converting the rest to a more compact
- * form when compilation is finished.
+ * This struct holds both data used only at compile time, and the chr to
+ * color mapping information, used at both compile and run time. The latter
+ * is the bulk of the space, so it's not really worth separating out the
+ * compile-only portion.
+ *
+ * Ideally, the mapping data would just be an array of colors indexed by
+ * chr codes; but for large character sets that's impractical. Fortunately,
+ * common characters have smaller codes, so we can use a simple array for chr
+ * codes up to MAX_SIMPLE_CHR, and do something more complex for codes above
+ * that, without much loss of performance. The "something more complex" is a
+ * 2-D array of color entries, where row indexes correspond to individual chrs
+ * or chr ranges that have been mentioned in the regex (with row zero
+ * representing all other chrs), and column indexes correspond to different
+ * sets of locale-dependent character classes such as "isalpha". The
+ * classbits[k] entry is zero if we do not care about the k'th character class
+ * in this regex, and otherwise it is the bit to be OR'd into the column index
+ * if the character in question is a member of that class. We find the color
+ * of a high-valued chr by identifying which colormaprange it is in to get
+ * the row index (use row zero if it's in none of them), identifying which of
+ * the interesting cclasses it's in to get the column index, and then indexing
+ * into the 2-D hicolormap array.
+ *
+ * The colormapranges are required to be nonempty, nonoverlapping, and to
+ * appear in increasing chr-value order.
*/
+
+#define NUM_CCLASSES 13 /* must match data in regc_locale.c */
+
+typedef struct colormaprange
+{
+ chr cmin; /* range represents cmin..cmax inclusive */
+ chr cmax;
+ int rownum; /* row index in hicolormap array (>= 1) */
+} colormaprange;
+
struct colormap
{
int magic;
@@ -233,27 +213,27 @@ struct colormap
color free; /* beginning of free chain (if non-0) */
struct colordesc *cd; /* pointer to array of colordescs */
#define CDEND(cm) (&(cm)->cd[(cm)->max + 1])
+
+ /* mapping data for chrs <= MAX_SIMPLE_CHR: */
+ color *locolormap; /* simple array indexed by chr code */
+
+ /* mapping data for chrs > MAX_SIMPLE_CHR: */
+ int classbits[NUM_CCLASSES]; /* see comment above */
+ int numcmranges; /* number of colormapranges */
+ colormaprange *cmranges; /* ranges of high chrs */
+ color *hicolormap; /* 2-D array of color entries */
+ int maxarrayrows; /* number of array rows allocated */
+ int hiarrayrows; /* number of array rows in use */
+ int hiarraycols; /* number of array columns (2^N) */
+
/* If we need up to NINLINECDS, we store them here to save a malloc */
-#define NINLINECDS ((size_t)10)
+#define NINLINECDS ((size_t) 10)
struct colordesc cdspace[NINLINECDS];
- union tree tree[NBYTS]; /* tree top, plus lower-level fill blocks */
};
-/* optimization magic to do fast chr->color mapping */
-#define B0(c) ((c) & BYTMASK)
-#define B1(c) (((c)>>BYTBITS) & BYTMASK)
-#define B2(c) (((c)>>(2*BYTBITS)) & BYTMASK)
-#define B3(c) (((c)>>(3*BYTBITS)) & BYTMASK)
-#if NBYTS == 1
-#define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)])
-#endif
-/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */
-#if NBYTS == 2
-#define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)])
-#endif
-#if NBYTS == 4
-#define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)])
-#endif
+/* fetch color for chr; beware of multiple evaluation of c argument */
+#define GETCOLOR(cm, c) \
+ ((c) <= MAX_SIMPLE_CHR ? (cm)->locolormap[(c) - CHR_MIN] : pg_reg_getcolor(cm, c))
/*
@@ -264,6 +244,11 @@ struct colormap
* Representation of a set of characters. chrs[] represents individual
* code points, ranges[] represents ranges in the form min..max inclusive.
*
+ * If the cvec represents a locale-specific character class, eg [[:alpha:]],
+ * then the chrs[] and ranges[] arrays contain only members of that class
+ * up to MAX_SIMPLE_CHR (inclusive). cclasscode is set to regc_locale.c's
+ * code for the class, rather than being -1 as it is in an ordinary cvec.
+ *
* Note that in cvecs gotten from newcvec() and intended to be freed by
* freecvec(), both arrays of chrs are after the end of the struct, not
* separately malloc'd; so chrspace and rangespace are effectively immutable.
@@ -276,6 +261,7 @@ struct cvec
int nranges; /* number of ranges (chr pairs) */
int rangespace; /* number of ranges allocated in ranges[] */
chr *ranges; /* pointer to vector of chr pairs */
+ int cclasscode; /* value of "enum classes", or -1 */
};
@@ -489,3 +475,8 @@ struct guts
int nlacons; /* size of lacons[]; note that only slots
* numbered 1 .. nlacons-1 are used */
};
+
+
+/* prototypes for functions that are exported from regcomp.c to regexec.c */
+extern void pg_set_regex_collation(Oid collation);
+extern color pg_reg_getcolor(struct colormap * cm, chr c);