aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/pg_trgm/expected/pg_trgm.out19
-rw-r--r--contrib/pg_trgm/sql/pg_trgm.sql3
-rw-r--r--contrib/pg_trgm/trgm_regexp.c156
3 files changed, 136 insertions, 42 deletions
diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out
index 6b2fa7eb496..c3304b0ceb9 100644
--- a/contrib/pg_trgm/expected/pg_trgm.out
+++ b/contrib/pg_trgm/expected/pg_trgm.out
@@ -3497,6 +3497,7 @@ create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
+insert into test2 values ('/123/-45/');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
explain (costs off)
@@ -3598,7 +3599,8 @@ select * from test2 where t ~ '(abc)*$';
abcdef
quark
z foo bar
-(3 rows)
+ /123/-45/
+(4 rows)
select * from test2 where t ~* 'DEF';
t
@@ -3690,6 +3692,12 @@ select * from test2 where t ~ 'qua(?!foo)';
quark
(1 row)
+select * from test2 where t ~ '/\d+/-\d';
+ t
+-----------
+ /123/-45/
+(1 row)
+
drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
@@ -3784,7 +3792,8 @@ select * from test2 where t ~ '(abc)*$';
abcdef
quark
z foo bar
-(3 rows)
+ /123/-45/
+(4 rows)
select * from test2 where t ~* 'DEF';
t
@@ -3876,6 +3885,12 @@ select * from test2 where t ~ 'qua(?!foo)';
quark
(1 row)
+select * from test2 where t ~ '/\d+/-\d';
+ t
+-----------
+ /123/-45/
+(1 row)
+
-- Check similarity threshold (bug #14202)
CREATE TEMP TABLE restaurants (city text);
INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);
diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql
index 2619ad5ee62..fe8d0a7495d 100644
--- a/contrib/pg_trgm/sql/pg_trgm.sql
+++ b/contrib/pg_trgm/sql/pg_trgm.sql
@@ -52,6 +52,7 @@ create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
+insert into test2 values ('/123/-45/');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
explain (costs off)
@@ -87,6 +88,7 @@ select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
select * from test2 where t ~ 'qua(?!foo)';
+select * from test2 where t ~ '/\d+/-\d';
drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
@@ -124,6 +126,7 @@ select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
select * from test2 where t ~ 'qua(?!foo)';
+select * from test2 where t ~ '/\d+/-\d';
-- Check similarity threshold (bug #14202)
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index 0e9d5a62e44..6ab2e49eb94 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -200,9 +200,10 @@
/*
- * Uncomment to print intermediate stages, for exploring and debugging the
- * algorithm implementation. This produces three graph files in /tmp,
- * in Graphviz .dot format.
+ * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
+ * for exploring and debugging the algorithm implementation.
+ * This produces three graph files in /tmp, in Graphviz .dot format.
+ * Some progress information is also printed to postmaster stderr.
*/
/* #define TRGM_REGEXP_DEBUG */
@@ -226,7 +227,7 @@
* Penalty multipliers for trigram counts depending on whitespace contents.
* Numbers based on analysis of real-life texts.
*/
-const float4 penalties[8] = {
+static const float4 penalties[8] = {
1.0f, /* "aaa" */
3.5f, /* "aa " */
0.0f, /* "a a" (impossible) */
@@ -319,9 +320,12 @@ typedef struct
* enterKeys - enter keys reachable from this state without reading any
* predictable trigram (List of TrgmStateKey)
* flags - flag bits
+ * snumber - number of this state (initially assigned as -1, -2, etc,
+ * for debugging purposes only; then at the packaging stage,
+ * surviving states are renumbered with positive numbers)
* parent - parent state, if this state has been merged into another
+ * tentFlags - flags this state would acquire via planned merges
* tentParent - planned parent state, if considering a merge
- * number - number of this state (used at the packaging stage)
*/
#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
#define TSTATE_FIN 0x02 /* flag indicating this state is final */
@@ -332,9 +336,10 @@ typedef struct TrgmState
List *arcs;
List *enterKeys;
int flags;
+ int snumber;
struct TrgmState *parent;
+ int tentFlags;
struct TrgmState *tentParent;
- int number;
} TrgmState;
/*
@@ -361,7 +366,7 @@ typedef struct
* Information about color trigram (used in stage 3)
*
* ctrgm - trigram itself
- * number - number of this trigram (used in the packaging stage)
+ * cnumber - number of this trigram (used in the packaging stage)
* count - number of simple trigrams created from this color trigram
* expanded - indicates this color trigram is expanded into simple trigrams
* arcs - list of all arcs labeled with this color trigram.
@@ -369,7 +374,7 @@ typedef struct
typedef struct
{
ColorTrgm ctrgm;
- int number;
+ int cnumber;
int count;
float4 penalty;
bool expanded;
@@ -403,6 +408,7 @@ typedef struct
/* Expanded graph (stage 2) */
HTAB *states;
TrgmState *initState;
+ int nstates;
/* Workspace for stage 2 */
List *queue;
@@ -918,6 +924,7 @@ transformGraph(TrgmNFA *trgmNFA)
1024,
&hashCtl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ trgmNFA->nstates = 0;
/* Create initial state: ambiguous prefix, NFA's initial state */
MemSet(&initkey, 0, sizeof(initkey));
@@ -1387,9 +1394,11 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
state->arcs = NIL;
state->enterKeys = NIL;
state->flags = 0;
+ /* states are initially given negative numbers */
+ state->snumber = -(++trgmNFA->nstates);
state->parent = NULL;
+ state->tentFlags = 0;
state->tentParent = NULL;
- state->number = -1;
trgmNFA->queue = lappend(trgmNFA->queue, state);
}
@@ -1454,10 +1463,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
ColorTrgmInfo *colorTrgms;
int64 totalTrgmCount;
float4 totalTrgmPenalty;
- int number;
+ int cnumber;
/* Collect color trigrams from all arcs */
- colorTrgms = (ColorTrgmInfo *) palloc(sizeof(ColorTrgmInfo) * arcsCount);
+ colorTrgms = (ColorTrgmInfo *) palloc0(sizeof(ColorTrgmInfo) * arcsCount);
trgmNFA->colorTrgms = colorTrgms;
i = 0;
@@ -1470,12 +1479,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
{
TrgmArc *arc = (TrgmArc *) lfirst(cell);
TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo));
+ ColorTrgmInfo *trgmInfo = &colorTrgms[i];
arcInfo->source = state;
arcInfo->target = arc->target;
- colorTrgms[i].arcs = list_make1(arcInfo);
- colorTrgms[i].expanded = true;
- colorTrgms[i].ctrgm = arc->ctrgm;
+ trgmInfo->ctrgm = arc->ctrgm;
+ trgmInfo->cnumber = -1;
+ /* count and penalty will be set below */
+ trgmInfo->expanded = true;
+ trgmInfo->arcs = list_make1(arcInfo);
i++;
}
}
@@ -1573,6 +1585,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
break;
+#ifdef TRGM_REGEXP_DEBUG
+ fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
+ trgmInfo->ctrgm.colors[0],
+ trgmInfo->ctrgm.colors[1],
+ trgmInfo->ctrgm.colors[2],
+ trgmInfo->penalty,
+ list_length(trgmInfo->arcs));
+#endif
+
/*
* Does any arc of this color trigram connect initial and final
* states? If so we can't remove it.
@@ -1585,26 +1606,44 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
int source_flags,
target_flags;
+#ifdef TRGM_REGEXP_DEBUG
+ fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
+ -target->snumber, target->flags,
+ -source->snumber, source->flags);
+#endif
+
/* examine parent states, if any merging has already happened */
while (source->parent)
source = source->parent;
while (target->parent)
target = target->parent;
+#ifdef TRGM_REGEXP_DEBUG
+ fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
+ -target->snumber, target->flags,
+ -source->snumber, source->flags);
+#endif
+
/* we must also consider merges we are planning right now */
- source_flags = source->flags;
+ source_flags = source->flags | source->tentFlags;
while (source->tentParent)
{
source = source->tentParent;
- source_flags |= source->flags;
+ source_flags |= source->flags | source->tentFlags;
}
- target_flags = target->flags;
+ target_flags = target->flags | target->tentFlags;
while (target->tentParent)
{
target = target->tentParent;
- target_flags |= target->flags;
+ target_flags |= target->flags | target->tentFlags;
}
+#ifdef TRGM_REGEXP_DEBUG
+ fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
+ -target->snumber, target_flags,
+ -source->snumber, source_flags);
+#endif
+
/* would fully-merged state have both INIT and FIN set? */
if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
(TSTATE_INIT | TSTATE_FIN))
@@ -1615,29 +1654,62 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
/* ok so far, so remember planned merge */
if (source != target)
+ {
+#ifdef TRGM_REGEXP_DEBUG
+ fprintf(stderr, " ... tentatively merging s%d into s%d\n",
+ -target->snumber, -source->snumber);
+#endif
target->tentParent = source;
+ source->tentFlags |= target_flags;
+ }
}
- /* We must clear all the tentParent fields before continuing */
+ /*
+ * We must reset all the tentFlags/tentParent fields before
+ * continuing. tentFlags could only have become set in states that
+ * are the source or parent or tentative parent of one of the current
+ * arcs; likewise tentParent could only have become set in states that
+ * are the target or parent or tentative parent of one of the current
+ * arcs. There might be some overlap between those sets, but if we
+ * clear tentFlags in target states as well as source states, we
+ * should be okay even if we visit a state as target before visiting
+ * it as a source.
+ */
foreach(cell, trgmInfo->arcs)
{
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
- TrgmState *target = arcInfo->target;
+ TrgmState *source = arcInfo->source,
+ *target = arcInfo->target;
TrgmState *ttarget;
+ /* no need to touch previously-merged states */
+ while (source->parent)
+ source = source->parent;
while (target->parent)
target = target->parent;
+ while (source)
+ {
+ source->tentFlags = 0;
+ source = source->tentParent;
+ }
+
while ((ttarget = target->tentParent) != NULL)
{
target->tentParent = NULL;
+ target->tentFlags = 0; /* in case it was also a source */
target = ttarget;
}
}
/* Now, move on if we can't drop this trigram */
if (!canRemove)
+ {
+#ifdef TRGM_REGEXP_DEBUG
+ fprintf(stderr, " ... not ok to merge\n");
+#endif
continue;
+ }
/* OK, merge states linked by each arc labeled by the trigram */
foreach(cell, trgmInfo->arcs)
@@ -1652,6 +1724,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
target = target->parent;
if (source != target)
{
+#ifdef TRGM_REGEXP_DEBUG
+ fprintf(stderr, "merging s%d into s%d\n",
+ -target->snumber, -source->snumber);
+#endif
mergeStates(source, target);
/* Assert we didn't merge initial and final states */
Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
@@ -1675,15 +1751,15 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
* Sort color trigrams by colors (will be useful for bsearch in packGraph)
* and enumerate the color trigrams that are expanded.
*/
- number = 0;
+ cnumber = 0;
qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
colorTrgmInfoCmp);
for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{
if (colorTrgms[i].expanded)
{
- colorTrgms[i].number = number;
- number++;
+ colorTrgms[i].cnumber = cnumber;
+ cnumber++;
}
}
@@ -1854,7 +1930,7 @@ colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
static TrgmPackedGraph *
packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
{
- int number = 2,
+ int snumber = 2,
arcIndex,
arcsCount;
HASH_SEQ_STATUS scan_status;
@@ -1874,16 +1950,16 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
while (state->parent)
state = state->parent;
- if (state->number < 0)
+ if (state->snumber < 0)
{
if (state->flags & TSTATE_INIT)
- state->number = 0;
+ state->snumber = 0;
else if (state->flags & TSTATE_FIN)
- state->number = 1;
+ state->snumber = 1;
else
{
- state->number = number;
- number++;
+ state->snumber = snumber;
+ snumber++;
}
}
}
@@ -1909,7 +1985,7 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
while (target->parent)
target = target->parent;
- if (source->number != target->number)
+ if (source->snumber != target->snumber)
{
ColorTrgmInfo *ctrgm;
@@ -1921,9 +1997,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
Assert(ctrgm != NULL);
Assert(ctrgm->expanded);
- arcs[arcIndex].sourceState = source->number;
- arcs[arcIndex].targetState = target->number;
- arcs[arcIndex].colorTrgm = ctrgm->number;
+ arcs[arcIndex].sourceState = source->snumber;
+ arcs[arcIndex].targetState = target->snumber;
+ arcs[arcIndex].colorTrgm = ctrgm->cnumber;
arcIndex++;
}
}
@@ -1969,13 +2045,13 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
}
/* Pack states and arcs information */
- result->statesCount = number;
+ result->statesCount = snumber;
result->states = (TrgmPackedState *)
- MemoryContextAlloc(rcontext, number * sizeof(TrgmPackedState));
+ MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
packedArcs = (TrgmPackedArc *)
MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
j = 0;
- for (i = 0; i < number; i++)
+ for (i = 0; i < snumber; i++)
{
int cnt = 0;
@@ -2141,7 +2217,7 @@ printTrgmNFA(TrgmNFA *trgmNFA)
{
ListCell *cell;
- appendStringInfo(&buf, "s%p", (void *) state);
+ appendStringInfo(&buf, "s%d", -state->snumber);
if (state->flags & TSTATE_FIN)
appendStringInfoString(&buf, " [shape = doublecircle]");
if (state->flags & TSTATE_INIT)
@@ -2153,8 +2229,8 @@ printTrgmNFA(TrgmNFA *trgmNFA)
{
TrgmArc *arc = (TrgmArc *) lfirst(cell);
- appendStringInfo(&buf, " s%p -> s%p [label = \"",
- (void *) state, (void *) arc->target);
+ appendStringInfo(&buf, " s%d -> s%d [label = \"",
+ -state->snumber, -arc->target->snumber);
printTrgmColor(&buf, arc->ctrgm.colors[0]);
appendStringInfoChar(&buf, ' ');
printTrgmColor(&buf, arc->ctrgm.colors[1]);
@@ -2167,7 +2243,7 @@ printTrgmNFA(TrgmNFA *trgmNFA)
if (initstate)
{
appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
- appendStringInfo(&buf, " initial -> s%p;\n", (void *) initstate);
+ appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
}
appendStringInfoString(&buf, "}\n");