aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-16 12:07:14 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-16 12:07:14 -0500
commit38bb3aef354ca98ff88cb37337995039a3b5135f (patch)
tree834dbe2f03bf6df53ebf8bf0e5bc1ec3797dc1e2
parentf672df5fdd22dac14c98d0a0bf5bbaa6ab17f8a5 (diff)
downloadpostgresql-38bb3aef354ca98ff88cb37337995039a3b5135f.tar.gz
postgresql-38bb3aef354ca98ff88cb37337995039a3b5135f.zip
Convert tsginidx.c's GIN indexing logic to fully ternary operation.
Commit 2f2007fbb did this partially, but there were two remaining warts. checkcondition_gin handled some uncertain cases by setting the out-of-band recheck flag, some by returning TS_MAYBE, and some by doing both. Meanwhile, TS_execute arbitrarily converted a TS_MAYBE result to TS_YES. Thus, if checkcondition_gin chose to only return TS_MAYBE, the outcome would be TS_YES with no recheck flag, potentially resulting in wrong query outputs. The case where this'd happen is if there were GIN_MAYBE entries in the indexscan results passed to gin_tsquery_[tri]consistent, which so far as I can see would only happen if the tidbitmap used to accumulate indexscan results grew large enough to become lossy. I initially thought of fixing this by ensuring we always set the recheck flag as well as returning TS_MAYBE in uncertain cases. But that errs in the other direction, potentially forcing rechecks of rows that provably match the query (since the recheck flag remains set even if TS_execute later finds that the answer must be TS_YES). Instead, let's get rid of the out-of-band recheck flag altogether and rely on returning TS_MAYBE. This requires exporting a version of TS_execute that will actually return the full ternary result of the evaluation ... but we likely should have done that to start with. Unfortunately it doesn't seem practical to add a regression test case that covers this: the amount of data needed to cause the GIN bitmap to become lossy results in a longer runtime than I think we want to have in the tests. (I'm wondering about allowing smaller work_mem settings to ameliorate that, but it'd be a matter for a separate patch.) Per bug #16865 from Dimitri NĂ¼scheler. Back-patch to v13 where the faulty commit came in. Discussion: https://postgr.es/m/16865-4ffdc3e682e6d75b@postgresql.org
-rw-r--r--src/backend/utils/adt/tsginidx.c57
-rw-r--r--src/backend/utils/adt/tsvector_op.c12
-rw-r--r--src/include/tsearch/ts_utils.h3
3 files changed, 44 insertions, 28 deletions
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c
index 6c913baabac..7e0e31f69df 100644
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -175,7 +175,6 @@ typedef struct
QueryItem *first_item;
GinTernaryValue *check;
int *map_item_operand;
- bool *need_recheck;
} GinChkVal;
/*
@@ -186,25 +185,22 @@ checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data)
{
GinChkVal *gcv = (GinChkVal *) checkval;
int j;
-
- /*
- * if any val requiring a weight is used or caller needs position
- * information then set recheck flag
- */
- if (val->weight != 0 || data != NULL)
- *(gcv->need_recheck) = true;
+ GinTernaryValue result;
/* convert item's number to corresponding entry's (operand's) number */
j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item];
+ /* determine presence of current entry in indexed value */
+ result = gcv->check[j];
+
/*
- * return presence of current entry in indexed value; but TRUE becomes
- * MAYBE in the presence of a query requiring recheck
+ * If any val requiring a weight is used or caller needs position
+ * information then we must recheck, so replace TRUE with MAYBE.
*/
- if (gcv->check[j] == GIN_TRUE)
+ if (result == GIN_TRUE)
{
if (val->weight != 0 || data != NULL)
- return TS_MAYBE;
+ result = GIN_MAYBE;
}
/*
@@ -212,7 +208,7 @@ checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data)
* assignments. We could use a switch statement to map the values if that
* ever stops being true, but it seems unlikely to happen.
*/
- return (TSTernaryValue) gcv->check[j];
+ return (TSTernaryValue) result;
}
Datum
@@ -244,12 +240,23 @@ gin_tsquery_consistent(PG_FUNCTION_ARGS)
"sizes of GinTernaryValue and bool are not equal");
gcv.check = (GinTernaryValue *) check;
gcv.map_item_operand = (int *) (extra_data[0]);
- gcv.need_recheck = recheck;
- res = TS_execute(GETQUERY(query),
- &gcv,
- TS_EXEC_PHRASE_NO_POS,
- checkcondition_gin);
+ switch (TS_execute_ternary(GETQUERY(query),
+ &gcv,
+ TS_EXEC_PHRASE_NO_POS,
+ checkcondition_gin))
+ {
+ case TS_NO:
+ res = false;
+ break;
+ case TS_YES:
+ res = true;
+ break;
+ case TS_MAYBE:
+ res = true;
+ *recheck = true;
+ break;
+ }
}
PG_RETURN_BOOL(res);
@@ -266,10 +273,6 @@ gin_tsquery_triconsistent(PG_FUNCTION_ARGS)
/* int32 nkeys = PG_GETARG_INT32(3); */
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
GinTernaryValue res = GIN_FALSE;
- bool recheck;
-
- /* Initially assume query doesn't require recheck */
- recheck = false;
if (query->size > 0)
{
@@ -282,13 +285,11 @@ gin_tsquery_triconsistent(PG_FUNCTION_ARGS)
gcv.first_item = GETQUERY(query);
gcv.check = check;
gcv.map_item_operand = (int *) (extra_data[0]);
- gcv.need_recheck = &recheck;
- if (TS_execute(GETQUERY(query),
- &gcv,
- TS_EXEC_PHRASE_NO_POS,
- checkcondition_gin))
- res = recheck ? GIN_MAYBE : GIN_TRUE;
+ res = TS_execute_ternary(GETQUERY(query),
+ &gcv,
+ TS_EXEC_PHRASE_NO_POS,
+ checkcondition_gin);
}
PG_RETURN_GIN_TERNARY_VALUE(res);
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 2939fb5c210..9236ebcc8fe 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -1855,6 +1855,18 @@ TS_execute(QueryItem *curitem, void *arg, uint32 flags,
}
/*
+ * Evaluate tsquery boolean expression.
+ *
+ * This is the same as TS_execute except that TS_MAYBE is returned as-is.
+ */
+TSTernaryValue
+TS_execute_ternary(QueryItem *curitem, void *arg, uint32 flags,
+ TSExecuteCallback chkcond)
+{
+ return TS_execute_recurse(curitem, arg, flags, chkcond);
+}
+
+/*
* TS_execute recursion for operators above any phrase operator. Here we do
* not need to worry about lexeme positions. As soon as we hit an OP_PHRASE
* operator, we pass it off to TS_phrase_execute which does worry.
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h
index 69a9ba8524e..4266560151f 100644
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -199,6 +199,9 @@ typedef TSTernaryValue (*TSExecuteCallback) (void *arg, QueryOperand *val,
extern bool TS_execute(QueryItem *curitem, void *arg, uint32 flags,
TSExecuteCallback chkcond);
+extern TSTernaryValue TS_execute_ternary(QueryItem *curitem, void *arg,
+ uint32 flags,
+ TSExecuteCallback chkcond);
extern bool tsquery_requires_match(QueryItem *curitem);
/*