diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2020-07-24 15:26:51 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2020-07-24 15:26:51 -0400 |
commit | 2f2007fbb255be178aca586780967f43885203a7 (patch) | |
tree | f8b550822ccb33b1a63acc292556c7ee2ae90683 /src/backend/utils | |
parent | 25244b8972a34b838c4033fe9efc1d31cba9d0e3 (diff) | |
download | postgresql-2f2007fbb255be178aca586780967f43885203a7.tar.gz postgresql-2f2007fbb255be178aca586780967f43885203a7.zip |
Fix assorted bugs by changing TS_execute's callback API to ternary logic.
Text search sometimes failed to find valid matches, for instance
'!crew:A'::tsquery might fail to locate 'crew:1B'::tsvector during
an index search. The root of the issue is that TS_execute's callback
functions were not changed to use ternary (yes/no/maybe) reporting
when we made the search logic itself do so. It's somewhat annoying
to break that API, but on the other hand we now see that any code
using plain boolean logic is almost certainly broken since the
addition of phrase search. There seem to be very few outside callers
of this code anyway, so we'll just break them intentionally to get
them to adapt.
This allows removal of tsginidx.c's private re-implementation of
TS_execute, since that's now entirely duplicative. It's also no
longer necessary to avoid use of CALC_NOT in tsgistidx.c, since
the underlying callbacks can now do something reasonable.
Back-patch into v13. We can't change this in stable branches,
but it seems not quite too late to fix it in v13.
Tom Lane and Pavel Borisov
Discussion: https://postgr.es/m/CALT9ZEE-aLotzBg-pOp2GFTesGWVYzXA3=mZKzRDa_OKnLF7Mg@mail.gmail.com
Diffstat (limited to 'src/backend/utils')
-rw-r--r-- | src/backend/utils/adt/tsginidx.c | 133 | ||||
-rw-r--r-- | src/backend/utils/adt/tsgistidx.c | 26 | ||||
-rw-r--r-- | src/backend/utils/adt/tsrank.c | 12 | ||||
-rw-r--r-- | src/backend/utils/adt/tsvector_op.c | 147 |
4 files changed, 146 insertions, 172 deletions
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c index 2d656168fca..3128f0a7da0 100644 --- a/src/backend/utils/adt/tsginidx.c +++ b/src/backend/utils/adt/tsginidx.c @@ -178,9 +178,13 @@ typedef struct bool *need_recheck; } GinChkVal; -static GinTernaryValue -checkcondition_gin_internal(GinChkVal *gcv, QueryOperand *val, ExecPhraseData *data) +/* + * TS_execute callback for matching a tsquery operand to GIN index data + */ +static TSTernaryValue +checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data) { + GinChkVal *gcv = (GinChkVal *) checkval; int j; /* @@ -193,112 +197,22 @@ checkcondition_gin_internal(GinChkVal *gcv, QueryOperand *val, ExecPhraseData *d /* convert item's number to corresponding entry's (operand's) number */ j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item]; - /* return presence of current entry in indexed value */ - return gcv->check[j]; -} - -/* - * Wrapper of check condition function for TS_execute. - */ -static bool -checkcondition_gin(void *checkval, QueryOperand *val, ExecPhraseData *data) -{ - return checkcondition_gin_internal((GinChkVal *) checkval, - val, - data) != GIN_FALSE; -} - -/* - * Evaluate tsquery boolean expression using ternary logic. - * - * Note: the reason we can't use TS_execute() for this is that its API - * for the checkcondition callback doesn't allow a MAYBE result to be - * returned, but we might have MAYBEs in the gcv->check array. - * Perhaps we should change that API. - */ -static GinTernaryValue -TS_execute_ternary(GinChkVal *gcv, QueryItem *curitem, bool in_phrase) -{ - GinTernaryValue val1, - val2, - result; - - /* since this function recurses, it could be driven to stack overflow */ - check_stack_depth(); - - if (curitem->type == QI_VAL) - return - checkcondition_gin_internal(gcv, - (QueryOperand *) curitem, - NULL /* don't have position info */ ); - - switch (curitem->qoperator.oper) + /* + * return presence of current entry in indexed value; but TRUE becomes + * MAYBE in the presence of a query requiring recheck + */ + if (gcv->check[j] == GIN_TRUE) { - case OP_NOT: - - /* - * Below a phrase search, force NOT's result to MAYBE. We cannot - * invert a TRUE result from the subexpression to FALSE, since - * TRUE only says that the subexpression matches somewhere, not - * that it matches everywhere, so there might be positions where - * the NOT will match. We could invert FALSE to TRUE, but there's - * little point in distinguishing TRUE from MAYBE, since a recheck - * will have been forced already. - */ - if (in_phrase) - return GIN_MAYBE; - - result = TS_execute_ternary(gcv, curitem + 1, in_phrase); - if (result == GIN_MAYBE) - return result; - return !result; - - case OP_PHRASE: - - /* - * GIN doesn't contain any information about positions, so treat - * OP_PHRASE as OP_AND with recheck requirement, and always - * reporting MAYBE not TRUE. - */ - *(gcv->need_recheck) = true; - /* Pass down in_phrase == true in case there's a NOT below */ - in_phrase = true; - - /* FALL THRU */ - - case OP_AND: - val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left, - in_phrase); - if (val1 == GIN_FALSE) - return GIN_FALSE; - val2 = TS_execute_ternary(gcv, curitem + 1, in_phrase); - if (val2 == GIN_FALSE) - return GIN_FALSE; - if (val1 == GIN_TRUE && val2 == GIN_TRUE && - curitem->qoperator.oper != OP_PHRASE) - return GIN_TRUE; - else - return GIN_MAYBE; - - case OP_OR: - val1 = TS_execute_ternary(gcv, curitem + curitem->qoperator.left, - in_phrase); - if (val1 == GIN_TRUE) - return GIN_TRUE; - val2 = TS_execute_ternary(gcv, curitem + 1, in_phrase); - if (val2 == GIN_TRUE) - return GIN_TRUE; - if (val1 == GIN_FALSE && val2 == GIN_FALSE) - return GIN_FALSE; - else - return GIN_MAYBE; - - default: - elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + if (val->weight != 0 || data != NULL) + return TS_MAYBE; } - /* not reachable, but keep compiler quiet */ - return false; + /* + * We rely on GinTernaryValue and TSTernaryValue using equivalent value + * assignments. We could use a switch statement to map the values if that + * ever stops being true, but it seems unlikely to happen. + */ + return (TSTernaryValue) gcv->check[j]; } Datum @@ -370,10 +284,11 @@ gin_tsquery_triconsistent(PG_FUNCTION_ARGS) gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = &recheck; - res = TS_execute_ternary(&gcv, GETQUERY(query), false); - - if (res == GIN_TRUE && recheck) - res = GIN_MAYBE; + if (TS_execute(GETQUERY(query), + &gcv, + TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS, + checkcondition_gin)) + res = recheck ? GIN_MAYBE : GIN_TRUE; } PG_RETURN_GIN_TERNARY_VALUE(res); diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index c3f25800e7b..927aed91564 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -273,9 +273,9 @@ typedef struct } CHKVAL; /* - * is there value 'val' in array or not ? + * TS_execute callback for matching a tsquery operand to GIST leaf-page data */ -static bool +static TSTernaryValue checkcondition_arr(void *checkval, QueryOperand *val, ExecPhraseData *data) { int32 *StopLow = ((CHKVAL *) checkval)->arrb; @@ -288,23 +288,26 @@ checkcondition_arr(void *checkval, QueryOperand *val, ExecPhraseData *data) * we are not able to find a prefix by hash value */ if (val->prefix) - return true; + return TS_MAYBE; while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; if (*StopMiddle == val->valcrc) - return true; + return TS_MAYBE; else if (*StopMiddle < val->valcrc) StopLow = StopMiddle + 1; else StopHigh = StopMiddle; } - return false; + return TS_NO; } -static bool +/* + * TS_execute callback for matching a tsquery operand to GIST non-leaf data + */ +static TSTernaryValue checkcondition_bit(void *checkval, QueryOperand *val, ExecPhraseData *data) { void *key = (SignTSVector *) checkval; @@ -313,8 +316,12 @@ checkcondition_bit(void *checkval, QueryOperand *val, ExecPhraseData *data) * we are not able to find a prefix in signature tree */ if (val->prefix) - return true; - return GETBIT(GETSIGN(key), HASHVAL(val->valcrc, GETSIGLEN(key))); + return TS_MAYBE; + + if (GETBIT(GETSIGN(key), HASHVAL(val->valcrc, GETSIGLEN(key)))) + return TS_MAYBE; + else + return TS_NO; } Datum @@ -339,10 +346,9 @@ gtsvector_consistent(PG_FUNCTION_ARGS) if (ISALLTRUE(key)) PG_RETURN_BOOL(true); - /* since signature is lossy, cannot specify CALC_NOT here */ PG_RETURN_BOOL(TS_execute(GETQUERY(query), key, - TS_EXEC_PHRASE_NO_POS, + TS_EXEC_PHRASE_NO_POS | TS_EXEC_CALC_NOT, checkcondition_bit)); } else diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index 07251dd577c..cbd97abccde 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -556,14 +556,18 @@ typedef struct #define QR_GET_OPERAND_DATA(q, v) \ ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) ) -static bool -checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data) +/* + * TS_execute callback for matching a tsquery operand to QueryRepresentation + */ +static TSTernaryValue +checkcondition_QueryOperand(void *checkval, QueryOperand *val, + ExecPhraseData *data) { QueryRepresentation *qr = (QueryRepresentation *) checkval; QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val); if (!opData->operandexists) - return false; + return TS_NO; if (data) { @@ -573,7 +577,7 @@ checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *d data->pos += MAXQROPOS - opData->npos; } - return true; + return TS_YES; } typedef struct diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 51619c396c7..6df943abd4e 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -67,14 +67,6 @@ typedef struct StatEntry *root; } TSVectorStat; -/* TS_execute requires ternary logic to handle NOT with phrase matches */ -typedef enum -{ - TS_NO, /* definitely no match */ - TS_YES, /* definitely does match */ - TS_MAYBE /* can't verify match for lack of pos data */ -} TSTernaryValue; - static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags, @@ -1188,13 +1180,15 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix) /* * Check weight info or/and fill 'data' with the required positions */ -static bool +static TSTernaryValue checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, ExecPhraseData *data) { - bool result = false; + TSTernaryValue result = TS_NO; - if (entry->haspos && (val->weight || data)) + Assert(data == NULL || data->npos == 0); + + if (entry->haspos) { WordEntryPosVector *posvec; @@ -1232,7 +1226,13 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, data->npos = dptr - data->pos; if (data->npos > 0) - result = true; + result = TS_YES; + else + { + pfree(data->pos); + data->pos = NULL; + data->allocated = false; + } } else if (val->weight) { @@ -1243,40 +1243,57 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, { if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) { - result = true; + result = TS_YES; break; /* no need to go further */ } posvec_iter++; } } - else /* data != NULL */ + else if (data) { data->npos = posvec->npos; data->pos = posvec->pos; data->allocated = false; - result = true; + result = TS_YES; + } + else + { + /* simplest case: no weight check, positions not needed */ + result = TS_YES; } } else { - result = true; + /* + * Position info is lacking, so if the caller requires it, we can only + * say that maybe there is a match. + * + * Notice, however, that we *don't* check val->weight here. + * Historically, stripped tsvectors are considered to match queries + * whether or not the query has a weight restriction; that's a little + * dubious but we'll preserve the behavior. + */ + if (data) + result = TS_MAYBE; + else + result = TS_YES; } return result; } /* - * is there value 'val' in array or not ? + * TS_execute callback for matching a tsquery operand to plain tsvector data */ -static bool +static TSTernaryValue checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) { CHKVAL *chkval = (CHKVAL *) checkval; WordEntry *StopLow = chkval->arrb; WordEntry *StopHigh = chkval->arre; WordEntry *StopMiddle = StopHigh; - bool res = false; + TSTernaryValue res = TS_NO; /* Loop invariant: StopLow <= val < StopHigh */ while (StopLow < StopHigh) @@ -1302,36 +1319,69 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) StopHigh = StopMiddle; } - if ((!res || data) && val->prefix) + /* + * If it's a prefix search, we should also consider lexemes that the + * search term is a prefix of (which will necessarily immediately follow + * the place we found in the above loop). But we can skip them if there + * was a definite match on the exact term AND the caller doesn't need + * position info. + */ + if (val->prefix && (res != TS_YES || data)) { WordEntryPos *allpos = NULL; int npos = 0, totalpos = 0; - /* - * there was a failed exact search, so we should scan further to find - * a prefix match. We also need to do so if caller needs position info - */ + /* adjust start position for corner case */ if (StopLow >= StopHigh) StopMiddle = StopHigh; - while ((!res || data) && StopMiddle < chkval->arre && + /* we don't try to re-use any data from the initial match */ + if (data) + { + if (data->allocated) + pfree(data->pos); + data->pos = NULL; + data->allocated = false; + data->npos = 0; + } + res = TS_NO; + + while ((res != TS_YES || data) && + StopMiddle < chkval->arre && tsCompareString(chkval->operand + val->distance, val->length, chkval->values + StopMiddle->pos, StopMiddle->len, true) == 0) { - if (data) - { - /* - * We need to join position information - */ - res = checkclass_str(chkval, StopMiddle, val, data); + TSTernaryValue subres; + + subres = checkclass_str(chkval, StopMiddle, val, data); - if (res) + if (subres != TS_NO) + { + if (data) { - while (npos + data->npos >= totalpos) + /* + * We need to join position information + */ + if (subres == TS_MAYBE) + { + /* + * No position info for this match, so we must report + * MAYBE overall. + */ + res = TS_MAYBE; + /* forget any previous positions */ + npos = 0; + /* don't leak storage */ + if (allpos) + pfree(allpos); + break; + } + + while (npos + data->npos > totalpos) { if (totalpos == 0) { @@ -1347,22 +1397,27 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos); npos += data->npos; + + /* don't leak storage from individual matches */ + if (data->allocated) + pfree(data->pos); + data->pos = NULL; + data->allocated = false; + /* it's important to reset data->npos before next loop */ + data->npos = 0; } else { - /* at loop exit, res must be true if we found matches */ - res = (npos > 0); + /* Don't need positions, just handle YES/MAYBE */ + if (subres == TS_YES || res == TS_NO) + res = subres; } } - else - { - res = checkclass_str(chkval, StopMiddle, val, NULL); - } StopMiddle++; } - if (res && data) + if (data && npos > 0) { /* Sort and make unique array of found positions */ data->pos = allpos; @@ -1370,6 +1425,7 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) data->npos = qunique(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos); data->allocated = true; + res = TS_YES; } } @@ -1561,14 +1617,7 @@ TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, check_stack_depth(); if (curitem->type == QI_VAL) - { - if (!chkcond(arg, (QueryOperand *) curitem, data)) - return TS_NO; - if (data->npos > 0 || data->negate) - return TS_YES; - /* If we have no position data, we must return TS_MAYBE */ - return TS_MAYBE; - } + return chkcond(arg, (QueryOperand *) curitem, data); switch (curitem->qoperator.oper) { @@ -1821,7 +1870,7 @@ TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags, if (curitem->type == QI_VAL) return chkcond(arg, (QueryOperand *) curitem, - NULL /* don't need position info */ ) ? TS_YES : TS_NO; + NULL /* don't need position info */ ); switch (curitem->qoperator.oper) { |