aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/tsvector_op.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/tsvector_op.c')
-rw-r--r--src/backend/utils/adt/tsvector_op.c477
1 files changed, 360 insertions, 117 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 36cc10c9017..01c721f835e 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -11,9 +11,10 @@
*
*-------------------------------------------------------------------------
*/
-
#include "postgres.h"
+#include <limits.h>
+
#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
@@ -1405,147 +1406,394 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
}
/*
+ * Compute output position list for a tsquery operator in phrase mode.
+ *
+ * Merge the position lists in Ldata and Rdata as specified by "emit",
+ * returning the result list into *data. The input position lists must be
+ * sorted and unique, and the output will be as well.
+ *
+ * data: pointer to initially-all-zeroes output struct, or NULL
+ * Ldata, Rdata: input position lists
+ * emit: bitmask of TSPO_XXX flags
+ * Loffset: offset to be added to Ldata positions before comparing/outputting
+ * Roffset: offset to be added to Rdata positions before comparing/outputting
+ * max_npos: maximum possible required size of output position array
+ *
+ * Loffset and Roffset should not be negative, else we risk trying to output
+ * negative positions, which won't fit into WordEntryPos.
+ *
+ * Returns true if any positions were emitted to *data; or if data is NULL,
+ * returns true if any positions would have been emitted.
+ */
+#define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */
+#define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */
+#define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */
+
+static bool
+TS_phrase_output(ExecPhraseData *data,
+ ExecPhraseData *Ldata,
+ ExecPhraseData *Rdata,
+ int emit,
+ int Loffset,
+ int Roffset,
+ int max_npos)
+{
+ int Lindex,
+ Rindex;
+
+ /* Loop until both inputs are exhausted */
+ Lindex = Rindex = 0;
+ while (Lindex < Ldata->npos || Rindex < Rdata->npos)
+ {
+ int Lpos,
+ Rpos;
+ int output_pos = 0;
+
+ /*
+ * Fetch current values to compare. WEP_GETPOS() is needed because
+ * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
+ */
+ if (Lindex < Ldata->npos)
+ Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
+ else
+ {
+ /* L array exhausted, so we're done if R_ONLY isn't set */
+ if (!(emit & TSPO_R_ONLY))
+ break;
+ Lpos = INT_MAX;
+ }
+ if (Rindex < Rdata->npos)
+ Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
+ else
+ {
+ /* R array exhausted, so we're done if L_ONLY isn't set */
+ if (!(emit & TSPO_L_ONLY))
+ break;
+ Rpos = INT_MAX;
+ }
+
+ /* Merge-join the two input lists */
+ if (Lpos < Rpos)
+ {
+ /* Lpos is not matched in Rdata, should we output it? */
+ if (emit & TSPO_L_ONLY)
+ output_pos = Lpos;
+ Lindex++;
+ }
+ else if (Lpos == Rpos)
+ {
+ /* Lpos and Rpos match ... should we output it? */
+ if (emit & TSPO_BOTH)
+ output_pos = Rpos;
+ Lindex++;
+ Rindex++;
+ }
+ else /* Lpos > Rpos */
+ {
+ /* Rpos is not matched in Ldata, should we output it? */
+ if (emit & TSPO_R_ONLY)
+ output_pos = Rpos;
+ Rindex++;
+ }
+
+ if (output_pos > 0)
+ {
+ if (data)
+ {
+ /* Store position, first allocating output array if needed */
+ if (data->pos == NULL)
+ {
+ data->pos = (WordEntryPos *)
+ palloc(max_npos * sizeof(WordEntryPos));
+ data->allocated = true;
+ }
+ data->pos[data->npos++] = output_pos;
+ }
+ else
+ {
+ /*
+ * Exact positions not needed, so return true as soon as we
+ * know there is at least one.
+ */
+ return true;
+ }
+ }
+ }
+
+ if (data && data->npos > 0)
+ {
+ /* Let's assert we didn't overrun the array */
+ Assert(data->npos <= max_npos);
+ return true;
+ }
+ return false;
+}
+
+/*
* Execute tsquery at or below an OP_PHRASE operator.
*
- * This handles the recursion at levels where we need to care about
- * match locations. In addition to the same arguments used for TS_execute,
- * the caller may pass a preinitialized-to-zeroes ExecPhraseData struct to
- * be filled with lexeme match positions on success. data == NULL if no
- * match data need be returned. (In practice, outside callers pass NULL,
- * and only the internal recursion cases pass a data pointer.)
+ * This handles tsquery execution at recursion levels where we need to care
+ * about match locations.
+ *
+ * In addition to the same arguments used for TS_execute, the caller may pass
+ * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
+ * match position info on success. data == NULL if no position data need be
+ * returned. (In practice, outside callers pass NULL, and only the internal
+ * recursion cases pass a data pointer.)
+ * Note: the function assumes data != NULL for operators other than OP_PHRASE.
+ * This is OK because an outside call always starts from an OP_PHRASE node.
+ *
+ * The detailed semantics of the match data, given that the function returned
+ * "true" (successful match, or possible match), are:
+ *
+ * npos > 0, negate = false:
+ * query is matched at specified position(s) (and only those positions)
+ * npos > 0, negate = true:
+ * query is matched at all positions *except* specified position(s)
+ * npos = 0, negate = false:
+ * query is possibly matched, matching position(s) are unknown
+ * (this should only be returned when TS_EXEC_PHRASE_NO_POS flag is set)
+ * npos = 0, negate = true:
+ * query is matched at all positions
+ *
+ * Successful matches also return a "width" value which is the match width in
+ * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches,
+ * and is the sum of the phrase operator distances for phrase matches. Note
+ * that when width > 0, the listed positions represent the ends of matches not
+ * the starts. (This unintuitive rule is needed to avoid possibly generating
+ * negative positions, which wouldn't fit into the WordEntryPos arrays.)
+ *
+ * When the function returns "false" (no match), it must return npos = 0,
+ * negate = false (which is the state initialized by the caller); but the
+ * "width" output in such cases is undefined.
*/
static bool
TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
- ExecPhraseData *data,
- TSExecuteCallback chkcond)
+ TSExecuteCallback chkcond,
+ ExecPhraseData *data)
{
+ ExecPhraseData Ldata,
+ Rdata;
+ bool lmatch,
+ rmatch;
+ int Loffset,
+ Roffset,
+ maxwidth;
+
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
if (curitem->type == QI_VAL)
- {
return chkcond(arg, (QueryOperand *) curitem, data);
- }
- else
+
+ switch (curitem->qoperator.oper)
{
- ExecPhraseData Ldata = {0, false, NULL},
- Rdata = {0, false, NULL};
- WordEntryPos *Lpos,
- *LposStart,
- *Rpos,
- *pos_iter = NULL;
+ case OP_NOT:
- Assert(curitem->qoperator.oper == OP_PHRASE);
+ /*
+ * Because a "true" result with no specific positions is taken as
+ * uncertain, we need no special care here for !TS_EXEC_CALC_NOT.
+ * If it's a false positive, the right things happen anyway.
+ *
+ * Also, we need not touch data->width, since a NOT operation does
+ * not change the match width.
+ */
+ if (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
+ {
+ if (data->npos > 0)
+ {
+ /* we have some positions, invert negate flag */
+ data->negate = !data->negate;
+ return true;
+ }
+ else if (data->negate)
+ {
+ /* change "match everywhere" to "match nowhere" */
+ data->negate = false;
+ return false;
+ }
+ /* match positions are, and remain, uncertain */
+ return true;
+ }
+ else
+ {
+ /* change "match nowhere" to "match everywhere" */
+ Assert(data->npos == 0 && !data->negate);
+ data->negate = true;
+ return true;
+ }
- if (!TS_phrase_execute(curitem + curitem->qoperator.left,
- arg, flags, &Ldata, chkcond))
- return false;
+ case OP_PHRASE:
+ case OP_AND:
+ memset(&Ldata, 0, sizeof(Ldata));
+ memset(&Rdata, 0, sizeof(Rdata));
- if (!TS_phrase_execute(curitem + 1, arg, flags, &Rdata, chkcond))
- return false;
+ if (!TS_phrase_execute(curitem + curitem->qoperator.left,
+ arg, flags, chkcond, &Ldata))
+ return false;
- /*
- * If either operand has no position information, then we normally
- * return false. But if TS_EXEC_PHRASE_AS_AND flag is set then we
- * return true, treating OP_PHRASE as if it were OP_AND.
- */
- if (Ldata.npos == 0 || Rdata.npos == 0)
- return (flags & TS_EXEC_PHRASE_AS_AND) ? true : false;
+ if (!TS_phrase_execute(curitem + 1,
+ arg, flags, chkcond, &Rdata))
+ return false;
- /*
- * Prepare output position array if needed.
- */
- if (data)
- {
/*
- * We can recycle the righthand operand's result array if it was
- * palloc'd, else must allocate our own. The number of matches
- * couldn't be more than the smaller of the two operands' matches.
+ * If either operand has no position information, then we can't
+ * return position data, only a "possible match" result. "Possible
+ * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag
+ * is set, otherwise return false.
*/
- if (!Rdata.allocated)
- data->pos = palloc(sizeof(WordEntryPos) * Min(Ldata.npos, Rdata.npos));
- else
- data->pos = Rdata.pos;
+ if ((Ldata.npos == 0 && !Ldata.negate) ||
+ (Rdata.npos == 0 && !Rdata.negate))
+ return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false;
- data->allocated = true;
- data->npos = 0;
- pos_iter = data->pos;
- }
+ if (curitem->qoperator.oper == OP_PHRASE)
+ {
+ /*
+ * Compute Loffset and Roffset suitable for phrase match, and
+ * compute overall width of whole phrase match.
+ */
+ Loffset = curitem->qoperator.distance + Rdata.width;
+ Roffset = 0;
+ if (data)
+ data->width = curitem->qoperator.distance +
+ Ldata.width + Rdata.width;
+ }
+ else
+ {
+ /*
+ * For OP_AND, set output width and alignment like OP_OR (see
+ * comment below)
+ */
+ maxwidth = Max(Ldata.width, Rdata.width);
+ Loffset = maxwidth - Ldata.width;
+ Roffset = maxwidth - Rdata.width;
+ if (data)
+ data->width = maxwidth;
+ }
- /*
- * Find matches by distance. WEP_GETPOS() is needed because
- * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
- *
- * Note that the output positions are those of the matching RIGHT
- * operands.
- */
- Rpos = Rdata.pos;
- LposStart = Ldata.pos;
- while (Rpos < Rdata.pos + Rdata.npos)
- {
- /*
- * We need to check all possible distances, so reset Lpos to
- * guaranteed not yet satisfied position.
- */
- Lpos = LposStart;
- while (Lpos < Ldata.pos + Ldata.npos)
+ if (Ldata.negate && Rdata.negate)
{
- if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) ==
- curitem->qoperator.distance)
- {
- /* MATCH! */
- if (data)
- {
- /* Store position for upper phrase operator */
- *pos_iter = WEP_GETPOS(*Rpos);
- pos_iter++;
-
- /*
- * Set left start position to next, because current
- * one could not satisfy distance for any other right
- * position
- */
- LposStart = Lpos + 1;
- break;
- }
- else
- {
- /*
- * We are at the root of the phrase tree and hence we
- * don't have to identify all the match positions.
- * Just report success.
- */
- return true;
- }
+ /* !L & !R: treat as !(L | R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
+ Loffset, Roffset,
+ Ldata.npos + Rdata.npos);
+ if (data)
+ data->negate = true;
+ return true;
+ }
+ else if (Ldata.negate)
+ {
+ /* !L & R */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_R_ONLY,
+ Loffset, Roffset,
+ Rdata.npos);
+ }
+ else if (Rdata.negate)
+ {
+ /* L & !R */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_L_ONLY,
+ Loffset, Roffset,
+ Ldata.npos);
+ }
+ else
+ {
+ /* straight AND */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH,
+ Loffset, Roffset,
+ Min(Ldata.npos, Rdata.npos));
+ }
- }
- else if (WEP_GETPOS(*Rpos) <= WEP_GETPOS(*Lpos) ||
- WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <
- curitem->qoperator.distance)
- {
- /*
- * Go to the next Rpos, because Lpos is ahead or on less
- * distance than required by current operator
- */
- break;
+ case OP_OR:
+ memset(&Ldata, 0, sizeof(Ldata));
+ memset(&Rdata, 0, sizeof(Rdata));
- }
+ lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
+ arg, flags, chkcond, &Ldata);
+ rmatch = TS_phrase_execute(curitem + 1,
+ arg, flags, chkcond, &Rdata);
- Lpos++;
- }
+ if (!lmatch && !rmatch)
+ return false;
- Rpos++;
- }
+ /*
+ * If a valid operand has no position information, then we can't
+ * return position data, only a "possible match" result. "Possible
+ * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag
+ * is set, otherwise return false.
+ */
+ if ((lmatch && Ldata.npos == 0 && !Ldata.negate) ||
+ (rmatch && Rdata.npos == 0 && !Rdata.negate))
+ return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false;
- if (data)
- {
- data->npos = pos_iter - data->pos;
+ /*
+ * Cope with undefined output width from failed submatch. (This
+ * takes less code than trying to ensure that all failure returns
+ * set data->width to zero.)
+ */
+ if (!lmatch)
+ Ldata.width = 0;
+ if (!rmatch)
+ Rdata.width = 0;
- if (data->npos > 0)
+ /*
+ * For OP_AND and OP_OR, report the width of the wider of the two
+ * inputs, and align the narrower input's positions to the right
+ * end of that width. This rule deals at least somewhat
+ * reasonably with cases like "x <-> (y | z <-> q)".
+ */
+ maxwidth = Max(Ldata.width, Rdata.width);
+ Loffset = maxwidth - Ldata.width;
+ Roffset = maxwidth - Rdata.width;
+ data->width = maxwidth;
+
+ if (Ldata.negate && Rdata.negate)
+ {
+ /* !L | !R: treat as !(L & R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH,
+ Loffset, Roffset,
+ Min(Ldata.npos, Rdata.npos));
+ data->negate = true;
return true;
- }
+ }
+ else if (Ldata.negate)
+ {
+ /* !L | R: treat as !(L & !R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_L_ONLY,
+ Loffset, Roffset,
+ Ldata.npos);
+ data->negate = true;
+ return true;
+ }
+ else if (Rdata.negate)
+ {
+ /* L | !R: treat as !(!L & R) */
+ (void) TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_R_ONLY,
+ Loffset, Roffset,
+ Rdata.npos);
+ data->negate = true;
+ return true;
+ }
+ else
+ {
+ /* straight OR */
+ return TS_phrase_output(data, &Ldata, &Rdata,
+ TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
+ Loffset, Roffset,
+ Ldata.npos + Rdata.npos);
+ }
+
+ default:
+ elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
}
+ /* not reachable, but keep compiler quiet */
return false;
}
@@ -1594,12 +1842,7 @@ TS_execute(QueryItem *curitem, void *arg, uint32 flags,
return TS_execute(curitem + 1, arg, flags, chkcond);
case OP_PHRASE:
-
- /*
- * do not check TS_EXEC_PHRASE_AS_AND here because chkcond() could
- * do something more if it's called from TS_phrase_execute()
- */
- return TS_phrase_execute(curitem, arg, flags, NULL, chkcond);
+ return TS_phrase_execute(curitem, arg, flags, chkcond, NULL);
default:
elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);