aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/tsquery.c
diff options
context:
space:
mode:
authorTeodor Sigaev <teodor@sigaev.ru>2016-04-07 18:44:18 +0300
committerTeodor Sigaev <teodor@sigaev.ru>2016-04-07 18:44:18 +0300
commitbb140506df605fab58f48926ee1db1f80bdafb59 (patch)
tree581f9aeb71e3596000af3b4904e0c62a372d77b3 /src/backend/utils/adt/tsquery.c
parent015e88942aa50f0d419ddac00e63bb06d6e62e86 (diff)
downloadpostgresql-bb140506df605fab58f48926ee1db1f80bdafb59.tar.gz
postgresql-bb140506df605fab58f48926ee1db1f80bdafb59.zip
Phrase full text search.
Patch introduces new text search operator (<-> or <DISTANCE>) into tsquery. On-disk and binary in/out format of tsquery are backward compatible. It has two side effect: - change order for tsquery, so, users, who has a btree index over tsquery, should reindex it - less number of parenthesis in tsquery output, and tsquery becomes more readable Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov Reviewers: Alexander Korotkov, Artur Zakirov
Diffstat (limited to 'src/backend/utils/adt/tsquery.c')
-rw-r--r--src/backend/utils/adt/tsquery.c311
1 files changed, 235 insertions, 76 deletions
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c
index 07320606784..257b5d33456 100644
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -56,7 +56,7 @@ struct TSQueryParserStateData
/*
* subroutine to parse the modifiers (weight and prefix flag currently)
- * part, like ':1AB' of a query.
+ * part, like ':AB*' of a query.
*/
static char *
get_modifiers(char *buf, int16 *weight, bool *prefix)
@@ -101,6 +101,94 @@ get_modifiers(char *buf, int16 *weight, bool *prefix)
}
/*
+ * Parse phrase operator. The operator
+ * may take the following forms:
+ *
+ * a <X> b (distance is no greater than X)
+ * a <-> b (default distance = 1)
+ *
+ * The buffer should begin with '<' char
+ */
+static char *
+parse_phrase_operator(char *buf, int16 *distance)
+{
+ enum
+ {
+ PHRASE_OPEN = 0,
+ PHRASE_DIST,
+ PHRASE_CLOSE,
+ PHRASE_ERR,
+ PHRASE_FINISH
+ } state = PHRASE_OPEN;
+
+ char *ptr = buf;
+ char *endptr;
+ long l = 1;
+
+ while (*ptr)
+ {
+ switch(state)
+ {
+ case PHRASE_OPEN:
+ Assert(t_iseq(ptr, '<'));
+ state = PHRASE_DIST;
+ ptr++;
+ break;
+
+ case PHRASE_DIST:
+ if (t_iseq(ptr, '-'))
+ {
+ state = PHRASE_CLOSE;
+ ptr++;
+ break;
+ }
+ else if (!t_isdigit(ptr))
+ {
+ state = PHRASE_ERR;
+ break;
+ }
+
+ l = strtol(ptr, &endptr, 10);
+ if (ptr == endptr)
+ state = PHRASE_ERR;
+ else if (errno == ERANGE || l > MAXENTRYPOS)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("distance in phrase operator should not be greater than %d",
+ MAXENTRYPOS)));
+ else
+ {
+ state = PHRASE_CLOSE;
+ ptr = endptr;
+ }
+ break;
+
+ case PHRASE_CLOSE:
+ if (t_iseq(ptr, '>'))
+ {
+ state = PHRASE_FINISH;
+ ptr++;
+ }
+ else
+ state = PHRASE_ERR;
+ break;
+
+ case PHRASE_FINISH:
+ *distance = (int16) l;
+ return ptr;
+
+ case PHRASE_ERR:
+ default:
+ goto err;
+ }
+ }
+
+ err:
+ *distance = -1;
+ return buf;
+}
+
+/*
* token types for parsing
*/
typedef enum
@@ -116,8 +204,10 @@ typedef enum
/*
* get token from query string
*
- * *operator is filled in with OP_* when return values is PT_OPR
+ * *operator is filled in with OP_* when return values is PT_OPR,
+ * but *weight could contain a distance value in case of phrase operator.
* *strval, *lenval and *weight are filled in when return value is PT_VAL
+ *
*/
static ts_tokentype
gettoken_query(TSQueryParserState state,
@@ -185,13 +275,23 @@ gettoken_query(TSQueryParserState state,
(state->buf)++;
return PT_OPR;
}
- if (t_iseq(state->buf, '|'))
+ else if (t_iseq(state->buf, '|'))
{
state->state = WAITOPERAND;
*operator = OP_OR;
(state->buf)++;
return PT_OPR;
}
+ else if (t_iseq(state->buf, '<'))
+ {
+ state->state = WAITOPERAND;
+ *operator = OP_PHRASE;
+ /* weight var is used as storage for distance */
+ state->buf = parse_phrase_operator(state->buf, weight);
+ if (*weight < 0)
+ return PT_ERR;
+ return PT_OPR;
+ }
else if (t_iseq(state->buf, ')'))
{
(state->buf)++;
@@ -223,15 +323,16 @@ gettoken_query(TSQueryParserState state,
* Push an operator to state->polstr
*/
void
-pushOperator(TSQueryParserState state, int8 oper)
+pushOperator(TSQueryParserState state, int8 oper, int16 distance)
{
QueryOperator *tmp;
- Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR);
+ Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE);
tmp = (QueryOperator *) palloc0(sizeof(QueryOperator));
tmp->type = QI_OPR;
tmp->oper = oper;
+ tmp->distance = (oper == OP_PHRASE) ? distance : 0;
/* left is filled in later with findoprnd */
state->polstr = lcons(tmp, state->polstr);
@@ -330,14 +431,18 @@ makepol(TSQueryParserState state,
PushFunction pushval,
Datum opaque)
{
- int8 operator = 0;
- ts_tokentype type;
- int lenval = 0;
- char *strval = NULL;
- int8 opstack[STACKDEPTH];
- int lenstack = 0;
- int16 weight = 0;
- bool prefix;
+ int8 operator = 0;
+ ts_tokentype type;
+ int lenval = 0;
+ char *strval = NULL;
+ struct
+ {
+ int8 op;
+ int16 distance;
+ } opstack[STACKDEPTH];
+ int lenstack = 0;
+ int16 weight = 0;
+ bool prefix;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
@@ -348,39 +453,48 @@ makepol(TSQueryParserState state,
{
case PT_VAL:
pushval(opaque, state, strval, lenval, weight, prefix);
- while (lenstack && (opstack[lenstack - 1] == OP_AND ||
- opstack[lenstack - 1] == OP_NOT))
+ while (lenstack && (opstack[lenstack - 1].op == OP_AND ||
+ opstack[lenstack - 1].op == OP_PHRASE ||
+ opstack[lenstack - 1].op == OP_NOT))
{
lenstack--;
- pushOperator(state, opstack[lenstack]);
+ pushOperator(state,
+ opstack[lenstack].op,
+ opstack[lenstack].distance);
}
break;
case PT_OPR:
if (lenstack && operator == OP_OR)
- pushOperator(state, OP_OR);
+ pushOperator(state, OP_OR, 0);
else
{
if (lenstack == STACKDEPTH) /* internal error */
elog(ERROR, "tsquery stack too small");
- opstack[lenstack] = operator;
+ opstack[lenstack].op = operator;
+ opstack[lenstack].distance = weight;
lenstack++;
}
break;
case PT_OPEN:
makepol(state, pushval, opaque);
- while (lenstack && (opstack[lenstack - 1] == OP_AND ||
- opstack[lenstack - 1] == OP_NOT))
+ while (lenstack && (opstack[lenstack - 1].op == OP_AND ||
+ opstack[lenstack - 1].op == OP_PHRASE ||
+ opstack[lenstack - 1].op == OP_NOT))
{
lenstack--;
- pushOperator(state, opstack[lenstack]);
+ pushOperator(state,
+ opstack[lenstack].op,
+ opstack[lenstack].distance);
}
break;
case PT_CLOSE:
while (lenstack)
{
lenstack--;
- pushOperator(state, opstack[lenstack]);
+ pushOperator(state,
+ opstack[lenstack].op,
+ opstack[lenstack].distance);
};
return;
case PT_ERR:
@@ -394,12 +508,14 @@ makepol(TSQueryParserState state,
while (lenstack)
{
lenstack--;
- pushOperator(state, opstack[lenstack]);
+ pushOperator(state,
+ opstack[lenstack].op,
+ opstack[lenstack].distance);
}
}
static void
-findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
+findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
@@ -407,10 +523,13 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
if (*pos >= nnodes)
elog(ERROR, "malformed tsquery: operand not found");
- if (ptr[*pos].type == QI_VAL ||
- ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, they
- * haven't been cleaned away yet. */
+ if (ptr[*pos].type == QI_VAL)
+ {
+ (*pos)++;
+ }
+ else if (ptr[*pos].type == QI_VALSTOP)
{
+ *needcleanup = true; /* we'll have to remove stop words */
(*pos)++;
}
else
@@ -419,21 +538,32 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
if (ptr[*pos].qoperator.oper == OP_NOT)
{
- ptr[*pos].qoperator.left = 1;
+ ptr[*pos].qoperator.left = 1; /* fixed offset */
(*pos)++;
- findoprnd_recurse(ptr, pos, nnodes);
+
+ /* process the only argument */
+ findoprnd_recurse(ptr, pos, nnodes, needcleanup);
}
else
{
- QueryOperator *curitem = &ptr[*pos].qoperator;
- int tmp = *pos;
+ QueryOperator *curitem = &ptr[*pos].qoperator;
+ int tmp = *pos; /* save current position */
+
+ Assert(curitem->oper == OP_AND ||
+ curitem->oper == OP_OR ||
+ curitem->oper == OP_PHRASE);
- Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);
+ if (curitem->oper == OP_PHRASE)
+ *needcleanup = true; /* push OP_PHRASE down later */
(*pos)++;
- findoprnd_recurse(ptr, pos, nnodes);
- curitem->left = *pos - tmp;
- findoprnd_recurse(ptr, pos, nnodes);
+
+ /* process RIGHT argument */
+ findoprnd_recurse(ptr, pos, nnodes, needcleanup);
+ curitem->left = *pos - tmp; /* set LEFT arg's offset */
+
+ /* process LEFT argument */
+ findoprnd_recurse(ptr, pos, nnodes, needcleanup);
}
}
}
@@ -444,12 +574,13 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
* QueryItems must be in polish (prefix) notation.
*/
static void
-findoprnd(QueryItem *ptr, int size)
+findoprnd(QueryItem *ptr, int size, bool *needcleanup)
{
uint32 pos;
+ *needcleanup = false;
pos = 0;
- findoprnd_recurse(ptr, &pos, size);
+ findoprnd_recurse(ptr, &pos, size, needcleanup);
if (pos != size)
elog(ERROR, "malformed tsquery: extra nodes");
@@ -466,9 +597,6 @@ findoprnd(QueryItem *ptr, int size)
*
* opaque is passed on to pushval as is, pushval can use it to store its
* private state.
- *
- * The returned query might contain QI_STOPVAL nodes. The caller is responsible
- * for cleaning them up (with clean_fakeval)
*/
TSQuery
parse_tsquery(char *buf,
@@ -482,6 +610,7 @@ parse_tsquery(char *buf,
int commonlen;
QueryItem *ptr;
ListCell *cell;
+ bool needcleanup;
/* init state */
state.buffer = buf;
@@ -531,7 +660,7 @@ parse_tsquery(char *buf,
i = 0;
foreach(cell, state.polstr)
{
- QueryItem *item = (QueryItem *) lfirst(cell);
+ QueryItem *item = (QueryItem *) lfirst(cell);
switch (item->type)
{
@@ -555,7 +684,14 @@ parse_tsquery(char *buf,
pfree(state.op);
/* Set left operand pointers for every operator. */
- findoprnd(ptr, query->size);
+ findoprnd(ptr, query->size, &needcleanup);
+
+ /*
+ * QI_VALSTOP nodes should be cleaned and
+ * and OP_PHRASE should be pushed down
+ */
+ if (needcleanup)
+ return cleanup_fakeval_and_phrase(query);
return query;
}
@@ -600,12 +736,15 @@ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \
(inf)->cur = (inf)->buf + len; \
}
+#define PRINT_PRIORITY(x) \
+ ( (QO_PRIORITY(x) == OP_NOT) ? OP_NOT_PHRASE : QO_PRIORITY(x) )
+
/*
- * recursive walk on tree and print it in
- * infix (human-readable) view
+ * recursively traverse the tree and
+ * print it in infix (human-readable) form
*/
static void
-infix(INFIX *in, bool first)
+infix(INFIX *in, int parentPriority)
{
/* since this function recurses, it could be driven to stack overflow. */
check_stack_depth();
@@ -674,24 +813,22 @@ infix(INFIX *in, bool first)
}
else if (in->curpol->qoperator.oper == OP_NOT)
{
- bool isopr = false;
+ int priority = PRINT_PRIORITY(in->curpol);
- RESIZEBUF(in, 1);
- *(in->cur) = '!';
- in->cur++;
- *(in->cur) = '\0';
- in->curpol++;
-
- if (in->curpol->type == QI_OPR)
+ if (priority < parentPriority)
{
- isopr = true;
RESIZEBUF(in, 2);
sprintf(in->cur, "( ");
in->cur = strchr(in->cur, '\0');
}
+ RESIZEBUF(in, 1);
+ *(in->cur) = '!';
+ in->cur++;
+ *(in->cur) = '\0';
+ in->curpol++;
- infix(in, isopr);
- if (isopr)
+ infix(in, priority);
+ if (priority < parentPriority)
{
RESIZEBUF(in, 2);
sprintf(in->cur, " )");
@@ -701,11 +838,18 @@ infix(INFIX *in, bool first)
else
{
int8 op = in->curpol->qoperator.oper;
+ int priority = PRINT_PRIORITY(in->curpol);
+ int16 distance = in->curpol->qoperator.distance;
INFIX nrm;
+ bool needParenthesis = false;
in->curpol++;
- if (op == OP_OR && !first)
+ if (priority < parentPriority ||
+ (op == OP_PHRASE &&
+ (priority == parentPriority || /* phrases are not commutative! */
+ parentPriority == OP_PRIORITY(OP_AND))))
{
+ needParenthesis = true;
RESIZEBUF(in, 2);
sprintf(in->cur, "( ");
in->cur = strchr(in->cur, '\0');
@@ -717,14 +861,14 @@ infix(INFIX *in, bool first)
nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
/* get right operand */
- infix(&nrm, false);
+ infix(&nrm, priority);
/* get & print left operand */
in->curpol = nrm.curpol;
- infix(in, false);
+ infix(in, priority);
/* print operator & right operand */
- RESIZEBUF(in, 3 + (nrm.cur - nrm.buf));
+ RESIZEBUF(in, 3 + (2 + 10 /* distance */) + (nrm.cur - nrm.buf));
switch (op)
{
case OP_OR:
@@ -733,6 +877,12 @@ infix(INFIX *in, bool first)
case OP_AND:
sprintf(in->cur, " & %s", nrm.buf);
break;
+ case OP_PHRASE:
+ if (distance != 1)
+ sprintf(in->cur, " <%d> %s", distance, nrm.buf);
+ else
+ sprintf(in->cur, " <-> %s", nrm.buf);
+ break;
default:
/* OP_NOT is handled in above if-branch */
elog(ERROR, "unrecognized operator type: %d", op);
@@ -740,7 +890,7 @@ infix(INFIX *in, bool first)
in->cur = strchr(in->cur, '\0');
pfree(nrm.buf);
- if (op == OP_OR && !first)
+ if (needParenthesis)
{
RESIZEBUF(in, 2);
sprintf(in->cur, " )");
@@ -749,7 +899,6 @@ infix(INFIX *in, bool first)
}
}
-
Datum
tsqueryout(PG_FUNCTION_ARGS)
{
@@ -768,7 +917,7 @@ tsqueryout(PG_FUNCTION_ARGS)
nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen);
*(nrm.cur) = '\0';
nrm.op = GETOPERAND(query);
- infix(&nrm, true);
+ infix(&nrm, -1 /* lowest priority */);
PG_FREE_IF_COPY(query, 0);
PG_RETURN_CSTRING(nrm.buf);
@@ -789,7 +938,8 @@ tsqueryout(PG_FUNCTION_ARGS)
*
* For each operator:
* uint8 type, QI_OPR
- * uint8 operator, one of OP_AND, OP_OR, OP_NOT.
+ * uint8 operator, one of OP_AND, OP_PHRASE OP_OR, OP_NOT.
+ * uint16 distance (only for OP_PHRASE)
*/
Datum
tsquerysend(PG_FUNCTION_ARGS)
@@ -815,6 +965,9 @@ tsquerysend(PG_FUNCTION_ARGS)
break;
case QI_OPR:
pq_sendint(&buf, item->qoperator.oper, sizeof(item->qoperator.oper));
+ if (item->qoperator.oper == OP_PHRASE)
+ pq_sendint(&buf, item->qoperator.distance,
+ sizeof(item->qoperator.distance));
break;
default:
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
@@ -830,15 +983,16 @@ tsquerysend(PG_FUNCTION_ARGS)
Datum
tsqueryrecv(PG_FUNCTION_ARGS)
{
- StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
- TSQuery query;
- int i,
- len;
- QueryItem *item;
- int datalen;
- char *ptr;
- uint32 size;
- const char **operands;
+ StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
+ TSQuery query;
+ int i,
+ len;
+ QueryItem *item;
+ int datalen;
+ char *ptr;
+ uint32 size;
+ const char **operands;
+ bool needcleanup;
size = pq_getmsgint(buf, sizeof(uint32));
if (size > (MaxAllocSize / sizeof(QueryItem)))
@@ -907,13 +1061,15 @@ tsqueryrecv(PG_FUNCTION_ARGS)
int8 oper;
oper = (int8) pq_getmsgint(buf, sizeof(int8));
- if (oper != OP_NOT && oper != OP_OR && oper != OP_AND)
+ if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE)
elog(ERROR, "invalid tsquery: unrecognized operator type %d",
(int) oper);
if (i == size - 1)
elog(ERROR, "invalid pointer to right operand");
item->qoperator.oper = oper;
+ if (oper == OP_PHRASE)
+ item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16));
}
else
elog(ERROR, "unrecognized tsquery node type: %d", item->type);
@@ -930,7 +1086,7 @@ tsqueryrecv(PG_FUNCTION_ARGS)
* Fill in the left-pointers. Checks that the tree is well-formed as a
* side-effect.
*/
- findoprnd(item, size);
+ findoprnd(item, size, &needcleanup);
/* Copy operands to output struct */
for (i = 0; i < size; i++)
@@ -949,7 +1105,10 @@ tsqueryrecv(PG_FUNCTION_ARGS)
SET_VARSIZE(query, len + datalen);
- PG_RETURN_TSVECTOR(query);
+ if (needcleanup)
+ PG_RETURN_TSQUERY(cleanup_fakeval_and_phrase(query));
+
+ PG_RETURN_TSQUERY(query);
}
/*