diff options
author | Teodor Sigaev <teodor@sigaev.ru> | 2016-04-07 18:44:18 +0300 |
---|---|---|
committer | Teodor Sigaev <teodor@sigaev.ru> | 2016-04-07 18:44:18 +0300 |
commit | bb140506df605fab58f48926ee1db1f80bdafb59 (patch) | |
tree | 581f9aeb71e3596000af3b4904e0c62a372d77b3 /src/backend/utils/adt/tsquery.c | |
parent | 015e88942aa50f0d419ddac00e63bb06d6e62e86 (diff) | |
download | postgresql-bb140506df605fab58f48926ee1db1f80bdafb59.tar.gz postgresql-bb140506df605fab58f48926ee1db1f80bdafb59.zip |
Phrase full text search.
Patch introduces new text search operator (<-> or <DISTANCE>) into tsquery.
On-disk and binary in/out format of tsquery are backward compatible.
It has two side effect:
- change order for tsquery, so, users, who has a btree index over tsquery,
should reindex it
- less number of parenthesis in tsquery output, and tsquery becomes more
readable
Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov
Reviewers: Alexander Korotkov, Artur Zakirov
Diffstat (limited to 'src/backend/utils/adt/tsquery.c')
-rw-r--r-- | src/backend/utils/adt/tsquery.c | 311 |
1 files changed, 235 insertions, 76 deletions
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 07320606784..257b5d33456 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -56,7 +56,7 @@ struct TSQueryParserStateData /* * subroutine to parse the modifiers (weight and prefix flag currently) - * part, like ':1AB' of a query. + * part, like ':AB*' of a query. */ static char * get_modifiers(char *buf, int16 *weight, bool *prefix) @@ -101,6 +101,94 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) } /* + * Parse phrase operator. The operator + * may take the following forms: + * + * a <X> b (distance is no greater than X) + * a <-> b (default distance = 1) + * + * The buffer should begin with '<' char + */ +static char * +parse_phrase_operator(char *buf, int16 *distance) +{ + enum + { + PHRASE_OPEN = 0, + PHRASE_DIST, + PHRASE_CLOSE, + PHRASE_ERR, + PHRASE_FINISH + } state = PHRASE_OPEN; + + char *ptr = buf; + char *endptr; + long l = 1; + + while (*ptr) + { + switch(state) + { + case PHRASE_OPEN: + Assert(t_iseq(ptr, '<')); + state = PHRASE_DIST; + ptr++; + break; + + case PHRASE_DIST: + if (t_iseq(ptr, '-')) + { + state = PHRASE_CLOSE; + ptr++; + break; + } + else if (!t_isdigit(ptr)) + { + state = PHRASE_ERR; + break; + } + + l = strtol(ptr, &endptr, 10); + if (ptr == endptr) + state = PHRASE_ERR; + else if (errno == ERANGE || l > MAXENTRYPOS) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("distance in phrase operator should not be greater than %d", + MAXENTRYPOS))); + else + { + state = PHRASE_CLOSE; + ptr = endptr; + } + break; + + case PHRASE_CLOSE: + if (t_iseq(ptr, '>')) + { + state = PHRASE_FINISH; + ptr++; + } + else + state = PHRASE_ERR; + break; + + case PHRASE_FINISH: + *distance = (int16) l; + return ptr; + + case PHRASE_ERR: + default: + goto err; + } + } + + err: + *distance = -1; + return buf; +} + +/* * token types for parsing */ typedef enum @@ -116,8 +204,10 @@ typedef enum /* * get token from query string * - * *operator is filled in with OP_* when return values is PT_OPR + * *operator is filled in with OP_* when return values is PT_OPR, + * but *weight could contain a distance value in case of phrase operator. * *strval, *lenval and *weight are filled in when return value is PT_VAL + * */ static ts_tokentype gettoken_query(TSQueryParserState state, @@ -185,13 +275,23 @@ gettoken_query(TSQueryParserState state, (state->buf)++; return PT_OPR; } - if (t_iseq(state->buf, '|')) + else if (t_iseq(state->buf, '|')) { state->state = WAITOPERAND; *operator = OP_OR; (state->buf)++; return PT_OPR; } + else if (t_iseq(state->buf, '<')) + { + state->state = WAITOPERAND; + *operator = OP_PHRASE; + /* weight var is used as storage for distance */ + state->buf = parse_phrase_operator(state->buf, weight); + if (*weight < 0) + return PT_ERR; + return PT_OPR; + } else if (t_iseq(state->buf, ')')) { (state->buf)++; @@ -223,15 +323,16 @@ gettoken_query(TSQueryParserState state, * Push an operator to state->polstr */ void -pushOperator(TSQueryParserState state, int8 oper) +pushOperator(TSQueryParserState state, int8 oper, int16 distance) { QueryOperator *tmp; - Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR); + Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR || oper == OP_PHRASE); tmp = (QueryOperator *) palloc0(sizeof(QueryOperator)); tmp->type = QI_OPR; tmp->oper = oper; + tmp->distance = (oper == OP_PHRASE) ? distance : 0; /* left is filled in later with findoprnd */ state->polstr = lcons(tmp, state->polstr); @@ -330,14 +431,18 @@ makepol(TSQueryParserState state, PushFunction pushval, Datum opaque) { - int8 operator = 0; - ts_tokentype type; - int lenval = 0; - char *strval = NULL; - int8 opstack[STACKDEPTH]; - int lenstack = 0; - int16 weight = 0; - bool prefix; + int8 operator = 0; + ts_tokentype type; + int lenval = 0; + char *strval = NULL; + struct + { + int8 op; + int16 distance; + } opstack[STACKDEPTH]; + int lenstack = 0; + int16 weight = 0; + bool prefix; /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); @@ -348,39 +453,48 @@ makepol(TSQueryParserState state, { case PT_VAL: pushval(opaque, state, strval, lenval, weight, prefix); - while (lenstack && (opstack[lenstack - 1] == OP_AND || - opstack[lenstack - 1] == OP_NOT)) + while (lenstack && (opstack[lenstack - 1].op == OP_AND || + opstack[lenstack - 1].op == OP_PHRASE || + opstack[lenstack - 1].op == OP_NOT)) { lenstack--; - pushOperator(state, opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); } break; case PT_OPR: if (lenstack && operator == OP_OR) - pushOperator(state, OP_OR); + pushOperator(state, OP_OR, 0); else { if (lenstack == STACKDEPTH) /* internal error */ elog(ERROR, "tsquery stack too small"); - opstack[lenstack] = operator; + opstack[lenstack].op = operator; + opstack[lenstack].distance = weight; lenstack++; } break; case PT_OPEN: makepol(state, pushval, opaque); - while (lenstack && (opstack[lenstack - 1] == OP_AND || - opstack[lenstack - 1] == OP_NOT)) + while (lenstack && (opstack[lenstack - 1].op == OP_AND || + opstack[lenstack - 1].op == OP_PHRASE || + opstack[lenstack - 1].op == OP_NOT)) { lenstack--; - pushOperator(state, opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); } break; case PT_CLOSE: while (lenstack) { lenstack--; - pushOperator(state, opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); }; return; case PT_ERR: @@ -394,12 +508,14 @@ makepol(TSQueryParserState state, while (lenstack) { lenstack--; - pushOperator(state, opstack[lenstack]); + pushOperator(state, + opstack[lenstack].op, + opstack[lenstack].distance); } } static void -findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) +findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes, bool *needcleanup) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); @@ -407,10 +523,13 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) if (*pos >= nnodes) elog(ERROR, "malformed tsquery: operand not found"); - if (ptr[*pos].type == QI_VAL || - ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, they - * haven't been cleaned away yet. */ + if (ptr[*pos].type == QI_VAL) + { + (*pos)++; + } + else if (ptr[*pos].type == QI_VALSTOP) { + *needcleanup = true; /* we'll have to remove stop words */ (*pos)++; } else @@ -419,21 +538,32 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) if (ptr[*pos].qoperator.oper == OP_NOT) { - ptr[*pos].qoperator.left = 1; + ptr[*pos].qoperator.left = 1; /* fixed offset */ (*pos)++; - findoprnd_recurse(ptr, pos, nnodes); + + /* process the only argument */ + findoprnd_recurse(ptr, pos, nnodes, needcleanup); } else { - QueryOperator *curitem = &ptr[*pos].qoperator; - int tmp = *pos; + QueryOperator *curitem = &ptr[*pos].qoperator; + int tmp = *pos; /* save current position */ + + Assert(curitem->oper == OP_AND || + curitem->oper == OP_OR || + curitem->oper == OP_PHRASE); - Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); + if (curitem->oper == OP_PHRASE) + *needcleanup = true; /* push OP_PHRASE down later */ (*pos)++; - findoprnd_recurse(ptr, pos, nnodes); - curitem->left = *pos - tmp; - findoprnd_recurse(ptr, pos, nnodes); + + /* process RIGHT argument */ + findoprnd_recurse(ptr, pos, nnodes, needcleanup); + curitem->left = *pos - tmp; /* set LEFT arg's offset */ + + /* process LEFT argument */ + findoprnd_recurse(ptr, pos, nnodes, needcleanup); } } } @@ -444,12 +574,13 @@ findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes) * QueryItems must be in polish (prefix) notation. */ static void -findoprnd(QueryItem *ptr, int size) +findoprnd(QueryItem *ptr, int size, bool *needcleanup) { uint32 pos; + *needcleanup = false; pos = 0; - findoprnd_recurse(ptr, &pos, size); + findoprnd_recurse(ptr, &pos, size, needcleanup); if (pos != size) elog(ERROR, "malformed tsquery: extra nodes"); @@ -466,9 +597,6 @@ findoprnd(QueryItem *ptr, int size) * * opaque is passed on to pushval as is, pushval can use it to store its * private state. - * - * The returned query might contain QI_STOPVAL nodes. The caller is responsible - * for cleaning them up (with clean_fakeval) */ TSQuery parse_tsquery(char *buf, @@ -482,6 +610,7 @@ parse_tsquery(char *buf, int commonlen; QueryItem *ptr; ListCell *cell; + bool needcleanup; /* init state */ state.buffer = buf; @@ -531,7 +660,7 @@ parse_tsquery(char *buf, i = 0; foreach(cell, state.polstr) { - QueryItem *item = (QueryItem *) lfirst(cell); + QueryItem *item = (QueryItem *) lfirst(cell); switch (item->type) { @@ -555,7 +684,14 @@ parse_tsquery(char *buf, pfree(state.op); /* Set left operand pointers for every operator. */ - findoprnd(ptr, query->size); + findoprnd(ptr, query->size, &needcleanup); + + /* + * QI_VALSTOP nodes should be cleaned and + * and OP_PHRASE should be pushed down + */ + if (needcleanup) + return cleanup_fakeval_and_phrase(query); return query; } @@ -600,12 +736,15 @@ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ (inf)->cur = (inf)->buf + len; \ } +#define PRINT_PRIORITY(x) \ + ( (QO_PRIORITY(x) == OP_NOT) ? OP_NOT_PHRASE : QO_PRIORITY(x) ) + /* - * recursive walk on tree and print it in - * infix (human-readable) view + * recursively traverse the tree and + * print it in infix (human-readable) form */ static void -infix(INFIX *in, bool first) +infix(INFIX *in, int parentPriority) { /* since this function recurses, it could be driven to stack overflow. */ check_stack_depth(); @@ -674,24 +813,22 @@ infix(INFIX *in, bool first) } else if (in->curpol->qoperator.oper == OP_NOT) { - bool isopr = false; + int priority = PRINT_PRIORITY(in->curpol); - RESIZEBUF(in, 1); - *(in->cur) = '!'; - in->cur++; - *(in->cur) = '\0'; - in->curpol++; - - if (in->curpol->type == QI_OPR) + if (priority < parentPriority) { - isopr = true; RESIZEBUF(in, 2); sprintf(in->cur, "( "); in->cur = strchr(in->cur, '\0'); } + RESIZEBUF(in, 1); + *(in->cur) = '!'; + in->cur++; + *(in->cur) = '\0'; + in->curpol++; - infix(in, isopr); - if (isopr) + infix(in, priority); + if (priority < parentPriority) { RESIZEBUF(in, 2); sprintf(in->cur, " )"); @@ -701,11 +838,18 @@ infix(INFIX *in, bool first) else { int8 op = in->curpol->qoperator.oper; + int priority = PRINT_PRIORITY(in->curpol); + int16 distance = in->curpol->qoperator.distance; INFIX nrm; + bool needParenthesis = false; in->curpol++; - if (op == OP_OR && !first) + if (priority < parentPriority || + (op == OP_PHRASE && + (priority == parentPriority || /* phrases are not commutative! */ + parentPriority == OP_PRIORITY(OP_AND)))) { + needParenthesis = true; RESIZEBUF(in, 2); sprintf(in->cur, "( "); in->cur = strchr(in->cur, '\0'); @@ -717,14 +861,14 @@ infix(INFIX *in, bool first) nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); /* get right operand */ - infix(&nrm, false); + infix(&nrm, priority); /* get & print left operand */ in->curpol = nrm.curpol; - infix(in, false); + infix(in, priority); /* print operator & right operand */ - RESIZEBUF(in, 3 + (nrm.cur - nrm.buf)); + RESIZEBUF(in, 3 + (2 + 10 /* distance */) + (nrm.cur - nrm.buf)); switch (op) { case OP_OR: @@ -733,6 +877,12 @@ infix(INFIX *in, bool first) case OP_AND: sprintf(in->cur, " & %s", nrm.buf); break; + case OP_PHRASE: + if (distance != 1) + sprintf(in->cur, " <%d> %s", distance, nrm.buf); + else + sprintf(in->cur, " <-> %s", nrm.buf); + break; default: /* OP_NOT is handled in above if-branch */ elog(ERROR, "unrecognized operator type: %d", op); @@ -740,7 +890,7 @@ infix(INFIX *in, bool first) in->cur = strchr(in->cur, '\0'); pfree(nrm.buf); - if (op == OP_OR && !first) + if (needParenthesis) { RESIZEBUF(in, 2); sprintf(in->cur, " )"); @@ -749,7 +899,6 @@ infix(INFIX *in, bool first) } } - Datum tsqueryout(PG_FUNCTION_ARGS) { @@ -768,7 +917,7 @@ tsqueryout(PG_FUNCTION_ARGS) nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); *(nrm.cur) = '\0'; nrm.op = GETOPERAND(query); - infix(&nrm, true); + infix(&nrm, -1 /* lowest priority */); PG_FREE_IF_COPY(query, 0); PG_RETURN_CSTRING(nrm.buf); @@ -789,7 +938,8 @@ tsqueryout(PG_FUNCTION_ARGS) * * For each operator: * uint8 type, QI_OPR - * uint8 operator, one of OP_AND, OP_OR, OP_NOT. + * uint8 operator, one of OP_AND, OP_PHRASE OP_OR, OP_NOT. + * uint16 distance (only for OP_PHRASE) */ Datum tsquerysend(PG_FUNCTION_ARGS) @@ -815,6 +965,9 @@ tsquerysend(PG_FUNCTION_ARGS) break; case QI_OPR: pq_sendint(&buf, item->qoperator.oper, sizeof(item->qoperator.oper)); + if (item->qoperator.oper == OP_PHRASE) + pq_sendint(&buf, item->qoperator.distance, + sizeof(item->qoperator.distance)); break; default: elog(ERROR, "unrecognized tsquery node type: %d", item->type); @@ -830,15 +983,16 @@ tsquerysend(PG_FUNCTION_ARGS) Datum tsqueryrecv(PG_FUNCTION_ARGS) { - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - TSQuery query; - int i, - len; - QueryItem *item; - int datalen; - char *ptr; - uint32 size; - const char **operands; + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSQuery query; + int i, + len; + QueryItem *item; + int datalen; + char *ptr; + uint32 size; + const char **operands; + bool needcleanup; size = pq_getmsgint(buf, sizeof(uint32)); if (size > (MaxAllocSize / sizeof(QueryItem))) @@ -907,13 +1061,15 @@ tsqueryrecv(PG_FUNCTION_ARGS) int8 oper; oper = (int8) pq_getmsgint(buf, sizeof(int8)); - if (oper != OP_NOT && oper != OP_OR && oper != OP_AND) + if (oper != OP_NOT && oper != OP_OR && oper != OP_AND && oper != OP_PHRASE) elog(ERROR, "invalid tsquery: unrecognized operator type %d", (int) oper); if (i == size - 1) elog(ERROR, "invalid pointer to right operand"); item->qoperator.oper = oper; + if (oper == OP_PHRASE) + item->qoperator.distance = (int16) pq_getmsgint(buf, sizeof(int16)); } else elog(ERROR, "unrecognized tsquery node type: %d", item->type); @@ -930,7 +1086,7 @@ tsqueryrecv(PG_FUNCTION_ARGS) * Fill in the left-pointers. Checks that the tree is well-formed as a * side-effect. */ - findoprnd(item, size); + findoprnd(item, size, &needcleanup); /* Copy operands to output struct */ for (i = 0; i < size; i++) @@ -949,7 +1105,10 @@ tsqueryrecv(PG_FUNCTION_ARGS) SET_VARSIZE(query, len + datalen); - PG_RETURN_TSVECTOR(query); + if (needcleanup) + PG_RETURN_TSQUERY(cleanup_fakeval_and_phrase(query)); + + PG_RETURN_TSQUERY(query); } /* |