diff options
author | Teodor Sigaev <teodor@sigaev.ru> | 2007-09-07 15:09:56 +0000 |
---|---|---|
committer | Teodor Sigaev <teodor@sigaev.ru> | 2007-09-07 15:09:56 +0000 |
commit | e5be89981fc70648eedb325781cf2fbd4da05ba8 (patch) | |
tree | 8a2c587b7ae857fcbe2c2ed4ff21772c11d935a3 /src/backend/utils/adt/tsquery.c | |
parent | da1248401d00bce0cb75fa3ab278f906f407bec9 (diff) | |
download | postgresql-e5be89981fc70648eedb325781cf2fbd4da05ba8.tar.gz postgresql-e5be89981fc70648eedb325781cf2fbd4da05ba8.zip |
Refactoring by Heikki Linnakangas <heikki@enterprisedb.com> with
small editorization by me
- Brake the QueryItem struct into QueryOperator and QueryOperand.
Type was really the only common field between them. QueryItem still
exists, and is used in the TSQuery struct as before, but it's now a
union of the two. Many other changes fell from that, like separation
of pushval_asis function into pushValue, pushOperator and pushStop.
- Moved some structs that were for internal use only from header files
to the right .c-files.
- Moved tsvector parser to a new tsvector_parser.c file. Parser code was
about half of the size of tsvector.c, it's also used from tsquery.c, and
it has some data structures of its own, so it seems better to separate
it. Cleaned up the API so that TSVectorParserState is not accessed from
outside tsvector_parser.c.
- Separated enumerations (#defines, really) used for QueryItem.type
field and as return codes from gettoken_query. It was just accidental
code sharing.
- Removed ParseQueryNode struct used internally by makepol and friends.
push*-functions now construct QueryItems directly.
- Changed int4 variables to just ints for variables like "i" or "array
size", where the storage-size was not significant.
Diffstat (limited to 'src/backend/utils/adt/tsquery.c')
-rw-r--r-- | src/backend/utils/adt/tsquery.c | 582 |
1 files changed, 376 insertions, 206 deletions
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 83759728ff9..27b93eb64d7 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.2 2007/08/31 02:26:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -23,6 +23,29 @@ #include "utils/pg_crc.h" +struct TSQueryParserStateData +{ + /* State for gettoken_query */ + char *buffer; /* entire string we are scanning */ + char *buf; /* current scan point */ + int state; + int count; /* nesting count, incremented by (, + decremented by ) */ + + /* polish (prefix) notation in list, filled in by push* functions */ + List *polstr; + + /* Strings from operands are collected in op. curop is a pointer to + * the end of used space of op. */ + char *op; + char *curop; + int lenop; /* allocated size of op */ + int sumlen; /* used size of op */ + + /* state for value's parser */ + TSVectorParseState valstate; +}; + /* parser's states */ #define WAITOPERAND 1 #define WAITOPERATOR 2 @@ -30,21 +53,10 @@ #define WAITSINGLEOPERAND 4 /* - * node of query tree, also used - * for storing polish notation in parser + * subroutine to parse the weight part, like ':1AB' of a query. */ -typedef struct ParseQueryNode -{ - int2 weight; - int2 type; - int4 val; - int2 distance; - int2 length; - struct ParseQueryNode *next; -} ParseQueryNode; - static char * -get_weight(char *buf, int2 *weight) +get_weight(char *buf, int16 *weight) { *weight = 0; @@ -82,10 +94,27 @@ get_weight(char *buf, int2 *weight) } /* + * token types for parsing + */ +typedef enum { + PT_END = 0, + PT_ERR = 1, + PT_VAL = 2, + PT_OPR = 3, + PT_OPEN = 4, + PT_CLOSE = 5, +} ts_tokentype; + +/* * get token from query string + * + * *operator is filled in with OP_* when return values is PT_OPR + * *strval, *lenval and *weight are filled in when return value is PT_VAL */ -static int4 -gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strval, int2 *weight) +static ts_tokentype +gettoken_query(TSQueryParserState state, + int8 *operator, + int *lenval, char **strval, int16 *weight) { while (1) { @@ -97,16 +126,16 @@ gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strva { (state->buf)++; /* can safely ++, t_iseq guarantee * that pg_mblen()==1 */ - *val = (int4) '!'; + *operator = OP_NOT; state->state = WAITOPERAND; - return OPR; + return PT_OPR; } else if (t_iseq(state->buf, '(')) { state->count++; (state->buf)++; state->state = WAITOPERAND; - return OPEN; + return PT_OPEN; } else if (t_iseq(state->buf, ':')) { @@ -117,17 +146,16 @@ gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strva } else if (!t_isspace(state->buf)) { - state->valstate.prsbuf = state->buf; - if (gettoken_tsvector(&(state->valstate))) + /* We rely on the tsvector parser to parse the value for us */ + reset_tsvector_parser(state->valstate, state->buf); + if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf)) { - *strval = state->valstate.word; - *lenval = state->valstate.curpos - state->valstate.word; - state->buf = get_weight(state->valstate.prsbuf, weight); + state->buf = get_weight(state->buf, weight); state->state = WAITOPERATOR; - return VAL; + return PT_VAL; } else if (state->state == WAITFIRSTOPERAND) - return END; + return PT_END; else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -136,52 +164,71 @@ gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strva } break; case WAITOPERATOR: - if (t_iseq(state->buf, '&') || t_iseq(state->buf, '|')) + if (t_iseq(state->buf, '&')) + { + state->state = WAITOPERAND; + *operator = OP_AND; + (state->buf)++; + return PT_OPR; + } + if (t_iseq(state->buf, '|')) { state->state = WAITOPERAND; - *val = (int4) *(state->buf); + *operator = OP_OR; (state->buf)++; - return OPR; + return PT_OPR; } else if (t_iseq(state->buf, ')')) { (state->buf)++; state->count--; - return (state->count < 0) ? ERR : CLOSE; + return (state->count < 0) ? PT_ERR : PT_CLOSE; } else if (*(state->buf) == '\0') - return (state->count) ? ERR : END; + return (state->count) ? PT_ERR : PT_END; else if (!t_isspace(state->buf)) - return ERR; + return PT_ERR; break; case WAITSINGLEOPERAND: if (*(state->buf) == '\0') - return END; + return PT_END; *strval = state->buf; *lenval = strlen(state->buf); state->buf += strlen(state->buf); state->count++; - return VAL; + return PT_VAL; default: - return ERR; + return PT_ERR; break; } state->buf += pg_mblen(state->buf); } - return END; + return PT_END; } /* - * push new one in polish notation reverse view + * Push an operator to state->polstr */ void -pushquery(TSQueryParserState * state, int4 type, int4 val, int4 distance, int4 lenval, int2 weight) +pushOperator(TSQueryParserState state, int8 oper) { - ParseQueryNode *tmp = (ParseQueryNode *) palloc(sizeof(ParseQueryNode)); + QueryOperator *tmp; + + Assert(oper == OP_NOT || oper == OP_AND || oper == OP_OR); + + tmp = (QueryOperator *) palloc(sizeof(QueryOperator)); + tmp->type = QI_OPR; + tmp->oper = oper; + /* left is filled in later with findoprnd */ + + state->polstr = lcons(tmp, state->polstr); +} + +static void +pushValue_internal(TSQueryParserState state, pg_crc32 valcrc, int distance, int lenval, int weight) +{ + QueryOperand *tmp; - tmp->weight = weight; - tmp->type = type; - tmp->val = val; if (distance >= MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -192,20 +239,27 @@ pushquery(TSQueryParserState * state, int4 type, int4 val, int4 distance, int4 l (errcode(ERRCODE_SYNTAX_ERROR), errmsg("operand is too long in tsearch query: \"%s\"", state->buffer))); - tmp->distance = distance; + + tmp = (QueryOperand *) palloc(sizeof(QueryOperand)); + tmp->type = QI_VAL; + tmp->weight = weight; + tmp->valcrc = (int32) valcrc; tmp->length = lenval; - tmp->next = state->str; - state->str = tmp; - state->num++; + tmp->distance = distance; + + state->polstr = lcons(tmp, state->polstr); } /* - * This function is used for tsquery parsing + * Push an operand to state->polstr. + * + * strval must point to a string equal to state->curop. lenval is the length + * of the string. */ void -pushval_asis(TSQueryParserState * state, int type, char *strval, int lenval, int2 weight) +pushValue(TSQueryParserState state, char *strval, int lenval, int2 weight) { - pg_crc32 c; + pg_crc32 valcrc; if (lenval >= MAXSTRLEN) ereport(ERROR, @@ -213,162 +267,202 @@ pushval_asis(TSQueryParserState * state, int type, char *strval, int lenval, int errmsg("word is too long in tsearch query: \"%s\"", state->buffer))); - INIT_CRC32(c); - COMP_CRC32(c, strval, lenval); - FIN_CRC32(c); - pushquery(state, type, *(int4 *) &c, - state->curop - state->op, lenval, weight); + INIT_CRC32(valcrc); + COMP_CRC32(valcrc, strval, lenval); + FIN_CRC32(valcrc); + pushValue_internal(state, valcrc, state->curop - state->op, lenval, weight); + /* append the value string to state.op, enlarging buffer if needed first */ while (state->curop - state->op + lenval + 1 >= state->lenop) { - int4 tmp = state->curop - state->op; + int used = state->curop - state->op; state->lenop *= 2; state->op = (char *) repalloc((void *) state->op, state->lenop); - state->curop = state->op + tmp; + state->curop = state->op + used; } memcpy((void *) state->curop, (void *) strval, lenval); state->curop += lenval; *(state->curop) = '\0'; state->curop++; state->sumlen += lenval + 1 /* \0 */ ; - return; } + +/* + * Push a stopword placeholder to state->polstr + */ +void +pushStop(TSQueryParserState state) +{ + QueryOperand *tmp; + + tmp = (QueryOperand *) palloc(sizeof(QueryOperand)); + tmp->type = QI_VALSTOP; + + state->polstr = lcons(tmp, state->polstr); +} + + #define STACKDEPTH 32 /* - * make polish notation of query + * Make polish (prefix) notation of query. + * + * See parse_tsquery for explanation of pushval. */ -static int4 -makepol(TSQueryParserState * state, - void (*pushval) (TSQueryParserState *, int, char *, int, int2)) +static void +makepol(TSQueryParserState state, + PushFunction pushval, + void *opaque) { - int4 val = 0, - type; - int4 lenval = 0; + int8 operator = 0; + ts_tokentype type; + int lenval = 0; char *strval = NULL; - int4 stack[STACKDEPTH]; - int4 lenstack = 0; - int2 weight = 0; + int8 opstack[STACKDEPTH]; + int lenstack = 0; + int16 weight = 0; /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); - while ((type = gettoken_query(state, &val, &lenval, &strval, &weight)) != END) + while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight)) != PT_END) { switch (type) { - case VAL: - pushval(state, VAL, strval, lenval, weight); - while (lenstack && (stack[lenstack - 1] == (int4) '&' || - stack[lenstack - 1] == (int4) '!')) + case PT_VAL: + pushval(opaque, state, strval, lenval, weight); + while (lenstack && (opstack[lenstack - 1] == OP_AND || + opstack[lenstack - 1] == OP_NOT)) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); + pushOperator(state, opstack[lenstack]); } break; - case OPR: - if (lenstack && val == (int4) '|') - pushquery(state, OPR, val, 0, 0, 0); + case PT_OPR: + if (lenstack && operator == OP_OR) + pushOperator(state, OP_OR); else { if (lenstack == STACKDEPTH) /* internal error */ elog(ERROR, "tsquery stack too small"); - stack[lenstack] = val; + opstack[lenstack] = operator; lenstack++; } break; - case OPEN: - if (makepol(state, pushval) == ERR) - return ERR; - if (lenstack && (stack[lenstack - 1] == (int4) '&' || - stack[lenstack - 1] == (int4) '!')) + case PT_OPEN: + makepol(state, pushval, opaque); + + if (lenstack && (opstack[lenstack - 1] == OP_AND || + opstack[lenstack - 1] == OP_NOT)) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); + pushOperator(state, opstack[lenstack]); } break; - case CLOSE: + case PT_CLOSE: while (lenstack) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); + pushOperator(state, opstack[lenstack]); }; - return END; - break; - case ERR: + return; + case PT_ERR: default: ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error in tsearch query: \"%s\"", state->buffer))); - return ERR; - } } while (lenstack) { lenstack--; - pushquery(state, OPR, stack[lenstack], 0, 0, 0); - }; - return END; + pushOperator(state, opstack[lenstack]); + } } +/* + * Fills in the left-fields previously left unfilled. The input + * QueryItems must be in polish (prefix) notation. + */ static void -findoprnd(QueryItem * ptr, int4 *pos) +findoprnd(QueryItem *ptr, int *pos) { - if (ptr[*pos].type == VAL || ptr[*pos].type == VALSTOP) - { - ptr[*pos].left = 0; - (*pos)++; - } - else if (ptr[*pos].val == (int4) '!') + /* since this function recurses, it could be driven to stack overflow. */ + check_stack_depth(); + + if (ptr[*pos].type == QI_VAL || + ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, + * they haven't been cleansed + * away yet. + */ { - ptr[*pos].left = 1; (*pos)++; - findoprnd(ptr, pos); } - else + else { - QueryItem *curitem = &ptr[*pos]; - int4 tmp = *pos; + Assert(ptr[*pos].type == QI_OPR); - (*pos)++; - findoprnd(ptr, pos); - curitem->left = *pos - tmp; - findoprnd(ptr, pos); + if (ptr[*pos].operator.oper == OP_NOT) + { + ptr[*pos].operator.left = 1; + (*pos)++; + findoprnd(ptr, pos); + } + else + { + QueryOperator *curitem = &ptr[*pos].operator; + int tmp = *pos; + + Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); + + (*pos)++; + findoprnd(ptr, pos); + curitem->left = *pos - tmp; + findoprnd(ptr, pos); + } } } - /* - * input + * Each value (operand) in the query is be passed to pushval. pushval can + * transform the simple value to an arbitrarily complex expression using + * pushValue and pushOperator. It must push a single value with pushValue, + * a complete expression with all operands, or a a stopword placeholder + * with pushStop, otherwise the prefix notation representation will be broken, + * having an operator with no operand. + * + * opaque is passed on to pushval as is, pushval can use it to store its + * private state. + * + * The returned query might contain QI_STOPVAL nodes. The caller is responsible + * for cleaning them up (with clean_fakeval) */ TSQuery -parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int, int2), Oid cfg_id, bool isplain) +parse_tsquery(char *buf, + PushFunction pushval, + void *opaque, + bool isplain) { - TSQueryParserState state; - int4 i; + struct TSQueryParserStateData state; + int i; TSQuery query; - int4 commonlen; + int commonlen; QueryItem *ptr; - ParseQueryNode *tmp; - int4 pos = 0; + int pos = 0; + ListCell *cell; /* init state */ state.buffer = buf; state.buf = buf; state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; state.count = 0; - state.num = 0; - state.str = NULL; - state.cfg_id = cfg_id; + state.polstr = NIL; /* init value parser's state */ - state.valstate.oprisdelim = true; - state.valstate.len = 32; - state.valstate.word = (char *) palloc(state.valstate.len); + state.valstate = init_tsvector_parser(NULL, true); /* init list of operand */ state.sumlen = 0; @@ -377,9 +471,11 @@ parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int *(state.curop) = '\0'; /* parse query & make polish notation (postfix, but in reverse order) */ - makepol(&state, pushval); - pfree(state.valstate.word); - if (!state.num) + makepol(&state, pushval, opaque); + + close_tsvector_parser(state.valstate); + + if (list_length(state.polstr) == 0) { ereport(NOTICE, (errmsg("tsearch query doesn't contain lexeme(s): \"%s\"", @@ -390,37 +486,54 @@ parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int return query; } - /* make finish struct */ - commonlen = COMPUTESIZE(state.num, state.sumlen); - query = (TSQuery) palloc(commonlen); + /* Pack the QueryItems in the final TSQuery struct to return to caller */ + commonlen = COMPUTESIZE(list_length(state.polstr), state.sumlen); + query = (TSQuery) palloc0(commonlen); SET_VARSIZE(query, commonlen); - query->size = state.num; + query->size = list_length(state.polstr); ptr = GETQUERY(query); - /* set item in polish notation */ - for (i = 0; i < state.num; i++) + /* Copy QueryItems to TSQuery */ + i = 0; + foreach(cell, state.polstr) { - ptr[i].weight = state.str->weight; - ptr[i].type = state.str->type; - ptr[i].val = state.str->val; - ptr[i].distance = state.str->distance; - ptr[i].length = state.str->length; - tmp = state.str->next; - pfree(state.str); - state.str = tmp; + QueryItem *item = (QueryItem *) lfirst(cell); + + switch(item->type) + { + case QI_VAL: + memcpy(&ptr[i], item, sizeof(QueryOperand)); + break; + case QI_VALSTOP: + ptr[i].type = QI_VALSTOP; + break; + case QI_OPR: + memcpy(&ptr[i], item, sizeof(QueryOperator)); + break; + default: + elog(ERROR, "unknown QueryItem type %d", item->type); + } + i++; } - /* set user friendly-operand view */ + /* Copy all the operand strings to TSQuery */ memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen); pfree(state.op); - /* set left operand's position for every operator */ + /* Set left operand pointers for every operator. */ pos = 0; findoprnd(ptr, &pos); return query; } +static void +pushval_asis(void *opaque, TSQueryParserState state, char *strval, int lenval, + int16 weight) +{ + pushValue(state, strval, lenval, weight); +} + /* * in without morphology */ @@ -431,7 +544,7 @@ tsqueryin(PG_FUNCTION_ARGS) pg_verifymbstr(in, strlen(in), false); - PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, InvalidOid, false)); + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, NULL, false)); } /* @@ -443,13 +556,14 @@ typedef struct char *buf; char *cur; char *op; - int4 buflen; + int buflen; } INFIX; -#define RESIZEBUF(inf,addsize) \ +/* Makes sure inf->buf is large enough for adding 'addsize' bytes */ +#define RESIZEBUF(inf, addsize) \ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ { \ - int4 len = (inf)->cur - (inf)->buf; \ + int len = (inf)->cur - (inf)->buf; \ (inf)->buflen *= 2; \ (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \ (inf)->cur = (inf)->buf + len; \ @@ -462,12 +576,16 @@ while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ static void infix(INFIX * in, bool first) { - if (in->curpol->type == VAL) + /* since this function recurses, it could be driven to stack overflow. */ + check_stack_depth(); + + if (in->curpol->type == QI_VAL) { - char *op = in->op + in->curpol->distance; + QueryOperand *curpol = &in->curpol->operand; + char *op = in->op + curpol->distance; int clen; - RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5); + RESIZEBUF(in, curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5); *(in->cur) = '\''; in->cur++; while (*op) @@ -485,26 +603,26 @@ infix(INFIX * in, bool first) } *(in->cur) = '\''; in->cur++; - if (in->curpol->weight) + if (curpol->weight) { *(in->cur) = ':'; in->cur++; - if (in->curpol->weight & (1 << 3)) + if (curpol->weight & (1 << 3)) { *(in->cur) = 'A'; in->cur++; } - if (in->curpol->weight & (1 << 2)) + if (curpol->weight & (1 << 2)) { *(in->cur) = 'B'; in->cur++; } - if (in->curpol->weight & (1 << 1)) + if (curpol->weight & (1 << 1)) { *(in->cur) = 'C'; in->cur++; } - if (in->curpol->weight & 1) + if (curpol->weight & 1) { *(in->cur) = 'D'; in->cur++; @@ -513,7 +631,7 @@ infix(INFIX * in, bool first) *(in->cur) = '\0'; in->curpol++; } - else if (in->curpol->val == (int4) '!') + else if (in->curpol->operator.oper == OP_NOT) { bool isopr = false; @@ -522,13 +640,15 @@ infix(INFIX * in, bool first) in->cur++; *(in->cur) = '\0'; in->curpol++; - if (in->curpol->type == OPR) + + if (in->curpol->type == QI_OPR) { isopr = true; RESIZEBUF(in, 2); sprintf(in->cur, "( "); in->cur = strchr(in->cur, '\0'); } + infix(in, isopr); if (isopr) { @@ -539,11 +659,11 @@ infix(INFIX * in, bool first) } else { - int4 op = in->curpol->val; + int8 op = in->curpol->operator.oper; INFIX nrm; in->curpol++; - if (op == (int4) '|' && !first) + if (op == OP_OR && !first) { RESIZEBUF(in, 2); sprintf(in->cur, "( "); @@ -564,11 +684,22 @@ infix(INFIX * in, bool first) /* print operator & right operand */ RESIZEBUF(in, 3 + (nrm.cur - nrm.buf)); - sprintf(in->cur, " %c %s", op, nrm.buf); + switch(op) + { + case OP_OR: + sprintf(in->cur, " | %s", nrm.buf); + break; + case OP_AND: + sprintf(in->cur, " & %s", nrm.buf); + break; + default: + /* OP_NOT is handled in above if-branch*/ + elog(ERROR, "unexpected operator type %d", op); + } in->cur = strchr(in->cur, '\0'); pfree(nrm.buf); - if (op == (int4) '|' && !first) + if (op == OP_OR && !first) { RESIZEBUF(in, 2); sprintf(in->cur, " )"); @@ -615,28 +746,33 @@ tsquerysend(PG_FUNCTION_ARGS) pq_sendint(&buf, query->size, sizeof(int32)); for (i = 0; i < query->size; i++) { - int tmp; - pq_sendint(&buf, item->type, sizeof(item->type)); - pq_sendint(&buf, item->weight, sizeof(item->weight)); - pq_sendint(&buf, item->left, sizeof(item->left)); - pq_sendint(&buf, item->val, sizeof(item->val)); - - /* - * We are sure that sizeof(WordEntry) == sizeof(int32), and about - * layout of QueryItem - */ - tmp = *(int32 *) (((char *) item) + HDRSIZEQI); - pq_sendint(&buf, tmp, sizeof(tmp)); + switch(item->type) + { + case QI_VAL: + pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight)); + pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc)); + pq_sendint(&buf, item->operand.length, sizeof(int16)); + /* istrue flag is just for temporary use in tsrank.c/Cover, + * so we don't need to transfer that */ + break; + case QI_OPR: + pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper)); + if (item->operator.oper != OP_NOT) + pq_sendint(&buf, item->operator.left, sizeof(item->operator.left)); + break; + default: + elog(ERROR, "unknown tsquery node type %d", item->type); + } item++; } item = GETQUERY(query); for (i = 0; i < query->size; i++) { - if (item->type == VAL) - pq_sendbytes(&buf, GETOPERAND(query) + item->distance, item->length); + if (item->type == QI_VAL) + pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length); item++; } @@ -652,8 +788,7 @@ tsqueryrecv(PG_FUNCTION_ARGS) TSQuery query; int i, size, - tmp, - len = HDRSIZETQ; + len; QueryItem *item; int datalen = 0; char *ptr; @@ -661,7 +796,8 @@ tsqueryrecv(PG_FUNCTION_ARGS) size = pq_getmsgint(buf, sizeof(uint32)); if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) elog(ERROR, "invalid size of tsquery"); - len += sizeof(QueryItem) * size; + + len = HDRSIZETQ + sizeof(QueryItem) * size; query = (TSQuery) palloc(len); query->size = size; @@ -670,32 +806,67 @@ tsqueryrecv(PG_FUNCTION_ARGS) for (i = 0; i < size; i++) { item->type = (int8) pq_getmsgint(buf, sizeof(int8)); - item->weight = (int8) pq_getmsgint(buf, sizeof(int8)); - item->left = (int16) pq_getmsgint(buf, sizeof(int16)); - item->val = (int32) pq_getmsgint(buf, sizeof(int32)); - tmp = pq_getmsgint(buf, sizeof(int32)); - memcpy((((char *) item) + HDRSIZEQI), &tmp, sizeof(int32)); - - /* - * Sanity checks - */ - if (item->type == VAL) - { - datalen += item->length + 1; /* \0 */ - } - else if (item->type == OPR) + + switch(item->type) { - if (item->val == '|' || item->val == '&') - { - if (item->left <= 0 || i + item->left >= size) - elog(ERROR, "invalid pointer to left operand"); - } + case QI_VAL: + item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8)); + item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32)); + item->operand.length = pq_getmsgint(buf, sizeof(int16)); + + /* + * Check that datalen doesn't grow too large. Without the + * check, a malicious client could induce a buffer overflow + * by sending a tsquery whose size exceeds 2GB. datalen + * would overflow, we would allocate a too small buffer below, + * and overflow the buffer. Because operand.length is a 20-bit + * field, adding one such value to datalen must exceed + * MaxAllocSize before wrapping over the 32-bit datalen field, + * so this check will protect from it. + */ + if (datalen > MAXSTRLEN) + elog(ERROR, "invalid tsquery; total operand length exceeded"); + + /* We can calculate distance from datalen, no need to send it + * through the wire. If we did, we would have to check that + * it's valid anyway. + */ + item->operand.distance = datalen; + + datalen += item->operand.length + 1; /* \0 */ - if (i == size - 1) - elog(ERROR, "invalid pointer to right operand"); + break; + case QI_OPR: + item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8)); + if (item->operator.oper != OP_NOT && + item->operator.oper != OP_OR && + item->operator.oper != OP_AND) + elog(ERROR, "unknown operator type %d", (int) item->operator.oper); + if(item->operator.oper != OP_NOT) + { + item->operator.left = (int16) pq_getmsgint(buf, sizeof(int16)); + /* + * Sanity checks + */ + if (item->operator.left <= 0 || i + item->operator.left >= size) + elog(ERROR, "invalid pointer to left operand"); + + /* XXX: Though there's no way to construct a TSQuery that's + * not in polish notation, we don't enforce that for + * queries received from client in binary mode. Is there + * anything that relies on it? + * + * XXX: The tree could be malformed in other ways too, + * a node could have two parents, for example. + */ + } + + if (i == size - 1) + elog(ERROR, "invalid pointer to right operand"); + break; + default: + elog(ERROR, "unknown tsquery node type %d", item->type); } - else - elog(ERROR, "unknown tsquery node type"); item++; } @@ -706,13 +877,12 @@ tsqueryrecv(PG_FUNCTION_ARGS) ptr = GETOPERAND(query); for (i = 0; i < size; i++) { - if (item->type == VAL) + if (item->type == QI_VAL) { - item->distance = ptr - GETOPERAND(query); memcpy(ptr, - pq_getmsgbytes(buf, item->length), - item->length); - ptr += item->length; + pq_getmsgbytes(buf, item->operand.length), + item->operand.length); + ptr += item->operand.length; *ptr++ = '\0'; } item++; @@ -736,7 +906,7 @@ tsquerytree(PG_FUNCTION_ARGS) INFIX nrm; text *res; QueryItem *q; - int4 len; + int len; if (query->size == 0) { |