diff options
Diffstat (limited to 'src/backend/utils/adt/tsquery.c')
-rw-r--r-- | src/backend/utils/adt/tsquery.c | 767 |
1 files changed, 767 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c new file mode 100644 index 00000000000..1f8abb3298e --- /dev/null +++ b/src/backend/utils/adt/tsquery.c @@ -0,0 +1,767 @@ +/*------------------------------------------------------------------------- + * + * tsquery.c + * I/O functions for tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/memutils.h" +#include "utils/pg_crc.h" + +/* parser's states */ +#define WAITOPERAND 1 +#define WAITOPERATOR 2 +#define WAITFIRSTOPERAND 3 +#define WAITSINGLEOPERAND 4 + +/* + * node of query tree, also used + * for storing polish notation in parser + */ +typedef struct ParseQueryNode +{ + int2 weight; + int2 type; + int4 val; + int2 distance; + int2 length; + struct ParseQueryNode *next; +} ParseQueryNode; + +static char * +get_weight(char *buf, int2 *weight) +{ + *weight = 0; + + if (!t_iseq(buf, ':')) + return buf; + + buf++; + while (*buf && pg_mblen(buf) == 1) + { + switch (*buf) + { + case 'a': + case 'A': + *weight |= 1 << 3; + break; + case 'b': + case 'B': + *weight |= 1 << 2; + break; + case 'c': + case 'C': + *weight |= 1 << 1; + break; + case 'd': + case 'D': + *weight |= 1; + break; + default: + return buf; + } + buf++; + } + + return buf; +} + +/* + * get token from query string + */ +static int4 +gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strval, int2 *weight) +{ + while (1) + { + switch (state->state) + { + case WAITFIRSTOPERAND: + case WAITOPERAND: + if (t_iseq(state->buf, '!')) + { + (state->buf)++; /* can safely ++, t_iseq guarantee + * that pg_mblen()==1 */ + *val = (int4) '!'; + state->state = WAITOPERAND; + return OPR; + } + else if (t_iseq(state->buf, '(')) + { + state->count++; + (state->buf)++; + state->state = WAITOPERAND; + return OPEN; + } + else if (t_iseq(state->buf, ':')) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error at start of operand in tsearch query: \"%s\"", + state->buffer))); + } + else if (!t_isspace(state->buf)) + { + state->valstate.prsbuf = state->buf; + if (gettoken_tsvector(&(state->valstate))) + { + *strval = state->valstate.word; + *lenval = state->valstate.curpos - state->valstate.word; + state->buf = get_weight(state->valstate.prsbuf, weight); + state->state = WAITOPERATOR; + return VAL; + } + else if (state->state == WAITFIRSTOPERAND) + return END; + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("no operand in tsearch query: \"%s\"", + state->buffer))); + } + break; + case WAITOPERATOR: + if (t_iseq(state->buf, '&') || t_iseq(state->buf, '|')) + { + state->state = WAITOPERAND; + *val = (int4) *(state->buf); + (state->buf)++; + return OPR; + } + else if (t_iseq(state->buf, ')')) + { + (state->buf)++; + state->count--; + return (state->count < 0) ? ERR : CLOSE; + } + else if (*(state->buf) == '\0') + return (state->count) ? ERR : END; + else if (!t_isspace(state->buf)) + return ERR; + break; + case WAITSINGLEOPERAND: + if (*(state->buf) == '\0') + return END; + *strval = state->buf; + *lenval = strlen(state->buf); + state->buf += strlen(state->buf); + state->count++; + return VAL; + default: + return ERR; + break; + } + state->buf += pg_mblen(state->buf); + } + return END; +} + +/* + * push new one in polish notation reverse view + */ +void +pushquery(TSQueryParserState * state, int4 type, int4 val, int4 distance, int4 lenval, int2 weight) +{ + ParseQueryNode *tmp = (ParseQueryNode *) palloc(sizeof(ParseQueryNode)); + + tmp->weight = weight; + tmp->type = type; + tmp->val = val; + if (distance >= MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("value is too big in tsearch query: \"%s\"", + state->buffer))); + if (lenval >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("operand is too long in tsearch query: \"%s\"", + state->buffer))); + tmp->distance = distance; + tmp->length = lenval; + tmp->next = state->str; + state->str = tmp; + state->num++; +} + +/* + * This function is used for tsquery parsing + */ +void +pushval_asis(TSQueryParserState * state, int type, char *strval, int lenval, int2 weight) +{ + pg_crc32 c; + + if (lenval >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("word is too long in tsearch query: \"%s\"", + state->buffer))); + + INIT_CRC32(c); + COMP_CRC32(c, strval, lenval); + FIN_CRC32(c); + pushquery(state, type, *(int4 *) &c, + state->curop - state->op, lenval, weight); + + while (state->curop - state->op + lenval + 1 >= state->lenop) + { + int4 tmp = state->curop - state->op; + + state->lenop *= 2; + state->op = (char *) repalloc((void *) state->op, state->lenop); + state->curop = state->op + tmp; + } + memcpy((void *) state->curop, (void *) strval, lenval); + state->curop += lenval; + *(state->curop) = '\0'; + state->curop++; + state->sumlen += lenval + 1 /* \0 */ ; + return; +} + +#define STACKDEPTH 32 +/* + * make polish notation of query + */ +static int4 +makepol(TSQueryParserState * state, void (*pushval) (TSQueryParserState *, int, char *, int, int2)) +{ + int4 val = 0, + type; + int4 lenval = 0; + char *strval = NULL; + int4 stack[STACKDEPTH]; + int4 lenstack = 0; + int2 weight = 0; + + while ((type = gettoken_query(state, &val, &lenval, &strval, &weight)) != END) + { + switch (type) + { + case VAL: + pushval(state, VAL, strval, lenval, weight); + while (lenstack && (stack[lenstack - 1] == (int4) '&' || + stack[lenstack - 1] == (int4) '!')) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + } + break; + case OPR: + if (lenstack && val == (int4) '|') + pushquery(state, OPR, val, 0, 0, 0); + else + { + if (lenstack == STACKDEPTH) /* internal error */ + elog(ERROR, "tsquery stack too small"); + stack[lenstack] = val; + lenstack++; + } + break; + case OPEN: + if (makepol(state, pushval) == ERR) + return ERR; + if (lenstack && (stack[lenstack - 1] == (int4) '&' || + stack[lenstack - 1] == (int4) '!')) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + } + break; + case CLOSE: + while (lenstack) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + }; + return END; + break; + case ERR: + default: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsearch query: \"%s\"", + state->buffer))); + return ERR; + + } + } + while (lenstack) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + }; + return END; +} + +static void +findoprnd(QueryItem * ptr, int4 *pos) +{ + if (ptr[*pos].type == VAL || ptr[*pos].type == VALSTOP) + { + ptr[*pos].left = 0; + (*pos)++; + } + else if (ptr[*pos].val == (int4) '!') + { + ptr[*pos].left = 1; + (*pos)++; + findoprnd(ptr, pos); + } + else + { + QueryItem *curitem = &ptr[*pos]; + int4 tmp = *pos; + + (*pos)++; + findoprnd(ptr, pos); + curitem->left = *pos - tmp; + findoprnd(ptr, pos); + } +} + + +/* + * input + */ +TSQuery +parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int, int2), Oid cfg_id, bool isplain) +{ + TSQueryParserState state; + int4 i; + TSQuery query; + int4 commonlen; + QueryItem *ptr; + ParseQueryNode *tmp; + int4 pos = 0; + + /* init state */ + state.buffer = buf; + state.buf = buf; + state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; + state.count = 0; + state.num = 0; + state.str = NULL; + state.cfg_id = cfg_id; + + /* init value parser's state */ + state.valstate.oprisdelim = true; + state.valstate.len = 32; + state.valstate.word = (char *) palloc(state.valstate.len); + + /* init list of operand */ + state.sumlen = 0; + state.lenop = 64; + state.curop = state.op = (char *) palloc(state.lenop); + *(state.curop) = '\0'; + + /* parse query & make polish notation (postfix, but in reverse order) */ + makepol(&state, pushval); + pfree(state.valstate.word); + if (!state.num) + { + ereport(NOTICE, + (errmsg("tsearch query doesn't contain lexeme(s): \"%s\"", + state.buffer))); + query = (TSQuery) palloc(HDRSIZETQ); + SET_VARSIZE(query, HDRSIZETQ); + query->size = 0; + return query; + } + + /* make finish struct */ + commonlen = COMPUTESIZE(state.num, state.sumlen); + query = (TSQuery) palloc(commonlen); + SET_VARSIZE(query, commonlen); + query->size = state.num; + ptr = GETQUERY(query); + + /* set item in polish notation */ + for (i = 0; i < state.num; i++) + { + ptr[i].weight = state.str->weight; + ptr[i].type = state.str->type; + ptr[i].val = state.str->val; + ptr[i].distance = state.str->distance; + ptr[i].length = state.str->length; + tmp = state.str->next; + pfree(state.str); + state.str = tmp; + } + + /* set user friendly-operand view */ + memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen); + pfree(state.op); + + /* set left operand's position for every operator */ + pos = 0; + findoprnd(ptr, &pos); + + return query; +} + +/* + * in without morphology + */ +Datum +tsqueryin(PG_FUNCTION_ARGS) +{ + char *in = PG_GETARG_CSTRING(0); + + pg_verifymbstr(in, strlen(in), false); + + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, InvalidOid, false)); +} + +/* + * out function + */ +typedef struct +{ + QueryItem *curpol; + char *buf; + char *cur; + char *op; + int4 buflen; +} INFIX; + +#define RESIZEBUF(inf,addsize) \ +while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ +{ \ + int4 len = (inf)->cur - (inf)->buf; \ + (inf)->buflen *= 2; \ + (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \ + (inf)->cur = (inf)->buf + len; \ +} + +/* + * recursive walk on tree and print it in + * infix (human-readable) view + */ +static void +infix(INFIX * in, bool first) +{ + if (in->curpol->type == VAL) + { + char *op = in->op + in->curpol->distance; + int clen; + + RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5); + *(in->cur) = '\''; + in->cur++; + while (*op) + { + if (t_iseq(op, '\'')) + { + *(in->cur) = '\''; + in->cur++; + } + COPYCHAR(in->cur, op); + + clen = pg_mblen(op); + op += clen; + in->cur += clen; + } + *(in->cur) = '\''; + in->cur++; + if (in->curpol->weight) + { + *(in->cur) = ':'; + in->cur++; + if (in->curpol->weight & (1 << 3)) + { + *(in->cur) = 'A'; + in->cur++; + } + if (in->curpol->weight & (1 << 2)) + { + *(in->cur) = 'B'; + in->cur++; + } + if (in->curpol->weight & (1 << 1)) + { + *(in->cur) = 'C'; + in->cur++; + } + if (in->curpol->weight & 1) + { + *(in->cur) = 'D'; + in->cur++; + } + } + *(in->cur) = '\0'; + in->curpol++; + } + else if (in->curpol->val == (int4) '!') + { + bool isopr = false; + + RESIZEBUF(in, 1); + *(in->cur) = '!'; + in->cur++; + *(in->cur) = '\0'; + in->curpol++; + if (in->curpol->type == OPR) + { + isopr = true; + RESIZEBUF(in, 2); + sprintf(in->cur, "( "); + in->cur = strchr(in->cur, '\0'); + } + infix(in, isopr); + if (isopr) + { + RESIZEBUF(in, 2); + sprintf(in->cur, " )"); + in->cur = strchr(in->cur, '\0'); + } + } + else + { + int4 op = in->curpol->val; + INFIX nrm; + + in->curpol++; + if (op == (int4) '|' && !first) + { + RESIZEBUF(in, 2); + sprintf(in->cur, "( "); + in->cur = strchr(in->cur, '\0'); + } + + nrm.curpol = in->curpol; + nrm.op = in->op; + nrm.buflen = 16; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + + /* get right operand */ + infix(&nrm, false); + + /* get & print left operand */ + in->curpol = nrm.curpol; + infix(in, false); + + /* print operator & right operand */ + RESIZEBUF(in, 3 + (nrm.cur - nrm.buf)); + sprintf(in->cur, " %c %s", op, nrm.buf); + in->cur = strchr(in->cur, '\0'); + pfree(nrm.buf); + + if (op == (int4) '|' && !first) + { + RESIZEBUF(in, 2); + sprintf(in->cur, " )"); + in->cur = strchr(in->cur, '\0'); + } + } +} + + +Datum +tsqueryout(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + INFIX nrm; + + if (query->size == 0) + { + char *b = palloc(1); + + *b = '\0'; + PG_RETURN_POINTER(b); + } + nrm.curpol = GETQUERY(query); + nrm.buflen = 32; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + *(nrm.cur) = '\0'; + nrm.op = GETOPERAND(query); + infix(&nrm, true); + + PG_FREE_IF_COPY(query, 0); + PG_RETURN_CSTRING(nrm.buf); +} + +Datum +tsquerysend(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + StringInfoData buf; + int i; + QueryItem *item = GETQUERY(query); + + pq_begintypsend(&buf); + + pq_sendint(&buf, query->size, sizeof(int32)); + for (i = 0; i < query->size; i++) + { + int tmp; + + pq_sendint(&buf, item->type, sizeof(item->type)); + pq_sendint(&buf, item->weight, sizeof(item->weight)); + pq_sendint(&buf, item->left, sizeof(item->left)); + pq_sendint(&buf, item->val, sizeof(item->val)); + + /* + * We are sure that sizeof(WordEntry) == sizeof(int32), and about + * layout of QueryItem + */ + tmp = *(int32 *) (((char *) item) + HDRSIZEQI); + pq_sendint(&buf, tmp, sizeof(tmp)); + + item++; + } + + item = GETQUERY(query); + for (i = 0; i < query->size; i++) + { + if (item->type == VAL) + pq_sendbytes(&buf, GETOPERAND(query) + item->distance, item->length); + item++; + } + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +Datum +tsqueryrecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSQuery query; + int i, + size, + tmp, + len = HDRSIZETQ; + QueryItem *item; + int datalen = 0; + char *ptr; + + size = pq_getmsgint(buf, sizeof(uint32)); + if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) + elog(ERROR, "invalid size of tsquery"); + len += sizeof(QueryItem) * size; + + query = (TSQuery) palloc(len); + query->size = size; + item = GETQUERY(query); + + for (i = 0; i < size; i++) + { + item->type = (int8) pq_getmsgint(buf, sizeof(int8)); + item->weight = (int8) pq_getmsgint(buf, sizeof(int8)); + item->left = (int16) pq_getmsgint(buf, sizeof(int16)); + item->val = (int32) pq_getmsgint(buf, sizeof(int32)); + tmp = pq_getmsgint(buf, sizeof(int32)); + memcpy((((char *) item) + HDRSIZEQI), &tmp, sizeof(int32)); + + /* + * Sanity checks + */ + if (item->type == VAL) + { + datalen += item->length + 1; /* \0 */ + } + else if (item->type == OPR) + { + if (item->val == '|' || item->val == '&') + { + if (item->left <= 0 || i + item->left >= size) + elog(ERROR, "invalid pointer to left operand"); + } + + if (i == size - 1) + elog(ERROR, "invalid pointer to right operand"); + } + else + elog(ERROR, "unknown tsquery node type"); + + item++; + } + + query = (TSQuery) repalloc(query, len + datalen); + + item = GETQUERY(query); + ptr = GETOPERAND(query); + for (i = 0; i < size; i++) + { + if (item->type == VAL) + { + item->distance = ptr - GETOPERAND(query); + memcpy(ptr, + pq_getmsgbytes(buf, item->length), + item->length); + ptr += item->length; + *ptr++ = '\0'; + } + item++; + } + + Assert(ptr - GETOPERAND(query) == datalen); + + SET_VARSIZE(query, len + datalen); + + PG_RETURN_TSVECTOR(query); +} + +/* + * debug function, used only for view query + * which will be executed in non-leaf pages in index + */ +Datum +tsquerytree(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + INFIX nrm; + text *res; + QueryItem *q; + int4 len; + + if (query->size == 0) + { + res = (text *) palloc(VARHDRSZ); + SET_VARSIZE(res, VARHDRSZ); + PG_RETURN_POINTER(res); + } + + q = clean_NOT(GETQUERY(query), &len); + + if (!q) + { + res = (text *) palloc(1 + VARHDRSZ); + SET_VARSIZE(res, 1 + VARHDRSZ); + *((char *) VARDATA(res)) = 'T'; + } + else + { + nrm.curpol = q; + nrm.buflen = 32; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + *(nrm.cur) = '\0'; + nrm.op = GETOPERAND(query); + infix(&nrm, true); + + res = (text *) palloc(nrm.cur - nrm.buf + VARHDRSZ); + SET_VARSIZE(res, nrm.cur - nrm.buf + VARHDRSZ); + strncpy(VARDATA(res), nrm.buf, nrm.cur - nrm.buf); + pfree(q); + } + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_POINTER(res); +} |