diff options
Diffstat (limited to 'src/backend/utils/adt/tsvector.c')
-rw-r--r-- | src/backend/utils/adt/tsvector.c | 683 |
1 files changed, 683 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c new file mode 100644 index 00000000000..04b6345e162 --- /dev/null +++ b/src/backend/utils/adt/tsvector.c @@ -0,0 +1,683 @@ +/*------------------------------------------------------------------------- + * + * tsvector.c + * I/O functions for tsvector + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/memutils.h" + + +static int +comparePos(const void *a, const void *b) +{ + if (WEP_GETPOS(*(WordEntryPos *) a) == WEP_GETPOS(*(WordEntryPos *) b)) + return 0; + return (WEP_GETPOS(*(WordEntryPos *) a) > WEP_GETPOS(*(WordEntryPos *) b)) ? 1 : -1; +} + +static int +uniquePos(WordEntryPos * a, int4 l) +{ + WordEntryPos *ptr, + *res; + + if (l == 1) + return l; + + res = a; + qsort((void *) a, l, sizeof(WordEntryPos), comparePos); + + ptr = a + 1; + while (ptr - a < l) + { + if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res)) + { + res++; + *res = *ptr; + if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1) + break; + } + else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) + WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr)); + ptr++; + } + + return res + 1 - a; +} + +static int +compareentry(const void *a, const void *b, void *arg) +{ + char *BufferStr = (char *) arg; + + if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len) + { + return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos], + &BufferStr[((WordEntryIN *) b)->entry.pos], + ((WordEntryIN *) a)->entry.len); + } + + return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1; +} + +static int +uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen) +{ + WordEntryIN *ptr, + *res; + + res = a; + if (l == 1) + { + if (a->entry.haspos) + { + *(uint16 *) (a->pos) = uniquePos(&(a->pos[1]), *(uint16 *) (a->pos)); + *outbuflen = SHORTALIGN(res->entry.len) + (*(uint16 *) (a->pos) + 1) * sizeof(WordEntryPos); + } + return l; + } + + ptr = a + 1; + qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf); + + while (ptr - a < l) + { + if (!(ptr->entry.len == res->entry.len && + strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0)) + { + if (res->entry.haspos) + { + *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); + *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + } + *outbuflen += SHORTALIGN(res->entry.len); + res++; + memcpy(res, ptr, sizeof(WordEntryIN)); + } + else if (ptr->entry.haspos) + { + if (res->entry.haspos) + { + int4 len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos); + + res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos)); + memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]), + &(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos)); + *(uint16 *) (res->pos) += *(uint16 *) (ptr->pos); + pfree(ptr->pos); + } + else + { + res->entry.haspos = 1; + res->pos = ptr->pos; + } + } + ptr++; + } + if (res->entry.haspos) + { + *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); + *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + } + *outbuflen += SHORTALIGN(res->entry.len); + + return res + 1 - a; +} + +static int +WordEntryCMP(WordEntry * a, WordEntry * b, char *buf) +{ + return compareentry(a, b, buf); +} + +#define WAITWORD 1 +#define WAITENDWORD 2 +#define WAITNEXTCHAR 3 +#define WAITENDCMPLX 4 +#define WAITPOSINFO 5 +#define INPOSINFO 6 +#define WAITPOSDELIM 7 +#define WAITCHARCMPLX 8 + +#define RESIZEPRSBUF \ +do { \ + if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \ + { \ + int4 clen = state->curpos - state->word; \ + state->len *= 2; \ + state->word = (char*)repalloc( (void*)state->word, state->len ); \ + state->curpos = state->word + clen; \ + } \ +} while (0) + +bool +gettoken_tsvector(TSVectorParseState *state) +{ + int4 oldstate = 0; + + state->curpos = state->word; + state->state = WAITWORD; + state->alen = 0; + + while (1) + { + if (state->state == WAITWORD) + { + if (*(state->prsbuf) == '\0') + return false; + else if (t_iseq(state->prsbuf, '\'')) + state->state = WAITENDCMPLX; + else if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else if (!t_isspace(state->prsbuf)) + { + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = WAITENDWORD; + } + } + else if (state->state == WAITNEXTCHAR) + { + if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("there is no escaped character"))); + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = oldstate; + } + } + else if (state->state == WAITENDWORD) + { + if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + (state->oprisdelim && ISOPERATOR(state->prsbuf))) + { + RESIZEPRSBUF; + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(state->curpos) = '\0'; + return true; + } + else if (t_iseq(state->prsbuf, ':')) + { + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(state->curpos) = '\0'; + if (state->oprisdelim) + return true; + else + state->state = INPOSINFO; + } + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + } + } + else if (state->state == WAITENDCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + state->state = WAITCHARCMPLX; + } + else if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDCMPLX; + } + else if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + } + } + else if (state->state == WAITCHARCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = WAITENDCMPLX; + } + else + { + RESIZEPRSBUF; + *(state->curpos) = '\0'; + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + if (state->oprisdelim) + { + /* state->prsbuf+=pg_mblen(state->prsbuf); */ + return true; + } + else + state->state = WAITPOSINFO; + continue; /* recheck current character */ + } + } + else if (state->state == WAITPOSINFO) + { + if (t_iseq(state->prsbuf, ':')) + state->state = INPOSINFO; + else + return true; + } + else if (state->state == INPOSINFO) + { + if (t_isdigit(state->prsbuf)) + { + if (state->alen == 0) + { + state->alen = 4; + state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen); + *(uint16 *) (state->pos) = 0; + } + else if (*(uint16 *) (state->pos) + 1 >= state->alen) + { + state->alen *= 2; + state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen); + } + (*(uint16 *) (state->pos))++; + WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf))); + if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("wrong position info in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); + state->state = WAITPOSDELIM; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else if (state->state == WAITPOSDELIM) + { + if (t_iseq(state->prsbuf, ',')) + state->state = INPOSINFO; + else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3); + } + else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2); + } + else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1); + } + else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); + } + else if (t_isspace(state->prsbuf) || + *(state->prsbuf) == '\0') + return true; + else if (!t_isdigit(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else /* internal error */ + elog(ERROR, "internal error in gettoken_tsvector"); + + /* get next char */ + state->prsbuf += pg_mblen(state->prsbuf); + } + + return false; +} + +Datum +tsvectorin(PG_FUNCTION_ARGS) +{ + char *buf = PG_GETARG_CSTRING(0); + TSVectorParseState state; + WordEntryIN *arr; + WordEntry *inarr; + int4 len = 0, + totallen = 64; + TSVector in; + char *tmpbuf, + *cur; + int4 i, + buflen = 256; + + pg_verifymbstr(buf, strlen(buf), false); + state.prsbuf = buf; + state.len = 32; + state.word = (char *) palloc(state.len); + state.oprisdelim = false; + + arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen); + cur = tmpbuf = (char *) palloc(buflen); + + while (gettoken_tsvector(&state)) + { + /* + * Realloc buffers if it's needed + */ + if (len >= totallen) + { + totallen *= 2; + arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen); + } + + while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen) + { + int4 dist = cur - tmpbuf; + + buflen *= 2; + tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); + cur = tmpbuf + dist; + } + + if (state.curpos - state.word >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("word is too long (%d bytes, max %d bytes)", + state.curpos - state.word, MAXSTRLEN))); + + arr[len].entry.len = state.curpos - state.word; + if (cur - tmpbuf > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("position value too large"))); + arr[len].entry.pos = cur - tmpbuf; + memcpy((void *) cur, (void *) state.word, arr[len].entry.len); + cur += arr[len].entry.len; + + if (state.alen) + { + arr[len].entry.haspos = 1; + arr[len].pos = state.pos; + } + else + arr[len].entry.haspos = 0; + len++; + } + pfree(state.word); + + if (len > 0) + len = uniqueentry(arr, len, tmpbuf, &buflen); + else + buflen = 0; + totallen = CALCDATASIZE(len, buflen); + in = (TSVector) palloc0(totallen); + + SET_VARSIZE(in, totallen); + in->size = len; + cur = STRPTR(in); + inarr = ARRPTR(in); + for (i = 0; i < len; i++) + { + memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len); + arr[i].entry.pos = cur - STRPTR(in); + cur += SHORTALIGN(arr[i].entry.len); + if (arr[i].entry.haspos) + { + memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos)); + cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos); + pfree(arr[i].pos); + } + inarr[i] = arr[i].entry; + } + + PG_RETURN_TSVECTOR(in); +} + +Datum +tsvectorout(PG_FUNCTION_ARGS) +{ + TSVector out = PG_GETARG_TSVECTOR(0); + char *outbuf; + int4 i, + lenbuf = 0, + pp; + WordEntry *ptr = ARRPTR(out); + char *curbegin, + *curin, + *curout; + + lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; + for (i = 0; i < out->size; i++) + { + lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; + if (ptr[i].haspos) + lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); + } + + curout = outbuf = (char *) palloc(lenbuf); + for (i = 0; i < out->size; i++) + { + curbegin = curin = STRPTR(out) + ptr->pos; + if (i != 0) + *curout++ = ' '; + *curout++ = '\''; + while (curin - curbegin < ptr->len) + { + int len = pg_mblen(curin); + + if (t_iseq(curin, '\'')) + *curout++ = '\''; + + while (len--) + *curout++ = *curin++; + } + + *curout++ = '\''; + if ((pp = POSDATALEN(out, ptr)) != 0) + { + WordEntryPos *wptr; + + *curout++ = ':'; + wptr = POSDATAPTR(out, ptr); + while (pp) + { + curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); + switch (WEP_GETWEIGHT(*wptr)) + { + case 3: + *curout++ = 'A'; + break; + case 2: + *curout++ = 'B'; + break; + case 1: + *curout++ = 'C'; + break; + case 0: + default: + break; + } + + if (pp > 1) + *curout++ = ','; + pp--; + wptr++; + } + } + ptr++; + } + + *curout = '\0'; + PG_FREE_IF_COPY(out, 0); + PG_RETURN_CSTRING(outbuf); +} + +Datum +tsvectorsend(PG_FUNCTION_ARGS) +{ + TSVector vec = PG_GETARG_TSVECTOR(0); + StringInfoData buf; + int i, + j; + WordEntry *weptr = ARRPTR(vec); + + pq_begintypsend(&buf); + + pq_sendint(&buf, vec->size, sizeof(int32)); + for (i = 0; i < vec->size; i++) + { + /* + * We are sure that sizeof(WordEntry) == sizeof(int32) + */ + pq_sendint(&buf, *(int32 *) weptr, sizeof(int32)); + + pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len); + if (weptr->haspos) + { + WordEntryPos *wepptr = POSDATAPTR(vec, weptr); + + pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos)); + for (j = 0; j < POSDATALEN(vec, weptr); j++) + pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); + } + weptr++; + } + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +Datum +tsvectorrecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSVector vec; + int i, + size, + len = DATAHDRSIZE; + WordEntry *weptr; + int datalen = 0; + + size = pq_getmsgint(buf, sizeof(uint32)); + if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry))) + elog(ERROR, "invalid size of tsvector"); + + len += sizeof(WordEntry) * size; + + len *= 2; + vec = (TSVector) palloc0(len); + vec->size = size; + + weptr = ARRPTR(vec); + for (i = 0; i < size; i++) + { + int tmp; + + weptr = ARRPTR(vec) + i; + + /* + * We are sure that sizeof(WordEntry) == sizeof(int32) + */ + tmp = pq_getmsgint(buf, sizeof(int32)); + *weptr = *(WordEntry *) & tmp; + + while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len) + { + len *= 2; + vec = (TSVector) repalloc(vec, len); + weptr = ARRPTR(vec) + i; + } + + memcpy(STRPTR(vec) + weptr->pos, + pq_getmsgbytes(buf, weptr->len), + weptr->len); + datalen += SHORTALIGN(weptr->len); + + if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0) + elog(ERROR, "lexemes are unordered"); + + if (weptr->haspos) + { + uint16 j, + npos; + WordEntryPos *wepptr; + + npos = (uint16) pq_getmsgint(buf, sizeof(int16)); + if (npos > MAXNUMPOS) + elog(ERROR, "unexpected number of positions"); + + while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len) + { + len *= 2; + vec = (TSVector) repalloc(vec, len); + weptr = ARRPTR(vec) + i; + } + + memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16)); + wepptr = POSDATAPTR(vec, weptr); + for (j = 0; j < npos; j++) + { + wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16)); + if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) + elog(ERROR, "position information is unordered"); + } + + datalen += (npos + 1) * sizeof(WordEntry); + } + } + + SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen)); + + PG_RETURN_TSVECTOR(vec); +} |