about | summary | refs | log | tree | commit | diff
path: root/src/backend/utils/adt/tsvector.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/tsvector.c')
-rw-r--r-- src/backend/utils/adt/tsvector.c 683
1 files changed, 683 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
new file mode 100644
index 00000000000..04b6345e162
--- /dev/null
+++ b/src/backend/utils/adt/tsvector.c
@@ -0,0 +1,683 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsvector.c
+ * I/O functions for tsvector
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.1 2007/08/21 01:11:19 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "libpq/pqformat.h"
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/memutils.h"
+
+
+/*
+ * qsort() comparator for WordEntryPos items.
+ *
+ * Only the position part (WEP_GETPOS) takes part in the ordering, so two
+ * items that differ in weight alone compare as equal.
+ */
+static int
+comparePos(const void *a, const void *b)
+{
+	int			lpos = WEP_GETPOS(*(WordEntryPos *) a);
+	int			rpos = WEP_GETPOS(*(WordEntryPos *) b);
+
+	if (lpos < rpos)
+		return -1;
+	if (lpos > rpos)
+		return 1;
+	return 0;
+}
+
+/*
+ * Sort the l positions in a[] and squeeze out duplicates in place.
+ *
+ * When the same position occurs more than once, the surviving item keeps
+ * the largest weight seen.  Compaction stops early once MAXNUMPOS items
+ * have been kept or an item with position MAXENTRYPOS - 1 has been kept.
+ *
+ * Returns the number of items left at the front of a[].
+ */
+static int
+uniquePos(WordEntryPos * a, int4 l)
+{
+	int4		src;
+	int4		last = 0;		/* index of the last item kept so far */
+
+	if (l == 1)
+		return l;
+
+	qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
+
+	for (src = 1; src < l; src++)
+	{
+		if (WEP_GETPOS(a[src]) == WEP_GETPOS(a[last]))
+		{
+			/* same position again: only a heavier weight is of interest */
+			if (WEP_GETWEIGHT(a[src]) > WEP_GETWEIGHT(a[last]))
+				WEP_SETWEIGHT(a[last], WEP_GETWEIGHT(a[src]));
+			continue;
+		}
+
+		last++;
+		a[last] = a[src];
+		if (last >= MAXNUMPOS - 1 || WEP_GETPOS(a[last]) == MAXENTRYPOS - 1)
+			break;
+	}
+
+	return last + 1;
+}
+
+/*
+ * qsort_arg() comparator for WordEntryIN items.
+ *
+ * arg is the character buffer that the items' entry.pos offsets point into.
+ * Shorter lexemes sort before longer ones; lexemes of equal length are
+ * ordered by strncmp() over their bytes.
+ */
+static int
+compareentry(const void *a, const void *b, void *arg)
+{
+	const WordEntryIN *lhs = (const WordEntryIN *) a;
+	const WordEntryIN *rhs = (const WordEntryIN *) b;
+	const char *lexemes = (const char *) arg;
+
+	if (lhs->entry.len != rhs->entry.len)
+		return (lhs->entry.len > rhs->entry.len) ? 1 : -1;
+
+	return strncmp(&lexemes[lhs->entry.pos],
+				   &lexemes[rhs->entry.pos],
+				   lhs->entry.len);
+}
+
+/*
+ * Finish one entry: sort and de-duplicate its position list (if it has one)
+ * and return the number of bytes the entry needs in the packed data area,
+ * i.e. the SHORTALIGN'ed lexeme plus, for an entry with positions, the
+ * uint16 counter and the position array.
+ */
+static int4
+finishentry(WordEntryIN * e)
+{
+	int4		size = SHORTALIGN(e->entry.len);
+
+	if (e->entry.haspos)
+	{
+		/* pos[0] is used as the counter, the positions start at pos[1] */
+		*(uint16 *) (e->pos) = uniquePos(&(e->pos[1]), *(uint16 *) (e->pos));
+		size += (*(uint16 *) (e->pos) + 1) * sizeof(WordEntryPos);
+	}
+
+	return size;
+}
+
+/*
+ * Sort the parsed entries, merge entries that carry the same lexeme
+ * (concatenating their position lists), and compute the data space needed.
+ *
+ *	a			array of l entries, l >= 1; sorted and compacted in place
+ *	l			number of entries in a
+ *	buf			character buffer the entries' entry.pos offsets refer to
+ *	outbuflen	output only: set to the exact number of bytes needed to store
+ *				every remaining lexeme (SHORTALIGN'ed) and every position
+ *				list including its counter
+ *
+ * Returns the number of distinct entries left at the front of a.
+ *
+ * The size is accumulated from zero in a local variable and includes the
+ * counter in front of each position list.  Adding onto the caller's incoming
+ * value and leaving out the counters, as was done before, does not describe
+ * what tsvectorin() actually copies ((count + 1) WordEntryPos items per
+ * entry with positions).
+ */
+static int
+uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
+{
+	WordEntryIN *ptr,
+			   *res;
+	int4		needed = 0;
+
+	if (l > 1)
+		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);
+
+	res = a;
+	ptr = a + 1;
+	while (ptr - a < l)
+	{
+		if (!(ptr->entry.len == res->entry.len &&
+			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0))
+		{
+			/* a new lexeme starts: res is complete, account for it */
+			needed += finishentry(res);
+			res++;
+			/* nothing to move while no duplicate has been dropped yet */
+			if (res != ptr)
+				memcpy(res, ptr, sizeof(WordEntryIN));
+		}
+		else if (ptr->entry.haspos)
+		{
+			/* duplicate lexeme: fold its positions into res */
+			if (res->entry.haspos)
+			{
+				int4		len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos);
+
+				res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos));
+				memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]),
+					   &(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos));
+				*(uint16 *) (res->pos) += *(uint16 *) (ptr->pos);
+				pfree(ptr->pos);
+			}
+			else
+			{
+				/* res had no positions: simply take over the duplicate's list */
+				res->entry.haspos = 1;
+				res->pos = ptr->pos;
+			}
+		}
+		ptr++;
+	}
+
+	/* the last entry is still open */
+	needed += finishentry(res);
+
+	*outbuflen = needed;
+	return res + 1 - a;
+}
+
+/*
+ * Compare two WordEntry items whose lexemes live in buf.
+ *
+ * Thin wrapper around compareentry(), which reads its arguments as
+ * WordEntryIN and only touches their "entry" member.
+ * NOTE(review): this relies on "entry" being the first member of
+ * WordEntryIN -- confirm against the struct declaration.
+ */
+static int
+WordEntryCMP(WordEntry * a, WordEntry * b, char *buf)
+{
+	const void *lhs = a;
+	const void *rhs = b;
+
+	return compareentry(lhs, rhs, (void *) buf);
+}
+
+/*
+ * States of the gettoken_tsvector() state machine
+ */
+#define WAITWORD		1		/* skipping whitespace before a lexeme */
+#define WAITENDWORD		2		/* inside an unquoted lexeme */
+#define WAITNEXTCHAR	3		/* after a backslash: take next char as is */
+#define WAITENDCMPLX	4		/* inside a quoted lexeme */
+#define WAITPOSINFO		5		/* after a quoted lexeme: ':' or end of token */
+#define INPOSINFO		6		/* a position number has to start here */
+#define WAITPOSDELIM	7		/* after a position: digits, weight, ',' or end */
+#define WAITCHARCMPLX	8		/* saw a quote inside a quoted lexeme */
+
+/*
+ * Double the word buffer of "state" when it could not take one more
+ * character of maximum encoded length; curpos is re-pointed into the
+ * (possibly moved) buffer at the same offset.
+ */
+#define RESIZEPRSBUF \
+do { \
+	int4		used = state->curpos - state->word; \
+	if ( used + pg_database_encoding_max_length() >= state->len ) \
+	{ \
+		state->len *= 2; \
+		state->word = (char*)repalloc( (void*)state->word, state->len ); \
+		state->curpos = state->word + used; \
+	} \
+} while (0)
+
+/*
+ * Read the next lexeme, and its position list if any, from state->prsbuf.
+ *
+ * Returns true when a lexeme was collected: it is stored NUL-terminated in
+ * state->word and ends at state->curpos.  If positions were given,
+ * state->alen is nonzero and state->pos holds them, with the counter kept
+ * in state->pos[0].  Returns false when the input ends before another
+ * lexeme starts.  Malformed input is reported with ereport(ERROR).
+ *
+ * With state->oprisdelim set, characters accepted by ISOPERATOR() end an
+ * unquoted lexeme, and a lexeme is returned without parsing position
+ * information.
+ */
+bool
+gettoken_tsvector(TSVectorParseState *state)
+{
+	int4		resume = 0;		/* state to go back to after an escaped char */
+
+	state->curpos = state->word;
+	state->state = WAITWORD;
+	state->alen = 0;
+
+	for (;;)
+	{
+		switch (state->state)
+		{
+			case WAITWORD:
+				if (*(state->prsbuf) == '\0')
+					return false;
+				else if (t_iseq(state->prsbuf, '\''))
+					state->state = WAITENDCMPLX;
+				else if (t_iseq(state->prsbuf, '\\'))
+				{
+					state->state = WAITNEXTCHAR;
+					resume = WAITENDWORD;
+				}
+				else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				else if (!t_isspace(state->prsbuf))
+				{
+					/* first character of an unquoted lexeme */
+					COPYCHAR(state->curpos, state->prsbuf);
+					state->curpos += pg_mblen(state->prsbuf);
+					state->state = WAITENDWORD;
+				}
+				break;
+
+			case WAITNEXTCHAR:
+				if (*(state->prsbuf) == '\0')
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("there is no escaped character")));
+				else
+				{
+					RESIZEPRSBUF;
+					COPYCHAR(state->curpos, state->prsbuf);
+					state->curpos += pg_mblen(state->prsbuf);
+					state->state = resume;
+				}
+				break;
+
+			case WAITENDWORD:
+				if (t_iseq(state->prsbuf, '\\'))
+				{
+					state->state = WAITNEXTCHAR;
+					resume = WAITENDWORD;
+				}
+				else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
+						 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
+				{
+					/* the lexeme ends here and has no position info */
+					RESIZEPRSBUF;
+					if (state->curpos == state->word)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("syntax error in tsvector")));
+					*(state->curpos) = '\0';
+					return true;
+				}
+				else if (t_iseq(state->prsbuf, ':'))
+				{
+					if (state->curpos == state->word)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("syntax error in tsvector")));
+					*(state->curpos) = '\0';
+					if (state->oprisdelim)
+						return true;
+					else
+						state->state = INPOSINFO;
+				}
+				else
+				{
+					RESIZEPRSBUF;
+					COPYCHAR(state->curpos, state->prsbuf);
+					state->curpos += pg_mblen(state->prsbuf);
+				}
+				break;
+
+			case WAITENDCMPLX:
+				if (t_iseq(state->prsbuf, '\''))
+					state->state = WAITCHARCMPLX;
+				else if (t_iseq(state->prsbuf, '\\'))
+				{
+					state->state = WAITNEXTCHAR;
+					resume = WAITENDCMPLX;
+				}
+				else if (*(state->prsbuf) == '\0')
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				else
+				{
+					RESIZEPRSBUF;
+					COPYCHAR(state->curpos, state->prsbuf);
+					state->curpos += pg_mblen(state->prsbuf);
+				}
+				break;
+
+			case WAITCHARCMPLX:
+				if (t_iseq(state->prsbuf, '\''))
+				{
+					/* doubled quote: one literal quote, still inside */
+					RESIZEPRSBUF;
+					COPYCHAR(state->curpos, state->prsbuf);
+					state->curpos += pg_mblen(state->prsbuf);
+					state->state = WAITENDCMPLX;
+				}
+				else
+				{
+					/* the previous quote closed the lexeme */
+					RESIZEPRSBUF;
+					*(state->curpos) = '\0';
+					if (state->curpos == state->word)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("syntax error in tsvector")));
+					if (state->oprisdelim)
+						return true;
+					else
+						state->state = WAITPOSINFO;
+					continue;	/* look at the current character again */
+				}
+				break;
+
+			case WAITPOSINFO:
+				if (t_iseq(state->prsbuf, ':'))
+					state->state = INPOSINFO;
+				else
+					return true;
+				break;
+
+			case INPOSINFO:
+				if (!t_isdigit(state->prsbuf))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				else
+				{
+					WordEntryPos *slot;
+
+					/* make room for one more position */
+					if (state->alen == 0)
+					{
+						state->alen = 4;
+						state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen);
+						*(uint16 *) (state->pos) = 0;
+					}
+					else if (*(uint16 *) (state->pos) + 1 >= state->alen)
+					{
+						state->alen *= 2;
+						state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen);
+					}
+
+					(*(uint16 *) (state->pos))++;
+					slot = &state->pos[*(uint16 *) (state->pos)];
+
+					WEP_SETPOS(*slot, LIMITPOS(atoi(state->prsbuf)));
+					if (WEP_GETPOS(*slot) == 0)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("wrong position info in tsvector")));
+					WEP_SETWEIGHT(*slot, 0);
+					state->state = WAITPOSDELIM;
+				}
+				break;
+
+			case WAITPOSDELIM:
+				if (t_iseq(state->prsbuf, ','))
+					state->state = INPOSINFO;
+				else
+				{
+					int			weight = -1;	/* -1: not a weight character */
+
+					if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') ||
+						t_iseq(state->prsbuf, '*'))
+						weight = 3;
+					else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
+						weight = 2;
+					else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
+						weight = 1;
+					else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
+						weight = 0;
+
+					if (weight >= 0)
+					{
+						WordEntryPos *slot = &state->pos[*(uint16 *) (state->pos)];
+
+						/* a nonzero weight was already given for this position */
+						if (WEP_GETWEIGHT(*slot))
+							ereport(ERROR,
+									(errcode(ERRCODE_SYNTAX_ERROR),
+									 errmsg("syntax error in tsvector")));
+						WEP_SETWEIGHT(*slot, weight);
+					}
+					else if (t_isspace(state->prsbuf) ||
+							 *(state->prsbuf) == '\0')
+						return true;
+					else if (!t_isdigit(state->prsbuf))
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("syntax error in tsvector")));
+				}
+				break;
+
+			default:			/* internal error */
+				elog(ERROR, "internal error in gettoken_tsvector");
+				break;
+		}
+
+		/* get next char */
+		state->prsbuf += pg_mblen(state->prsbuf);
+	}
+
+	return false;
+}
+
+/*
+ * tsvectorin - convert the text representation into a tsvector.
+ *
+ * The input is split into lexemes with gettoken_tsvector().  Each lexeme is
+ * appended to a scratch buffer and described by a WordEntryIN; afterwards
+ * uniqueentry() sorts the entries and merges duplicates, and the result is
+ * packed into a freshly allocated, zero-filled TSVector.
+ *
+ * Raises ERROR if a lexeme is MAXSTRLEN bytes or longer, or if the scratch
+ * offset of a lexeme exceeds MAXSTRPOS; syntax errors are raised by
+ * gettoken_tsvector().
+ */
+Datum
+tsvectorin(PG_FUNCTION_ARGS)
+{
+	char	   *buf = PG_GETARG_CSTRING(0);
+	TSVectorParseState state;
+	WordEntryIN *arr;
+	WordEntry  *inarr;
+	int4		len = 0,
+				totallen = 64;
+	TSVector	in;
+	char	   *tmpbuf,
+			   *cur;
+	int4		i,
+				buflen = 256;
+
+	pg_verifymbstr(buf, strlen(buf), false);
+	state.prsbuf = buf;
+	state.len = 32;
+	state.word = (char *) palloc(state.len);
+	state.oprisdelim = false;
+
+	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen);
+	cur = tmpbuf = (char *) palloc(buflen);
+
+	while (gettoken_tsvector(&state))
+	{
+		/*
+		 * Enlarge the entry array and the lexeme scratch buffer if needed
+		 */
+		if (len >= totallen)
+		{
+			totallen *= 2;
+			arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen);
+		}
+
+		while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen)
+		{
+			int4		dist = cur - tmpbuf;
+
+			buflen *= 2;
+			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
+			cur = tmpbuf + dist;
+		}
+
+		/*
+		 * The pointer difference is not an int on every platform, so it is
+		 * cast to long and printed with %ld.
+		 */
+		if (state.curpos - state.word >= MAXSTRLEN)
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("word is too long (%ld bytes, max %ld bytes)",
+							(long) (state.curpos - state.word),
+							(long) MAXSTRLEN)));
+
+		arr[len].entry.len = state.curpos - state.word;
+		if (cur - tmpbuf > MAXSTRPOS)
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("position value too large")));
+		arr[len].entry.pos = cur - tmpbuf;
+		memcpy((void *) cur, (void *) state.word, arr[len].entry.len);
+		cur += arr[len].entry.len;
+
+		/* the position array, if any, now belongs to the entry */
+		if (state.alen)
+		{
+			arr[len].entry.haspos = 1;
+			arr[len].pos = state.pos;
+		}
+		else
+			arr[len].entry.haspos = 0;
+		len++;
+	}
+	pfree(state.word);
+
+	/* buflen is reused from here on for the size of the packed data area */
+	if (len > 0)
+		len = uniqueentry(arr, len, tmpbuf, &buflen);
+	else
+		buflen = 0;
+	totallen = CALCDATASIZE(len, buflen);
+	in = (TSVector) palloc0(totallen);
+
+	SET_VARSIZE(in, totallen);
+	in->size = len;
+	cur = STRPTR(in);
+	inarr = ARRPTR(in);
+	for (i = 0; i < len; i++)
+	{
+		/* lexeme first, then (count + positions) right behind it */
+		memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
+		arr[i].entry.pos = cur - STRPTR(in);
+		cur += SHORTALIGN(arr[i].entry.len);
+		if (arr[i].entry.haspos)
+		{
+			memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos));
+			cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos);
+			pfree(arr[i].pos);
+		}
+		inarr[i] = arr[i].entry;
+	}
+
+	PG_RETURN_TSVECTOR(in);
+}
+
+/*
+ * tsvectorout - convert a tsvector into its text representation.
+ *
+ * Every lexeme is written in single quotes and lexemes are separated by one
+ * space.  Quotes and backslashes inside a lexeme are doubled, because
+ * gettoken_tsvector() treats both as special inside a quoted lexeme; without
+ * doubling the backslash the output would not read back as the same value.
+ * A lexeme with positions is followed by ':' and a comma-separated list of
+ * positions, each with an optional weight letter A, B or C (weight 0 is not
+ * printed).
+ */
+Datum
+tsvectorout(PG_FUNCTION_ARGS)
+{
+	TSVector	out = PG_GETARG_TSVECTOR(0);
+	char	   *outbuf;
+	int4		i,
+				lenbuf = 0,
+				pp;
+	WordEntry  *ptr = ARRPTR(out);
+	char	   *curbegin,
+			   *curin,
+			   *curout;
+
+	/*
+	 * Upper bound for the output size.  Twice the lexeme length (times the
+	 * maximum character length) leaves room for doubling every character.
+	 */
+	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
+	for (i = 0; i < out->size; i++)
+	{
+		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
+		if (ptr[i].haspos)
+			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
+	}
+
+	curout = outbuf = (char *) palloc(lenbuf);
+	for (i = 0; i < out->size; i++)
+	{
+		curbegin = curin = STRPTR(out) + ptr->pos;
+		if (i != 0)
+			*curout++ = ' ';
+		*curout++ = '\'';
+		while (curin - curbegin < ptr->len)
+		{
+			int			len = pg_mblen(curin);
+
+			/* double the characters the input parser treats specially */
+			if (t_iseq(curin, '\''))
+				*curout++ = '\'';
+			else if (t_iseq(curin, '\\'))
+				*curout++ = '\\';
+
+			while (len--)
+				*curout++ = *curin++;
+		}
+
+		*curout++ = '\'';
+		if ((pp = POSDATALEN(out, ptr)) != 0)
+		{
+			WordEntryPos *wptr;
+
+			*curout++ = ':';
+			wptr = POSDATAPTR(out, ptr);
+			while (pp)
+			{
+				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
+				switch (WEP_GETWEIGHT(*wptr))
+				{
+					case 3:
+						*curout++ = 'A';
+						break;
+					case 2:
+						*curout++ = 'B';
+						break;
+					case 1:
+						*curout++ = 'C';
+						break;
+					case 0:
+					default:
+						break;
+				}
+
+				if (pp > 1)
+					*curout++ = ',';
+				pp--;
+				wptr++;
+			}
+		}
+		ptr++;
+	}
+
+	*curout = '\0';
+	PG_FREE_IF_COPY(out, 0);
+	PG_RETURN_CSTRING(outbuf);
+}
+
+/*
+ * tsvectorsend - convert a tsvector to its binary send format.
+ *
+ * The message holds the number of entries, then for each entry the WordEntry
+ * itself as one int32, the lexeme bytes and, if the entry has positions,
+ * the number of positions followed by the positions themselves.
+ */
+Datum
+tsvectorsend(PG_FUNCTION_ARGS)
+{
+	TSVector	vec = PG_GETARG_TSVECTOR(0);
+	WordEntry  *entries = ARRPTR(vec);
+	StringInfoData buf;
+	int			i;
+
+	pq_begintypsend(&buf);
+	pq_sendint(&buf, vec->size, sizeof(int32));
+
+	for (i = 0; i < vec->size; i++)
+	{
+		WordEntry  *we = &entries[i];
+
+		/*
+		 * We are sure that sizeof(WordEntry) == sizeof(int32)
+		 */
+		pq_sendint(&buf, *(int32 *) we, sizeof(int32));
+		pq_sendbytes(&buf, STRPTR(vec) + we->pos, we->len);
+
+		if (we->haspos)
+		{
+			WordEntryPos *positions = POSDATAPTR(vec, we);
+			int			npos = POSDATALEN(vec, we);
+			int			j;
+
+			pq_sendint(&buf, npos, sizeof(WordEntryPos));
+			for (j = 0; j < npos; j++)
+				pq_sendint(&buf, positions[j], sizeof(WordEntryPos));
+		}
+	}
+
+	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
+}
+
+/*
+ * tsvectorrecv - build a tsvector from its binary send format.
+ *
+ * Reads the entry count, then for each entry the WordEntry (as one int32),
+ * the lexeme bytes and, if the entry is flagged as having positions, the
+ * position count and the positions.  The result buffer is doubled whenever
+ * the next item would not fit.
+ *
+ * The storage offset of each lexeme is computed here from the amount of data
+ * already stored; the offset found in the message is not used as a write
+ * address, since it comes from outside and is not otherwise checked.
+ *
+ * Raises ERROR for an invalid entry count, for lexemes that are not in
+ * strictly ascending order, for more than MAXNUMPOS positions, for positions
+ * that are not strictly ascending, and when the data area grows too large.
+ */
+Datum
+tsvectorrecv(PG_FUNCTION_ARGS)
+{
+	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
+	TSVector	vec;
+	int			i,
+				size,
+				len = DATAHDRSIZE;
+	WordEntry  *weptr;
+	int			datalen = 0;	/* bytes used in the data area so far */
+
+	size = pq_getmsgint(buf, sizeof(uint32));
+	if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
+		elog(ERROR, "invalid size of tsvector");
+
+	len += sizeof(WordEntry) * size;
+
+	len *= 2;
+	vec = (TSVector) palloc0(len);
+	vec->size = size;
+
+	for (i = 0; i < size; i++)
+	{
+		int			tmp;
+
+		weptr = ARRPTR(vec) + i;
+
+		/*
+		 * We are sure that sizeof(WordEntry) == sizeof(int32)
+		 */
+		tmp = pq_getmsgint(buf, sizeof(int32));
+		*weptr = *(WordEntry *) & tmp;
+
+		/*
+		 * Place the lexeme right behind the data stored so far.
+		 * NOTE(review): assumes MAXSTRPOS bounds what WordEntry.pos can
+		 * hold -- confirm against ts_type.h.
+		 */
+		if (datalen >= MAXSTRPOS)
+			elog(ERROR, "tsvector data is too long");
+		weptr->pos = datalen;
+
+		while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len)
+		{
+			len *= 2;
+			vec = (TSVector) repalloc(vec, len);
+			weptr = ARRPTR(vec) + i;
+		}
+
+		memcpy(STRPTR(vec) + weptr->pos,
+			   pq_getmsgbytes(buf, weptr->len),
+			   weptr->len);
+		datalen += SHORTALIGN(weptr->len);
+
+		if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0)
+			elog(ERROR, "lexemes are unordered");
+
+		if (weptr->haspos)
+		{
+			uint16		j,
+						npos;
+			WordEntryPos *wepptr;
+
+			npos = (uint16) pq_getmsgint(buf, sizeof(int16));
+			if (npos > MAXNUMPOS)
+				elog(ERROR, "unexpected number of positions");
+
+			while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len)
+			{
+				len *= 2;
+				vec = (TSVector) repalloc(vec, len);
+				weptr = ARRPTR(vec) + i;
+			}
+
+			memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16));
+			wepptr = POSDATAPTR(vec, weptr);
+			for (j = 0; j < npos; j++)
+			{
+				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16));
+				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
+					elog(ERROR, "position information is unordered");
+			}
+
+			/*
+			 * Counter plus positions are WordEntryPos-sized items; this must
+			 * match the size used in the space check above.
+			 */
+			datalen += (npos + 1) * sizeof(WordEntryPos);
+		}
+	}
+
+	SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen));
+
+	PG_RETURN_TSVECTOR(vec);
+}