1 files changed, 683 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
new file mode 100644
index 00000000000..04b6345e162
--- /dev/null
+++ b/src/backend/utils/adt/tsvector.c
@@ -0,0 +1,683 @@
+/*-------------------------------------------------------------------------
+ *
+ * tsvector.c
+ *	  I/O functions for tsvector
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.1 2007/08/21 01:11:19 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "libpq/pqformat.h"
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+#include "utils/memutils.h"
+
+
+/*
+ * qsort comparator for WordEntryPos items.
+ *
+ * Only the WEP_GETPOS() value of each item is compared.  Returns -1, 0 or 1
+ * when a's position is less than, equal to, or greater than b's.
+ */
+static int
+comparePos(const void *a, const void *b)
+{
+	int			lhs = WEP_GETPOS(*(const WordEntryPos *) a);
+	int			rhs = WEP_GETPOS(*(const WordEntryPos *) b);
+
+	if (lhs < rhs)
+		return -1;
+	return (lhs > rhs) ? 1 : 0;
+}
+
+/*
+ * Sort a position array and remove duplicate positions in place.
+ *
+ * a: array of l WordEntryPos items (modified in place)
+ * l: number of items in a
+ *
+ * When two items share a position, the surviving item keeps the larger of
+ * the two weights.  Compaction stops early once MAXNUMPOS items have been
+ * kept, or once a kept item has position MAXENTRYPOS - 1.
+ *
+ * Returns the number of items remaining at the front of a.
+ */
+static int
+uniquePos(WordEntryPos * a, int4 l)
+{
+	WordEntryPos *ptr,
+			   *res;
+
+	/*
+	 * Nothing to sort or merge for zero or one item.  (Testing only for
+	 * l == 1 would make an empty array fall through and be reported as
+	 * holding one position.)
+	 */
+	if (l <= 1)
+		return l;
+
+	res = a;
+	qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
+
+	ptr = a + 1;
+	while (ptr - a < l)
+	{
+		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
+		{
+			/* new distinct position: keep it */
+			res++;
+			*res = *ptr;
+			if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1)
+				break;
+		}
+		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
+		{
+			/* same position: remember the larger weight */
+			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
+		}
+		ptr++;
+	}
+
+	return res + 1 - a;
+}
+
+/*
+ * qsort_arg comparator for lexeme entries.
+ *
+ * a, b: pointers to WordEntryIN items
+ * arg:  base of the character buffer that entry.pos offsets refer to
+ *
+ * Entries are ordered by length first; entries of equal length are ordered
+ * by strncmp() over that many bytes of lexeme text.
+ */
+static int
+compareentry(const void *a, const void *b, void *arg)
+{
+	const WordEntryIN *lhs = (const WordEntryIN *) a;
+	const WordEntryIN *rhs = (const WordEntryIN *) b;
+	char	   *lexemes = (char *) arg;
+
+	if (lhs->entry.len != rhs->entry.len)
+		return (lhs->entry.len > rhs->entry.len) ? 1 : -1;
+
+	return strncmp(lexemes + lhs->entry.pos,
+				   lexemes + rhs->entry.pos,
+				   lhs->entry.len);
+}
+
+/*
+ * Sort an array of parsed lexemes and merge duplicates in place.
+ *
+ * a:		  array of l WordEntryIN items (modified in place)
+ * l:		  number of items in a
+ * buf:		  character buffer that the entry.pos offsets refer to
+ * outbuflen: output; receives the number of data bytes needed to store the
+ *			  surviving entries
+ *
+ * Entries with identical lexeme text are collapsed into one, and their
+ * position lists are concatenated and then de-duplicated with uniquePos().
+ * In each position list, slot 0 holds the item count as a uint16 and the
+ * positions start at slot 1.
+ *
+ * The value stored in *outbuflen is computed from zero, independently of
+ * what the caller passed in.  Each surviving entry contributes
+ * SHORTALIGN(len) bytes of text plus, when it has positions,
+ * (count + 1) * sizeof(WordEntryPos) bytes; the "+ 1" is the count slot.
+ *
+ * Returns the number of entries remaining at the front of a.
+ */
+static int
+uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
+{
+	WordEntryIN *ptr,
+			   *res;
+	int4		buflen = 0;
+
+	if (l <= 0)
+	{
+		*outbuflen = 0;
+		return 0;
+	}
+
+	if (l > 1)
+		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);
+
+	res = a;
+	ptr = a + 1;
+	while (ptr - a < l)
+	{
+		if (!(ptr->entry.len == res->entry.len &&
+			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0))
+		{
+			/* res is complete: finalize its positions and count its space */
+			buflen += SHORTALIGN(res->entry.len);
+			if (res->entry.haspos)
+			{
+				*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
+				buflen += (*(uint16 *) (res->pos) + 1) * sizeof(WordEntryPos);
+			}
+
+			/* start a new output entry; skip the copy when already in place */
+			res++;
+			if (res != ptr)
+				memcpy(res, ptr, sizeof(WordEntryIN));
+		}
+		else if (ptr->entry.haspos)
+		{
+			/* duplicate lexeme: fold its positions into res */
+			if (res->entry.haspos)
+			{
+				int4		len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos);
+
+				res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos));
+				memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]),
+					   &(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos));
+				*(uint16 *) (res->pos) += *(uint16 *) (ptr->pos);
+				pfree(ptr->pos);
+			}
+			else
+			{
+				/* res had no positions: take over the duplicate's list */
+				res->entry.haspos = 1;
+				res->pos = ptr->pos;
+			}
+		}
+		ptr++;
+	}
+
+	/* account for the last surviving entry */
+	buflen += SHORTALIGN(res->entry.len);
+	if (res->entry.haspos)
+	{
+		*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
+		buflen += (*(uint16 *) (res->pos) + 1) * sizeof(WordEntryPos);
+	}
+
+	*outbuflen = buflen;
+	return res + 1 - a;
+}
+
+/*
+ * Compare two WordEntry items whose text lives in buf.
+ *
+ * This simply forwards to compareentry(), which reads its arguments through
+ * WordEntryIN pointers and touches only the "entry" member.
+ * NOTE(review): this relies on "entry" being the first member of
+ * WordEntryIN -- confirm against the struct declaration.
+ */
+static int
+WordEntryCMP(WordEntry * a, WordEntry * b, char *buf)
+{
+	return compareentry((const void *) a, (const void *) b, (void *) buf);
+}
+
+#define WAITWORD		1	/* between lexemes: skipping whitespace */
+#define WAITENDWORD		2	/* inside an unquoted lexeme */
+#define WAITNEXTCHAR	3	/* just saw a backslash: next character is copied literally */
+#define WAITENDCMPLX	4	/* inside a single-quoted lexeme */
+#define WAITPOSINFO		5	/* quoted lexeme ended: ':' would start position info */
+#define INPOSINFO		6	/* expecting the first digit of a position number */
+#define WAITPOSDELIM	7	/* after a position: ',', a weight letter, more digits, or end */
+#define WAITCHARCMPLX	8	/* saw a quote inside a quoted lexeme: doubled quote or end */
+/*
+ * RESIZEPRSBUF: make sure state->word has room for one more character.
+ *
+ * Expects a variable named "state" in scope.  When the bytes already used
+ * plus the encoding's maximum character length reach state->len, the buffer
+ * size is doubled and state->curpos is re-derived from the new base address.
+ */
+#define RESIZEPRSBUF \
+do { \
+	if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
+	{ \
+		int4 clen = state->curpos - state->word; /* offset survives repalloc */ \
+		state->len *= 2; \
+		state->word = (char*)repalloc( (void*)state->word, state->len ); \
+		state->curpos = state->word + clen; \
+	} \
+} while (0)
+
+bool
+gettoken_tsvector(TSVectorParseState *state)
+{
+	int4		oldstate = 0;	/* state to resume after a backslash-escaped character */
+	/*
+	 * Parse the next lexeme, and its optional position list, from
+	 * state->prsbuf.
+	 *
+	 * Returns true when a lexeme was read: its text is NUL-terminated in
+	 * state->word and state->curpos points at the terminator.  Returns false
+	 * when the input ends before another lexeme starts.  Malformed input is
+	 * reported with ereport(ERROR).
+	 *
+	 * state->alen is reset to 0 here and becomes nonzero only when position
+	 * info was read; in that case state->pos holds the positions, with the
+	 * item count stored as a uint16 in slot 0 and the positions in slots 1..n.
+	 *
+	 * When state->oprisdelim is set, a character matching ISOPERATOR() ends
+	 * an unquoted lexeme, ':' ends the lexeme instead of starting position
+	 * info, and no position info is parsed.
+	 *
+	 * Every "return true" happens before the advance at the bottom of the
+	 * loop, so state->prsbuf is left on the character that ended the token.
+	 */
+	state->curpos = state->word;
+	state->state = WAITWORD;
+	state->alen = 0;
+
+	while (1)
+	{
+		if (state->state == WAITWORD)	/* skipping whitespace before a lexeme */
+		{
+			if (*(state->prsbuf) == '\0')
+				return false;	/* end of input, nothing more to read */
+			else if (t_iseq(state->prsbuf, '\''))
+				state->state = WAITENDCMPLX;	/* opening quote is not copied */
+			else if (t_iseq(state->prsbuf, '\\'))
+			{
+				state->state = WAITNEXTCHAR;
+				oldstate = WAITENDWORD;
+			}
+			else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("syntax error in tsvector")));
+			else if (!t_isspace(state->prsbuf))
+			{
+				COPYCHAR(state->curpos, state->prsbuf);	/* first character of an unquoted lexeme */
+				state->curpos += pg_mblen(state->prsbuf);
+				state->state = WAITENDWORD;
+			}
+		}
+		else if (state->state == WAITNEXTCHAR)	/* character following a backslash */
+		{
+			if (*(state->prsbuf) == '\0')
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("there is no escaped character")));
+			else
+			{
+				RESIZEPRSBUF;
+				COPYCHAR(state->curpos, state->prsbuf);
+				state->curpos += pg_mblen(state->prsbuf);
+				state->state = oldstate;	/* back to the state that saw the backslash */
+			}
+		}
+		else if (state->state == WAITENDWORD)	/* inside an unquoted lexeme */
+		{
+			if (t_iseq(state->prsbuf, '\\'))
+			{
+				state->state = WAITNEXTCHAR;
+				oldstate = WAITENDWORD;
+			}
+			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
+					 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
+			{
+				RESIZEPRSBUF;
+				if (state->curpos == state->word)	/* empty lexeme */
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				*(state->curpos) = '\0';
+				return true;	/* lexeme without position info */
+			}
+			else if (t_iseq(state->prsbuf, ':'))
+			{
+				if (state->curpos == state->word)	/* empty lexeme */
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				*(state->curpos) = '\0';
+				if (state->oprisdelim)
+					return true;	/* ':' is left for the caller */
+				else
+					state->state = INPOSINFO;
+			}
+			else
+			{
+				RESIZEPRSBUF;
+				COPYCHAR(state->curpos, state->prsbuf);
+				state->curpos += pg_mblen(state->prsbuf);
+			}
+		}
+		else if (state->state == WAITENDCMPLX)	/* inside a quoted lexeme */
+		{
+			if (t_iseq(state->prsbuf, '\''))
+			{
+				state->state = WAITCHARCMPLX;	/* closing quote, or first half of a doubled quote */
+			}
+			else if (t_iseq(state->prsbuf, '\\'))
+			{
+				state->state = WAITNEXTCHAR;
+				oldstate = WAITENDCMPLX;
+			}
+			else if (*(state->prsbuf) == '\0')	/* unterminated quoted lexeme */
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("syntax error in tsvector")));
+			else
+			{
+				RESIZEPRSBUF;
+				COPYCHAR(state->curpos, state->prsbuf);
+				state->curpos += pg_mblen(state->prsbuf);
+			}
+		}
+		else if (state->state == WAITCHARCMPLX)	/* character right after a quote in a quoted lexeme */
+		{
+			if (t_iseq(state->prsbuf, '\''))
+			{
+				RESIZEPRSBUF;
+				COPYCHAR(state->curpos, state->prsbuf);	/* doubled quote yields one literal quote */
+				state->curpos += pg_mblen(state->prsbuf);
+				state->state = WAITENDCMPLX;
+			}
+			else
+			{
+				RESIZEPRSBUF;
+				*(state->curpos) = '\0';	/* the quote seen earlier closed the lexeme */
+				if (state->curpos == state->word)	/* empty quoted lexeme */
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				if (state->oprisdelim)
+				{
+					/* state->prsbuf+=pg_mblen(state->prsbuf); */
+					return true;
+				}
+				else
+					state->state = WAITPOSINFO;
+				continue;		/* recheck current character */
+			}
+		}
+		else if (state->state == WAITPOSINFO)	/* quoted lexeme done: is there a ':'? */
+		{
+			if (t_iseq(state->prsbuf, ':'))
+				state->state = INPOSINFO;
+			else
+				return true;	/* no position info follows */
+		}
+		else if (state->state == INPOSINFO)	/* expecting a position number */
+		{
+			if (t_isdigit(state->prsbuf))
+			{
+				if (state->alen == 0)
+				{
+					state->alen = 4;	/* initial capacity, in WordEntryPos slots */
+					state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen);
+					*(uint16 *) (state->pos) = 0;	/* slot 0 holds the item count */
+				}
+				else if (*(uint16 *) (state->pos) + 1 >= state->alen)
+				{
+					state->alen *= 2;
+					state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen);
+				}
+				(*(uint16 *) (state->pos))++;
+				WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf)));	/* atoi reads the whole digit run from here */
+				if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("wrong position info in tsvector")));
+				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
+				state->state = WAITPOSDELIM;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("syntax error in tsvector")));
+		}
+		else if (state->state == WAITPOSDELIM)	/* after the first digit of a position */
+		{
+			if (t_iseq(state->prsbuf, ','))
+				state->state = INPOSINFO;	/* another position follows */
+			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
+			{
+				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))	/* a nonzero weight was already given */
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
+			}
+			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
+			{
+				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
+			}
+			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
+			{
+				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
+			}
+			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
+			{
+				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("syntax error in tsvector")));
+				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
+			}
+			else if (t_isspace(state->prsbuf) ||
+					 *(state->prsbuf) == '\0')
+				return true;	/* lexeme with position info is complete */
+			else if (!t_isdigit(state->prsbuf))	/* remaining digits of the number are skipped */
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("syntax error in tsvector")));
+		}
+		else					/* internal error */
+			elog(ERROR, "internal error in gettoken_tsvector");
+
+		/* get next char */
+		state->prsbuf += pg_mblen(state->prsbuf);
+	}
+
+	return false;
+}
+
+/*
+ * tsvectorin
+ *		Input function for tsvector: convert the text form into a TSVector.
+ *
+ * Argument 0 is the C string to parse.  Lexemes are read one at a time with
+ * gettoken_tsvector(), collected into a temporary array plus a character
+ * buffer, then sorted and de-duplicated with uniqueentry() before the final
+ * value is assembled.
+ *
+ * Raises an error when a lexeme is MAXSTRLEN bytes or longer, when the
+ * accumulated lexeme text exceeds MAXSTRPOS bytes, or when the parser
+ * rejects the input.
+ */
+Datum
+tsvectorin(PG_FUNCTION_ARGS)
+{
+	char	   *buf = PG_GETARG_CSTRING(0);
+	TSVectorParseState state;
+	WordEntryIN *arr;
+	WordEntry  *inarr;
+	int4		len = 0,
+				totallen = 64;
+	TSVector	in;
+	char	   *tmpbuf,
+			   *cur;
+	int4		i,
+				buflen = 256;
+
+	pg_verifymbstr(buf, strlen(buf), false);
+	state.prsbuf = buf;
+	state.len = 32;
+	state.word = (char *) palloc(state.len);
+	state.oprisdelim = false;
+
+	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen);
+	cur = tmpbuf = (char *) palloc(buflen);
+
+	while (gettoken_tsvector(&state))
+	{
+		/*
+		 * Realloc buffers if it's needed
+		 */
+		if (len >= totallen)
+		{
+			totallen *= 2;
+			arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen);
+		}
+
+		while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen)
+		{
+			int4		dist = cur - tmpbuf;
+
+			buflen *= 2;
+			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
+			cur = tmpbuf + dist;
+		}
+
+		/*
+		 * A pointer difference is not an int on every platform, so cast
+		 * both values explicitly to match the %ld conversions.
+		 */
+		if (state.curpos - state.word >= MAXSTRLEN)
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("word is too long (%ld bytes, max %ld bytes)",
+							(long) (state.curpos - state.word),
+							(long) MAXSTRLEN)));
+
+		arr[len].entry.len = state.curpos - state.word;
+		if (cur - tmpbuf > MAXSTRPOS)
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("position value too large")));
+		arr[len].entry.pos = cur - tmpbuf;
+		memcpy((void *) cur, (void *) state.word, arr[len].entry.len);
+		cur += arr[len].entry.len;
+
+		/* alen is nonzero only when the parser collected position info */
+		if (state.alen)
+		{
+			arr[len].entry.haspos = 1;
+			arr[len].pos = state.pos;
+		}
+		else
+			arr[len].entry.haspos = 0;
+		len++;
+	}
+	pfree(state.word);
+
+	/* sort, merge duplicates, and learn how many data bytes are needed */
+	if (len > 0)
+		len = uniqueentry(arr, len, tmpbuf, &buflen);
+	else
+		buflen = 0;
+	totallen = CALCDATASIZE(len, buflen);
+	in = (TSVector) palloc0(totallen);
+
+	SET_VARSIZE(in, totallen);
+	in->size = len;
+	cur = STRPTR(in);
+	inarr = ARRPTR(in);
+	for (i = 0; i < len; i++)
+	{
+		/* lexeme text, then (optionally) its count-prefixed position list */
+		memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
+		arr[i].entry.pos = cur - STRPTR(in);
+		cur += SHORTALIGN(arr[i].entry.len);
+		if (arr[i].entry.haspos)
+		{
+			memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos));
+			cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos);
+			pfree(arr[i].pos);
+		}
+		inarr[i] = arr[i].entry;
+	}
+
+	PG_RETURN_TSVECTOR(in);
+}
+
+/*
+ * tsvectorout
+ *		Output function for tsvector: convert a TSVector to its text form.
+ *
+ * Each lexeme is written inside single quotes and lexemes are separated by
+ * one space.  A lexeme with positions is followed by ':' and a
+ * comma-separated position list; weights 3, 2 and 1 are printed as the
+ * suffixes A, B and C, and weight 0 is printed with no suffix.
+ *
+ * Single quotes and backslashes inside a lexeme are doubled.  The input
+ * parser treats a backslash as an escape character even inside quotes, so
+ * an unescaped backslash in the output would not read back as the same
+ * lexeme.
+ */
+Datum
+tsvectorout(PG_FUNCTION_ARGS)
+{
+	TSVector	out = PG_GETARG_TSVECTOR(0);
+	char	   *outbuf;
+	int4		i,
+				lenbuf = 0,
+				pp;
+	WordEntry  *ptr = ARRPTR(out);
+	char	   *curbegin,
+			   *curin,
+			   *curout;
+
+	/*
+	 * Compute an upper bound for the output size.  Every lexeme byte is
+	 * budgeted at twice the maximum character length, which leaves room for
+	 * doubling any escaped character.
+	 */
+	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
+	for (i = 0; i < out->size; i++)
+	{
+		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
+		if (ptr[i].haspos)
+			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
+	}
+
+	curout = outbuf = (char *) palloc(lenbuf);
+	for (i = 0; i < out->size; i++)
+	{
+		curbegin = curin = STRPTR(out) + ptr->pos;
+		if (i != 0)
+			*curout++ = ' ';
+		*curout++ = '\'';
+		while (curin - curbegin < ptr->len)
+		{
+			int			len = pg_mblen(curin);
+
+			/* double the characters that the input parser treats specially */
+			if (t_iseq(curin, '\''))
+				*curout++ = '\'';
+			else if (t_iseq(curin, '\\'))
+				*curout++ = '\\';
+
+			while (len--)
+				*curout++ = *curin++;
+		}
+
+		*curout++ = '\'';
+		if ((pp = POSDATALEN(out, ptr)) != 0)
+		{
+			WordEntryPos *wptr;
+
+			*curout++ = ':';
+			wptr = POSDATAPTR(out, ptr);
+			while (pp)
+			{
+				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
+				switch (WEP_GETWEIGHT(*wptr))
+				{
+					case 3:
+						*curout++ = 'A';
+						break;
+					case 2:
+						*curout++ = 'B';
+						break;
+					case 1:
+						*curout++ = 'C';
+						break;
+					case 0:
+					default:
+						/* weight 0 is printed without a suffix */
+						break;
+				}
+
+				if (pp > 1)
+					*curout++ = ',';
+				pp--;
+				wptr++;
+			}
+		}
+		ptr++;
+	}
+
+	*curout = '\0';
+	PG_FREE_IF_COPY(out, 0);
+	PG_RETURN_CSTRING(outbuf);
+}
+
+/*
+ * tsvectorsend
+ *		Binary send function for tsvector.
+ *
+ * The message holds the entry count as an int32, followed for each entry by
+ * the raw WordEntry word as an int32, the lexeme bytes and, when the entry
+ * has positions, the position count and then each position, each sent with
+ * width sizeof(WordEntryPos).
+ */
+Datum
+tsvectorsend(PG_FUNCTION_ARGS)
+{
+	TSVector	vec = PG_GETARG_TSVECTOR(0);
+	WordEntry  *entries = ARRPTR(vec);
+	StringInfoData buf;
+	int			i;
+
+	pq_begintypsend(&buf);
+	pq_sendint(&buf, vec->size, sizeof(int32));
+
+	for (i = 0; i < vec->size; i++)
+	{
+		WordEntry  *entry = &entries[i];
+
+		/* this depends on sizeof(WordEntry) == sizeof(int32) */
+		pq_sendint(&buf, *(int32 *) entry, sizeof(int32));
+		pq_sendbytes(&buf, STRPTR(vec) + entry->pos, entry->len);
+
+		if (entry->haspos)
+		{
+			WordEntryPos *positions = POSDATAPTR(vec, entry);
+			int			npos = POSDATALEN(vec, entry);
+			int			j;
+
+			pq_sendint(&buf, npos, sizeof(WordEntryPos));
+			for (j = 0; j < npos; j++)
+				pq_sendint(&buf, positions[j], sizeof(WordEntryPos));
+		}
+	}
+
+	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
+}
+
+/*
+ * tsvectorrecv
+ *		Binary receive function for tsvector.
+ *
+ * Reads the format written by tsvectorsend: an int32 entry count, then for
+ * each entry the raw WordEntry word, the lexeme bytes and, when the entry
+ * has positions, a position count followed by the positions.
+ *
+ * The lexeme offset carried in each incoming WordEntry is not trusted.  It
+ * is replaced by the running offset "datalen", which is the value the
+ * buffer-growth checks below are based on, so a malformed message cannot
+ * direct a write outside the allocated area.
+ *
+ * Raises an error when the entry count is out of range, when lexemes or
+ * positions are not in strictly increasing order, when a position count
+ * exceeds MAXNUMPOS, or when the lexeme data grows past MAXSTRPOS.
+ */
+Datum
+tsvectorrecv(PG_FUNCTION_ARGS)
+{
+	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
+	TSVector	vec;
+	int			i,
+				size,
+				len = DATAHDRSIZE;
+	WordEntry  *weptr;
+	int			datalen = 0;
+
+	size = pq_getmsgint(buf, sizeof(uint32));
+	if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
+		elog(ERROR, "invalid size of tsvector");
+
+	len += sizeof(WordEntry) * size;
+
+	/* start with twice the header + entry array; grown on demand below */
+	len *= 2;
+	vec = (TSVector) palloc0(len);
+	vec->size = size;
+
+	for (i = 0; i < size; i++)
+	{
+		int			tmp;
+
+		weptr = ARRPTR(vec) + i;
+
+		/*
+		 * We are sure that sizeof(WordEntry) == sizeof(int32)
+		 */
+		tmp = pq_getmsgint(buf, sizeof(int32));
+		*weptr = *(WordEntry *) & tmp;
+
+		/* place the lexeme at the running offset, ignoring the wire value */
+		if (datalen > MAXSTRPOS)
+			elog(ERROR, "invalid tsvector: lexeme data is too long");
+		weptr->pos = datalen;
+
+		while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len)
+		{
+			len *= 2;
+			vec = (TSVector) repalloc(vec, len);
+			weptr = ARRPTR(vec) + i;	/* vec may have moved */
+		}
+
+		memcpy(STRPTR(vec) + weptr->pos,
+			   pq_getmsgbytes(buf, weptr->len),
+			   weptr->len);
+		datalen += SHORTALIGN(weptr->len);
+
+		if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0)
+			elog(ERROR, "lexemes are unordered");
+
+		if (weptr->haspos)
+		{
+			uint16		j,
+						npos;
+			WordEntryPos *wepptr;
+
+			npos = (uint16) pq_getmsgint(buf, sizeof(int16));
+			if (npos > MAXNUMPOS)
+				elog(ERROR, "unexpected number of positions");
+
+			while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len)
+			{
+				len *= 2;
+				vec = (TSVector) repalloc(vec, len);
+				weptr = ARRPTR(vec) + i;	/* vec may have moved */
+			}
+
+			/* the count goes first, followed by the positions themselves */
+			memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16));
+			wepptr = POSDATAPTR(vec, weptr);
+			for (j = 0; j < npos; j++)
+			{
+				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16));
+				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
+					elog(ERROR, "position information is unordered");
+			}
+
+			/*
+			 * Advance by what was actually stored: the count slot plus npos
+			 * items of sizeof(WordEntryPos) each.  This is the same amount
+			 * the growth check above reserved, so the final varsize cannot
+			 * exceed the allocation.
+			 */
+			datalen += (npos + 1) * sizeof(WordEntryPos);
+		}
+	}
+
+	SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen));
+
+	PG_RETURN_TSVECTOR(vec);
+}