/*-------------------------------------------------------------------------
 *
 * tsvector.c
 *	  I/O functions for tsvector
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "libpq/pqformat.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
#include "utils/memutils.h"

typedef struct
{
	WordEntry	entry;			/* should be first ! */
	WordEntryPos *pos;
	int			poslen;			/* number of elements in pos */
} WordEntryIN;

static int
comparePos(const void *a, const void *b)
{
	int apos = WEP_GETPOS(*(WordEntryPos *) a);
	int bpos = WEP_GETPOS(*(WordEntryPos *) b);

	if (apos == bpos)
		return 0;
	return (apos > bpos) ? 1 : -1;
}

/*
 * Removes duplicate pos entries. If there's two entries with same pos
 * but different weight, the higher weight is retained.
 *
 * Returns new length.
 */
static int
uniquePos(WordEntryPos * a, int l)
{
	WordEntryPos *ptr,
			   *res;

	if (l <= 1)
		return l;

	res = a;
	qsort((void *) a, l, sizeof(WordEntryPos), comparePos);

	ptr = a + 1;
	while (ptr - a < l)
	{
		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
		{
			res++;
			*res = *ptr;
			if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1)
				break;
		}
		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
		ptr++;
	}

	return res + 1 - a;
}

static int
compareentry(const void *va, const void *vb, void *arg)
{
	char	   *BufferStr = (char *) arg;
	WordEntryIN *a = (WordEntryIN *) va;
	WordEntryIN *b = (WordEntryIN *) vb;

	if (a->entry.len == b->entry.len)
	{
		return strncmp(&BufferStr[a->entry.pos],
					   &BufferStr[b->entry.pos],
					   a->entry.len);
	}

	return (a->entry.len > b->entry.len) ? 1 : -1;
}

static int
uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
{
	WordEntryIN *ptr,
			   *res;

	Assert(l >= 1);

	if (l == 1)
	{
		if (a->entry.haspos)
		{
			a->poslen = uniquePos(a->pos, a->poslen);
			*outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
		}
		else
			*outbuflen = a->entry.len;

		return l;
	}
	res = a;

	ptr = a + 1;
	qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);

	while (ptr - a < l)
	{
		if (!(ptr->entry.len == res->entry.len &&
			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0))
		{
			if (res->entry.haspos)
			{
				*outbuflen += SHORTALIGN(res->entry.len);
				res->poslen = uniquePos(res->pos, res->poslen);
				*outbuflen += res->poslen * sizeof(WordEntryPos);
			}
			else
				*outbuflen += res->entry.len;
			res++;
			memcpy(res, ptr, sizeof(WordEntryIN));
		}
		else if (ptr->entry.haspos)
		{
			if (res->entry.haspos)
			{
				int	newlen = ptr->poslen + res->poslen;

				/* Append res to pos */

				res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos));
				memcpy(&res->pos[res->poslen],
					   ptr->pos, ptr->poslen * sizeof(WordEntryPos));
				res->poslen = newlen;
				pfree(ptr->pos);
			}
			else
			{
				res->entry.haspos = 1;
				res->pos = ptr->pos;
			}
		}
		ptr++;
	}

	/* add last item */

	if (res->entry.haspos)
	{
		*outbuflen += SHORTALIGN(res->entry.len);

		res->poslen = uniquePos(res->pos, res->poslen);
		*outbuflen += res->poslen * sizeof(WordEntryPos);
	}
	else
		*outbuflen += res->entry.len;

	return res + 1 - a;
}

static int
WordEntryCMP(WordEntry * a, WordEntry * b, char *buf)
{
	return compareentry(a, b, buf);
}


Datum
tsvectorin(PG_FUNCTION_ARGS)
{
	char	   *buf = PG_GETARG_CSTRING(0);
	TSVectorParseState state;
	WordEntryIN *arr;
	int			totallen;
	int			arrlen;  /* allocated size of arr */
	WordEntry  *inarr;
	int			len = 0;
	TSVector	in;
	int			i;
	char	   *token;
	int			toklen;
	WordEntryPos *pos;
	int			poslen;

	/*
	 * Tokens are appended to tmpbuf, cur is a pointer
	 * to the end of used space in tmpbuf.
	 */
	char	   *tmpbuf;
	char	   *cur;
	int			buflen = 256; /* allocated size of tmpbuf */

	pg_verifymbstr(buf, strlen(buf), false);

	state = init_tsvector_parser(buf, false);
	
	arrlen = 64;
	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
	cur = tmpbuf = (char *) palloc(buflen);

	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
	{

		if (toklen >= MAXSTRLEN)
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("word is too long (%ld bytes, max %ld bytes)",
							(long) toklen,
							(long) MAXSTRLEN)));


		if (cur - tmpbuf > MAXSTRPOS)
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("position value too large")));

		/*
		 * Enlarge buffers if needed
		 */
		if (len >= arrlen)
		{
			arrlen *= 2;
			arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
		}
		while ((cur - tmpbuf) + toklen >= buflen)
		{
			int	dist = cur - tmpbuf;

			buflen *= 2;
			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
			cur = tmpbuf + dist;
		}
		arr[len].entry.len = toklen;
		arr[len].entry.pos = cur - tmpbuf;
		memcpy((void *) cur, (void *) token, toklen);
		cur += toklen;

		if (poslen != 0)
		{
			arr[len].entry.haspos = 1;
			arr[len].pos = pos;
			arr[len].poslen = poslen;
		}
		else
			arr[len].entry.haspos = 0;
		len++;
	}

	close_tsvector_parser(state);

	if (len > 0)
		len = uniqueentry(arr, len, tmpbuf, &buflen);
	else
		buflen = 0;
	totallen = CALCDATASIZE(len, buflen);
	in = (TSVector) palloc0(totallen);

	SET_VARSIZE(in, totallen);
	in->size = len;
	cur = STRPTR(in);
	inarr = ARRPTR(in);
	for (i = 0; i < len; i++)
	{
		memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
		arr[i].entry.pos = cur - STRPTR(in);
		cur += SHORTALIGN(arr[i].entry.len);
		if (arr[i].entry.haspos)
		{
			uint16 tmplen;

			if(arr[i].poslen > 0xFFFF)
				elog(ERROR, "positions array too long");

			tmplen = (uint16) arr[i].poslen;

			/* Copy length to output struct */
			memcpy(cur, &tmplen, sizeof(uint16));
			cur += sizeof(uint16);

			/* Copy positions */
			memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos));
			cur += arr[i].poslen * sizeof(WordEntryPos);

			pfree(arr[i].pos);
		}
		inarr[i] = arr[i].entry;
	}

	PG_RETURN_TSVECTOR(in);
}

Datum
tsvectorout(PG_FUNCTION_ARGS)
{
	TSVector	out = PG_GETARG_TSVECTOR(0);
	char	   *outbuf;
	int4		i,
				lenbuf = 0,
				pp;
	WordEntry  *ptr = ARRPTR(out);
	char	   *curbegin,
			   *curin,
			   *curout;

	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
	for (i = 0; i < out->size; i++)
	{
		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
		if (ptr[i].haspos)
			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
	}

	curout = outbuf = (char *) palloc(lenbuf);
	for (i = 0; i < out->size; i++)
	{
		curbegin = curin = STRPTR(out) + ptr->pos;
		if (i != 0)
			*curout++ = ' ';
		*curout++ = '\'';
		while (curin - curbegin < ptr->len)
		{
			int			len = pg_mblen(curin);

			if (t_iseq(curin, '\''))
				*curout++ = '\'';

			while (len--)
				*curout++ = *curin++;
		}

		*curout++ = '\'';
		if ((pp = POSDATALEN(out, ptr)) != 0)
		{
			WordEntryPos *wptr;

			*curout++ = ':';
			wptr = POSDATAPTR(out, ptr);
			while (pp)
			{
				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
				switch (WEP_GETWEIGHT(*wptr))
				{
					case 3:
						*curout++ = 'A';
						break;
					case 2:
						*curout++ = 'B';
						break;
					case 1:
						*curout++ = 'C';
						break;
					case 0:
					default:
						break;
				}

				if (pp > 1)
					*curout++ = ',';
				pp--;
				wptr++;
			}
		}
		ptr++;
	}

	*curout = '\0';
	PG_FREE_IF_COPY(out, 0);
	PG_RETURN_CSTRING(outbuf);
}

/*
 * Binary Input / Output functions. The binary format is as follows:
 *
 * uint32	number of lexemes
 * 
 * for each lexeme:
 *		lexeme text in client encoding, null-terminated
 * 		uint16	number of positions
 * 		for each position:
 *			uint16 WordEntryPos
 */

Datum
tsvectorsend(PG_FUNCTION_ARGS)
{
	TSVector	vec = PG_GETARG_TSVECTOR(0);
	StringInfoData buf;
	int			i,
				j;
	WordEntry  *weptr = ARRPTR(vec);

	pq_begintypsend(&buf);

	pq_sendint(&buf, vec->size, sizeof(int32));
	for (i = 0; i < vec->size; i++)
	{
		uint16 npos;

		/* the strings in the TSVector array are not null-terminated, so 
		 * we have to send the null-terminator separately
		 */
		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
		pq_sendbyte(&buf, '\0');

		npos = POSDATALEN(vec, weptr);
		pq_sendint(&buf, npos, sizeof(uint16));

		if(npos > 0)
		{
			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);

			for (j = 0; j < npos; j++)
				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
		}
		weptr++;
	}

	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}

Datum
tsvectorrecv(PG_FUNCTION_ARGS)
{
	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
	TSVector	vec;
	int			i;
	int32		nentries;
	int			datalen; /* number of bytes used in the variable size area
						  * after fixed size TSVector header and WordEntries
						  */
	Size		hdrlen;
	Size		len;  /* allocated size of vec */

	nentries = pq_getmsgint(buf, sizeof(int32));
	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
		elog(ERROR, "invalid size of tsvector");

	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;

	len = hdrlen * 2; /* times two to make room for lexemes */
	vec = (TSVector) palloc0(len);
	vec->size = nentries;

	datalen = 0;
	for (i = 0; i < nentries; i++)
	{
		const char *lexeme;
		uint16 npos;
		size_t lex_len;

		lexeme = pq_getmsgstring(buf);
		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));

		/* sanity checks */

		lex_len = strlen(lexeme);
		if (lex_len < 0 || lex_len > MAXSTRLEN)
			elog(ERROR, "invalid tsvector; lexeme too long");

		if (datalen > MAXSTRPOS)
			elog(ERROR, "invalid tsvector; maximum total lexeme length exceeded"); 

		if (npos > MAXNUMPOS)
			elog(ERROR, "unexpected number of positions");

		/*
		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
		 *
		 * But make sure the buffer is large enough first.
		 */
		while (hdrlen + SHORTALIGN(datalen + lex_len) +
			   (npos + 1) * sizeof(WordEntryPos) >= len)
		{
			len *= 2;
			vec = (TSVector) repalloc(vec, len);
		}

		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
		vec->entries[i].len = lex_len;
		vec->entries[i].pos = datalen;

		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);

		datalen += lex_len;

		if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0)
			elog(ERROR, "lexemes are unordered");

		/* Receive positions */

		if (npos > 0)
		{
			uint16		j;
			WordEntryPos *wepptr;

			/*
			 * Pad to 2-byte alignment if necessary. Though we used palloc0
			 * for the initial allocation, subsequent repalloc'd memory
			 * areas are not initialized to zero.
			 */
			if (datalen != SHORTALIGN(datalen))
			{
				*(STRPTR(vec) + datalen) = '\0';
				datalen = SHORTALIGN(datalen);
			}

			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));

			wepptr = POSDATAPTR(vec, &vec->entries[i]);
			for (j = 0; j < npos; j++)
			{
				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
					elog(ERROR, "position information is unordered");
			}

			datalen += (npos + 1) * sizeof(WordEntry);
		}
	}

	SET_VARSIZE(vec, hdrlen + datalen);

	PG_RETURN_TSVECTOR(vec);
}