diff options
Diffstat (limited to 'src/backend/utils/adt/tsvector.c')
-rw-r--r-- | src/backend/utils/adt/tsvector.c | 137 |
1 files changed, 74 insertions, 63 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 0d82da1f902..cb90274943b 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.5 2007/10/21 22:29:56 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.6 2007/10/23 00:51:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,16 +22,18 @@ typedef struct { - WordEntry entry; /* should be first ! */ + WordEntry entry; /* must be first! */ WordEntryPos *pos; int poslen; /* number of elements in pos */ } WordEntryIN; + +/* Compare two WordEntryPos values for qsort */ static int comparePos(const void *a, const void *b) { - int apos = WEP_GETPOS(*(WordEntryPos *) a); - int bpos = WEP_GETPOS(*(WordEntryPos *) b); + int apos = WEP_GETPOS(*(const WordEntryPos *) a); + int bpos = WEP_GETPOS(*(const WordEntryPos *) b); if (apos == bpos) return 0; @@ -53,9 +55,9 @@ uniquePos(WordEntryPos * a, int l) if (l <= 1) return l; - res = a; qsort((void *) a, l, sizeof(WordEntryPos), comparePos); + res = a; ptr = a + 1; while (ptr - a < l) { @@ -63,7 +65,8 @@ uniquePos(WordEntryPos * a, int l) { res++; *res = *ptr; - if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1) + if (res - a >= MAXNUMPOS - 1 || + WEP_GETPOS(*res) == MAXENTRYPOS - 1) break; } else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) @@ -74,12 +77,13 @@ uniquePos(WordEntryPos * a, int l) return res + 1 - a; } +/* Compare two WordEntryIN values for qsort */ static int compareentry(const void *va, const void *vb, void *arg) { + const WordEntryIN *a = (const WordEntryIN *) va; + const WordEntryIN *b = (const WordEntryIN *) vb; char *BufferStr = (char *) arg; - WordEntryIN *a = (WordEntryIN *) va; - WordEntryIN *b = (WordEntryIN *) vb; if (a->entry.len == b->entry.len) { @@ -91,44 +95,40 @@ compareentry(const void *va, const void *vb, void *arg) return (a->entry.len > b->entry.len) ? 1 : -1; } +/* + * Sort an array of WordEntryIN, remove duplicates. + * *outbuflen receives the amount of space needed for strings and positions. + */ static int uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { + int buflen; WordEntryIN *ptr, *res; Assert(l >= 1); - if (l == 1) - { - if (a->entry.haspos) - { - a->poslen = uniquePos(a->pos, a->poslen); - *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos); - } - else - *outbuflen = a->entry.len; + if (l > 1) + qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, + (void *) buf); - return l; - } + buflen = 0; res = a; - ptr = a + 1; - qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf); - while (ptr - a < l) { if (!(ptr->entry.len == res->entry.len && - strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0)) + strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], + res->entry.len) == 0)) { + /* done accumulating data into *res, count space needed */ + buflen += res->entry.len; if (res->entry.haspos) { - *outbuflen += SHORTALIGN(res->entry.len); res->poslen = uniquePos(res->pos, res->poslen); - *outbuflen += res->poslen * sizeof(WordEntryPos); + buflen = SHORTALIGN(buflen); + buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); } - else - *outbuflen += res->entry.len; res++; memcpy(res, ptr, sizeof(WordEntryIN)); } @@ -136,37 +136,37 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { if (res->entry.haspos) { + /* append ptr's positions to res's positions */ int newlen = ptr->poslen + res->poslen; - /* Append res to pos */ - - res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos)); - memcpy(&res->pos[res->poslen], - ptr->pos, ptr->poslen * sizeof(WordEntryPos)); + res->pos = (WordEntryPos *) + repalloc(res->pos, newlen * sizeof(WordEntryPos)); + memcpy(&res->pos[res->poslen], ptr->pos, + ptr->poslen * sizeof(WordEntryPos)); res->poslen = newlen; pfree(ptr->pos); } else { + /* just give ptr's positions to pos */ res->entry.haspos = 1; res->pos = ptr->pos; + res->poslen = ptr->poslen; } } ptr++; } - /* add last item */ - + /* count space needed for last item */ + buflen += res->entry.len; if (res->entry.haspos) { - *outbuflen += SHORTALIGN(res->entry.len); - res->poslen = uniquePos(res->pos, res->poslen); - *outbuflen += res->poslen * sizeof(WordEntryPos); + buflen = SHORTALIGN(buflen); + buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); } - else - *outbuflen += res->entry.len; + *outbuflen = buflen; return res + 1 - a; } @@ -193,6 +193,8 @@ tsvectorin(PG_FUNCTION_ARGS) int toklen; WordEntryPos *pos; int poslen; + char *strbuf; + int stroff; /* * Tokens are appended to tmpbuf, cur is a pointer @@ -212,19 +214,17 @@ tsvectorin(PG_FUNCTION_ARGS) while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) { - if (toklen >= MAXSTRLEN) ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("word is too long (%ld bytes, max %ld bytes)", (long) toklen, - (long) MAXSTRLEN))); - + (long) (MAXSTRLEN-1)))); if (cur - tmpbuf > MAXSTRPOS) ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("position value is too large"))); + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); /* * Enlarge buffers if needed @@ -232,7 +232,8 @@ tsvectorin(PG_FUNCTION_ARGS) if (len >= arrlen) { arrlen *= 2; - arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); + arr = (WordEntryIN *) + repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); } while ((cur - tmpbuf) + toklen >= buflen) { @@ -254,7 +255,11 @@ tsvectorin(PG_FUNCTION_ARGS) arr[len].poslen = poslen; } else + { arr[len].entry.haspos = 0; + arr[len].pos = NULL; + arr[len].poslen = 0; + } len++; } @@ -264,40 +269,45 @@ tsvectorin(PG_FUNCTION_ARGS) len = uniqueentry(arr, len, tmpbuf, &buflen); else buflen = 0; + + if (buflen > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); + totallen = CALCDATASIZE(len, buflen); in = (TSVector) palloc0(totallen); - SET_VARSIZE(in, totallen); in->size = len; - cur = STRPTR(in); inarr = ARRPTR(in); + strbuf = STRPTR(in); + stroff = 0; for (i = 0; i < len; i++) { - memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len); - arr[i].entry.pos = cur - STRPTR(in); - cur += SHORTALIGN(arr[i].entry.len); + memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); + arr[i].entry.pos = stroff; + stroff += arr[i].entry.len; if (arr[i].entry.haspos) { - uint16 tmplen; - - if(arr[i].poslen > 0xFFFF) + if (arr[i].poslen > 0xFFFF) elog(ERROR, "positions array too long"); - tmplen = (uint16) arr[i].poslen; - - /* Copy length to output struct */ - memcpy(cur, &tmplen, sizeof(uint16)); - cur += sizeof(uint16); + /* Copy number of positions */ + stroff = SHORTALIGN(stroff); + *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; + stroff += sizeof(uint16); /* Copy positions */ - memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos)); - cur += arr[i].poslen * sizeof(WordEntryPos); + memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); + stroff += arr[i].poslen * sizeof(WordEntryPos); pfree(arr[i].pos); } inarr[i] = arr[i].entry; } + Assert((strbuf + stroff - (char *) in) == totallen); + PG_RETURN_TSVECTOR(in); } @@ -495,11 +505,12 @@ tsvectorrecv(PG_FUNCTION_ARGS) datalen += lex_len; - if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0) + if (i > 0 && WordEntryCMP(&vec->entries[i], + &vec->entries[i - 1], + STRPTR(vec)) <= 0) elog(ERROR, "lexemes are misordered"); /* Receive positions */ - if (npos > 0) { uint16 j; |