diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2007-10-23 00:51:23 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2007-10-23 00:51:23 +0000 |
commit | bb36c51fcdca16ad4dfd4c03e2673f2471c7e341 (patch) | |
tree | 9dc4fdc1192918a076ef8f10f77f2126d4854280 /src/backend/utils/adt/tsvector.c | |
parent | f5513484171e26456a76a5cdeaad0dbf41980dd8 (diff) | |
download | postgresql-bb36c51fcdca16ad4dfd4c03e2673f2471c7e341.tar.gz postgresql-bb36c51fcdca16ad4dfd4c03e2673f2471c7e341.zip |
Fix several bugs in tsvectorin, including crash due to uninitialized field and
miscomputation of required palloc size. The crash could only occur if the
input contained lexemes both with and without positions, which is probably not
common in practice. The miscomputation would definitely result in wasted
space. Also fix some inconsistent coding around alignment of strings and
positions in a tsvector value; these errors could also lead to crashes given
mixed with/without position data and a machine that's picky about alignment.
And be more careful about checking for overflow of string offsets.
Patch is only against HEAD --- I have not looked to see whether the same bugs
are in the back-branch contrib/tsearch2 code.
Diffstat (limited to 'src/backend/utils/adt/tsvector.c')
-rw-r--r-- | src/backend/utils/adt/tsvector.c | 137 |
1 file changed, 74 insertions, 63 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 0d82da1f902..cb90274943b 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.5 2007/10/21 22:29:56 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.6 2007/10/23 00:51:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,16 +22,18 @@ typedef struct { - WordEntry entry; /* should be first ! */ + WordEntry entry; /* must be first! */ WordEntryPos *pos; int poslen; /* number of elements in pos */ } WordEntryIN; + +/* Compare two WordEntryPos values for qsort */ static int comparePos(const void *a, const void *b) { - int apos = WEP_GETPOS(*(WordEntryPos *) a); - int bpos = WEP_GETPOS(*(WordEntryPos *) b); + int apos = WEP_GETPOS(*(const WordEntryPos *) a); + int bpos = WEP_GETPOS(*(const WordEntryPos *) b); if (apos == bpos) return 0; @@ -53,9 +55,9 @@ uniquePos(WordEntryPos * a, int l) if (l <= 1) return l; - res = a; qsort((void *) a, l, sizeof(WordEntryPos), comparePos); + res = a; ptr = a + 1; while (ptr - a < l) { @@ -63,7 +65,8 @@ uniquePos(WordEntryPos * a, int l) { res++; *res = *ptr; - if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1) + if (res - a >= MAXNUMPOS - 1 || + WEP_GETPOS(*res) == MAXENTRYPOS - 1) break; } else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) @@ -74,12 +77,13 @@ uniquePos(WordEntryPos * a, int l) return res + 1 - a; } +/* Compare two WordEntryIN values for qsort */ static int compareentry(const void *va, const void *vb, void *arg) { + const WordEntryIN *a = (const WordEntryIN *) va; + const WordEntryIN *b = (const WordEntryIN *) vb; char *BufferStr = (char *) arg; - WordEntryIN *a = (WordEntryIN *) va; - WordEntryIN *b = (WordEntryIN *) vb; if (a->entry.len == b->entry.len) { @@ -91,44 +95,40 @@ compareentry(const void *va, const void *vb, 
void *arg) return (a->entry.len > b->entry.len) ? 1 : -1; } +/* + * Sort an array of WordEntryIN, remove duplicates. + * *outbuflen receives the amount of space needed for strings and positions. + */ static int uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { + int buflen; WordEntryIN *ptr, *res; Assert(l >= 1); - if (l == 1) - { - if (a->entry.haspos) - { - a->poslen = uniquePos(a->pos, a->poslen); - *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos); - } - else - *outbuflen = a->entry.len; + if (l > 1) + qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, + (void *) buf); - return l; - } + buflen = 0; res = a; - ptr = a + 1; - qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf); - while (ptr - a < l) { if (!(ptr->entry.len == res->entry.len && - strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0)) + strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], + res->entry.len) == 0)) { + /* done accumulating data into *res, count space needed */ + buflen += res->entry.len; if (res->entry.haspos) { - *outbuflen += SHORTALIGN(res->entry.len); res->poslen = uniquePos(res->pos, res->poslen); - *outbuflen += res->poslen * sizeof(WordEntryPos); + buflen = SHORTALIGN(buflen); + buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); } - else - *outbuflen += res->entry.len; res++; memcpy(res, ptr, sizeof(WordEntryIN)); } @@ -136,37 +136,37 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) { if (res->entry.haspos) { + /* append ptr's positions to res's positions */ int newlen = ptr->poslen + res->poslen; - /* Append res to pos */ - - res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos)); - memcpy(&res->pos[res->poslen], - ptr->pos, ptr->poslen * sizeof(WordEntryPos)); + res->pos = (WordEntryPos *) + repalloc(res->pos, newlen * sizeof(WordEntryPos)); + memcpy(&res->pos[res->poslen], ptr->pos, + ptr->poslen * sizeof(WordEntryPos)); 
res->poslen = newlen; pfree(ptr->pos); } else { + /* just give ptr's positions to pos */ res->entry.haspos = 1; res->pos = ptr->pos; + res->poslen = ptr->poslen; } } ptr++; } - /* add last item */ - + /* count space needed for last item */ + buflen += res->entry.len; if (res->entry.haspos) { - *outbuflen += SHORTALIGN(res->entry.len); - res->poslen = uniquePos(res->pos, res->poslen); - *outbuflen += res->poslen * sizeof(WordEntryPos); + buflen = SHORTALIGN(buflen); + buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); } - else - *outbuflen += res->entry.len; + *outbuflen = buflen; return res + 1 - a; } @@ -193,6 +193,8 @@ tsvectorin(PG_FUNCTION_ARGS) int toklen; WordEntryPos *pos; int poslen; + char *strbuf; + int stroff; /* * Tokens are appended to tmpbuf, cur is a pointer @@ -212,19 +214,17 @@ tsvectorin(PG_FUNCTION_ARGS) while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) { - if (toklen >= MAXSTRLEN) ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("word is too long (%ld bytes, max %ld bytes)", (long) toklen, - (long) MAXSTRLEN))); - + (long) (MAXSTRLEN-1)))); if (cur - tmpbuf > MAXSTRPOS) ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("position value is too large"))); + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); /* * Enlarge buffers if needed @@ -232,7 +232,8 @@ tsvectorin(PG_FUNCTION_ARGS) if (len >= arrlen) { arrlen *= 2; - arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); + arr = (WordEntryIN *) + repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); } while ((cur - tmpbuf) + toklen >= buflen) { @@ -254,7 +255,11 @@ tsvectorin(PG_FUNCTION_ARGS) arr[len].poslen = poslen; } else + { arr[len].entry.haspos = 0; + arr[len].pos = NULL; + arr[len].poslen = 0; + } len++; } @@ -264,40 +269,45 @@ tsvectorin(PG_FUNCTION_ARGS) len = uniqueentry(arr, len, tmpbuf, &buflen); else buflen = 0; + + if (buflen > 
MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("string is too long for tsvector"))); + totallen = CALCDATASIZE(len, buflen); in = (TSVector) palloc0(totallen); - SET_VARSIZE(in, totallen); in->size = len; - cur = STRPTR(in); inarr = ARRPTR(in); + strbuf = STRPTR(in); + stroff = 0; for (i = 0; i < len; i++) { - memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len); - arr[i].entry.pos = cur - STRPTR(in); - cur += SHORTALIGN(arr[i].entry.len); + memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); + arr[i].entry.pos = stroff; + stroff += arr[i].entry.len; if (arr[i].entry.haspos) { - uint16 tmplen; - - if(arr[i].poslen > 0xFFFF) + if (arr[i].poslen > 0xFFFF) elog(ERROR, "positions array too long"); - tmplen = (uint16) arr[i].poslen; - - /* Copy length to output struct */ - memcpy(cur, &tmplen, sizeof(uint16)); - cur += sizeof(uint16); + /* Copy number of positions */ + stroff = SHORTALIGN(stroff); + *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; + stroff += sizeof(uint16); /* Copy positions */ - memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos)); - cur += arr[i].poslen * sizeof(WordEntryPos); + memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); + stroff += arr[i].poslen * sizeof(WordEntryPos); pfree(arr[i].pos); } inarr[i] = arr[i].entry; } + Assert((strbuf + stroff - (char *) in) == totallen); + PG_RETURN_TSVECTOR(in); } @@ -495,11 +505,12 @@ tsvectorrecv(PG_FUNCTION_ARGS) datalen += lex_len; - if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0) + if (i > 0 && WordEntryCMP(&vec->entries[i], + &vec->entries[i - 1], + STRPTR(vec)) <= 0) elog(ERROR, "lexemes are misordered"); /* Receive positions */ - if (npos > 0) { uint16 j; |