aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/tsvector_op.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/tsvector_op.c')
-rw-r--r--src/backend/utils/adt/tsvector_op.c367
1 files changed, 160 insertions, 207 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 539a9be9789..4e7d50b526a 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.15 2008/04/08 18:20:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.16 2008/05/16 16:31:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -127,11 +127,7 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
{
return (aptr->haspos > bptr->haspos) ? -1 : 1;
}
- else if (aptr->len != bptr->len)
- {
- return (aptr->len > bptr->len) ? -1 : 1;
- }
- else if ((res = strncmp(STRPTR(a) + aptr->pos, STRPTR(b) + bptr->pos, bptr->len)) != 0)
+ else if ( (res=tsCompareString( STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) !=0 )
{
return res;
}
@@ -286,18 +282,10 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(out);
}
-static int
-compareEntry(char *ptra, WordEntry *a, char *ptrb, WordEntry *b)
-{
- if (a->len == b->len)
- {
- return strncmp(
- ptra + a->pos,
- ptrb + b->pos,
- a->len);
- }
- return (a->len > b->len) ? 1 : -1;
-}
+#define compareEntry(pa, a, pb, b) \
+ tsCompareString((pa) + (a)->pos, (a)->len, \
+ (pb) + (b)->pos, (b)->len, \
+ false)
/*
* Add positions from src to dest after offsetting them by maxpos.
@@ -534,18 +522,46 @@ tsvector_concat(PG_FUNCTION_ARGS)
}
/*
- * compare 2 string values
+ * Compare two strings by tsvector rules.
+ * if isPrefix = true then it returns not-zero value if b has prefix a
*/
-static int4
-ValCompare(CHKVAL *chkval, WordEntry *ptr, QueryOperand *item)
+int4
+tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
{
- if (ptr->len == item->length)
- return strncmp(
- &(chkval->values[ptr->pos]),
- &(chkval->operand[item->distance]),
- item->length);
+ int cmp;
+
+ if ( lena == 0 )
+ {
+ if ( prefix )
+ cmp = 0; /* emtry string is equal to any if a prefix match */
+ else
+ cmp = (lenb>0) ? -1 : 0;
+ }
+ else if ( lenb == 0 )
+ {
+ cmp = (lena>0) ? 1 : 0;
+ }
+ else
+ {
+ cmp = memcmp(a, b, Min(lena, lenb));
- return (ptr->len > item->length) ? 1 : -1;
+ if ( prefix )
+ {
+ if ( cmp == 0 && lena > lenb )
+ {
+ /*
+ * b argument is not beginning with argument a
+ */
+ cmp=1;
+ }
+ }
+ else if ( (cmp == 0) && (lena != lenb) )
+ {
+ cmp = (lena < lenb) ? -1 : 1;
+ }
+ }
+
+ return cmp;
}
/*
@@ -582,25 +598,52 @@ checkcondition_str(void *checkval, QueryOperand *val)
CHKVAL *chkval = (CHKVAL *) checkval;
WordEntry *StopLow = chkval->arrb;
WordEntry *StopHigh = chkval->arre;
- WordEntry *StopMiddle;
- int difference;
+ WordEntry *StopMiddle = StopHigh;
+ int difference = -1;
+ bool res=false;
/* Loop invariant: StopLow <= val < StopHigh */
-
while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
- difference = ValCompare(chkval, StopMiddle, val);
+ difference = tsCompareString( chkval->operand + val->distance, val->length,
+ chkval->values + StopMiddle->pos, StopMiddle->len,
+ false);
+
if (difference == 0)
- return (val->weight && StopMiddle->haspos) ?
+ {
+ res = (val->weight && StopMiddle->haspos) ?
checkclass_str(chkval, StopMiddle, val) : true;
- else if (difference < 0)
+ break;
+ }
+ else if (difference > 0)
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
- return (false);
+ if ( res == false && val->prefix == true )
+ {
+ /*
+ * there was a failed exact search, so we should scan further to find
+ * a prefix match.
+ */
+ if ( StopLow >= StopHigh )
+ StopMiddle = StopHigh;
+
+ while( res == false && StopMiddle < chkval->arre &&
+ tsCompareString( chkval->operand + val->distance, val->length,
+ chkval->values + StopMiddle->pos, StopMiddle->len,
+ true) == 0 )
+ {
+ res = (val->weight && StopMiddle->haspos) ?
+ checkclass_str(chkval, StopMiddle, val) : true;
+
+ StopMiddle++;
+ }
+ }
+
+ return res;
}
/*
@@ -758,50 +801,38 @@ check_weight(TSVector txt, WordEntry *wptr, int8 weight)
return num;
}
-static WordEntry **
-SEI_realloc(WordEntry **in, uint32 *len)
-{
- if (*len == 0 || in == NULL)
- {
- *len = 8;
- in = palloc(sizeof(WordEntry *) * (*len));
- }
- else
- {
- *len *= 2;
- in = repalloc(in, sizeof(WordEntry *) * (*len));
- }
- return in;
-}
+#define compareStatWord(a,e,s,t) \
+ tsCompareString(STATSTRPTR(s) + (a)->pos, (a)->len, \
+ STRPTR(t) + (e)->pos, (e)->len, \
+ false)
-static int
-compareStatWord(StatEntry *a, WordEntry *b, tsstat *stat, TSVector txt)
+typedef struct WordEntryMark
{
- if (a->len == b->len)
- return strncmp(
- STATSTRPTR(stat) + a->pos,
- STRPTR(txt) + b->pos,
- a->len
- );
- return (a->len > b->len) ? 1 : -1;
-}
+ WordEntry *newentry;
+ StatEntry *pos;
+} WordEntryMark;
static tsstat *
-formstat(tsstat *stat, TSVector txt, WordEntry **entry, uint32 len)
+formstat(tsstat *stat, TSVector txt, List *entries)
{
- tsstat *newstat;
- uint32 totallen,
- nentry;
- uint32 slen = 0;
- WordEntry **ptr = entry;
- char *curptr;
- StatEntry *sptr,
- *nptr;
-
- while (ptr - entry < len)
+ tsstat *newstat;
+ uint32 totallen,
+ nentry,
+ len = list_length(entries);
+ uint32 slen = 0;
+ WordEntry *ptr;
+ char *curptr;
+ StatEntry *sptr,
+ *nptr;
+ ListCell *entry;
+ StatEntry *PosSE = STATPTR(stat),
+ *prevPosSE;
+ WordEntryMark *mark;
+
+ foreach( entry, entries )
{
- slen += (*ptr)->len;
- ptr++;
+ mark = (WordEntryMark*)lfirst(entry);
+ slen += mark->newentry->len;
}
nentry = stat->size + len;
@@ -815,78 +846,46 @@ formstat(tsstat *stat, TSVector txt, WordEntry **entry, uint32 len)
memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat));
curptr = STATSTRPTR(newstat) + STATSTRSIZE(stat);
- ptr = entry;
sptr = STATPTR(stat);
nptr = STATPTR(newstat);
- if (len == 1)
+ foreach(entry, entries)
{
- StatEntry *StopLow = STATPTR(stat);
- StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat);
+ prevPosSE = PosSE;
- while (StopLow < StopHigh)
+ mark = (WordEntryMark*)lfirst(entry);
+ ptr = mark->newentry;
+ PosSE = mark->pos;
+
+ /*
+ * Copy missed entries
+ */
+ if ( PosSE > prevPosSE )
{
- sptr = StopLow + (StopHigh - StopLow) / 2;
- if (compareStatWord(sptr, *ptr, stat, txt) < 0)
- StopLow = sptr + 1;
- else
- StopHigh = sptr;
+ memcpy( nptr, prevPosSE, sizeof(StatEntry) * (PosSE-prevPosSE) );
+ nptr += PosSE-prevPosSE;
}
- nptr = STATPTR(newstat) + (StopLow - STATPTR(stat));
- memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat)));
- if ((*ptr)->haspos)
- nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
+
+ /*
+ * Copy new entry
+ */
+ if (ptr->haspos)
+ nptr->nentry = (stat->weight) ? check_weight(txt, ptr, stat->weight) : POSDATALEN(txt, ptr);
else
nptr->nentry = 1;
nptr->ndoc = 1;
- nptr->len = (*ptr)->len;
- memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
+ nptr->len = ptr->len;
+ memcpy(curptr, STRPTR(txt) + ptr->pos, nptr->len);
nptr->pos = curptr - STATSTRPTR(newstat);
- memcpy(nptr + 1, StopLow, sizeof(StatEntry) * (((StatEntry *) STATSTRPTR(stat)) - StopLow));
- }
- else
- {
- while (sptr - STATPTR(stat) < stat->size && ptr - entry < len)
- {
- if (compareStatWord(sptr, *ptr, stat, txt) < 0)
- {
- memcpy(nptr, sptr, sizeof(StatEntry));
- sptr++;
- }
- else
- {
- if ((*ptr)->haspos)
- nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
- else
- nptr->nentry = 1;
- nptr->ndoc = 1;
- nptr->len = (*ptr)->len;
- memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
- nptr->pos = curptr - STATSTRPTR(newstat);
- curptr += nptr->len;
- ptr++;
- }
- nptr++;
- }
+ curptr += nptr->len;
+ nptr++;
- memcpy(nptr, sptr, sizeof(StatEntry) * (stat->size - (sptr - STATPTR(stat))));
-
- while (ptr - entry < len)
- {
- if ((*ptr)->haspos)
- nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
- else
- nptr->nentry = 1;
- nptr->ndoc = 1;
- nptr->len = (*ptr)->len;
- memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
- nptr->pos = curptr - STATSTRPTR(newstat);
- curptr += nptr->len;
- ptr++;
- nptr++;
- }
+ pfree(mark);
}
+ if ( PosSE < (StatEntry *) STATSTRPTR(stat) )
+ memcpy(nptr, PosSE, sizeof(StatEntry) * (stat->size - (PosSE - STATPTR(stat))));
+
return newstat;
}
@@ -907,12 +906,11 @@ ts_accum(tsstat *stat, Datum data)
{
tsstat *newstat;
TSVector txt = DatumGetTSVector(data);
- WordEntry **newentry = NULL;
- uint32 len = 0,
- cur = 0;
StatEntry *sptr;
WordEntry *wptr;
int n = 0;
+ List *newentries=NIL;
+ StatEntry *StopLow;
if (stat == NULL)
{ /* Init in first */
@@ -932,16 +930,23 @@ ts_accum(tsstat *stat, Datum data)
sptr = STATPTR(stat);
wptr = ARRPTR(txt);
+ StopLow = STATPTR(stat);
- if (stat->size < 100 * txt->size)
- { /* merge */
- while (sptr - STATPTR(stat) < stat->size && wptr - ARRPTR(txt) < txt->size)
- {
- int cmp = compareStatWord(sptr, wptr, stat, txt);
+ while (wptr - ARRPTR(txt) < txt->size)
+ {
+ StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat);
+ int cmp;
+
+ /*
+ * We do not set StopLow to begin of array because tsvector is ordered
+ * with the sames rule, so we can search from last stopped position
+ */
- if (cmp < 0)
- sptr++;
- else if (cmp == 0)
+ while (StopLow < StopHigh)
+ {
+ sptr = StopLow + (StopHigh - StopLow) / 2;
+ cmp = compareStatWord(sptr, wptr, stat, txt);
+ if (cmp == 0)
{
if (stat->weight == 0)
{
@@ -953,90 +958,38 @@ ts_accum(tsstat *stat, Datum data)
sptr->ndoc++;
sptr->nentry += n;
}
- sptr++;
- wptr++;
+ break;
}
+ else if (cmp < 0)
+ StopLow = sptr + 1;
else
- {
- if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
- {
- if (cur == len)
- newentry = SEI_realloc(newentry, &len);
- newentry[cur] = wptr;
- cur++;
- }
- wptr++;
- }
+ StopHigh = sptr;
}
- while (wptr - ARRPTR(txt) < txt->size)
- {
+ if (StopLow >= StopHigh)
+ { /* not found */
if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
{
- if (cur == len)
- newentry = SEI_realloc(newentry, &len);
- newentry[cur] = wptr;
- cur++;
- }
- wptr++;
- }
- }
- else
- { /* search */
- while (wptr - ARRPTR(txt) < txt->size)
- {
- StatEntry *StopLow = STATPTR(stat);
- StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat);
- int cmp;
+ WordEntryMark *mark = (WordEntryMark*)palloc(sizeof(WordEntryMark));
- while (StopLow < StopHigh)
- {
- sptr = StopLow + (StopHigh - StopLow) / 2;
- cmp = compareStatWord(sptr, wptr, stat, txt);
- if (cmp == 0)
- {
- if (stat->weight == 0)
- {
- sptr->ndoc++;
- sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
- }
- else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0)
- {
- sptr->ndoc++;
- sptr->nentry += n;
- }
- break;
- }
- else if (cmp < 0)
- StopLow = sptr + 1;
- else
- StopHigh = sptr;
- }
+ mark->newentry = wptr;
+ mark->pos = StopLow;
+ newentries = lappend( newentries, mark );
- if (StopLow >= StopHigh)
- { /* not found */
- if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
- {
- if (cur == len)
- newentry = SEI_realloc(newentry, &len);
- newentry[cur] = wptr;
- cur++;
- }
}
- wptr++;
}
+ wptr++;
}
-
- if (cur == 0)
+ if (list_length(newentries) == 0)
{ /* no new words */
if (txt != (TSVector) DatumGetPointer(data))
pfree(txt);
return stat;
}
- newstat = formstat(stat, txt, newentry, cur);
- pfree(newentry);
+ newstat = formstat(stat, txt, newentries);
+ list_free(newentries);
if (txt != (TSVector) DatumGetPointer(data))
pfree(txt);