aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/fts2/fts2.c188
-rw-r--r--manifest12
-rw-r--r--manifest.uuid2
3 files changed, 121 insertions, 81 deletions
diff --git a/ext/fts2/fts2.c b/ext/fts2/fts2.c
index aedae5186..3f49a2958 100644
--- a/ext/fts2/fts2.c
+++ b/ext/fts2/fts2.c
@@ -690,6 +690,7 @@ static void docListValidate(DocListType iType, const char *pData, int nData,
** dlwDestroy - clear the writer's memory. Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwAdd - construct doclist element and append to buffer.
+** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
DocListType iType;
@@ -751,24 +752,14 @@ static void dlwAppend(DLWriter *pWriter,
}
pWriter->iPrevDocid = iLastDocid;
}
-static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid,
- const char *pPosList, int nPosList){
+static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
char c[VARINT_MAX];
int n = putVarint(c, iDocid-pWriter->iPrevDocid);
assert( pWriter->iPrevDocid<iDocid );
- assert( pPosList==0 || pWriter->iType>DL_DOCIDS );
+ assert( pWriter->iType==DL_DOCIDS );
dataBufferAppend(pWriter->b, c, n);
-
- if( pWriter->iType>DL_DOCIDS ){
- n = putVarint(c, 0);
- if( nPosList>0 ){
- dataBufferAppend2(pWriter->b, pPosList, nPosList, c, n);
- }else{
- dataBufferAppend(pWriter->b, c, n);
- }
- }
pWriter->iPrevDocid = iDocid;
}
@@ -854,11 +845,10 @@ static void plrStep(PLReader *pReader){
pReader->nData -= n;
}
-static void plrInit(PLReader *pReader, DocListType iType,
- const char *pData, int nData){
- pReader->pData = pData;
- pReader->nData = nData;
- pReader->iType = iType;
+static void plrInit(PLReader *pReader, DLReader *pDLReader){
+ pReader->pData = dlrPosData(pDLReader);
+ pReader->nData = dlrPosDataLen(pDLReader);
+ pReader->iType = pDLReader->iType;
pReader->iColumn = 0;
pReader->iPosition = 0;
pReader->iStartOffset = 0;
@@ -872,34 +862,38 @@ static void plrDestroy(PLReader *pReader){
/*******************************************************************/
/* PLWriter is used in constructing a document's position list. As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
+** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
-** plwReset - reset the writer for a new document.
** plwDestroy - clear a writer.
-** plwNew - malloc storage and initialize it.
-** plwDelete - clear and free storage.
-** plwDlwAdd - append the docid and poslist to a doclist writer.
** plwAdd - append position and offset information.
+** plwTerminate - add any necessary doclist terminator.
+**
+** Calling plwAdd() after plwTerminate() may result in a corrupt
+** doclist.
*/
-/* TODO(shess) PLWriter is used in two ways. fulltextUpdate() uses it
-** in construction of a new doclist. docListTrim() and mergePosList()
-** use it when trimming. In the former case, it wants to own the
-** DataBuffer, in the latter it's possible it could encode into a
-** pre-existing DataBuffer.
+/* TODO(shess) Until we've written the second item, we can cache the
+** first item's information. Then we'd have three states:
+**
+** - initialized with docid, no positions.
+** - docid and one position.
+** - docid and multiple positions.
+**
+** Only the last state needs to actually write to dlw->b, which would
+** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
- DataBuffer b;
+ DLWriter *dlw;
- sqlite_int64 iDocid;
- DocListType iType;
int iColumn; /* the last column written */
int iPos; /* the last position written */
int iOffset; /* the last start offset written */
} PLWriter;
-static void plwDlwAdd(PLWriter *pWriter, DLWriter *dlWriter){
- dlwAdd(dlWriter, pWriter->iDocid, pWriter->b.pData, pWriter->b.nData);
-}
+/* TODO(shess) In the case where the parent is reading these values
+** from a PLReader, we could optimize to a copy if that PLReader has
+** the same type as pWriter.
+*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
int iStartOffset, int iEndOffset){
/* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
@@ -908,7 +902,10 @@ static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
char c[5*VARINT_MAX];
int n = 0;
- if( pWriter->iType==DL_DOCIDS ) return;
+ /* Ban plwAdd() after plwTerminate(). */
+ assert( pWriter->iPos!=-1 );
+
+ if( pWriter->dlw->iType==DL_DOCIDS ) return;
if( iColumn!=pWriter->iColumn ){
n += putVarint(c+n, POS_COLUMN);
@@ -920,30 +917,50 @@ static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
assert( iPos>=pWriter->iPos );
n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
pWriter->iPos = iPos;
- if( pWriter->iType==DL_POSITIONS_OFFSETS ){
+ if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
assert( iStartOffset>=pWriter->iOffset );
n += putVarint(c+n, iStartOffset-pWriter->iOffset);
pWriter->iOffset = iStartOffset;
assert( iEndOffset>=iStartOffset );
n += putVarint(c+n, iEndOffset-iStartOffset);
}
- dataBufferAppend(&pWriter->b, c, n);
+ dataBufferAppend(pWriter->dlw->b, c, n);
}
-static void plwReset(PLWriter *pWriter,
- sqlite_int64 iDocid, DocListType iType){
- dataBufferReset(&pWriter->b);
- pWriter->iDocid = iDocid;
- pWriter->iType = iType;
+static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
+ char c[VARINT_MAX];
+ int n;
+
+ pWriter->dlw = dlw;
+
+ assert( iDocid>pWriter->dlw->iPrevDocid );
+ n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
+ dataBufferAppend(pWriter->dlw->b, c, n);
+ pWriter->dlw->iPrevDocid = iDocid;
+
pWriter->iColumn = 0;
pWriter->iPos = 0;
pWriter->iOffset = 0;
}
-static void plwInit(PLWriter *pWriter, sqlite_int64 iDocid, DocListType iType){
- dataBufferInit(&pWriter->b, 0);
- plwReset(pWriter, iDocid, iType);
+/* TODO(shess) Should plwDestroy() also terminate the doclist? But
+** then plwDestroy() would no longer be just a destructor, it would
+** also be doing work, which isn't consistent with the overall idiom.
+** Another option would be for plwAdd() to always append any necessary
+** terminator, so that the output is always correct. But that would
+** add incremental work to the common case with the only benefit being
+** API elegance. Punt for now.
+*/
+static void plwTerminate(PLWriter *pWriter){
+ if( pWriter->dlw->iType>DL_DOCIDS ){
+ char c[VARINT_MAX];
+ int n = putVarint(c, POS_END);
+ dataBufferAppend(pWriter->dlw->b, c, n);
+ }
+#ifndef NDEBUG
+ /* Mark as terminated for assert in plwAdd(). */
+ pWriter->iPos = -1;
+#endif
}
static void plwDestroy(PLWriter *pWriter){
- dataBufferDestroy(&pWriter->b);
SCRAMBLE(pWriter);
}
@@ -957,14 +974,27 @@ static void plwDestroy(PLWriter *pWriter){
** dlcAddDoclist - add the collected doclist to the given buffer.
*/
typedef struct DLCollector {
+ DataBuffer b;
+ DLWriter dlw;
PLWriter plw;
} DLCollector;
+/* TODO(shess) This could also be done by calling plwTerminate() and
+** dataBufferAppend(). I tried that, expecting nominal performance
+** differences, but it seemed to pretty reliably be worth 1% to code
+** it this way. I suspect it's the incremental malloc overhead (some
+** percentage of the plwTerminate() calls will cause a realloc), so
+** this might be worth revisiting if the DataBuffer implementation
+** changes.
+*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
- DLWriter dlw;
- dlwInit(&dlw, pCollector->plw.iType, b);
- plwDlwAdd(&pCollector->plw, &dlw);
- dlwDestroy(&dlw);
+ if( pCollector->dlw.iType>DL_DOCIDS ){
+ char c[VARINT_MAX];
+ int n = putVarint(c, POS_END);
+ dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
+ }else{
+ dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
+ }
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
int iStartOffset, int iEndOffset){
@@ -973,11 +1003,15 @@ static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
DLCollector *pCollector = malloc(sizeof(DLCollector));
- plwInit(&pCollector->plw, iDocid, iType);
+ dataBufferInit(&pCollector->b, 0);
+ dlwInit(&pCollector->dlw, iType, &pCollector->b);
+ plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
return pCollector;
}
static void dlcDelete(DLCollector *pCollector){
plwDestroy(&pCollector->plw);
+ dlwDestroy(&pCollector->dlw);
+ dataBufferDestroy(&pCollector->b);
SCRAMBLE(pCollector);
free(pCollector);
}
@@ -985,43 +1019,50 @@ static void dlcDelete(DLCollector *pCollector){
/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go. Only columns matching iColumn are
-** copied, all columns copied if iColimn is -1. Elements with no
+** copied, all columns copied if iColumn is -1. Elements with no
** matching columns are dropped. The output is an iOutType doclist.
*/
+/* NOTE(shess) This code is only valid after all doclists are merged.
+** If this is run before merges, then doclist items which represent
+** deletion will be trimmed, and will thus not effect a deletion
+** during the merge.
+*/
static void docListTrim(DocListType iType, const char *pData, int nData,
int iColumn, DocListType iOutType, DataBuffer *out){
DLReader dlReader;
DLWriter dlWriter;
- PLWriter plWriter;
assert( iOutType<=iType );
dlrInit(&dlReader, iType, pData, nData);
dlwInit(&dlWriter, iOutType, out);
- plwInit(&plWriter, 0, iOutType);
while( !dlrAtEnd(&dlReader) ){
PLReader plReader;
+ PLWriter plWriter;
int match = 0;
- plrInit(&plReader, dlReader.iType,
- dlrPosData(&dlReader), dlrPosDataLen(&dlReader));
- plwReset(&plWriter, dlrDocid(&dlReader), iOutType);
+ plrInit(&plReader, &dlReader);
while( !plrAtEnd(&plReader) ){
if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
- match = 1;
+ if( !match ){
+ plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
+ match = 1;
+ }
plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
plrStartOffset(&plReader), plrEndOffset(&plReader));
}
plrStep(&plReader);
}
- if( match ) plwDlwAdd(&plWriter, &dlWriter);
+ if( match ){
+ plwTerminate(&plWriter);
+ plwDestroy(&plWriter);
+ }
plrDestroy(&plReader);
dlrStep(&dlReader);
}
- plwDestroy(&plWriter);
dlwDestroy(&dlWriter);
dlrDestroy(&dlReader);
}
@@ -1172,9 +1213,8 @@ static void mergePosList(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
assert( dlrDocid(pLeft)==dlrDocid(pRight) );
assert( pOut->iType!=DL_POSITIONS_OFFSETS );
- plrInit(&left, pLeft->iType, dlrPosData(pLeft), dlrPosDataLen(pLeft));
- plrInit(&right, pRight->iType, dlrPosData(pRight), dlrPosDataLen(pRight));
- plwInit(&writer, dlrDocid(pLeft), pOut->iType);
+ plrInit(&left, pLeft);
+ plrInit(&right, pRight);
while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
if( plrColumn(&left)<plrColumn(&right) ){
@@ -1186,23 +1226,23 @@ static void mergePosList(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
}else if( plrPosition(&left)+1>plrPosition(&right) ){
plrStep(&right);
}else{
- match = 1;
+ if( !match ){
+ plwInit(&writer, pOut, dlrDocid(pLeft));
+ match = 1;
+ }
plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
plrStep(&left);
plrStep(&right);
}
}
- /* TODO(shess) We could remember the output position, encode the
- ** docid, then encode the poslist directly into the output. If no
- ** match, we back out to the stored output position. This would
- ** also reduce the malloc count.
- */
- if( match ) plwDlwAdd(&writer, pOut);
+ if( match ){
+ plwTerminate(&writer);
+ plwDestroy(&writer);
+ }
plrDestroy(&left);
plrDestroy(&right);
- plwDestroy(&writer);
}
/* We have two doclists with positions: pLeft and pRight.
@@ -1272,7 +1312,7 @@ static void docListAndMerge(
}else if( dlrDocid(&right)<dlrDocid(&left) ){
dlrStep(&right);
}else{
- dlwAdd(&writer, dlrDocid(&left), 0, 0);
+ dlwAdd(&writer, dlrDocid(&left));
dlrStep(&left);
dlrStep(&right);
}
@@ -1310,13 +1350,13 @@ static void docListOrMerge(
while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
- dlwAdd(&writer, dlrDocid(&left), 0, 0);
+ dlwAdd(&writer, dlrDocid(&left));
dlrStep(&left);
}else if( dlrAtEnd(&left) || dlrDocid(&right)<dlrDocid(&left) ){
- dlwAdd(&writer, dlrDocid(&right), 0, 0);
+ dlwAdd(&writer, dlrDocid(&right));
dlrStep(&right);
}else{
- dlwAdd(&writer, dlrDocid(&left), 0, 0);
+ dlwAdd(&writer, dlrDocid(&left));
dlrStep(&left);
dlrStep(&right);
}
@@ -1354,7 +1394,7 @@ static void docListExceptMerge(
dlrStep(&right);
}
if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
- dlwAdd(&writer, dlrDocid(&left), 0, 0);
+ dlwAdd(&writer, dlrDocid(&left));
}
dlrStep(&left);
}
diff --git a/manifest b/manifest
index 16907b08a..307f08b32 100644
--- a/manifest
+++ b/manifest
@@ -1,5 +1,5 @@
-C Refactor\sPLWriter\sin\spreparation\sfor\sbuffered-document\schange.\nCurrently,\sPLWriter\s(Position\sList\sWriter)\screates\sa\slocally-owned\nDataBuffer\sto\swrite\sinto.\s\sThis\sis\snecessary\sto\ssupport\sdoclist\ncollection\sduring\stokenization,\swhere\sthere\sis\sno\sobvious\sbuffer\sto\nwrite\soutput\sto,\sbut\sis\snot\snecessary\sfor\sthe\sother\susers\sof\sPLWriter.\n\sThis\schange\sadds\sa\sDLCollector\s(Doc\sList\sCollector)\sstructure\sto\nhandle\sthe\stokenization\scase.\n\nAlso\sfix\sa\spotential\smemory\sleak\sin\swriteZeroSegment().\s\sIn\scase\sof\nerror\sfrom\sleafWriterStep(),\sthe\sDataBuffer\sdl\swas\sbeing\sleaked.\s(CVS\s3706)
-D 2007-03-20T23:52:38
+C Refactor\sPLWriter\sto\sremove\sowned\sbuffer.\s\sDLCollector\s(Document\sList\nCollector)\snow\shandles\sthe\scase\swhere\sPLWriter\s(Position\sList\sWriter)\nneeded\sa\slocal\sbuffer.\s\sChange\sto\susing\sthe\sassociated\sDLWriter\n(Document\sList\sWriter)\sbuffer,\swhich\sreduces\sthe\snumber\sof\smemory\ncopies\sneeded\sin\sdoclist\sprocessing,\sand\sbrings\sPLWriter\soperation\sin\nline\swith\sDLWriter\soperation.\s(CVS\s3707)
+D 2007-03-22T00:14:29
F Makefile.in 1fe3d0b46e40fd684e1e61f8e8056cefed16de9f
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@@ -34,7 +34,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
-F ext/fts2/fts2.c aba63e7f4892a2e7cf50054181cda3d246c3ba0a
+F ext/fts2/fts2.c de8321a2ad1edea1f0dd223cb86cf008451784a4
F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1
F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b
F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
@@ -437,7 +437,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
-P 7dc7658887046f066b564a5994578074a99756ba
-R 28415623e14534daa33e7418f28a0adb
+P 1b9918e20767aebc9c1e7523027139e5fbc12688
+R 86ecbb6dcb3fabbb334fec798aed3031
U shess
-Z d8903aa3843e1c017cd54e70c455deff
+Z f6bd67aa8facf9e71ae06b9f1a1aa4bb
diff --git a/manifest.uuid b/manifest.uuid
index 8516d5c30..ee3dcfcf1 100644
--- a/manifest.uuid
+++ b/manifest.uuid
@@ -1 +1 @@
-1b9918e20767aebc9c1e7523027139e5fbc12688 \ No newline at end of file
+d04fa3a13a84f49074c673b8ee2fb6541da061b5 \ No newline at end of file