Extend the Rowset object to contain all the capabilities of Rowhash in

addition to its legacy capabilities. Use Rowset to replace Rowhash. In addition to requiring less code, This removes the 2^32 result row limitation, uses less memory, and gives better bounds on worst-case performance. The Rowhash implementation has yet to be removed. (CVS 6534) FossilOrigin-Name: b101cf70b75c9772aaf50e0eadd0cfa37c84d193
author: drh <drh@noemail.net> 2009-04-22 00:47:00 +0000
committer: drh <drh@noemail.net> 2009-04-22 00:47:00 +0000
commit: 733bf1b1e2a332caaef41056a32c371bee62a266 (patch)
tree: ad11ee64e6fbe377a540b363317bbbb9b2698847 /src
parent: dcc1f44020da1ee775354154fef7afc0b127c9db (diff)
download: sqlite-733bf1b1e2a332caaef41056a32c371bee62a266.tar.gz
sqlite-733bf1b1e2a332caaef41056a32c371bee62a266.zip
4 files changed, 260 insertions, 67 deletions
diff --git a/src/rowhash.c b/src/rowhash.c
index 28c344707..50559e6d8 100644
--- a/src/rowhash.c
+++ b/src/rowhash.c
@@ -31,7 +31,7 @@
 ** The caller is responsible for insuring that there are no duplicate
 ** INSERTs.
 **
-** $Id: rowhash.c,v 1.4 2009/04/21 18:20:45 danielk1977 Exp $
+** $Id: rowhash.c,v 1.5 2009/04/22 00:47:01 drh Exp $
 */
 #include "sqliteInt.h"
 
@@ -135,9 +135,9 @@ struct RowHashBlock {
 ** around and used as opaque handles by code in other modules.
 */
 struct RowHash {
-  int nUsed;              /* Number of used entries in first RowHashBlock */
-  int nEntry;             /* Number of used entries over all RowHashBlocks */
+  unsigned int nEntry;    /* Number of used entries over all RowHashBlocks */
   int iBatch;             /* The current insert batch number */
+  u16 nUsed;              /* Number of used entries in first RowHashBlock */
   u8 nHeight;             /* Height of tree of hash pages */
   u8 nLinearLimit;        /* Linear search limit (used if pHash==0) */
   int nBucket;            /* Number of buckets in hash table */
diff --git a/src/rowset.c b/src/rowset.c
index e8f7a52e4..4e27676cb 100644
--- a/src/rowset.c
+++ b/src/rowset.c
@@ -10,47 +10,89 @@
 **
 *************************************************************************
 **
-** This module implements an object we call a "Row Set".
+** This module implements an object we call a "RowSet".
 **
-** The RowSet object is a bag of rowids.  Rowids
-** are inserted into the bag in an arbitrary order.  Then they are
-** pulled from the bag in sorted order.  Rowids only appear in the
-** bag once.  If the same rowid is inserted multiple times, the
-** second and subsequent inserts make no difference on the output.
+** The RowSet object is a collection of rowids.  Rowids
+** are inserted into the RowSet in an arbitrary order.  Inserts
+** can be intermixed with tests to see if a given rowid has been
+** previously inserted into the RowSet.
 **
-** This implementation accumulates rowids in a linked list.  For
-** output, it first sorts the linked list (removing duplicates during
-** the sort) then returns elements one by one by walking the list.
+** After all inserts are finished, it is possible to extract the
+** elements of the RowSet in sorted order.  Once this extraction
+** process has started, no new elements may be inserted.
 **
-** Big chunks of rowid/next-ptr pairs are allocated at a time, to
-** reduce the malloc overhead.
+** Hence, the primitive operations for a RowSet are:
 **
-** $Id: rowset.c,v 1.4 2009/04/01 19:35:55 drh Exp $
+**    CREATE
+**    INSERT
+**    TEST
+**    SMALLEST
+**    DESTROY
+**
+** The CREATE and DESTROY primitives are the constructor and destructor,
+** obviously.  The INSERT primitive adds a new element to the RowSet.
+** TEST checks to see if an element is already in the RowSet.  SMALLEST
+** extracts the least value from the RowSet.
+**
+** The INSERT primitive might allocate additional memory.  Memory is
+** allocated in chunks so most INSERTs do no allocation.  There is an 
+** upper bound on the size of allocated memory.  No memory is freed
+** until DESTROY.
+**
+** The TEST primitive includes a "batch" number.  The TEST primitive
+** will only see elements that were inserted before the last change
+** in the batch number.  In other words, if an INSERT occurs between
+** two TESTs where the TESTs have the same batch nubmer, then the
+** value added by the INSERT will not be visible to the second TEST.
+** The initial batch number is zero, so if the very first TEST contains
+** a non-zero batch number, it will see all prior INSERTs.
+**
+** No INSERTs may occurs after a SMALLEST.  An assertion will fail if
+** that is attempted.
+**
+** The cost of an INSERT is roughly constant.  (Sometime new memory
+** has to be allocated on an INSERT.)  The cost of a TEST with a new
+** batch number is O(NlogN) where N is the number of elements in the RowSet.
+** The cost of a TEST using the same batch number is O(logN).  The cost
+** of the first SMALLEST is O(NlogN).  Second and subsequent SMALLEST
+** primitives are constant time.  The cost of DESTROY is O(N).
+**
+** There is an added cost of O(N) when switching between TEST and
+** SMALLEST primitives.
+**
+** $Id: rowset.c,v 1.5 2009/04/22 00:47:01 drh Exp $
 */
 #include "sqliteInt.h"
 
+
+/*
+** Target size for allocation chunks.
+*/
+#define ROWSET_ALLOCATION_SIZE 1024
+
 /*
 ** The number of rowset entries per allocation chunk.
 */
-#define ROWSET_ENTRY_PER_CHUNK  63
+#define ROWSET_ENTRY_PER_CHUNK  \
+                       ((ROWSET_ALLOCATION_SIZE-8)/sizeof(struct RowSetEntry))
 
 /*
-** Each entry in a RowSet is an instance of the following
-** structure:
+** Each entry in a RowSet is an instance of the following object.
 */
 struct RowSetEntry {            
   i64 v;                        /* ROWID value for this entry */
-  struct RowSetEntry *pNext;    /* Next entry on a list of all entries */
+  struct RowSetEntry *pRight;   /* Right subtree (larger entries) or list */
+  struct RowSetEntry *pLeft;    /* Left subtree (smaller entries) */
 };
 
 /*
-** Index entries are allocated in large chunks (instances of the
+** RowSetEntry objects are allocated in large chunks (instances of the
 ** following structure) to reduce memory allocation overhead.  The
 ** chunks are kept on a linked list so that they can be deallocated
 ** when the RowSet is destroyed.
 */
 struct RowSetChunk {
-  struct RowSetChunk *pNext;             /* Next chunk on list of them all */
+  struct RowSetChunk *pNextChunk;        /* Next chunk on list of them all */
   struct RowSetEntry aEntry[ROWSET_ENTRY_PER_CHUNK]; /* Allocated entries */
 };
 
@@ -62,11 +104,13 @@ struct RowSetChunk {
 struct RowSet {
   struct RowSetChunk *pChunk;    /* List of all chunk allocations */
   sqlite3 *db;                   /* The database connection */
-  struct RowSetEntry *pEntry;    /* List of entries in the rowset */
+  struct RowSetEntry *pEntry;    /* List of entries using pRight */
   struct RowSetEntry *pLast;     /* Last entry on the pEntry list */
   struct RowSetEntry *pFresh;    /* Source of new entry objects */
+  struct RowSetEntry *pTree;     /* Binary tree of entries */
   u16 nFresh;                    /* Number of objects on pFresh */
-  u8 isSorted;                   /* True if content is sorted */
+  u8 isSorted;                   /* True if pEntry is sorted */
+  u8 iBatch;                     /* Current insert batch */
 };
 
 /*
@@ -89,25 +133,30 @@ RowSet *sqlite3RowSetInit(sqlite3 *db, void *pSpace, unsigned int N){
   p->db = db;
   p->pEntry = 0;
   p->pLast = 0;
+  p->pTree = 0;
   p->pFresh = (struct RowSetEntry*)&p[1];
   p->nFresh = (u16)((N - sizeof(*p))/sizeof(struct RowSetEntry));
   p->isSorted = 1;
+  p->iBatch = 0;
   return p;
 }
 
 /*
-** Deallocate all chunks from a RowSet.
+** Deallocate all chunks from a RowSet.  This frees all memory that
+** the RowSet has allocated over its lifetime.  This routine is
+** the destructor for the RowSet.
 */
 void sqlite3RowSetClear(RowSet *p){
   struct RowSetChunk *pChunk, *pNextChunk;
   for(pChunk=p->pChunk; pChunk; pChunk = pNextChunk){
-    pNextChunk = pChunk->pNext;
+    pNextChunk = pChunk->pNextChunk;
     sqlite3DbFree(p->db, pChunk);
   }
   p->pChunk = 0;
   p->nFresh = 0;
   p->pEntry = 0;
   p->pLast = 0;
+  p->pTree = 0;
   p->isSorted = 1;
 }
 
@@ -118,8 +167,8 @@ void sqlite3RowSetClear(RowSet *p){
 ** memory allocation fails.
 */
 void sqlite3RowSetInsert(RowSet *p, i64 rowid){
-  struct RowSetEntry *pEntry;
-  struct RowSetEntry *pLast;
+  struct RowSetEntry *pEntry;  /* The new entry */
+  struct RowSetEntry *pLast;   /* The last prior entry */
   assert( p!=0 );
   if( p->nFresh==0 ){
     struct RowSetChunk *pNew;
@@ -127,7 +176,7 @@ void sqlite3RowSetInsert(RowSet *p, i64 rowid){
     if( pNew==0 ){
       return;
     }
-    pNew->pNext = p->pChunk;
+    pNew->pNextChunk = p->pChunk;
     p->pChunk = pNew;
     p->pFresh = pNew->aEntry;
     p->nFresh = ROWSET_ENTRY_PER_CHUNK;
@@ -135,26 +184,27 @@ void sqlite3RowSetInsert(RowSet *p, i64 rowid){
   pEntry = p->pFresh++;
   p->nFresh--;
   pEntry->v = rowid;
-  pEntry->pNext = 0;
+  pEntry->pRight = 0;
   pLast = p->pLast;
   if( pLast ){
     if( p->isSorted && rowid<=pLast->v ){
       p->isSorted = 0;
     }
-    pLast->pNext = pEntry;
+    pLast->pRight = pEntry;
   }else{
-    assert( p->pEntry==0 );
+    assert( p->pEntry==0 ); /* Fires if INSERT after SMALLEST */
     p->pEntry = pEntry;
   }
   p->pLast = pEntry;
 }
 
 /*
-** Merge two lists of RowSet entries.  Remove duplicates.
+** Merge two lists of RowSetEntry objects.  Remove duplicates.
 **
-** The input lists are assumed to be in sorted order.
+** The input lists are connected via pRight pointers and are 
+** assumed to each already be in sorted order.
 */
-static struct RowSetEntry *boolidxMerge(
+static struct RowSetEntry *rowSetMerge(
   struct RowSetEntry *pA,    /* First sorted list to be merged */
   struct RowSetEntry *pB     /* Second sorted list to be merged */
 ){
@@ -163,34 +213,34 @@ static struct RowSetEntry *boolidxMerge(
 
   pTail = &head;
   while( pA && pB ){
-    assert( pA->pNext==0 || pA->v<=pA->pNext->v );
-    assert( pB->pNext==0 || pB->v<=pB->pNext->v );
+    assert( pA->pRight==0 || pA->v<=pA->pRight->v );
+    assert( pB->pRight==0 || pB->v<=pB->pRight->v );
     if( pA->v<pB->v ){
-      pTail->pNext = pA;
-      pA = pA->pNext;
-      pTail = pTail->pNext;
+      pTail->pRight = pA;
+      pA = pA->pRight;
+      pTail = pTail->pRight;
     }else if( pB->v<pA->v ){
-      pTail->pNext = pB;
-      pB = pB->pNext;
-      pTail = pTail->pNext;
+      pTail->pRight = pB;
+      pB = pB->pRight;
+      pTail = pTail->pRight;
     }else{
-      pA = pA->pNext;
+      pA = pA->pRight;
     }
   }
   if( pA ){
-    assert( pA->pNext==0 || pA->v<=pA->pNext->v );
-    pTail->pNext = pA;
+    assert( pA->pRight==0 || pA->v<=pA->pRight->v );
+    pTail->pRight = pA;
   }else{
-    assert( pB==0 || pB->pNext==0 || pB->v<=pB->pNext->v );
-    pTail->pNext = pB;
+    assert( pB==0 || pB->pRight==0 || pB->v<=pB->pRight->v );
+    pTail->pRight = pB;
   }
-  return head.pNext;
+  return head.pRight;
 }
 
 /*
-** Sort all elements of the RowSet into ascending order.
+** Sort all elements on the pEntry list of the RowSet into ascending order.
 */ 
-static void sqlite3RowSetSort(RowSet *p){
+static void rowSetSort(RowSet *p){
   unsigned int i;
   struct RowSetEntry *pEntry;
   struct RowSetEntry *aBucket[40];
@@ -199,35 +249,147 @@ static void sqlite3RowSetSort(RowSet *p){
   memset(aBucket, 0, sizeof(aBucket));
   while( p->pEntry ){
     pEntry = p->pEntry;
-    p->pEntry = pEntry->pNext;
-    pEntry->pNext = 0;
+    p->pEntry = pEntry->pRight;
+    pEntry->pRight = 0;
     for(i=0; aBucket[i]; i++){
-      pEntry = boolidxMerge(aBucket[i],pEntry);
+      pEntry = rowSetMerge(aBucket[i], pEntry);
       aBucket[i] = 0;
     }
     aBucket[i] = pEntry;
   }
   pEntry = 0;
   for(i=0; i<sizeof(aBucket)/sizeof(aBucket[0]); i++){
-    pEntry = boolidxMerge(pEntry,aBucket[i]);
+    pEntry = rowSetMerge(pEntry, aBucket[i]);
   }
   p->pEntry = pEntry;
   p->pLast = 0;
   p->isSorted = 1;
 }
 
+
+/*
+** The input, pIn, is a binary tree (or subtree) of RowSetEntry objects.
+** Convert this tree into a linked list connected by the pRight pointers
+** and return pointers to the first and last elements of the new list.
+*/
+static void rowSetTreeToList(
+  struct RowSetEntry *pIn,         /* Root of the input tree */
+  struct RowSetEntry **ppFirst,    /* Write head of the output list here */
+  struct RowSetEntry **ppLast      /* Write tail of the output list here */
+){
+  if( pIn==0 ){
+    *ppFirst = *ppLast = 0;
+  }
+  if( pIn->pLeft ){
+    struct RowSetEntry *p;
+    rowSetTreeToList(pIn->pLeft, ppFirst, &p);
+    p->pRight = pIn;
+  }else{
+    *ppFirst = pIn;
+  }
+  if( pIn->pRight ){
+    rowSetTreeToList(pIn->pRight, &pIn->pRight, ppLast);
+  }else{
+    *ppLast = pIn;
+  }
+  assert( (*ppLast)->pRight==0 );
+}
+
+
+/*
+** Convert a sorted list of elements (connected by pRight) into a binary
+** tree with depth of iDepth.  A depth of 1 means the tree contains a single
+** node taken from the head of *ppList.  A depth of 2 means a tree with
+** three nodes.  And so forth.
+**
+** Use as many entries from the input list as required and update the
+** *ppList to point to the unused elements of the list.  If the input
+** list contains too few elements, then construct an incomplete tree
+** and leave *ppList set to NULL.
+**
+** Return a pointer to the root of the constructed binary tree.
+*/
+static struct RowSetEntry *rowSetNDeepTree(
+  struct RowSetEntry **ppList,
+  int iDepth
+){
+  struct RowSetEntry *p;         /* Root of the new tree */
+  struct RowSetEntry *pLeft;     /* Left subtree */
+  if( *ppList==0 ){
+    return 0;
+  }
+  if( iDepth==1 ){
+    p = *ppList;
+    *ppList = p->pRight;
+    p->pLeft = p->pRight = 0;
+    return p;
+  }
+  pLeft = rowSetNDeepTree(ppList, iDepth-1);
+  p = *ppList;
+  if( p==0 ){
+    return pLeft;
+  }
+  p->pLeft = pLeft;
+  *ppList = p->pRight;
+  p->pRight = rowSetNDeepTree(ppList, iDepth-1);
+  return p;
+}
+
+/*
+** Convert a sorted list of elements into a binary tree. Make the tree
+** as deep as it needs to be in order to contain the entire list.
+*/
+static struct RowSetEntry *rowSetListToTree(struct RowSetEntry *pList){
+  int iDepth;           /* Depth of the tree so far */
+  struct RowSetEntry *p;       /* Current tree root */
+  struct RowSetEntry *pLeft;   /* Left subtree */
+
+  if( pList==0 ){
+    return 0;
+  }
+  p = pList;
+  pList = p->pRight;
+  p->pLeft = p->pRight = 0;
+  for(iDepth=1; pList; iDepth++){
+    pLeft = p;
+    p = pList;
+    pList = p->pRight;
+    p->pLeft = pLeft;
+    p->pRight = rowSetNDeepTree(&pList, iDepth);
+  }
+  return p;
+}
+
+/*
+** Convert the list in p->pEntry into a sorted list if it is not
+** sorted already.  If there is a binary tree on p->pTree, then
+** convert it into a list too and merge it into the p->pEntry list.
+*/
+static void rowSetToList(RowSet *p){
+  if( !p->isSorted ){
+    rowSetSort(p);
+  }
+  if( p->pTree ){
+    struct RowSetEntry *pHead, *pTail;
+    rowSetTreeToList(p->pTree, &pHead, &pTail);
+    p->pTree = 0;
+    p->pEntry = rowSetMerge(p->pEntry, pHead);
+  }
+}
+
 /*
-** Extract the next (smallest) element from the RowSet.
+** Extract the smallest element from the RowSet.
 ** Write the element into *pRowid.  Return 1 on success.  Return
 ** 0 if the RowSet is already empty.
+**
+** After this routine has been called, the sqlite3RowSetInsert()
+** routine may not be called again.  
 */
 int sqlite3RowSetNext(RowSet *p, i64 *pRowid){
-  if( !p->isSorted ){
-    sqlite3RowSetSort(p);
-  }
+  rowSetToList(p);
   if( p->pEntry ){
     *pRowid = p->pEntry->v;
-    p->pEntry = p->pEntry->pNext;
+    p->pEntry = p->pEntry->pRight;
     if( p->pEntry==0 ){
       sqlite3RowSetClear(p);
     }
@@ -236,3 +398,31 @@ int sqlite3RowSetNext(RowSet *p, i64 *pRowid){
     return 0;
   }
 }
+
+/*
+** Check to see if element iRowid was inserted into the the rowset as
+** part of any insert batch prior to iBatch.  Return 1 or 0.
+*/
+int sqlite3RowSetTest(RowSet *pRowSet, u8 iBatch, sqlite3_int64 iRowid){
+  struct RowSetEntry *p;
+  if( iBatch!=pRowSet->iBatch ){
+    if( pRowSet->pEntry ){
+      rowSetToList(pRowSet);
+      pRowSet->pTree = rowSetListToTree(pRowSet->pEntry);
+      pRowSet->pEntry = 0;
+      pRowSet->pLast = 0;
+    }
+    pRowSet->iBatch = iBatch;
+  }
+  p = pRowSet->pTree;
+  while( p ){
+    if( p->v<iRowid ){
+      p = p->pRight;
+    }else if( p->v>iRowid ){
+      p = p->pLeft;
+    }else{
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/src/sqliteInt.h b/src/sqliteInt.h
index fe2b89333..2e51888e0 100644
--- a/src/sqliteInt.h
+++ b/src/sqliteInt.h
@@ -11,7 +11,7 @@
 *************************************************************************
 ** Internal interface definitions for SQLite.
 **
-** @(#) $Id: sqliteInt.h,v 1.856 2009/04/21 16:15:15 drh Exp $
+** @(#) $Id: sqliteInt.h,v 1.857 2009/04/22 00:47:01 drh Exp $
 */
 #ifndef _SQLITEINT_H_
 #define _SQLITEINT_H_
@@ -2400,6 +2400,7 @@ int sqlite3BitvecBuiltinTest(int,int*);
 RowSet *sqlite3RowSetInit(sqlite3*, void*, unsigned int);
 void sqlite3RowSetClear(RowSet*);
 void sqlite3RowSetInsert(RowSet*, i64);
+int sqlite3RowSetTest(RowSet*, u8 iBatch, i64);
 int sqlite3RowSetNext(RowSet*, i64*);
 
 int sqlite3RowhashInsert(sqlite3*, RowHash **pp, i64 iVal);
diff --git a/src/vdbe.c b/src/vdbe.c
index 6f22d9148..8cc33d0be 100644
--- a/src/vdbe.c
+++ b/src/vdbe.c
@@ -43,7 +43,7 @@
 ** in this file for details.  If in doubt, do not deviate from existing
 ** commenting and indentation practices when changing or adding code.
 **
-** $Id: vdbe.c,v 1.835 2009/04/21 16:15:15 drh Exp $
+** $Id: vdbe.c,v 1.836 2009/04/22 00:47:01 drh Exp $
 */
 #include "sqliteInt.h"
 #include "vdbeInt.h"
@@ -428,6 +428,8 @@ static void memTracePrint(FILE *out, Mem *p){
     fprintf(out, " i:%lld", p->u.i);
   }else if( p->flags & MEM_Real ){
     fprintf(out, " r:%g", p->r);
+  }else if( p->flags & MEM_RowSet ){
+    fprintf(out, " (rowset)");
   }else{
     char zBuf[200];
     sqlite3VdbeMemPrettyPrint(p, zBuf);
@@ -4631,23 +4633,23 @@ case OP_RowHash: {                     /* jump, in1, in3 */
   ** delete it now and initialize P1 with an empty row-hash (a null pointer
   ** is an acceptable representation of an empty row-hash).
   */
-  if( (pIn1->flags & MEM_RowHash)==0 ){
-    sqlite3VdbeMemReleaseExternal(pIn1);
-    pIn1->u.pRowHash = 0;
-    pIn1->flags = MEM_RowHash;
+  if( (pIn1->flags & MEM_RowSet)==0 ){
+    sqlite3VdbeMemSetRowSet(pIn1);
+    if( (pIn1->flags & MEM_RowSet)==0 ) goto no_mem;
   }
 
   assert( pOp->p4type==P4_INT32 );
   if( iSet ){
     int exists;
-    rc = sqlite3RowhashTest(pIn1->u.pRowHash, pOp->p4.i, pIn3->u.i, &exists);
+    exists = sqlite3RowSetTest(pIn1->u.pRowSet, iSet>=0 ? iSet & 0xf : 0xff,
+                               pIn3->u.i);
     if( exists ){
       pc = pOp->p2 - 1;
       break;
     }
   }
   if( iSet>=0 ){
-    rc = sqlite3RowhashInsert(db, &pIn1->u.pRowHash, pIn3->u.i);
+    sqlite3RowSetInsert(pIn1->u.pRowSet, pIn3->u.i);
   }
   break;
 }
author	drh <drh@noemail.net>	2009-04-22 00:47:00 +0000
committer	drh <drh@noemail.net>	2009-04-22 00:47:00 +0000
commit	733bf1b1e2a332caaef41056a32c371bee62a266 (patch)
tree	ad11ee64e6fbe377a540b363317bbbb9b2698847 /src
parent	dcc1f44020da1ee775354154fef7afc0b127c9db (diff)
download	sqlite-733bf1b1e2a332caaef41056a32c371bee62a266.tar.gz sqlite-733bf1b1e2a332caaef41056a32c371bee62a266.zip