Diffstat (limited to 'src/backend/commands')
-rw-r--r--  src/backend/commands/indexcmds.c  | 133
-rw-r--r--  src/backend/commands/sequence.c   |   4
-rw-r--r--  src/backend/commands/vacuum.c     | 245
-rw-r--r--  src/backend/commands/vacuumlazy.c | 109
4 files changed, 413 insertions(+), 78 deletions(-)
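The indexcmds.c changes below restructure DefineIndex() so that a concurrent index build runs in three separately committed phases: insert the pg_index row with indisready = indisvalid = false (skip_build = true), wait out conflicting lock holders and build the index before setting indisready, then wait again and validate_index() before setting indisvalid. As a rough orientation only, here is a minimal standalone C sketch of that control flow. It is not PostgreSQL code: commit_and_start_new_transaction() and wait_for_lock_conflicts() are invented stand-ins that merely print what the real code would do.

#include <stdio.h>
#include <stdbool.h>

/* Stand-ins: the real code commits the transaction and waits on the
 * virtual xids returned by GetLockConflicts(); here we only print. */
static void commit_and_start_new_transaction(const char *why)
{
    printf("COMMIT; BEGIN;  -- %s\n", why);
}

static void wait_for_lock_conflicts(const char *phase)
{
    printf("wait for transactions holding conflicting locks (%s)\n", phase);
}

int main(void)
{
    bool indisready = false;
    bool indisvalid = false;

    /* Phase 1: insert the pg_index row only (skip_build = true), so that
     * new transactions see the index and avoid incompatible HOT updates. */
    printf("create index: indisready=%d indisvalid=%d\n", indisready, indisvalid);
    commit_and_start_new_transaction("make the empty index visible");

    /* Phase 2: wait, build from a fresh snapshot, then allow inserts. */
    wait_for_lock_conflicts("no transaction may still use the old index list");
    printf("index_build() under a new snapshot\n");
    indisready = true;
    printf("update pg_index: indisready=%d\n", indisready);
    commit_and_start_new_transaction("make indisready visible");

    /* Phase 3: wait again, validate against a reference snapshot, then
     * mark the index usable by the planner. */
    wait_for_lock_conflicts("no transaction may treat the index as read-only");
    printf("validate_index() inserts any tuples the build missed\n");
    indisvalid = true;
    printf("update pg_index: indisvalid=%d\n", indisvalid);
    return 0;
}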
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index ebac5957bd2..943978e589a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.165 2007/09/10 21:59:37 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.166 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -119,6 +119,7 @@ DefineIndex(RangeVar *heapRelation, Oid namespaceId; Oid tablespaceId; Relation rel; + Relation indexRelation; HeapTuple tuple; Form_pg_am accessMethodForm; bool amcanorder; @@ -420,7 +421,10 @@ DefineIndex(RangeVar *heapRelation, indexInfo->ii_Predicate = make_ands_implicit(predicate); indexInfo->ii_PredicateState = NIL; indexInfo->ii_Unique = unique; + /* In a concurrent build, mark it not-ready-for-inserts */ + indexInfo->ii_ReadyForInserts = !concurrent; indexInfo->ii_Concurrent = concurrent; + indexInfo->ii_BrokenHotChain = false; classObjectId = (Oid *) palloc(numberOfAttributes * sizeof(Oid)); coloptions = (int16 *) palloc(numberOfAttributes * sizeof(int16)); @@ -439,23 +443,38 @@ DefineIndex(RangeVar *heapRelation, primary ? "PRIMARY KEY" : "UNIQUE", indexRelationName, RelationGetRelationName(rel)))); - /* save lockrelid for below, then close rel */ + /* save lockrelid and locktag for below, then close rel */ heaprelid = rel->rd_lockInfo.lockRelId; + SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); heap_close(rel, NoLock); + if (!concurrent) + { + indexRelationId = + index_create(relationId, indexRelationName, indexRelationId, + indexInfo, accessMethodId, tablespaceId, classObjectId, + coloptions, reloptions, primary, isconstraint, + allowSystemTableMods, skip_build, concurrent); + + return; /* We're done, in the standard case */ + } + + /* + * For a concurrent build, we next insert the catalog entry and add + * constraints. We don't build the index just yet; we must first make + * the catalog entry so that the new index is visible to updating + * transactions. That will prevent them from making incompatible HOT + * updates. The new index will be marked not indisready and not + * indisvalid, so that no one else tries to either insert into it or use + * it for queries. We pass skip_build = true to prevent the build. + */ indexRelationId = index_create(relationId, indexRelationName, indexRelationId, indexInfo, accessMethodId, tablespaceId, classObjectId, coloptions, reloptions, primary, isconstraint, - allowSystemTableMods, skip_build, concurrent); - - if (!concurrent) - return; /* We're done, in the standard case */ + allowSystemTableMods, true, concurrent); /* - * Phase 2 of concurrent index build (see comments for validate_index() - * for an overview of how this works) - * * We must commit our current transaction so that the index becomes * visible; then start another. Note that all the data structures we just * built are lost in the commit. The only data we keep past here are the @@ -476,6 +495,9 @@ DefineIndex(RangeVar *heapRelation, StartTransactionCommand(); /* + * Phase 2 of concurrent index build (see comments for validate_index() + * for an overview of how this works) + * * Now we must wait until no running transaction could have the table open * with the old list of indexes. 
To do this, inquire which xacts * currently would conflict with ShareLock on the table -- ie, which ones @@ -494,7 +516,91 @@ DefineIndex(RangeVar *heapRelation, * check for that. Also, prepared xacts are not reported, which is * fine since they certainly aren't going to do anything more. */ - SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); + old_lockholders = GetLockConflicts(&heaplocktag, ShareLock); + + while (VirtualTransactionIdIsValid(*old_lockholders)) + { + VirtualXactLockTableWait(*old_lockholders); + old_lockholders++; + } + + /* + * At this moment we are sure that there are no transactions with the + * table open for write that don't have this new index in their list of + * indexes. We have waited out all the existing transactions and any new + * transaction will have the new index in its list, but the index is still + * marked as "not-ready-for-inserts". The index is consulted while + * deciding HOT-safety though. This arrangement ensures that no new HOT + * chains can be created where the new tuple and the old tuple in the + * chain have different index keys. + * + * We now take a new snapshot, and build the index using all tuples that + * are visible in this snapshot. We can be sure that any HOT updates + * to these tuples will be compatible with the index, since any updates + * made by transactions that didn't know about the index are now committed + * or rolled back. Thus, each visible tuple is either the end of its + * HOT-chain or the extension of the chain is HOT-safe for this index. + */ + + /* Open and lock the parent heap relation */ + rel = heap_openrv(heapRelation, ShareUpdateExclusiveLock); + + /* And the target index relation */ + indexRelation = index_open(indexRelationId, RowExclusiveLock); + + /* Set ActiveSnapshot since functions in the indexes may need it */ + ActiveSnapshot = CopySnapshot(GetTransactionSnapshot()); + + /* We have to re-build the IndexInfo struct, since it was lost in commit */ + indexInfo = BuildIndexInfo(indexRelation); + Assert(!indexInfo->ii_ReadyForInserts); + indexInfo->ii_Concurrent = true; + indexInfo->ii_BrokenHotChain = false; + + /* Now build the index */ + index_build(rel, indexRelation, indexInfo, primary); + + /* Close both the relations, but keep the locks */ + heap_close(rel, NoLock); + index_close(indexRelation, NoLock); + + /* + * Update the pg_index row to mark the index as ready for inserts. + * Once we commit this transaction, any new transactions that + * open the table must insert new entries into the index for insertions + * and non-HOT updates. + */ + pg_index = heap_open(IndexRelationId, RowExclusiveLock); + + indexTuple = SearchSysCacheCopy(INDEXRELID, + ObjectIdGetDatum(indexRelationId), + 0, 0, 0); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "cache lookup failed for index %u", indexRelationId); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + + Assert(!indexForm->indisready); + Assert(!indexForm->indisvalid); + + indexForm->indisready = true; + + simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); + CatalogUpdateIndexes(pg_index, indexTuple); + + heap_close(pg_index, RowExclusiveLock); + + /* + * Commit this transaction to make the indisready update visible. + */ + CommitTransactionCommand(); + StartTransactionCommand(); + + /* + * Phase 3 of concurrent index build + * + * We once again wait until no transaction can have the table open with + * the index marked as read-only for updates. 
+ */ old_lockholders = GetLockConflicts(&heaplocktag, ShareLock); while (VirtualTransactionIdIsValid(*old_lockholders)) @@ -505,7 +611,7 @@ DefineIndex(RangeVar *heapRelation, /* * Now take the "reference snapshot" that will be used by validate_index() - * to filter candidate tuples. Beware! There might be still snapshots + * to filter candidate tuples. Beware! There might still be snapshots * in use that treat some transaction as in-progress that our reference * snapshot treats as committed. If such a recently-committed transaction * deleted tuples in the table, we will not include them in the index; yet @@ -560,7 +666,7 @@ DefineIndex(RangeVar *heapRelation, elog(ERROR, "cache lookup failed for index %u", indexRelationId); indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - Assert(indexForm->indexrelid = indexRelationId); + Assert(indexForm->indisready); Assert(!indexForm->indisvalid); indexForm->indisvalid = true; @@ -575,7 +681,8 @@ DefineIndex(RangeVar *heapRelation, * relcache entries for the index itself, but we should also send a * relcache inval on the parent table to force replanning of cached plans. * Otherwise existing sessions might fail to use the new index where it - * would be useful. + * would be useful. (Note that our earlier commits did not create + * reasons to replan; relcache flush on the index itself was sufficient.) */ CacheInvalidateRelcacheByRelid(heaprelid.relId); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 73024a7e703..25d1e2311b6 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.145 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.146 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1281,7 +1281,7 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) itemsz = record->xl_len - sizeof(xl_seq_rec); itemsz = MAXALIGN(itemsz); if (PageAddItem(page, (Item) item, itemsz, - FirstOffsetNumber, false) == InvalidOffsetNumber) + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(PANIC, "seq_redo: failed to add item to page"); PageSetLSN(page, lsn); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f9b9423534e..5630fc2730d 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.358 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.359 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -124,10 +124,11 @@ typedef VTupleMoveData *VTupleMove; typedef struct VRelStats { /* miscellaneous statistics */ - BlockNumber rel_pages; - double rel_tuples; - Size min_tlen; - Size max_tlen; + BlockNumber rel_pages; /* pages in relation */ + double rel_tuples; /* tuples that remain after vacuuming */ + double rel_indexed_tuples; /* indexed tuples that remain */ + Size min_tlen; /* min surviving tuple size */ + Size max_tlen; /* max surviving tuple size */ bool hasindex; /* vtlinks array for tuple chain following - sorted by new_tid */ int num_vtlinks; @@ -1177,6 +1178,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) vacrelstats = (VRelStats *) palloc(sizeof(VRelStats)); vacrelstats->rel_pages = 0; vacrelstats->rel_tuples = 0; + vacrelstats->rel_indexed_tuples = 
0; vacrelstats->hasindex = false; /* scan the heap */ @@ -1195,13 +1197,13 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) { for (i = 0; i < nindexes; i++) vacuum_index(&vacuum_pages, Irel[i], - vacrelstats->rel_tuples, 0); + vacrelstats->rel_indexed_tuples, 0); } else { /* just scan indexes to update statistic */ for (i = 0; i < nindexes; i++) - scan_index(Irel[i], vacrelstats->rel_tuples); + scan_index(Irel[i], vacrelstats->rel_indexed_tuples); } } @@ -1256,6 +1258,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, BlockNumber empty_pages, empty_end_pages; double num_tuples, + num_indexed_tuples, tups_vacuumed, nkeep, nunused; @@ -1278,7 +1281,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, relname))); empty_pages = empty_end_pages = 0; - num_tuples = tups_vacuumed = nkeep = nunused = 0; + num_tuples = num_indexed_tuples = tups_vacuumed = nkeep = nunused = 0; free_space = 0; nblocks = RelationGetNumberOfBlocks(onerel); @@ -1313,9 +1316,13 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, * background writer will try to write the page if it's already marked * dirty. To ensure that invalid data doesn't get written to disk, we * must take exclusive buffer lock wherever we potentially modify - * pages. + * pages. In fact, we insist on cleanup lock so that we can safely + * call heap_page_prune(). (This might be overkill, since the bgwriter + * pays no attention to individual tuples, but on the other hand it's + * unlikely that the bgwriter has this particular page pinned at this + * instant. So violating the coding rule would buy us little anyway.) */ - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buf); vacpage->blkno = blkno; vacpage->offsets_used = 0; @@ -1356,6 +1363,21 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, continue; } + /* + * Prune all HOT-update chains in this page. + * + * We use the redirect_move option so that redirecting line pointers + * get collapsed out; this allows us to not worry about them below. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, + true, false); + + /* + * Now scan the page to collect vacuumable items and check for + * tuples requiring freezing. + */ nfrozen = 0; notup = true; maxoff = PageGetMaxOffsetNumber(page); @@ -1369,7 +1391,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, /* * Collect un-used items too - it's possible to have indexes - * pointing here after crash. + * pointing here after crash. (That's an ancient comment and + * is likely obsolete with WAL, but we might as well continue + * to check for such problems.) */ if (!ItemIdIsUsed(itemid)) { @@ -1378,6 +1402,23 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, continue; } + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting + * (at least in the common case where heap_page_prune() just + * freed up a non-HOT tuple). 
+ */ + if (ItemIdIsDead(itemid)) + { + vacpage->offsets[vacpage->offsets_free++] = offnum; + continue; + } + + /* Shouldn't have any redirected items anymore */ + if (!ItemIdIsNormal(itemid)) + elog(ERROR, "relation \"%s\" TID %u/%u: unexpected redirect item", + relname, blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); ItemPointerSet(&(tuple.t_self), blkno, offnum); @@ -1410,12 +1451,45 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, } break; case HEAPTUPLE_DEAD: - tupgone = true; /* we can delete the tuple */ /* - * We need not require XMIN_COMMITTED or XMAX_COMMITTED to - * be set, since we will remove the tuple without any - * further examination of its hint bits. + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition, though it + * does suggest that someone released a lock early. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it as if it + * were RECENTLY_DEAD, and abandon shrinking. (XXX is it + * worth trying to make the shrinking code smart enough + * to handle this? It's an unusual corner case.) + * + * DEAD heap-only tuples can safely be removed if they + * aren't themselves HOT-updated, although this is a bit + * inefficient since we'll uselessly try to remove + * index entries for them. */ + if (HeapTupleIsHotUpdated(&tuple)) + { + nkeep += 1; + if (do_shrinking) + ereport(LOG, + (errmsg("relation \"%s\" TID %u/%u: dead HOT-updated tuple --- cannot shrink relation", + relname, blkno, offnum))); + do_shrinking = false; + } + else + { + tupgone = true; /* we can delete the tuple */ + /* + * We need not require XMIN_COMMITTED or + * XMAX_COMMITTED to be set, since we will remove the + * tuple without any further examination of its hint + * bits. 
+ */ + } break; case HEAPTUPLE_RECENTLY_DEAD: @@ -1530,6 +1604,8 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, else { num_tuples += 1; + if (!HeapTupleIsHeapOnly(&tuple)) + num_indexed_tuples += 1; notup = false; if (tuple.t_len < min_tlen) min_tlen = tuple.t_len; @@ -1549,7 +1625,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, if (tempPage != NULL) { /* Some tuples are removable; figure free space after removal */ - PageRepairFragmentation(tempPage, NULL); + PageRepairFragmentation(tempPage); vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage); pfree(tempPage); do_reap = true; @@ -1558,7 +1634,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, { /* Just use current available space */ vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page); - /* Need to reap the page if it has LP_UNUSED line pointers */ + /* Need to reap the page if it has UNUSED or DEAD line pointers */ do_reap = (vacpage->offsets_free > 0); } @@ -1621,6 +1697,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, /* save stats in the rel list for use later */ vacrelstats->rel_tuples = num_tuples; + vacrelstats->rel_indexed_tuples = num_indexed_tuples; vacrelstats->rel_pages = nblocks; if (num_tuples == 0) min_tlen = max_tlen = 0; @@ -1720,6 +1797,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, num_fraged_pages, vacuumed_pages; int keep_tuples = 0; + int keep_indexed_tuples = 0; PGRUsage ru0; pg_rusage_init(&ru0); @@ -1845,6 +1923,16 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + if (ItemIdIsDead(itemid)) + { + /* just remember it for vacuum_page() */ + vacpage->offsets[vacpage->offsets_free++] = offnum; + continue; + } + + /* Shouldn't have any redirected items now */ + Assert(ItemIdIsNormal(itemid)); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple_len = tuple.t_len = ItemIdGetLength(itemid); ItemPointerSet(&(tuple.t_self), blkno, offnum); @@ -1906,12 +1994,28 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (i >= vacpage->offsets_free) /* not found */ { vacpage->offsets[vacpage->offsets_free++] = offnum; + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. + */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples--; keep_tuples--; } } else { vacpage->offsets[vacpage->offsets_free++] = offnum; + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. 
+ */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples--; keep_tuples--; } continue; @@ -2028,7 +2132,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, break; } nextItemid = PageGetItemId(nextPage, nextOffnum); - if (!ItemIdIsUsed(nextItemid)) + if (!ItemIdIsNormal(nextItemid)) { ReleaseBuffer(nextBuf); break; @@ -2166,7 +2270,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, Pitemid = PageGetItemId(Ppage, ItemPointerGetOffsetNumber(&(tp.t_self))); /* this can't happen since we saw tuple earlier: */ - if (!ItemIdIsUsed(Pitemid)) + if (!ItemIdIsNormal(Pitemid)) elog(ERROR, "parent itemid marked as unused"); PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); @@ -2268,6 +2372,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, dst_buffer, dst_page, destvacpage, &ec, &Ctid, vtmove[ti].cleanVpd); + /* + * If the tuple we are moving is a heap-only tuple, + * this move will generate an additional index entry, + * so increment the rel_indexed_tuples count. + */ + if (HeapTupleHeaderIsHeapOnly(tuple.t_data)) + vacrelstats->rel_indexed_tuples++; + num_moved++; if (destvacpage->blkno > last_move_dest_block) last_move_dest_block = destvacpage->blkno; @@ -2280,7 +2392,31 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, vacpage->offsets[vacpage->offsets_free++] = ItemPointerGetOffsetNumber(&(tuple.t_self)); else + { + /* + * When we move tuple chains, we may need to move + * tuples from a block that we haven't yet scanned in + * the outer walk-along-the-relation loop. Note that we + * can't be moving a tuple from a block that we have + * already scanned because if such a tuple exists, then + * we must have moved the chain along with that tuple + * when we scanned that block. IOW the test of + * (Cbuf != buf) guarantees that the tuple we are + * looking at right now is in a block which is yet to + * be scanned. + * + * We maintain two counters to correctly count the + * moved-off tuples from blocks that are not yet + * scanned (keep_tuples) and how many of them have + * index pointers (keep_indexed_tuples). The main + * reason to track the latter is to help verify + * that indexes have the expected number of entries + * when all the dust settles. + */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples++; keep_tuples++; + } ReleaseBuffer(dst_buffer); ReleaseBuffer(Cbuf); @@ -2328,6 +2464,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, move_plain_tuple(onerel, buf, page, &tuple, dst_buffer, dst_page, dst_vacpage, &ec); + /* + * If the tuple we are moving is a heap-only tuple, + * this move will generate an additional index entry, + * so increment the rel_indexed_tuples count. + */ + if (HeapTupleHeaderIsHeapOnly(tuple.t_data)) + vacrelstats->rel_indexed_tuples++; + num_moved++; if (dst_vacpage->blkno > last_move_dest_block) last_move_dest_block = dst_vacpage->blkno; @@ -2361,6 +2505,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -2389,6 +2536,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { vacpage->offsets[vacpage->offsets_free++] = off; Assert(keep_tuples > 0); + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. 
+ */ + if (!HeapTupleHeaderIsHeapOnly(htup)) + keep_indexed_tuples--; keep_tuples--; } } @@ -2396,6 +2551,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { vacpage->offsets[vacpage->offsets_free++] = off; Assert(keep_tuples > 0); + if (!HeapTupleHeaderIsHeapOnly(htup)) + keep_indexed_tuples--; keep_tuples--; } } @@ -2529,11 +2686,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, * page during chain moves but not been scanned over subsequently. * The tuple ids of these tuples are not recorded as free offsets * for any VacPage, so they will not be cleared from the indexes. + * keep_indexed_tuples is the portion of these that are expected + * to have index entries. */ Assert(keep_tuples >= 0); for (i = 0; i < nindexes; i++) vacuum_index(&Nvacpagelist, Irel[i], - vacrelstats->rel_tuples, keep_tuples); + vacrelstats->rel_indexed_tuples, + keep_indexed_tuples); } /* @@ -2551,7 +2711,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, OffsetNumber unused[MaxOffsetNumber]; OffsetNumber offnum, maxoff; - int uncnt; + int uncnt = 0; int num_tuples = 0; buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy); @@ -2567,6 +2727,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -2584,12 +2747,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, ItemIdSetUnused(itemid); num_tuples++; + + unused[uncnt++] = offnum; } Assert(vacpage->offsets_free == num_tuples); START_CRIT_SECTION(); - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buf); @@ -2598,7 +2763,10 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buf, unused, uncnt); + recptr = log_heap_clean(onerel, buf, + NULL, 0, NULL, 0, + unused, uncnt, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -2706,15 +2874,17 @@ move_chain_tuple(Relation rel, /* * Update the state of the copied tuple, and store it on the destination - * page. + * page. The copied tuple is never part of a HOT chain. */ newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF); newtup.t_data->t_infomask |= HEAP_MOVED_IN; + HeapTupleHeaderClearHotUpdated(newtup.t_data); + HeapTupleHeaderClearHeapOnly(newtup.t_data); HeapTupleHeaderSetXvac(newtup.t_data, myXID); newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain", (unsigned long) tuple_len, dst_vacpage->blkno); @@ -2809,17 +2979,19 @@ move_plain_tuple(Relation rel, START_CRIT_SECTION(); /* - * Mark new tuple as MOVED_IN by me. + * Mark new tuple as MOVED_IN by me; also mark it not HOT. 
*/ newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF); newtup.t_data->t_infomask |= HEAP_MOVED_IN; + HeapTupleHeaderClearHotUpdated(newtup.t_data); + HeapTupleHeaderClearHeapOnly(newtup.t_data); HeapTupleHeaderSetXvac(newtup.t_data, myXID); /* add tuple to the page */ newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)", (unsigned long) tuple_len, @@ -2934,6 +3106,9 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -3019,10 +3194,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) { - OffsetNumber unused[MaxOffsetNumber]; - int uncnt; Page page = BufferGetPage(buffer); - ItemId itemid; int i; /* There shouldn't be any tuples moved onto the page yet! */ @@ -3032,11 +3204,12 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) for (i = 0; i < vacpage->offsets_free; i++) { - itemid = PageGetItemId(page, vacpage->offsets[i]); + ItemId itemid = PageGetItemId(page, vacpage->offsets[i]); + ItemIdSetUnused(itemid); } - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buffer); @@ -3045,7 +3218,10 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buffer, unused, uncnt); + recptr = log_heap_clean(onerel, buffer, + NULL, 0, NULL, 0, + vacpage->offsets, vacpage->offsets_free, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -3527,8 +3703,7 @@ enough_space(VacPage vacpage, Size len) static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page) { - PageHeader pd = (PageHeader) page; - Size freespace = pd->pd_upper - pd->pd_lower; + Size freespace = PageGetHeapFreeSpace(page); Size targetfree; targetfree = RelationGetTargetPageFreeSpace(relation, diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 3faf172acbf..b9050719cb4 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -36,7 +36,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.96 2007/09/16 02:37:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.97 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -326,8 +326,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); - /* Initially, we only need shared access to the buffer */ - LockBuffer(buf, BUFFER_LOCK_SHARE); + /* We need buffer cleanup lock so that we can prune HOT chains. */ + LockBufferForCleanup(buf); page = BufferGetPage(buf); @@ -341,11 +341,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). 
To - * interlock against that, release the buffer read lock (which we - * must do anyway) and grab the relation extension lock before - * re-locking in exclusive mode. If the page is still - * uninitialized by then, it must be left over from a crashed - * backend, and we can initialize it. + * protect against that, release the buffer lock, grab the + * relation extension lock momentarily, and re-lock the buffer. + * If the page is still uninitialized by then, it must be left + * over from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to @@ -357,7 +356,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, @@ -366,7 +365,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); @@ -377,11 +376,23 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, { empty_pages++; lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); continue; } + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, + false, false); + + /* + * Now scan the page to collect vacuumable items and check for + * tuples requiring freezing. + */ nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; @@ -394,22 +405,64 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, itemid = PageGetItemId(page, offnum); + /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + hastup = true; /* this page won't be truncatable */ + continue; + } + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting + * (at least in the common case where heap_page_prune() just + * freed up a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + continue; + } + + Assert(ItemIdIsNormal(itemid)); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple.t_self), blkno, offnum); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: - tupgone = true; /* we can delete the tuple */ + /* + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it just as + * if it were RECENTLY_DEAD. 
Also, if it's a heap-only + * tuple, we choose to keep it, because it'll be a + * lot cheaper to get rid of it in the next pruning pass + * than to treat it like an indexed tuple. + */ + if (HeapTupleIsHotUpdated(&tuple) || + HeapTupleIsHeapOnly(&tuple)) + nkeep += 1; + else + tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ @@ -449,11 +502,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, /* * Each non-removable tuple must be checked to see if it - * needs freezing. If we already froze anything, then - * we've already switched the buffer lock to exclusive. + * needs freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, - (nfrozen > 0) ? InvalidBuffer : buf)) + InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ @@ -485,9 +537,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { - /* Trade in buffer share lock for super-exclusive lock */ - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockBufferForCleanup(buf); /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ @@ -505,7 +554,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ @@ -598,7 +647,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); lazy_record_free_space(vacrelstats, tblk, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); npages++; } @@ -615,7 +664,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) * lazy_vacuum_page() -- free dead tuples on a page * and repair its fragmentation. * - * Caller must hold pin and lock on the buffer. + * Caller must hold pin and buffer cleanup lock on the buffer. * * tupindex is the index in vacrelstats->dead_tuples of the first dead * tuple for this page. We assume the rest follow sequentially. 
@@ -625,10 +674,9 @@ static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats) { - OffsetNumber unused[MaxOffsetNumber]; - int uncnt; Page page = BufferGetPage(buffer); - ItemId itemid; + OffsetNumber unused[MaxOffsetNumber]; + int uncnt = 0; START_CRIT_SECTION(); @@ -636,6 +684,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, { BlockNumber tblk; OffsetNumber toff; + ItemId itemid; tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); if (tblk != blkno) @@ -643,9 +692,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); itemid = PageGetItemId(page, toff); ItemIdSetUnused(itemid); + unused[uncnt++] = toff; } - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buffer); @@ -654,7 +704,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buffer, unused, uncnt); + recptr = log_heap_clean(onerel, buffer, + NULL, 0, NULL, 0, + unused, uncnt, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -980,7 +1033,7 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats, /* * The array shouldn't overflow under normal behavior, but perhaps it * could if we are given a really small maintenance_work_mem. In that - * case, just forget the last few tuples. + * case, just forget the last few tuples (we'll get 'em next time). */ if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) { |
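On the vacuumlazy.c side, the new lazy_scan_heap() loop first calls heap_page_prune() (counting pruned tuples in tups_vacuumed) and then classifies each remaining line pointer: unused items are merely counted, redirect items are left alone, dead items are remembered for index cleanup without being recounted, and only normal items go through HeapTupleSatisfiesVacuum(). The following is a minimal, self-contained sketch of that classification step only; the SlotState enum, the toy page array, and the counters are illustration, while the real code tests actual line pointers with ItemIdIsUsed/ItemIdIsRedirected/ItemIdIsDead/ItemIdIsNormal.

#include <stdio.h>

/* The four line-pointer states that the ItemIdIs* tests distinguish. */
typedef enum { SLOT_UNUSED, SLOT_REDIRECT, SLOT_DEAD, SLOT_NORMAL } SlotState;

int main(void)
{
    /* A toy "page": one slot of each kind, in offset order. */
    SlotState page[] = { SLOT_NORMAL, SLOT_REDIRECT, SLOT_DEAD, SLOT_UNUSED };
    int nslots = (int) (sizeof(page) / sizeof(page[0]));
    int nunused = 0, ndead = 0, hastup = 0;

    for (int off = 0; off < nslots; off++)
    {
        switch (page[off])
        {
            case SLOT_UNUSED:
                nunused++;      /* nothing to process, just count it */
                break;
            case SLOT_REDIRECT:
                hastup = 1;     /* must not touch; page not truncatable */
                break;
            case SLOT_DEAD:
                ndead++;        /* remember for index cleanup, but do not
                                 * recount it as vacuumed: pruning already
                                 * accounted for the tuple it pointed to */
                break;
            case SLOT_NORMAL:
                hastup = 1;     /* real tuple: visibility check, maybe
                                 * freeze, maybe record as dead */
                break;
        }
    }
    printf("unused=%d dead-remembered=%d hastup=%d\n", nunused, ndead, hastup);
    return 0;
}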