author | Tom Lane <tgl@sss.pgh.pa.us> | 2007-09-20 17:56:33 +0000
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2007-09-20 17:56:33 +0000
commit | 282d2a03dd30804b01f8042f640d638c2ee76604 (patch)
tree | 004f08ce31f1bfb03ab55571ad7867babe5b3d7f /src/backend/access
parent | bbf4fdc2538097bb3103806e1419ceef1f289203 (diff)
download | postgresql-282d2a03dd30804b01f8042f640d638c2ee76604.tar.gz, postgresql-282d2a03dd30804b01f8042f640d638c2ee76604.zip
HOT updates. When we update a tuple without changing any of its indexed
columns, and the new version can be stored on the same heap page, we no longer
generate extra index entries for the new version. Instead, index searches
follow the HOT-chain links to ensure they find the correct tuple version.
In addition, this patch introduces the ability to "prune" dead tuples on a
per-page basis, without having to do a complete VACUUM pass to recover space.
VACUUM is still needed to clean up dead index entries, however.
Pavan Deolasee, with help from a bunch of other people.
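
To make the two conditions in the commit message concrete, here is a minimal toy sketch in C: an update can be HOT only if no indexed column changed and the new version fits on the same heap page. The struct and helper names (ToyRow, ToyPage, TOY_PAGE_SIZE) are invented for illustration; they are not the backend's data structures, and the real check compares the binary representation of every indexed column.

    /* hot_conditions.c -- toy model of when an UPDATE can be HOT. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define TOY_PAGE_SIZE 8192

    typedef struct ToyRow {
        int  id;            /* pretend this is the only indexed column */
        char payload[64];   /* non-indexed data */
    } ToyRow;

    typedef struct ToyPage {
        size_t used;        /* bytes already consumed on the page */
    } ToyPage;

    /* HOT-safety: the indexed column(s) must be unchanged. */
    static bool indexed_cols_unchanged(const ToyRow *oldrow, const ToyRow *newrow)
    {
        return oldrow->id == newrow->id;
    }

    /* Same-page fit: the new version must fit in the page's free space. */
    static bool fits_on_same_page(const ToyPage *page, size_t newlen)
    {
        return TOY_PAGE_SIZE - page->used >= newlen;
    }

    int main(void)
    {
        ToyPage page = { .used = 6000 };
        ToyRow  v1 = { 42, "old payload" };
        ToyRow  v2 = { 42, "new payload, same key" };     /* HOT candidate */
        ToyRow  v3 = { 43, "new payload, key changed" };  /* needs new index entries */

        printf("v1->v2 HOT? %d\n",
               indexed_cols_unchanged(&v1, &v2) && fits_on_same_page(&page, sizeof v2));
        printf("v1->v3 HOT? %d\n",
               indexed_cols_unchanged(&v1, &v3) && fits_on_same_page(&page, sizeof v3));
        return 0;
    }
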
Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/gin/ginentrypage.c | 10
-rw-r--r-- | src/backend/access/gin/ginvacuum.c | 4
-rw-r--r-- | src/backend/access/gin/ginxlog.c | 10
-rw-r--r-- | src/backend/access/gist/gist.c | 4
-rw-r--r-- | src/backend/access/gist/gistutil.c | 4
-rw-r--r-- | src/backend/access/gist/gistvacuum.c | 4
-rw-r--r-- | src/backend/access/hash/hashinsert.c | 4
-rw-r--r-- | src/backend/access/hash/hashovfl.c | 4
-rw-r--r-- | src/backend/access/hash/hashpage.c | 4
-rw-r--r-- | src/backend/access/heap/Makefile | 4
-rw-r--r-- | src/backend/access/heap/README.HOT | 489
-rw-r--r-- | src/backend/access/heap/heapam.c | 640
-rw-r--r-- | src/backend/access/heap/hio.c | 8
-rw-r--r-- | src/backend/access/heap/pruneheap.c | 702
-rw-r--r-- | src/backend/access/heap/rewriteheap.c | 8
-rw-r--r-- | src/backend/access/index/genam.c | 6
-rw-r--r-- | src/backend/access/index/indexam.c | 259
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c | 81
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 4
-rw-r--r-- | src/backend/access/nbtree/nbtxlog.c | 12
20 files changed, 2063 insertions, 198 deletions
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index a5253da0211..70867ac40ba 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.8 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.9 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ @@ -359,7 +359,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd *prdata = rdata; data.updateBlkno = entryPreparePage(btree, page, off); - placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false); + placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false); if (placed != off) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); @@ -488,7 +488,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); } - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); ptr += MAXALIGN(IndexTupleSize(itup)); @@ -563,11 +563,11 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) page = BufferGetPage(root); itup = ginPageGetLinkItup(lbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); itup = ginPageGetLinkItup(rbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 91f7f3e5f8b..1f26869d646 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.16 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.17 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ @@ -544,7 +544,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 itup = GinFormTuple(&gvs->ginstate, value, GinGetPosting(itup), newN); PageIndexTupleDelete(tmppage, i); - if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false) != i) + if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gvs->index)); diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index db2d6b39336..bf2174c37c7 100644 --- 
a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.8 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.9 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -199,7 +199,7 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert)); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); @@ -281,7 +281,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < data->separator; i++) { - if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); @@ -289,7 +289,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) for (i = data->separator; i < data->nitem; i++) { - if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); @@ -375,7 +375,7 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < data->nitem; i++) { - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index fce9a94ebae..0c1b94d7d38 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.146 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.147 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -366,7 +366,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { - if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item 
to index page in \"%s\"", RelationGetRelationName(state->r)); data += IndexTupleSize((IndexTuple) data); } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 6d4f31d53b2..409377d1d14 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.23 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.24 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -42,7 +42,7 @@ gistfillbuffer(Relation r, Page page, IndexTuple *itup, for (i = 0; i < len; i++) { l = PageAddItem(page, (Item) itup[i], IndexTupleSize(itup[i]), - off, false); + off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(r)); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 0abd0197ad3..212995e7c57 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.31 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.32 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -201,7 +201,7 @@ vacuumSplitPage(GistVacuum *gv, Page tempPage, Buffer buffer, IndexTuple *addon, data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { - if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gv->index)); data += IndexTupleSize((IndexTuple) data); } diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index c82ad0ad9fb..8cbf0294acf 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.46 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.47 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -200,7 +200,7 @@ _hash_pgaddtup(Relation rel, page = BufferGetPage(buf); itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); - if (PageAddItem(page, (Item) itup, itemsize, itup_off, false) + if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 7e87f308b26..e4ea24a62d1 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.59 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.60 2007/09/20 17:56:30 tgl Exp $ * * NOTES * Overflow 
pages look like ordinary relation pages. @@ -684,7 +684,7 @@ _hash_squeezebucket(Relation rel, * we have found room so insert on the "write" page. */ woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage)); - if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false) + if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 4b1450926d3..807dbed8a8c 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.69 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.70 2007/09/20 17:56:30 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -830,7 +830,7 @@ _hash_splitbucket(Relation rel, } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); - if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false) + if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile index ac2401232bb..aff2847bab5 100644 --- a/src/backend/access/heap/Makefile +++ b/src/backend/access/heap/Makefile @@ -4,7 +4,7 @@ # Makefile for access/heap # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.16 2007/06/08 18:23:52 tgl Exp $ +# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.17 2007/09/20 17:56:30 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/access/heap top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heapam.o hio.o rewriteheap.o syncscan.o tuptoaster.o +OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o all: SUBSYS.o diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT new file mode 100644 index 00000000000..8cf0fa44de6 --- /dev/null +++ b/src/backend/access/heap/README.HOT @@ -0,0 +1,489 @@ +$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.1 2007/09/20 17:56:30 tgl Exp $ + + Heap Only Tuples (HOT) + +Introduction +------------ + +The Heap Only Tuple (HOT) feature eliminates redundant index entries and +allows the re-use of space taken by DELETEd or obsoleted UPDATEd tuples +without performing a table-wide vacuum. It does this by allowing +single-page vacuuming, also called "defragmentation". + +Note: there is a Glossary at the end of this document that may be helpful +for first-time readers. + + +Technical Challenges +-------------------- + +Page-at-a-time vacuuming is normally impractical because of the costs of +finding and removing the index entries that link to the tuples to be +reclaimed. Standard vacuuming scans the indexes to ensure all such index +entries are removed, amortizing the index scan cost across as many dead +tuples as possible; this approach does not scale down well to the case of +reclaiming just a few tuples. In principle one could recompute the index +keys and do standard index searches to find the index entries, but this is +risky in the presence of possibly-buggy user-defined functions in +functional indexes. 
An allegedly immutable function that in fact is not +immutable might prevent us from re-finding an index entry (and we cannot +throw an error for not finding it, in view of the fact that dead index +entries are sometimes reclaimed early). That would lead to a seriously +corrupt index, in the form of entries pointing to tuple slots that by now +contain some unrelated content. In any case we would prefer to be able +to do vacuuming without invoking any user-written code. + +HOT solves this problem for a restricted but useful special case: +where a tuple is repeatedly updated in ways that do not change its +indexed columns. (Here, "indexed column" means any column referenced +at all in an index definition, including for example columns that are +tested in a partial-index predicate but are not stored in the index.) + +An additional property of HOT is that it reduces index size by avoiding +the creation of identically-keyed index entries. This improves search +speeds. + + +Update Chains With a Single Index Entry +--------------------------------------- + +Without HOT, every version of a row in an update chain has its own index +entries, even if all indexed columns are the same. With HOT, a new tuple +placed on the same page and with all indexed columns the same as its +parent row version does not get new index entries. This means there is +only one index entry for the entire update chain on the heap page. +An index-entry-less tuple is marked with the HEAP_ONLY_TUPLE flag. +The prior row version is marked HEAP_HOT_UPDATED, and (as always in an +update chain) its t_ctid field links forward to the newer version. + +For example: + + Index points to 1 + lp [1] [2] + + [111111111]->[2222222222] + +In the above diagram, the index points to line pointer 1, and tuple 1 is +marked as HEAP_HOT_UPDATED. Tuple 2 is a HOT tuple, meaning it has +no index entry pointing to it, and is marked as HEAP_ONLY_TUPLE. +Although tuple 2 is not directly referenced by the index, it can still be +found by an index search: after traversing from the index to tuple 1, +the index search proceeds forward to child tuples as long as it sees the +HEAP_HOT_UPDATED flag set. Since we restrict the HOT chain to lie within +a single page, this requires no additional page fetches and doesn't +introduce much performance penalty. + +Eventually, tuple 1 will no longer be visible to any transaction. +At that point its space could be reclaimed, but its line pointer cannot, +since the index still links to that line pointer and we still need to +be able to find tuple 2 in an index search. HOT handles this by turning +line pointer 1 into a "redirecting line pointer", which links to tuple 2 +but has no actual tuple attached. This state of affairs looks like + + Index points to 1 + lp [1]->[2] + + [2222222222] + +If now the row is updated again, to version 3, the page looks like this: + + Index points to 1 + lp [1]->[2] [3] + + [2222222222]->[3333333333] + +At some later time when no transaction can see tuple 2 in its snapshot, +tuple 2 and its line pointer can be pruned entirely: + + Index points to 1 + lp [1]------>[3] + + [3333333333] + +This is safe because no index entry points to line pointer 2. Subsequent +insertions into the page can now recycle both line pointer 2 and the +space formerly used by tuple 2. + +If an update changes any indexed column, or there is not room on the +same page for the new tuple, then the HOT chain ends: the last member +has a regular t_ctid link to the next version and is not marked +HEAP_HOT_UPDATED. 
(In principle we could continue a HOT chain across +pages, but this would destroy the desired property of being able to +reclaim space with just page-local manipulations. Anyway, we don't +want to have to chase through multiple heap pages to get from an index +entry to the desired tuple, so it seems better to create a new index +entry for the new tuple.) If further updates occur, the next version +could become the root of a new HOT chain. + +Line pointer 1 has to remain as long as there is any non-dead member of +the chain on the page. When there is not, it is marked "dead". +This lets us reclaim the last child line pointer and associated tuple +immediately. The next regular VACUUM pass can reclaim the index entries +pointing at the line pointer and then the line pointer itself. Since a +line pointer is small compared to a tuple, this does not represent an +undue space cost. + +Note: we can use a "dead" line pointer for any DELETEd tuple, +whether it was part of a HOT chain or not. This allows space reclamation +in advance of running VACUUM for plain DELETEs as well as HOT updates. + +The requirement for doing a HOT update is that none of the indexed +columns are changed. This is checked at execution time by comparing the +binary representation of the old and new values. We insist on bitwise +equality rather than using datatype-specific equality routines. The +main reason to avoid the latter is that there might be multiple notions +of equality for a datatype, and we don't know exactly which one is +relevant for the indexes at hand. We assume that bitwise equality +guarantees equality for all purposes. + + +Abort Cases +----------- + +If a heap-only tuple's xmin is aborted, then it can be removed immediately: +it was never visible to any other transaction, and all descendant row +versions must be aborted as well. Therefore we need not consider it part +of a HOT chain. By the same token, if a HOT-updated tuple's xmax is +aborted, there is no need to follow the chain link. However, there is a +race condition here: the transaction that did the HOT update might abort +between the time we inspect the HOT-updated tuple and the time we reach +the descendant heap-only tuple. It is conceivable that someone prunes +the heap-only tuple before that, and even conceivable that the line pointer +is re-used for another purpose. Therefore, when following a HOT chain, +it is always necessary to be prepared for the possibility that the +linked-to item pointer is unused, dead, or redirected; and if it is a +normal item pointer, we still have to check that XMIN of the tuple matches +the XMAX of the tuple we left. Otherwise we should assume that we have +come to the end of the HOT chain. Note that this sort of XMIN/XMAX +matching is required when following ordinary update chains anyway. + +(Early versions of the HOT code assumed that holding pin on the page +buffer while following a HOT link would prevent this type of problem, +but checking XMIN/XMAX matching is a much more robust solution.) + + +Index/Sequential Scans +---------------------- + +When doing an index scan, whenever we reach a HEAP_HOT_UPDATED tuple whose +xmax is not aborted, we need to follow its t_ctid link and check that +entry as well; possibly repeatedly until we reach the end of the HOT +chain. (When using an MVCC snapshot it is possible to optimize this a +bit: there can be at most one visible tuple in the chain, so we can stop +when we find it. This rule does not work for non-MVCC snapshots, though.) 
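
The chain-following rule just described can be sketched as a small toy model: walk forward while HEAP_HOT_UPDATED is set, check each member's xmin against the previous member's xmax, and stop at the first tuple visible to the (MVCC) snapshot. This is an illustrative in-memory sketch under invented types, not the backend code; the real logic is heap_hot_search_buffer, added further down in this patch.

    /* Toy, single-page model of following a HOT chain during an index scan. */
    #include <stdbool.h>
    #include <stdio.h>

    #define CHAIN_END (-1)

    typedef struct ToyTuple {
        unsigned xmin, xmax;  /* creating / deleting transaction ids */
        bool hot_updated;     /* HEAP_HOT_UPDATED: a newer version follows */
        int  next;            /* slot of the next version on the page, or CHAIN_END */
        bool visible;         /* pretend result of the snapshot test */
    } ToyTuple;

    /* Return the slot of the first visible chain member, or CHAIN_END. */
    static int
    follow_hot_chain(const ToyTuple *page, int root)
    {
        unsigned prev_xmax = 0;   /* 0 stands in for InvalidTransactionId */
        int slot = root;

        while (slot != CHAIN_END)
        {
            const ToyTuple *tup = &page[slot];

            /* Broken chain: member's xmin must match previous member's xmax. */
            if (prev_xmax != 0 && tup->xmin != prev_xmax)
                return CHAIN_END;

            if (tup->visible)
                return slot;      /* MVCC snapshot: at most one visible member */

            if (!tup->hot_updated)
                return CHAIN_END; /* end of chain, nothing visible */

            prev_xmax = tup->xmax;
            slot = tup->next;
        }
        return CHAIN_END;
    }

    int main(void)
    {
        ToyTuple page[3] = {
            { .xmin = 100, .xmax = 101, .hot_updated = true,  .next = 1,         .visible = false },
            { .xmin = 101, .xmax = 102, .hot_updated = true,  .next = 2,         .visible = false },
            { .xmin = 102, .xmax = 0,   .hot_updated = false, .next = CHAIN_END, .visible = true  },
        };
        printf("visible member at slot %d\n", follow_hot_chain(page, 0));
        return 0;
    }
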
+ +Sequential scans do not need to pay attention to the HOT links because +they scan every item pointer on the page anyway. The same goes for a +bitmap heap scan with a lossy bitmap. + + +Pruning +------- + +HOT pruning means updating item pointers so that HOT chains are +reduced in length, by collapsing out line pointers for intermediate dead +tuples. Although this makes those line pointers available for re-use, +it does not immediately make the space occupied by their tuples available. + + +Defragmentation +--------------- + +Defragmentation centralizes unused space. After we have converted root +line pointers to redirected line pointers and pruned away any dead +intermediate line pointers, the tuples they linked to are free space. +But unless that space is adjacent to the central "hole" on the page +(the pd_lower-to-pd_upper area) it cannot be used by tuple insertion. +Defragmentation moves the surviving tuples to coalesce all the free +space into one "hole". This is done with the same PageRepairFragmentation +function that regular VACUUM uses. + + +When can/should we prune or defragment? +--------------------------------------- + +This is the most interesting question in HOT implementation, since there +is no simple right answer: we must use heuristics to determine when it's +most efficient to perform pruning and/or defragmenting. + +We cannot prune or defragment unless we can get a "buffer cleanup lock" +on the target page; otherwise, pruning might destroy line pointers that +other backends have live references to, and defragmenting might move +tuples that other backends have live pointers to. Thus the general +approach must be to heuristically decide if we should try to prune +or defragment, and if so try to acquire the buffer cleanup lock without +blocking. If we succeed we can proceed with our housekeeping work. +If we cannot get the lock (which should not happen often, except under +very heavy contention) then the housekeeping has to be postponed till +some other time. The worst-case consequence of this is only that an +UPDATE cannot be made HOT but has to link to a new tuple version placed on +some other page, for lack of centralized space on the original page. + +Ideally we would do defragmenting only when we are about to attempt +heap_update on a HOT-safe tuple. The difficulty with this approach +is that the update query has certainly got a pin on the old tuple, and +therefore our attempt to acquire a buffer cleanup lock will always fail. +(This corresponds to the idea that we don't want to move the old tuple +out from under where the query's HeapTuple pointer points. It might +be possible to finesse that, but it seems fragile.) + +Pruning, however, is potentially useful even when we are not about to +insert a new tuple, since shortening a HOT chain reduces the cost of +subsequent index searches. However it is unclear that this gain is +large enough to accept any extra maintenance burden for. + +The currently planned heuristic is to prune and defrag when first accessing +a page that potentially has prunable tuples (flagged by the PD_PRUNABLE +page hint bit) and that either has free space less than MAX(fillfactor +target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to +find enough free space to store an updated tuple version. (These rules +are subject to change.) 
+ +We have effectively implemented the "truncate dead tuples to just line +pointer" idea that has been proposed and rejected before because of fear +of line pointer bloat: we might end up with huge numbers of line pointers +and just a few actual tuples on a page. To limit the damage in the worst +case, and to keep various work arrays as well as the bitmaps in bitmap +scans reasonably sized, the maximum number of line pointers per page +is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that +could fit without HOT pruning). + + +VACUUM +------ + +There is little change to regular vacuum. It performs pruning to remove +dead heap-only tuples, and cleans up any dead line pointers as if they were +regular dead tuples. + + +VACUUM FULL +----------- + +VACUUM FULL performs an extra operation of collapsing out redirecting line +pointers, by moving the first non-DEAD tuple of each HOT chain to the root +position and clearing its heap-only-tuple flag. This effectively changes +the user-visible CTID of that tuple. This would be completely unsafe +during normal concurrent operation, but since VACUUM FULL takes full +exclusive lock on the table, it should be OK. (Note that VACUUM FULL has +always felt free to change tuples' CTIDs by moving them across pages.) +Eliminating redirection links means that the main body of VACUUM FULL +doesn't have to deal with them, which seems a good thing since VACUUM FULL +is horrendously complex already. + +When VACUUM FULL tries to move tuple chains, it does not distinguish regular +and heap-only tuples, but just moves both types the same. This is OK because +it will move the entire non-DEAD tail of an update chain and remove index +entries for each item moved. At worst, we'll uselessly search for index +entries matching the heap-only tuples included in the move. + + +Statistics +---------- + +Currently, we count HOT updates the same as cold updates for statistics +purposes, though there is an additional per-table counter that counts +only HOT updates. When a page pruning operation is able to remove a +physical tuple by eliminating an intermediate heap-only tuple or +replacing a physical root tuple by a redirect pointer, a decrement in +the table's number of dead tuples is reported to pgstats, which may +postpone autovacuuming. Note that we do not count replacing a root tuple +by a DEAD item pointer as decrementing n_dead_tuples; we still want +autovacuum to run to clean up the index entries and DEAD item. + +This area probably needs further work ... + + +CREATE INDEX +------------ + +CREATE INDEX presents a problem for HOT updates. While the existing HOT +chains all have the same index values for existing indexes, the columns +in the new index might change within a pre-existing HOT chain, creating +a "broken" chain that can't be indexed properly. + +To address this issue, regular (non-concurrent) CREATE INDEX makes the +new index usable only by transactions newer than the CREATE INDEX +command. This prevents transactions that can see the inconsistent HOT +chains from trying to use the new index and getting incorrect results. +New transactions can only see the rows visible after the index was +created, hence the HOT chains are consistent for them. + +Entries in the new index point to root tuples (tuples with current index +pointers) so that our index uses the same index pointers as all other +indexes on the table. However the row we want to index is actually at +the *end* of the chain, ie, the most recent live tuple on the HOT chain. 
+That is the one we compute the index entry values for, but the TID +we put into the index is that of the root tuple. Since transactions that +will be allowed to use the new index cannot see any of the older tuple +versions in the chain, the fact that they might not match the index entry +isn't a problem. (Such transactions will check the tuple visibility +information of the older versions and ignore them, without ever looking at +their contents, so the content inconsistency is OK.) Subsequent updates +to the live tuple will be allowed to extend the HOT chain only if they are +HOT-safe for all the indexes. + +Because we have ShareLock on the table, any DELETE_IN_PROGRESS or +INSERT_IN_PROGRESS tuples should have come from our own transaction. +Therefore we can consider them committed since if the CREATE INDEX +commits, they will be committed, and if it aborts the index is discarded. +An exception to this is that early lock release is customary for system +catalog updates, and so we might find such tuples when reindexing a system +catalog. In that case we deal with it by waiting for the source +transaction to commit or roll back. (We could do that for user tables +too, but since the case is unexpected we prefer to throw an error.) + +Practically, we prevent old transactions from using the new index by +setting pg_index.indcheckxmin to TRUE. Queries are allowed to use such an +index only after pg_index.xmin is below their TransactionXmin horizon, +thereby ensuring that any incompatible rows in HOT chains are dead to them. +(pg_index.xmin will be the XID of the CREATE INDEX transaction. The reason +for using xmin rather than a normal column is that the regular vacuum +freezing mechanism will take care of converting xmin to FrozenTransactionId +before it can wrap around.) + +This means in particular that the transaction creating the index will be +unable to use the index. We alleviate that problem somewhat by not setting +indcheckxmin unless the table actually contains HOT chains with +RECENTLY_DEAD members. (In 8.4 we may be able to improve the situation, +at least for non-serializable transactions, because we expect to be able to +advance TransactionXmin intratransaction.) + +Another unpleasant consequence is that it is now risky to use SnapshotAny +in an index scan: if the index was created more recently than the last +vacuum, it's possible that some of the visited tuples do not match the +index entry they are linked to. This does not seem to be a fatal +objection, since there are few users of SnapshotAny and most use seqscans. +The only exception at this writing is CLUSTER, which is okay because it +does not require perfect ordering of the indexscan readout (and especially +so because CLUSTER tends to write recently-dead tuples out of order anyway). + + +CREATE INDEX CONCURRENTLY +------------------------- + +In the concurrent case we must take a different approach. We create the +pg_index entry immediately, before we scan the table. The pg_index entry +is marked as "not ready for inserts". Then we commit and wait for any +transactions which have the table open to finish. This ensures that no +new HOT updates will change the key value for our new index, because all +transactions will see the existence of the index and will respect its +constraint on which updates can be HOT. Other transactions must include +such an index when determining HOT-safety of updates, even though they +must ignore it for both insertion and searching purposes. 
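
The point that HOT-safety has to account for an index that is not yet ready for inserts can be sketched as a toy check. All names here are invented, and the real code compares every indexed column by binary equality rather than plain integer comparison.

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct ToyIndex {
        bool ready_for_inserts;   /* may we insert entries into it yet? */
        int  indexed_attr;        /* the single column it indexes, for simplicity */
    } ToyIndex;

    static bool
    update_is_hot_safe(const ToyIndex *indexes, int nindexes,
                       const int *oldvals, const int *newvals)
    {
        for (int i = 0; i < nindexes; i++)
        {
            int attr = indexes[i].indexed_attr;

            /* Deliberately no test of ready_for_inserts: an in-build index
             * still constrains which updates are allowed to be HOT. */
            if (oldvals[attr] != newvals[attr])
                return false;
        }
        return true;
    }

    int main(void)
    {
        ToyIndex indexes[2] = {
            { .ready_for_inserts = true,  .indexed_attr = 0 },
            { .ready_for_inserts = false, .indexed_attr = 1 },  /* being built */
        };
        int oldvals[2] = { 1, 1 };
        int newvals[2] = { 1, 2 };   /* changes the column of the in-build index */

        printf("HOT-safe? %d\n", update_is_hot_safe(indexes, 2, oldvals, newvals));
        return 0;
    }

Returning to the main argument: the commit-and-wait step exists precisely so that every running transaction applies this broader check before deciding an update may be HOT.
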
+ +We must do this to avoid making incorrect index entries. For example, +suppose we are building an index on column X and we make an index entry for +a non-HOT tuple with X=1. Then some other backend, unaware that X is an +indexed column, HOT-updates the row to have X=2, and commits. We now have +an index entry for X=1 pointing at a HOT chain whose live row has X=2. +We could make an index entry with X=2 during the validation pass, but +there is no nice way to get rid of the wrong entry with X=1. So we must +have the HOT-safety property enforced before we start to build the new +index. + +After waiting for transactions which had the table open, we build the index +for all rows that are valid in a fresh snapshot. Any tuples visible in the +snapshot will have only valid forward-growing HOT chains. (They might have +older HOT updates behind them which are broken, but this is OK for the same +reason it's OK in a regular index build.) As above, we point the index +entry at the root of the HOT-update chain but we use the key value from the +live tuple. + +We mark the index open for inserts (but still not ready for reads) then +we again wait for transactions which have the table open. Then we take +a second reference snapshot and validate the index. This searches for +tuples missing from the index, and inserts any missing ones. Again, +the index entries have to have TIDs equal to HOT-chain root TIDs, but +the value to be inserted is the one from the live tuple. + +Then we wait until every transaction that could have a snapshot older than +the second reference snapshot is finished. This ensures that nobody is +alive any longer who could need to see any tuples that might be missing +from the index, as well as ensuring that no one can see any inconsistent +rows in a broken HOT chain (the first condition is stronger than the +second). Finally, we can mark the index valid for searches. + + +Limitations and Restrictions +---------------------------- + +It is worth noting that HOT forever forecloses alternative approaches +to vacuuming, specifically the recompute-the-index-keys approach alluded +to in Technical Challenges above. It'll be tough to recompute the index +keys for a root line pointer you don't have data for anymore ... + + +Glossary +-------- + +Broken HOT Chain + + A HOT chain in which the key value for an index has changed. + + This is not allowed to occur normally but if a new index is created + it can happen. In that case various strategies are used to ensure + that no transaction for which the older tuples are visible can + use the index. + +Cold update + + A normal, non-HOT update, in which index entries are made for + the new version of the tuple. + +Dead line pointer + + A stub line pointer, that does not point to anything, but cannot + be removed or reused yet because there are index pointers to it. + Semantically same as a dead tuple. It has state LP_DEAD. + +Heap-only tuple + + A heap tuple with no index pointers, which can only be reached + from indexes indirectly through its ancestral root tuple. + Marked with HEAP_ONLY_TUPLE flag. + +HOT-safe + + A proposed tuple update is said to be HOT-safe if it changes + none of the tuple's indexed columns. It will only become an + actual HOT update if we can find room on the same page for + the new tuple version. + +HOT update + + An UPDATE where the new tuple becomes a heap-only tuple, and no + new index entries are made. + +HOT-updated tuple + + An updated tuple, for which the next tuple in the chain is a + heap-only tuple. 
Marked with HEAP_HOT_UPDATED flag. + +Indexed column + + A column used in an index definition. The column might not + actually be stored in the index --- it could be used in a + functional index's expression, or used in a partial index + predicate. HOT treats all these cases alike. + +Redirecting line pointer + + A line pointer that points to another line pointer and has no + associated tuple. It has the special lp_flags state LP_REDIRECT, + and lp_off is the OffsetNumber of the line pointer it links to. + This is used when a root tuple becomes dead but we cannot prune + the line pointer because there are non-dead heap-only tuples + further down the chain. + +Root tuple + + The first tuple in a HOT update chain; the one that indexes point to. + +Update chain + + A chain of updated tuples, in which each tuple's ctid points to + the next tuple in the chain. A HOT update chain is an update chain + (or portion of an update chain) that consists of a root tuple and + one or more heap-only tuples. A complete update chain can contain + both HOT and non-HOT (cold) updated tuples. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 09a70d813f7..d5a2f9a43d1 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.240 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.241 2007/09/20 17:56:30 tgl Exp $ * * * INTERFACE ROUTINES @@ -52,6 +52,7 @@ #include "pgstat.h" #include "storage/procarray.h" #include "storage/smgr.h" +#include "utils/datum.h" #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/relcache.h" @@ -64,6 +65,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, bool is_bitmapscan); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move); +static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, + HeapTuple oldtup, HeapTuple newtup); /* ---------------------------------------------------------------- @@ -184,6 +187,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) snapshot = scan->rs_snapshot; /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + + /* * We must hold share lock on the buffer content while examining tuple * visibility. Afterwards, however, the tuples we have found to be * visible are guaranteed good as long as we hold the buffer pin. @@ -316,7 +324,7 @@ heapgettup(HeapScanDesc scan, * forward scanners. */ scan->rs_syncscan = false; - /* start from last page of the scan */ + /* start from last page of the scan */ if (scan->rs_startblock > 0) page = scan->rs_startblock - 1; else @@ -368,6 +376,7 @@ heapgettup(HeapScanDesc scan, dp = (Page) BufferGetPage(scan->rs_cbuf); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); @@ -583,7 +592,7 @@ heapgettup_pagemode(HeapScanDesc scan, * forward scanners. 
*/ scan->rs_syncscan = false; - /* start from last page of the scan */ + /* start from last page of the scan */ if (scan->rs_startblock > 0) page = scan->rs_startblock - 1; else @@ -632,6 +641,7 @@ heapgettup_pagemode(HeapScanDesc scan, dp = (Page) BufferGetPage(scan->rs_cbuf); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); @@ -1246,6 +1256,9 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction) * for statistical purposes. (This could be the heap rel itself, an * associated index, or NULL to not count the fetch at all.) * + * heap_fetch does not follow HOT chains: only the exact TID requested will + * be fetched. + * * It is somewhat inconsistent that we ereport() on invalid block number but * return false on invalid item number. There are a couple of reasons though. * One is that the caller can relatively easily check the block number for @@ -1390,6 +1403,143 @@ heap_release_fetch(Relation relation, } /* + * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot + * + * On entry, *tid is the TID of a tuple (either a simple tuple, or the root + * of a HOT chain), and buffer is the buffer holding this tuple. We search + * for the first chain member satisfying the given snapshot. If one is + * found, we update *tid to reference that tuple's offset number, and + * return TRUE. If no match, return FALSE without modifying *tid. + * + * If all_dead is not NULL, we check non-visible tuples to see if they are + * globally dead; *all_dead is set TRUE if all members of the HOT chain + * are vacuumable, FALSE if not. + * + * Unlike heap_fetch, the caller must already have pin and (at least) share + * lock on the buffer; it is still pinned/locked at exit. Also unlike + * heap_fetch, we do not report any pgstats count; caller may do so if wanted. + */ +bool +heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, + bool *all_dead) +{ + Page dp = (Page) BufferGetPage(buffer); + TransactionId prev_xmax = InvalidTransactionId; + OffsetNumber offnum; + bool at_chain_start; + + if (all_dead) + *all_dead = true; + + Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = true; + + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + HeapTupleData heapTuple; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + break; + + lp = PageGetItemId(dp, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } + + heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + heapTuple.t_len = ItemIdGetLength(lp); + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. + */ + if (at_chain_start && HeapTupleIsHeapOnly(&heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is broken. 
+ */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple.t_data))) + break; + + /* If it's visible per the snapshot, we must return it */ + if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer)) + { + ItemPointerSetOffsetNumber(tid, offnum); + if (all_dead) + *all_dead = false; + return true; + } + + /* + * If we can't see it, maybe no one else can either. At caller + * request, check whether all chain members are dead to all + * transactions. + */ + if (all_dead && *all_dead && + HeapTupleSatisfiesVacuum(heapTuple.t_data, RecentGlobalXmin, + buffer) != HEAPTUPLE_DEAD) + *all_dead = false; + + /* + * Check to see if HOT chain continues past this tuple; if so + * fetch the next offnum and loop around. + */ + if (HeapTupleIsHotUpdated(&heapTuple)) + { + Assert(ItemPointerGetBlockNumber(&heapTuple.t_data->t_ctid) == + ItemPointerGetBlockNumber(tid)); + offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid); + at_chain_start = false; + prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data); + } + else + break; /* end of chain */ + } + + return false; +} + +/* + * heap_hot_search - search HOT chain for tuple satisfying snapshot + * + * This has the same API as heap_hot_search_buffer, except that the caller + * does not provide the buffer containing the page, rather we access it + * locally. + */ +bool +heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, + bool *all_dead) +{ + bool result; + Buffer buffer; + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + return result; +} + +/* * heap_get_latest_tid - get the latest tid of a specified tuple * * Actually, this gets the latest version that is visible according to @@ -1594,6 +1744,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); tup->t_data->t_infomask |= HEAP_XMAX_INVALID; HeapTupleHeaderSetXmin(tup->t_data, xid); HeapTupleHeaderSetCmin(tup->t_data, cid); @@ -1628,6 +1779,17 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, RelationPutHeapTuple(relation, buffer, heaptup); + /* + * XXX Should we set PageSetPrunable on this page ? + * + * The inserting transaction may eventually abort thus making this tuple + * DEAD and hence available for pruning. Though we don't want to optimize + * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the + * aborted tuple will never be pruned until next vacuum is triggered. + * + * If you do add PageSetPrunable here, add it in heap_xlog_insert too. + */ + MarkBufferDirty(buffer); /* XLOG stuff */ @@ -1904,12 +2066,21 @@ l1: START_CRIT_SECTION(); + /* + * If this transaction commits, the tuple will become DEAD sooner or + * later. Set hint bit that this page is a candidate for pruning. If + * the transaction finally aborts, the subsequent page pruning will be + * a no-op and the hint will be cleared. 
+ */ + PageSetPrunable((Page) dp); + /* store transaction information of xact deleting the tuple */ tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleHeaderClearHotUpdated(tp.t_data); HeapTupleHeaderSetXmax(tp.t_data, xid); HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); /* Make sure there is no forward chain link in t_ctid */ @@ -2045,7 +2216,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) * * On success, the header fields of *newtup are updated to match the new * stored tuple; in particular, newtup->t_self is set to the TID where the - * new tuple was inserted. However, any TOAST changes in the new tuple's + * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT + * update was done. However, any TOAST changes in the new tuple's * data are not reflected into *newtup. * * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. @@ -2060,6 +2232,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); + Bitmapset *hot_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; @@ -2072,9 +2245,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, pagefree; bool have_tuple_lock = false; bool iscombo; + bool use_hot_update = false; Assert(ItemPointerIsValid(otid)); + /* + * Fetch the list of attributes to be checked for HOT update. This is + * wasted effort if we fail to update or have to put the new tuple on + * a different page. But we must compute the list before obtaining + * buffer lock --- in the worst case, if we are doing an update on one + * of the relevant system catalogs, we could deadlock if we try to + * fetch the list later. In any case, the relcache caches the data + * so this is usually pretty cheap. + * + * Note that we get a copy here, so we need not worry about relcache + * flush happening midway through. + */ + hot_attrs = RelationGetIndexAttrBitmap(relation); + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid)); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -2208,6 +2396,7 @@ l2: UnlockReleaseBuffer(buffer); if (have_tuple_lock) UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + bms_free(hot_attrs); return result; } @@ -2227,6 +2416,7 @@ l2: } newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED); HeapTupleHeaderSetXmin(newtup->t_data, xid); HeapTupleHeaderSetCmin(newtup->t_data, cid); @@ -2261,17 +2451,20 @@ l2: HeapTupleHasExternal(newtup) || newtup->t_len > TOAST_TUPLE_THRESHOLD); - pagefree = PageGetFreeSpace((Page) dp); + pagefree = PageGetHeapFreeSpace((Page) dp); newtupsize = MAXALIGN(newtup->t_len); if (need_toast || newtupsize > pagefree) { + /* Clear obsolete visibility flags ... */ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleClearHotUpdated(&oldtup); + /* ... and store info about transaction updating this tuple */ HeapTupleHeaderSetXmax(oldtup.t_data, xid); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); /* temporarily make it look not-updated */ @@ -2324,7 +2517,7 @@ l2: /* Re-acquire the lock on the old tuple's page. 
*/ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* Re-check using the up-to-date free space */ - pagefree = PageGetFreeSpace((Page) dp); + pagefree = PageGetHeapFreeSpace((Page) dp); if (newtupsize > pagefree) { /* @@ -2357,18 +2550,66 @@ l2: * one pin is held. */ + if (newbuf == buffer) + { + /* + * Since the new tuple is going into the same page, we might be able + * to do a HOT update. Check if any of the index columns have been + * changed. If not, then HOT update is possible. + */ + if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup)) + use_hot_update = true; + } + else + { + /* Set a hint that the old page could use prune/defrag */ + PageSetFull(dp); + } + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); + /* + * If this transaction commits, the old tuple will become DEAD sooner or + * later. Set hint bit that this page is a candidate for pruning. If + * the transaction finally aborts, the subsequent page pruning will be + * a no-op and the hint will be cleared. + * + * XXX Should we set hint on newbuf as well? If the transaction + * aborts, there would be a prunable tuple in the newbuf; but for now + * we choose not to optimize for aborts. Note that heap_xlog_update + * must be kept in sync if this changes. + */ + PageSetPrunable(dp); + + if (use_hot_update) + { + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + } + else + { + /* Make sure tuples are correctly marked as not-HOT */ + HeapTupleClearHotUpdated(&oldtup); + HeapTupleClearHeapOnly(heaptup); + HeapTupleClearHeapOnly(newtup); + } + RelationPutHeapTuple(relation, newbuf, heaptup); /* insert new tuple */ if (!already_marked) { + /* Clear obsolete visibility flags ... */ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + /* ... and store info about transaction updating this tuple */ HeapTupleHeaderSetXmax(oldtup.t_data, xid); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); } @@ -2427,7 +2668,7 @@ l2: if (have_tuple_lock) UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); - pgstat_count_heap_update(relation); + pgstat_count_heap_update(relation, use_hot_update); /* * If heaptup is a private copy, release it. Don't forget to copy t_self @@ -2439,10 +2680,120 @@ l2: heap_freetuple(heaptup); } + bms_free(hot_attrs); + return HeapTupleMayBeUpdated; } /* + * Check if the specified attribute's value is same in both given tuples. + * Subroutine for HeapSatisfiesHOTUpdate. + */ +static bool +heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, + HeapTuple tup1, HeapTuple tup2) +{ + Datum value1, value2; + bool isnull1, isnull2; + Form_pg_attribute att; + + /* + * If it's a whole-tuple reference, say "not equal". It's not really + * worth supporting this case, since it could only succeed after a + * no-op update, which is hardly a case worth optimizing for. + */ + if (attrnum == 0) + return false; + + /* + * Likewise, automatically say "not equal" for any system attribute + * other than OID and tableOID; we cannot expect these to be consistent + * in a HOT chain, or even to be set correctly yet in the new tuple. + */ + if (attrnum < 0) + { + if (attrnum != ObjectIdAttributeNumber && + attrnum != TableOidAttributeNumber) + return false; + } + + /* + * Extract the corresponding values. 
XXX this is pretty inefficient + * if there are many indexed columns. Should HeapSatisfiesHOTUpdate + * do a single heap_deform_tuple call on each tuple, instead? But + * that doesn't work for system columns ... + */ + value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1); + value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2); + + /* + * If one value is NULL and other is not, then they are certainly + * not equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. + */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = tupdesc->attrs[attrnum - 1]; + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } +} + +/* + * Check if the old and new tuples represent a HOT-safe update. To be able + * to do a HOT update, we must not have changed any columns used in index + * definitions. + * + * The set of attributes to be checked is passed in (we dare not try to + * compute it while holding exclusive buffer lock...) NOTE that hot_attrs + * is destructively modified! That is OK since this is invoked at most once + * by heap_update(). + * + * Returns true if safe to do HOT update. + */ +static bool +HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, + HeapTuple oldtup, HeapTuple newtup) +{ + int attrnum; + + while ((attrnum = bms_first_member(hot_attrs)) >= 0) + { + /* Adjust for system attributes */ + attrnum += FirstLowInvalidHeapAttributeNumber; + + /* If the attribute value has changed, we can't do HOT update */ + if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum, + oldtup, newtup)) + return false; + } + + return true; +} + +/* * simple_heap_update - replace a tuple * * This routine may be used to update a tuple when concurrent updates of @@ -2865,6 +3216,7 @@ l3: * avoids possibly generating a useless combo CID. */ tuple->t_data->t_infomask = new_infomask; + HeapTupleHeaderClearHotUpdated(tuple->t_data); HeapTupleHeaderSetXmax(tuple->t_data, xid); /* Make sure there is no forward chain link in t_ctid */ tuple->t_data->t_ctid = *tid; @@ -3110,6 +3462,7 @@ recheck_xmax: */ tuple->t_infomask &= ~HEAP_XMAX_COMMITTED; tuple->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderClearHotUpdated(tuple); changed = true; } } @@ -3245,21 +3598,29 @@ heap_restrpos(HeapScanDesc scan) * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * - * Note: for historical reasons, the entries in the unused[] array should - * be zero-based tuple indexes, not one-based. + * Note: prior to Postgres 8.3, the entries in the nowunused[] array were + * zero-based tuple indexes. Now they are one-based like other uses + * of OffsetNumber. 
*/ XLogRecPtr -log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) +log_heap_clean(Relation reln, Buffer buffer, + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused, + bool redirect_move) { xl_heap_clean xlrec; + uint8 info; XLogRecPtr recptr; - XLogRecData rdata[2]; + XLogRecData rdata[4]; /* Caller should not call me on a temp relation */ Assert(!reln->rd_istemp); xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); + xlrec.nredirected = nredirected; + xlrec.ndead = ndead; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapClean; @@ -3267,14 +3628,17 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) rdata[0].next = &(rdata[1]); /* - * The unused-offsets array is not actually in the buffer, but pretend - * that it is. When XLogInsert stores the whole buffer, the offsets array - * need not be stored too. + * The OffsetNumber arrays are not actually in the buffer, but we pretend + * that they are. When XLogInsert stores the whole buffer, the offset + * arrays need not be stored too. Note that even if all three arrays + * are empty, we want to expose the buffer as a candidate for whole-page + * storage, since this record type implies a defragmentation operation + * even if no item pointers changed state. */ - if (uncnt > 0) + if (nredirected > 0) { - rdata[1].data = (char *) unused; - rdata[1].len = uncnt * sizeof(OffsetNumber); + rdata[1].data = (char *) redirected; + rdata[1].len = nredirected * sizeof(OffsetNumber) * 2; } else { @@ -3283,9 +3647,38 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) } rdata[1].buffer = buffer; rdata[1].buffer_std = true; - rdata[1].next = NULL; + rdata[1].next = &(rdata[2]); + + if (ndead > 0) + { + rdata[2].data = (char *) nowdead; + rdata[2].len = ndead * sizeof(OffsetNumber); + } + else + { + rdata[2].data = NULL; + rdata[2].len = 0; + } + rdata[2].buffer = buffer; + rdata[2].buffer_std = true; + rdata[2].next = &(rdata[3]); + + if (nunused > 0) + { + rdata[3].data = (char *) nowunused; + rdata[3].len = nunused * sizeof(OffsetNumber); + } + else + { + rdata[3].data = NULL; + rdata[3].len = 0; + } + rdata[3].buffer = buffer; + rdata[3].buffer_std = true; + rdata[3].next = NULL; - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata); + info = redirect_move ? XLOG_HEAP2_CLEAN_MOVE : XLOG_HEAP2_CLEAN; + recptr = XLogInsert(RM_HEAP2_ID, info, rdata); return recptr; } @@ -3293,8 +3686,6 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) /* * Perform XLogInsert for a heap-freeze operation. Caller must already * have modified the buffer and marked it dirty. - * - * Unlike log_heap_clean(), the offsets[] entries are one-based. */ XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, @@ -3363,17 +3754,28 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, } xlhdr; int hsize = SizeOfHeapHeader; xl_heap_update xlrec; + uint8 info; XLogRecPtr recptr; XLogRecData rdata[4]; Page page = BufferGetPage(newbuf); - uint8 info = (move) ? 
XLOG_HEAP_MOVE : XLOG_HEAP_UPDATE; /* Caller should not call me on a temp relation */ Assert(!reln->rd_istemp); + if (move) + { + Assert(!HeapTupleIsHeapOnly(newtup)); + info = XLOG_HEAP_MOVE; + } + else if (HeapTupleIsHeapOnly(newtup)) + info = XLOG_HEAP_HOT_UPDATE; + else + info = XLOG_HEAP_UPDATE; + xlrec.target.node = reln->rd_node; xlrec.target.tid = from; xlrec.newtid = newtup->t_self; + rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; rdata[0].buffer = InvalidBuffer; @@ -3489,13 +3891,21 @@ log_newpage(RelFileNode *rnode, BlockNumber blkno, Page page) return recptr; } +/* + * Handles CLEAN and CLEAN_MOVE record types + */ static void -heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move) { xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record); Relation reln; Buffer buffer; Page page; + OffsetNumber *offnum; + OffsetNumber *end; + int nredirected; + int ndead; + int i; if (record->xl_info & XLR_BKP_BLOCK_1) return; @@ -3512,25 +3922,63 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) return; } - if (record->xl_len > SizeOfHeapClean) - { - OffsetNumber *unused; - OffsetNumber *unend; - ItemId lp; + nredirected = xlrec->nredirected; + ndead = xlrec->ndead; + offnum = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean); + end = (OffsetNumber *) ((char *) xlrec + record->xl_len); - unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean); - unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); + /* Update all redirected or moved line pointers */ + for (i = 0; i < nredirected; i++) + { + OffsetNumber fromoff = *offnum++; + OffsetNumber tooff = *offnum++; + ItemId fromlp = PageGetItemId(page, fromoff); - while (unused < unend) + if (clean_move) { - /* unused[] entries are zero-based */ - lp = PageGetItemId(page, *unused + 1); - ItemIdSetUnused(lp); - unused++; + /* Physically move the "to" item to the "from" slot */ + ItemId tolp = PageGetItemId(page, tooff); + HeapTupleHeader htup; + + *fromlp = *tolp; + ItemIdSetUnused(tolp); + + /* We also have to clear the tuple's heap-only bit */ + Assert(ItemIdIsNormal(fromlp)); + htup = (HeapTupleHeader) PageGetItem(page, fromlp); + Assert(HeapTupleHeaderIsHeapOnly(htup)); + HeapTupleHeaderClearHeapOnly(htup); + } + else + { + /* Just insert a REDIRECT link at fromoff */ + ItemIdSetRedirect(fromlp, tooff); } } - PageRepairFragmentation(page, NULL); + /* Update all now-dead line pointers */ + for (i = 0; i < ndead; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetDead(lp); + } + + /* Update all now-unused line pointers */ + while (offnum < end) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetUnused(lp); + } + + /* + * Finally, repair any fragmentation, and update the page's hint bit + * about whether it has free pointers. 
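To make the record layout just described concrete, the following sketch packs the three offset arrays behind a small fixed header and walks them back out in the same order the redo routine does, with the number of now-unused entries implied by the total length. CleanHeader, OffsetNum and both functions are simplified stand-ins for the real xl_heap_clean machinery, and the sketch assumes the buffer is adequately aligned for 2-byte reads, as WAL buffers are.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint16_t OffsetNum;          /* stand-in for OffsetNumber */

typedef struct CleanHeader           /* stand-in for xl_heap_clean */
{
    uint16_t nredirected;
    uint16_t ndead;
    /* nunused is not stored: it is implied by the total record length */
} CleanHeader;

/* Pack header + redirect pairs + dead + unused into buf; return total bytes. */
static size_t
pack_clean_record(char *buf,
                  const OffsetNum *redirected, int nredirected,
                  const OffsetNum *nowdead, int ndead,
                  const OffsetNum *nowunused, int nunused)
{
    CleanHeader hdr = { (uint16_t) nredirected, (uint16_t) ndead };
    char       *p = buf;

    memcpy(p, &hdr, sizeof(hdr));
    p += sizeof(hdr);
    memcpy(p, redirected, nredirected * 2 * sizeof(OffsetNum));
    p += nredirected * 2 * sizeof(OffsetNum);
    memcpy(p, nowdead, ndead * sizeof(OffsetNum));
    p += ndead * sizeof(OffsetNum);
    memcpy(p, nowunused, nunused * sizeof(OffsetNum));
    p += nunused * sizeof(OffsetNum);
    return (size_t) (p - buf);
}

/* Walk the record the way the redo code does: redirect pairs first,
 * then dead entries, then everything up to the end is "unused". */
static void
replay_clean_record(const char *buf, size_t len)
{
    CleanHeader hdr;
    const OffsetNum *off, *end;

    memcpy(&hdr, buf, sizeof(hdr));
    off = (const OffsetNum *) (buf + sizeof(hdr));
    end = (const OffsetNum *) (buf + len);

    for (int i = 0; i < hdr.nredirected; i++)
    {
        OffsetNum from = *off++;
        OffsetNum to = *off++;

        printf("redirect %u -> %u\n", (unsigned) from, (unsigned) to);
    }
    for (int i = 0; i < hdr.ndead; i++)
        printf("dead %u\n", (unsigned) *off++);
    while (off < end)
        printf("unused %u\n", (unsigned) *off++);
}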
+ */ + PageRepairFragmentation(page); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); @@ -3655,8 +4103,13 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, record->xl_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page); + /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; PageSetLSN(page, lsn); @@ -3736,7 +4189,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) HeapTupleHeaderSetCmin(htup, FirstCommandId); htup->t_ctid = xlrec->target.tid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true); + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_insert_redo: failed to add tuple"); PageSetLSN(page, lsn); @@ -3746,10 +4199,10 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) } /* - * Handles UPDATE & MOVE + * Handles UPDATE, HOT_UPDATE & MOVE */ static void -heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) +heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) { xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); Relation reln = XLogOpenRelation(xlrec->target.node); @@ -3808,6 +4261,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) HEAP_XMIN_INVALID | HEAP_MOVED_IN); htup->t_infomask |= HEAP_MOVED_OFF; + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXvac(htup, record->xl_xid); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; @@ -3819,12 +4273,19 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, record->xl_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Set forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; } + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page); + /* * this test is ugly, but necessary to avoid thinking that insert change * is already applied @@ -3914,7 +4375,7 @@ newsame:; /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true); + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); PageSetLSN(page, lsn); @@ -3971,6 +4432,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) htup->t_infomask |= HEAP_XMAX_SHARED_LOCK; else htup->t_infomask |= HEAP_XMAX_EXCL_LOCK; + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Make sure there is no forward chain link in t_ctid */ @@ -4039,25 +4501,35 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP_INSERT) - heap_xlog_insert(lsn, record); - else if (info == XLOG_HEAP_DELETE) - heap_xlog_delete(lsn, record); - else if (info == XLOG_HEAP_UPDATE) - heap_xlog_update(lsn, record, false); - else if (info == XLOG_HEAP_MOVE) - heap_xlog_update(lsn, record, true); - else if (info == XLOG_HEAP_CLEAN) - heap_xlog_clean(lsn, record); - else if (info == 
XLOG_HEAP_NEWPAGE) - heap_xlog_newpage(lsn, record); - else if (info == XLOG_HEAP_LOCK) - heap_xlog_lock(lsn, record); - else if (info == XLOG_HEAP_INPLACE) - heap_xlog_inplace(lsn, record); - else - elog(PANIC, "heap_redo: unknown op code %u", info); + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP_INSERT: + heap_xlog_insert(lsn, record); + break; + case XLOG_HEAP_DELETE: + heap_xlog_delete(lsn, record); + break; + case XLOG_HEAP_UPDATE: + heap_xlog_update(lsn, record, false, false); + break; + case XLOG_HEAP_MOVE: + heap_xlog_update(lsn, record, true, false); + break; + case XLOG_HEAP_HOT_UPDATE: + heap_xlog_update(lsn, record, false, true); + break; + case XLOG_HEAP_NEWPAGE: + heap_xlog_newpage(lsn, record); + break; + case XLOG_HEAP_LOCK: + heap_xlog_lock(lsn, record); + break; + case XLOG_HEAP_INPLACE: + heap_xlog_inplace(lsn, record); + break; + default: + elog(PANIC, "heap_redo: unknown op code %u", info); + } } void @@ -4065,11 +4537,20 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP2_FREEZE) - heap_xlog_freeze(lsn, record); - else - elog(PANIC, "heap2_redo: unknown op code %u", info); + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP2_FREEZE: + heap_xlog_freeze(lsn, record); + break; + case XLOG_HEAP2_CLEAN: + heap_xlog_clean(lsn, record, false); + break; + case XLOG_HEAP2_CLEAN_MOVE: + heap_xlog_clean(lsn, record, true); + break; + default: + elog(PANIC, "heap2_redo: unknown op code %u", info); + } } static void @@ -4130,13 +4611,18 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) ItemPointerGetBlockNumber(&(xlrec->newtid)), ItemPointerGetOffsetNumber(&(xlrec->newtid))); } - else if (info == XLOG_HEAP_CLEAN) + else if (info == XLOG_HEAP_HOT_UPDATE) { - xl_heap_clean *xlrec = (xl_heap_clean *) rec; + xl_heap_update *xlrec = (xl_heap_update *) rec; - appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); + if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? 
*/ + appendStringInfo(buf, "hot_update(init): "); + else + appendStringInfo(buf, "hot_update: "); + out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "; new %u/%u", + ItemPointerGetBlockNumber(&(xlrec->newtid)), + ItemPointerGetOffsetNumber(&(xlrec->newtid))); } else if (info == XLOG_HEAP_NEWPAGE) { @@ -4187,6 +4673,22 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.relNode, xlrec->block, xlrec->cutoff_xid); } + else if (info == XLOG_HEAP2_CLEAN) + { + xl_heap_clean *xlrec = (xl_heap_clean *) rec; + + appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } + else if (info == XLOG_HEAP2_CLEAN_MOVE) + { + xl_heap_clean *xlrec = (xl_heap_clean *) rec; + + appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } else appendStringInfo(buf, "UNKNOWN"); } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 6dbdf13fbe0..cd13d8f87c9 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.66 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.67 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -41,7 +41,7 @@ RelationPutHeapTuple(Relation relation, pageHeader = BufferGetPage(buffer); offnum = PageAddItem(pageHeader, (Item) tuple->t_data, - tuple->t_len, InvalidOffsetNumber, false); + tuple->t_len, InvalidOffsetNumber, false, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple to page"); @@ -218,7 +218,7 @@ RelationGetBufferForTuple(Relation relation, Size len, * we're done. 
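Stripped of buffer management, the free-space test that RelationGetBufferForTuple performs above reduces to the one-liner below; this is only a sketch, with free_space standing in for what PageGetHeapFreeSpace reports and reserved for the fillfactor-derived slack, both names being illustrative.

#include <stdbool.h>
#include <stddef.h>

/* Can a candidate heap page accept a new tuple without eating into the
 * relation's fillfactor reservation? */
static bool
page_can_fit(size_t tuple_len, size_t free_space, size_t reserved)
{
    return tuple_len + reserved <= free_space;
}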
*/ pageHeader = (Page) BufferGetPage(buffer); - pageFreeSpace = PageGetFreeSpace(pageHeader); + pageFreeSpace = PageGetHeapFreeSpace(pageHeader); if (len + saveFreeSpace <= pageFreeSpace) { /* use this page as future insert target, too */ @@ -311,7 +311,7 @@ RelationGetBufferForTuple(Relation relation, Size len, PageInit(pageHeader, BufferGetPageSize(buffer), 0); - if (len > PageGetFreeSpace(pageHeader)) + if (len > PageGetHeapFreeSpace(pageHeader)) { /* We should not get here given the test at the top */ elog(PANIC, "tuple is too big: size %lu", (unsigned long) len); diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c new file mode 100644 index 00000000000..d5496689003 --- /dev/null +++ b/src/backend/access/heap/pruneheap.c @@ -0,0 +1,702 @@ +/*------------------------------------------------------------------------- + * + * pruneheap.c + * heap page pruning and HOT-chain management code + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.1 2007/09/20 17:56:30 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/transam.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/inval.h" + + +/* Local functions */ +static int heap_prune_chain(Relation relation, Buffer buffer, + OffsetNumber rootoffnum, + TransactionId OldestXmin, + OffsetNumber *redirected, int *nredirected, + OffsetNumber *nowdead, int *ndead, + OffsetNumber *nowunused, int *nunused, + bool redirect_move); +static void heap_prune_record_redirect(OffsetNumber *redirected, + int *nredirected, + OffsetNumber offnum, + OffsetNumber rdoffnum); +static void heap_prune_record_dead(OffsetNumber *nowdead, int *ndead, + OffsetNumber offnum); +static void heap_prune_record_unused(OffsetNumber *nowunused, int *nunused, + OffsetNumber offnum); + + +/* + * Optionally prune and repair fragmentation in the specified page. + * + * This is an opportunistic function. It will perform housekeeping + * only if the page heuristically looks like a candidate for pruning and we + * can acquire buffer cleanup lock without blocking. + * + * Note: this is called quite often. It's important that it fall out quickly + * if there's not any use in pruning. + * + * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD + * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + */ +void +heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) +{ + PageHeader dp = (PageHeader) BufferGetPage(buffer); + Size minfree; + + /* + * Let's see if we really need pruning. + * + * Forget it if page is not hinted to contain something prunable + */ + if (!PageIsPrunable(dp)) + return; + + /* + * We prune when a previous UPDATE failed to find enough space on the + * page for a new tuple version, or when free space falls below the + * relation's fill-factor target (but not less than 10%). + * + * Checking free space here is questionable since we aren't holding + * any lock on the buffer; in the worst case we could get a bogus + * answer. It's unlikely to be *seriously* wrong, though, since + * reading either pd_lower or pd_upper is probably atomic. 
Avoiding + * taking a lock seems better than sometimes getting a wrong answer + * in what is after all just a heuristic estimate. + */ + minfree = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + minfree = Max(minfree, BLCKSZ / 10); + + if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree) + { + /* OK, try to get exclusive buffer lock */ + if (!ConditionalLockBufferForCleanup(buffer)) + return; + + /* + * Now that we have buffer lock, get accurate information about the + * page's free space, and recheck the heuristic about whether to prune. + */ + if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree) + { + /* OK to prune (though not to remove redirects) */ + (void) heap_page_prune(relation, buffer, OldestXmin, false, true); + } + + /* And release buffer lock */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } +} + + +/* + * Prune and repair fragmentation in the specified page. + * + * Caller must have pin and buffer cleanup lock on the page. + * + * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD + * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + * + * If redirect_move is set, we remove redirecting line pointers by + * updating the root line pointer to point directly to the first non-dead + * tuple in the chain. NOTE: eliminating the redirect changes the first + * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL. + * The only reason we support this capability at all is that by using it, + * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a + * good thing since VACUUM FULL is overly complicated already. + * + * If report_stats is true then we send the number of reclaimed heap-only + * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will + * send its own new total to pgstats, and we don't want this delta applied + * on top of that.) + * + * Returns the number of tuples deleted from the page. + */ +int +heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, + bool redirect_move, bool report_stats) +{ + int ndeleted = 0; + Page page = BufferGetPage(buffer); + OffsetNumber offnum, + maxoff; + OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPage]; + OffsetNumber nowunused[MaxHeapTuplesPerPage]; + int nredirected = 0; + int ndead = 0; + int nunused = 0; + + START_CRIT_SECTION(); + + /* + * Mark the page as clear of prunable tuples. If we find a tuple which + * may soon become prunable, we shall set the hint again. Also clear + * the "page is full" flag, since there's no point in repeating the + * prune/defrag process until something else happens to the page. + */ + PageClearPrunable(page); + PageClearFull(page); + + /* Scan the page */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + /* Nothing to do if slot is empty or already dead */ + if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) + continue; + + /* Process this item or chain of items */ + ndeleted += heap_prune_chain(relation, buffer, offnum, + OldestXmin, + redirected, &nredirected, + nowdead, &ndead, + nowunused, &nunused, + redirect_move); + } + + /* Have we pruned any items? */ + if (nredirected > 0 || ndead > 0 || nunused > 0) + { + /* + * Repair page fragmentation, and update the page's hint bit about + * whether it has free line pointers. 
+ */ + PageRepairFragmentation((Page) page); + + MarkBufferDirty(buffer); + + /* + * Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did + */ + if (!relation->rd_istemp) + { + XLogRecPtr recptr; + + recptr = log_heap_clean(relation, buffer, + redirected, nredirected, + nowdead, ndead, + nowunused, nunused, + redirect_move); + PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); + PageSetLSN(BufferGetPage(buffer), recptr); + } + } + + END_CRIT_SECTION(); + + /* + * If requested, report the number of tuples reclaimed to pgstats. + * This is ndeleted minus ndead, because we don't want to count a now-DEAD + * root item as a deletion for this purpose. + */ + if (report_stats && ndeleted > ndead) + pgstat_update_heap_dead_tuples(relation, ndeleted - ndead); + + /* + * XXX Should we update the FSM information of this page ? + * + * There are two schools of thought here. We may not want to update + * FSM information so that the page is not used for unrelated + * UPDATEs/INSERTs and any free space in this page will remain + * available for further UPDATEs in *this* page, thus improving + * chances for doing HOT updates. + * + * But for a large table and where a page does not receive further + * UPDATEs for a long time, we might waste this space by not + * updating the FSM information. The relation may get extended and + * fragmented further. + * + * One possibility is to leave "fillfactor" worth of space in this + * page and update FSM with the remaining space. + * + * In any case, the current FSM implementation doesn't accept + * one-page-at-a-time updates, so this is all academic for now. + */ + + return ndeleted; +} + + +/* + * Prune specified item pointer or a HOT chain originating at that item. + * + * If the item is an index-referenced tuple (i.e. not a heap-only tuple), + * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT + * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. + * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really + * DEAD, the OldestXmin test is just too coarse to detect it. + * + * The root line pointer is redirected to the tuple immediately after the + * latest DEAD tuple. If all tuples in the chain are DEAD, the root line + * pointer is marked LP_DEAD. (This includes the case of a DEAD simple + * tuple, which we treat as a chain of length 1.) + * + * OldestXmin is the cutoff XID used to identify dead tuples. + * + * Redirected items are added to the redirected[] array (two entries per + * redirection); items set to LP_DEAD state are added to nowdead[]; and + * items set to LP_UNUSED state are added to nowunused[]. (These arrays + * will be used to generate a WAL record after all chains are pruned.) + * + * If redirect_move is true, we get rid of redirecting line pointers. + * + * Returns the number of tuples deleted from the page. 
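The bookkeeping described above, redirect pairs plus now-dead and now-unused offsets collected while scanning the page and later handed to the WAL logger, amounts to three append-only arrays. A minimal sketch follows, with MAX_ITEMS standing in for MaxHeapTuplesPerPage and all names invented.

#include <assert.h>
#include <stdint.h>

#define MAX_ITEMS 256                 /* stand-in for MaxHeapTuplesPerPage */
typedef uint16_t OffsetNum;

/* Per-page prune results. Redirects are stored as (from, to) pairs,
 * hence twice the capacity. */
typedef struct PruneResult
{
    OffsetNum redirected[MAX_ITEMS * 2];
    OffsetNum nowdead[MAX_ITEMS];
    OffsetNum nowunused[MAX_ITEMS];
    int       nredirected;
    int       ndead;
    int       nunused;
} PruneResult;

static void
record_redirect(PruneResult *r, OffsetNum from, OffsetNum to)
{
    assert(r->nredirected < MAX_ITEMS);
    r->redirected[r->nredirected * 2] = from;
    r->redirected[r->nredirected * 2 + 1] = to;
    r->nredirected++;
}

static void
record_dead(PruneResult *r, OffsetNum off)
{
    assert(r->ndead < MAX_ITEMS);
    r->nowdead[r->ndead++] = off;
}

static void
record_unused(PruneResult *r, OffsetNum off)
{
    assert(r->nunused < MAX_ITEMS);
    r->nowunused[r->nunused++] = off;
}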
+ */ +static int +heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, + TransactionId OldestXmin, + OffsetNumber *redirected, int *nredirected, + OffsetNumber *nowdead, int *ndead, + OffsetNumber *nowunused, int *nunused, + bool redirect_move) +{ + int ndeleted = 0; + Page dp = (Page) BufferGetPage(buffer); + TransactionId priorXmax = InvalidTransactionId; + ItemId rootlp; + HeapTupleHeader htup; + OffsetNumber latestdead = InvalidOffsetNumber, + maxoff = PageGetMaxOffsetNumber(dp), + offnum; + OffsetNumber chainitems[MaxHeapTuplesPerPage]; + int nchain = 0, + i; + + rootlp = PageGetItemId(dp, rootoffnum); + + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. + */ + if (ItemIdIsNormal(rootlp)) + { + htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + if (HeapTupleHeaderIsHeapOnly(htup)) + { + /* + * If the tuple is DEAD and doesn't chain to anything else, mark it + * unused immediately. (If it does chain, we can only remove it as + * part of pruning its chain.) + * + * We need this primarily to handle aborted HOT updates, that is, + * XMIN_INVALID heap-only tuples. Those might not be linked to + * by any chain, since the parent tuple might be re-updated before + * any pruning occurs. So we have to be able to reap them + * separately from chain-pruning. + * + * Note that we might first arrive at a dead heap-only tuple + * either here or while following a chain below. Whichever path + * gets there first will mark the tuple unused. + */ + if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer) + == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) + { + ItemIdSetUnused(rootlp); + heap_prune_record_unused(nowunused, nunused, rootoffnum); + ndeleted++; + } + + /* Nothing more to do */ + return ndeleted; + } + } + + /* Start from the root tuple */ + offnum = rootoffnum; + + /* while not end of the chain */ + for (;;) + { + ItemId lp; + bool tupdead, + recent_dead; + + /* Some sanity checks */ + if (offnum < FirstOffsetNumber || offnum > maxoff) + break; + + lp = PageGetItemId(dp, offnum); + + if (!ItemIdIsUsed(lp)) + break; + + /* + * If we are looking at the redirected root line pointer, + * jump to the first normal tuple in the chain. If we find + * a redirect somewhere else, stop --- it must not be same chain. + */ + if (ItemIdIsRedirected(lp)) + { + if (nchain > 0) + break; /* not at start of chain */ + chainitems[nchain++] = offnum; + offnum = ItemIdGetRedirect(rootlp); + continue; + } + + /* + * Likewise, a dead item pointer can't be part of the chain. + * (We already eliminated the case of dead root tuple outside + * this function.) + */ + if (ItemIdIsDead(lp)) + break; + + Assert(ItemIdIsNormal(lp)); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Check the tuple XMIN against prior XMAX, if any + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + break; + + /* + * OK, this tuple is indeed a member of the chain. + */ + chainitems[nchain++] = offnum; + + /* + * Check tuple's visibility status. + */ + tupdead = recent_dead = false; + + switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)) + { + case HEAPTUPLE_DEAD: + tupdead = true; + break; + + case HEAPTUPLE_RECENTLY_DEAD: + recent_dead = true; + /* + * This tuple may soon become DEAD. Re-set the hint bit so + * that the page is reconsidered for pruning in future. + */ + PageSetPrunable(dp); + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* + * This tuple may soon become DEAD. 
Re-set the hint bit so + * that the page is reconsidered for pruning in future. + */ + PageSetPrunable(dp); + break; + + case HEAPTUPLE_LIVE: + case HEAPTUPLE_INSERT_IN_PROGRESS: + /* + * If we wanted to optimize for aborts, we might consider + * marking the page prunable when we see INSERT_IN_PROGRESS. + * But we don't. See related decisions about when to mark + * the page prunable in heapam.c. + */ + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + /* + * Remember the last DEAD tuple seen. We will advance past + * RECENTLY_DEAD tuples just in case there's a DEAD one after them; + * but we can't advance past anything else. (XXX is it really worth + * continuing to scan beyond RECENTLY_DEAD? The case where we will + * find another DEAD tuple is a fairly unusual corner case.) + */ + if (tupdead) + latestdead = offnum; + else if (!recent_dead) + break; + + /* + * If the tuple is not HOT-updated, then we are at the end of this + * HOT-update chain. + */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* + * Advance to next chain member. + */ + Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == + BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + + /* + * If we found a DEAD tuple in the chain, adjust the HOT chain so that all + * the DEAD tuples at the start of the chain are removed and the root line + * pointer is appropriately redirected. + */ + if (OffsetNumberIsValid(latestdead)) + { + /* + * Mark as unused each intermediate item that we are able to remove + * from the chain. + * + * When the previous item is the last dead tuple seen, we are at + * the right candidate for redirection. + */ + for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) + { + ItemId lp = PageGetItemId(dp, chainitems[i]); + + ItemIdSetUnused(lp); + heap_prune_record_unused(nowunused, nunused, chainitems[i]); + ndeleted++; + } + + /* + * If the root entry had been a normal tuple, we are deleting it, + * so count it in the result. But changing a redirect (even to + * DEAD state) doesn't count. + */ + if (ItemIdIsNormal(rootlp)) + ndeleted++; + + /* + * If the DEAD tuple is at the end of the chain, the entire chain is + * dead and the root line pointer can be marked dead. Otherwise + * just redirect the root to the correct chain member. + */ + if (i >= nchain) + { + ItemIdSetDead(rootlp); + heap_prune_record_dead(nowdead, ndead, rootoffnum); + } + else + { + ItemIdSetRedirect(rootlp, chainitems[i]); + heap_prune_record_redirect(redirected, nredirected, + rootoffnum, + chainitems[i]); + } + } + else if (nchain < 2 && ItemIdIsRedirected(rootlp)) + { + /* + * We found a redirect item that doesn't point to a valid follow-on + * item. This can happen if the loop in heap_page_prune caused us + * to visit the dead successor of a redirect item before visiting + * the redirect item. We can clean up by setting the redirect item + * to DEAD state. + */ + ItemIdSetDead(rootlp); + heap_prune_record_dead(nowdead, ndead, rootoffnum); + } + + /* + * If requested, eliminate LP_REDIRECT items by moving tuples. Note that + * if the root item is LP_REDIRECT and doesn't point to a valid follow-on + * item, we already killed it above. 
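The chain-adjustment rule spelled out above can be shown on a toy model: given the visibility verdicts for one HOT chain in order, find the last DEAD member (advancing past RECENTLY_DEAD members in front of it), treat everything up to that point as removable, and either redirect the root to the first survivor or declare the whole chain dead. The enum and function names are invented for the example; locking and WAL are ignored entirely.

typedef enum { TUPLE_DEAD, TUPLE_RECENTLY_DEAD, TUPLE_LIVE } TupleStatus;

/* Outcome for the chain root: unchanged, redirected to a surviving member,
 * or dead because every member is removable. */
typedef enum { ROOT_UNCHANGED, ROOT_REDIRECT, ROOT_DEAD } RootAction;

/* Given the members of one HOT chain in order (root first), decide what to
 * do with the root and how many members can be removed. */
static RootAction
prune_chain(const TupleStatus *chain, int nchain,
            int *first_survivor, int *nremoved)
{
    int latestdead = -1;

    for (int i = 0; i < nchain; i++)
    {
        if (chain[i] == TUPLE_DEAD)
            latestdead = i;                 /* remember last DEAD member */
        else if (chain[i] != TUPLE_RECENTLY_DEAD)
            break;                          /* live member ends the prefix */
    }

    if (latestdead < 0)
    {
        *first_survivor = 0;
        *nremoved = 0;
        return ROOT_UNCHANGED;
    }

    *nremoved = latestdead + 1;

    if (latestdead == nchain - 1)
    {
        *first_survivor = -1;               /* nothing survives */
        return ROOT_DEAD;
    }

    *first_survivor = latestdead + 1;
    return ROOT_REDIRECT;
}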
+ */ + if (redirect_move && ItemIdIsRedirected(rootlp)) + { + OffsetNumber firstoffnum = ItemIdGetRedirect(rootlp); + ItemId firstlp = PageGetItemId(dp, firstoffnum); + HeapTupleData firsttup; + + Assert(ItemIdIsNormal(firstlp)); + /* Set up firsttup to reference the tuple at its existing CTID */ + firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp); + firsttup.t_len = ItemIdGetLength(firstlp); + ItemPointerSet(&firsttup.t_self, + BufferGetBlockNumber(buffer), + firstoffnum); + firsttup.t_tableOid = RelationGetRelid(relation); + + /* + * Mark the tuple for invalidation. Needed because we're changing + * its CTID. + */ + CacheInvalidateHeapTuple(relation, &firsttup); + + /* + * Change heap-only status of the tuple because after the line + * pointer manipulation, it's no longer a heap-only tuple, but is + * directly pointed to by index entries. + */ + Assert(HeapTupleIsHeapOnly(&firsttup)); + HeapTupleClearHeapOnly(&firsttup); + + /* Now move the item pointer */ + *rootlp = *firstlp; + ItemIdSetUnused(firstlp); + + /* + * If latestdead is valid, we have already recorded the redirection + * above. Otherwise, do it now. + * + * We don't record firstlp in the nowunused[] array, since the + * redirection entry is enough to tell heap_xlog_clean what to do. + */ + if (!OffsetNumberIsValid(latestdead)) + heap_prune_record_redirect(redirected, nredirected, rootoffnum, + firstoffnum); + } + + return ndeleted; +} + + +/* Record newly-redirected item pointer */ +static void +heap_prune_record_redirect(OffsetNumber *redirected, int *nredirected, + OffsetNumber offnum, OffsetNumber rdoffnum) +{ + Assert(*nredirected < MaxHeapTuplesPerPage); + redirected[*nredirected * 2] = offnum; + redirected[*nredirected * 2 + 1] = rdoffnum; + (*nredirected)++; +} + +/* Record newly-dead item pointer */ +static void +heap_prune_record_dead(OffsetNumber *nowdead, int *ndead, + OffsetNumber offnum) +{ + Assert(*ndead < MaxHeapTuplesPerPage); + nowdead[*ndead] = offnum; + (*ndead)++; +} + +/* Record newly-unused item pointer */ +static void +heap_prune_record_unused(OffsetNumber *nowunused, int *nunused, + OffsetNumber offnum) +{ + Assert(*nunused < MaxHeapTuplesPerPage); + nowunused[*nunused] = offnum; + (*nunused)++; +} + + +/* + * For all items in this page, find their respective root line pointers. + * If item k is part of a HOT-chain with root at item j, then we set + * root_offsets[k - 1] = j. + * + * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. + * We zero out all unused entries. + * + * The function must be called with at least share lock on the buffer, to + * prevent concurrent prune operations. + * + * Note: The information collected here is valid only as long as the caller + * holds a pin on the buffer. Once pin is released, a tuple might be pruned + * and reused by a completely unrelated tuple. + */ +void +heap_get_root_tuples(Page page, OffsetNumber *root_offsets) +{ + OffsetNumber offnum, maxoff; + + MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++) + { + ItemId lp = PageGetItemId(page, offnum); + HeapTupleHeader htup; + OffsetNumber nextoffnum; + TransactionId priorXmax; + + /* skip unused and dead items */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + continue; + + if (ItemIdIsNormal(lp)) + { + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Check if this tuple is part of a HOT-chain rooted at some other + * tuple. 
If so, skip it for now; we'll process it when we find + * its root. + */ + if (HeapTupleHeaderIsHeapOnly(htup)) + continue; + + /* + * This is either a plain tuple or the root of a HOT-chain. + * Remember it in the mapping. + */ + root_offsets[offnum - 1] = offnum; + + /* If it's not the start of a HOT-chain, we're done with it */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + continue; + + /* Set up to scan the HOT-chain */ + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + else + { + /* Must be a redirect item. We do not set its root_offsets entry */ + Assert(ItemIdIsRedirected(lp)); + /* Set up to scan the HOT-chain */ + nextoffnum = ItemIdGetRedirect(lp); + priorXmax = InvalidTransactionId; + } + + /* + * Now follow the HOT-chain and collect other tuples in the chain. + * + * Note: Even though this is a nested loop, the complexity of the + * function is O(N) because a tuple in the page should be visited not + * more than twice, once in the outer loop and once in HOT-chain + * chases. + */ + for (;;) + { + lp = PageGetItemId(page, nextoffnum); + + /* Check for broken chains */ + if (!ItemIdIsNormal(lp)) + break; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) + break; + + /* Remember the root line pointer for this item */ + root_offsets[nextoffnum - 1] = offnum; + + /* Advance to next chain member, if any */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + } +} diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 60aab58de38..e8c5eec50ac 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -96,7 +96,7 @@ * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.6 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.7 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -320,12 +320,14 @@ rewrite_heap_tuple(RewriteState state, * Copy the original tuple's visibility information into new_tuple. * * XXX we might later need to copy some t_infomask2 bits, too? + * Right now, we intentionally clear the HOT status bits. */ memcpy(&new_tuple->t_data->t_choice.t_heap, &old_tuple->t_data->t_choice.t_heap, sizeof(HeapTupleFields)); new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; + new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; new_tuple->t_data->t_infomask |= old_tuple->t_data->t_infomask & HEAP_XACT_MASK; @@ -593,7 +595,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* Now we can check to see if there's enough free space already. 
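The root-mapping logic shown just above is easier to see on a toy page model: every non-heap-only item is a potential chain root, and we follow its forward links, checking that each member's xmin matches the prior member's xmax, recording the root for every member reached. ToyItem and map_root_items are invented for the example, with -1 meaning "no root recorded".

#include <stdint.h>

/* Toy line-pointer/tuple model: next is the index of the next chain member
 * (-1 if none), heap_only marks tuples reachable only via a chain, and
 * xmin/xmax let us check chain continuity the way the real code matches a
 * member's xmin against its predecessor's xmax. */
typedef struct ToyItem
{
    int      next;
    int      heap_only;
    uint32_t xmin;
    uint32_t xmax;
} ToyItem;

/* For every item, record the index of its chain root (-1 if not reached).
 * Only non-heap-only items start chains; heap-only members are filled in
 * when their root's chain is followed. */
static void
map_root_items(const ToyItem *items, int nitems, int *root_of)
{
    for (int i = 0; i < nitems; i++)
        root_of[i] = -1;

    for (int root = 0; root < nitems; root++)
    {
        if (items[root].heap_only)
            continue;

        root_of[root] = root;

        int      cur = items[root].next;
        uint32_t prior_xmax = items[root].xmax;

        while (cur >= 0 && cur < nitems)
        {
            if (items[cur].xmin != prior_xmax)
                break;                      /* broken chain */
            root_of[cur] = root;
            prior_xmax = items[cur].xmax;
            cur = items[cur].next;
        }
    }
}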
*/ if (state->rs_buffer_valid) { - pageFreeSpace = PageGetFreeSpace(page); + pageFreeSpace = PageGetHeapFreeSpace(page); if (len + saveFreeSpace > pageFreeSpace) { @@ -628,7 +630,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* And now we can insert the tuple into the page */ newoff = PageAddItem(page, (Item) heaptup->t_data, len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0009739180c..7bf1e43cb45 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.62 2007/05/27 03:50:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.63 2007/09/20 17:56:30 tgl Exp $ * * NOTES * many of the old access method routines have been turned into @@ -21,6 +21,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "miscadmin.h" #include "pgstat.h" @@ -95,6 +96,9 @@ RelationGetIndexScan(Relation indexRelation, ItemPointerSetInvalid(&scan->xs_ctup.t_self); scan->xs_ctup.t_data = NULL; scan->xs_cbuf = InvalidBuffer; + scan->xs_prev_xmax = InvalidTransactionId; + scan->xs_next_hot = InvalidOffsetNumber; + scan->xs_hot_dead = false; /* * Let the AM fill in the key and any opaque data it wants. diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index d905013a5fc..fd727ca68c8 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.98 2007/05/27 03:50:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.99 2007/09/20 17:56:30 tgl Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relation OID @@ -64,6 +64,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "pgstat.h" #include "utils/relcache.h" @@ -313,6 +314,8 @@ index_rescan(IndexScanDesc scan, ScanKey key) scan->xs_cbuf = InvalidBuffer; } + scan->xs_next_hot = InvalidOffsetNumber; + scan->kill_prior_tuple = false; /* for safety */ FunctionCall2(procedure, @@ -370,6 +373,14 @@ index_markpos(IndexScanDesc scan) * NOTE: this only restores the internal scan state of the index AM. * The current result tuple (scan->xs_ctup) doesn't change. See comments * for ExecRestrPos(). + * + * NOTE: in the presence of HOT chains, mark/restore only works correctly + * if the scan's snapshot is MVCC-safe; that ensures that there's at most one + * returnable tuple in each HOT chain, and so restoring the prior state at the + * granularity of the index AM is sufficient. Since the only current user + * of mark/restore functionality is nodeMergejoin.c, this effectively means + * that merge-join plans only work for MVCC snapshots. This could be fixed + * if necessary, but for now it seems unimportant. 
* ---------------- */ void @@ -377,9 +388,13 @@ index_restrpos(IndexScanDesc scan) { FmgrInfo *procedure; + Assert(IsMVCCSnapshot(scan->xs_snapshot)); + SCAN_CHECKS; GET_SCAN_PROCEDURE(amrestrpos); + scan->xs_next_hot = InvalidOffsetNumber; + scan->kill_prior_tuple = false; /* for safety */ FunctionCall1(procedure, PointerGetDatum(scan)); @@ -398,72 +413,224 @@ HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { HeapTuple heapTuple = &scan->xs_ctup; + ItemPointer tid = &heapTuple->t_self; FmgrInfo *procedure; SCAN_CHECKS; GET_SCAN_PROCEDURE(amgettuple); - /* just make sure this is false... */ - scan->kill_prior_tuple = false; + /* + * We always reset xs_hot_dead; if we are here then either we are just + * starting the scan, or we previously returned a visible tuple, and in + * either case it's inappropriate to kill the prior index entry. + */ + scan->xs_hot_dead = false; for (;;) { - bool found; + OffsetNumber offnum; + bool at_chain_start; + Page dp; - /* - * The AM's gettuple proc finds the next tuple matching the scan keys. - */ - found = DatumGetBool(FunctionCall2(procedure, - PointerGetDatum(scan), - Int32GetDatum(direction))); + if (scan->xs_next_hot != InvalidOffsetNumber) + { + /* + * We are resuming scan of a HOT chain after having returned + * an earlier member. Must still hold pin on current heap page. + */ + Assert(BufferIsValid(scan->xs_cbuf)); + Assert(ItemPointerGetBlockNumber(tid) == + BufferGetBlockNumber(scan->xs_cbuf)); + Assert(TransactionIdIsValid(scan->xs_prev_xmax)); + offnum = scan->xs_next_hot; + at_chain_start = false; + scan->xs_next_hot = InvalidOffsetNumber; + } + else + { + bool found; + Buffer prev_buf; + + /* + * If we scanned a whole HOT chain and found only dead tuples, + * tell index AM to kill its entry for that TID. + */ + scan->kill_prior_tuple = scan->xs_hot_dead; + + /* + * The AM's gettuple proc finds the next index entry matching the + * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). 
+ */ + found = DatumGetBool(FunctionCall2(procedure, + PointerGetDatum(scan), + Int32GetDatum(direction))); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + + /* If we're out of index entries, break out of outer loop */ + if (!found) + break; + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Switch to correct buffer if we don't have it already */ + prev_buf = scan->xs_cbuf; + scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, + scan->heapRelation, + ItemPointerGetBlockNumber(tid)); + + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != scan->xs_cbuf) + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, + RecentGlobalXmin); + + /* Prepare to scan HOT chain starting at index-referenced offnum */ + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = true; + + /* We don't know what the first tuple's xmin should be */ + scan->xs_prev_xmax = InvalidTransactionId; + + /* Initialize flag to detect if all entries are dead */ + scan->xs_hot_dead = true; + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); - /* Reset kill flag immediately for safety */ - scan->kill_prior_tuple = false; + dp = (Page) BufferGetPage(scan->xs_cbuf); - if (!found) + /* Scan through possible multiple members of HOT-chain */ + for (;;) { - /* Release any held pin on a heap page */ - if (BufferIsValid(scan->xs_cbuf)) - { - ReleaseBuffer(scan->xs_cbuf); - scan->xs_cbuf = InvalidBuffer; - } - return NULL; /* failure exit */ - } + ItemId lp; + ItemPointer ctid; - pgstat_count_index_tuples(scan->indexRelation, 1); + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || + offnum > PageGetMaxOffsetNumber(dp)) + break; - /* - * Fetch the heap tuple and see if it matches the snapshot. - */ - if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot, - heapTuple, &scan->xs_cbuf, true, - scan->indexRelation)) - break; + lp = PageGetItemId(dp, offnum); - /* Skip if no undeleted tuple at this location */ - if (heapTuple->t_data == NULL) - continue; + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } - /* - * If we can't see it, maybe no one else can either. Check to see if - * the tuple is dead to all transactions. If so, signal the index AM - * to not return it on future indexscans. - * - * We told heap_release_fetch to keep a pin on the buffer, so we can - * re-access the tuple here. But we must re-lock the buffer first. - */ - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + /* + * We must initialize all of *heapTuple (ie, scan->xs_ctup) + * since it is returned to the executor on success. + */ + heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp); + heapTuple->t_len = ItemIdGetLength(lp); + ItemPointerSetOffsetNumber(tid, offnum); + heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation); + ctid = &heapTuple->t_data->t_ctid; + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. (This test + * should be unnecessary, since the chain root can't be removed + * while we have pin on the index entry, but let's make it anyway.) 
+ */ + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is + * broken. (Note: this test is not optional because it protects + * us against the case where the prior chain member's xmax + * aborted since we looked at it.) + */ + if (TransactionIdIsValid(scan->xs_prev_xmax) && + !TransactionIdEquals(scan->xs_prev_xmax, + HeapTupleHeaderGetXmin(heapTuple->t_data))) + break; + + /* If it's visible per the snapshot, we must return it */ + if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, + scan->xs_cbuf)) + { + /* + * If the snapshot is MVCC, we know that it could accept + * at most one member of the HOT chain, so we can skip + * examining any more members. Otherwise, check for + * continuation of the HOT-chain, and set state for next time. + */ + if (IsMVCCSnapshot(scan->xs_snapshot)) + scan->xs_next_hot = InvalidOffsetNumber; + else if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(ctid) == + ItemPointerGetBlockNumber(tid)); + scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid); + scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + } + else + scan->xs_next_hot = InvalidOffsetNumber; + + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + pgstat_count_heap_fetch(scan->indexRelation); + + return heapTuple; + } - if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin, - scan->xs_cbuf) == HEAPTUPLE_DEAD) - scan->kill_prior_tuple = true; + /* + * If we can't see it, maybe no one else can either. Check to see + * if the tuple is dead to all transactions. If we find that all + * the tuples in the HOT chain are dead, we'll signal the index AM + * to not return that TID on future indexscans. + */ + if (scan->xs_hot_dead && + HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin, + scan->xs_cbuf) != HEAPTUPLE_DEAD) + scan->xs_hot_dead = false; + + /* + * Check to see if HOT chain continues past this tuple; if so + * fetch the next offnum (we don't bother storing it into + * xs_next_hot, but must store xs_prev_xmax), and loop around. 
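Pulling the pieces of the new index fetch together, the sketch below walks one HOT chain starting from the index-referenced item: honour a redirect only at the chain start, check each member's xmin against the previous member's xmax, return the first member the caller's visibility test accepts, and report whether every member examined was dead to all transactions, which is what lets the scan kill the index entry. ChainTuple, visible_fn and hot_chain_first_visible are invented names; buffer pins and locks are omitted.

#include <stdbool.h>
#include <stdint.h>

typedef struct ChainTuple
{
    bool     used;            /* item slot in use at all */
    bool     redirect;        /* line pointer only: redirects to "next" */
    int      next;            /* next chain member, -1 if end of chain */
    uint32_t xmin;
    uint32_t xmax;
    bool     dead_to_all;     /* what HeapTupleSatisfiesVacuum would say */
} ChainTuple;

typedef bool (*visible_fn)(const ChainTuple *tup);

/* Return the offset of the first chain member visible under "visible",
 * or -1 if none. all_dead ends up true only if every member examined was
 * dead to every transaction. */
static int
hot_chain_first_visible(const ChainTuple *items, int nitems,
                        int root, visible_fn visible, bool *all_dead)
{
    bool     at_start = true;
    bool     have_prior = false;
    uint32_t prior_xmax = 0;
    int      off = root;

    *all_dead = true;

    while (off >= 0 && off < nitems && items[off].used)
    {
        const ChainTuple *tup = &items[off];

        if (tup->redirect)
        {
            if (!at_start)
                break;                    /* redirect mid-chain: broken */
            off = tup->next;
            at_start = false;
            continue;
        }

        if (have_prior && tup->xmin != prior_xmax)
            break;                        /* chain continuity violated */

        if (visible(tup))
        {
            *all_dead = false;
            return off;                   /* first visible member wins */
        }

        if (!tup->dead_to_all)
            *all_dead = false;

        prior_xmax = tup->xmax;
        have_prior = true;
        at_start = false;
        off = tup->next;
    }

    return -1;
}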
+ */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(ctid) == + ItemPointerGetBlockNumber(tid)); + offnum = ItemPointerGetOffsetNumber(ctid); + at_chain_start = false; + scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + } + else + break; /* end of chain */ + } /* loop over a single HOT chain */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + /* Loop around to ask index AM for another TID */ + scan->xs_next_hot = InvalidOffsetNumber; + } + + /* Release any held pin on a heap page */ + if (BufferIsValid(scan->xs_cbuf)) + { + ReleaseBuffer(scan->xs_cbuf); + scan->xs_cbuf = InvalidBuffer; } - /* Success exit */ - return heapTuple; + return NULL; /* failure exit */ } /* ---------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 7dbaa2c245f..5f7ecbe16da 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.159 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.160 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -193,8 +193,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ for (;;) { - HeapTupleData htup; - Buffer hbuffer; ItemId curitemid; IndexTuple curitup; BlockNumber nblkno; @@ -223,6 +221,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ if (!ItemIdIsDead(curitemid)) { + ItemPointerData htid; + bool all_dead; + /* * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's * how we handling NULLs - and so we must not use _bt_compare @@ -234,17 +235,20 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htup.t_self = curitup->t_tid; - if (heap_fetch(heapRel, &SnapshotDirty, &htup, &hbuffer, - true, NULL)) + htid = curitup->t_tid; + + /* + * We check the whole HOT-chain to see if there is any tuple + * that satisfies SnapshotDirty. This is necessary because + * we have just a single index entry for the entire chain. + */ + if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead)) { /* it is a duplicate */ TransactionId xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? SnapshotDirty.xmin : SnapshotDirty.xmax; - ReleaseBuffer(hbuffer); - /* * If this tuple is being updated by other transaction * then we have to wait for its commit/abort. @@ -263,15 +267,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * is itself now committed dead --- if so, don't complain. * This is a waste of time in normal scenarios but we must * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the exact + * tuple which triggered the insert, but that's OK because + * if we find a live tuple anywhere in this chain, we have + * a unique key conflict. The other live tuple is not part + * of this chain because it had a different index entry. 
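On top of a chain probe like the one sketched earlier, the uniqueness decision made here reduces to three outcomes. The simplification below deliberately leaves out the wait-for-in-progress-transaction and CREATE INDEX CONCURRENTLY paths that the real code handles; the names are invented.

#include <stdbool.h>

typedef enum { UNIQUE_OK, UNIQUE_CONFLICT, UNIQUE_ENTRY_KILLABLE } UniqueCheck;

/* chain_has_live_member stands in for the result of a heap_hot_search-style
 * probe of the conflicting HOT chain; chain_all_dead for its all_dead flag. */
static UniqueCheck
check_unique_against_chain(bool chain_has_live_member, bool chain_all_dead)
{
    if (chain_has_live_member)
        return UNIQUE_CONFLICT;        /* some version of the row is live */
    if (chain_all_dead)
        return UNIQUE_ENTRY_KILLABLE;  /* safe to mark the index entry dead */
    return UNIQUE_OK;                  /* recently dead: no conflict, keep entry */
}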
*/ - htup.t_self = itup->t_tid; - if (heap_fetch(heapRel, SnapshotSelf, &htup, &hbuffer, - false, NULL)) + htid = itup->t_tid; + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ - ReleaseBuffer(hbuffer); } - else if (htup.t_data != NULL) + else { /* * It's been deleted, so no error, and no need to @@ -279,39 +290,27 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ break; } - else - { - /* couldn't find the tuple?? */ - elog(ERROR, "failed to fetch tuple being inserted"); - } ereport(ERROR, (errcode(ERRCODE_UNIQUE_VIOLATION), errmsg("duplicate key value violates unique constraint \"%s\"", RelationGetRelationName(rel)))); } - else if (htup.t_data != NULL) + else if (all_dead) { /* - * Hmm, if we can't see the tuple, maybe it can be marked - * killed. This logic should match index_getnext and - * btgettuple. + * The conflicting tuple (or whole HOT chain) is dead to + * everyone, so we may as well mark the index entry + * killed. */ - LockBuffer(hbuffer, BUFFER_LOCK_SHARE); - if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin, - hbuffer) == HEAPTUPLE_DEAD) - { - ItemIdMarkDead(curitemid); - opaque->btpo_flags |= BTP_HAS_GARBAGE; - /* be sure to mark the proper buffer dirty... */ - if (nbuf != InvalidBuffer) - SetBufferCommitInfoNeedsSave(nbuf); - else - SetBufferCommitInfoNeedsSave(buf); - } - LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK); + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + /* be sure to mark the proper buffer dirty... */ + if (nbuf != InvalidBuffer) + SetBufferCommitInfoNeedsSave(nbuf); + else + SetBufferCommitInfoNeedsSave(buf); } - ReleaseBuffer(hbuffer); } } @@ -840,7 +839,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add hikey to the right sibling"); rightoff = OffsetNumberNext(rightoff); } @@ -865,7 +864,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, item = (IndexTuple) PageGetItem(origpage, itemid); } if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add hikey to the left sibling"); leftoff = OffsetNumberNext(leftoff); @@ -1700,7 +1699,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * benefit of _bt_restore_page(). */ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add leftkey to new root page"); pfree(new_item); @@ -1718,7 +1717,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * insert the right page pointer into the new root page. 
*/ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add rightkey to new root page"); pfree(new_item); @@ -1805,7 +1804,7 @@ _bt_pgaddtup(Relation rel, } if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add item to the %s for \"%s\"", where, RelationGetRelationName(rel)); } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 118dc22bb35..6293792b9f5 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -57,7 +57,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.112 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.113 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -400,7 +400,7 @@ _bt_sortaddtup(Page page, } if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to the index page"); } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index db64422b19f..499129c48f1 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.45 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.46 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -141,8 +141,8 @@ _bt_restore_page(Page page, char *from, int len) memcpy(&itupdata, from, sizeof(IndexTupleData)); itemsz = IndexTupleDSize(itupdata); itemsz = MAXALIGN(itemsz); - if (PageAddItem(page, (Item) from, itemsz, - FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber, + false, false) == InvalidOffsetNumber) elog(PANIC, "_bt_restore_page: cannot add item to page"); from += itemsz; } @@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, { if (PageAddItem(page, (Item) datapos, datalen, ItemPointerGetOffsetNumber(&(xlrec->target.tid)), - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "btree_insert_redo: failed to add item"); PageSetLSN(page, lsn); @@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot, if (onleft) { if (PageAddItem(lpage, newitem, newitemsz, newitemoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add new item to left page after split"); } @@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot, hiItem = PageGetItem(rpage, hiItemId); if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId), - P_HIKEY, false) == InvalidOffsetNumber) + P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); /* Fix opaque fields */ |
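Finally, the _bt_restore_page loop touched by this patch follows a simple pattern that the sketch below reproduces over a plain byte buffer: read a small header to learn each item's size, round it up to the platform alignment, hand the item to an add callback, and advance. PackedItemHeader, ITEM_ALIGN and restore_packed_items are invented stand-ins for IndexTupleData, MAXALIGN and PageAddItem.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ALIGNOF_ITEM 8
#define ITEM_ALIGN(sz) (((sz) + (ALIGNOF_ITEM - 1)) & ~((size_t) (ALIGNOF_ITEM - 1)))

/* Each packed item starts with a small header giving its size, loosely
 * modelled on IndexTupleData/IndexTupleDSize. */
typedef struct PackedItemHeader
{
    uint16_t size;
} PackedItemHeader;

/* Walk a run of packed, alignment-padded items, handing each to a callback
 * that stands in for PageAddItem. */
static void
restore_packed_items(const char *from, size_t len,
                     void (*add_item)(const char *item, size_t size))
{
    const char *end = from + len;

    while (from < end)
    {
        PackedItemHeader hdr;
        size_t itemsz;

        memcpy(&hdr, from, sizeof(hdr));
        itemsz = ITEM_ALIGN((size_t) hdr.size);
        if (itemsz == 0)
            break;                        /* defensive: malformed input */
        add_item(from, itemsz);
        from += itemsz;
    }
}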