author     Alvaro Herrera <alvherre@alvh.no-ip.org>  2022-04-07 21:04:36 +0200
committer  Alvaro Herrera <alvherre@alvh.no-ip.org>  2022-04-07 21:10:03 +0200
commit     99392cdd78b788295e52b9f4942fa11992fd5ba9 (patch)
tree       1c8929b166e87df54b3bd86e389a06be6674a154 /src/backend/executor
parent     dbe29b0d2c96f34b3f3222c6fc1710fcff065f18 (diff)
Rewrite some RI code to avoid using SPI

Modify the subroutines called by RI trigger functions that want to check if a
given referenced value exists in the referenced relation to simply scan the
foreign key constraint's unique index, instead of using SPI to execute

    SELECT 1 FROM referenced_relation WHERE ref_key = $1

This saves a lot of work, especially when inserting into or updating a
referencing relation.

This rewrite allows us to fix a PK row visibility bug caused by a partition
descriptor hack which requires ActiveSnapshot to be set to come up with the
correct set of partitions for the RI query running under REPEATABLE READ
isolation.  We now set that snapshot independently of the snapshot to be used
by the PK index scan, so the two no longer interfere.  The buggy output in
src/test/isolation/expected/fk-snapshot.out of the relevant test case added by
commit 00cb86e75d6d has been corrected.  (The bug still exists in branch 14,
however, but this fix is too invasive to backpatch.)

Author: Amit Langote <amitlangote09@gmail.com>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: Corey Huinker <corey.huinker@gmail.com>
Reviewed-by: Li Japin <japinli@hotmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Zhihong Yu <zyu@yugabyte.com>
Discussion: https://postgr.es/m/CA+HiwqGkfJfYdeq5vHPh6eqPKjSbfpDDY+j-kXYFePQedtSLeg@mail.gmail.com
Diffstat (limited to 'src/backend/executor')
-rw-r--r--  src/backend/executor/execPartition.c  174
-rw-r--r--  src/backend/executor/nodeLockRows.c    161
2 files changed, 258 insertions, 77 deletions
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 615bd809735..c22c9ac0966 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -176,8 +176,9 @@ static void FormPartitionKeyDatum(PartitionDispatch pd,
EState *estate,
Datum *values,
bool *isnull);
-static int get_partition_for_tuple(PartitionDispatch pd, Datum *values,
- bool *isnull);
+static int get_partition_for_tuple(PartitionKey key,
+ PartitionDesc partdesc,
+ Datum *values, bool *isnull);
static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
Datum *values,
bool *isnull,
@@ -318,7 +319,9 @@ ExecFindPartition(ModifyTableState *mtstate,
* these values, error out.
*/
if (partdesc->nparts == 0 ||
- (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
+ (partidx = get_partition_for_tuple(dispatch->key,
+ dispatch->partdesc,
+ values, isnull)) < 0)
{
char *val_desc;
@@ -1341,12 +1344,12 @@ FormPartitionKeyDatum(PartitionDispatch pd,
* found or -1 if none found.
*/
static int
-get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
+get_partition_for_tuple(PartitionKey key,
+ PartitionDesc partdesc,
+ Datum *values, bool *isnull)
{
int bound_offset;
int part_index = -1;
- PartitionKey key = pd->key;
- PartitionDesc partdesc = pd->partdesc;
PartitionBoundInfo boundinfo = partdesc->boundinfo;
/* Route as appropriate based on partitioning strategy. */
@@ -1439,6 +1442,165 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
}
/*
+ * ExecGetLeafPartitionForKey
+ * Finds the leaf partition of partitioned table 'root_rel' that would
+ * contain the specified key tuple.
+ *
+ * A subset of the table's columns (including all of the partition key columns)
+ * must be specified:
+ * - 'key_natts' indicates the number of columns contained in the key
+ * - 'key_attnums' indicates their attribute numbers as defined in 'root_rel'
+ * - 'key_vals' and 'key_nulls' specify the key tuple
+ *
+ * Returns the leaf partition, locked with the given lockmode, or NULL if
+ * there isn't one. Caller is responsible for closing it. All intermediate
+ * partitions are also locked with the same lockmode. Caller must have locked
+ * the root already.
+ *
+ * In addition, the OID of the index of a unique constraint on the root table
+ * must be given as 'root_idxoid'; *leaf_idxoid will be set to the OID of the
+ * corresponding index on the returned leaf partition. (This can be used by
+ * caller to search for a tuple matching the key in the leaf partition.)
+ *
+ * This works because the unique key defined on the root relation is required
+ * to contain the partition key columns of all of the ancestors that lead up to
+ * a given leaf partition.
+ */
+Relation
+ExecGetLeafPartitionForKey(Relation root_rel, int key_natts,
+ const AttrNumber *key_attnums,
+ Datum *key_vals, char *key_nulls,
+ Oid root_idxoid, int lockmode,
+ Oid *leaf_idxoid)
+{
+ Relation found_leafpart = NULL;
+ Relation rel = root_rel;
+ Oid constr_idxoid = root_idxoid;
+ PartitionDirectory partdir;
+
+ Assert(root_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+
+ *leaf_idxoid = InvalidOid;
+
+ partdir = CreatePartitionDirectory(CurrentMemoryContext, true);
+
+ /*
+ * Descend through partitioned parents to find the leaf partition that
+ * would accept a row with the provided key values, starting with the root
+ * parent.
+ */
+ for (;;)
+ {
+ PartitionKey partkey = RelationGetPartitionKey(rel);
+ PartitionDesc partdesc;
+ Datum partkey_vals[PARTITION_MAX_KEYS];
+ bool partkey_isnull[PARTITION_MAX_KEYS];
+ AttrNumber *root_partattrs = partkey->partattrs;
+ int found_att;
+ int partidx;
+ Oid partoid;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Collect partition key values from the unique key.
+ *
+ * Because we only have the root table's copy of key_attnums, we must
+ * map any non-root table's partition key attribute numbers to the root
+ * table's.
+ */
+ if (rel != root_rel)
+ {
+ /*
+ * map->attnums will contain root table attribute numbers for each
+ * attribute of the current partitioned relation.
+ */
+ AttrMap *map;
+
+ map = build_attrmap_by_name_if_req(RelationGetDescr(root_rel),
+ RelationGetDescr(rel));
+ if (map)
+ {
+ root_partattrs = palloc(partkey->partnatts *
+ sizeof(AttrNumber));
+ for (int att = 0; att < partkey->partnatts; att++)
+ {
+ AttrNumber partattno = partkey->partattrs[att];
+
+ root_partattrs[att] = map->attnums[partattno - 1];
+ }
+
+ free_attrmap(map);
+ }
+ }
+
+ /*
+ * Map the values/isnulls to match the partition description, as
+ * necessary.
+ *
+ * (Referenced key specification does not allow expressions, so there
+ * would not be expressions in the partition keys either.)
+ */
+ Assert(partkey->partexprs == NIL);
+ found_att = 0;
+ for (int keyatt = 0; keyatt < key_natts; keyatt++)
+ {
+ for (int att = 0; att < partkey->partnatts; att++)
+ {
+ if (root_partattrs[att] == key_attnums[keyatt])
+ {
+ partkey_vals[found_att] = key_vals[keyatt];
+ partkey_isnull[found_att] = (key_nulls[keyatt] == 'n');
+ found_att++;
+ break;
+ }
+ }
+ }
+ /* We had better have found values for all partition keys */
+ Assert(found_att == partkey->partnatts);
+
+ if (root_partattrs != partkey->partattrs)
+ pfree(root_partattrs);
+
+ /* Get the PartitionDesc using the partition directory machinery. */
+ partdesc = PartitionDirectoryLookup(partdir, rel);
+ if (partdesc->nparts == 0)
+ break;
+
+ /* Find the partition for the key. */
+ partidx = get_partition_for_tuple(partkey, partdesc,
+ partkey_vals, partkey_isnull);
+ Assert(partidx < 0 || partidx < partdesc->nparts);
+
+ /* close the previous parent if any, but keep lock */
+ if (rel != root_rel)
+ table_close(rel, NoLock);
+
+ /* No partition found. */
+ if (partidx < 0)
+ break;
+
+ partoid = partdesc->oids[partidx];
+ rel = table_open(partoid, lockmode);
+ constr_idxoid = index_get_partition(rel, constr_idxoid);
+
+ /*
+ * We're done if the partition is a leaf, else find its partition in
+ * the next iteration.
+ */
+ if (partdesc->is_leaf[partidx])
+ {
+ *leaf_idxoid = constr_idxoid;
+ found_leafpart = rel;
+ break;
+ }
+ }
+
+ DestroyPartitionDirectory(partdir);
+ return found_leafpart;
+}
+
+/*
* ExecBuildSlotPartitionKeyDescription
*
* This works very much like BuildIndexValueDescription() and is currently
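To make the contract of the new ExecGetLeafPartitionForKey above concrete, a
hypothetical caller could look like the following sketch.  root_rel,
root_idxoid, and the single int4 key column are assumptions for illustration;
the ' '/'n' null-flag convention follows the key_nulls check in the function
body, and the real RI caller is more elaborate.

/* Hypothetical usage sketch; root_rel and root_idxoid are assumed inputs. */
AttrNumber  key_attnums[1] = {1};           /* key column: attnum 1 of root_rel */
Datum       key_vals[1] = {Int32GetDatum(42)};
char        key_nulls[1] = {' '};           /* 'n' would mean NULL */
Oid         leaf_idxoid;
Relation    leaf;

leaf = ExecGetLeafPartitionForKey(root_rel, 1, key_attnums,
                                  key_vals, key_nulls,
                                  root_idxoid, RowShareLock,
                                  &leaf_idxoid);
if (leaf != NULL)
{
    /* ... scan leaf_idxoid on leaf for the key here ... */
    table_close(leaf, NoLock);  /* keep the lock until end of transaction */
}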
diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c
index 1a9dab25dd6..bbccafb2cfd 100644
--- a/src/backend/executor/nodeLockRows.c
+++ b/src/backend/executor/nodeLockRows.c
@@ -79,10 +79,7 @@ lnext:
Datum datum;
bool isNull;
ItemPointerData tid;
- TM_FailureData tmfd;
LockTupleMode lockmode;
- int lockflags = 0;
- TM_Result test;
TupleTableSlot *markSlot;
/* clear any leftover test tuple for this rel */
@@ -179,74 +176,11 @@ lnext:
break;
}
- lockflags = TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS;
- if (!IsolationUsesXactSnapshot())
- lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION;
-
- test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot,
- markSlot, estate->es_output_cid,
- lockmode, erm->waitPolicy,
- lockflags,
- &tmfd);
-
- switch (test)
- {
- case TM_WouldBlock:
- /* couldn't lock tuple in SKIP LOCKED mode */
- goto lnext;
-
- case TM_SelfModified:
-
- /*
- * The target tuple was already updated or deleted by the
- * current command, or by a later command in the current
- * transaction. We *must* ignore the tuple in the former
- * case, so as to avoid the "Halloween problem" of repeated
- * update attempts. In the latter case it might be sensible
- * to fetch the updated tuple instead, but doing so would
- * require changing heap_update and heap_delete to not
- * complain about updating "invisible" tuples, which seems
- * pretty scary (table_tuple_lock will not complain, but few
- * callers expect TM_Invisible, and we're not one of them). So
- * for now, treat the tuple as deleted and do not process.
- */
- goto lnext;
-
- case TM_Ok:
-
- /*
- * Got the lock successfully, the locked tuple saved in
- * markSlot for, if needed, EvalPlanQual testing below.
- */
- if (tmfd.traversed)
- epq_needed = true;
- break;
-
- case TM_Updated:
- if (IsolationUsesXactSnapshot())
- ereport(ERROR,
- (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
- errmsg("could not serialize access due to concurrent update")));
- elog(ERROR, "unexpected table_tuple_lock status: %u",
- test);
- break;
-
- case TM_Deleted:
- if (IsolationUsesXactSnapshot())
- ereport(ERROR,
- (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
- errmsg("could not serialize access due to concurrent update")));
- /* tuple was deleted so don't return it */
- goto lnext;
-
- case TM_Invisible:
- elog(ERROR, "attempted to lock invisible tuple");
- break;
-
- default:
- elog(ERROR, "unrecognized table_tuple_lock status: %u",
- test);
- }
+ /* skip tuple if it couldn't be locked */
+ if (!ExecLockTableTuple(erm->relation, &tid, markSlot,
+ estate->es_snapshot, estate->es_output_cid,
+ lockmode, erm->waitPolicy, &epq_needed))
+ goto lnext;
/* Remember locked tuple's TID for EPQ testing and WHERE CURRENT OF */
erm->curCtid = tid;
@@ -281,6 +215,91 @@ lnext:
return slot;
}
+/*
+ * ExecLockTableTuple
+ * Locks the tuple with the specified TID in the given lockmode, following
+ * the given wait policy
+ *
+ * Returns true if the tuple was successfully locked. The locked tuple is
+ * loaded into the provided slot.
+ */
+bool
+ExecLockTableTuple(Relation relation, ItemPointer tid, TupleTableSlot *slot,
+ Snapshot snapshot, CommandId cid,
+ LockTupleMode lockmode, LockWaitPolicy waitPolicy,
+ bool *epq_needed)
+{
+ TM_FailureData tmfd;
+ int lockflags = TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS;
+ TM_Result test;
+
+ if (!IsolationUsesXactSnapshot())
+ lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION;
+
+ test = table_tuple_lock(relation, tid, snapshot, slot, cid, lockmode,
+ waitPolicy, lockflags, &tmfd);
+
+ switch (test)
+ {
+ case TM_WouldBlock:
+ /* couldn't lock tuple in SKIP LOCKED mode */
+ return false;
+
+ case TM_SelfModified:
+
+ /*
+ * The target tuple was already updated or deleted by the current
+ * command, or by a later command in the current transaction. We
+ * *must* ignore the tuple in the former case, so as to avoid the
+ * "Halloween problem" of repeated update attempts. In the latter
+ * case it might be sensible to fetch the updated tuple instead,
+ * but doing so would require changing heap_update and heap_delete
+ * to not complain about updating "invisible" tuples, which seems
+ * pretty scary (table_tuple_lock will not complain, but few
+ * callers expect TM_Invisible, and we're not one of them). So for
+ * now, treat the tuple as deleted and do not process.
+ */
+ return false;
+
+ case TM_Ok:
+
+ /*
+ * Got the lock successfully; the locked tuple is saved in slot for
+ * EvalPlanQual testing, if the caller asked for it.
+ */
+ if (tmfd.traversed && epq_needed)
+ *epq_needed = true;
+ break;
+
+ case TM_Updated:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+ elog(ERROR, "unexpected table_tuple_lock status: %u",
+ test);
+ break;
+
+ case TM_Deleted:
+ if (IsolationUsesXactSnapshot())
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to concurrent update")));
+ /* tuple was deleted so don't return it */
+ return false;
+
+ case TM_Invisible:
+ elog(ERROR, "attempted to lock invisible tuple");
+ return false;
+
+ default:
+ elog(ERROR, "unrecognized table_tuple_lock status: %u", test);
+ return false;
+ }
+
+ return true;
+}
+
/* ----------------------------------------------------------------
* ExecInitLockRows
*
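For reference, a hypothetical caller of the extracted ExecLockTableTuple could
look like the sketch below.  rel, tid, and estate are assumed to be in scope,
and the exclusive lock mode and SKIP LOCKED wait policy are arbitrary
illustrative choices rather than anything prescribed by this commit.

/* Hypothetical usage sketch; rel, tid, and estate are assumed to exist. */
bool        epq_needed = false;
TupleTableSlot *slot = table_slot_create(rel, NULL);

if (ExecLockTableTuple(rel, &tid, slot,
                       estate->es_snapshot, estate->es_output_cid,
                       LockTupleExclusive, LockWaitSkip, &epq_needed))
{
    /* slot now holds the locked tuple; epq_needed says if EPQ is required */
}
else
{
    /* tuple was concurrently deleted, self-modified, or skipped (SKIP LOCKED) */
}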