Tighten TAP tests' tracking of postmaster state some more.

Commits 6c4a8903b et al. had a couple of deficiencies: * The logic I added to Cluster::start to see if a PID file is present could be fooled by a stale PID file left over from a previous postmaster. To fix, if we're not sure whether we expect to find a running postmaster or not, validate the PID using "kill 0". * 017_shm.pl has a loop in which it just issues repeated Cluster::start calls; this will fail if some invocation fails but leaves self->_pid set. Per buildfarm results, the above fix is not enough to make this safe: we might have "validated" a PID for a postmaster that exits immediately after we look. Hence, match each failed start call with a stop call that will get us back to the self->_pid == undef state. Add a fail_ok option to Cluster::stop to make this work. Discussion: https://postgr.es/m/CA+hUKGKV6fOHvfiPt8=dOKzvswjAyLoFoJF1iQXMNpi7+hD1JQ@mail.gmail.com
author: Tom Lane <tgl@sss.pgh.pa.us> 2022-01-20 17:28:07 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2022-01-20 17:28:07 -0500
commit: 4828a80069abc3e4b5baffe3ebffb69e6dd5bc10 (patch)
tree: a8575bdcb3a3ee89d356b2af21d0cb0249a003f1
parent: 31680730ef7bac82ff09efa0824c2d8e5e8840f1 (diff)
download: postgresql-4828a80069abc3e4b5baffe3ebffb69e6dd5bc10.tar.gz
postgresql-4828a80069abc3e4b5baffe3ebffb69e6dd5bc10.zip
2 files changed, 37 insertions, 6 deletions
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 11227408c3b..b85068e0da1 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -831,20 +831,37 @@ Note: if the node is already known stopped, this does nothing.
 However, if we think it's running and it's not, it's important for
 this to fail.  Otherwise, tests might fail to detect server crashes.
 
+With optional extra param fail_ok => 1, returns 0 for failure
+instead of bailing out.
+
 =cut
 
 sub stop
 {
-	my ($self, $mode) = @_;
-	my $port   = $self->port;
+	my ($self, $mode, %params) = @_;
 	my $pgdata = $self->data_dir;
 	my $name   = $self->name;
+	my $ret;
 	$mode = 'fast' unless defined $mode;
-	return unless defined $self->{_pid};
+	return 1 unless defined $self->{_pid};
+
 	print "### Stopping node \"$name\" using mode $mode\n";
-	TestLib::system_or_bail('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop');
+	$ret = TestLib::system_log('pg_ctl', '-D', $pgdata,
+		'-m', $mode, 'stop');
+
+	if ($ret != 0)
+	{
+		print "# pg_ctl stop failed: $ret\n";
+
+		# Check to see if we still have a postmaster or not.
+		$self->_update_pid(-1);
+
+		BAIL_OUT("pg_ctl stop failed") unless $params{fail_ok};
+		return 0;
+	}
+
 	$self->_update_pid(0);
-	return;
+	return 1;
 }
 
 =pod
@@ -1066,9 +1083,20 @@ sub _update_pid
 	if (open my $pidfile, '<', $self->data_dir . "/postmaster.pid")
 	{
 		chomp($self->{_pid} = <$pidfile>);
-		print "# Postmaster PID for node \"$name\" is $self->{_pid}\n";
 		close $pidfile;
 
+		# If we aren't sure what to expect, validate the PID using kill().
+		# This protects against stale PID files left by crashed postmasters.
+		if ($is_running == -1 && kill(0, $self->{_pid}) == 0)
+		{
+			print
+			  "# Stale postmaster.pid file for node \"$name\": PID $self->{_pid} no longer exists\n";
+			$self->{_pid} = undef;
+			return;
+		}
+
+		print "# Postmaster PID for node \"$name\" is $self->{_pid}\n";
+
 		# If we found a pidfile when there shouldn't be one, complain.
 		BAIL_OUT("postmaster.pid unexpectedly present") if $is_running == 0;
 		return;
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl
index dc0dcd3ca27..ba6410a7aa3 100644
--- a/src/test/recovery/t/017_shm.pl
+++ b/src/test/recovery/t/017_shm.pl
@@ -204,6 +204,9 @@ sub poll_start
 		# Wait 0.1 second before retrying.
 		usleep(100_000);
 
+		# Clean up in case the start attempt just timed out or some such.
+		$node->stop('fast', fail_ok => 1);
+
 		$attempts++;
 	}
author	Tom Lane <tgl@sss.pgh.pa.us>	2022-01-20 17:28:07 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2022-01-20 17:28:07 -0500
commit	4828a80069abc3e4b5baffe3ebffb69e6dd5bc10 (patch)
tree	a8575bdcb3a3ee89d356b2af21d0cb0249a003f1
parent	31680730ef7bac82ff09efa0824c2d8e5e8840f1 (diff)
download	postgresql-4828a80069abc3e4b5baffe3ebffb69e6dd5bc10.tar.gz postgresql-4828a80069abc3e4b5baffe3ebffb69e6dd5bc10.zip