aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2025-03-30 16:10:51 -0400
committerAndres Freund <andres@anarazel.de>2025-03-30 16:12:04 -0400
commitb96d3c389755fc5d20f4a5b9ded58b68541b8ba3 (patch)
tree7f51dd0ca8b8ce07e2bfa02e3aceda709d2c86d1
parent4244cf68769773ba30b868354f1f2fe93238e98b (diff)
downloadpostgresql-b96d3c389755fc5d20f4a5b9ded58b68541b8ba3.tar.gz
postgresql-b96d3c389755fc5d20f4a5b9ded58b68541b8ba3.zip
pgstat: Allow checksum errors to be reported in critical sections
For AIO we execute completion callbacks in critical sections (to ensure that AIO can in the future be used for WAL, which in turn requires that we can call completion callbacks in critical sections, to get the resources for WAL io). To report checksum errors a backend now has to call pgstat_prepare_report_checksum_failure(), before entering a critical section, which guarantees the relevant pgstats entry is in shared memory, the relevant DSM segment is mapped into the backend's memory and the address is known via a PgStat_EntryRef. Reviewed-by: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/wkjj4p2rmkevutkwc6tewoovdqznj6c6nvjmvii4oo5wmbh5sr@retq7d6uqs4j
-rw-r--r--src/backend/backup/basebackup.c1
-rw-r--r--src/backend/catalog/storage.c1
-rw-r--r--src/backend/storage/buffer/bufmgr.c1
-rw-r--r--src/backend/utils/activity/pgstat_database.c51
-rw-r--r--src/include/pgstat.h1
5 files changed, 52 insertions, 3 deletions
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 3f8a3c55725..891637e3a44 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -1817,6 +1817,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
checksum_failures,
readfilename, checksum_failures)));
+ pgstat_prepare_report_checksum_failure(dboid);
pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
}
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index cacf16c1cdb..2577f69cbde 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -524,6 +524,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
{
RelFileLocatorBackend rloc = src->smgr_rlocator;
+ pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 03317b49025..16b5b69efda 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1590,6 +1590,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
{
RelFileLocatorBackend rloc = operation->smgr->smgr_rlocator;
+ pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
}
diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c
index c0c02efd9d3..fbaf8364117 100644
--- a/src/backend/utils/activity/pgstat_database.c
+++ b/src/backend/utils/activity/pgstat_database.c
@@ -134,7 +134,33 @@ pgstat_report_deadlock(void)
}
/*
+ * Allow this backend to later report checksum failures for dboid, even if in
+ * a critical section at the time of the report.
+ *
+ * Without this function having been called first, the backend might need to
+ * allocate an EntryRef or might need to map in DSM segments. Neither should
+ * happen in a critical section.
+ */
+void
+pgstat_prepare_report_checksum_failure(Oid dboid)
+{
+ Assert(!CritSectionCount);
+
+ /*
+ * Just need to ensure this backend has an entry ref for the database.
+ * That will allows us to report checksum failures without e.g. needing to
+ * map in DSM segments.
+ */
+ pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid,
+ true, NULL);
+}
+
+/*
* Report one or more checksum failures.
+ *
+ * To be allowed to report checksum failures in critical sections, we require
+ * pgstat_prepare_report_checksum_failure() to have been called before this
+ * function is called.
*/
void
pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
@@ -147,10 +173,29 @@ pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
/*
* Update the shared stats directly - checksum failures should never be
- * common enough for that to be a problem.
+ * common enough for that to be a problem. Note that we pass create=false
+ * here, as we want to be sure to not require memory allocations, so this
+ * can be called in critical sections.
+ */
+ entry_ref = pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid,
+ false, NULL);
+
+ /*
+ * Should always have been created by
+ * pgstat_prepare_report_checksum_failure().
+ *
+ * When not using assertions, we don't want to crash should something have
+ * gone wrong, so just return.
*/
- entry_ref =
- pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false);
+ Assert(entry_ref);
+ if (!entry_ref)
+ {
+ elog(WARNING, "could not report %d conflicts for DB %u",
+ failurecount, dboid);
+ return;
+ }
+
+ pgstat_lock_entry(entry_ref, false);
sharedent = (PgStatShared_Database *) entry_ref->shared_stats;
sharedent->stats.checksum_failures += failurecount;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 9f3d13bf1ce..378f2f2c2ba 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -611,6 +611,7 @@ extern void pgstat_drop_database(Oid databaseid);
extern void pgstat_report_autovac(Oid dboid);
extern void pgstat_report_recovery_conflict(int reason);
extern void pgstat_report_deadlock(void);
+extern void pgstat_prepare_report_checksum_failure(Oid dboid);
extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
extern void pgstat_report_connect(Oid dboid);
extern void pgstat_update_parallel_workers_stats(PgStat_Counter workers_to_launch,