diff options
author | Andres Freund <andres@anarazel.de> | 2025-03-30 16:10:51 -0400 |
---|---|---|
committer | Andres Freund <andres@anarazel.de> | 2025-03-30 16:12:04 -0400 |
commit | b96d3c389755fc5d20f4a5b9ded58b68541b8ba3 (patch) | |
tree | 7f51dd0ca8b8ce07e2bfa02e3aceda709d2c86d1 | |
parent | 4244cf68769773ba30b868354f1f2fe93238e98b (diff) | |
download | postgresql-b96d3c389755fc5d20f4a5b9ded58b68541b8ba3.tar.gz postgresql-b96d3c389755fc5d20f4a5b9ded58b68541b8ba3.zip |
pgstat: Allow checksum errors to be reported in critical sections
For AIO we execute completion callbacks in critical sections (to ensure that
AIO can in the future be used for WAL, which in turn requires that we can call
completion callbacks in critical sections, to get the resources for WAL
io). To report checksum errors a backend now has to call
pgstat_prepare_report_checksum_failure(), before entering a critical section,
which guarantees the relevant pgstats entry is in shared memory, the relevant
DSM segment is mapped into the backend's memory and the address is known via a
PgStat_EntryRef.
Reviewed-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/wkjj4p2rmkevutkwc6tewoovdqznj6c6nvjmvii4oo5wmbh5sr@retq7d6uqs4j
-rw-r--r-- | src/backend/backup/basebackup.c | 1 | ||||
-rw-r--r-- | src/backend/catalog/storage.c | 1 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 1 | ||||
-rw-r--r-- | src/backend/utils/activity/pgstat_database.c | 51 | ||||
-rw-r--r-- | src/include/pgstat.h | 1 |
5 files changed, 52 insertions, 3 deletions
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 3f8a3c55725..891637e3a44 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -1817,6 +1817,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, checksum_failures, readfilename, checksum_failures))); + pgstat_prepare_report_checksum_failure(dboid); pgstat_report_checksum_failures_in_db(dboid, checksum_failures); } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index cacf16c1cdb..2577f69cbde 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -524,6 +524,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, { RelFileLocatorBackend rloc = src->smgr_rlocator; + pgstat_prepare_report_checksum_failure(rloc.locator.dbOid); pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 03317b49025..16b5b69efda 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1590,6 +1590,7 @@ WaitReadBuffers(ReadBuffersOperation *operation) { RelFileLocatorBackend rloc = operation->smgr->smgr_rlocator; + pgstat_prepare_report_checksum_failure(rloc.locator.dbOid); pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1); } diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c index c0c02efd9d3..fbaf8364117 100644 --- a/src/backend/utils/activity/pgstat_database.c +++ b/src/backend/utils/activity/pgstat_database.c @@ -134,7 +134,33 @@ pgstat_report_deadlock(void) } /* + * Allow this backend to later report checksum failures for dboid, even if in + * a critical section at the time of the report. + * + * Without this function having been called first, the backend might need to + * allocate an EntryRef or might need to map in DSM segments. Neither should + * happen in a critical section. + */ +void +pgstat_prepare_report_checksum_failure(Oid dboid) +{ + Assert(!CritSectionCount); + + /* + * Just need to ensure this backend has an entry ref for the database. + * That will allows us to report checksum failures without e.g. needing to + * map in DSM segments. + */ + pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid, + true, NULL); +} + +/* * Report one or more checksum failures. + * + * To be allowed to report checksum failures in critical sections, we require + * pgstat_prepare_report_checksum_failure() to have been called before this + * function is called. */ void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) @@ -147,10 +173,29 @@ pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) /* * Update the shared stats directly - checksum failures should never be - * common enough for that to be a problem. + * common enough for that to be a problem. Note that we pass create=false + * here, as we want to be sure to not require memory allocations, so this + * can be called in critical sections. + */ + entry_ref = pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid, + false, NULL); + + /* + * Should always have been created by + * pgstat_prepare_report_checksum_failure(). + * + * When not using assertions, we don't want to crash should something have + * gone wrong, so just return. */ - entry_ref = - pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false); + Assert(entry_ref); + if (!entry_ref) + { + elog(WARNING, "could not report %d conflicts for DB %u", + failurecount, dboid); + return; + } + + pgstat_lock_entry(entry_ref, false); sharedent = (PgStatShared_Database *) entry_ref->shared_stats; sharedent->stats.checksum_failures += failurecount; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 9f3d13bf1ce..378f2f2c2ba 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -611,6 +611,7 @@ extern void pgstat_drop_database(Oid databaseid); extern void pgstat_report_autovac(Oid dboid); extern void pgstat_report_recovery_conflict(int reason); extern void pgstat_report_deadlock(void); +extern void pgstat_prepare_report_checksum_failure(Oid dboid); extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount); extern void pgstat_report_connect(Oid dboid); extern void pgstat_update_parallel_workers_stats(PgStat_Counter workers_to_launch, |