Diffstat (limited to 'src/backend/utils')
21 files changed, 2333 insertions, 705 deletions
diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index 791ba68e7e3..690312308f5 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -23,6 +23,7 @@ OBJS = \ pgstat_function.o \ pgstat_relation.o \ pgstat_replslot.o \ + pgstat_shmem.o \ pgstat_slru.o \ pgstat_subscription.o \ pgstat_wal.o \ diff --git a/src/backend/utils/activity/pgstat_archiver.c b/src/backend/utils/activity/pgstat_archiver.c index 09bc12070da..851726fd50e 100644 --- a/src/backend/utils/activity/pgstat_archiver.c +++ b/src/backend/utils/activity/pgstat_archiver.c @@ -27,14 +27,85 @@ void pgstat_report_archiver(const char *xlog, bool failed) { - PgStat_MsgArchiver msg; - - /* - * Prepare and send the message - */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER); - msg.m_failed = failed; - strlcpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); - msg.m_timestamp = GetCurrentTimestamp(); - pgstat_send(&msg, sizeof(msg)); + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + TimestampTz now = GetCurrentTimestamp(); + + pgstat_begin_changecount_write(&stats_shmem->changecount); + + if (failed) + { + ++stats_shmem->stats.failed_count; + memcpy(&stats_shmem->stats.last_failed_wal, xlog, + sizeof(stats_shmem->stats.last_failed_wal)); + stats_shmem->stats.last_failed_timestamp = now; + } + else + { + ++stats_shmem->stats.archived_count; + memcpy(&stats_shmem->stats.last_archived_wal, xlog, + sizeof(stats_shmem->stats.last_archived_wal)); + stats_shmem->stats.last_archived_timestamp = now; + } + + pgstat_end_changecount_write(&stats_shmem->changecount); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the archiver statistics struct. + */ +PgStat_ArchiverStats * +pgstat_fetch_stat_archiver(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + + return &pgStatLocal.snapshot.archiver; +} + +void +pgstat_archiver_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + + /* see explanation above PgStatShared_Archiver for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_archiver_snapshot_cb(void) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + PgStat_ArchiverStats *stat_snap = &pgStatLocal.snapshot.archiver; + PgStat_ArchiverStats *reset_offset = &stats_shmem->reset_offset; + PgStat_ArchiverStats reset; + + pgstat_copy_changecounted_stats(stat_snap, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ + if (stat_snap->archived_count == reset.archived_count) + { + stat_snap->last_archived_wal[0] = 0; + stat_snap->last_archived_timestamp = 0; + } + stat_snap->archived_count -= reset.archived_count; + + if (stat_snap->failed_count == reset.failed_count) + { + stat_snap->last_failed_wal[0] = 0; + stat_snap->last_failed_timestamp = 0; + } + stat_snap->failed_count -= reset.failed_count; }
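The changecount calls above (pgstat_begin_changecount_write()/pgstat_end_changecount_write() on the write side, pgstat_copy_changecounted_stats() on the read side) form a seqlock-style protocol: the counter is odd while a write is in progress and even otherwise, and a reader retries until it observes the same even value before and after its copy. Below is a minimal self-contained model of that protocol using C11 atomics rather than PostgreSQL's barrier primitives; ChangecountedStats, stats_write and stats_read are illustrative names, and the plain-memcpy read is simplified relative to the real implementation.

#include <stdatomic.h>
#include <string.h>

typedef struct
{
	atomic_uint	changecount;	/* odd while a write is in progress */
	long		counters[4];	/* the protected payload */
} ChangecountedStats;

/* writer: make the count odd, modify, make it even again */
static void
stats_write(ChangecountedStats *s, const long *src)
{
	atomic_fetch_add(&s->changecount, 1);
	memcpy(s->counters, src, sizeof(s->counters));
	atomic_fetch_add(&s->changecount, 1);
}

/* reader: retry until a stable, even changecount brackets the copy */
static void
stats_read(ChangecountedStats *s, long *dst)
{
	unsigned	before;

	do
	{
		while ((before = atomic_load(&s->changecount)) & 1)
			;					/* a write is in progress, wait it out */
		memcpy(dst, s->counters, sizeof(s->counters));
	} while (atomic_load(&s->changecount) != before);
}

diff --git a/src/backend/utils/activity/pgstat_bgwriter.c b/src/backend/utils/activity/pgstat_bgwriter.c index dfea88eca10..fbb1edc5275 100644 ---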
a/src/backend/utils/activity/pgstat_bgwriter.c +++ b/src/backend/utils/activity/pgstat_bgwriter.c @@ -20,12 +20,7 @@ #include "utils/pgstat_internal.h" -/* - * BgWriter global statistics counters. Stored directly in a stats - * message structure so they can be sent without needing to copy things - * around. We assume this init to zeroes. - */ -PgStat_MsgBgWriter PendingBgWriterStats; +PgStat_BgWriterStats PendingBgWriterStats = {0}; /* @@ -34,27 +29,82 @@ PgStat_MsgBgWriter PendingBgWriterStats; void pgstat_report_bgwriter(void) { - /* We assume this initializes to zeroes */ - static const PgStat_MsgBgWriter all_zeroes; + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + static const PgStat_BgWriterStats all_zeroes; + Assert(!pgStatLocal.shmem->is_shutdown); pgstat_assert_is_up(); /* * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. + * this case, avoid unnecessarily modifying the stats entry. */ - if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0) + if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(all_zeroes)) == 0) return; - /* - * Prepare and send the message - */ - pgstat_setheader(&PendingBgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER); - pgstat_send(&PendingBgWriterStats, sizeof(PendingBgWriterStats)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define BGWRITER_ACC(fld) stats_shmem->stats.fld += PendingBgWriterStats.fld + BGWRITER_ACC(buf_written_clean); + BGWRITER_ACC(maxwritten_clean); + BGWRITER_ACC(buf_alloc); +#undef BGWRITER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); /* * Clear out the statistics buffer, so it can be re-used. */ MemSet(&PendingBgWriterStats, 0, sizeof(PendingBgWriterStats)); } + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the bgwriter statistics struct. 
+ */ +PgStat_BgWriterStats * +pgstat_fetch_stat_bgwriter(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_BGWRITER); + + return &pgStatLocal.snapshot.bgwriter; +} + +void +pgstat_bgwriter_reset_all_cb(TimestampTz ts) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + + /* see explanation above PgStatShared_BgWriter for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_bgwriter_snapshot_cb(void) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + PgStat_BgWriterStats *reset_offset = &stats_shmem->reset_offset; + PgStat_BgWriterStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.bgwriter, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define BGWRITER_COMP(fld) pgStatLocal.snapshot.bgwriter.fld -= reset.fld; + BGWRITER_COMP(buf_written_clean); + BGWRITER_COMP(maxwritten_clean); + BGWRITER_COMP(buf_alloc); +#undef BGWRITER_COMP +}
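The archiver and bgwriter callbacks above follow the same reset protocol: a reset never zeroes the shared counters, which writers keep incrementing concurrently under the changecount protocol; instead the values at reset time are saved as reset_offset, and each snapshot subtracts that offset ("compensate by reset offsets"). A self-contained sketch of the arithmetic, with locking and changecounts elided and illustrative type and field names:

#include <string.h>

typedef struct
{
	long		buf_written_clean;
	long		maxwritten_clean;
	long		buf_alloc;
} Counters;

static Counters shared_stats;	/* only ever incremented by writers */
static Counters reset_offset;	/* counter values captured at last reset */

/* "reset": remember the current values instead of zeroing them */
static void
reset_all(void)
{
	memcpy(&reset_offset, &shared_stats, sizeof(shared_stats));
}

/* snapshot: report only the growth since the last reset */
static void
snapshot(Counters *snap)
{
	memcpy(snap, &shared_stats, sizeof(shared_stats));
	snap->buf_written_clean -= reset_offset.buf_written_clean;
	snap->maxwritten_clean -= reset_offset.maxwritten_clean;
	snap->buf_alloc -= reset_offset.buf_alloc;
}

One consequence, visible in pgstat_archiver_snapshot_cb() earlier, is that "no growth since the reset" can be detected by comparing a counter against its offset; that is how the last_archived/last_failed fields get blanked out.

diff --git a/src/backend/utils/activity/pgstat_checkpointer.c b/src/backend/utils/activity/pgstat_checkpointer.c index 3f4e2054f55..af8d513e7b4 100644 --- a/src/backend/utils/activity/pgstat_checkpointer.c +++ b/src/backend/utils/activity/pgstat_checkpointer.c @@ -20,12 +20,7 @@ #include "utils/pgstat_internal.h" -/* - * Checkpointer global statistics counters. Stored directly in a stats - * message structure so they can be sent without needing to copy things - * around. We assume this init to zeroes. - */ -PgStat_MsgCheckpointer PendingCheckpointerStats; +PgStat_CheckpointerStats PendingCheckpointerStats = {0}; /* @@ -35,24 +30,92 @@ void pgstat_report_checkpointer(void) { /* We assume this initializes to zeroes */ - static const PgStat_MsgCheckpointer all_zeroes; + static const PgStat_CheckpointerStats all_zeroes; + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + Assert(!pgStatLocal.shmem->is_shutdown); + pgstat_assert_is_up(); /* * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. + * this case, avoid unnecessarily modifying the stats entry.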
*/ - if (memcmp(&PendingCheckpointerStats, &all_zeroes, sizeof(PgStat_MsgCheckpointer)) == 0) + if (memcmp(&PendingCheckpointerStats, &all_zeroes, + sizeof(all_zeroes)) == 0) return; - /* - * Prepare and send the message - */ - pgstat_setheader(&PendingCheckpointerStats.m_hdr, PGSTAT_MTYPE_CHECKPOINTER); - pgstat_send(&PendingCheckpointerStats, sizeof(PendingCheckpointerStats)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define CHECKPOINTER_ACC(fld) stats_shmem->stats.fld += PendingCheckpointerStats.fld + CHECKPOINTER_ACC(timed_checkpoints); + CHECKPOINTER_ACC(requested_checkpoints); + CHECKPOINTER_ACC(checkpoint_write_time); + CHECKPOINTER_ACC(checkpoint_sync_time); + CHECKPOINTER_ACC(buf_written_checkpoints); + CHECKPOINTER_ACC(buf_written_backend); + CHECKPOINTER_ACC(buf_fsync_backend); +#undef CHECKPOINTER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); /* * Clear out the statistics buffer, so it can be re-used. */ MemSet(&PendingCheckpointerStats, 0, sizeof(PendingCheckpointerStats)); } + +/* + * pgstat_fetch_stat_checkpointer() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the checkpointer statistics struct. + */ +PgStat_CheckpointerStats * +pgstat_fetch_stat_checkpointer(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + + return &pgStatLocal.snapshot.checkpointer; +} + +void +pgstat_checkpointer_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + /* see explanation above PgStatShared_Checkpointer for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_checkpointer_snapshot_cb(void) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + PgStat_CheckpointerStats *reset_offset = &stats_shmem->reset_offset; + PgStat_CheckpointerStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.checkpointer, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define CHECKPOINTER_COMP(fld) pgStatLocal.snapshot.checkpointer.fld -= reset.fld; + CHECKPOINTER_COMP(timed_checkpoints); + CHECKPOINTER_COMP(requested_checkpoints); + CHECKPOINTER_COMP(checkpoint_write_time); + CHECKPOINTER_COMP(checkpoint_sync_time); + CHECKPOINTER_COMP(buf_written_checkpoints); + CHECKPOINTER_COMP(buf_written_backend); + CHECKPOINTER_COMP(buf_fsync_backend); +#undef CHECKPOINTER_COMP +} diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c index 6d27657bdb2..649d9c69606 100644 --- a/src/backend/utils/activity/pgstat_database.c +++ b/src/backend/utils/activity/pgstat_database.c @@ -19,13 +19,12 @@ #include "utils/pgstat_internal.h" #include "utils/timestamp.h" +#include "storage/procsignal.h" static bool pgstat_should_report_connstat(void); -int pgStatXactCommit = 0; -int pgStatXactRollback = 0; PgStat_Counter pgStatBlockReadTime = 0; PgStat_Counter pgStatBlockWriteTime = 0; PgStat_Counter pgStatActiveTime = 0; @@ -33,25 +32,18 @@ PgStat_Counter pgStatTransactionIdleTime = 0; SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL; +static int 
pgStatXactCommit = 0; +static int pgStatXactRollback = 0; static PgStat_Counter pgLastSessionReportTime = 0; /* - * Tell the collector that we just dropped a database. - * (If the message gets lost, we will still clean the dead DB eventually - * via future invocations of pgstat_vacuum_stat().) + * Remove entry for the database being dropped. */ void pgstat_drop_database(Oid databaseid) { - PgStat_MsgDropdb msg; - - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB); - msg.m_databaseid = databaseid; - pgstat_send(&msg, sizeof(msg)); + pgstat_drop_transactional(PGSTAT_KIND_DATABASE, databaseid, InvalidOid); } /* @@ -62,16 +54,24 @@ pgstat_drop_database(Oid databaseid) void pgstat_report_autovac(Oid dboid) { - PgStat_MsgAutovacStart msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Database *dbentry; - if (pgStatSock == PGINVALID_SOCKET) - return; + /* can't get here in single user mode */ + Assert(IsUnderPostmaster); + + /* + * End-of-vacuum is reported instantly. Report the start the same way for + * consistency. Vacuum doesn't run frequently and is a long-lasting + * operation so it doesn't matter if we get blocked here a little. + */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, + dboid, InvalidOid, false); - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START); - msg.m_databaseid = dboid; - msg.m_start_time = GetCurrentTimestamp(); + dbentry = (PgStatShared_Database *) entry_ref->shared_stats; + dbentry->stats.last_autovac_time = GetCurrentTimestamp(); - pgstat_send(&msg, sizeof(msg)); + pgstat_unlock_entry(entry_ref); } /* @@ -80,15 +80,39 @@ pgstat_report_autovac(Oid dboid) void pgstat_report_recovery_conflict(int reason) { - PgStat_MsgRecoveryConflict msg; + PgStat_StatDBEntry *dbentry; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + Assert(IsUnderPostmaster); + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT); - msg.m_databaseid = MyDatabaseId; - msg.m_reason = reason; - pgstat_send(&msg, sizeof(msg)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + + /* + * Since we drop the information about the database as soon as it + * replicates, there is no point in counting these conflicts. 
+ */ + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + dbentry->n_conflict_tablespace++; + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + dbentry->n_conflict_lock++; + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + dbentry->n_conflict_snapshot++; + break; + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + dbentry->n_conflict_bufferpin++; + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + dbentry->n_conflict_startup_deadlock++; + break; + } } /* @@ -97,14 +121,13 @@ pgstat_report_recovery_conflict(int reason) void pgstat_report_deadlock(void) { - PgStat_MsgDeadlock msg; + PgStat_StatDBEntry *dbent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DEADLOCK); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(msg)); + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_deadlocks++; } /* @@ -113,17 +136,24 @@ pgstat_report_deadlock(void) void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) { - PgStat_MsgChecksumFailure msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Database *sharedent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE); - msg.m_databaseid = dboid; - msg.m_failurecount = failurecount; - msg.m_failure_time = GetCurrentTimestamp(); + /* + * Update the shared stats directly - checksum failures should never be + * common enough for that to be a problem. + */ + entry_ref = + pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false); + + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + sharedent->stats.n_checksum_failures += failurecount; + sharedent->stats.last_checksum_failure = GetCurrentTimestamp(); - pgstat_send(&msg, sizeof(msg)); + pgstat_unlock_entry(entry_ref); } /* @@ -141,15 +171,14 @@ pgstat_report_checksum_failure(void) void pgstat_report_tempfile(size_t filesize) { - PgStat_MsgTempFile msg; + PgStat_StatDBEntry *dbent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TEMPFILE); - msg.m_databaseid = MyDatabaseId; - msg.m_filesize = filesize; - pgstat_send(&msg, sizeof(msg)); + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_temp_bytes += filesize; + dbent->n_temp_files++; } /* @@ -158,16 +187,15 @@ pgstat_report_tempfile(size_t filesize) void pgstat_report_connect(Oid dboid) { - PgStat_MsgConnect msg; + PgStat_StatDBEntry *dbentry; if (!pgstat_should_report_connstat()) return; pgLastSessionReportTime = MyStartTimestamp; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CONNECT); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(PgStat_MsgConnect)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + dbentry->n_sessions++; } /* @@ -176,15 +204,42 @@ pgstat_report_connect(Oid dboid) void pgstat_report_disconnect(Oid dboid) { - PgStat_MsgDisconnect msg; + PgStat_StatDBEntry *dbentry; if (!pgstat_should_report_connstat()) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DISCONNECT); - msg.m_databaseid = MyDatabaseId; - msg.m_cause = pgStatSessionEndCause; - pgstat_send(&msg, sizeof(PgStat_MsgDisconnect)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (pgStatSessionEndCause) + { + case DISCONNECT_NOT_YET: + case DISCONNECT_NORMAL: + /* we don't collect these */ + break; + case DISCONNECT_CLIENT_EOF: + dbentry->n_sessions_abandoned++; + break; 
+ case DISCONNECT_FATAL: + dbentry->n_sessions_fatal++; + break; + case DISCONNECT_KILLED: + dbentry->n_sessions_killed++; + break; + } +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one database or NULL. NULL doesn't mean + * that the database doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatDBEntry * +pgstat_fetch_stat_dbentry(Oid dboid) +{ + return (PgStat_StatDBEntry *) + pgstat_fetch_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid); } void @@ -205,57 +260,47 @@ AtEOXact_PgStat_Database(bool isCommit, bool parallel) } /* - * Subroutine for pgstat_send_tabstat: Handle xact commit/rollback and I/O + * Subroutine for pgstat_report_stat(): Handle xact commit/rollback and I/O * timings. */ void -pgstat_update_dbstats(PgStat_MsgTabstat *tsmsg, TimestampTz now) +pgstat_update_dbstats(TimestampTz ts) { - if (OidIsValid(tsmsg->m_databaseid)) - { - tsmsg->m_xact_commit = pgStatXactCommit; - tsmsg->m_xact_rollback = pgStatXactRollback; - tsmsg->m_block_read_time = pgStatBlockReadTime; - tsmsg->m_block_write_time = pgStatBlockWriteTime; + PgStat_StatDBEntry *dbentry; - if (pgstat_should_report_connstat()) - { - long secs; - int usecs; + dbentry = pgstat_prep_database_pending(MyDatabaseId); - /* - * pgLastSessionReportTime is initialized to MyStartTimestamp by - * pgstat_report_connect(). - */ - TimestampDifference(pgLastSessionReportTime, now, &secs, &usecs); - pgLastSessionReportTime = now; - tsmsg->m_session_time = (PgStat_Counter) secs * 1000000 + usecs; - tsmsg->m_active_time = pgStatActiveTime; - tsmsg->m_idle_in_xact_time = pgStatTransactionIdleTime; - } - else - { - tsmsg->m_session_time = 0; - tsmsg->m_active_time = 0; - tsmsg->m_idle_in_xact_time = 0; - } - pgStatXactCommit = 0; - pgStatXactRollback = 0; - pgStatBlockReadTime = 0; - pgStatBlockWriteTime = 0; - pgStatActiveTime = 0; - pgStatTransactionIdleTime = 0; - } - else + /* + * Accumulate xact commit/rollback and I/O timings to stats entry of the + * current database. + */ + dbentry->n_xact_commit += pgStatXactCommit; + dbentry->n_xact_rollback += pgStatXactRollback; + dbentry->n_block_read_time += pgStatBlockReadTime; + dbentry->n_block_write_time += pgStatBlockWriteTime; + + if (pgstat_should_report_connstat()) { - tsmsg->m_xact_commit = 0; - tsmsg->m_xact_rollback = 0; - tsmsg->m_block_read_time = 0; - tsmsg->m_block_write_time = 0; - tsmsg->m_session_time = 0; - tsmsg->m_active_time = 0; - tsmsg->m_idle_in_xact_time = 0; + long secs; + int usecs; + + /* + * pgLastSessionReportTime is initialized to MyStartTimestamp by + * pgstat_report_connect(). + */ + TimestampDifference(pgLastSessionReportTime, ts, &secs, &usecs); + pgLastSessionReportTime = ts; + dbentry->total_session_time += (PgStat_Counter) secs * 1000000 + usecs; + dbentry->total_active_time += pgStatActiveTime; + dbentry->total_idle_in_xact_time += pgStatTransactionIdleTime; } + + pgStatXactCommit = 0; + pgStatXactRollback = 0; + pgStatBlockReadTime = 0; + pgStatBlockWriteTime = 0; + pgStatActiveTime = 0; + pgStatTransactionIdleTime = 0; } /* @@ -270,3 +315,111 @@ pgstat_should_report_connstat(void) { return MyBackendType == B_BACKEND; } + +/* + * Find or create a local PgStat_StatDBEntry entry for dboid. 
+ */ +PgStat_StatDBEntry * +pgstat_prep_database_pending(Oid dboid) +{ + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid, + NULL); + + return entry_ref->pending; + +} + +/* + * Reset the database's reset timestamp, without resetting the contents of the + * database stats. + */ +void +pgstat_reset_database_timestamp(Oid dboid, TimestampTz ts) +{ + PgStat_EntryRef *dbref; + PgStatShared_Database *dbentry; + + dbref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, MyDatabaseId, InvalidOid, + false); + + dbentry = (PgStatShared_Database *) dbref->shared_stats; + dbentry->stats.stat_reset_timestamp = ts; + + pgstat_unlock_entry(dbref); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + */ +bool +pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStatShared_Database *sharedent; + PgStat_StatDBEntry *pendingent; + + pendingent = (PgStat_StatDBEntry *) entry_ref->pending; + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + +#define PGSTAT_ACCUM_DBCOUNT(item) \ + (sharedent)->stats.item += (pendingent)->item + + PGSTAT_ACCUM_DBCOUNT(n_xact_commit); + PGSTAT_ACCUM_DBCOUNT(n_xact_rollback); + PGSTAT_ACCUM_DBCOUNT(n_blocks_fetched); + PGSTAT_ACCUM_DBCOUNT(n_blocks_hit); + + PGSTAT_ACCUM_DBCOUNT(n_tuples_returned); + PGSTAT_ACCUM_DBCOUNT(n_tuples_fetched); + PGSTAT_ACCUM_DBCOUNT(n_tuples_inserted); + PGSTAT_ACCUM_DBCOUNT(n_tuples_updated); + PGSTAT_ACCUM_DBCOUNT(n_tuples_deleted); + + /* last_autovac_time is reported immediately */ + Assert(pendingent->last_autovac_time == 0); + + PGSTAT_ACCUM_DBCOUNT(n_conflict_tablespace); + PGSTAT_ACCUM_DBCOUNT(n_conflict_lock); + PGSTAT_ACCUM_DBCOUNT(n_conflict_snapshot); + PGSTAT_ACCUM_DBCOUNT(n_conflict_bufferpin); + PGSTAT_ACCUM_DBCOUNT(n_conflict_startup_deadlock); + + PGSTAT_ACCUM_DBCOUNT(n_temp_bytes); + PGSTAT_ACCUM_DBCOUNT(n_temp_files); + PGSTAT_ACCUM_DBCOUNT(n_deadlocks); + + /* checksum failures are reported immediately */ + Assert(pendingent->n_checksum_failures == 0); + Assert(pendingent->last_checksum_failure == 0); + + PGSTAT_ACCUM_DBCOUNT(n_block_read_time); + PGSTAT_ACCUM_DBCOUNT(n_block_write_time); + + PGSTAT_ACCUM_DBCOUNT(n_sessions); + PGSTAT_ACCUM_DBCOUNT(total_session_time); + PGSTAT_ACCUM_DBCOUNT(total_active_time); + PGSTAT_ACCUM_DBCOUNT(total_idle_in_xact_time); + PGSTAT_ACCUM_DBCOUNT(n_sessions_abandoned); + PGSTAT_ACCUM_DBCOUNT(n_sessions_fatal); + PGSTAT_ACCUM_DBCOUNT(n_sessions_killed); +#undef PGSTAT_ACCUM_DBCOUNT + + pgstat_unlock_entry(entry_ref); + + memset(pendingent, 0, sizeof(*pendingent)); + + return true; +} + +void +pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Database *) header)->stats.stat_reset_timestamp = ts; +}
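pgstat_database_flush_cb() above is one instance of the flush-callback contract this patch introduces for pending (backend-local) stats: accumulated counters are folded into the shared entry under the entry's lock, and with nowait=true a contended lock makes the callback return false so the flush can be retried on a later reporting cycle instead of blocking. A self-contained model of that contract; try_lock() is a stand-in for the real pgstat_lock_entry(), and the struct layouts are illustrative:

#include <stdbool.h>
#include <string.h>

typedef struct
{
	long		n_xact_commit;
	long		n_xact_rollback;
} Counters;

typedef struct
{
	bool		locked;			/* stand-in for the entry's LWLock */
	Counters	stats;
} SharedEntry;

static bool
try_lock(SharedEntry *e, bool nowait)
{
	if (e->locked && nowait)
		return false;			/* don't block, report failure instead */
	/* a real implementation would sleep on the lock here */
	e->locked = true;
	return true;
}

/* fold pending counters into the shared entry; false means "retry later" */
static bool
flush_pending(SharedEntry *shared, Counters *pending, bool nowait)
{
	if (!try_lock(shared, nowait))
		return false;

	shared->stats.n_xact_commit += pending->n_xact_commit;
	shared->stats.n_xact_rollback += pending->n_xact_rollback;
	shared->locked = false;

	/* the pending entry is zeroed so it can be reused */
	memset(pending, 0, sizeof(*pending));
	return true;
}

diff --git a/src/backend/utils/activity/pgstat_function.c b/src/backend/utils/activity/pgstat_function.c index ad9879afb2a..427d8c47fc6 100644 --- a/src/backend/utils/activity/pgstat_function.c +++ b/src/backend/utils/activity/pgstat_function.c @@ -17,8 +17,10 @@ #include "postgres.h" +#include "fmgr.h" +#include "utils/inval.h" #include "utils/pgstat_internal.h" -#include "utils/timestamp.h" +#include "utils/syscache.h" /* ---------- @@ -29,18 +31,6 @@ int pgstat_track_functions = TRACK_FUNC_OFF; /* - * Indicates if backend has some function stats that it hasn't yet - *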
sent to the collector. - */ -bool have_function_stats = false; - -/* - * Backends store per-function info that's waiting to be sent to the collector - * in this hash table (indexed by function OID). - */ -static HTAB *pgStatFunctions = NULL; - -/* * Total time charged to functions so far in the current backend. * We use this to help separate "self" and "other" time charges. * (We assume this initializes to zero.) @@ -61,6 +51,10 @@ pgstat_create_function(Oid proid) /* * Ensure that stats are dropped if transaction commits. + * + * NB: This is only reliable because pgstat_init_function_usage() does some + * extra work. If other places start emitting function stats they likely need + * similar logic. */ void pgstat_drop_function(Oid proid) @@ -78,8 +72,9 @@ void pgstat_init_function_usage(FunctionCallInfo fcinfo, PgStat_FunctionCallUsage *fcu) { - PgStat_BackendFunctionEntry *htabent; - bool found; + PgStat_EntryRef *entry_ref; + PgStat_BackendFunctionEntry *pending; + bool created_entry; if (pgstat_track_functions <= fcinfo->flinfo->fn_stats) { @@ -88,29 +83,48 @@ pgstat_init_function_usage(FunctionCallInfo fcinfo, return; } - if (!pgStatFunctions) + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_FUNCTION, + MyDatabaseId, + fcinfo->flinfo->fn_oid, + &created_entry); + + /* + * If no shared entry already exists, check if the function has been + * deleted concurrently. This can go unnoticed until here because + * executing a statement that just calls a function, does not trigger + * cache invalidation processing. The reason we care about this case is + * that otherwise we could create a new stats entry for an already dropped + * function (for relations etc this is not possible because emitting stats + * requires a lock for the relation to already have been acquired). + * + * It's somewhat ugly to have a behavioral difference based on + * track_functions being enabled/disabled. But it seems acceptable, given + * that there's already behavioral differences depending on whether the + * function is the caches etc. + * + * For correctness it'd be sufficient to set ->dropped to true. However, + * the accepted invalidation will commonly cause "low level" failures in + * PL code, with an OID in the error message. Making this harder to + * test... 
+ */ + if (created_entry) { - /* First time through - initialize function stat table */ - HASHCTL hash_ctl; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry); - pgStatFunctions = hash_create("Function stat entries", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); + AcceptInvalidationMessages(); + if (!SearchSysCacheExists1(PROCOID, ObjectIdGetDatum(fcinfo->flinfo->fn_oid))) + { + pgstat_drop_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, + fcinfo->flinfo->fn_oid); + ereport(ERROR, errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function call to dropped function")); + } } - /* Get the stats entry for this function, create if necessary */ - htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid, - HASH_ENTER, &found); - if (!found) - MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts)); + pending = entry_ref->pending; - fcu->fs = &htabent->f_counts; + fcu->fs = &pending->f_counts; /* save stats for this function, later used to compensate for recursion */ - fcu->save_f_total_time = htabent->f_counts.f_total_time; + fcu->save_f_total_time = pending->f_counts.f_total_time; /* save current backend-wide total time */ fcu->save_total = total_func_time; @@ -167,64 +181,37 @@ pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize) fs->f_numcalls++; fs->f_total_time = f_total; INSTR_TIME_ADD(fs->f_self_time, f_self); - - /* indicate that we have something to send */ - have_function_stats = true; } /* - * Subroutine for pgstat_report_stat: populate and send a function stat message + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. */ -void -pgstat_send_funcstats(void) +bool +pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) { - /* we assume this inits to all zeroes: */ - static const PgStat_FunctionCounts all_zeroes; + PgStat_BackendFunctionEntry *localent; + PgStatShared_Function *shfuncent; - PgStat_MsgFuncstat msg; - PgStat_BackendFunctionEntry *entry; - HASH_SEQ_STATUS fstat; + localent = (PgStat_BackendFunctionEntry *) entry_ref->pending; + shfuncent = (PgStatShared_Function *) entry_ref->shared_stats; - if (pgStatFunctions == NULL) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT); - msg.m_databaseid = MyDatabaseId; - msg.m_nentries = 0; - - hash_seq_init(&fstat, pgStatFunctions); - while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL) - { - PgStat_FunctionEntry *m_ent; + /* localent always has non-zero content */ - /* Skip it if no counts accumulated since last time */ - if (memcmp(&entry->f_counts, &all_zeroes, - sizeof(PgStat_FunctionCounts)) == 0) - continue; - - /* need to convert format of time accumulators */ - m_ent = &msg.m_entry[msg.m_nentries]; - m_ent->f_id = entry->f_id; - m_ent->f_numcalls = entry->f_counts.f_numcalls; - m_ent->f_total_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time); - m_ent->f_self_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time); - - if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES) - { - pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + - msg.m_nentries * sizeof(PgStat_FunctionEntry)); - msg.m_nentries = 0; - } + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; - /* reset the entry's counts */ - MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts)); - } + shfuncent->stats.f_numcalls += localent->f_counts.f_numcalls; + shfuncent->stats.f_total_time += 
+ INSTR_TIME_GET_MICROSEC(localent->f_counts.f_total_time); + shfuncent->stats.f_self_time += + INSTR_TIME_GET_MICROSEC(localent->f_counts.f_self_time); - if (msg.m_nentries > 0) - pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + - msg.m_nentries * sizeof(PgStat_FunctionEntry)); + pgstat_unlock_entry(entry_ref); - have_function_stats = false; + return true; } /* @@ -235,12 +222,22 @@ pgstat_send_funcstats(void) PgStat_BackendFunctionEntry * find_funcstat_entry(Oid func_id) { - pgstat_assert_is_up(); + PgStat_EntryRef *entry_ref; - if (pgStatFunctions == NULL) - return NULL; + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); + + if (entry_ref) + return entry_ref->pending; + return NULL; +} - return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions, - (void *) &func_id, - HASH_FIND, NULL); +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one function or NULL. + */ +PgStat_StatFuncEntry * +pgstat_fetch_stat_funcentry(Oid func_id) +{ + return (PgStat_StatFuncEntry *) + pgstat_fetch_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); }
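The "self" versus "other" time separation that total_func_time supports (see the comment near the top of this file's diff, and pgstat_init_function_usage()/pgstat_end_function_usage()) works by recording the backend-wide function time at call entry and subtracting whatever callees charged in between. A simplified self-contained model follows; the real code additionally compensates for recursion into the same function via save_f_total_time, which is omitted here:

typedef struct
{
	double		save_total;		/* backend-wide total at call start */
} CallUsage;

static double total_func_time;	/* self time of all tracked calls so far */

static void
call_begin(CallUsage *cu)
{
	cu->save_total = total_func_time;
}

/* elapsed is this call's wall-clock total; returns its "self" share */
static double
call_end(CallUsage *cu, double elapsed)
{
	/* self time charged by functions we called underneath us */
	double		in_callees = total_func_time - cu->save_total;
	double		self = elapsed - in_callees;

	total_func_time += self;	/* charge our own self time globally */
	return self;
}

diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 51a87b66739..bec190c5897 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -19,6 +19,7 @@ #include "access/twophase_rmgr.h" #include "access/xact.h" +#include "catalog/partition.h" #include "postmaster/autovacuum.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" @@ -26,38 +27,6 @@ #include "utils/timestamp.h" -/* - * Structures in which backends store per-table info that's waiting to be - * sent to the collector. - * - * NOTE: once allocated, TabStatusArray structures are never moved or deleted - * for the life of the backend. Also, we zero out the t_id fields of the - * contained PgStat_TableStatus structs whenever they are not actively in use. - * This allows relcache pgstat_info pointers to be treated as long-lived data, - * avoiding repeated searches in pgstat_init_relation() when a relation is - * repeatedly opened during a transaction.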
- */ -#define TABSTAT_QUANTUM 100 /* we alloc this many at a time */ - - -typedef struct TabStatusArray -{ - struct TabStatusArray *tsa_next; /* link to next array, if any */ - int tsa_used; /* # entries currently used */ - PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM]; /* per-table data */ -} TabStatusArray; - -static TabStatusArray *pgStatTabList = NULL; - -/* - * pgStatTabHash entry: map from relation OID to PgStat_TableStatus pointer - */ -typedef struct TabStatHashEntry -{ - Oid t_id; - PgStat_TableStatus *tsa_entry; -} TabStatHashEntry; - /* Record that's written to 2PC state file when pgstat state is persisted */ typedef struct TwoPhasePgStatRecord { @@ -74,8 +43,7 @@ typedef struct TwoPhasePgStatRecord } TwoPhasePgStatRecord; -static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared); -static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now); +static PgStat_TableStatus *pgstat_prep_relation_pending(Oid rel_id, bool isshared); static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level); static void ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info); static void save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop); @@ -83,19 +51,6 @@ static void restore_truncdrop_counters(PgStat_TableXactStatus *trans); /* - * Indicates if backend has some relation stats that it hasn't yet - * sent to the collector. - */ -bool have_relation_stats; - - -/* - * Hash table for O(1) t_id -> tsa_entry lookup - */ -static HTAB *pgStatTabHash = NULL; - - -/* * Copy stats between relations. This is used for things like REINDEX * CONCURRENTLY. */ @@ -103,43 +58,39 @@ void pgstat_copy_relation_stats(Relation dst, Relation src) { PgStat_StatTabEntry *srcstats; + PgStatShared_Relation *dstshstats; + PgStat_EntryRef *dst_ref; - srcstats = pgstat_fetch_stat_tabentry(RelationGetRelid(src)); - + srcstats = pgstat_fetch_stat_tabentry_ext(src->rd_rel->relisshared, + RelationGetRelid(src)); if (!srcstats) return; - if (pgstat_should_count_relation(dst)) - { - /* - * XXX: temporarily this does not actually quite do what the name - * says, and just copy index related fields. A subsequent commit will - * do more. - */ - - dst->pgstat_info->t_counts.t_numscans = srcstats->numscans; - dst->pgstat_info->t_counts.t_tuples_returned = srcstats->tuples_returned; - dst->pgstat_info->t_counts.t_tuples_fetched = srcstats->tuples_fetched; - dst->pgstat_info->t_counts.t_blocks_fetched = srcstats->blocks_fetched; - dst->pgstat_info->t_counts.t_blocks_hit = srcstats->blocks_hit; - - /* the data will be sent by the next pgstat_report_stat() call */ - } + dst_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, + dst->rd_rel->relisshared ? InvalidOid : MyDatabaseId, + RelationGetRelid(dst), + false); + + dstshstats = (PgStatShared_Relation *) dst_ref->shared_stats; + dstshstats->stats = *srcstats; + + pgstat_unlock_entry(dst_ref); } /* - * Initialize a relcache entry to count access statistics. - * Called whenever a relation is opened. + * Initialize a relcache entry to count access statistics. Called whenever a + * relation is opened. * - * We assume that a relcache entry's pgstat_info field is zeroed by - * relcache.c when the relcache entry is made; thereafter it is long-lived - * data. We can avoid repeated searches of the TabStatus arrays when the - * same relation is touched repeatedly within a transaction. 
+ * We assume that a relcache entry's pgstat_info field is zeroed by relcache.c + * when the relcache entry is made; thereafter it is long-lived data. + * + * This does not create a reference to a stats entry in shared memory, nor + * allocate memory for the pending stats. That happens in + * pgstat_assoc_relation(). */ void pgstat_init_relation(Relation rel) { - Oid rel_id = rel->rd_id; char relkind = rel->rd_rel->relkind; /* @@ -147,27 +98,68 @@ pgstat_init_relation(Relation rel) */ if (!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE) { + rel->pgstat_enabled = false; rel->pgstat_info = NULL; return; } - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) { + if (rel->pgstat_info) + pgstat_unlink_relation(rel); + /* We're not counting at all */ + rel->pgstat_enabled = false; rel->pgstat_info = NULL; return; } - /* - * If we already set up this relation in the current transaction, nothing - * to do. - */ - if (rel->pgstat_info != NULL && - rel->pgstat_info->t_id == rel_id) - return; + rel->pgstat_enabled = true; +} + +/* + * Prepare for statistics for this relation to be collected. + * + * This ensures we have a reference to the stats entry before stats can be + * generated. That is important because a relation drop in another connection + * could otherwise lead to the stats entry being dropped, which then later + * would get recreated when flushing stats. + * + * This is separate from pgstat_init_relation() as it is not uncommon for + * relcache entries to be opened without ever getting stats reported. + */ +void +pgstat_assoc_relation(Relation rel) +{ + Assert(rel->pgstat_enabled); + Assert(rel->pgstat_info == NULL); /* Else find or make the PgStat_TableStatus entry, and update link */ - rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared); + rel->pgstat_info = pgstat_prep_relation_pending(RelationGetRelid(rel), + rel->rd_rel->relisshared); + + /* don't allow link a stats to multiple relcache entries */ + Assert(rel->pgstat_info->relation == NULL); + + /* mark this relation as the owner */ + rel->pgstat_info->relation = rel; +} + +/* + * Break the mutual link between a relcache entry and pending stats entry. + * This must be called whenever one end of the link is removed. + */ +void +pgstat_unlink_relation(Relation rel) +{ + /* remove the link to stats info if any */ + if (rel->pgstat_info == NULL) + return; + + /* link sanity check */ + Assert(rel->pgstat_info->relation == rel); + rel->pgstat_info->relation = NULL; + rel->pgstat_info = NULL; } /* @@ -187,9 +179,26 @@ pgstat_create_relation(Relation rel) void pgstat_drop_relation(Relation rel) { + int nest_level = GetCurrentTransactionNestLevel(); + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + pgstat_drop_transactional(PGSTAT_KIND_RELATION, rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId, RelationGetRelid(rel)); + + /* + * Transactionally set counters to 0. That ensures that accesses to + * pg_stat_xact_all_tables inside the transaction show 0. 
+ */ + if (pgstat_info && + pgstat_info->trans != NULL && + pgstat_info->trans->nest_level == nest_level) + { + save_truncdrop_counters(pgstat_info->trans, true); + pgstat_info->trans->tuples_inserted = 0; + pgstat_info->trans->tuples_updated = 0; + pgstat_info->trans->tuples_deleted = 0; + } } /* @@ -199,19 +208,52 @@ void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples) { - PgStat_MsgVacuum msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Relation *shtabentry; + PgStat_StatTabEntry *tabentry; + Oid dboid = (shared ? InvalidOid : MyDatabaseId); + TimestampTz ts; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM); - msg.m_databaseid = shared ? InvalidOid : MyDatabaseId; - msg.m_tableoid = tableoid; - msg.m_autovacuum = IsAutoVacuumWorkerProcess(); - msg.m_vacuumtime = GetCurrentTimestamp(); - msg.m_live_tuples = livetuples; - msg.m_dead_tuples = deadtuples; - pgstat_send(&msg, sizeof(msg)); + /* Store the data in the table's hash table entry. */ + ts = GetCurrentTimestamp(); + + /* block acquiring lock for the same reason as pgstat_report_autovac() */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, + dboid, tableoid, false); + + shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; + tabentry = &shtabentry->stats; + + tabentry->n_live_tuples = livetuples; + tabentry->n_dead_tuples = deadtuples; + + /* + * It is quite possible that a non-aggressive VACUUM ended up skipping + * various pages, however, we'll zero the insert counter here regardless. + * It's currently used only to track when we need to perform an "insert" + * autovacuum, which are mainly intended to freeze newly inserted tuples. + * Zeroing this may just mean we'll not try to vacuum the table again + * until enough tuples have been inserted to trigger another insert + * autovacuum. An anti-wraparound autovacuum will catch any persistent + * stragglers. + */ + tabentry->inserts_since_vacuum = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_vacuum_timestamp = ts; + tabentry->autovac_vacuum_count++; + } + else + { + tabentry->vacuum_timestamp = ts; + tabentry->vacuum_count++; + } + + pgstat_unlock_entry(entry_ref); } /* @@ -225,9 +267,12 @@ pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter) { - PgStat_MsgAnalyze msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Relation *shtabentry; + PgStat_StatTabEntry *tabentry; + Oid dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; /* @@ -259,15 +304,39 @@ pgstat_report_analyze(Relation rel, deadtuples = Max(deadtuples, 0); } - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE); - msg.m_databaseid = rel->rd_rel->relisshared ? 
InvalidOid : MyDatabaseId; - msg.m_tableoid = RelationGetRelid(rel); - msg.m_autovacuum = IsAutoVacuumWorkerProcess(); - msg.m_resetcounter = resetcounter; - msg.m_analyzetime = GetCurrentTimestamp(); - msg.m_live_tuples = livetuples; - msg.m_dead_tuples = deadtuples; - pgstat_send(&msg, sizeof(msg)); + /* block acquiring lock for the same reason as pgstat_report_autovac() */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid, + RelationGetRelid(rel), + false); + /* can't get dropped while accessed */ + Assert(entry_ref != NULL && entry_ref->shared_stats != NULL); + + shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; + tabentry = &shtabentry->stats; + + tabentry->n_live_tuples = livetuples; + tabentry->n_dead_tuples = deadtuples; + + /* + * If commanded, reset changes_since_analyze to zero. This forgets any + * changes that were committed while the ANALYZE was in progress, but we + * have no good way to estimate how many of those there were. + */ + if (resetcounter) + tabentry->changes_since_analyze = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_analyze_timestamp = GetCurrentTimestamp(); + tabentry->autovac_analyze_count++; + } + else + { + tabentry->analyze_timestamp = GetCurrentTimestamp(); + tabentry->analyze_count++; + } + + pgstat_unlock_entry(entry_ref); } /* @@ -357,29 +426,60 @@ pgstat_update_heap_dead_tuples(Relation rel, int delta) } /* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one table or NULL. NULL doesn't mean + * that the table doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry(Oid relid) +{ + PgStat_StatTabEntry *tabentry; + + tabentry = pgstat_fetch_stat_tabentry_ext(false, relid); + if (tabentry != NULL) + return tabentry; + + /* + * If we didn't find it, maybe it's a shared table. + */ + tabentry = pgstat_fetch_stat_tabentry_ext(true, relid); + return tabentry; +} + +/* + * More efficient version of pgstat_fetch_stat_tabentry(), allowing to specify + * whether the to-be-accessed table is a shared relation or not. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry_ext(bool shared, Oid reloid) +{ + Oid dboid = (shared ? InvalidOid : MyDatabaseId); + + return (PgStat_StatTabEntry *) + pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, reloid); +} + +/* * find any existing PgStat_TableStatus entry for rel * - * If no entry, return NULL, don't create a new one + * Find any existing PgStat_TableStatus entry for rel_id in the current + * database. If not found, try finding from shared tables. * - * Note: if we got an error in the most recent execution of pgstat_report_stat, - * it's possible that an entry exists but there's no hashtable entry for it. - * That's okay, we'll treat this case as "doesn't exist". 
+ * If no entry found, return NULL, don't create a new one */ PgStat_TableStatus * find_tabstat_entry(Oid rel_id) { - TabStatHashEntry *hash_entry; + PgStat_EntryRef *entry_ref; - /* If hashtable doesn't exist, there are no entries at all */ - if (!pgStatTabHash) - return NULL; + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, MyDatabaseId, rel_id); + if (!entry_ref) + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, InvalidOid, rel_id); - hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL); - if (!hash_entry) - return NULL; - - /* Note that this step could also return NULL, but that's correct */ - return hash_entry->tsa_entry; + if (entry_ref) + return entry_ref->pending; + return NULL; } /* @@ -536,7 +636,7 @@ AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) for (trans = xact_state->first; trans != NULL; trans = trans->next) { - PgStat_TableStatus *tabstat; + PgStat_TableStatus *tabstat PG_USED_FOR_ASSERTS_ONLY; TwoPhasePgStatRecord record; Assert(trans->nest_level == 1); @@ -594,7 +694,7 @@ pgstat_twophase_postcommit(TransactionId xid, uint16 info, PgStat_TableStatus *pgstat_info; /* Find or create a tabstat entry for the rel */ - pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared); /* Same math as in AtEOXact_PgStat, commit case */ pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted; @@ -630,7 +730,7 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, PgStat_TableStatus *pgstat_info; /* Find or create a tabstat entry for the rel */ - pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared); /* Same math as in AtEOXact_PgStat, abort case */ if (rec->t_truncdropped) @@ -647,204 +747,116 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, } /* - * Subroutine for pgstat_report_stat: Send relation statistics + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + * + * Some of the stats are copied to the corresponding pending database stats + * entry when successfully flushing. */ -void -pgstat_send_tabstats(TimestampTz now, bool disconnect) +bool +pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) { - /* we assume this inits to all zeroes: */ static const PgStat_TableCounts all_zeroes; - PgStat_MsgTabstat regular_msg; - PgStat_MsgTabstat shared_msg; - TabStatusArray *tsa; - int i; + Oid dboid; + PgStat_TableStatus *lstats; /* pending stats entry */ + PgStatShared_Relation *shtabstats; + PgStat_StatTabEntry *tabentry; /* table entry of shared stats */ + PgStat_StatDBEntry *dbentry; /* pending database entry */ - /* - * Destroy pgStatTabHash before we start invalidating PgStat_TableEntry - * entries it points to. (Should we fail partway through the loop below, - * it's okay to have removed the hashtable already --- the only - * consequence is we'd get multiple entries for the same table in the - * pgStatTabList, and that's safe.) - */ - if (pgStatTabHash) - hash_destroy(pgStatTabHash); - pgStatTabHash = NULL; + dboid = entry_ref->shared_entry->key.dboid; + lstats = (PgStat_TableStatus *) entry_ref->pending; + shtabstats = (PgStatShared_Relation *) entry_ref->shared_stats; /* - * Scan through the TabStatusArray struct(s) to find tables that actually - * have counts, and build messages to send. 
We have to separate shared - * relations from regular ones because the databaseid field in the message - * header has to depend on that. + * Ignore entries that didn't accumulate any actual counts, such as + * indexes that were opened by the planner but not used. */ - regular_msg.m_databaseid = MyDatabaseId; - shared_msg.m_databaseid = InvalidOid; - regular_msg.m_nentries = 0; - shared_msg.m_nentries = 0; - - for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next) + if (memcmp(&lstats->t_counts, &all_zeroes, + sizeof(PgStat_TableCounts)) == 0) { - for (i = 0; i < tsa->tsa_used; i++) - { - PgStat_TableStatus *entry = &tsa->tsa_entries[i]; - PgStat_MsgTabstat *this_msg; - PgStat_TableEntry *this_ent; + return true; + } - /* Shouldn't have any pending transaction-dependent counts */ - Assert(entry->trans == NULL); + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; - /* - * Ignore entries that didn't accumulate any actual counts, such - * as indexes that were opened by the planner but not used. - */ - if (memcmp(&entry->t_counts, &all_zeroes, - sizeof(PgStat_TableCounts)) == 0) - continue; + /* add the values to the shared entry. */ + tabentry = &shtabstats->stats; - /* - * OK, insert data into the appropriate message, and send if full. - */ - this_msg = entry->t_shared ? &shared_msg : ®ular_msg; - this_ent = &this_msg->m_entry[this_msg->m_nentries]; - this_ent->t_id = entry->t_id; - memcpy(&this_ent->t_counts, &entry->t_counts, - sizeof(PgStat_TableCounts)); - if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES) - { - pgstat_send_tabstat(this_msg, now); - this_msg->m_nentries = 0; - } - } - /* zero out PgStat_TableStatus structs after use */ - MemSet(tsa->tsa_entries, 0, - tsa->tsa_used * sizeof(PgStat_TableStatus)); - tsa->tsa_used = 0; - } + tabentry->numscans += lstats->t_counts.t_numscans; + tabentry->tuples_returned += lstats->t_counts.t_tuples_returned; + tabentry->tuples_fetched += lstats->t_counts.t_tuples_fetched; + tabentry->tuples_inserted += lstats->t_counts.t_tuples_inserted; + tabentry->tuples_updated += lstats->t_counts.t_tuples_updated; + tabentry->tuples_deleted += lstats->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += lstats->t_counts.t_tuples_hot_updated; /* - * Send partial messages. Make sure that any pending xact commit/abort - * and connection stats get counted, even if there are no table stats to - * send. + * If table was truncated/dropped, first reset the live/dead counters. 
*/ - if (regular_msg.m_nentries > 0 || - pgStatXactCommit > 0 || pgStatXactRollback > 0 || disconnect) - pgstat_send_tabstat(®ular_msg, now); - if (shared_msg.m_nentries > 0) - pgstat_send_tabstat(&shared_msg, now); + if (lstats->t_counts.t_truncdropped) + { + tabentry->n_live_tuples = 0; + tabentry->n_dead_tuples = 0; + tabentry->inserts_since_vacuum = 0; + } - have_relation_stats = false; + tabentry->n_live_tuples += lstats->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples += lstats->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze += lstats->t_counts.t_changed_tuples; + tabentry->inserts_since_vacuum += lstats->t_counts.t_tuples_inserted; + tabentry->blocks_fetched += lstats->t_counts.t_blocks_fetched; + tabentry->blocks_hit += lstats->t_counts.t_blocks_hit; + + /* Clamp n_live_tuples in case of negative delta_live_tuples */ + tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + + pgstat_unlock_entry(entry_ref); + + /* The entry was successfully flushed, add the same to database stats */ + dbentry = pgstat_prep_database_pending(dboid); + dbentry->n_tuples_returned += lstats->t_counts.t_tuples_returned; + dbentry->n_tuples_fetched += lstats->t_counts.t_tuples_fetched; + dbentry->n_tuples_inserted += lstats->t_counts.t_tuples_inserted; + dbentry->n_tuples_updated += lstats->t_counts.t_tuples_updated; + dbentry->n_tuples_deleted += lstats->t_counts.t_tuples_deleted; + dbentry->n_blocks_fetched += lstats->t_counts.t_blocks_fetched; + dbentry->n_blocks_hit += lstats->t_counts.t_blocks_hit; + + return true; } -/* - * Subroutine for pgstat_send_tabstats: finish and send one tabstat message - */ -static void -pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now) +void +pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref) { - int n; - int len; - - /* It's unlikely we'd get here with no socket, but maybe not impossible */ - if (pgStatSock == PGINVALID_SOCKET) - return; + PgStat_TableStatus *pending = (PgStat_TableStatus *) entry_ref->pending; - /* - * Report and reset accumulated xact commit/rollback and I/O timings - * whenever we send a normal tabstat message - */ - pgstat_update_dbstats(tsmsg, now); - - n = tsmsg->m_nentries; - len = offsetof(PgStat_MsgTabstat, m_entry[0]) + - n * sizeof(PgStat_TableEntry); - - pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT); - pgstat_send(tsmsg, len); + if (pending->relation) + pgstat_unlink_relation(pending->relation); } /* - * find or create a PgStat_TableStatus entry for rel + * Find or create a PgStat_TableStatus entry for rel. New entry is created and + * initialized if not exists. */ static PgStat_TableStatus * -get_tabstat_entry(Oid rel_id, bool isshared) +pgstat_prep_relation_pending(Oid rel_id, bool isshared) { - TabStatHashEntry *hash_entry; - PgStat_TableStatus *entry; - TabStatusArray *tsa; - bool found; - - pgstat_assert_is_up(); + PgStat_EntryRef *entry_ref; + PgStat_TableStatus *pending; - have_relation_stats = true; - - /* - * Create hash table if we don't have it already. - */ - if (pgStatTabHash == NULL) - { - HASHCTL ctl; - - ctl.keysize = sizeof(Oid); - ctl.entrysize = sizeof(TabStatHashEntry); - - pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table", - TABSTAT_QUANTUM, - &ctl, - HASH_ELEM | HASH_BLOBS); - } - - /* - * Find an entry or create a new one. 
- */ - hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, &found); - if (!found) - { - /* initialize new entry with null pointer */ - hash_entry->tsa_entry = NULL; - } - - /* - * If entry is already valid, we're done. - */ - if (hash_entry->tsa_entry) - return hash_entry->tsa_entry; - - /* - * Locate the first pgStatTabList entry with free space, making a new list - * entry if needed. Note that we could get an OOM failure here, but if so - * we have left the hashtable and the list in a consistent state. - */ - if (pgStatTabList == NULL) - { - /* Set up first pgStatTabList entry */ - pgStatTabList = (TabStatusArray *) - MemoryContextAllocZero(TopMemoryContext, - sizeof(TabStatusArray)); - } - - tsa = pgStatTabList; - while (tsa->tsa_used >= TABSTAT_QUANTUM) - { - if (tsa->tsa_next == NULL) - tsa->tsa_next = (TabStatusArray *) - MemoryContextAllocZero(TopMemoryContext, - sizeof(TabStatusArray)); - tsa = tsa->tsa_next; - } - - /* - * Allocate a PgStat_TableStatus entry within this list entry. We assume - * the entry was already zeroed, either at creation or after last use. - */ - entry = &tsa->tsa_entries[tsa->tsa_used++]; - entry->t_id = rel_id; - entry->t_shared = isshared; - - /* - * Now we can fill the entry in pgStatTabHash. - */ - hash_entry->tsa_entry = entry; + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_RELATION, + isshared ? InvalidOid : MyDatabaseId, + rel_id, NULL); + pending = entry_ref->pending; + pending->t_id = rel_id; + pending->t_shared = isshared; - return entry; + return pending; } /* diff --git a/src/backend/utils/activity/pgstat_replslot.c b/src/backend/utils/activity/pgstat_replslot.c index ceefc5d59b3..b77c05ab5fa 100644 --- a/src/backend/utils/activity/pgstat_replslot.c +++ b/src/backend/utils/activity/pgstat_replslot.c @@ -8,6 +8,14 @@ * storage implementation and the details about individual types of * statistics. * + * Replication slot stats work a bit different than other other + * variable-numbered stats. Slots do not have oids (so they can be created on + * physical replicas). Use the slot index as object id while running. However, + * the slot index can change when restarting. That is addressed by using the + * name when (de-)serializing. After a restart it is possible for slots to + * have been dropped while shut down, which is addressed by not restoring + * stats for slots that cannot be found by name when starting up. + * * Copyright (c) 2001-2022, PostgreSQL Global Development Group * * IDENTIFICATION @@ -22,6 +30,9 @@ #include "utils/pgstat_internal.h" +static int get_replslot_index(const char *name); + + /* * Reset counters for a single replication slot. * @@ -32,18 +43,10 @@ void pgstat_reset_replslot(const char *name) { ReplicationSlot *slot; - PgStat_MsgResetreplslotcounter msg; AssertArg(name != NULL); - if (pgStatSock == PGINVALID_SOCKET) - return; - - /* - * Check if the slot exists with the given name. It is possible that by - * the time this message is executed the slot is dropped but at least this - * check will ensure that the given name is for a valid slot. - */ + /* Check if the slot exits with the given name. 
*/ slot = SearchNamedReplicationSlot(name, true); if (!slot) @@ -59,10 +62,9 @@ pgstat_reset_replslot(const char *name) if (SlotIsPhysical(slot)) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETREPLSLOTCOUNTER); - namestrcpy(&msg.m_slotname, name); - msg.clearall = false; - pgstat_send(&msg, sizeof(msg)); + /* reset this one entry */ + pgstat_reset(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot)); } /* @@ -71,24 +73,34 @@ pgstat_reset_replslot(const char *name) void pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat) { - PgStat_MsgReplSlot msg; + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + PgStat_StatReplSlotEntry *statent; + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + statent = &shstatent->stats; /* - * Prepare and send the message + * Any mismatch should have been fixed in pgstat_create_replslot() or + * pgstat_acquire_replslot(). */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(repSlotStat->slotname)); - msg.m_create = false; - msg.m_drop = false; - msg.m_spill_txns = repSlotStat->spill_txns; - msg.m_spill_count = repSlotStat->spill_count; - msg.m_spill_bytes = repSlotStat->spill_bytes; - msg.m_stream_txns = repSlotStat->stream_txns; - msg.m_stream_count = repSlotStat->stream_count; - msg.m_stream_bytes = repSlotStat->stream_bytes; - msg.m_total_txns = repSlotStat->total_txns; - msg.m_total_bytes = repSlotStat->total_bytes; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + Assert(namestrcmp(&statent->slotname, NameStr(slot->data.name)) == 0); + + /* Update the replication slot statistics */ +#define REPLSLOT_ACC(fld) statent->fld += repSlotStat->fld + REPLSLOT_ACC(spill_txns); + REPLSLOT_ACC(spill_count); + REPLSLOT_ACC(spill_bytes); + REPLSLOT_ACC(stream_txns); + REPLSLOT_ACC(stream_count); + REPLSLOT_ACC(stream_bytes); + REPLSLOT_ACC(total_txns); + REPLSLOT_ACC(total_bytes); +#undef REPLSLOT_ACC + + pgstat_unlock_entry(entry_ref); } /* @@ -100,13 +112,50 @@ pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *re void pgstat_create_replslot(ReplicationSlot *slot) { - PgStat_MsgReplSlot msg; + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slot->data.name)); - msg.m_create = true; - msg.m_drop = false; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + /* + * NB: need to accept that there might be stats from an older slot, e.g. + * if we previously crashed after dropping a slot. + */ + memset(&shstatent->stats, 0, sizeof(shstatent->stats)); + namestrcpy(&shstatent->stats.slotname, NameStr(slot->data.name)); + + pgstat_unlock_entry(entry_ref); +} + +/* + * Report replication slot has been acquired. 
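The REPLSLOT_ACC pattern used above - a local #define that accumulates each counter field and is immediately #undef'd - is worth isolating, since the same idiom reappears below as SUB_ACC, SLRU_ACC, and WALSTAT_ACC. A minimal self-contained sketch; the SlotStats type is a hypothetical stand-in for PgStat_StatReplSlotEntry:

#include <stdio.h>

typedef struct SlotStats
{
    long    spill_txns;
    long    spill_count;
    long    total_txns;
} SlotStats;

/* fold every counter field of src into dst without hand-writing each
 * assignment; the macro exists only for the duration of the list */
static void
slot_stats_accumulate(SlotStats *dst, const SlotStats *src)
{
#define ACC(fld) dst->fld += src->fld
    ACC(spill_txns);
    ACC(spill_count);
    ACC(total_txns);
#undef ACC
}

int
main(void)
{
    SlotStats   shared = {0};
    SlotStats   pending = {.spill_txns = 2, .spill_count = 5, .total_txns = 7};

    slot_stats_accumulate(&shared, &pending);
    printf("%ld %ld %ld\n", shared.spill_txns, shared.spill_count,
           shared.total_txns);
    return 0;
}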
+ */ +void +pgstat_acquire_replslot(ReplicationSlot *slot) +{ + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + PgStat_StatReplSlotEntry *statent; + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + statent = &shstatent->stats; + + /* + * NB: need to accept that there might be stats from an older slot, e.g. + * if we previously crashed after dropping a slot. + */ + if (NameStr(statent->slotname)[0] == 0 || + namestrcmp(&statent->slotname, NameStr(slot->data.name)) != 0) + { + memset(statent, 0, sizeof(*statent)); + namestrcpy(&statent->slotname, NameStr(slot->data.name)); + } + + pgstat_unlock_entry(entry_ref); } /* @@ -115,11 +164,65 @@ pgstat_create_replslot(ReplicationSlot *slot) void pgstat_drop_replslot(ReplicationSlot *slot) { - PgStat_MsgReplSlot msg; + pgstat_drop_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot)); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the replication slot statistics struct. + */ +PgStat_StatReplSlotEntry * +pgstat_fetch_replslot(NameData slotname) +{ + int idx = get_replslot_index(NameStr(slotname)); + + if (idx == -1) + return NULL; + + return (PgStat_StatReplSlotEntry *) + pgstat_fetch_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, idx); +} + +void +pgstat_replslot_to_serialized_name_cb(const PgStatShared_Common *header, NameData *name) +{ + namestrcpy(name, NameStr(((PgStatShared_ReplSlot *) header)->stats.slotname)); +} + +bool +pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key) +{ + int idx = get_replslot_index(NameStr(*name)); + + /* slot might have been deleted */ + if (idx == -1) + return false; + + key->kind = PGSTAT_KIND_REPLSLOT; + key->dboid = InvalidOid; + key->objoid = idx; + + return true; +} + +void +pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_ReplSlot *) header)->stats.stat_reset_timestamp = ts; +} + +static int +get_replslot_index(const char *name) +{ + ReplicationSlot *slot; + + AssertArg(name != NULL); + + slot = SearchNamedReplicationSlot(name, true); + + if (!slot) + return -1; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slot->data.name)); - msg.m_create = false; - msg.m_drop = true; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + return ReplicationSlotIndex(slot); } diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c new file mode 100644 index 00000000000..a32740b2f6e --- /dev/null +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -0,0 +1,987 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_shmem.c + * Storage of stats entries in shared memory + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_shmem.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgstat.h" +#include "storage/shmem.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" + + +#define PGSTAT_ENTRY_REF_HASH_SIZE 128 + +/* hash table entry for finding the PgStat_EntryRef for a key */ +typedef struct PgStat_EntryRefHashEntry +{ + PgStat_HashKey key; /* hash key */ + char status; /* for simplehash use */ + PgStat_EntryRef *entry_ref; +} PgStat_EntryRefHashEntry; + + +/* for 
references to shared statistics entries */ +#define SH_PREFIX pgstat_entry_ref_hash +#define SH_ELEMENT_TYPE PgStat_EntryRefHashEntry +#define SH_KEY_TYPE PgStat_HashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL) +#define SH_EQUAL(tb, a, b) \ + pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0 +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + + +static void pgstat_drop_database_and_contents(Oid dboid); + +static void pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat); + +static void pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, bool discard_pending); +static bool pgstat_need_entry_refs_gc(void); +static void pgstat_gc_entry_refs(void); +static void pgstat_release_all_entry_refs(bool discard_pending); +typedef bool (*ReleaseMatchCB) (PgStat_EntryRefHashEntry *, Datum data); +static void pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, Datum match_data); + +static void pgstat_setup_memcxt(void); + + +/* parameter for the shared hash */ +static const dshash_parameters dsh_params = { + sizeof(PgStat_HashKey), + sizeof(PgStatShared_HashEntry), + pgstat_cmp_hash_key, + pgstat_hash_hash_key, + LWTRANCHE_PGSTATS_HASH +}; + + +/* + * Backend local references to shared stats entries. If there are pending + * updates to a stats entry, the PgStat_EntryRef is added to the pgStatPending + * list. + * + * When a stats entry is dropped each backend needs to release its reference + * to it before the memory can be released. To trigger that + * pgStatLocal.shmem->gc_request_count is incremented - which each backend + * compares to its copy of pgStatSharedRefAge on a regular basis. + */ +static pgstat_entry_ref_hash_hash *pgStatEntryRefHash = NULL; +static int pgStatSharedRefAge = 0; /* cache age of pgStatShmLookupCache */ + +/* + * Memory contexts containing the pgStatEntryRefHash table and the + * pgStatSharedRef entries respectively. Kept separate to make it easier to + * track / attribute memory usage. + */ +static MemoryContext pgStatSharedRefContext = NULL; +static MemoryContext pgStatEntryRefHashContext = NULL; + + +/* ------------------------------------------------------------ + * Public functions called from postmaster follow + * ------------------------------------------------------------ + */ + +/* + * The size of the shared memory allocation for stats stored in the shared + * stats hash table. This allocation will be done as part of the main shared + * memory, rather than dynamic shared memory, allowing it to be initialized in + * postmaster. + */ +static Size +pgstat_dsa_init_size(void) +{ + Size sz; + + /* + * The dshash header / initial buckets array needs to fit into "plain" + * shared memory, but it's beneficial to not need dsm segments + * immediately. A size of 256kB seems to work well and is not + * disproportionate compared to other constant-sized shared memory + * allocations. NB: To further avoid DSMs, the user can configure + * min_dynamic_shared_memory.
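The MAXALIGN()-style rounding used just below when sizing this carve-out is easy to demonstrate in portable C11. The ALIGN_UP macro is an illustrative stand-in for PostgreSQL's MAXALIGN, assuming the platform's maximum alignment is a power of two (true on the platforms PostgreSQL supports):

#include <stdio.h>
#include <stddef.h>

/* round sz up to the next multiple of the platform's strictest alignment */
#define MAX_ALIGNMENT _Alignof(max_align_t)
#define ALIGN_UP(sz) (((sz) + MAX_ALIGNMENT - 1) & ~(MAX_ALIGNMENT - 1))

int
main(void)
{
    size_t init_size = 256 * 1024;  /* the fixed carve-out discussed above */
    size_t header = 61;             /* some odd-sized header, for illustration */

    /* total request: header plus the DSA carve-out, each maximally aligned */
    size_t total = ALIGN_UP(header) + ALIGN_UP(init_size);

    printf("header %zu -> %zu, total %zu\n",
           header, (size_t) ALIGN_UP(header), total);
    return 0;
}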
+ */ + sz = 256 * 1024; + Assert(dsa_minimum_size() <= sz); + return MAXALIGN(sz); +} + +/* + * Compute shared memory space needed for cumulative statistics + */ +Size +StatsShmemSize(void) +{ + Size sz; + + sz = MAXALIGN(sizeof(PgStat_ShmemControl)); + sz = add_size(sz, pgstat_dsa_init_size()); + + return sz; +} + +/* + * Initialize cumulative statistics system during startup + */ +void +StatsShmemInit(void) +{ + bool found; + Size sz; + + sz = StatsShmemSize(); + pgStatLocal.shmem = (PgStat_ShmemControl *) + ShmemInitStruct("Shared Memory Stats", sz, &found); + + if (!IsUnderPostmaster) + { + dsa_area *dsa; + dshash_table *dsh; + PgStat_ShmemControl *ctl = pgStatLocal.shmem; + char *p = (char *) ctl; + + Assert(!found); + + /* the allocation of pgStatLocal.shmem itself */ + p += MAXALIGN(sizeof(PgStat_ShmemControl)); + + /* + * Create a small dsa allocation in plain shared memory. This is + * required because postmaster cannot use dsm segments. It also + * provides a small efficiency win. + */ + ctl->raw_dsa_area = p; + p += MAXALIGN(pgstat_dsa_init_size()); + dsa = dsa_create_in_place(ctl->raw_dsa_area, + pgstat_dsa_init_size(), + LWTRANCHE_PGSTATS_DSA, 0); + dsa_pin(dsa); + + /* + * To ensure dshash is created in "plain" shared memory, temporarily + * limit size of dsa to the initial size of the dsa. + */ + dsa_set_size_limit(dsa, pgstat_dsa_init_size()); + + /* + * With the limit in place, create the dshash table. XXX: It'd be nice + * if there were dshash_create_in_place(). + */ + dsh = dshash_create(dsa, &dsh_params, 0); + ctl->hash_handle = dshash_get_hash_table_handle(dsh); + + /* lift limit set above */ + dsa_set_size_limit(dsa, -1); + + /* + * Postmaster will never access these again, thus free the local + * dsa/dshash references. + */ + dshash_detach(dsh); + dsa_detach(dsa); + + pg_atomic_init_u64(&ctl->gc_request_count, 1); + + + /* initialize fixed-numbered stats */ + LWLockInitialize(&ctl->archiver.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->bgwriter.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->checkpointer.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->slru.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->wal.lock, LWTRANCHE_PGSTATS_DATA); + } + else + { + Assert(found); + } +} + +void +pgstat_attach_shmem(void) +{ + MemoryContext oldcontext; + + Assert(pgStatLocal.dsa == NULL); + + /* stats shared memory persists for the backend lifetime */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + pgStatLocal.dsa = dsa_attach_in_place(pgStatLocal.shmem->raw_dsa_area, + NULL); + dsa_pin_mapping(pgStatLocal.dsa); + + pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params, + pgStatLocal.shmem->hash_handle, 0); + + MemoryContextSwitchTo(oldcontext); +} + +void +pgstat_detach_shmem(void) +{ + Assert(pgStatLocal.dsa); + + /* we shouldn't leave references to shared stats */ + pgstat_release_all_entry_refs(false); + + dshash_detach(pgStatLocal.shared_hash); + pgStatLocal.shared_hash = NULL; + + dsa_detach(pgStatLocal.dsa); + pgStatLocal.dsa = NULL; +} + + +/* ------------------------------------------------------------ + * Maintenance of shared memory stats entries + * ------------------------------------------------------------ + */ + +PgStatShared_Common * +pgstat_init_entry(PgStat_Kind kind, + PgStatShared_HashEntry *shhashent) +{ + /* Create new stats entry. */ + dsa_pointer chunk; + PgStatShared_Common *shheader; + + /* + * Initialize refcount to 1, marking it as valid / not dropped. 
The entry + * can't be freed before the initialization because it can't be found as + * long as we hold the dshash partition lock. Caller needs to increase + * further if a longer lived reference is needed. + */ + pg_atomic_init_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size); + shheader = dsa_get_address(pgStatLocal.dsa, chunk); + shheader->magic = 0xdeadbeef; + + /* Link the new entry from the hash entry. */ + shhashent->body = chunk; + + LWLockInitialize(&shheader->lock, LWTRANCHE_PGSTATS_DATA); + + return shheader; +} + +static PgStatShared_Common * +pgstat_reinit_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent) +{ + PgStatShared_Common *shheader; + + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + + /* mark as not dropped anymore */ + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + /* reinitialize content */ + Assert(shheader->magic == 0xdeadbeef); + memset(shheader, 0, pgstat_get_kind_info(shhashent->key.kind)->shared_size); + shheader->magic = 0xdeadbeef; + + return shheader; +} + +static void +pgstat_setup_shared_refs(void) +{ + if (likely(pgStatEntryRefHash != NULL)) + return; + + pgStatEntryRefHash = + pgstat_entry_ref_hash_create(pgStatEntryRefHashContext, + PGSTAT_ENTRY_REF_HASH_SIZE, NULL); + pgStatSharedRefAge = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(pgStatSharedRefAge != 0); +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static void +pgstat_acquire_entry_ref(PgStat_EntryRef *entry_ref, + PgStatShared_HashEntry *shhashent, + PgStatShared_Common *shheader) +{ + Assert(shheader->magic == 0xdeadbeef); + Assert(pg_atomic_read_u32(&shhashent->refcount) > 0); + + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + + entry_ref->shared_stats = shheader; + entry_ref->shared_entry = shhashent; +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static bool +pgstat_get_entry_ref_cached(PgStat_HashKey key, PgStat_EntryRef **entry_ref_p) +{ + bool found; + PgStat_EntryRefHashEntry *cache_entry; + + /* + * We immediately insert a cache entry, because it avoids 1) multiple + * hashtable lookups in case of a cache miss 2) having to deal with + * out-of-memory errors after incrementing PgStatShared_Common->refcount. + */ + + cache_entry = pgstat_entry_ref_hash_insert(pgStatEntryRefHash, key, &found); + + if (!found || !cache_entry->entry_ref) + { + PgStat_EntryRef *entry_ref; + + cache_entry->entry_ref = entry_ref = + MemoryContextAlloc(pgStatSharedRefContext, + sizeof(PgStat_EntryRef)); + entry_ref->shared_stats = NULL; + entry_ref->shared_entry = NULL; + entry_ref->pending = NULL; + + found = false; + } + else if (cache_entry->entry_ref->shared_stats == NULL) + { + Assert(cache_entry->entry_ref->pending == NULL); + found = false; + } + else + { + PgStat_EntryRef *entry_ref PG_USED_FOR_ASSERTS_ONLY; + + entry_ref = cache_entry->entry_ref; + Assert(entry_ref->shared_entry != NULL); + Assert(entry_ref->shared_stats != NULL); + + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + /* should have at least our reference */ + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) > 0); + } + + *entry_ref_p = cache_entry->entry_ref; + return found; +} + +/* + * Get a shared stats reference. If create is true, the shared stats object is + * created if it does not exist. 
+ * + * When create is true, and created_entry is non-NULL, it'll be set to true + * if the entry is newly created, false otherwise. + */ +PgStat_EntryRef * +pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, bool create, + bool *created_entry) +{ + PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid}; + PgStatShared_HashEntry *shhashent; + PgStatShared_Common *shheader = NULL; + PgStat_EntryRef *entry_ref; + + /* + * Passing in created_entry only makes sense if we could possibly create + * the entry. + */ + AssertArg(create || created_entry == NULL); + pgstat_assert_is_up(); + Assert(pgStatLocal.shared_hash != NULL); + Assert(!pgStatLocal.shmem->is_shutdown); + + pgstat_setup_memcxt(); + pgstat_setup_shared_refs(); + + if (created_entry != NULL) + *created_entry = false; + + /* + * Check if other backends dropped stats that could not be deleted because + * somebody held references to them. If so, check this backend's references. + * This is not expected to happen often. The location of the check is a + * bit random, but this is a relatively frequently called path, so better + * than most. + */ + if (pgstat_need_entry_refs_gc()) + pgstat_gc_entry_refs(); + + /* + * First check the lookup cache hashtable in local memory. If we find a + * match here we can avoid taking locks / causing contention. + */ + if (pgstat_get_entry_ref_cached(key, &entry_ref)) + return entry_ref; + + Assert(entry_ref != NULL); + + /* + * Do a lookup in the hash table first - it's quite likely that the entry + * already exists, and that way we only need a shared lock. + */ + shhashent = dshash_find(pgStatLocal.shared_hash, &key, false); + + if (create && !shhashent) + { + bool shfound; + + /* + * It's possible that somebody created the entry since the above + * lookup. If so, fall through to the same path we'd have taken if it + * had already been created before the dshash_find() call. + */ + shhashent = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &shfound); + if (!shfound) + { + shheader = pgstat_init_entry(kind, shhashent); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + if (created_entry != NULL) + *created_entry = true; + + return entry_ref; + } + } + + if (!shhashent) + { + /* + * If we're not creating, delete the reference again. In all + * likelihood it's just a stats lookup - no point wasting memory for a + * shared ref to nothing... + */ + pgstat_release_entry_ref(key, entry_ref, false); + + return NULL; + } + else + { + /* + * Can get here either because dshash_find() found a match, or if + * dshash_find_or_insert() found a concurrently inserted entry. + */ + + if (shhashent->dropped && create) + { + /* + * There are legitimate cases where the old stats entry might not + * yet have been dropped by the time it's reused. The most obvious + * case is replication slot stats, where a new slot can be + * created with the same index just after dropping. But oid + * wraparound can lead to other cases as well. We just reset the + * stats to their plain state.
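The lookup dance above - an optimistic shared-lock probe, then an exclusive find-or-insert that must cope with a concurrent winner - is a general pattern. A minimal sketch under POSIX rwlocks with a fixed-size table; all names are illustrative, and the existed flag plays the role of dshash_find_or_insert()'s found out-parameter:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define TABLE_SIZE 16

typedef struct Entry
{
    int     key;
    bool    used;
} Entry;

static Entry table[TABLE_SIZE];
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

/* caller must hold table_lock in some mode */
static Entry *
find_entry(int key)
{
    for (int i = 0; i < TABLE_SIZE; i++)
        if (table[i].used && table[i].key == key)
            return &table[i];
    return NULL;
}

/* read-locked probe first; on a miss, retry under the write lock and
 * re-check, because another thread may have inserted the key between the
 * two lock acquisitions */
static Entry *
find_or_insert(int key, bool *existed)
{
    Entry      *e;

    pthread_rwlock_rdlock(&table_lock);
    e = find_entry(key);
    pthread_rwlock_unlock(&table_lock);
    if (e)
    {
        *existed = true;
        return e;
    }

    pthread_rwlock_wrlock(&table_lock);
    e = find_entry(key);        /* somebody may have won the race */
    *existed = (e != NULL);
    if (e == NULL)
    {
        for (int i = 0; i < TABLE_SIZE; i++)
        {
            if (!table[i].used)
            {
                table[i].used = true;
                table[i].key = key;
                e = &table[i];
                break;
            }
        }
    }
    pthread_rwlock_unlock(&table_lock);
    return e;                   /* NULL only if the table is full */
}

int
main(void)
{
    bool        existed;
    Entry      *e = find_or_insert(42, &existed);

    printf("inserted fresh entry: %s\n", (e && !existed) ? "yes" : "no");
    return 0;
}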
+ */ + shheader = pgstat_reinit_entry(kind, shhashent); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + if (created_entry != NULL) + *created_entry = true; + + return entry_ref; + } + else if (shhashent->dropped) + { + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + pgstat_release_entry_ref(key, entry_ref, false); + + return NULL; + } + else + { + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + return entry_ref; + } + } +} + +static void +pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, + bool discard_pending) +{ + if (entry_ref && entry_ref->pending) + { + if (discard_pending) + pgstat_delete_pending_entry(entry_ref); + else + elog(ERROR, "releasing ref with pending data"); + } + + if (entry_ref && entry_ref->shared_stats) + { + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + Assert(entry_ref->pending == NULL); + + /* + * This can't race with another backend looking up the stats entry and + * increasing the refcount because it is not "legal" to create + * additional references to dropped entries. + */ + if (pg_atomic_fetch_sub_u32(&entry_ref->shared_entry->refcount, 1) == 1) + { + PgStatShared_HashEntry *shent; + + /* + * We're the last referrer to this entry, try to drop the shared + * entry. + */ + + /* only dropped entries can reach a 0 refcount */ + Assert(entry_ref->shared_entry->dropped); + + shent = dshash_find(pgStatLocal.shared_hash, + &entry_ref->shared_entry->key, + true); + if (!shent) + elog(ERROR, "could not find just referenced shared stats entry"); + + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) == 0); + Assert(entry_ref->shared_entry == shent); + + pgstat_free_entry(shent, NULL); + } + } + + if (!pgstat_entry_ref_hash_delete(pgStatEntryRefHash, key)) + elog(ERROR, "entry ref vanished before deletion"); + + if (entry_ref) + pfree(entry_ref); +} + +bool +pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait) +{ + LWLock *lock = &entry_ref->shared_stats->lock; + + if (nowait) + return LWLockConditionalAcquire(lock, LW_EXCLUSIVE); + + LWLockAcquire(lock, LW_EXCLUSIVE); + return true; +} + +void +pgstat_unlock_entry(PgStat_EntryRef *entry_ref) +{ + LWLockRelease(&entry_ref->shared_stats->lock); +} + +/* + * Helper function to fetch and lock shared stats. + */ +PgStat_EntryRef * +pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid, + bool nowait) +{ + PgStat_EntryRef *entry_ref; + + /* find shared table stats entry corresponding to the local entry */ + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, true, NULL); + + /* lock the shared entry to protect the content, skip if failed */ + if (!pgstat_lock_entry(entry_ref, nowait)) + return NULL; + + return entry_ref; +} + +void +pgstat_request_entry_refs_gc(void) +{ + pg_atomic_fetch_add_u64(&pgStatLocal.shmem->gc_request_count, 1); +} + +static bool +pgstat_need_entry_refs_gc(void) +{ + uint64 curage; + + if (!pgStatEntryRefHash) + return false; + + /* should have been initialized when creating pgStatEntryRefHash */ + Assert(pgStatSharedRefAge != 0); + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + + return pgStatSharedRefAge != curage; +} + +static void +pgstat_gc_entry_refs(void) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + uint64 curage; + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(curage != 0); + + /* + * Some entries have been dropped. 
Invalidate cache pointer to them. + */ + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) != NULL) + { + PgStat_EntryRef *entry_ref = ent->entry_ref; + + Assert(!entry_ref->shared_stats || + entry_ref->shared_stats->magic == 0xdeadbeef); + + if (!entry_ref->shared_entry->dropped) + continue; + + /* cannot gc shared ref that has pending data */ + if (entry_ref->pending != NULL) + continue; + + pgstat_release_entry_ref(ent->key, entry_ref, false); + } + + pgStatSharedRefAge = curage; +} + +static void +pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, + Datum match_data) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + + if (pgStatEntryRefHash == NULL) + return; + + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) + != NULL) + { + Assert(ent->entry_ref != NULL); + + if (match && !match(ent, match_data)) + continue; + + pgstat_release_entry_ref(ent->key, ent->entry_ref, discard_pending); + } +} + +/* + * Release all local references to shared stats entries. + * + * When a process exits it cannot do so while still holding references onto + * stats entries, otherwise the shared stats entries could never be freed. + */ +static void +pgstat_release_all_entry_refs(bool discard_pending) +{ + if (pgStatEntryRefHash == NULL) + return; + + pgstat_release_matching_entry_refs(discard_pending, NULL, 0); + Assert(pgStatEntryRefHash->members == 0); + pgstat_entry_ref_hash_destroy(pgStatEntryRefHash); + pgStatEntryRefHash = NULL; +} + +static bool +match_db(PgStat_EntryRefHashEntry *ent, Datum match_data) +{ + Oid dboid = DatumGetObjectId(match_data); + + return ent->key.dboid == dboid; +} + +static void +pgstat_release_db_entry_refs(Oid dboid) +{ + pgstat_release_matching_entry_refs( /* discard pending = */ true, + match_db, + ObjectIdGetDatum(dboid)); +} + + +/* ------------------------------------------------------------ + * Dropping and resetting of stats entries + * ------------------------------------------------------------ + */ + +static void +pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat) +{ + dsa_pointer pdsa; + + /* + * Fetch dsa pointer before deleting entry - that way we can free the + * memory after releasing the lock. + */ + pdsa = shent->body; + + if (!hstat) + dshash_delete_entry(pgStatLocal.shared_hash, shent); + else + dshash_delete_current(hstat); + + dsa_free(pgStatLocal.dsa, pdsa); +} + +/* + * Helper for both pgstat_drop_database_and_contents() and + * pgstat_drop_entry(). If hstat is non-null delete the shared entry using + * dshash_delete_current(), otherwise use dshash_delete_entry(). In either + * case the entry needs to be already locked. + */ +static bool +pgstat_drop_entry_internal(PgStatShared_HashEntry *shent, + dshash_seq_status *hstat) +{ + Assert(shent->body != InvalidDsaPointer); + + /* should already have released local reference */ + if (pgStatEntryRefHash) + Assert(!pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, shent->key)); + + /* + * Signal that the entry is dropped - this will eventually cause other + * backends to release their references. 
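The refcount protocol used throughout this file - an entry starts at 1 for its own "not dropped" reference, and whoever decrements the count to zero frees the memory - can be shown with C11 atomics. This is a simplified model, not the dshash-backed implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct SharedEntry
{
    atomic_uint refcount;       /* starts at 1: the entry's own reference */
    bool        dropped;
} SharedEntry;

/* whoever brings the count to zero - the dropper or a later releasing
 * reader - frees the entry; only dropped entries can reach zero */
static void
release_ref(SharedEntry *e)
{
    if (atomic_fetch_sub(&e->refcount, 1) == 1)
        free(e);
}

int
main(void)
{
    SharedEntry *e = malloc(sizeof(*e));

    atomic_init(&e->refcount, 1);       /* creation: valid, not dropped */
    e->dropped = false;

    atomic_fetch_add(&e->refcount, 1);  /* a backend acquires a reference */

    e->dropped = true;  /* drop: mark, then ... */
    release_ref(e);     /* ... give up the entry's own reference (2 -> 1) */

    release_ref(e);     /* last backend reference (1 -> 0): actually frees */
    printf("freed by the last referrer\n");
    return 0;
}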
+ */ + if (shent->dropped) + elog(ERROR, "can only drop stats once"); + shent->dropped = true; + + /* release refcount marking entry as not dropped */ + if (pg_atomic_sub_fetch_u32(&shent->refcount, 1) == 0) + { + pgstat_free_entry(shent, hstat); + return true; + } + else + { + if (!hstat) + dshash_release_lock(pgStatLocal.shared_hash, shent); + return false; + } +} + +/* + * Drop stats for the database and all the objects inside that database. + */ +static void +pgstat_drop_database_and_contents(Oid dboid) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + uint64 not_freed_count = 0; + + Assert(OidIsValid(dboid)); + + Assert(pgStatLocal.shared_hash != NULL); + + /* + * This backend might very well be the only backend holding a reference to + * about-to-be-dropped entries. Ensure that we're not preventing it from + * being cleaned up till later. + * + * Doing this separately from the dshash iteration below avoids having to + * do so while holding a partition lock on the shared hashtable. + */ + pgstat_release_db_entry_refs(dboid); + + /* some of the dshash entries are to be removed, take exclusive lock. */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, true); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + if (p->dropped) + continue; + + if (p->key.dboid != dboid) + continue; + + if (!pgstat_drop_entry_internal(p, &hstat)) + { + /* + * Even statistics for a dropped database might currently be + * accessed (consider e.g. database stats for pg_stat_database). + */ + not_freed_count++; + } + } + dshash_seq_term(&hstat); + + /* + * If some of the stats data could not be freed, signal the reference + * holders to run garbage collection of their cached pgStatShmLookupCache. + */ + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +bool +pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid}; + PgStatShared_HashEntry *shent; + bool freed = true; + + /* delete local reference */ + if (pgStatEntryRefHash) + { + PgStat_EntryRefHashEntry *lohashent = + pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, key); + + if (lohashent) + pgstat_release_entry_ref(lohashent->key, lohashent->entry_ref, + true); + } + + /* mark entry in shared hashtable as deleted, drop if possible */ + shent = dshash_find(pgStatLocal.shared_hash, &key, true); + if (shent) + { + freed = pgstat_drop_entry_internal(shent, NULL); + + /* + * Database stats contain other stats. Drop those as well when + * dropping the database. XXX: Perhaps this should be done in a + * slightly more principled way? But not obvious what that'd look + * like, and so far this is the only case... 
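The gc_request_count scheme referenced above is a plain generation counter: droppers bump a shared counter when entries could not be freed immediately, and each backend compares it against a cached copy before doing any work. A standalone sketch with illustrative names (my_ref_age stands in for pgStatSharedRefAge):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* shared: bumped whenever dropped entries could not be freed immediately */
static atomic_ulong gc_request_count;

/* per-backend: the generation we last reconciled against */
static unsigned long my_ref_age;

static void
request_gc(void)
{
    atomic_fetch_add(&gc_request_count, 1);
}

/* cheap polled check: collect only if somebody requested it since we
 * last looked */
static bool
need_gc(void)
{
    return my_ref_age != atomic_load(&gc_request_count);
}

int
main(void)
{
    atomic_store(&gc_request_count, 1); /* never 0, so "unset" is detectable */
    my_ref_age = atomic_load(&gc_request_count);

    printf("need gc: %d\n", need_gc());     /* 0: nothing requested */
    request_gc();                           /* a backend dropped entries */
    printf("need gc: %d\n", need_gc());     /* 1: release our stale refs */
    my_ref_age = atomic_load(&gc_request_count);    /* gc done */
    printf("need gc: %d\n", need_gc());     /* 0 again */
    return 0;
}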
+ */ + if (key.kind == PGSTAT_KIND_DATABASE) + pgstat_drop_database_and_contents(key.dboid); + } + + return freed; +} + +void +pgstat_drop_all_entries(void) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *ps; + uint64 not_freed_count = 0; + + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((ps = dshash_seq_next(&hstat)) != NULL) + { + if (ps->dropped) + continue; + + if (!pgstat_drop_entry_internal(ps, &hstat)) + not_freed_count++; + } + dshash_seq_term(&hstat); + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +static void +shared_stat_reset_contents(PgStat_Kind kind, PgStatShared_Common *header, + TimestampTz ts) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + memset(pgstat_get_entry_data(kind, header), 0, + pgstat_get_entry_len(kind)); + + if (kind_info->reset_timestamp_cb) + kind_info->reset_timestamp_cb(header, ts); +} + +/* + * Reset one variable-numbered stats entry. + */ +void +pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts) +{ + PgStat_EntryRef *entry_ref; + + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + if (!entry_ref || entry_ref->shared_entry->dropped) + return; + + pgstat_lock_entry(entry_ref, false); + shared_stat_reset_contents(kind, entry_ref->shared_stats, ts); + pgstat_unlock_entry(entry_ref); +} + +/* + * Scan through the shared hashtable of stats, resetting statistics if + * approved by the provided do_reset() function. + */ +void +pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum), + Datum match_data, TimestampTz ts) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* dshash entry is not modified, take shared lock */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStatShared_Common *header; + + if (p->dropped) + continue; + + if (!do_reset(p, match_data)) + continue; + + header = dsa_get_address(pgStatLocal.dsa, p->body); + + LWLockAcquire(&header->lock, LW_EXCLUSIVE); + + shared_stat_reset_contents(p->key.kind, header, ts); + + LWLockRelease(&header->lock); + } + dshash_seq_term(&hstat); +} + +static bool +match_kind(PgStatShared_HashEntry *p, Datum match_data) +{ + return p->key.kind == DatumGetInt32(match_data); +} + +void +pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts) +{ + pgstat_reset_matching_entries(match_kind, Int32GetDatum(kind), ts); +} + +static void +pgstat_setup_memcxt(void) +{ + if (unlikely(!pgStatSharedRefContext)) + pgStatSharedRefContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Shared Ref", + ALLOCSET_SMALL_SIZES); + if (unlikely(!pgStatEntryRefHashContext)) + pgStatEntryRefHashContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Shared Ref Hash", + ALLOCSET_SMALL_SIZES); +} diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index d932bc74e09..d0b85b62a5f 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -18,18 +18,21 @@ #include "postgres.h" #include "utils/pgstat_internal.h" +#include "utils/timestamp.h" -static inline PgStat_MsgSLRU *get_slru_entry(int slru_idx); +static inline PgStat_SLRUStats *get_slru_entry(int slru_idx); +static void pgstat_reset_slru_counter_internal(int index, TimestampTz ts); /* - * SLRU statistics counts waiting to be sent to the collector. 
These are - * stored directly in stats message format so they can be sent without needing - * to copy things around. We assume this variable inits to zeroes. Entries - * are one-to-one with slru_names[]. + * SLRU statistics counts waiting to be flushed out. We assume this variable + * inits to zeroes. Entries are one-to-one with slru_names[]. Changes of + * SLRU counters are reported within critical sections so we use static memory + * in order to avoid memory allocation. */ -static PgStat_MsgSLRU SLRUStats[SLRU_NUM_ELEMENTS]; +static PgStat_SLRUStats pending_SLRUStats[SLRU_NUM_ELEMENTS]; +bool have_slrustats = false; /* @@ -41,17 +44,11 @@ static PgStat_MsgSLRU SLRUStats[SLRU_NUM_ELEMENTS]; void pgstat_reset_slru(const char *name) { - PgStat_MsgResetslrucounter msg; + TimestampTz ts = GetCurrentTimestamp(); AssertArg(name != NULL); - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSLRUCOUNTER); - msg.m_index = pgstat_get_slru_index(name); - - pgstat_send(&msg, sizeof(msg)); + pgstat_reset_slru_counter_internal(pgstat_get_slru_index(name), ts); } /* @@ -61,43 +58,55 @@ pgstat_reset_slru(const char *name) void pgstat_count_slru_page_zeroed(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_zeroed += 1; + get_slru_entry(slru_idx)->blocks_zeroed += 1; } void pgstat_count_slru_page_hit(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_hit += 1; + get_slru_entry(slru_idx)->blocks_hit += 1; } void pgstat_count_slru_page_exists(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_exists += 1; + get_slru_entry(slru_idx)->blocks_exists += 1; } void pgstat_count_slru_page_read(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_read += 1; + get_slru_entry(slru_idx)->blocks_read += 1; } void pgstat_count_slru_page_written(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_written += 1; + get_slru_entry(slru_idx)->blocks_written += 1; } void pgstat_count_slru_flush(int slru_idx) { - get_slru_entry(slru_idx)->m_flush += 1; + get_slru_entry(slru_idx)->flush += 1; } void pgstat_count_slru_truncate(int slru_idx) { - get_slru_entry(slru_idx)->m_truncate += 1; + get_slru_entry(slru_idx)->truncate += 1; +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the slru statistics struct. + */ +PgStat_SLRUStats * +pgstat_fetch_slru(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_SLRU); + + return pgStatLocal.snapshot.slru; } /* @@ -135,45 +144,81 @@ pgstat_get_slru_index(const char *name) } /* - * Send SLRU statistics to the collector + * Flush out locally pending SLRU stats entries + * + * If nowait is true, this function returns false on lock failure. Otherwise + * this function always returns true. Writer processes are mutually excluded + * using LWLock, but readers are expected to use change-count protocol to avoid + * interference with writers. + * + * If nowait is true, this function returns true if the lock could not be + * acquired. Otherwise return false. 
*/ -void -pgstat_send_slru(void) +bool +pgstat_slru_flush(bool nowait) { - /* We assume this initializes to zeroes */ - static const PgStat_MsgSLRU all_zeroes; + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + int i; - for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + if (!have_slrustats) + return false; + + if (!nowait) + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE)) + return true; + + for (i = 0; i < SLRU_NUM_ELEMENTS; i++) { - /* - * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. - */ - if (memcmp(&SLRUStats[i], &all_zeroes, sizeof(PgStat_MsgSLRU)) == 0) - continue; - - /* set the SLRU type before each send */ - SLRUStats[i].m_index = i; - - /* - * Prepare and send the message - */ - pgstat_setheader(&SLRUStats[i].m_hdr, PGSTAT_MTYPE_SLRU); - pgstat_send(&SLRUStats[i], sizeof(PgStat_MsgSLRU)); - - /* - * Clear out the statistics buffer, so it can be re-used. - */ - MemSet(&SLRUStats[i], 0, sizeof(PgStat_MsgSLRU)); + PgStat_SLRUStats *sharedent = &stats_shmem->stats[i]; + PgStat_SLRUStats *pendingent = &pending_SLRUStats[i]; + +#define SLRU_ACC(fld) sharedent->fld += pendingent->fld + SLRU_ACC(blocks_zeroed); + SLRU_ACC(blocks_hit); + SLRU_ACC(blocks_read); + SLRU_ACC(blocks_written); + SLRU_ACC(blocks_exists); + SLRU_ACC(flush); + SLRU_ACC(truncate); +#undef SLRU_ACC } + + /* done, clear the pending entry */ + MemSet(pending_SLRUStats, 0, sizeof(pending_SLRUStats)); + + LWLockRelease(&stats_shmem->lock); + + have_slrustats = false; + + return false; +} + +void +pgstat_slru_reset_all_cb(TimestampTz ts) +{ + for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + pgstat_reset_slru_counter_internal(i, ts); +} + +void +pgstat_slru_snapshot_cb(void) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + + memcpy(pgStatLocal.snapshot.slru, &stats_shmem->stats, + sizeof(stats_shmem->stats)); + + LWLockRelease(&stats_shmem->lock); } /* * Returns pointer to entry with counters for given SLRU (based on the name * stored in SlruCtl as lwlock tranche name). 
*/ -static inline PgStat_MsgSLRU * +static inline PgStat_SLRUStats * get_slru_entry(int slru_idx) { pgstat_assert_is_up(); @@ -186,5 +231,20 @@ Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); - return &SLRUStats[slru_idx]; + have_slrustats = true; + + return &pending_SLRUStats[slru_idx]; +} + +static void +pgstat_reset_slru_counter_internal(int index, TimestampTz ts) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + + memset(&stats_shmem->stats[index], 0, sizeof(PgStat_SLRUStats)); + stats_shmem->stats[index].stat_reset_timestamp = ts; + + LWLockRelease(&stats_shmem->lock); } diff --git a/src/backend/utils/activity/pgstat_subscription.c b/src/backend/utils/activity/pgstat_subscription.c index 689029b30af..e1072bd5bae 100644 --- a/src/backend/utils/activity/pgstat_subscription.c +++ b/src/backend/utils/activity/pgstat_subscription.c @@ -26,12 +26,17 @@ void pgstat_report_subscription_error(Oid subid, bool is_apply_error) { - PgStat_MsgSubscriptionError msg; + PgStat_EntryRef *entry_ref; + PgStat_BackendSubEntry *pending; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_SUBSCRIPTIONERROR); - msg.m_subid = subid; - msg.m_is_apply_error = is_apply_error; - pgstat_send(&msg, sizeof(PgStat_MsgSubscriptionError)); + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_SUBSCRIPTION, + InvalidOid, subid, NULL); + pending = entry_ref->pending; + + if (is_apply_error) + pending->apply_error_count++; + else + pending->sync_error_count++; } /* @@ -54,12 +59,52 @@ void pgstat_drop_subscription(Oid subid) { - PgStat_MsgSubscriptionDrop msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_SUBSCRIPTIONDROP); - msg.m_subid = subid; - pgstat_send(&msg, sizeof(PgStat_MsgSubscriptionDrop)); - pgstat_drop_transactional(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid); } + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one subscription or NULL. + */ +PgStat_StatSubEntry * +pgstat_fetch_stat_subscription(Oid subid) +{ + return (PgStat_StatSubEntry *) + pgstat_fetch_entry(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if the lock could not + * be acquired immediately, otherwise true is returned. + */ +bool +pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStat_BackendSubEntry *localent; + PgStatShared_Subscription *shsubent; + + localent = (PgStat_BackendSubEntry *) entry_ref->pending; + shsubent = (PgStatShared_Subscription *) entry_ref->shared_stats; + + /* localent always has non-zero content */ + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + +#define SUB_ACC(fld) shsubent->stats.fld += localent->fld + SUB_ACC(apply_error_count); + SUB_ACC(sync_error_count); +#undef SUB_ACC + + pgstat_unlock_entry(entry_ref); + return true; +} + +void +pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Subscription *) header)->stats.stat_reset_timestamp = ts; +} diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 8855598f52e..5a878bd1155 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -21,13 +21,7 @@ #include "executor/instrument.h" -/* - * WAL global statistics counters.
Stored directly in a stats message - * structure so they can be sent without needing to copy things around. We - * assume these init to zeroes. - */ -PgStat_MsgWal WalStats; - +PgStat_WalStats PendingWalStats = {0}; /* * WAL usage counters saved from pgWALUsage at the previous call to @@ -39,101 +33,100 @@ static WalUsage prevWalUsage; /* - * Send WAL statistics to the collector. + * Calculate how much WAL usage counters have increased and update + * shared statistics. * - * If 'force' is not set, WAL stats message is only sent if enough time has - * passed since last one was sent to reach PGSTAT_STAT_INTERVAL. + * Must be called by processes that generate WAL, that do not call + * pgstat_report_stat(), like walwriter. */ void pgstat_report_wal(bool force) { - static TimestampTz sendTime = 0; + pgstat_flush_wal(force); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the WAL statistics struct. + */ +PgStat_WalStats * +pgstat_fetch_stat_wal(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_WAL); + + return &pgStatLocal.snapshot.wal; +} + +/* + * Calculate how much WAL usage counters have increased by subtracting the + * previous counters from the current ones. + * + * If nowait is true, this function returns true if the lock could not be + * acquired. Otherwise return false. + */ +bool +pgstat_flush_wal(bool nowait) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + WalUsage diff = {0}; + + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + Assert(pgStatLocal.shmem != NULL && + !pgStatLocal.shmem->is_shutdown); /* - * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. - * - * Check wal_records counter to determine whether any WAL activity has - * happened since last time. Note that other WalUsage counters don't need - * to be checked because they are incremented always together with - * wal_records counter. - * - * m_wal_buffers_full also doesn't need to be checked because it's - * incremented only when at least one WAL record is generated (i.e., - * wal_records counter is incremented). But for safely, we assert that - * m_wal_buffers_full is always zero when no WAL record is generated - * - * This function can be called by a process like walwriter that normally - * generates no WAL records. To determine whether any WAL activity has - * happened at that process since the last time, the numbers of WAL writes - * and syncs are also checked. + * This function can be called even if nothing at all has happened. Avoid + * taking lock for nothing in that case. */ - if (pgWalUsage.wal_records == prevWalUsage.wal_records && - WalStats.m_wal_write == 0 && WalStats.m_wal_sync == 0) - { - Assert(WalStats.m_wal_buffers_full == 0); - return; - } - - if (!force) - { - TimestampTz now = GetCurrentTimestamp(); - - /* - * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL - * msec since we last sent one to avoid overloading the stats - * collector. - */ - if (!TimestampDifferenceExceeds(sendTime, now, PGSTAT_STAT_INTERVAL)) - return; - sendTime = now; - } + if (!pgstat_have_pending_wal()) + return false; /* - * Set the counters related to generated WAL data if the counters were - * updated. + * We don't update the WAL usage portion of the local WalStats elsewhere. + * Calculate how much WAL usage counters were increased by subtracting the + * previous counters from the current ones. 
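The flush keeps a saved copy of the monotonically growing usage counters and only accumulates the delta, exactly the role prevWalUsage plays above. A compact standalone sketch; the type and function names are hypothetical:

#include <stdio.h>

typedef struct Usage
{
    long    records;
    long    bytes;
} Usage;

/* monotonically growing live counters (pgWalUsage in the code above) */
static Usage current;
/* value of the counters at the previous flush (prevWalUsage) */
static Usage previous;

/* accumulate only the growth since the last flush into the shared totals,
 * then advance the saved copy */
static void
flush_usage(Usage *shared_totals)
{
    Usage diff = {
        current.records - previous.records,
        current.bytes - previous.bytes,
    };

    shared_totals->records += diff.records;
    shared_totals->bytes += diff.bytes;

    previous = current;     /* the next flush starts from here */
}

int
main(void)
{
    Usage totals = {0};

    current.records = 10;
    current.bytes = 4096;
    flush_usage(&totals);

    current.records = 15;   /* 5 more records since the last flush */
    current.bytes = 6144;
    flush_usage(&totals);

    printf("records=%ld bytes=%ld\n", totals.records, totals.bytes);
    return 0;
}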
*/ - if (pgWalUsage.wal_records != prevWalUsage.wal_records) - { - WalUsage walusage; - - /* - * Calculate how much WAL usage counters were increased by subtracting - * the previous counters from the current ones. Fill the results in - * WAL stats message. - */ - MemSet(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage); - - WalStats.m_wal_records = walusage.wal_records; - WalStats.m_wal_fpi = walusage.wal_fpi; - WalStats.m_wal_bytes = walusage.wal_bytes; - - /* - * Save the current counters for the subsequent calculation of WAL - * usage. - */ - prevWalUsage = pgWalUsage; - } + WalUsageAccumDiff(&diff, &pgWalUsage, &prevWalUsage); + PendingWalStats.wal_records = diff.wal_records; + PendingWalStats.wal_fpi = diff.wal_fpi; + PendingWalStats.wal_bytes = diff.wal_bytes; + + if (!nowait) + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE)) + return true; + +#define WALSTAT_ACC(fld) stats_shmem->stats.fld += PendingWalStats.fld + WALSTAT_ACC(wal_records); + WALSTAT_ACC(wal_fpi); + WALSTAT_ACC(wal_bytes); + WALSTAT_ACC(wal_buffers_full); + WALSTAT_ACC(wal_write); + WALSTAT_ACC(wal_sync); + WALSTAT_ACC(wal_write_time); + WALSTAT_ACC(wal_sync_time); +#undef WALSTAT_ACC + + LWLockRelease(&stats_shmem->lock); /* - * Prepare and send the message + * Save the current counters for the subsequent calculation of WAL usage. */ - pgstat_setheader(&WalStats.m_hdr, PGSTAT_MTYPE_WAL); - pgstat_send(&WalStats, sizeof(WalStats)); + prevWalUsage = pgWalUsage; /* * Clear out the statistics buffer, so it can be re-used. */ - MemSet(&WalStats, 0, sizeof(WalStats)); + MemSet(&PendingWalStats, 0, sizeof(PendingWalStats)); + + return false; } void pgstat_init_wal(void) { /* - * Initialize prevWalUsage with pgWalUsage so that pgstat_report_wal() can + * Initialize prevWalUsage with pgWalUsage so that pgstat_flush_wal() can * calculate how much pgWalUsage counters are increased by subtracting * prevWalUsage from pgWalUsage. 
*/ @@ -151,6 +144,28 @@ bool pgstat_have_pending_wal(void) { return pgWalUsage.wal_records != prevWalUsage.wal_records || - WalStats.m_wal_write != 0 || - WalStats.m_wal_sync != 0; + PendingWalStats.wal_write != 0 || + PendingWalStats.wal_sync != 0; +} + +void +pgstat_wal_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + memset(&stats_shmem->stats, 0, sizeof(stats_shmem->stats)); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_wal_snapshot_cb(void) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&pgStatLocal.snapshot.wal, &stats_shmem->stats, + sizeof(pgStatLocal.snapshot.wal)); + LWLockRelease(&stats_shmem->lock); } diff --git a/src/backend/utils/activity/pgstat_xact.c b/src/backend/utils/activity/pgstat_xact.c index 3f330873787..230ffa5afce 100644 --- a/src/backend/utils/activity/pgstat_xact.c +++ b/src/backend/utils/activity/pgstat_xact.c @@ -68,6 +68,7 @@ static void AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) { dlist_mutable_iter iter; + int not_freed_count = 0; if (xact_state->pending_drops_count == 0) { @@ -79,6 +80,7 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) { PgStat_PendingDroppedStatsItem *pending = dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; if (isCommit && !pending->is_create) { @@ -86,7 +88,8 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) * Transaction that dropped an object committed. Drop the stats * too. */ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } else if (!isCommit && pending->is_create) { @@ -94,13 +97,17 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) * Transaction that created an object aborted. Drop the stats * associated with the object. */ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } dlist_delete(&pending->node); xact_state->pending_drops_count--; pfree(pending); } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } /* @@ -135,6 +142,7 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, { PgStat_SubXactStatus *parent_xact_state; dlist_mutable_iter iter; + int not_freed_count = 0; if (xact_state->pending_drops_count == 0) return; @@ -145,6 +153,7 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, { PgStat_PendingDroppedStatsItem *pending = dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; dlist_delete(&pending->node); xact_state->pending_drops_count--; @@ -155,7 +164,8 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, * Subtransaction creating a new stats object aborted. Drop the * stats object. 
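The commit/abort matrix implemented above - stats are dropped when a drop commits or a create aborts - fits in a few lines. A simplified single-level model in standalone C; the real code additionally handles subtransactions and reassigns pending items to the parent transaction:

#include <stdbool.h>
#include <stdio.h>

#define MAX_PENDING 32

typedef struct PendingItem
{
    int     objoid;
    bool    is_create;  /* true: created in this xact; false: dropped */
} PendingItem;

static PendingItem pending_items[MAX_PENDING];
static int  n_pending = 0;

static void
record_transactional(int objoid, bool is_create)
{
    pending_items[n_pending].objoid = objoid;
    pending_items[n_pending].is_create = is_create;
    n_pending++;
}

/* at end of transaction: drop stats when a drop committed or a create
 * aborted; everything else keeps its stats */
static void
at_eoxact(bool is_commit)
{
    for (int i = 0; i < n_pending; i++)
    {
        PendingItem *it = &pending_items[i];

        if ((is_commit && !it->is_create) || (!is_commit && it->is_create))
            printf("drop stats for oid %d\n", it->objoid);
    }
    n_pending = 0;
}

int
main(void)
{
    record_transactional(1001, true);   /* object created in this xact */
    record_transactional(2002, false);  /* object dropped in this xact */
    at_eoxact(true);    /* commit: only 2002's stats are dropped */
    return 0;
}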
*/ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; pfree(pending); } else if (isCommit) @@ -175,6 +185,8 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, } Assert(xact_state->pending_drops_count == 0); + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } /* @@ -307,13 +319,21 @@ pgstat_get_transactional_drops(bool isCommit, xl_xact_stats_item **items) void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo) { + int not_freed_count = 0; + if (ndrops == 0) return; for (int i = 0; i < ndrops; i++) { - /* will do work in subsequent commit */ + xl_xact_stats_item *it = &items[i]; + + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } static void @@ -345,6 +365,15 @@ create_drop_transactional_internal(PgStat_Kind kind, Oid dboid, Oid objoid, bool void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) { + if (pgstat_get_entry_ref(kind, dboid, objoid, false, NULL)) + { + ereport(WARNING, + errmsg("resetting existing stats for type %s, db=%d, oid=%d", + (pgstat_get_kind_info(kind))->name, dboid, objoid)); + + pgstat_reset(kind, dboid, objoid); + } + create_drop_transactional_internal(kind, dboid, objoid, /* create */ true); } diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 1c8aba49259..87c15b9c6f3 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -230,9 +230,6 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN: event_name = "LogicalLauncherMain"; break; - case WAIT_EVENT_PGSTAT_MAIN: - event_name = "PgStatMain"; - break; case WAIT_EVENT_RECOVERY_WAL_STREAM: event_name = "RecoveryWalStream"; break; diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index be5470a107c..248d318f866 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -2046,7 +2046,15 @@ pg_stat_get_xact_function_self_time(PG_FUNCTION_ARGS) Datum pg_stat_get_snapshot_timestamp(PG_FUNCTION_ARGS) { - PG_RETURN_TIMESTAMPTZ(pgstat_fetch_global()->stats_timestamp); + bool have_snapshot; + TimestampTz ts; + + ts = pgstat_get_stat_snapshot_timestamp(&have_snapshot); + + if (!have_snapshot) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(ts); } /* Discard the active statistics snapshot */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index a15ce9edb13..1f29670a131 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -73,6 +73,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" +#include "pgstat.h" #include "rewrite/rewriteDefine.h" #include "rewrite/rowsecurity.h" #include "storage/lmgr.h" @@ -2409,6 +2410,9 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) */ RelationCloseSmgr(relation); + /* break mutual link with stats entry */ + pgstat_unlink_relation(relation); + /* * Free all the subsidiary data structures of the relcache entry, then the * entry itself. 
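pgstat_unlink_relation() exists because the relcache entry and the pending stats entry point at each other. Breaking such a mutual link from both sides before destroying either structure is a small, general pattern; the struct names below are illustrative, not the relcache types:

#include <stdio.h>

typedef struct Rel Rel;
typedef struct Stats Stats;

struct Stats
{
    Rel    *relation;       /* like PgStat_TableStatus->relation */
};
struct Rel
{
    Stats  *pgstat_info;    /* like Relation->pgstat_info */
};

/* break both directions of the link so no dangling pointer survives the
 * destruction of either side */
static void
unlink_rel(Rel *rel)
{
    if (rel->pgstat_info == NULL)
        return;
    rel->pgstat_info->relation = NULL;
    rel->pgstat_info = NULL;
}

int
main(void)
{
    Stats   stats = {0};
    Rel     rel = {0};

    rel.pgstat_info = &stats;
    stats.relation = &rel;

    unlink_rel(&rel);   /* now either side can be freed safely */
    printf("%p %p\n", (void *) rel.pgstat_info, (void *) stats.relation);
    return 0;
}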
@@ -2716,8 +2720,9 @@ RelationClearRelation(Relation relation, bool rebuild) SWAPFIELD(RowSecurityDesc *, rd_rsdesc); /* toast OID override must be preserved */ SWAPFIELD(Oid, rd_toastoid); - /* pgstat_info must be preserved */ + /* pgstat_info / enabled must be preserved */ SWAPFIELD(struct PgStat_TableStatus *, pgstat_info); + SWAPFIELD(bool, pgstat_enabled); /* preserve old partition key if we have one */ if (keep_partkey) { diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 3419c099b28..1a5d29ac9ba 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -36,6 +36,7 @@ volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false; volatile sig_atomic_t IdleSessionTimeoutPending = false; volatile sig_atomic_t ProcSignalBarrierPending = false; volatile sig_atomic_t LogMemoryContextPending = false; +volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false; volatile uint32 InterruptHoldoffCount = 0; volatile uint32 QueryCancelHoldoffCount = 0; volatile uint32 CritSectionCount = 0; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index bdc77af7194..0d3cfe8240b 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -288,9 +288,6 @@ GetBackendTypeDesc(BackendType backendType) case B_ARCHIVER: backendDesc = "archiver"; break; - case B_STATS_COLLECTOR: - backendDesc = "stats collector"; - break; case B_LOGGER: backendDesc = "logger"; break; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 342169b1958..a85c2e0260d 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -80,6 +80,7 @@ static void StatementTimeoutHandler(void); static void LockTimeoutHandler(void); static void IdleInTransactionSessionTimeoutHandler(void); static void IdleSessionTimeoutHandler(void); +static void IdleStatsUpdateTimeoutHandler(void); static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); @@ -725,6 +726,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, IdleInTransactionSessionTimeoutHandler); RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler); RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler); + RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT, + IdleStatsUpdateTimeoutHandler); } /* @@ -752,6 +755,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * Use before_shmem_exit() so that ShutdownXLOG() can rely on DSM * segments etc to work (which in turn is required for pgstats). 
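IdleStatsUpdateTimeoutPending and its handler (added just below) follow the classic rule that a signal handler only sets a volatile sig_atomic_t flag and wakes the main loop, which does the real work later. A freestanding POSIX sketch of the same shape, with SIGALRM standing in for the timeout infrastructure:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* the only thing a handler can portably do is set a flag of type
 * volatile sig_atomic_t; servicing happens outside the handler */
static volatile sig_atomic_t idle_timeout_pending = 0;

static void
timeout_handler(int signo)
{
    (void) signo;
    idle_timeout_pending = 1;   /* no work here, just flag it */
}

int
main(void)
{
    signal(SIGALRM, timeout_handler);
    alarm(1);                   /* arm a 1-second timeout */

    while (!idle_timeout_pending)
        pause();                /* sleep until a signal arrives */

    printf("timeout serviced outside the handler\n");
    return 0;
}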
*/ + before_shmem_exit(pgstat_before_server_shutdown, 0); before_shmem_exit(ShutdownXLOG, 0); } @@ -1335,6 +1339,14 @@ IdleSessionTimeoutHandler(void) } static void +IdleStatsUpdateTimeoutHandler(void) +{ + IdleStatsUpdateTimeoutPending = true; + InterruptPending = true; + SetLatch(MyLatch); +} + +static void ClientCheckTimeoutHandler(void) { CheckClientConnectionPending = true; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5538465d7d6..f7758ea4a7b 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -375,6 +375,16 @@ static const struct config_enum_entry track_function_options[] = { StaticAssertDecl(lengthof(track_function_options) == (TRACK_FUNC_ALL + 2), "array length mismatch"); +static const struct config_enum_entry stats_fetch_consistency[] = { + {"none", PGSTAT_FETCH_CONSISTENCY_NONE, false}, + {"cache", PGSTAT_FETCH_CONSISTENCY_CACHE, false}, + {"snapshot", PGSTAT_FETCH_CONSISTENCY_SNAPSHOT, false}, + {NULL, 0, false} +}; + +StaticAssertDecl(lengthof(stats_fetch_consistency) == (PGSTAT_FETCH_CONSISTENCY_SNAPSHOT + 2), + "array length mismatch"); + static const struct config_enum_entry xmlbinary_options[] = { {"base64", XMLBINARY_BASE64, false}, {"hex", XMLBINARY_HEX, false}, @@ -4918,6 +4928,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + + { + {"stats_fetch_consistency", PGC_USERSET, STATS_COLLECTOR, + gettext_noop("Sets the consistency of accesses to statistics data"), + NULL + }, + &pgstat_fetch_consistency, + PGSTAT_FETCH_CONSISTENCY_CACHE, stats_fetch_consistency, + NULL, NULL, NULL + }, + { {"wal_compression", PGC_SUSET, WAL_SETTINGS, gettext_noop("Compresses full-page writes written in WAL file with specified method."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 93d221a37b1..5f9a37bed3b 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -614,6 +614,7 @@ #track_wal_io_timing = off #track_functions = none # none, pl, all #stats_temp_directory = 'pg_stat_tmp' +#stats_fetch_consistency = cache # - Monitoring -
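The config_enum_entry table added above pairs each accepted string with an enum value and ends with a NULL sentinel. A self-contained sketch of the same lookup shape; the enum and helper names here are ours, only the three mode strings come from the patch:

#include <stdio.h>
#include <string.h>

typedef enum
{
    FETCH_CONSISTENCY_NONE,
    FETCH_CONSISTENCY_CACHE,
    FETCH_CONSISTENCY_SNAPSHOT,
} FetchConsistency;

/* NULL-terminated name/value table, in the style of config_enum_entry */
static const struct
{
    const char     *name;
    FetchConsistency val;
} options[] = {
    {"none", FETCH_CONSISTENCY_NONE},
    {"cache", FETCH_CONSISTENCY_CACHE},
    {"snapshot", FETCH_CONSISTENCY_SNAPSHOT},
    {NULL, FETCH_CONSISTENCY_NONE},
};

/* return 1 and set *out on a match; 0 rejects unknown values, as the GUC
 * machinery would */
static int
parse_option(const char *s, FetchConsistency *out)
{
    for (int i = 0; options[i].name; i++)
    {
        if (strcmp(options[i].name, s) == 0)
        {
            *out = options[i].val;
            return 1;
        }
    }
    return 0;
}

int
main(void)
{
    FetchConsistency c;

    if (parse_option("snapshot", &c))
        printf("parsed: %d\n", (int) c);
    return 0;
}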