diff options
author | Andres Freund <andres@anarazel.de> | 2022-04-06 21:29:46 -0700 |
---|---|---|
committer | Andres Freund <andres@anarazel.de> | 2022-04-06 21:29:46 -0700 |
commit | 5891c7a8ed8f2d3d577e7eea34dacff12d7b6bbd (patch) | |
tree | 909f20fa511d5fde6463c58403bb82508c35cfab /src/include | |
parent | be902e26510788c70a874ea54bad753b723d018f (diff) | |
download | postgresql-5891c7a8ed8f2d3d577e7eea34dacff12d7b6bbd.tar.gz postgresql-5891c7a8ed8f2d3d577e7eea34dacff12d7b6bbd.zip |
pgstat: store statistics in shared memory.
Previously the statistics collector received statistics updates via UDP and
shared statistics data by writing them out to temporary files regularly. These
files can reach tens of megabytes and are written out up to twice a
second. This has repeatedly prevented us from adding additional useful
statistics.
Now statistics are stored in shared memory. Statistics for variable-numbered
objects are stored in a dshash hashtable (backed by dynamic shared
memory). Fixed-numbered stats are stored in plain shared memory.
The header for pgstat.c contains an overview of the architecture.
The stats collector is not needed anymore, remove it.
By utilizing the transactional statistics drop infrastructure introduced in a
prior commit statistics entries cannot "leak" anymore. Previously leaked
statistics were dropped by pgstat_vacuum_stat(), called from [auto-]vacuum. On
systems with many small relations pgstat_vacuum_stat() could be quite
expensive.
Now that replicas drop statistics entries for dropped objects, it is not
necessary anymore to reset stats when starting from a cleanly shut down
replica.
Subsequent commits will perform some further code cleanup, adapt docs and add
tests.
Bumps PGSTAT_FILE_FORMAT_ID.
Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-By: Andres Freund <andres@anarazel.de>
Reviewed-By: Thomas Munro <thomas.munro@gmail.com>
Reviewed-By: Justin Pryzby <pryzby@telsasoft.com>
Reviewed-By: "David G. Johnston" <david.g.johnston@gmail.com>
Reviewed-By: Tomas Vondra <tomas.vondra@2ndquadrant.com> (in a much earlier version)
Reviewed-By: Arthur Zakirov <a.zakirov@postgrespro.ru> (in a much earlier version)
Reviewed-By: Antonin Houska <ah@cybertec.at> (in a much earlier version)
Discussion: https://postgr.es/m/20220303021600.hs34ghqcw6zcokdh@alap3.anarazel.de
Discussion: https://postgr.es/m/20220308205351.2xcn6k4x5yivcxyd@alap3.anarazel.de
Discussion: https://postgr.es/m/20210319235115.y3wz7hpnnrshdyv6@alap3.anarazel.de
Diffstat (limited to 'src/include')
-rw-r--r-- | src/include/miscadmin.h | 2 | ||||
-rw-r--r-- | src/include/pgstat.h | 679 | ||||
-rw-r--r-- | src/include/storage/lwlock.h | 3 | ||||
-rw-r--r-- | src/include/utils/pgstat_internal.h | 663 | ||||
-rw-r--r-- | src/include/utils/rel.h | 1 | ||||
-rw-r--r-- | src/include/utils/timeout.h | 1 | ||||
-rw-r--r-- | src/include/utils/wait_event.h | 1 |
7 files changed, 702 insertions, 648 deletions
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9321d7f264b..66c404c666d 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -94,6 +94,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; extern PGDLLIMPORT volatile sig_atomic_t LogMemoryContextPending; +extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost; @@ -333,7 +334,6 @@ typedef enum BackendType B_WAL_SENDER, B_WAL_WRITER, B_ARCHIVER, - B_STATS_COLLECTOR, B_LOGGER, } BackendType; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 99115bacde7..1d2d3de86c9 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -14,10 +14,8 @@ #include "datatype/timestamp.h" #include "portability/instr_time.h" #include "postmaster/pgarch.h" /* for MAX_XFN_CHARS */ -#include "replication/logicalproto.h" #include "utils/backend_progress.h" /* for backward compatibility */ #include "utils/backend_status.h" /* for backward compatibility */ -#include "utils/hsearch.h" #include "utils/relcache.h" #include "utils/wait_event.h" /* for backward compatibility */ @@ -27,8 +25,8 @@ * ---------- */ #define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" -#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" -#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" +#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/pgstat.stat" +#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/pgstat.tmp" /* Default directory to store temporary statistics data in */ #define PG_STAT_TMP_DIR "pg_stat_tmp" @@ -66,6 +64,13 @@ typedef enum TrackFunctionsLevel TRACK_FUNC_ALL } TrackFunctionsLevel; +typedef enum PgStat_FetchConsistency +{ + PGSTAT_FETCH_CONSISTENCY_NONE, + PGSTAT_FETCH_CONSISTENCY_CACHE, + PGSTAT_FETCH_CONSISTENCY_SNAPSHOT, +} PgStat_FetchConsistency; + /* Values to track the cause of session termination */ typedef enum SessionEndType { @@ -92,7 +97,7 @@ typedef int64 PgStat_Counter; * PgStat_FunctionCounts The actual per-function counts kept by a backend * * This struct should contain only actual event counters, because we memcmp - * it against zeroes to detect whether there are any counts to transmit. + * it against zeroes to detect whether there are any pending stats. * * Note that the time counters are in instr_time format here. We convert to * microseconds in PgStat_Counter format when flushing out pending statistics. @@ -106,12 +111,11 @@ typedef struct PgStat_FunctionCounts } PgStat_FunctionCounts; /* ---------- - * PgStat_BackendFunctionEntry Entry in backend's per-function hash table + * PgStat_BackendFunctionEntry Non-flushed function stats. * ---------- */ typedef struct PgStat_BackendFunctionEntry { - Oid f_id; PgStat_FunctionCounts f_counts; } PgStat_BackendFunctionEntry; @@ -132,12 +136,21 @@ typedef struct PgStat_FunctionCallUsage } PgStat_FunctionCallUsage; /* ---------- + * PgStat_BackendSubEntry Non-flushed subscription stats. + * ---------- + */ +typedef struct PgStat_BackendSubEntry +{ + PgStat_Counter apply_error_count; + PgStat_Counter sync_error_count; +} PgStat_BackendSubEntry; + +/* ---------- * PgStat_TableCounts The actual per-table counts kept by a backend * * This struct should contain only actual event counters, because we memcmp - * it against zeroes to detect whether there are any counts to transmit. - * It is a component of PgStat_TableStatus (within-backend state) and - * PgStat_TableEntry (the transmitted message format). + * it against zeroes to detect whether there are any stats updates to apply. + * It is a component of PgStat_TableStatus (within-backend state). * * Note: for a table, tuples_returned is the number of tuples successfully * fetched by heap_getnext, while tuples_fetched is the number of tuples @@ -194,6 +207,7 @@ typedef struct PgStat_TableStatus bool t_shared; /* is it a shared catalog? */ struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ PgStat_TableCounts t_counts; /* event counts to be sent */ + Relation relation; /* rel that is using this entry */ } PgStat_TableStatus; /* ---------- @@ -221,569 +235,14 @@ typedef struct PgStat_TableXactStatus /* ------------------------------------------------------------ - * Message formats follow - * ------------------------------------------------------------ - */ - -/* ---------- - * The types of backend -> collector messages - * ---------- - */ -typedef enum StatMsgType -{ - PGSTAT_MTYPE_DUMMY, - PGSTAT_MTYPE_INQUIRY, - PGSTAT_MTYPE_TABSTAT, - PGSTAT_MTYPE_TABPURGE, - PGSTAT_MTYPE_DROPDB, - PGSTAT_MTYPE_RESETCOUNTER, - PGSTAT_MTYPE_RESETSHAREDCOUNTER, - PGSTAT_MTYPE_RESETSINGLECOUNTER, - PGSTAT_MTYPE_RESETSLRUCOUNTER, - PGSTAT_MTYPE_RESETREPLSLOTCOUNTER, - PGSTAT_MTYPE_RESETSUBCOUNTER, - PGSTAT_MTYPE_AUTOVAC_START, - PGSTAT_MTYPE_VACUUM, - PGSTAT_MTYPE_ANALYZE, - PGSTAT_MTYPE_ARCHIVER, - PGSTAT_MTYPE_BGWRITER, - PGSTAT_MTYPE_CHECKPOINTER, - PGSTAT_MTYPE_WAL, - PGSTAT_MTYPE_SLRU, - PGSTAT_MTYPE_FUNCSTAT, - PGSTAT_MTYPE_FUNCPURGE, - PGSTAT_MTYPE_RECOVERYCONFLICT, - PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK, - PGSTAT_MTYPE_CHECKSUMFAILURE, - PGSTAT_MTYPE_REPLSLOT, - PGSTAT_MTYPE_CONNECT, - PGSTAT_MTYPE_DISCONNECT, - PGSTAT_MTYPE_SUBSCRIPTIONDROP, - PGSTAT_MTYPE_SUBSCRIPTIONERROR, -} StatMsgType; - -/* ---------- - * PgStat_MsgHdr The common message header - * ---------- - */ -typedef struct PgStat_MsgHdr -{ - StatMsgType m_type; - int m_size; -} PgStat_MsgHdr; - -/* ---------- - * Space available in a message. This will keep the UDP packets below 1K, - * which should fit unfragmented into the MTU of the loopback interface. - * (Larger values of PGSTAT_MAX_MSG_SIZE would work for that on most - * platforms, but we're being conservative here.) - * ---------- - */ -#define PGSTAT_MAX_MSG_SIZE 1000 -#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr)) - - -/* ---------- - * PgStat_MsgDummy A dummy message, ignored by the collector - * ---------- - */ -typedef struct PgStat_MsgDummy -{ - PgStat_MsgHdr m_hdr; -} PgStat_MsgDummy; - -/* ---------- - * PgStat_MsgInquiry Sent by a backend to ask the collector - * to write the stats file(s). - * - * Ordinarily, an inquiry message prompts writing of the global stats file, - * the stats file for shared catalogs, and the stats file for the specified - * database. If databaseid is InvalidOid, only the first two are written. - * - * New file(s) will be written only if the existing file has a timestamp - * older than the specified cutoff_time; this prevents duplicated effort - * when multiple requests arrive at nearly the same time, assuming that - * backends send requests with cutoff_times a little bit in the past. - * - * clock_time should be the requestor's current local time; the collector - * uses this to check for the system clock going backward, but it has no - * effect unless that occurs. We assume clock_time >= cutoff_time, though. - * ---------- - */ -typedef struct PgStat_MsgInquiry -{ - PgStat_MsgHdr m_hdr; - TimestampTz clock_time; /* observed local clock time */ - TimestampTz cutoff_time; /* minimum acceptable file timestamp */ - Oid databaseid; /* requested DB (InvalidOid => shared only) */ -} PgStat_MsgInquiry; - -/* ---------- - * PgStat_TableEntry Per-table info in a MsgTabstat - * ---------- - */ -typedef struct PgStat_TableEntry -{ - Oid t_id; - PgStat_TableCounts t_counts; -} PgStat_TableEntry; - -/* ---------- - * PgStat_MsgTabstat Sent by the backend to report table - * and buffer access statistics. - * ---------- - */ -#define PGSTAT_NUM_TABENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 5 * sizeof(PgStat_Counter)) \ - / sizeof(PgStat_TableEntry)) - -typedef struct PgStat_MsgTabstat -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - int m_xact_commit; - int m_xact_rollback; - PgStat_Counter m_block_read_time; /* times in microseconds */ - PgStat_Counter m_block_write_time; - PgStat_Counter m_session_time; - PgStat_Counter m_active_time; - PgStat_Counter m_idle_in_xact_time; - PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; -} PgStat_MsgTabstat; - -/* ---------- - * PgStat_MsgTabpurge Sent by the backend to tell the collector - * about dead tables. - * ---------- - */ -#define PGSTAT_NUM_TABPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) - -typedef struct PgStat_MsgTabpurge -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_tableid[PGSTAT_NUM_TABPURGE]; -} PgStat_MsgTabpurge; - -/* ---------- - * PgStat_MsgDropdb Sent by the backend to tell the collector - * about a dropped database - * ---------- - */ -typedef struct PgStat_MsgDropdb -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgDropdb; - -/* ---------- - * PgStat_MsgResetcounter Sent by the backend to tell the collector - * to reset counters - * ---------- - */ -typedef struct PgStat_MsgResetcounter -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgResetcounter; - -/* ---------- - * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector - * to reset a shared counter - * ---------- - */ -typedef struct PgStat_MsgResetsharedcounter -{ - PgStat_MsgHdr m_hdr; - PgStat_Kind m_resettarget; -} PgStat_MsgResetsharedcounter; - -/* ---------- - * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector - * to reset a single counter - * ---------- - */ -typedef struct PgStat_MsgResetsinglecounter -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - PgStat_Kind m_resettype; - Oid m_objectid; -} PgStat_MsgResetsinglecounter; - -/* ---------- - * PgStat_MsgResetslrucounter Sent by the backend to tell the collector - * to reset a SLRU counter - * ---------- - */ -typedef struct PgStat_MsgResetslrucounter -{ - PgStat_MsgHdr m_hdr; - int m_index; -} PgStat_MsgResetslrucounter; - -/* ---------- - * PgStat_MsgResetreplslotcounter Sent by the backend to tell the collector - * to reset replication slot counter(s) - * ---------- - */ -typedef struct PgStat_MsgResetreplslotcounter -{ - PgStat_MsgHdr m_hdr; - NameData m_slotname; - bool clearall; -} PgStat_MsgResetreplslotcounter; - -/* ---------- - * PgStat_MsgResetsubcounter Sent by the backend to tell the collector - * to reset subscription counter(s) - * ---------- - */ -typedef struct PgStat_MsgResetsubcounter -{ - PgStat_MsgHdr m_hdr; - Oid m_subid; /* InvalidOid means reset all subscription - * stats */ -} PgStat_MsgResetsubcounter; - -/* ---------- - * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal - * that a database is going to be processed - * ---------- - */ -typedef struct PgStat_MsgAutovacStart -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - TimestampTz m_start_time; -} PgStat_MsgAutovacStart; - -/* ---------- - * PgStat_MsgVacuum Sent by the backend or autovacuum daemon - * after VACUUM - * ---------- - */ -typedef struct PgStat_MsgVacuum -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - TimestampTz m_vacuumtime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; -} PgStat_MsgVacuum; - -/* ---------- - * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon - * after ANALYZE - * ---------- - */ -typedef struct PgStat_MsgAnalyze -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - bool m_resetcounter; - TimestampTz m_analyzetime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; -} PgStat_MsgAnalyze; - -/* ---------- - * PgStat_MsgArchiver Sent by the archiver to update statistics. - * ---------- - */ -typedef struct PgStat_MsgArchiver -{ - PgStat_MsgHdr m_hdr; - bool m_failed; /* Failed attempt */ - char m_xlog[MAX_XFN_CHARS + 1]; - TimestampTz m_timestamp; -} PgStat_MsgArchiver; - -/* ---------- - * PgStat_MsgBgWriter Sent by the bgwriter to update statistics. - * ---------- - */ -typedef struct PgStat_MsgBgWriter -{ - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_buf_written_clean; - PgStat_Counter m_maxwritten_clean; - PgStat_Counter m_buf_alloc; -} PgStat_MsgBgWriter; - -/* ---------- - * PgStat_MsgCheckpointer Sent by the checkpointer to update statistics. - * ---------- - */ -typedef struct PgStat_MsgCheckpointer -{ - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_timed_checkpoints; - PgStat_Counter m_requested_checkpoints; - PgStat_Counter m_buf_written_checkpoints; - PgStat_Counter m_buf_written_backend; - PgStat_Counter m_buf_fsync_backend; - PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */ - PgStat_Counter m_checkpoint_sync_time; -} PgStat_MsgCheckpointer; - -/* ---------- - * PgStat_MsgWal Sent by backends and background processes to update WAL statistics. - * ---------- - */ -typedef struct PgStat_MsgWal -{ - PgStat_MsgHdr m_hdr; - PgStat_Counter m_wal_records; - PgStat_Counter m_wal_fpi; - uint64 m_wal_bytes; - PgStat_Counter m_wal_buffers_full; - PgStat_Counter m_wal_write; - PgStat_Counter m_wal_sync; - PgStat_Counter m_wal_write_time; /* time spent writing wal records in - * microseconds */ - PgStat_Counter m_wal_sync_time; /* time spent syncing wal records in - * microseconds */ -} PgStat_MsgWal; - -/* ---------- - * PgStat_MsgSLRU Sent by a backend to update SLRU statistics. - * ---------- - */ -typedef struct PgStat_MsgSLRU -{ - PgStat_MsgHdr m_hdr; - PgStat_Counter m_index; - PgStat_Counter m_blocks_zeroed; - PgStat_Counter m_blocks_hit; - PgStat_Counter m_blocks_read; - PgStat_Counter m_blocks_written; - PgStat_Counter m_blocks_exists; - PgStat_Counter m_flush; - PgStat_Counter m_truncate; -} PgStat_MsgSLRU; - -/* ---------- - * PgStat_MsgReplSlot Sent by a backend or a wal sender to update replication - * slot statistics. - * ---------- - */ -typedef struct PgStat_MsgReplSlot -{ - PgStat_MsgHdr m_hdr; - NameData m_slotname; - bool m_create; - bool m_drop; - PgStat_Counter m_spill_txns; - PgStat_Counter m_spill_count; - PgStat_Counter m_spill_bytes; - PgStat_Counter m_stream_txns; - PgStat_Counter m_stream_count; - PgStat_Counter m_stream_bytes; - PgStat_Counter m_total_txns; - PgStat_Counter m_total_bytes; -} PgStat_MsgReplSlot; - -/* ---------- - * PgStat_MsgSubscriptionDrop Sent by the backend and autovacuum to tell the - * collector about the dead subscription. - * ---------- - */ -typedef struct PgStat_MsgSubscriptionDrop -{ - PgStat_MsgHdr m_hdr; - Oid m_subid; -} PgStat_MsgSubscriptionDrop; - -/* ---------- - * PgStat_MsgSubscriptionError Sent by the apply worker or the table sync - * worker to report an error on the subscription. - * ---------- - */ -typedef struct PgStat_MsgSubscriptionError -{ - PgStat_MsgHdr m_hdr; - - Oid m_subid; - bool m_is_apply_error; -} PgStat_MsgSubscriptionError; - -/* ---------- - * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict - * ---------- - */ -typedef struct PgStat_MsgRecoveryConflict -{ - PgStat_MsgHdr m_hdr; - - Oid m_databaseid; - int m_reason; -} PgStat_MsgRecoveryConflict; - -/* ---------- - * PgStat_MsgTempFile Sent by the backend upon creating a temp file - * ---------- - */ -typedef struct PgStat_MsgTempFile -{ - PgStat_MsgHdr m_hdr; - - Oid m_databaseid; - size_t m_filesize; -} PgStat_MsgTempFile; - -/* ---------- - * PgStat_FunctionEntry Per-function info in a MsgFuncstat - * ---------- - */ -typedef struct PgStat_FunctionEntry -{ - Oid f_id; - PgStat_Counter f_numcalls; - PgStat_Counter f_total_time; /* times in microseconds */ - PgStat_Counter f_self_time; -} PgStat_FunctionEntry; - -/* ---------- - * PgStat_MsgFuncstat Sent by the backend to report function - * usage statistics. - * ---------- - */ -#define PGSTAT_NUM_FUNCENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(PgStat_FunctionEntry)) - -typedef struct PgStat_MsgFuncstat -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES]; -} PgStat_MsgFuncstat; - -/* ---------- - * PgStat_MsgFuncpurge Sent by the backend to tell the collector - * about dead functions. - * ---------- - */ -#define PGSTAT_NUM_FUNCPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) - -typedef struct PgStat_MsgFuncpurge -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_functionid[PGSTAT_NUM_FUNCPURGE]; -} PgStat_MsgFuncpurge; - -/* ---------- - * PgStat_MsgDeadlock Sent by the backend to tell the collector - * about a deadlock that occurred. - * ---------- - */ -typedef struct PgStat_MsgDeadlock -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgDeadlock; - -/* ---------- - * PgStat_MsgChecksumFailure Sent by the backend to tell the collector - * about checksum failures noticed. - * ---------- - */ -typedef struct PgStat_MsgChecksumFailure -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_failurecount; - TimestampTz m_failure_time; -} PgStat_MsgChecksumFailure; - -/* ---------- - * PgStat_MsgConnect Sent by the backend upon connection - * establishment - * ---------- - */ -typedef struct PgStat_MsgConnect -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgConnect; - -/* ---------- - * PgStat_MsgDisconnect Sent by the backend when disconnecting - * ---------- - */ -typedef struct PgStat_MsgDisconnect -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - SessionEndType m_cause; -} PgStat_MsgDisconnect; - -/* ---------- - * PgStat_Msg Union over all possible messages. - * ---------- - */ -typedef union PgStat_Msg -{ - PgStat_MsgHdr msg_hdr; - PgStat_MsgDummy msg_dummy; - PgStat_MsgInquiry msg_inquiry; - PgStat_MsgTabstat msg_tabstat; - PgStat_MsgTabpurge msg_tabpurge; - PgStat_MsgDropdb msg_dropdb; - PgStat_MsgResetcounter msg_resetcounter; - PgStat_MsgResetsharedcounter msg_resetsharedcounter; - PgStat_MsgResetsinglecounter msg_resetsinglecounter; - PgStat_MsgResetslrucounter msg_resetslrucounter; - PgStat_MsgResetreplslotcounter msg_resetreplslotcounter; - PgStat_MsgResetsubcounter msg_resetsubcounter; - PgStat_MsgAutovacStart msg_autovacuum_start; - PgStat_MsgVacuum msg_vacuum; - PgStat_MsgAnalyze msg_analyze; - PgStat_MsgArchiver msg_archiver; - PgStat_MsgBgWriter msg_bgwriter; - PgStat_MsgCheckpointer msg_checkpointer; - PgStat_MsgWal msg_wal; - PgStat_MsgSLRU msg_slru; - PgStat_MsgFuncstat msg_funcstat; - PgStat_MsgFuncpurge msg_funcpurge; - PgStat_MsgRecoveryConflict msg_recoveryconflict; - PgStat_MsgDeadlock msg_deadlock; - PgStat_MsgTempFile msg_tempfile; - PgStat_MsgChecksumFailure msg_checksumfailure; - PgStat_MsgReplSlot msg_replslot; - PgStat_MsgConnect msg_connect; - PgStat_MsgDisconnect msg_disconnect; - PgStat_MsgSubscriptionError msg_subscriptionerror; - PgStat_MsgSubscriptionDrop msg_subscriptiondrop; -} PgStat_Msg; - - -/* ------------------------------------------------------------ - * Statistic collector data structures follow + * Data structures on disk and in shared memory follow * * PGSTAT_FILE_FORMAT_ID should be changed whenever any of these * data structures change. * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA6 +#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA7 typedef struct PgStat_ArchiverStats { @@ -808,7 +267,6 @@ typedef struct PgStat_BgWriterStats typedef struct PgStat_CheckpointerStats { - TimestampTz stats_timestamp; /* time of stats file update */ PgStat_Counter timed_checkpoints; PgStat_Counter requested_checkpoints; PgStat_Counter checkpoint_write_time; /* times in milliseconds */ @@ -820,7 +278,6 @@ typedef struct PgStat_CheckpointerStats typedef struct PgStat_StatDBEntry { - Oid databaseid; PgStat_Counter n_xact_commit; PgStat_Counter n_xact_rollback; PgStat_Counter n_blocks_fetched; @@ -852,34 +309,16 @@ typedef struct PgStat_StatDBEntry PgStat_Counter n_sessions_killed; TimestampTz stat_reset_timestamp; - TimestampTz stats_timestamp; /* time of db stats file update */ - - /* - * tables and functions must be last in the struct, because we don't write - * the pointers out to the stats file. - */ - HTAB *tables; - HTAB *functions; } PgStat_StatDBEntry; typedef struct PgStat_StatFuncEntry { - Oid functionid; - PgStat_Counter f_numcalls; PgStat_Counter f_total_time; /* times in microseconds */ PgStat_Counter f_self_time; } PgStat_StatFuncEntry; -typedef struct PgStat_GlobalStats -{ - TimestampTz stats_timestamp; /* time of stats file update */ - - PgStat_CheckpointerStats checkpointer; - PgStat_BgWriterStats bgwriter; -} PgStat_GlobalStats; - typedef struct PgStat_StatReplSlotEntry { NameData slotname; @@ -908,8 +347,6 @@ typedef struct PgStat_SLRUStats typedef struct PgStat_StatSubEntry { - Oid subid; /* hash key (must be first) */ - PgStat_Counter apply_error_count; PgStat_Counter sync_error_count; TimestampTz stat_reset_timestamp; @@ -917,8 +354,6 @@ typedef struct PgStat_StatSubEntry typedef struct PgStat_StatTabEntry { - Oid tableid; - PgStat_Counter numscans; PgStat_Counter tuples_returned; @@ -966,22 +401,19 @@ typedef struct PgStat_WalStats */ /* functions called from postmaster */ -extern void pgstat_init(void); -extern void pgstat_reset_all(void); -extern int pgstat_start(void); -extern void allow_immediate_pgstat_restart(void); +extern Size StatsShmemSize(void); +extern void StatsShmemInit(void); -#ifdef EXEC_BACKEND -extern void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn(); -#endif +/* Functions called during server startup / shutdown */ +extern void pgstat_restore_stats(void); +extern void pgstat_discard_stats(void); +extern void pgstat_before_server_shutdown(int code, Datum arg); /* Functions for backend initialization */ extern void pgstat_initialize(void); /* Functions called from backends */ -extern void pgstat_report_stat(bool force); -extern void pgstat_vacuum_stat(void); -extern void pgstat_ping(void); +extern long pgstat_report_stat(bool force); extern void pgstat_reset_counters(void); extern void pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objectid); @@ -989,24 +421,17 @@ extern void pgstat_reset_of_kind(PgStat_Kind kind); /* stats accessors */ extern void pgstat_clear_snapshot(void); -extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); -extern PgStat_BgWriterStats *pgstat_fetch_stat_bgwriter(void); -extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void); -extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid); -extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); -extern PgStat_GlobalStats *pgstat_fetch_global(void); -extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname); -extern PgStat_StatSubEntry *pgstat_fetch_stat_subscription(Oid subid); -extern PgStat_SLRUStats *pgstat_fetch_slru(void); -extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); -extern PgStat_WalStats *pgstat_fetch_stat_wal(void); +extern TimestampTz pgstat_get_stat_snapshot_timestamp(bool *have_snapshot); +/* helpers */ +extern PgStat_Kind pgstat_get_kind_from_str(char *kind_str); /* * Functions in pgstat_archiver.c */ extern void pgstat_report_archiver(const char *xlog, bool failed); +extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); /* @@ -1014,6 +439,7 @@ extern void pgstat_report_archiver(const char *xlog, bool failed); */ extern void pgstat_report_bgwriter(void); +extern PgStat_BgWriterStats *pgstat_fetch_stat_bgwriter(void); /* @@ -1021,6 +447,7 @@ extern void pgstat_report_bgwriter(void); */ extern void pgstat_report_checkpointer(void); +extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void); /* @@ -1044,6 +471,7 @@ extern void pgstat_report_connect(Oid dboid); #define pgstat_count_conn_txn_idle_time(n) \ (pgStatTransactionIdleTime += (n)) +extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid); /* * Functions in pgstat_function.c @@ -1058,6 +486,7 @@ extern void pgstat_init_function_usage(struct FunctionCallInfoBaseData *fcinfo, extern void pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize); +extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id); @@ -1070,6 +499,8 @@ extern void pgstat_drop_relation(Relation rel); extern void pgstat_copy_relation_stats(Relation dstrel, Relation srcrel); extern void pgstat_init_relation(Relation rel); +extern void pgstat_assoc_relation(Relation rel); +extern void pgstat_unlink_relation(Relation rel); extern void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples); @@ -1077,8 +508,14 @@ extern void pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter); +/* + * If stats are enabled, but pending data hasn't been prepared yet, call + * pgstat_assoc_relation() to do so. See its comment for why this is done + * separately from pgstat_init_relation(). + */ #define pgstat_should_count_relation(rel) \ - (likely((rel)->pgstat_info != NULL)) + (likely((rel)->pgstat_info != NULL) ? true : \ + ((rel)->pgstat_enabled ? pgstat_assoc_relation(rel), true : false)) /* nontransactional event counts are simple enough to inline */ @@ -1129,6 +566,9 @@ extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info, extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_ext(bool shared, + Oid relid); extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); @@ -1140,7 +580,9 @@ extern void pgstat_reset_replslot(const char *name); struct ReplicationSlot; extern void pgstat_report_replslot(struct ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat); extern void pgstat_create_replslot(struct ReplicationSlot *slot); +extern void pgstat_acquire_replslot(struct ReplicationSlot *slot); extern void pgstat_drop_replslot(struct ReplicationSlot *slot); +extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname); /* @@ -1157,6 +599,7 @@ extern void pgstat_count_slru_flush(int slru_idx); extern void pgstat_count_slru_truncate(int slru_idx); extern const char *pgstat_get_slru_name(int slru_idx); extern int pgstat_get_slru_index(const char *name); +extern PgStat_SLRUStats *pgstat_fetch_slru(void); /* @@ -1166,6 +609,7 @@ extern int pgstat_get_slru_index(const char *name); extern void pgstat_report_subscription_error(Oid subid, bool is_apply_error); extern void pgstat_create_subscription(Oid subid); extern void pgstat_drop_subscription(Oid subid); +extern PgStat_StatSubEntry *pgstat_fetch_stat_subscription(Oid subid); /* @@ -1186,6 +630,7 @@ extern void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_ */ extern void pgstat_report_wal(bool force); +extern PgStat_WalStats *pgstat_fetch_stat_wal(void); /* @@ -1195,6 +640,8 @@ extern void pgstat_report_wal(bool force); /* GUC parameters */ extern PGDLLIMPORT bool pgstat_track_counts; extern PGDLLIMPORT int pgstat_track_functions; +extern PGDLLIMPORT int pgstat_fetch_consistency; + extern char *pgstat_stat_directory; extern char *pgstat_stat_tmpname; extern char *pgstat_stat_filename; @@ -1205,7 +652,7 @@ extern char *pgstat_stat_filename; */ /* updated directly by bgwriter and bufmgr */ -extern PgStat_MsgBgWriter PendingBgWriterStats; +extern PgStat_BgWriterStats PendingBgWriterStats; /* @@ -1216,7 +663,7 @@ extern PgStat_MsgBgWriter PendingBgWriterStats; * Checkpointer statistics counters are updated directly by checkpointer and * bufmgr. */ -extern PgStat_MsgCheckpointer PendingCheckpointerStats; +extern PgStat_CheckpointerStats PendingCheckpointerStats; /* @@ -1243,7 +690,7 @@ extern SessionEndType pgStatSessionEndCause; */ /* updated directly by backends and background processes */ -extern PgStat_MsgWal WalStats; +extern PgStat_WalStats PendingWalStats; #endif /* PGSTAT_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index c3d5889d7b2..33eb4c10339 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -190,6 +190,9 @@ typedef enum BuiltinTrancheIds LWTRANCHE_SHARED_TIDBITMAP, LWTRANCHE_PARALLEL_APPEND, LWTRANCHE_PER_XACT_PREDICATE_LIST, + LWTRANCHE_PGSTATS_DSA, + LWTRANCHE_PGSTATS_HASH, + LWTRANCHE_PGSTATS_DATA, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index c3f83c74c62..ab27bc47c5e 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -14,21 +14,134 @@ #define PGSTAT_INTERNAL_H +#include "common/hashfn.h" +#include "lib/dshash.h" +#include "lib/ilist.h" #include "pgstat.h" +#include "storage/lwlock.h" +#include "utils/dsa.h" -#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file - * updates; in milliseconds. */ +/* + * Types related to shared memory storage of statistics. + * + * Per-object statistics are stored in the "shared stats" hashtable. That + * table's entries (PgStatShared_HashEntry) contain a pointer to the actual stats + * data for the object (the size of the stats data varies depending on the + * kind of stats). The table is keyed by PgStat_HashKey. + * + * Once a backend has a reference to a shared stats entry, it increments the + * entry's refcount. Even after stats data is dropped (e.g., due to a DROP + * TABLE), the entry itself can only be deleted once all references have been + * released. + * + * These refcounts, in combination with a backend local hashtable + * (pgStatEntryRefHash, with entries pointing to PgStat_EntryRef) in front of + * the shared hash table, mean that most stats work can happen without + * touching the shared hash table, reducing contention. + * + * Once there are pending stats updates for a table PgStat_EntryRef->pending + * is allocated to contain a working space for as-of-yet-unapplied stats + * updates. Once the stats are flushed, PgStat_EntryRef->pending is freed. + * + * Each stat kind in the shared hash table has a fixed member + * PgStatShared_Common as the first element. + */ -/* ---------- - * The initial size hints for the hash tables used in the collector. - * ---------- +/* struct for shared statistics hash entry key. */ +typedef struct PgStat_HashKey +{ + PgStat_Kind kind; /* statistics entry kind */ + Oid dboid; /* database ID. InvalidOid for shared objects. */ + Oid objoid; /* object ID, either table or function. */ +} PgStat_HashKey; + +/* + * Shared statistics hash entry. Doesn't itself contain any stats, but points + * to them (with ->body). That allows the stats entries themselves to be of + * variable size. */ -#define PGSTAT_DB_HASH_SIZE 16 -#define PGSTAT_TAB_HASH_SIZE 512 -#define PGSTAT_FUNCTION_HASH_SIZE 512 -#define PGSTAT_SUBSCRIPTION_HASH_SIZE 32 -#define PGSTAT_REPLSLOT_HASH_SIZE 32 +typedef struct PgStatShared_HashEntry +{ + PgStat_HashKey key; /* hash key */ + + /* + * If dropped is set, backends need to release their references so that + * the memory for the entry can be freed. No new references may be made + * once marked as dropped. + */ + bool dropped; + + /* + * Refcount managing lifetime of the entry itself (as opposed to the + * dshash entry pointing to it). The stats lifetime has to be separate + * from the hash table entry lifetime because we allow backends to point + * to a stats entry without holding a hash table lock (and some other + * reasons). + * + * As long as the entry is not dropped, 1 is added to the refcount + * representing that the entry should not be dropped. In addition each + * backend that has a reference to the entry needs to increment the + * refcount as long as it does. + * + * May only be incremented / decremented while holding at least a shared + * lock on the dshash partition containing the entry. It needs to be an + * atomic variable because multiple backends can increment the refcount + * with just a shared lock. + * + * When the refcount reaches 0 the entry needs to be freed. + */ + pg_atomic_uint32 refcount; + + /* + * Pointer to shared stats. The stats entry always starts with + * PgStatShared_Common, embedded in a larger struct containing the + * PgStat_Kind specific stats fields. + */ + dsa_pointer body; +} PgStatShared_HashEntry; + +/* + * Common header struct for PgStatShm_Stat*Entry. + */ +typedef struct PgStatShared_Common +{ + uint32 magic; /* just a validity cross-check */ + /* lock protecting stats contents (i.e. data following the header) */ + LWLock lock; +} PgStatShared_Common; + +/* + * A backend local reference to a shared stats entry. As long as at least one + * such reference exists, the shared stats entry will not be released. + * + * If there are pending stats update to the shared stats, these are stored in + * ->pending. + */ +typedef struct PgStat_EntryRef +{ + /* + * Pointer to the PgStatShared_HashEntry entry in the shared stats + * hashtable. + */ + PgStatShared_HashEntry *shared_entry; + + /* + * Pointer to the stats data (i.e. PgStatShared_HashEntry->body), resolved + * as a local pointer, to avoid repeated dsa_get_address() calls. + */ + PgStatShared_Common *shared_stats; + + /* + * Pending statistics data that will need to be flushed to shared memory + * stats eventually. Each stats kind utilizing pending data defines what + * format its pending data has and needs to provide a + * PgStat_KindInfo->flush_pending_cb callback to merge pending into shared + * stats. + */ + void *pending; + dlist_node pending_node; /* membership in pgStatPending list */ +} PgStat_EntryRef; /* @@ -43,11 +156,11 @@ typedef struct PgStat_SubXactStatus struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */ /* - * Dropping the statistics for objects that dropped transactionally itself - * needs to be transactional. Therefore we collect the stats dropped in - * the current (sub-)transaction and only execute the stats drop when we - * know if the transaction commits/aborts. To handle replicas and crashes, - * stats drops are included in commit records. + * Statistics for transactionally dropped objects need to be + * transactionally dropped as well. Collect the stats dropped in the + * current (sub-)transaction and only execute the stats drop when we know + * if the transaction commits/aborts. To handle replicas and crashes, + * stats drops are included in commit / abort records. */ dlist_head pending_drops; int pending_drops_count; @@ -65,9 +178,95 @@ typedef struct PgStat_SubXactStatus /* + * Metadata for a specific kind of statistics. + */ +typedef struct PgStat_KindInfo +{ + /* + * Do a fixed number of stats objects exist for this kind of stats (e.g. + * bgwriter stats) or not (e.g. tables). + */ + bool fixed_amount:1; + + /* + * Can stats of this kind be accessed from another database? Determines + * whether a stats object gets included in stats snapshots. + */ + bool accessed_across_databases:1; + + /* + * For variable-numbered stats: Identified on-disk using a name, rather + * than PgStat_HashKey. Probably only needed for replication slot stats. + */ + bool named_on_disk:1; + + /* + * The size of an entry in the shared stats hash table (pointed to by + * PgStatShared_HashEntry->body). + */ + uint32 shared_size; + + /* + * The offset/size of statistics inside the shared stats entry. Used when + * [de-]serializing statistics to / from disk respectively. Separate from + * shared_size because [de-]serialization may not include in-memory state + * like lwlocks. + */ + uint32 shared_data_off; + uint32 shared_data_len; + + /* + * The size of the pending data for this kind. E.g. how large + * PgStat_EntryRef->pending is. Used for allocations. + * + * 0 signals that an entry of this kind should never have a pending entry. + */ + uint32 pending_size; + + /* + * For variable-numbered stats: flush pending stats. Required if pending + * data is used. + */ + bool (*flush_pending_cb) (PgStat_EntryRef *sr, bool nowait); + + /* + * For variable-numbered stats: delete pending stats. Optional. + */ + void (*delete_pending_cb) (PgStat_EntryRef *sr); + + /* + * For variable-numbered stats: reset the reset timestamp. Optional. + */ + void (*reset_timestamp_cb) (PgStatShared_Common *header, TimestampTz ts); + + /* + * For variable-numbered stats with named_on_disk. Optional. + */ + void (*to_serialized_name) (const PgStatShared_Common *header, NameData *name); + bool (*from_serialized_name) (const NameData *name, PgStat_HashKey *key); + + /* + * For fixed-numbered statistics: Reset All. + */ + void (*reset_all_cb) (TimestampTz ts); + + /* + * For fixed-numbered statistics: Build snapshot for entry + */ + void (*snapshot_cb) (void); + + /* name of the kind of stats */ + const char *const name; +} PgStat_KindInfo; + + +/* * List of SLRU names that we keep stats for. There is no central registry of * SLRUs, so we use this fixed list instead. The "other" entry is used for * all SLRUs without an explicit entry (e.g. SLRUs in extensions). + * + * This is only defined here so that SLRU_NUM_ELEMENTS is known for later type + * definitions. */ static const char *const slru_names[] = { "CommitTs", @@ -83,33 +282,271 @@ static const char *const slru_names[] = { #define SLRU_NUM_ELEMENTS lengthof(slru_names) +/* ---------- + * Types and definitions for different kinds of fixed-amount stats. + * + * Single-writer stats use the changecount mechanism to achieve low-overhead + * writes - they're obviously more performance critical than reads. Check the + * definition of struct PgBackendStatus for some explanation of the + * changecount mechanism. + * + * Because the obvious implementation of resetting single-writer stats isn't + * compatible with that (another backend needs to write), we don't scribble on + * shared stats while resetting. Instead, just record the current counter + * values in a copy of the stats data, which is protected by ->lock. See + * pgstat_fetch_stat_(archiver|bgwriter|checkpointer) for the reader side. + * + * The only exception to that is the the stat_reset_timestamp in these + * structs, which is protected by ->lock, because it has to be written by + * another backend while resetting + * ---------- + */ + +typedef struct PgStatShared_Archiver +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_ArchiverStats stats; + PgStat_ArchiverStats reset_offset; +} PgStatShared_Archiver; + +typedef struct PgStatShared_BgWriter +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_BgWriterStats stats; + PgStat_BgWriterStats reset_offset; +} PgStatShared_BgWriter; + +typedef struct PgStatShared_Checkpointer +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_CheckpointerStats stats; + PgStat_CheckpointerStats reset_offset; +} PgStatShared_Checkpointer; + +typedef struct PgStatShared_SLRU +{ + /* lock protects ->stats */ + LWLock lock; + PgStat_SLRUStats stats[SLRU_NUM_ELEMENTS]; +} PgStatShared_SLRU; + +typedef struct PgStatShared_Wal +{ + /* lock protects ->stats */ + LWLock lock; + PgStat_WalStats stats; +} PgStatShared_Wal; + + + +/* ---------- + * Types and definitions for different kinds of variable-amount stats. + * + * Each struct has to start with PgStatShared_Common, containing information + * common across the different types of stats. Kind-specific data follows. + * ---------- + */ + +typedef struct PgStatShared_Database +{ + PgStatShared_Common header; + PgStat_StatDBEntry stats; +} PgStatShared_Database; + +typedef struct PgStatShared_Relation +{ + PgStatShared_Common header; + PgStat_StatTabEntry stats; +} PgStatShared_Relation; + +typedef struct PgStatShared_Function +{ + PgStatShared_Common header; + PgStat_StatFuncEntry stats; +} PgStatShared_Function; + +typedef struct PgStatShared_Subscription +{ + PgStatShared_Common header; + PgStat_StatSubEntry stats; +} PgStatShared_Subscription; + +typedef struct PgStatShared_ReplSlot +{ + PgStatShared_Common header; + PgStat_StatReplSlotEntry stats; +} PgStatShared_ReplSlot; + + +/* + * Central shared memory entry for the cumulative stats system. + * + * Fixed amount stats, the dynamic shared memory hash table for + * non-fixed-amount stats, as well as remaining bits and pieces are all + * reached from here. + */ +typedef struct PgStat_ShmemControl +{ + void *raw_dsa_area; + + /* + * Stats for variable-numbered objects are kept in this shared hash table. + * See comment above PgStat_Kind for details. + */ + dshash_table_handle hash_handle; /* shared dbstat hash */ + + /* Has the stats system already been shut down? Just a debugging check. */ + bool is_shutdown; + + /* + * Whenever statistics for dropped objects could not be freed - because + * backends still have references - the dropping backend calls + * pgstat_request_entry_refs_gc() incrementing this counter. Eventually + * that causes backends to run pgstat_gc_entry_refs(), allowing memory to + * be reclaimed. + */ + pg_atomic_uint64 gc_request_count; + + /* + * Stats data for fixed-numbered objects. + */ + PgStatShared_Archiver archiver; + PgStatShared_BgWriter bgwriter; + PgStatShared_Checkpointer checkpointer; + PgStatShared_SLRU slru; + PgStatShared_Wal wal; +} PgStat_ShmemControl; + + +/* + * Cached statistics snapshot + */ +typedef struct PgStat_Snapshot +{ + PgStat_FetchConsistency mode; + + /* time at which snapshot was taken */ + TimestampTz snapshot_timestamp; + + bool fixed_valid[PGSTAT_NUM_KINDS]; + + PgStat_ArchiverStats archiver; + + PgStat_BgWriterStats bgwriter; + + PgStat_CheckpointerStats checkpointer; + + PgStat_SLRUStats slru[SLRU_NUM_ELEMENTS]; + + PgStat_WalStats wal; + + /* to free snapshot in bulk */ + MemoryContext context; + struct pgstat_snapshot_hash *stats; +} PgStat_Snapshot; + + +/* + * Collection of backend-local stats state. + */ +typedef struct PgStat_LocalState +{ + PgStat_ShmemControl *shmem; + dsa_area *dsa; + dshash_table *shared_hash; + + /* the current statistics snapshot */ + PgStat_Snapshot snapshot; +} PgStat_LocalState; + + +/* + * Inline functions defined further below. + */ + +static inline void pgstat_begin_changecount_write(uint32 *cc); +static inline void pgstat_end_changecount_write(uint32 *cc); +static inline uint32 pgstat_begin_changecount_read(uint32 *cc); +static inline bool pgstat_end_changecount_read(uint32 *cc, uint32 cc_before); + +static inline void pgstat_copy_changecounted_stats(void *dst, void *src, size_t len, + uint32 *cc); + +static inline int pgstat_cmp_hash_key(const void *a, const void *b, size_t size, void *arg); +static inline uint32 pgstat_hash_hash_key(const void *d, size_t size, void *arg); +static inline size_t pgstat_get_entry_len(PgStat_Kind kind); +static inline void *pgstat_get_entry_data(PgStat_Kind kind, PgStatShared_Common *entry); + + /* * Functions in pgstat.c */ -extern void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype); -extern void pgstat_send(void *msg, int len); +const PgStat_KindInfo *pgstat_get_kind_info(PgStat_Kind kind); + #ifdef USE_ASSERT_CHECKING extern void pgstat_assert_is_up(void); #else #define pgstat_assert_is_up() ((void)true) #endif +extern void pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref); +extern PgStat_EntryRef *pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry); +extern PgStat_EntryRef *pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid); + +extern void *pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid); +extern void pgstat_snapshot_fixed(PgStat_Kind kind); + + +/* + * Functions in pgstat_archiver.c + */ + +extern void pgstat_archiver_reset_all_cb(TimestampTz ts); +extern void pgstat_archiver_snapshot_cb(void); + + +/* + * Functions in pgstat_bgwriter.c + */ + +extern void pgstat_bgwriter_reset_all_cb(TimestampTz ts); +extern void pgstat_bgwriter_snapshot_cb(void); + + +/* + * Functions in pgstat_checkpointer.c + */ + +extern void pgstat_checkpointer_reset_all_cb(TimestampTz ts); +extern void pgstat_checkpointer_snapshot_cb(void); + /* * Functions in pgstat_database.c */ -extern void AtEOXact_PgStat_Database(bool isCommit, bool parallel); extern void pgstat_report_disconnect(Oid dboid); -extern void pgstat_update_dbstats(PgStat_MsgTabstat *tsmsg, TimestampTz now); +extern void pgstat_update_dbstats(TimestampTz ts); +extern void AtEOXact_PgStat_Database(bool isCommit, bool parallel); + +extern PgStat_StatDBEntry *pgstat_prep_database_pending(Oid dboid); +extern void pgstat_reset_database_timestamp(Oid dboid, TimestampTz ts); +extern bool pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); /* * Functions in pgstat_function.c */ -extern void pgstat_send_funcstats(void); +extern bool pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); /* @@ -120,23 +557,73 @@ extern void AtEOXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isC extern void AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, int nestDepth); extern void AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); extern void PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); -extern void pgstat_send_tabstats(TimestampTz now, bool disconnect); + +extern bool pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref); + + +/* + * Functions in pgstat_replslot.c + */ + +extern void pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); +extern void pgstat_replslot_to_serialized_name_cb(const PgStatShared_Common *tmp, NameData *name); +extern bool pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key); + + +/* + * Functions in pgstat_shmem.c + */ + +extern void pgstat_attach_shmem(void); +extern void pgstat_detach_shmem(void); + +extern PgStat_EntryRef *pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, + bool create, bool *found); +extern bool pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_unlock_entry(PgStat_EntryRef *entry_ref); +extern bool pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid); +extern void pgstat_drop_all_entries(void); +extern PgStat_EntryRef *pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid, + bool nowait); +extern void pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts); +extern void pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts); +extern void pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum), + Datum match_data, + TimestampTz ts); + +extern void pgstat_request_entry_refs_gc(void); +extern PgStatShared_Common *pgstat_init_entry(PgStat_Kind kind, + PgStatShared_HashEntry *shhashent); /* * Functions in pgstat_slru.c */ -extern void pgstat_send_slru(void); +extern bool pgstat_slru_flush(bool nowait); +extern void pgstat_slru_reset_all_cb(TimestampTz ts); +extern void pgstat_slru_snapshot_cb(void); /* * Functions in pgstat_wal.c */ +extern bool pgstat_flush_wal(bool nowait); extern void pgstat_init_wal(void); extern bool pgstat_have_pending_wal(void); +extern void pgstat_wal_reset_all_cb(TimestampTz ts); +extern void pgstat_wal_snapshot_cb(void); + + +/* + * Functions in pgstat_subscription.c + */ + +extern bool pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); /* * Functions in pgstat_xact.c @@ -151,29 +638,145 @@ extern void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) * Variables in pgstat.c */ -extern pgsocket pgStatSock; +extern PgStat_LocalState pgStatLocal; /* - * Variables in pgstat_database.c + * Variables in pgstat_slru.c */ -extern int pgStatXactCommit; -extern int pgStatXactRollback; +extern bool have_slrustats; /* - * Variables in pgstat_functions.c + * Implementation of inline functions declared above. + */ + +/* + * Helpers for changecount manipulation. See comments around struct + * PgBackendStatus for details. */ -extern bool have_function_stats; +static inline void +pgstat_begin_changecount_write(uint32 *cc) +{ + Assert((*cc & 1) == 0); + + START_CRIT_SECTION(); + (*cc)++; + pg_write_barrier(); +} + +static inline void +pgstat_end_changecount_write(uint32 *cc) +{ + Assert((*cc & 1) == 1); + + pg_write_barrier(); + + (*cc)++; + + END_CRIT_SECTION(); +} + +static inline uint32 +pgstat_begin_changecount_read(uint32 *cc) +{ + uint32 before_cc = *cc; + + CHECK_FOR_INTERRUPTS(); + pg_read_barrier(); + + return before_cc; +} /* - * Variables in pgstat_relation.c + * Returns true if the read succeeded, false if it needs to be repeated. */ +static inline bool +pgstat_end_changecount_read(uint32 *cc, uint32 before_cc) +{ + uint32 after_cc; + + pg_read_barrier(); + + after_cc = *cc; + + /* was a write in progress when we started? */ + if (before_cc & 1) + return false; + + /* did writes start and complete while we read? */ + return before_cc == after_cc; +} + + +/* + * helper function for PgStat_KindInfo->snapshot_cb + * PgStat_KindInfo->reset_all_cb callbacks. + * + * Copies out the specified memory area following change-count protocol. + */ +static inline void +pgstat_copy_changecounted_stats(void *dst, void *src, size_t len, + uint32 *cc) +{ + uint32 cc_before; + + do + { + cc_before = pgstat_begin_changecount_read(cc); + + memcpy(dst, src, len); + } + while (!pgstat_end_changecount_read(cc, cc_before)); +} + +/* helpers for dshash / simplehash hashtables */ +static inline int +pgstat_cmp_hash_key(const void *a, const void *b, size_t size, void *arg) +{ + AssertArg(size == sizeof(PgStat_HashKey) && arg == NULL); + return memcmp(a, b, sizeof(PgStat_HashKey)); +} + +static inline uint32 +pgstat_hash_hash_key(const void *d, size_t size, void *arg) +{ + const PgStat_HashKey *key = (PgStat_HashKey *) d; + uint32 hash; + + AssertArg(size == sizeof(PgStat_HashKey) && arg == NULL); + + hash = murmurhash32(key->kind); + hash = hash_combine(hash, murmurhash32(key->dboid)); + hash = hash_combine(hash, murmurhash32(key->objoid)); + + return hash; +} + +/* + * The length of the data portion of a shared memory stats entry (i.e. without + * transient data such as refcounts, lwlocks, ...). + */ +static inline size_t +pgstat_get_entry_len(PgStat_Kind kind) +{ + return pgstat_get_kind_info(kind)->shared_data_len; +} + +/* + * Returns a pointer to the data portion of a shared memory stats entry. + */ +static inline void * +pgstat_get_entry_data(PgStat_Kind kind, PgStatShared_Common *entry) +{ + size_t off = pgstat_get_kind_info(kind)->shared_data_off; -extern bool have_relation_stats; + Assert(off != 0 && off < PG_UINT32_MAX); + return ((char *) (entry)) + off; +} #endif /* PGSTAT_INTERNAL_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 121dbbc9a96..eadbd009045 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -246,6 +246,7 @@ typedef struct RelationData */ Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */ + bool pgstat_enabled; /* should relation stats be counted */ /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ } RelationData; diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h index 099f91c61da..c068986d09a 100644 --- a/src/include/utils/timeout.h +++ b/src/include/utils/timeout.h @@ -32,6 +32,7 @@ typedef enum TimeoutId STANDBY_LOCK_TIMEOUT, IDLE_IN_TRANSACTION_SESSION_TIMEOUT, IDLE_SESSION_TIMEOUT, + IDLE_STATS_UPDATE_TIMEOUT, CLIENT_CONNECTION_CHECK_TIMEOUT, STARTUP_PROGRESS_TIMEOUT, /* First user-definable timeout reason */ diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index d870c592632..b578e2ec757 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -42,7 +42,6 @@ typedef enum WAIT_EVENT_CHECKPOINTER_MAIN, WAIT_EVENT_LOGICAL_APPLY_MAIN, WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, - WAIT_EVENT_PGSTAT_MAIN, WAIT_EVENT_RECOVERY_WAL_STREAM, WAIT_EVENT_SYSLOGGER_MAIN, WAIT_EVENT_WAL_RECEIVER_MAIN, |