/*------------------------------------------------------------------------- * * postmaster.c * This program acts as a clearing house for requests to the * POSTGRES system. Frontend programs connect to the Postmaster, * and postmaster forks a new backend process to handle the * connection. * * The postmaster also manages system-wide operations such as * startup and shutdown. The postmaster itself doesn't do those * operations, mind you --- it just forks off a subprocess to do them * at the right times. It also takes care of resetting the system * if a backend crashes. * * The postmaster process creates the shared memory and semaphore * pools during startup, but as a rule does not touch them itself. * In particular, it is not a member of the PGPROC array of backends * and so it cannot participate in lock-manager operations. Keeping * the postmaster away from shared memory operations makes it simpler * and more reliable. The postmaster is almost always able to recover * from crashes of individual backends by resetting shared memory; * if it did much with shared memory then it would be prone to crashing * along with the backends. * * When a request message is received, we now fork() immediately. * The child process performs authentication of the request, and * then becomes a backend if successful. This allows the auth code * to be written in a simple single-threaded style (as opposed to the * crufty "poor man's multitasking" code that used to be needed). * More importantly, it ensures that blockages in non-multithreaded * libraries like SSL or PAM cannot cause denial of service to other * clients. * * * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * src/backend/postmaster/postmaster.c * * NOTES * * Initialization: * The Postmaster sets up shared memory data structures * for the backends. 
*
 * Synchronization:
 *		The Postmaster shares memory with the backends but should avoid
 *		touching shared memory, so as not to become stuck if a crashing
 *		backend screws up locks or shared memory. Likewise, the Postmaster
 *		should never block on messages from frontend clients.
 *
 * Garbage Collection:
 *		The Postmaster cleans up after backends if they have an emergency
 *		exit and/or core dump.
 *
 * Error Reporting:
 *		Use write_stderr() only for reporting "interactive" errors
 *		(essentially, bogus arguments on the command line). Once the
 *		postmaster is launched, use ereport().
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <signal.h>
#include <time.h>
#include <sys/wait.h>
#include <ctype.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <fcntl.h>
#include <sys/param.h>
#include <netdb.h>
#include <limits.h>

#ifdef USE_BONJOUR
#include <dns_sd.h>
#endif

#ifdef USE_SYSTEMD
#include <systemd/sd-daemon.h>
#endif

#ifdef HAVE_PTHREAD_IS_THREADED_NP
#include <pthread.h>
#endif

#include "access/xlog.h"
#include "access/xlogrecovery.h"
#include "common/file_perm.h"
#include "common/pg_prng.h"
#include "lib/ilist.h"
#include "libpq/libpq.h"
#include "libpq/pqsignal.h"
#include "pg_getopt.h"
#include "pgstat.h"
#include "port/pg_bswap.h"
#include "postmaster/autovacuum.h"
#include "postmaster/bgworker_internals.h"
#include "postmaster/pgarch.h"
#include "postmaster/postmaster.h"
#include "postmaster/syslogger.h"
#include "postmaster/walsummarizer.h"
#include "replication/logicallauncher.h"
#include "replication/slotsync.h"
#include "replication/walsender.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/pmsignal.h"
#include "tcop/backend_startup.h"
#include "tcop/tcopprot.h"
#include "utils/datetime.h"
#include "utils/memutils.h"
#include "utils/pidfile.h"
#include "utils/timestamp.h"
#include "utils/varlena.h"

#ifdef EXEC_BACKEND
#include "common/file_utils.h"
#include "storage/pg_shmem.h"
#endif

/*
 * Possible types of a backend. 
Beyond being the possible bkend_type values in
 * struct bkend, these are OR-able request flag bits for SignalSomeChildren()
 * and CountChildren().
 */
/*
 * Each type is a distinct single bit, so any set of types can be expressed
 * as one integer mask; BACKEND_TYPE_ALL is the mask with every type bit set.
 */
#define BACKEND_TYPE_NORMAL		0x0001	/* normal backend */
#define BACKEND_TYPE_AUTOVAC	0x0002	/* autovacuum worker process */
#define BACKEND_TYPE_WALSND		0x0004	/* walsender process */
#define BACKEND_TYPE_BGWORKER	0x0008	/* bgworker process */
#define BACKEND_TYPE_ALL		0x000F	/* OR of all the above */

/*
 * List of active backends (or child processes anyway; we don't actually
 * know whether a given child has become a backend or is still in the
 * authorization phase). This is used mainly to keep track of how many
 * children we have and send them appropriate signals when necessary.
 *
 * As shown in the above set of backend types, this list includes not only
 * "normal" client sessions, but also autovacuum workers, walsenders, and
 * background workers. (Note that at the time of launch, walsenders are
 * labeled BACKEND_TYPE_NORMAL; we relabel them to BACKEND_TYPE_WALSND
 * upon noticing they've changed their PMChildFlags entry. Hence that check
 * must be done before any operation that needs to distinguish walsenders
 * from normal backends.)
 *
 * Also, "dead_end" children are in it: these are children launched just for
 * the purpose of sending a friendly rejection message to a would-be client.
 * We must track them because they are attached to shared memory, but we know
 * they will never become live backends. dead_end children are not assigned a
 * PMChildSlot. dead_end children have bkend_type NORMAL.
 *
 * "Special" children such as the startup, bgwriter, autovacuum launcher, and
 * slot sync worker tasks are not in this list. They are tracked via StartupPID
 * and other pid_t variables below. (Thus, there can't be more than one of any
 * given "special" child process type. We use BackendList entries for any
 * child process there can be more than one of.) 
*/
typedef struct bkend
{
	pid_t		pid;			/* process id of backend */
	int			child_slot;		/* PMChildSlot for this backend, if any */
	int			bkend_type;		/* child process flavor: one of the
								 * BACKEND_TYPE_* values */
	bool		dead_end;		/* is it going to send an error and quit? */
	RegisteredBgWorker *rw;		/* bgworker info, if this is a bgworker */
	bool		bgworker_notify;	/* gets bgworker start/stop notifications */
	dlist_node	elem;			/* list link in BackendList */
} Backend;

/* List of Backend entries, linked through each entry's "elem" field */
static dlist_head BackendList = DLIST_STATIC_INIT(BackendList);

/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * points at the current process's worker definition when running as a
 * background worker, and stays NULL otherwise -- confirm against bgworker.c.
 */
BackgroundWorker *MyBgworkerEntry = NULL;



/* The socket number we are listening for connections on */
int			PostPortNumber = DEF_PGPORT;

/* The directory names for Unix socket(s) */
char	   *Unix_socket_directories;

/* The TCP listen address(es) */
char	   *ListenAddresses;

/*
 * SuperuserReservedConnections is the number of backends reserved for
 * superuser use, and ReservedConnections is the number of backends reserved
 * for use by roles with privileges of the pg_use_reserved_connections
 * predefined role. These are taken out of the pool of MaxConnections backend
 * slots, so the number of backend slots available for roles that are neither
 * superuser nor have privileges of pg_use_reserved_connections is
 * (MaxConnections - SuperuserReservedConnections - ReservedConnections).
 *
 * If the number of remaining slots is less than or equal to
 * SuperuserReservedConnections, only superusers can make new connections. If
 * the number of remaining slots is greater than SuperuserReservedConnections
 * but less than or equal to
 * (SuperuserReservedConnections + ReservedConnections), only superusers and
 * roles with privileges of pg_use_reserved_connections can make new
 * connections. Note that pre-existing superuser and
 * pg_use_reserved_connections connections don't count against the limits.
 */
int			SuperuserReservedConnections;
int			ReservedConnections;

/* The socket(s) we're listening to. 
*/
/* Capacity of the ListenSockets array (it is allocated with this many slots) */
#define MAXLISTEN	64
/* Number of listen sockets created so far; startup fails if this stays 0 */
static int	NumListenSockets = 0;
/* Array of listen sockets; NULL until PostmasterMain allocates it */
static pgsocket *ListenSockets = NULL;

/* still more option variables */
bool		EnableSSL = false;

int			PreAuthDelay = 0;
int			AuthenticationTimeout = 60;

bool		log_hostname;		/* for ps display and logging */
bool		Log_connections = false;

bool		enable_bonjour = false;
char	   *bonjour_name;
bool		restart_after_crash = true;
bool		remove_temp_files_after_crash = true;
bool		send_abort_for_crash = false;
bool		send_abort_for_kill = false;

/* PIDs of special child processes; 0 when not running */
static pid_t StartupPID = 0,
			BgWriterPID = 0,
			CheckpointerPID = 0,
			WalWriterPID = 0,
			WalReceiverPID = 0,
			WalSummarizerPID = 0,
			AutoVacPID = 0,
			PgArchPID = 0,
			SysLoggerPID = 0,
			SlotSyncWorkerPID = 0;

/* Startup process's status */
typedef enum
{
	STARTUP_NOT_RUNNING,
	STARTUP_RUNNING,
	STARTUP_SIGNALED,			/* we sent it a SIGQUIT or SIGKILL */
	STARTUP_CRASHED,
} StartupStatusEnum;

/* Current status of the startup process; initially "not running" */
static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;

/* Startup/shutdown state */
#define			NoShutdown		0
#define			SmartShutdown	1
#define			FastShutdown	2
#define			ImmediateShutdown	3

/* Holds one of the four shutdown-mode values defined just above */
static int	Shutdown = NoShutdown;

static bool FatalError = false; /* T if recovering from backend crash */

/*
 * We use a simple state machine to control startup, shutdown, and
 * crash recovery (which is rather like shutdown followed by startup).
 *
 * After doing all the postmaster initialization work, we enter PM_STARTUP
 * state and the startup process is launched. The startup process begins by
 * reading the control file and other preliminary initialization steps.
 * In a normal startup, or after crash recovery, the startup process exits
 * with exit code 0 and we switch to PM_RUN state. However, archive recovery
 * is handled specially since it takes much longer and we would like to support
 * hot standby during archive recovery.
 *
 * When the startup process is ready to start archive recovery, it signals the
 * postmaster, and we switch to PM_RECOVERY state. 
The background writer and * checkpointer are launched, while the startup process continues applying WAL. * If Hot Standby is enabled, then, after reaching a consistent point in WAL * redo, startup process signals us again, and we switch to PM_HOT_STANDBY * state and begin accepting connections to perform read-only queries. When * archive recovery is finished, the startup process exits with exit code 0 * and we switch to PM_RUN state. * * Normal child backends can only be launched when we are in PM_RUN or * PM_HOT_STANDBY state. (connsAllowed can also restrict launching.) * In other states we handle connection requests by launching "dead_end" * child processes, which will simply send the client an error message and * quit. (We track these in the BackendList so that we can know when they * are all gone; this is important because they're still connected to shared * memory, and would interfere with an attempt to destroy the shmem segment, * possibly leading to SHMALL failure when we try to make a new one.) * In PM_WAIT_DEAD_END state we are waiting for all the dead_end children * to drain out of the system, and therefore stop accepting connection * requests at all until the last existing child has quit (which hopefully * will not be very long). * * Notice that this state variable does not distinguish *why* we entered * states later than PM_RUN --- Shutdown and FatalError must be consulted * to find that out. FatalError is never true in PM_RECOVERY, PM_HOT_STANDBY, * or PM_RUN states, nor in PM_SHUTDOWN states (because we don't enter those * states when trying to recover from a crash). It can be true in PM_STARTUP * state, because we don't clear it until we've successfully started WAL redo. 
*/
typedef enum
{
	PM_INIT,					/* postmaster starting */
	PM_STARTUP,					/* waiting for startup subprocess */
	PM_RECOVERY,				/* in archive recovery mode */
	PM_HOT_STANDBY,				/* in hot standby mode */
	PM_RUN,						/* normal "database is alive" state */
	PM_STOP_BACKENDS,			/* need to stop remaining backends */
	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
	PM_SHUTDOWN,				/* waiting for checkpointer to do shutdown
								 * ckpt */
	PM_SHUTDOWN_2,				/* waiting for archiver and walsenders to
								 * finish */
	PM_WAIT_DEAD_END,			/* waiting for dead_end children to exit */
	PM_NO_CHILDREN,				/* all important children have exited */
} PMState;

/* Current state of the postmaster state machine; begins in PM_INIT */
static PMState pmState = PM_INIT;

/*
 * While performing a "smart shutdown", we restrict new connections but stay
 * in PM_RUN or PM_HOT_STANDBY state until all the client backends are gone.
 * connsAllowed is a sub-state indicator showing the active restriction.
 * It is of no interest unless pmState is PM_RUN or PM_HOT_STANDBY.
 */
static bool connsAllowed = true;

/* Start time of SIGKILL timeout during immediate shutdown or child crash */
/* Zero means timeout is not running */
static time_t AbortStartTime = 0;

/* Length of said timeout, in seconds */
#define SIGKILL_CHILDREN_AFTER_SECS		5

static bool ReachedNormalRunning = false;	/* T if we've reached PM_RUN */

bool		ClientAuthInProgress = false;	/* T during new-client
											 * authentication */

bool		redirection_done = false;	/* stderr redirected for syslogger? 
*/

/* received START_AUTOVAC_LAUNCHER signal */
static bool start_autovac_launcher = false;

/* the launcher needs to be signaled to communicate some condition */
static bool avlauncher_needs_signal = false;

/* received START_WALRECEIVER signal */
static bool WalReceiverRequested = false;

/* set when there's a worker that needs to be started up */
static bool StartWorkerNeeded = true;
static bool HaveCrashedWorker = false;

/*
 * set when signals arrive
 *
 * One flag per kind of pending request.
 * NOTE(review): presumably set by the handle_pm_*_signal handlers and
 * consumed by the matching process_pm_* functions declared below -- the
 * definitions are outside the visible part of this file; confirm.
 */
static volatile sig_atomic_t pending_pm_pmsignal;
static volatile sig_atomic_t pending_pm_child_exit;
static volatile sig_atomic_t pending_pm_reload_request;
static volatile sig_atomic_t pending_pm_shutdown_request;
static volatile sig_atomic_t pending_pm_fast_shutdown_request;
static volatile sig_atomic_t pending_pm_immediate_shutdown_request;

/* event multiplexing object */
static WaitEventSet *pm_wait_set;

#ifdef USE_SSL
/* Set when and if SSL has been initialized properly */
bool		LoadedSSL = false;
#endif

#ifdef USE_BONJOUR
/* Service reference filled in by DNSServiceRegister(); NULL if unregistered */
static DNSServiceRef bonjour_sdref = NULL;
#endif

/*
 * postmaster.c - function prototypes
 */
static void CloseServerPorts(int status, Datum arg);
static void unlink_external_pid_file(int status, Datum arg);
static void getInstallationPaths(const char *argv0);
static void checkControlFile(void);
static void handle_pm_pmsignal_signal(SIGNAL_ARGS);
static void handle_pm_child_exit_signal(SIGNAL_ARGS);
static void handle_pm_reload_request_signal(SIGNAL_ARGS);
static void handle_pm_shutdown_request_signal(SIGNAL_ARGS);
static void process_pm_pmsignal(void);
static void process_pm_child_exit(void);
static void process_pm_reload_request(void);
static void process_pm_shutdown_request(void);
static void dummy_handler(SIGNAL_ARGS);
static void CleanupBackend(Backend *bp, int exitstatus);
static void HandleChildCrash(int pid, int exitstatus, const char *procname);
static void LogChildExit(int lev, const char *procname,
						 int pid, int exitstatus);
static void PostmasterStateMachine(void);

static void ExitPostmaster(int status) pg_attribute_noreturn();
static int	ServerLoop(void);
static int	BackendStartup(ClientSocket *client_sock);
static void report_fork_failure_to_client(ClientSocket *client_sock, int errnum);
static CAC_state canAcceptConnections(int backend_type);
static void signal_child(pid_t pid, int signal);
static void sigquit_child(pid_t pid);
static bool SignalSomeChildren(int signal, int target);
static void TerminateChildren(int signal);

/* Signal children of every backend type (target mask = BACKEND_TYPE_ALL) */
#define SignalChildren(sig)			   SignalSomeChildren(sig, BACKEND_TYPE_ALL)

static int	CountChildren(int target);
static Backend *assign_backendlist_entry(void);
static void LaunchMissingBackgroundProcesses(void);
static void maybe_start_bgworkers(void);
static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
static pid_t StartChildProcess(BackendType type);
static void StartAutovacuumWorker(void);
static void InitPostmasterDeathWatchHandle(void);

#ifdef WIN32
#define WNOHANG 0				/* ignored, so any integer value will do */

/* Windows-only waitpid() replacement declared for use within this file */
static pid_t waitpid(pid_t pid, int *exitstatus, int options);
static void WINAPI pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired);

/* I/O completion port used to deliver the list of dead children */
static HANDLE win32ChildQueue;

/* Per-child bookkeeping: a wait handle, the process handle, and its id */
typedef struct
{
	HANDLE		waitHandle;
	HANDLE		procHandle;
	DWORD		procId;
} win32_deadchild_waitinfo;
#endif							/* WIN32 */

/*
 * Macros to check exit status of a child process
 *
 * EXIT_STATUS_0 tests that the raw wait status is exactly zero;
 * EXIT_STATUS_1 and EXIT_STATUS_3 test for a normal exit (WIFEXITED)
 * with exit code 1 or 3 respectively.
 */
#define EXIT_STATUS_0(st)  ((st) == 0)
#define EXIT_STATUS_1(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 1)
#define EXIT_STATUS_3(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 3)

#ifndef WIN32
/*
 * File descriptors for pipe used to monitor if postmaster is alive.
 * First is POSTMASTER_FD_WATCH, second is POSTMASTER_FD_OWN. 
*/ int postmaster_alive_fds[2] = {-1, -1}; #else /* Process handle of postmaster used for the same purpose on Windows */ HANDLE PostmasterHandle; #endif /* * Postmaster main entry point */ void PostmasterMain(int argc, char *argv[]) { int opt; int status; char *userDoption = NULL; bool listen_addr_saved = false; char *output_config_variable = NULL; InitProcessGlobals(); PostmasterPid = MyProcPid; IsPostmasterEnvironment = true; /* * Start our win32 signal implementation */ #ifdef WIN32 pgwin32_signal_initialize(); #endif /* * We should not be creating any files or directories before we check the * data directory (see checkDataDir()), but just in case set the umask to * the most restrictive (owner-only) permissions. * * checkDataDir() will reset the umask based on the data directory * permissions. */ umask(PG_MODE_MASK_OWNER); /* * By default, palloc() requests in the postmaster will be allocated in * the PostmasterContext, which is space that can be recycled by backends. * Allocated data that needs to be available to backends should be * allocated in TopMemoryContext. */ PostmasterContext = AllocSetContextCreate(TopMemoryContext, "Postmaster", ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(PostmasterContext); /* Initialize paths to installation files */ getInstallationPaths(argv[0]); /* * Set up signal handlers for the postmaster process. * * CAUTION: when changing this list, check for side-effects on the signal * handling setup of child processes. See tcop/postgres.c, * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/syslogger.c, * postmaster/bgworker.c and postmaster/checkpointer.c. 
*/ pqinitmask(); sigprocmask(SIG_SETMASK, &BlockSig, NULL); pqsignal(SIGHUP, handle_pm_reload_request_signal); pqsignal(SIGINT, handle_pm_shutdown_request_signal); pqsignal(SIGQUIT, handle_pm_shutdown_request_signal); pqsignal(SIGTERM, handle_pm_shutdown_request_signal); pqsignal(SIGALRM, SIG_IGN); /* ignored */ pqsignal(SIGPIPE, SIG_IGN); /* ignored */ pqsignal(SIGUSR1, handle_pm_pmsignal_signal); pqsignal(SIGUSR2, dummy_handler); /* unused, reserve for children */ pqsignal(SIGCHLD, handle_pm_child_exit_signal); /* This may configure SIGURG, depending on platform. */ InitializeLatchSupport(); InitProcessLocalLatch(); /* * No other place in Postgres should touch SIGTTIN/SIGTTOU handling. We * ignore those signals in a postmaster environment, so that there is no * risk of a child process freezing up due to writing to stderr. But for * a standalone backend, their default handling is reasonable. Hence, all * child processes should just allow the inherited settings to stand. */ #ifdef SIGTTIN pqsignal(SIGTTIN, SIG_IGN); /* ignored */ #endif #ifdef SIGTTOU pqsignal(SIGTTOU, SIG_IGN); /* ignored */ #endif /* ignore SIGXFSZ, so that ulimit violations work like disk full */ #ifdef SIGXFSZ pqsignal(SIGXFSZ, SIG_IGN); /* ignored */ #endif /* Begin accepting signals. */ sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); /* * Options setup */ InitializeGUCOptions(); opterr = 1; /* * Parse command-line options. CAUTION: keep this in sync with * tcop/postgres.c (the option sets should not conflict) and with the * common help() function in main/main.c. 
*/ while ((opt = getopt(argc, argv, "B:bC:c:D:d:EeFf:h:ijk:lN:OPp:r:S:sTt:W:-:")) != -1) { switch (opt) { case 'B': SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; case 'b': /* Undocumented flag used for binary upgrades */ IsBinaryUpgrade = true; break; case 'C': output_config_variable = strdup(optarg); break; case 'c': case '-': { char *name, *value; ParseLongOption(optarg, &name, &value); if (!value) { if (opt == '-') ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("--%s requires a value", optarg))); else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("-c %s requires a value", optarg))); } SetConfigOption(name, value, PGC_POSTMASTER, PGC_S_ARGV); pfree(name); pfree(value); break; } case 'D': userDoption = strdup(optarg); break; case 'd': set_debug_options(atoi(optarg), PGC_POSTMASTER, PGC_S_ARGV); break; case 'E': SetConfigOption("log_statement", "all", PGC_POSTMASTER, PGC_S_ARGV); break; case 'e': SetConfigOption("datestyle", "euro", PGC_POSTMASTER, PGC_S_ARGV); break; case 'F': SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV); break; case 'f': if (!set_plan_disabling_options(optarg, PGC_POSTMASTER, PGC_S_ARGV)) { write_stderr("%s: invalid argument for option -f: \"%s\"\n", progname, optarg); ExitPostmaster(1); } break; case 'h': SetConfigOption("listen_addresses", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; case 'i': SetConfigOption("listen_addresses", "*", PGC_POSTMASTER, PGC_S_ARGV); break; case 'j': /* only used by interactive backend */ break; case 'k': SetConfigOption("unix_socket_directories", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; case 'l': SetConfigOption("ssl", "true", PGC_POSTMASTER, PGC_S_ARGV); break; case 'N': SetConfigOption("max_connections", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; case 'O': SetConfigOption("allow_system_table_mods", "true", PGC_POSTMASTER, PGC_S_ARGV); break; case 'P': SetConfigOption("ignore_system_indexes", "true", PGC_POSTMASTER, PGC_S_ARGV); break; case 'p': 
SetConfigOption("port", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; case 'r': /* only used by single-user backend */ break; case 'S': SetConfigOption("work_mem", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; case 's': SetConfigOption("log_statement_stats", "true", PGC_POSTMASTER, PGC_S_ARGV); break; case 'T': /* * This option used to be defined as sending SIGSTOP after a * backend crash, but sending SIGABRT seems more useful. */ SetConfigOption("send_abort_for_crash", "true", PGC_POSTMASTER, PGC_S_ARGV); break; case 't': { const char *tmp = get_stats_option_name(optarg); if (tmp) { SetConfigOption(tmp, "true", PGC_POSTMASTER, PGC_S_ARGV); } else { write_stderr("%s: invalid argument for option -t: \"%s\"\n", progname, optarg); ExitPostmaster(1); } break; } case 'W': SetConfigOption("post_auth_delay", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; default: write_stderr("Try \"%s --help\" for more information.\n", progname); ExitPostmaster(1); } } /* * Postmaster accepts no non-option switch arguments. */ if (optind < argc) { write_stderr("%s: invalid argument: \"%s\"\n", progname, argv[optind]); write_stderr("Try \"%s --help\" for more information.\n", progname); ExitPostmaster(1); } /* * Locate the proper configuration files and data directory, and read * postgresql.conf for the first time. */ if (!SelectConfigFiles(userDoption, progname)) ExitPostmaster(2); if (output_config_variable != NULL) { /* * If this is a runtime-computed GUC, it hasn't yet been initialized, * and the present value is not useful. However, this is a convenient * place to print the value for most GUCs because it is safe to run * postmaster startup to this point even if the server is already * running. For the handful of runtime-computed GUCs that we cannot * provide meaningful values for yet, we wait until later in * postmaster startup to print the value. We won't be able to use -C * on running servers for those GUCs, but using this option now would * lead to incorrect results for them. 
*/ int flags = GetConfigOptionFlags(output_config_variable, true); if ((flags & GUC_RUNTIME_COMPUTED) == 0) { /* * "-C guc" was specified, so print GUC's value and exit. No * extra permission check is needed because the user is reading * inside the data dir. */ const char *config_val = GetConfigOption(output_config_variable, false, false); puts(config_val ? config_val : ""); ExitPostmaster(0); } /* * A runtime-computed GUC will be printed later on. As we initialize * a server startup sequence, silence any log messages that may show * up in the output generated. FATAL and more severe messages are * useful to show, even if one would only expect at least PANIC. LOG * entries are hidden. */ SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, PGC_S_OVERRIDE); } /* Verify that DataDir looks reasonable */ checkDataDir(); /* Check that pg_control exists */ checkControlFile(); /* And switch working directory into it */ ChangeToDataDir(); /* * Check for invalid combinations of GUC settings. */ if (SuperuserReservedConnections + ReservedConnections >= MaxConnections) { write_stderr("%s: \"superuser_reserved_connections\" (%d) plus \"reserved_connections\" (%d) must be less than \"max_connections\" (%d)\n", progname, SuperuserReservedConnections, ReservedConnections, MaxConnections); ExitPostmaster(1); } if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level == WAL_LEVEL_MINIMAL) ereport(ERROR, (errmsg("WAL archival cannot be enabled when \"wal_level\" is \"minimal\""))); if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL) ereport(ERROR, (errmsg("WAL streaming (\"max_wal_senders\" > 0) requires \"wal_level\" to be \"replica\" or \"logical\""))); if (summarize_wal && wal_level == WAL_LEVEL_MINIMAL) ereport(ERROR, (errmsg("WAL cannot be summarized when \"wal_level\" is \"minimal\""))); /* * Other one-time internal sanity checks can go here, if they are fast. * (Put any slow processing further down, after postmaster.pid creation.) 
*/ if (!CheckDateTokenTables()) { write_stderr("%s: invalid datetoken tables, please fix\n", progname); ExitPostmaster(1); } /* * Now that we are done processing the postmaster arguments, reset * getopt(3) library so that it will work correctly in subprocesses. */ optind = 1; #ifdef HAVE_INT_OPTRESET optreset = 1; /* some systems need this too */ #endif /* For debugging: display postmaster environment */ { extern char **environ; char **p; ereport(DEBUG3, (errmsg_internal("%s: PostmasterMain: initial environment dump:", progname))); ereport(DEBUG3, (errmsg_internal("-----------------------------------------"))); for (p = environ; *p; ++p) ereport(DEBUG3, (errmsg_internal("\t%s", *p))); ereport(DEBUG3, (errmsg_internal("-----------------------------------------"))); } /* * Create lockfile for data directory. * * We want to do this before we try to grab the input sockets, because the * data directory interlock is more reliable than the socket-file * interlock (thanks to whoever decided to put socket files in /tmp :-(). * For the same reason, it's best to grab the TCP socket(s) before the * Unix socket(s). * * Also note that this internally sets up the on_proc_exit function that * is responsible for removing both data directory and socket lockfiles; * so it must happen before opening sockets so that at exit, the socket * lockfiles go away after CloseServerPorts runs. */ CreateDataDirLockFile(true); /* * Read the control file (for error checking and config info). * * Since we verify the control file's CRC, this has a useful side effect * on machines where we need a run-time test for CRC support instructions. * The postmaster will do the test once at startup, and then its child * processes will inherit the correct function pointer and not need to * repeat the test. */ LocalProcessControlFile(false); /* * Register the apply launcher. It's probably a good idea to call this * before any modules had a chance to take the background worker slots. 
*/ ApplyLauncherRegister(); /* * process any libraries that should be preloaded at postmaster start */ process_shared_preload_libraries(); /* * Initialize SSL library, if specified. */ #ifdef USE_SSL if (EnableSSL) { (void) secure_initialize(true); LoadedSSL = true; } #endif /* * Now that loadable modules have had their chance to alter any GUCs, * calculate MaxBackends. */ InitializeMaxBackends(); /* * Calculate the size of the PGPROC fast-path lock arrays. */ InitializeFastPathLocks(); /* * Give preloaded libraries a chance to request additional shared memory. */ process_shmem_requests(); /* * Now that loadable modules have had their chance to request additional * shared memory, determine the value of any runtime-computed GUCs that * depend on the amount of shared memory required. */ InitializeShmemGUCs(); /* * Now that modules have been loaded, we can process any custom resource * managers specified in the wal_consistency_checking GUC. */ InitializeWalConsistencyChecking(); /* * If -C was specified with a runtime-computed GUC, we held off printing * the value earlier, as the GUC was not yet initialized. We handle -C * for most GUCs before we lock the data directory so that the option may * be used on a running server. However, a handful of GUCs are runtime- * computed and do not have meaningful values until after locking the data * directory, and we cannot safely calculate their values earlier on a * running server. At this point, such GUCs should be properly * initialized, and we haven't yet set up shared memory, so this is a good * time to handle the -C option for these special GUCs. */ if (output_config_variable != NULL) { const char *config_val = GetConfigOption(output_config_variable, false, false); puts(config_val ? config_val : ""); ExitPostmaster(0); } /* * Set up shared memory and semaphores. * * Note: if using SysV shmem and/or semas, each postmaster startup will * normally choose the same IPC keys. 
This helps ensure that we will * clean up dead IPC objects if the postmaster crashes and is restarted. */ CreateSharedMemoryAndSemaphores(); /* * Estimate number of openable files. This must happen after setting up * semaphores, because on some platforms semaphores count as open files. */ set_max_safe_fds(); /* * Set reference point for stack-depth checking. */ (void) set_stack_base(); /* * Initialize pipe (or process handle on Windows) that allows children to * wake up from sleep on postmaster death. */ InitPostmasterDeathWatchHandle(); #ifdef WIN32 /* * Initialize I/O completion port used to deliver list of dead children. */ win32ChildQueue = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 1); if (win32ChildQueue == NULL) ereport(FATAL, (errmsg("could not create I/O completion port for child queue"))); #endif #ifdef EXEC_BACKEND /* Write out nondefault GUC settings for child processes to use */ write_nondefault_variables(PGC_POSTMASTER); /* * Clean out the temp directory used to transmit parameters to child * processes (see internal_forkexec). We must do this before launching * any child processes, else we have a race condition: we could remove a * parameter file before the child can read it. It should be safe to do * so now, because we verified earlier that there are no conflicting * Postgres processes in this data directory. */ RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false); #endif /* * Forcibly remove the files signaling a standby promotion request. * Otherwise, the existence of those files triggers a promotion too early, * whether a user wants that or not. * * This removal of files is usually unnecessary because they can exist * only during a few moments during a standby promotion. However there is * a race condition: if pg_ctl promote is executed and creates the files * during a promotion, the files can stay around even after the server is * brought up to be the primary. 
Then, if a new standby starts by using * the backup taken from the new primary, the files can exist at server * startup and must be removed in order to avoid an unexpected promotion. * * Note that promotion signal files need to be removed before the startup * process is invoked. Because, after that, they can be used by * postmaster's SIGUSR1 signal handler. */ RemovePromoteSignalFiles(); /* Do the same for logrotate signal file */ RemoveLogrotateSignalFiles(); /* Remove any outdated file holding the current log filenames. */ if (unlink(LOG_METAINFO_DATAFILE) < 0 && errno != ENOENT) ereport(LOG, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", LOG_METAINFO_DATAFILE))); /* * If enabled, start up syslogger collection subprocess */ SysLoggerPID = SysLogger_Start(); /* * Reset whereToSendOutput from DestDebug (its starting state) to * DestNone. This stops ereport from sending log messages to stderr unless * Log_destination permits. We don't do this until the postmaster is * fully launched, since startup failures may as well be reported to * stderr. * * If we are in fact disabling logging to stderr, first emit a log message * saying so, to provide a breadcrumb trail for users who may not remember * that their logging is configured to go somewhere else. */ if (!(Log_destination & LOG_DESTINATION_STDERR)) ereport(LOG, (errmsg("ending log output to stderr"), errhint("Future log output will go to log destination \"%s\".", Log_destination_string))); whereToSendOutput = DestNone; /* * Report server startup in log. While we could emit this much earlier, * it seems best to do so after starting the log collector, if we intend * to use one. */ ereport(LOG, (errmsg("starting %s", PG_VERSION_STR))); /* * Establish input sockets. * * First set up an on_proc_exit function that's charged with closing the * sockets again at postmaster shutdown. 
*/ ListenSockets = palloc(MAXLISTEN * sizeof(pgsocket)); on_proc_exit(CloseServerPorts, 0); if (ListenAddresses) { char *rawstring; List *elemlist; ListCell *l; int success = 0; /* Need a modifiable copy of ListenAddresses */ rawstring = pstrdup(ListenAddresses); /* Parse string into list of hostnames */ if (!SplitGUCList(rawstring, ',', &elemlist)) { /* syntax error in list */ ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid list syntax in parameter \"%s\"", "listen_addresses"))); } foreach(l, elemlist) { char *curhost = (char *) lfirst(l); if (strcmp(curhost, "*") == 0) status = ListenServerPort(AF_UNSPEC, NULL, (unsigned short) PostPortNumber, NULL, ListenSockets, &NumListenSockets, MAXLISTEN); else status = ListenServerPort(AF_UNSPEC, curhost, (unsigned short) PostPortNumber, NULL, ListenSockets, &NumListenSockets, MAXLISTEN); if (status == STATUS_OK) { success++; /* record the first successful host addr in lockfile */ if (!listen_addr_saved) { AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, curhost); listen_addr_saved = true; } } else ereport(WARNING, (errmsg("could not create listen socket for \"%s\"", curhost))); } if (!success && elemlist != NIL) ereport(FATAL, (errmsg("could not create any TCP/IP sockets"))); list_free(elemlist); pfree(rawstring); } #ifdef USE_BONJOUR /* Register for Bonjour only if we opened TCP socket(s) */ if (enable_bonjour && NumListenSockets > 0) { DNSServiceErrorType err; /* * We pass 0 for interface_index, which will result in registering on * all "applicable" interfaces. It's not entirely clear from the * DNS-SD docs whether this would be appropriate if we have bound to * just a subset of the available network interfaces. 
*/ err = DNSServiceRegister(&bonjour_sdref, 0, 0, bonjour_name, "_postgresql._tcp.", NULL, NULL, pg_hton16(PostPortNumber), 0, NULL, NULL, NULL); if (err != kDNSServiceErr_NoError) ereport(LOG, (errmsg("DNSServiceRegister() failed: error code %ld", (long) err))); /* * We don't bother to read the mDNS daemon's reply, and we expect that * it will automatically terminate our registration when the socket is * closed at postmaster termination. So there's nothing more to be * done here. However, the bonjour_sdref is kept around so that * forked children can close their copies of the socket. */ } #endif if (Unix_socket_directories) { char *rawstring; List *elemlist; ListCell *l; int success = 0; /* Need a modifiable copy of Unix_socket_directories */ rawstring = pstrdup(Unix_socket_directories); /* Parse string into list of directories */ if (!SplitDirectoriesString(rawstring, ',', &elemlist)) { /* syntax error in list */ ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid list syntax in parameter \"%s\"", "unix_socket_directories"))); } foreach(l, elemlist) { char *socketdir = (char *) lfirst(l); status = ListenServerPort(AF_UNIX, NULL, (unsigned short) PostPortNumber, socketdir, ListenSockets, &NumListenSockets, MAXLISTEN); if (status == STATUS_OK) { success++; /* record the first successful Unix socket in lockfile */ if (success == 1) AddToDataDirLockFile(LOCK_FILE_LINE_SOCKET_DIR, socketdir); } else ereport(WARNING, (errmsg("could not create Unix-domain socket in directory \"%s\"", socketdir))); } if (!success && elemlist != NIL) ereport(FATAL, (errmsg("could not create any Unix-domain sockets"))); list_free_deep(elemlist); pfree(rawstring); } /* * check that we have some socket to listen on */ if (NumListenSockets == 0) ereport(FATAL, (errmsg("no socket created for listening"))); /* * If no valid TCP ports, write an empty line for listen address, * indicating the Unix socket must be used. 
Note that this line is not * added to the lock file until there is a socket backing it. */ if (!listen_addr_saved) AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, ""); /* * Record postmaster options. We delay this till now to avoid recording * bogus options (eg, unusable port number). */ if (!CreateOptsFile(argc, argv, my_exec_path)) ExitPostmaster(1); /* * Write the external PID file if requested */ if (external_pid_file) { FILE *fpidfile = fopen(external_pid_file, "w"); if (fpidfile) { fprintf(fpidfile, "%d\n", MyProcPid); fclose(fpidfile); /* Make PID file world readable */ if (chmod(external_pid_file, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) != 0) write_stderr("%s: could not change permissions of external PID file \"%s\": %m\n", progname, external_pid_file); } else write_stderr("%s: could not write external PID file \"%s\": %m\n", progname, external_pid_file); on_proc_exit(unlink_external_pid_file, 0); } /* * Remove old temporary files. At this point there can be no other * Postgres processes running in this directory, so this should be safe. */ RemovePgTempFiles(); /* * Initialize the autovacuum subsystem (again, no process start yet) */ autovac_init(); /* * Load configuration files for client authentication. */ if (!load_hba()) { /* * It makes no sense to continue if we fail to load the HBA file, * since there is no way to connect to the database in this case. */ ereport(FATAL, /* translator: %s is a configuration file */ (errmsg("could not load %s", HbaFileName))); } if (!load_ident()) { /* * We can start up without the IDENT file, although it means that you * cannot log in using any of the authentication methods that need a * user name mapping. load_ident() already logged the details of error * to the log. */ } #ifdef HAVE_PTHREAD_IS_THREADED_NP /* * On macOS, libintl replaces setlocale() with a version that calls * CFLocaleCopyCurrent() when its second argument is "" and every relevant * environment variable is unset or empty. 
CFLocaleCopyCurrent() makes * the process multithreaded. The postmaster calls sigprocmask() and * calls fork() without an immediate exec(), both of which have undefined * behavior in a multithreaded program. A multithreaded postmaster is the * normal case on Windows, which offers neither fork() nor sigprocmask(). */ if (pthread_is_threaded_np() != 0) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("postmaster became multithreaded during startup"), errhint("Set the LC_ALL environment variable to a valid locale."))); #endif /* * Remember postmaster startup time */ PgStartTime = GetCurrentTimestamp(); /* * Report postmaster status in the postmaster.pid file, to allow pg_ctl to * see what's happening. */ AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STARTING); /* Start bgwriter and checkpointer so they can help with recovery */ if (CheckpointerPID == 0) CheckpointerPID = StartChildProcess(B_CHECKPOINTER); if (BgWriterPID == 0) BgWriterPID = StartChildProcess(B_BG_WRITER); /* * We're ready to rock and roll... */ StartupPID = StartChildProcess(B_STARTUP); Assert(StartupPID != 0); StartupStatus = STARTUP_RUNNING; pmState = PM_STARTUP; /* Some workers may be scheduled to start now */ maybe_start_bgworkers(); status = ServerLoop(); /* * ServerLoop probably shouldn't ever return, but if it does, close down. */ ExitPostmaster(status != STATUS_OK); abort(); /* not reached */ } /* * on_proc_exit callback to close server's listen sockets */ static void CloseServerPorts(int status, Datum arg) { int i; /* * First, explicitly close all the socket FDs. We used to just let this * happen implicitly at postmaster exit, but it's better to close them * before we remove the postmaster.pid lockfile; otherwise there's a race * condition if a new postmaster wants to re-use the TCP port number. 
*/ for (i = 0; i < NumListenSockets; i++) { if (closesocket(ListenSockets[i]) != 0) elog(LOG, "could not close listen socket: %m"); } NumListenSockets = 0; /* * Next, remove any filesystem entries for Unix sockets. To avoid race * conditions against incoming postmasters, this must happen after closing * the sockets and before removing lock files. */ RemoveSocketFiles(); /* * We don't do anything about socket lock files here; those will be * removed in a later on_proc_exit callback. */ } /* * on_proc_exit callback to delete external_pid_file */ static void unlink_external_pid_file(int status, Datum arg) { if (external_pid_file) unlink(external_pid_file); } /* * Compute and check the directory paths to files that are part of the * installation (as deduced from the postgres executable's own location) */ static void getInstallationPaths(const char *argv0) { DIR *pdir; /* Locate the postgres executable itself */ if (find_my_exec(argv0, my_exec_path) < 0) ereport(FATAL, (errmsg("%s: could not locate my own executable path", argv0))); #ifdef EXEC_BACKEND /* Locate executable backend before we change working directory */ if (find_other_exec(argv0, "postgres", PG_BACKEND_VERSIONSTR, postgres_exec_path) < 0) ereport(FATAL, (errmsg("%s: could not locate matching postgres executable", argv0))); #endif /* * Locate the pkglib directory --- this has to be set early in case we try * to load any modules from it in response to postgresql.conf entries. */ get_pkglib_path(my_exec_path, pkglib_path); /* * Verify that there's a readable directory there; otherwise the Postgres * installation is incomplete or corrupt. (A typical cause of this * failure is that the postgres executable has been moved or hardlinked to * some directory that's not a sibling of the installation lib/ * directory.) 
*/ pdir = AllocateDir(pkglib_path); if (pdir == NULL) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", pkglib_path), errhint("This may indicate an incomplete PostgreSQL installation, or that the file \"%s\" has been moved away from its proper location.", my_exec_path))); FreeDir(pdir); /* * It's not worth checking the share/ directory. If the lib/ directory is * there, then share/ probably is too. */ } /* * Check that pg_control exists in the correct location in the data directory. * * No attempt is made to validate the contents of pg_control here. This is * just a sanity check to see if we are looking at a real data directory. */ static void checkControlFile(void) { char path[MAXPGPATH]; FILE *fp; snprintf(path, sizeof(path), "%s/global/pg_control", DataDir); fp = AllocateFile(path, PG_BINARY_R); if (fp == NULL) { write_stderr("%s: could not find the database system\n" "Expected to find it in the directory \"%s\",\n" "but could not open file \"%s\": %m\n", progname, DataDir, path); ExitPostmaster(2); } FreeFile(fp); } /* * Determine how long should we let ServerLoop sleep, in milliseconds. * * In normal conditions we wait at most one minute, to ensure that the other * background tasks handled by ServerLoop get done even when no requests are * arriving. However, if there are background workers waiting to be started, * we don't actually sleep so that they are quickly serviced. Other exception * cases are as shown in the code. */ static int DetermineSleepTime(void) { TimestampTz next_wakeup = 0; /* * Normal case: either there are no background workers at all, or we're in * a shutdown sequence (during which we ignore bgworkers altogether). 
*/ if (Shutdown > NoShutdown || (!StartWorkerNeeded && !HaveCrashedWorker)) { if (AbortStartTime != 0) { int seconds; /* time left to abort; clamp to 0 in case it already expired */ seconds = SIGKILL_CHILDREN_AFTER_SECS - (time(NULL) - AbortStartTime); return Max(seconds * 1000, 0); } else return 60 * 1000; } if (StartWorkerNeeded) return 0; if (HaveCrashedWorker) { dlist_mutable_iter iter; /* * When there are crashed bgworkers, we sleep just long enough that * they are restarted when they request to be. Scan the list to * determine the minimum of all wakeup times according to most recent * crash time and requested restart interval. */ dlist_foreach_modify(iter, &BackgroundWorkerList) { RegisteredBgWorker *rw; TimestampTz this_wakeup; rw = dlist_container(RegisteredBgWorker, rw_lnode, iter.cur); if (rw->rw_crashed_at == 0) continue; if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART || rw->rw_terminate) { ForgetBackgroundWorker(rw); continue; } this_wakeup = TimestampTzPlusMilliseconds(rw->rw_crashed_at, 1000L * rw->rw_worker.bgw_restart_time); if (next_wakeup == 0 || this_wakeup < next_wakeup) next_wakeup = this_wakeup; } } if (next_wakeup != 0) { int ms; /* result of TimestampDifferenceMilliseconds is in [0, INT_MAX] */ ms = (int) TimestampDifferenceMilliseconds(GetCurrentTimestamp(), next_wakeup); return Min(60 * 1000, ms); } return 60 * 1000; } /* * Activate or deactivate notifications of server socket events. Since we * don't currently have a way to remove events from an existing WaitEventSet, * we'll just destroy and recreate the whole thing. This is called during * shutdown so we can wait for backends to exit without accepting new * connections, and during crash reinitialization when we need to start * listening for new connections again. The WaitEventSet will be freed in fork * children by ClosePostmasterPorts(). 
*/ static void ConfigurePostmasterWaitSet(bool accept_connections) { if (pm_wait_set) FreeWaitEventSet(pm_wait_set); pm_wait_set = NULL; pm_wait_set = CreateWaitEventSet(NULL, accept_connections ? (1 + NumListenSockets) : 1); AddWaitEventToSet(pm_wait_set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); if (accept_connections) { for (int i = 0; i < NumListenSockets; i++) AddWaitEventToSet(pm_wait_set, WL_SOCKET_ACCEPT, ListenSockets[i], NULL, NULL); } } /* * Main idle loop of postmaster */ static int ServerLoop(void) { time_t last_lockfile_recheck_time, last_touch_time; WaitEvent events[MAXLISTEN]; int nevents; ConfigurePostmasterWaitSet(true); last_lockfile_recheck_time = last_touch_time = time(NULL); for (;;) { time_t now; nevents = WaitEventSetWait(pm_wait_set, DetermineSleepTime(), events, lengthof(events), 0 /* postmaster posts no wait_events */ ); /* * Latch set by signal handler, or new connection pending on any of * our sockets? If the latter, fork a child process to deal with it. */ for (int i = 0; i < nevents; i++) { if (events[i].events & WL_LATCH_SET) ResetLatch(MyLatch); /* * The following requests are handled unconditionally, even if we * didn't see WL_LATCH_SET. This gives high priority to shutdown * and reload requests where the latch happens to appear later in * events[] or will be reported by a later call to * WaitEventSetWait(). 
*/ if (pending_pm_shutdown_request) process_pm_shutdown_request(); if (pending_pm_reload_request) process_pm_reload_request(); if (pending_pm_child_exit) process_pm_child_exit(); if (pending_pm_pmsignal) process_pm_pmsignal(); if (events[i].events & WL_SOCKET_ACCEPT) { ClientSocket s; if (AcceptConnection(events[i].fd, &s) == STATUS_OK) BackendStartup(&s); /* We no longer need the open socket in this process */ if (s.sock != PGINVALID_SOCKET) { if (closesocket(s.sock) != 0) elog(LOG, "could not close client socket: %m"); } } } /* * If we need to launch any background processes after changing state * or because some exited, do so now. */ LaunchMissingBackgroundProcesses(); /* If we need to signal the autovacuum launcher, do so now */ if (avlauncher_needs_signal) { avlauncher_needs_signal = false; if (AutoVacPID != 0) kill(AutoVacPID, SIGUSR2); } #ifdef HAVE_PTHREAD_IS_THREADED_NP /* * With assertions enabled, check regularly for appearance of * additional threads. All builds check at start and exit. */ Assert(pthread_is_threaded_np() == 0); #endif /* * Lastly, check to see if it's time to do some things that we don't * want to do every single time through the loop, because they're a * bit expensive. Note that there's up to a minute of slop in when * these tasks will be performed, since DetermineSleepTime() will let * us sleep at most that long; except for SIGKILL timeout which has * special-case logic there. */ now = time(NULL); /* * If we already sent SIGQUIT to children and they are slow to shut * down, it's time to send them SIGKILL (or SIGABRT if requested). * This doesn't happen normally, but under certain conditions backends * can get stuck while shutting down. This is a last measure to get * them unwedged. * * Note we also do this during recovery from a process crash. */ if ((Shutdown >= ImmediateShutdown || FatalError) && AbortStartTime != 0 && (now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS) { /* We were gentle with them before. 
Not anymore */ ereport(LOG, /* translator: %s is SIGKILL or SIGABRT */ (errmsg("issuing %s to recalcitrant children", send_abort_for_kill ? "SIGABRT" : "SIGKILL"))); TerminateChildren(send_abort_for_kill ? SIGABRT : SIGKILL); /* reset flag so we don't SIGKILL again */ AbortStartTime = 0; } /* * Once a minute, verify that postmaster.pid hasn't been removed or * overwritten. If it has, we force a shutdown. This avoids having * postmasters and child processes hanging around after their database * is gone, and maybe causing problems if a new database cluster is * created in the same place. It also provides some protection * against a DBA foolishly removing postmaster.pid and manually * starting a new postmaster. Data corruption is likely to ensue from * that anyway, but we can minimize the damage by aborting ASAP. */ if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE) { if (!RecheckDataDirLockFile()) { ereport(LOG, (errmsg("performing immediate shutdown because data directory lock file is invalid"))); kill(MyProcPid, SIGQUIT); } last_lockfile_recheck_time = now; } /* * Touch Unix socket and lock files every 58 minutes, to ensure that * they are not removed by overzealous /tmp-cleaning tasks. We assume * no one runs cleaners with cutoff times of less than an hour ... */ if (now - last_touch_time >= 58 * SECS_PER_MINUTE) { TouchSocketFiles(); TouchSocketLockFiles(); last_touch_time = now; } } } /* * canAcceptConnections --- check to see if database state allows connections * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. (Note that we don't yet * know whether a NORMAL connection might turn into a walsender.) */ static CAC_state canAcceptConnections(int backend_type) { CAC_state result = CAC_OK; /* * Can't start backends when in startup/shutdown/inconsistent recovery * state. We treat autovac workers the same as user backends for this * purpose. 
However, bgworkers are excluded from this test; we expect * bgworker_should_start_now() decided whether the DB state allows them. */ if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && backend_type != BACKEND_TYPE_BGWORKER) { if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ else if (!FatalError && pmState == PM_STARTUP) return CAC_STARTUP; /* normal startup */ else if (!FatalError && pmState == PM_RECOVERY) return CAC_NOTCONSISTENT; /* not yet at consistent recovery * state */ else return CAC_RECOVERY; /* else must be crash recovery */ } /* * "Smart shutdown" restrictions are applied only to normal connections, * not to autovac workers or bgworkers. */ if (!connsAllowed && backend_type == BACKEND_TYPE_NORMAL) return CAC_SHUTDOWN; /* shutdown is pending */ /* * Don't start too many children. * * We allow more connections here than we can have backends because some * might still be authenticating; they might fail auth, or some existing * backend might exit before the auth cycle is completed. The exact * MaxBackends limit is enforced when a new backend tries to join the * shared-inval backend array. * * The limit here must match the sizes of the per-child-process arrays; * see comments for MaxLivePostmasterChildren(). */ if (CountChildren(BACKEND_TYPE_ALL) >= MaxLivePostmasterChildren()) result = CAC_TOOMANY; return result; } /* * ClosePostmasterPorts -- close all the postmaster's open sockets * * This is called during child process startup to release file descriptors * that are not needed by that child process. The postmaster still has * them open, of course. * * Note: we pass am_syslogger as a boolean because we don't want to set * the global variable yet when this is called. */ void ClosePostmasterPorts(bool am_syslogger) { /* Release resources held by the postmaster's WaitEventSet. */ if (pm_wait_set) { FreeWaitEventSetAfterFork(pm_wait_set); pm_wait_set = NULL; } #ifndef WIN32 /* * Close the write end of postmaster death watch pipe. 
It's important to * do this as early as possible, so that if postmaster dies, others won't * think that it's still running because we're holding the pipe open. */ if (close(postmaster_alive_fds[POSTMASTER_FD_OWN]) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg_internal("could not close postmaster death monitoring pipe in child process: %m"))); postmaster_alive_fds[POSTMASTER_FD_OWN] = -1; /* Notify fd.c that we released one pipe FD. */ ReleaseExternalFD(); #endif /* * Close the postmaster's listen sockets. These aren't tracked by fd.c, * so we don't call ReleaseExternalFD() here. * * The listen sockets are marked as FD_CLOEXEC, so this isn't needed in * EXEC_BACKEND mode. */ #ifndef EXEC_BACKEND if (ListenSockets) { for (int i = 0; i < NumListenSockets; i++) { if (closesocket(ListenSockets[i]) != 0) elog(LOG, "could not close listen socket: %m"); } pfree(ListenSockets); } NumListenSockets = 0; ListenSockets = NULL; #endif /* * If using syslogger, close the read side of the pipe. We don't bother * tracking this in fd.c, either. */ if (!am_syslogger) { #ifndef WIN32 if (syslogPipe[0] >= 0) close(syslogPipe[0]); syslogPipe[0] = -1; #else if (syslogPipe[0]) CloseHandle(syslogPipe[0]); syslogPipe[0] = 0; #endif } #ifdef USE_BONJOUR /* If using Bonjour, close the connection to the mDNS daemon */ if (bonjour_sdref) close(DNSServiceRefSockFD(bonjour_sdref)); #endif } /* * InitProcessGlobals -- set MyProcPid, MyStartTime[stamp], random seeds * * Called early in the postmaster and every backend. */ void InitProcessGlobals(void) { MyProcPid = getpid(); MyStartTimestamp = GetCurrentTimestamp(); MyStartTime = timestamptz_to_time_t(MyStartTimestamp); /* * Set a different global seed in every process. We want something * unpredictable, so if possible, use high-quality random bits for the * seed. Otherwise, fall back to a seed based on timestamp and PID. 
*/ if (unlikely(!pg_prng_strong_seed(&pg_global_prng_state))) { uint64 rseed; /* * Since PIDs and timestamps tend to change more frequently in their * least significant bits, shift the timestamp left to allow a larger * total number of seeds in a given time period. Since that would * leave only 20 bits of the timestamp that cycle every ~1 second, * also mix in some higher bits. */ rseed = ((uint64) MyProcPid) ^ ((uint64) MyStartTimestamp << 12) ^ ((uint64) MyStartTimestamp >> 20); pg_prng_seed(&pg_global_prng_state, rseed); } /* * Also make sure that we've set a good seed for random(3). Use of that * is deprecated in core Postgres, but extensions might use it. */ #ifndef WIN32 srandom(pg_prng_uint32(&pg_global_prng_state)); #endif } /* * Child processes use SIGUSR1 to notify us of 'pmsignals'. pg_ctl uses * SIGUSR1 to ask postmaster to check for logrotate and promote files. */ static void handle_pm_pmsignal_signal(SIGNAL_ARGS) { pending_pm_pmsignal = true; SetLatch(MyLatch); } /* * pg_ctl uses SIGHUP to request a reload of the configuration files. */ static void handle_pm_reload_request_signal(SIGNAL_ARGS) { pending_pm_reload_request = true; SetLatch(MyLatch); } /* * Re-read config files, and tell children to do same. 
*/ static void process_pm_reload_request(void) { pending_pm_reload_request = false; ereport(DEBUG2, (errmsg_internal("postmaster received reload request signal"))); if (Shutdown <= SmartShutdown) { ereport(LOG, (errmsg("received SIGHUP, reloading configuration files"))); ProcessConfigFile(PGC_SIGHUP); SignalChildren(SIGHUP); if (StartupPID != 0) signal_child(StartupPID, SIGHUP); if (BgWriterPID != 0) signal_child(BgWriterPID, SIGHUP); if (CheckpointerPID != 0) signal_child(CheckpointerPID, SIGHUP); if (WalWriterPID != 0) signal_child(WalWriterPID, SIGHUP); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGHUP); if (WalSummarizerPID != 0) signal_child(WalSummarizerPID, SIGHUP); if (AutoVacPID != 0) signal_child(AutoVacPID, SIGHUP); if (PgArchPID != 0) signal_child(PgArchPID, SIGHUP); if (SysLoggerPID != 0) signal_child(SysLoggerPID, SIGHUP); if (SlotSyncWorkerPID != 0) signal_child(SlotSyncWorkerPID, SIGHUP); /* Reload authentication config files too */ if (!load_hba()) ereport(LOG, /* translator: %s is a configuration file */ (errmsg("%s was not reloaded", HbaFileName))); if (!load_ident()) ereport(LOG, (errmsg("%s was not reloaded", IdentFileName))); #ifdef USE_SSL /* Reload SSL configuration as well */ if (EnableSSL) { if (secure_initialize(false) == 0) LoadedSSL = true; else ereport(LOG, (errmsg("SSL configuration was not reloaded"))); } else { secure_destroy(); LoadedSSL = false; } #endif #ifdef EXEC_BACKEND /* Update the starting-point file for future children */ write_nondefault_variables(PGC_SIGHUP); #endif } } /* * pg_ctl uses SIGTERM, SIGINT and SIGQUIT to request different types of * shutdown. 
*/ static void handle_pm_shutdown_request_signal(SIGNAL_ARGS) { switch (postgres_signal_arg) { case SIGTERM: /* smart is implied if the other two flags aren't set */ pending_pm_shutdown_request = true; break; case SIGINT: pending_pm_fast_shutdown_request = true; pending_pm_shutdown_request = true; break; case SIGQUIT: pending_pm_immediate_shutdown_request = true; pending_pm_shutdown_request = true; break; } SetLatch(MyLatch); } /* * Process shutdown request. */ static void process_pm_shutdown_request(void) { int mode; ereport(DEBUG2, (errmsg_internal("postmaster received shutdown request signal"))); pending_pm_shutdown_request = false; /* * If more than one shutdown request signal arrived since the last server * loop, take the one that is the most immediate. That matches the * priority that would apply if we processed them one by one in any order. */ if (pending_pm_immediate_shutdown_request) { pending_pm_immediate_shutdown_request = false; pending_pm_fast_shutdown_request = false; mode = ImmediateShutdown; } else if (pending_pm_fast_shutdown_request) { pending_pm_fast_shutdown_request = false; mode = FastShutdown; } else mode = SmartShutdown; switch (mode) { case SmartShutdown: /* * Smart Shutdown: * * Wait for children to end their work, then shut down. */ if (Shutdown >= SmartShutdown) break; Shutdown = SmartShutdown; ereport(LOG, (errmsg("received smart shutdown request"))); /* Report status */ AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); #ifdef USE_SYSTEMD sd_notify(0, "STOPPING=1"); #endif /* * If we reached normal running, we go straight to waiting for * client backends to exit. If already in PM_STOP_BACKENDS or a * later state, do not change it. */ if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) connsAllowed = false; else if (pmState == PM_STARTUP || pmState == PM_RECOVERY) { /* There should be no clients, so proceed to stop children */ pmState = PM_STOP_BACKENDS; } /* * Now wait for online backup mode to end and backends to exit. 
If * that is already the case, PostmasterStateMachine will take the * next step. */ PostmasterStateMachine(); break; case FastShutdown: /* * Fast Shutdown: * * Abort all children with SIGTERM (rollback active transactions * and exit) and shut down when they are gone. */ if (Shutdown >= FastShutdown) break; Shutdown = FastShutdown; ereport(LOG, (errmsg("received fast shutdown request"))); /* Report status */ AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); #ifdef USE_SYSTEMD sd_notify(0, "STOPPING=1"); #endif if (pmState == PM_STARTUP || pmState == PM_RECOVERY) { /* Just shut down background processes silently */ pmState = PM_STOP_BACKENDS; } else if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) { /* Report that we're about to zap live client sessions */ ereport(LOG, (errmsg("aborting any active transactions"))); pmState = PM_STOP_BACKENDS; } /* * PostmasterStateMachine will issue any necessary signals, or * take the next step if no child processes need to be killed. */ PostmasterStateMachine(); break; case ImmediateShutdown: /* * Immediate Shutdown: * * abort all children with SIGQUIT, wait for them to exit, * terminate remaining ones with SIGKILL, then exit without * attempt to properly shut down the data base system. */ if (Shutdown >= ImmediateShutdown) break; Shutdown = ImmediateShutdown; ereport(LOG, (errmsg("received immediate shutdown request"))); /* Report status */ AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); #ifdef USE_SYSTEMD sd_notify(0, "STOPPING=1"); #endif /* tell children to shut down ASAP */ /* (note we don't apply send_abort_for_crash here) */ SetQuitSignalReason(PMQUIT_FOR_STOP); TerminateChildren(SIGQUIT); pmState = PM_WAIT_BACKENDS; /* set stopwatch for them to die */ AbortStartTime = time(NULL); /* * Now wait for backends to exit. If there are none, * PostmasterStateMachine will take the next step. 
*/
			PostmasterStateMachine();
			break;
	}
}

/*
 * Signal handler for child-exit notification.
 *
 * Does no real work itself: it only raises the pending_pm_child_exit flag
 * and sets our latch.  process_pm_child_exit() clears the flag and does the
 * actual reaping.
 */
static void
handle_pm_child_exit_signal(SIGNAL_ARGS)
{
	pending_pm_child_exit = true;
	SetLatch(MyLatch);
}

/*
 * Cleanup after a child process dies.
 *
 * Collects every already-exited child with a non-blocking waitpid() loop.
 * Each reaped PID is compared against the tracked special-process PIDs
 * (startup, bgwriter, checkpointer, ...) and finally against BackendList;
 * the first match decides how the exit is handled.  Once no more exited
 * children are reported, PostmasterStateMachine() is run once to act on
 * whatever state changes were made along the way.
 */
static void
process_pm_child_exit(void)
{
	int			pid;			/* process id of dead child process */
	int			exitstatus;		/* its exit status */

	pending_pm_child_exit = false;

	ereport(DEBUG4,
			(errmsg_internal("reaping dead processes")));

	/* WNOHANG: never block here, just drain whatever has already exited */
	while ((pid = waitpid(-1, &exitstatus, WNOHANG)) > 0)
	{
		bool		found;
		dlist_mutable_iter iter;

		/*
		 * Check if this child was a startup process.
		 */
		if (pid == StartupPID)
		{
			StartupPID = 0;

			/*
			 * Startup process exited in response to a shutdown request (or it
			 * completed normally regardless of the shutdown request).
			 */
			if (Shutdown > NoShutdown &&
				(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
			{
				StartupStatus = STARTUP_NOT_RUNNING;
				pmState = PM_WAIT_BACKENDS;
				/* PostmasterStateMachine logic does the rest */
				continue;
			}

			if (EXIT_STATUS_3(exitstatus))
			{
				ereport(LOG,
						(errmsg("shutdown at recovery target")));
				StartupStatus = STARTUP_NOT_RUNNING;
				/* never downgrade a stronger shutdown already in progress */
				Shutdown = Max(Shutdown, SmartShutdown);
				TerminateChildren(SIGTERM);
				pmState = PM_WAIT_BACKENDS;
				/* PostmasterStateMachine logic does the rest */
				continue;
			}

			/*
			 * Unexpected exit of startup process (including FATAL exit)
			 * during PM_STARTUP is treated as catastrophic. There are no
			 * other processes running yet, so we can just exit.
			 */
			if (pmState == PM_STARTUP &&
				StartupStatus != STARTUP_SIGNALED &&
				!EXIT_STATUS_0(exitstatus))
			{
				LogChildExit(LOG, _("startup process"),
							 pid, exitstatus);
				ereport(LOG,
						(errmsg("aborting startup due to startup process failure")));
				ExitPostmaster(1);
			}

			/*
			 * After PM_STARTUP, any unexpected exit (including FATAL exit) of
			 * the startup process is catastrophic, so kill other children,
			 * and set StartupStatus so we don't try to reinitialize after
			 * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
			 * then we previously sent the startup process a SIGQUIT; so
			 * that's probably the reason it died, and we do want to try to
			 * restart in that case.
			 *
			 * This stanza also handles the case where we sent a SIGQUIT
			 * during PM_STARTUP due to some dead_end child crashing: in that
			 * situation, if the startup process dies on the SIGQUIT, we need
			 * to transition to PM_WAIT_BACKENDS state which will allow
			 * PostmasterStateMachine to restart the startup process.  (On the
			 * other hand, the startup process might complete normally, if we
			 * were too late with the SIGQUIT.  In that case we'll fall
			 * through and commence normal operations.)
			 */
			if (!EXIT_STATUS_0(exitstatus))
			{
				if (StartupStatus == STARTUP_SIGNALED)
				{
					StartupStatus = STARTUP_NOT_RUNNING;
					if (pmState == PM_STARTUP)
						pmState = PM_WAIT_BACKENDS;
				}
				else
					StartupStatus = STARTUP_CRASHED;
				HandleChildCrash(pid, exitstatus,
								 _("startup process"));
				continue;
			}

			/*
			 * Startup succeeded, commence normal operations
			 */
			StartupStatus = STARTUP_NOT_RUNNING;
			FatalError = false;
			AbortStartTime = 0;
			ReachedNormalRunning = true;
			pmState = PM_RUN;
			connsAllowed = true;

			/*
			 * At the next iteration of the postmaster's main loop, we will
			 * crank up the background tasks like the autovacuum launcher and
			 * background workers that were not started earlier already.
			 */
			StartWorkerNeeded = true;

			/* at this point we are really open for business */
			ereport(LOG,
					(errmsg("database system is ready to accept connections")));

			/* Report status */
			AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_READY);
#ifdef USE_SYSTEMD
			sd_notify(0, "READY=1");
#endif

			continue;
		}

		/*
		 * Was it the bgwriter?  Normal exit can be ignored; we'll start a new
		 * one at the next iteration of the postmaster's main loop, if
		 * necessary.  Any other exit condition is treated as a crash.
		 */
		if (pid == BgWriterPID)
		{
			BgWriterPID = 0;
			if (!EXIT_STATUS_0(exitstatus))
				HandleChildCrash(pid, exitstatus,
								 _("background writer process"));
			continue;
		}

		/*
		 * Was it the checkpointer?
		 */
		if (pid == CheckpointerPID)
		{
			CheckpointerPID = 0;
			if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN)
			{
				/*
				 * OK, we saw normal exit of the checkpointer after it's been
				 * told to shut down.  We expect that it wrote a shutdown
				 * checkpoint.  (If for some reason it didn't, recovery will
				 * occur on next postmaster start.)
				 *
				 * At this point we should have no normal backend children
				 * left (else we'd not be in PM_SHUTDOWN state) but we might
				 * have dead_end children to wait for.
				 *
				 * If we have an archiver subprocess, tell it to do a last
				 * archive cycle and quit. Likewise, if we have walsender
				 * processes, tell them to send any remaining WAL and quit.
				 */
				Assert(Shutdown > NoShutdown);

				/* Waken archiver for the last time */
				if (PgArchPID != 0)
					signal_child(PgArchPID, SIGUSR2);

				/*
				 * Waken walsenders for the last time. No regular backends
				 * should be around anymore.
				 */
				SignalChildren(SIGUSR2);

				pmState = PM_SHUTDOWN_2;
			}
			else
			{
				/*
				 * Any unexpected exit of the checkpointer (including FATAL
				 * exit) is treated as a crash.
				 */
				HandleChildCrash(pid, exitstatus,
								 _("checkpointer process"));
			}

			continue;
		}

		/*
		 * Was it the wal writer?  Normal exit can be ignored; we'll start a
		 * new one at the next iteration of the postmaster's main loop, if
		 * necessary.  Any other exit condition is treated as a crash.
		 */
		if (pid == WalWriterPID)
		{
			WalWriterPID = 0;
			if (!EXIT_STATUS_0(exitstatus))
				HandleChildCrash(pid, exitstatus,
								 _("WAL writer process"));
			continue;
		}

		/*
		 * Was it the wal receiver?  If exit status is zero (normal) or one
		 * (FATAL exit), we assume everything is all right just like normal
		 * backends.  (If we need a new wal receiver, we'll start one at the
		 * next iteration of the postmaster's main loop.)
		 */
		if (pid == WalReceiverPID)
		{
			WalReceiverPID = 0;
			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
				HandleChildCrash(pid, exitstatus,
								 _("WAL receiver process"));
			continue;
		}

		/*
		 * Was it the wal summarizer? Normal exit can be ignored; we'll start
		 * a new one at the next iteration of the postmaster's main loop, if
		 * necessary.  Any other exit condition is treated as a crash.
		 */
		if (pid == WalSummarizerPID)
		{
			WalSummarizerPID = 0;
			if (!EXIT_STATUS_0(exitstatus))
				HandleChildCrash(pid, exitstatus,
								 _("WAL summarizer process"));
			continue;
		}

		/*
		 * Was it the autovacuum launcher?	Normal exit can be ignored; we'll
		 * start a new one at the next iteration of the postmaster's main
		 * loop, if necessary.  Any other exit condition is treated as a
		 * crash.
		 */
		if (pid == AutoVacPID)
		{
			AutoVacPID = 0;
			if (!EXIT_STATUS_0(exitstatus))
				HandleChildCrash(pid, exitstatus,
								 _("autovacuum launcher process"));
			continue;
		}

		/*
		 * Was it the archiver?  If exit status is zero (normal) or one (FATAL
		 * exit), we assume everything is all right just like normal backends
		 * and just try to start a new one on the next cycle of the
		 * postmaster's main loop, to retry archiving remaining files.
		 */
		if (pid == PgArchPID)
		{
			PgArchPID = 0;
			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
				HandleChildCrash(pid, exitstatus,
								 _("archiver process"));
			continue;
		}

		/* Was it the system logger?  If so, try to start a new one */
		if (pid == SysLoggerPID)
		{
			SysLoggerPID = 0;
			/* for safety's sake, launch new logger *first* */
			SysLoggerPID = SysLogger_Start();
			/* a syslogger exit is only logged, never treated as a crash */
			if (!EXIT_STATUS_0(exitstatus))
				LogChildExit(LOG, _("system logger process"),
							 pid, exitstatus);
			continue;
		}

		/*
		 * Was it the slot sync worker? Normal exit or FATAL exit can be
		 * ignored (FATAL can be caused by libpqwalreceiver on receiving
		 * shutdown request by the startup process during promotion); we'll
		 * start a new one at the next iteration of the postmaster's main
		 * loop, if necessary. Any other exit condition is treated as a crash.
		 */
		if (pid == SlotSyncWorkerPID)
		{
			SlotSyncWorkerPID = 0;
			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
				HandleChildCrash(pid, exitstatus,
								 _("slot sync worker process"));
			continue;
		}

		/*
		 * Was it a backend or a background worker?
		 */
		found = false;
		dlist_foreach_modify(iter, &BackendList)
		{
			Backend    *bp = dlist_container(Backend, elem, iter.cur);

			if (bp->pid == pid)
			{
				/* unlink first; CleanupBackend() frees the entry */
				dlist_delete(iter.cur);
				CleanupBackend(bp, exitstatus);
				found = true;
				break;
			}
		}

		/*
		 * We don't know anything about this child process.  That's highly
		 * unexpected, as we do track all the child processes that we fork.
		 */
		if (!found)
		{
			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
				HandleChildCrash(pid, exitstatus, _("untracked child process"));
			else
				LogChildExit(LOG, _("untracked child process"), pid, exitstatus);
		}
	}							/* loop over pending child-death reports */

	/*
	 * After cleaning out the SIGCHLD queue, see if we have any state changes
	 * or actions to make.
	 */
	PostmasterStateMachine();
}

/*
 * CleanupBackend -- cleanup after terminated backend or background worker.
 *
 * Remove all local state associated with backend.  The Backend entry has
 * already been unlinked from BackendList, but we will free it here.
 *
 * bp is the (already unlinked) list entry of the dead child; it is pfree'd
 * on every return path.  exitstatus is the status reported by waitpid().
 *
 * An exit status other than 0 or 1, or a failure to release the child's
 * postmaster child slot, is handed to HandleChildCrash().
 */
static void
CleanupBackend(Backend *bp,
			   int exitstatus)	/* child's exit status. */
{
	char		namebuf[MAXPGPATH];
	char	   *procname;
	bool		crashed = false;
	bool		logged = false;

	/* Construct a process name for log message */
	if (bp->dead_end)
	{
		procname = _("dead end backend");
	}
	else if (bp->bkend_type == BACKEND_TYPE_BGWORKER)
	{
		snprintf(namebuf, MAXPGPATH, _("background worker \"%s\""),
				 bp->rw->rw_worker.bgw_type);
		procname = namebuf;
	}
	else
		procname = _("server process");

	/*
	 * If a backend dies in an ugly way then we must signal all other backends
	 * to quickdie.  If exit status is zero (normal) or one (FATAL exit), we
	 * assume everything is all right and proceed to remove the backend from
	 * the active backend list.
	 */
	if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
		crashed = true;

#ifdef WIN32

	/*
	 * On win32, also treat ERROR_WAIT_NO_CHILDREN (128) as nonfatal case,
	 * since that sometimes happens under load when the process fails to start
	 * properly (long before it starts using shared memory). Microsoft reports
	 * it is related to mutex failure:
	 * http://archives.postgresql.org/pgsql-hackers/2010-09/msg00790.php
	 */
	if (exitstatus == ERROR_WAIT_NO_CHILDREN)
	{
		LogChildExit(LOG, procname, bp->pid, exitstatus);
		logged = true;
		crashed = false;
	}
#endif

	/*
	 * If the process attached to shared memory, check that it detached
	 * cleanly.
	 */
	if (!bp->dead_end)
	{
		if (!ReleasePostmasterChildSlot(bp->child_slot))
		{
			/*
			 * Uh-oh, the child failed to clean itself up.  Treat as a crash
			 * after all.
			 */
			crashed = true;
		}
	}

	if (crashed)
	{
		HandleChildCrash(bp->pid, exitstatus, procname);
		pfree(bp);
		return;
	}

	/*
	 * This backend may have been slated to receive SIGUSR1 when some
	 * background worker started or stopped.  Cancel those notifications, as
	 * we don't want to signal PIDs that are not PostgreSQL backends.  This
	 * gets skipped in the (probably very common) case where the backend has
	 * never requested any such notifications.
	 */
	if (bp->bgworker_notify)
		BackgroundWorkerStopNotifications(bp->pid);

	/*
	 * If it was a background worker, also update its RegisteredBgWorker
	 * entry.
	 */
	if (bp->bkend_type == BACKEND_TYPE_BGWORKER)
	{
		RegisteredBgWorker *rw = bp->rw;

		if (!EXIT_STATUS_0(exitstatus))
		{
			/* Record timestamp, so we know when to restart the worker. */
			rw->rw_crashed_at = GetCurrentTimestamp();
		}
		else
		{
			/* Zero exit status means terminate */
			rw->rw_crashed_at = 0;
			rw->rw_terminate = true;
		}

		rw->rw_pid = 0;
		ReportBackgroundWorkerExit(rw); /* report child death */

		if (!logged)
		{
			LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG,
						 procname, bp->pid, exitstatus);
			logged = true;
		}

		/* have it be restarted */
		HaveCrashedWorker = true;
	}

	/* anything not logged above gets only a low-level debug message */
	if (!logged)
		LogChildExit(DEBUG2, procname, bp->pid, exitstatus);

	pfree(bp);
}

/*
 * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
 * walwriter, autovacuum, archiver, slot sync worker, or background worker.
 *
 * The objectives here are to clean up our local state about the child
 * process, and to signal all other remaining children to quickdie.
 *
 * If it's a backend, the caller has already removed it from the BackendList.
 * If it's an aux process, the corresponding *PID global variable has been
 * reset already.
 *
 * pid and exitstatus identify the dead child and its waitpid() status;
 * procname is the (already translated) noun phrase used in the log message.
 */
static void
HandleChildCrash(int pid, int exitstatus, const char *procname)
{
	bool		take_action;

	/*
	 * We only log messages and send signals if this is the first process
	 * crash and we're not doing an immediate shutdown; otherwise, we're only
	 * here to update postmaster's idea of live processes.  If we have already
	 * signaled children, nonzero exit status is to be expected, so don't
	 * clutter log.
	 */
	take_action = !FatalError && Shutdown != ImmediateShutdown;

	if (take_action)
	{
		LogChildExit(LOG, procname, pid, exitstatus);
		ereport(LOG,
				(errmsg("terminating any other active server processes")));
		SetQuitSignalReason(PMQUIT_FOR_CRASH);
	}

	if (take_action)
	{
		dlist_iter	iter;

		dlist_foreach(iter, &BackendList)
		{
			Backend    *bp = dlist_container(Backend, elem, iter.cur);

			/*
			 * This backend is still alive.  Unless we did so already, tell it
			 * to commit hara-kiri.
			 *
			 * We could exclude dead_end children here, but at least when
			 * sending SIGABRT it seems better to include them.
			 */
			sigquit_child(bp->pid);
		}

		if (StartupPID != 0)
		{
			sigquit_child(StartupPID);
			/* remember that we, not a failure of its own, killed it */
			StartupStatus = STARTUP_SIGNALED;
		}

		/* Take care of the bgwriter too */
		if (BgWriterPID != 0)
			sigquit_child(BgWriterPID);

		/* Take care of the checkpointer too */
		if (CheckpointerPID != 0)
			sigquit_child(CheckpointerPID);

		/* Take care of the walwriter too */
		if (WalWriterPID != 0)
			sigquit_child(WalWriterPID);

		/* Take care of the walreceiver too */
		if (WalReceiverPID != 0)
			sigquit_child(WalReceiverPID);

		/* Take care of the walsummarizer too */
		if (WalSummarizerPID != 0)
			sigquit_child(WalSummarizerPID);

		/* Take care of the autovacuum launcher too */
		if (AutoVacPID != 0)
			sigquit_child(AutoVacPID);

		/* Take care of the archiver too */
		if (PgArchPID != 0)
			sigquit_child(PgArchPID);

		/* Take care of the slot sync worker too */
		if (SlotSyncWorkerPID != 0)
			sigquit_child(SlotSyncWorkerPID);

		/* We do NOT restart the syslogger */
	}

	if (Shutdown != ImmediateShutdown)
		FatalError = true;

	/* We now transit into a state of waiting for children to die */
	if (pmState == PM_RECOVERY ||
		pmState == PM_HOT_STANDBY ||
		pmState == PM_RUN ||
		pmState == PM_STOP_BACKENDS ||
		pmState == PM_SHUTDOWN)
		pmState = PM_WAIT_BACKENDS;

	/*
	 * .. and if this doesn't happen quickly enough, now the clock is ticking
	 * for us to kill them without mercy.
	 */
	if (AbortStartTime == 0)
		AbortStartTime = time(NULL);
}

/*
 * Log the death of a child process.
*/ static void LogChildExit(int lev, const char *procname, int pid, int exitstatus) { /* * size of activity_buffer is arbitrary, but set equal to default * track_activity_query_size */ char activity_buffer[1024]; const char *activity = NULL; if (!EXIT_STATUS_0(exitstatus)) activity = pgstat_get_crashed_backend_activity(pid, activity_buffer, sizeof(activity_buffer)); if (WIFEXITED(exitstatus)) ereport(lev, /*------ translator: %s is a noun phrase describing a child process, such as "server process" */ (errmsg("%s (PID %d) exited with exit code %d", procname, pid, WEXITSTATUS(exitstatus)), activity ? errdetail("Failed process was running: %s", activity) : 0)); else if (WIFSIGNALED(exitstatus)) { #if defined(WIN32) ereport(lev, /*------ translator: %s is a noun phrase describing a child process, such as "server process" */ (errmsg("%s (PID %d) was terminated by exception 0x%X", procname, pid, WTERMSIG(exitstatus)), errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."), activity ? errdetail("Failed process was running: %s", activity) : 0)); #else ereport(lev, /*------ translator: %s is a noun phrase describing a child process, such as "server process" */ (errmsg("%s (PID %d) was terminated by signal %d: %s", procname, pid, WTERMSIG(exitstatus), pg_strsignal(WTERMSIG(exitstatus))), activity ? errdetail("Failed process was running: %s", activity) : 0)); #endif } else ereport(lev, /*------ translator: %s is a noun phrase describing a child process, such as "server process" */ (errmsg("%s (PID %d) exited with unrecognized status %d", procname, pid, exitstatus), activity ? errdetail("Failed process was running: %s", activity) : 0)); } /* * Advance the postmaster's state machine and take actions as appropriate * * This is common code for process_pm_shutdown_request(), * process_pm_child_exit() and process_pm_pmsignal(), which process the signals * that might mean we need to change state. 
*/
static void
PostmasterStateMachine(void)
{
	/*
	 * Note for readers: the stanzas below are consecutive "if"s, not an
	 * if/else chain, so a pmState change made by one stanza can be acted on
	 * by a later stanza within the same call.
	 */

	/* If we're doing a smart shutdown, try to advance that state. */
	if (pmState == PM_RUN || pmState == PM_HOT_STANDBY)
	{
		if (!connsAllowed)
		{
			/*
			 * This state ends when we have no normal client backends running.
			 * Then we're ready to stop other children.
			 */
			if (CountChildren(BACKEND_TYPE_NORMAL) == 0)
				pmState = PM_STOP_BACKENDS;
		}
	}

	/*
	 * If we're ready to do so, signal child processes to shut down.  (This
	 * isn't a persistent state, but treating it as a distinct pmState allows
	 * us to share this code across multiple shutdown code paths.)
	 */
	if (pmState == PM_STOP_BACKENDS)
	{
		/*
		 * Forget any pending requests for background workers, since we're no
		 * longer willing to launch any new workers.  (If additional requests
		 * arrive, BackgroundWorkerStateChange will reject them.)
		 */
		ForgetUnstartedBackgroundWorkers();

		/* Signal all backend children except walsenders */
		SignalSomeChildren(SIGTERM,
						   BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND);
		/* and the autovac launcher too */
		if (AutoVacPID != 0)
			signal_child(AutoVacPID, SIGTERM);
		/* and the bgwriter too */
		if (BgWriterPID != 0)
			signal_child(BgWriterPID, SIGTERM);
		/* and the walwriter too */
		if (WalWriterPID != 0)
			signal_child(WalWriterPID, SIGTERM);
		/* If we're in recovery, also stop startup and walreceiver procs */
		if (StartupPID != 0)
			signal_child(StartupPID, SIGTERM);
		if (WalReceiverPID != 0)
			signal_child(WalReceiverPID, SIGTERM);
		if (WalSummarizerPID != 0)
			signal_child(WalSummarizerPID, SIGTERM);
		if (SlotSyncWorkerPID != 0)
			signal_child(SlotSyncWorkerPID, SIGTERM);
		/* checkpointer, archiver, stats, and syslogger may continue for now */

		/* Now transition to PM_WAIT_BACKENDS state to wait for them to die */
		pmState = PM_WAIT_BACKENDS;
	}

	/*
	 * If we are in a state-machine state that implies waiting for backends to
	 * exit, see if they're all gone, and change state if so.
	 */
	if (pmState == PM_WAIT_BACKENDS)
	{
		/*
		 * PM_WAIT_BACKENDS state ends when we have no regular backends
		 * (including autovac workers), no bgworkers (including unconnected
		 * ones), and no walwriter, autovac launcher, bgwriter or slot sync
		 * worker.  If we are doing crash recovery or an immediate shutdown
		 * then we expect the checkpointer to exit as well, otherwise not. The
		 * stats and syslogger processes are disregarded since they are not
		 * connected to shared memory; we also disregard dead_end children
		 * here. Walsenders and archiver are also disregarded, they will be
		 * terminated later after writing the checkpoint record.
		 */
		if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 &&
			StartupPID == 0 &&
			WalReceiverPID == 0 &&
			WalSummarizerPID == 0 &&
			BgWriterPID == 0 &&
			(CheckpointerPID == 0 ||
			 (!FatalError && Shutdown < ImmediateShutdown)) &&
			WalWriterPID == 0 &&
			AutoVacPID == 0 &&
			SlotSyncWorkerPID == 0)
		{
			if (Shutdown >= ImmediateShutdown || FatalError)
			{
				/*
				 * Start waiting for dead_end children to die.  This state
				 * change causes ServerLoop to stop creating new ones.
				 */
				pmState = PM_WAIT_DEAD_END;

				/*
				 * We already SIGQUIT'd the archiver and stats processes, if
				 * any, when we started immediate shutdown or entered
				 * FatalError state.
				 */
			}
			else
			{
				/*
				 * If we get here, we are proceeding with normal shutdown. All
				 * the regular children are gone, and it's time to tell the
				 * checkpointer to do a shutdown checkpoint.
				 */
				Assert(Shutdown > NoShutdown);
				/* Start the checkpointer if not running */
				if (CheckpointerPID == 0)
					CheckpointerPID = StartChildProcess(B_CHECKPOINTER);
				/* And tell it to shut down */
				if (CheckpointerPID != 0)
				{
					signal_child(CheckpointerPID, SIGUSR2);
					pmState = PM_SHUTDOWN;
				}
				else
				{
					/*
					 * If we failed to fork a checkpointer, just shut down.
					 * Any required cleanup will happen at next restart. We
					 * set FatalError so that an "abnormal shutdown" message
					 * gets logged when we exit.
					 *
					 * We don't consult send_abort_for_crash here, as it's
					 * unlikely that dumping cores would illuminate the reason
					 * for checkpointer fork failure.
					 */
					FatalError = true;
					pmState = PM_WAIT_DEAD_END;

					/* Kill the walsenders and archiver too */
					SignalChildren(SIGQUIT);
					if (PgArchPID != 0)
						signal_child(PgArchPID, SIGQUIT);
				}
			}
		}
	}

	if (pmState == PM_SHUTDOWN_2)
	{
		/*
		 * PM_SHUTDOWN_2 state ends when there's no other children than
		 * dead_end children left. There shouldn't be any regular backends
		 * left by now anyway; what we're really waiting for is walsenders and
		 * archiver.
		 */
		if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0)
		{
			pmState = PM_WAIT_DEAD_END;
		}
	}

	if (pmState == PM_WAIT_DEAD_END)
	{
		/* Don't allow any new socket connection events. */
		ConfigurePostmasterWaitSet(false);

		/*
		 * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
		 * (ie, no dead_end children remain), and the archiver is gone too.
		 *
		 * The reason we wait for those two is to protect them against a new
		 * postmaster starting conflicting subprocesses; this isn't an
		 * ironclad protection, but it at least helps in the
		 * shutdown-and-immediately-restart scenario.  Note that they have
		 * already been sent appropriate shutdown signals, either during a
		 * normal state transition leading up to PM_WAIT_DEAD_END, or during
		 * FatalError processing.
		 */
		if (dlist_is_empty(&BackendList) && PgArchPID == 0)
		{
			/* These other guys should be dead already */
			Assert(StartupPID == 0);
			Assert(WalReceiverPID == 0);
			Assert(WalSummarizerPID == 0);
			Assert(BgWriterPID == 0);
			Assert(CheckpointerPID == 0);
			Assert(WalWriterPID == 0);
			Assert(AutoVacPID == 0);
			Assert(SlotSyncWorkerPID == 0);
			/* syslogger is not considered here */
			pmState = PM_NO_CHILDREN;
		}
	}

	/*
	 * If we've been told to shut down, we exit as soon as there are no
	 * remaining children.  If there was a crash, cleanup will occur at the
	 * next startup.  (Before PostgreSQL 8.3, we tried to recover from the
	 * crash before exiting, but that seems unwise if we are quitting because
	 * we got SIGTERM from init --- there may well not be time for recovery
	 * before init decides to SIGKILL us.)
	 *
	 * Note that the syslogger continues to run.  It will exit when it sees
	 * EOF on its input pipe, which happens when there are no more upstream
	 * processes.
	 */
	if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN)
	{
		if (FatalError)
		{
			ereport(LOG, (errmsg("abnormal database system shutdown")));
			ExitPostmaster(1);
		}
		else
		{
			/*
			 * Normal exit from the postmaster is here.  We don't need to log
			 * anything here, since the UnlinkLockFiles proc_exit callback
			 * will do so, and that should be the last user-visible action.
			 */
			ExitPostmaster(0);
		}
	}

	/*
	 * If the startup process failed, or the user does not want an automatic
	 * restart after backend crashes, wait for all non-syslogger children to
	 * exit, and then exit postmaster.  We don't try to reinitialize when the
	 * startup process fails, because more than likely it will just fail again
	 * and we will keep trying forever.
	 */
	if (pmState == PM_NO_CHILDREN)
	{
		if (StartupStatus == STARTUP_CRASHED)
		{
			ereport(LOG,
					(errmsg("shutting down due to startup process failure")));
			ExitPostmaster(1);
		}
		if (!restart_after_crash)
		{
			ereport(LOG,
					(errmsg("shutting down because \"restart_after_crash\" is off")));
			ExitPostmaster(1);
		}
	}

	/*
	 * If we need to recover from a crash, wait for all non-syslogger children
	 * to exit, then reset shmem and start the startup process.
	 */
	if (FatalError && pmState == PM_NO_CHILDREN)
	{
		ereport(LOG,
				(errmsg("all server processes terminated; reinitializing")));

		/* remove leftover temporary files after a crash */
		if (remove_temp_files_after_crash)
			RemovePgTempFiles();

		/* allow background workers to immediately restart */
		ResetBackgroundWorkerCrashTimes();

		shmem_exit(1);

		/* re-read control file into local memory */
		LocalProcessControlFile(true);

		/* re-create shared memory and semaphores */
		CreateSharedMemoryAndSemaphores();

		StartupPID = StartChildProcess(B_STARTUP);
		Assert(StartupPID != 0);
		StartupStatus = STARTUP_RUNNING;
		pmState = PM_STARTUP;
		/* crash recovery started, reset SIGKILL flag */
		AbortStartTime = 0;

		/* start accepting server socket connection events again */
		ConfigurePostmasterWaitSet(true);
	}
}

/*
 * Launch background processes after state change, or relaunch after an
 * existing process has exited.
 *
 * Check the current pmState and the status of any background processes.  If
 * there are any background processes missing that should be running in the
 * current state, but are not, launch them.
 *
 * Each launch below is attempted only when the corresponding *PID variable
 * is zero, and the variable is set from the launch function's return value.
 */
static void
LaunchMissingBackgroundProcesses(void)
{
	/* Syslogger is active in all states */
	if (SysLoggerPID == 0 && Logging_collector)
		SysLoggerPID = SysLogger_Start();

	/*
	 * The checkpointer and the background writer are active from the start,
	 * until shutdown is initiated.
	 *
	 * (If the checkpointer is not running when we enter the PM_SHUTDOWN
	 * state, it is launched one more time to perform the shutdown checkpoint.
	 * That's done in PostmasterStateMachine(), not here.)
	 */
	if (pmState == PM_RUN || pmState == PM_RECOVERY ||
		pmState == PM_HOT_STANDBY || pmState == PM_STARTUP)
	{
		if (CheckpointerPID == 0)
			CheckpointerPID = StartChildProcess(B_CHECKPOINTER);
		if (BgWriterPID == 0)
			BgWriterPID = StartChildProcess(B_BG_WRITER);
	}

	/*
	 * WAL writer is needed only in normal operation (else we cannot be
	 * writing any new WAL).
	 */
	if (WalWriterPID == 0 && pmState == PM_RUN)
		WalWriterPID = StartChildProcess(B_WAL_WRITER);

	/*
	 * We don't want autovacuum to run in binary upgrade mode because
	 * autovacuum might update relfrozenxid for empty tables before the
	 * physical files are put in place.
	 */
	if (!IsBinaryUpgrade && AutoVacPID == 0 &&
		(AutoVacuumingActive() || start_autovac_launcher) &&
		pmState == PM_RUN)
	{
		AutoVacPID = StartChildProcess(B_AUTOVAC_LAUNCHER);
		if (AutoVacPID != 0)
			start_autovac_launcher = false; /* signal processed */
	}

	/*
	 * If WAL archiving is enabled always, we are allowed to start archiver
	 * even during recovery.
	 */
	if (PgArchPID == 0 &&
		((XLogArchivingActive() && pmState == PM_RUN) ||
		 (XLogArchivingAlways() && (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) &&
		PgArchCanRestart())
		PgArchPID = StartChildProcess(B_ARCHIVER);

	/*
	 * If we need to start a slot sync worker, try to do that now
	 *
	 * We allow to start the slot sync worker when we are on a hot standby,
	 * fast or immediate shutdown is not in progress, slot sync parameters are
	 * configured correctly, and it is the first time of worker's launch, or
	 * enough time has passed since the worker was launched last.
	 */
	if (SlotSyncWorkerPID == 0 && pmState == PM_HOT_STANDBY &&
		Shutdown <= SmartShutdown && sync_replication_slots &&
		ValidateSlotSyncParams(LOG) && SlotSyncWorkerCanRestart())
		SlotSyncWorkerPID = StartChildProcess(B_SLOTSYNC_WORKER);

	/*
	 * If we need to start a WAL receiver, try to do that now
	 *
	 * Note: if WalReceiverPID is already nonzero, it might seem that we
	 * should clear WalReceiverRequested.  However, there's a race condition
	 * if the walreceiver terminates and the startup process immediately
	 * requests a new one: it's quite possible to get the signal for the
	 * request before reaping the dead walreceiver process.  Better to risk
	 * launching an extra walreceiver than to miss launching one we need. (The
	 * walreceiver code has logic to recognize that it should go away if not
	 * needed.)
	 */
	if (WalReceiverRequested)
	{
		if (WalReceiverPID == 0 &&
			(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
			 pmState == PM_HOT_STANDBY) &&
			Shutdown <= SmartShutdown)
		{
			WalReceiverPID = StartChildProcess(B_WAL_RECEIVER);
			if (WalReceiverPID != 0)
				WalReceiverRequested = false;
			/* else leave the flag set, so we'll try again later */
		}
	}

	/* If we need to start a WAL summarizer, try to do that now */
	if (summarize_wal && WalSummarizerPID == 0 &&
		(pmState == PM_RUN || pmState == PM_HOT_STANDBY) &&
		Shutdown <= SmartShutdown)
		WalSummarizerPID = StartChildProcess(B_WAL_SUMMARIZER);

	/* Get other worker processes running, if needed */
	if (StartWorkerNeeded || HaveCrashedWorker)
		maybe_start_bgworkers();
}

/*
 * Send a signal to a postmaster child process
 *
 * On systems that have setsid(), each child process sets itself up as a
 * process group leader.  For signals that are generally interpreted in the
 * appropriate fashion, we signal the entire process group not just the
 * direct child process.  This allows us to, for example, SIGQUIT a blocked
 * archive_recovery script, or SIGINT a script being run by a backend via
 * system().
 *
 * There is a race condition for recently-forked children: they might not
 * have executed setsid() yet.  So we signal the child directly as well as
 * the group.  We assume such a child will handle the signal before trying
 * to spawn any grandchild processes.  We also assume that signaling the
 * child twice will not cause any problems.
 *
 * pid is the child to signal and signal is the signal number.  A failing
 * kill() is only reported at DEBUG3; nothing is returned to the caller.
 */
static void
signal_child(pid_t pid, int signal)
{
	if (kill(pid, signal) < 0)
		elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) pid, signal);
#ifdef HAVE_SETSID
	switch (signal)
	{
		case SIGINT:
		case SIGTERM:
		case SIGQUIT:
		case SIGKILL:
		case SIGABRT:
			/* negative pid addresses the whole process group */
			if (kill(-pid, signal) < 0)
				elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) (-pid), signal);
			break;
		default:
			break;
	}
#endif
}

/*
 * Convenience function for killing a child process after a crash of some
 * other child process.
We log the action at a higher level than we would * otherwise do, and we apply send_abort_for_crash to decide which signal * to send. Normally it's SIGQUIT -- and most other comments in this file * are written on the assumption that it is -- but developers might prefer * to use SIGABRT to collect per-child core dumps. */ static void sigquit_child(pid_t pid) { ereport(DEBUG2, (errmsg_internal("sending %s to process %d", (send_abort_for_crash ? "SIGABRT" : "SIGQUIT"), (int) pid))); signal_child(pid, (send_abort_for_crash ? SIGABRT : SIGQUIT)); } /* * Send a signal to the targeted children (but NOT special children; * dead_end children are never signaled, either). */ static bool SignalSomeChildren(int signal, int target) { dlist_iter iter; bool signaled = false; dlist_foreach(iter, &BackendList) { Backend *bp = dlist_container(Backend, elem, iter.cur); if (bp->dead_end) continue; /* * Since target == BACKEND_TYPE_ALL is the most common case, we test * it first and avoid touching shared memory for every child. */ if (target != BACKEND_TYPE_ALL) { /* * Assign bkend_type for any recently announced WAL Sender * processes. */ if (bp->bkend_type == BACKEND_TYPE_NORMAL && IsPostmasterChildWalSender(bp->child_slot)) bp->bkend_type = BACKEND_TYPE_WALSND; if (!(target & bp->bkend_type)) continue; } ereport(DEBUG4, (errmsg_internal("sending signal %d to process %d", signal, (int) bp->pid))); signal_child(bp->pid, signal); signaled = true; } return signaled; } /* * Send a termination signal to children. This considers all of our children * processes, except syslogger and dead_end backends. 
*/
static void
TerminateChildren(int signal)
{
	int			i;

	/* Regular children tracked in BackendList come first. */
	SignalChildren(signal);

	/*
	 * The startup process needs extra bookkeeping: if we hit it with one of
	 * the "hard" signals, remember that we did so.
	 */
	if (StartupPID != 0)
	{
		signal_child(StartupPID, signal);
		switch (signal)
		{
			case SIGQUIT:
			case SIGKILL:
			case SIGABRT:
				StartupStatus = STARTUP_SIGNALED;
				break;
			default:
				break;
		}
	}

	/* Remaining special children, signaled in this fixed order. */
	{
		const pid_t aux_pids[] = {
			BgWriterPID,
			CheckpointerPID,
			WalWriterPID,
			WalReceiverPID,
			WalSummarizerPID,
			AutoVacPID,
			PgArchPID,
			SlotSyncWorkerPID
		};

		for (i = 0; i < (int) (sizeof(aux_pids) / sizeof(aux_pids[0])); i++)
		{
			if (aux_pids[i] != 0)
				signal_child(aux_pids[i], signal);
		}
	}
}

/*
 * BackendStartup -- start backend process
 *
 * Launches a child process to serve the connection on client_sock and, on
 * success, records it at the head of BackendList.
 *
 * returns: STATUS_ERROR if the fork failed, STATUS_OK otherwise.
 *
 * Note: if you change this code, also consider StartAutovacuumWorker.
 */
static int
BackendStartup(ClientSocket *client_sock)
{
	BackendStartupData startup_info;
	Backend    *newbe;			/* bookkeeping entry for the new child */
	pid_t		child_pid;

	/*
	 * Allocate the bookkeeping entry up front: failing here, before any
	 * child exists, is much easier to handle cleanly.
	 */
	newbe = (Backend *) palloc_extended(sizeof(Backend), MCXT_ALLOC_NO_OOM);
	if (newbe == NULL)
	{
		ereport(LOG,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
		return STATUS_ERROR;
	}

	/* Pass down canAcceptConnections state */
	startup_info.canAcceptConnections = canAcceptConnections(BACKEND_TYPE_NORMAL);

	newbe->dead_end = (startup_info.canAcceptConnections != CAC_OK);
	newbe->rw = NULL;
	/* Hasn't asked to be notified about any bgworkers yet */
	newbe->bgworker_notify = false;

	/* Only children that are not dead_end get a child slot number. */
	if (newbe->dead_end)
		newbe->child_slot = 0;
	else
	{
		MyPMChildSlot = AssignPostmasterChildSlot();
		newbe->child_slot = MyPMChildSlot;
	}

	child_pid = postmaster_child_launch(B_BACKEND,
										(char *) &startup_info,
										sizeof(startup_info),
										client_sock);

	if (child_pid < 0)
	{
		/* in parent, fork failed */
		int			save_errno = errno;

		/* Undo the slot assignment and release the entry. */
		if (!newbe->dead_end)
			(void) ReleasePostmasterChildSlot(newbe->child_slot);
		pfree(newbe);

		/* The cleanup above may have clobbered errno; %m needs it back. */
		errno = save_errno;
		ereport(LOG,
				(errmsg("could not fork new process for connection: %m")));
		report_fork_failure_to_client(client_sock, save_errno);
		return STATUS_ERROR;
	}

	/* in parent, successful fork */
	ereport(DEBUG2,
			(errmsg_internal("forked new backend, pid=%d socket=%d",
							 (int) child_pid, (int) client_sock->sock)));

	/*
	 * The launch worked, so the entry can now safely go on our list of
	 * backends.
	 */
	newbe->pid = child_pid;
	newbe->bkend_type = BACKEND_TYPE_NORMAL;	/* Can change later to WALSND */
	dlist_push_head(&BackendList, &newbe->elem);

	return STATUS_OK;
}

/*
 * Try to report backend fork() failure to client before we close the
 * connection.  Since we do not care to risk blocking the postmaster on
 * this connection, we set the connection to non-blocking and try only once.
 *
 * This is grungy special-purpose code; we cannot use backend libpq since
 * it's not up and running.
*/
static void
report_fork_failure_to_client(ClientSocket *client_sock, int errnum)
{
	char		buffer[1000];	/* fixed-size scratch space for the packet */
	int			rc;

	/*
	 * Format the error message packet (always V2 protocol): a leading 'E',
	 * the translated message prefix, the strerror() text for errnum and a
	 * trailing newline.  snprintf() bounds the result to sizeof(buffer).
	 */
	snprintf(buffer, sizeof(buffer), "E%s%s\n",
			 _("could not fork new process for connection: "),
			 strerror(errnum));

	/* Set port to non-blocking.  Don't do send() if this fails */
	if (!pg_set_noblock(client_sock->sock))
		return;

	/*
	 * We'll retry after EINTR, but ignore all other failures.  The length
	 * passed is strlen(buffer) + 1, so the terminating NUL byte is sent as
	 * part of the packet.
	 */
	do
	{
		rc = send(client_sock->sock, buffer, strlen(buffer) + 1, 0);
	} while (rc < 0 && errno == EINTR);
}


/*
 * ExitPostmaster -- cleanup
 *
 * Do NOT call exit() directly --- always go through here!
 *
 * "status" is handed unchanged to proc_exit(), which is the last thing this
 * function calls.
 */
static void
ExitPostmaster(int status)
{
#ifdef HAVE_PTHREAD_IS_THREADED_NP

	/*
	 * There is no known cause for a postmaster to become multithreaded after
	 * startup.  Recheck to account for the possibility of unknown causes.
	 * This message uses LOG level, because an unclean shutdown at this point
	 * would usually not look much different from a clean shutdown.
	 */
	if (pthread_is_threaded_np() != 0)
		ereport(LOG,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg_internal("postmaster became multithreaded"),
				 errdetail("Please report this to <%s>.", PACKAGE_BUGREPORT)));
#endif

	/* should cleanup shared memory and kill all backends */

	/*
	 * Not sure of the semantics here.  When the Postmaster dies, should the
	 * backends all be killed? probably not.
	 *
	 * MUST		-- vadim 05-10-1999
	 */

	proc_exit(status);
}

/*
 * Handle pmsignal conditions representing requests from backends,
 * and check for promote and logrotate requests from pg_ctl.
 *
 * The function first clears pending_pm_pmsignal and then tests each kind of
 * request in a fixed sequence; see the comment ahead of the
 * PMSIGNAL_ADVANCE_STATE_MACHINE check for why that sequence matters.
 *
 * NOTE(review): every condition below calls CheckPostmasterSignal() as its
 * first operand, ahead of the pmState/Shutdown tests.  Presumably the call
 * consumes the pending request, so the operand order should be preserved
 * -- confirm against storage/ipc/pmsignal.c before rearranging.
 */
static void
process_pm_pmsignal(void)
{
	pending_pm_pmsignal = false;

	ereport(DEBUG2,
			(errmsg_internal("postmaster received pmsignal signal")));

	/*
	 * RECOVERY_STARTED and BEGIN_HOT_STANDBY signals are ignored in
	 * unexpected states. If the startup process quickly starts up, completes
	 * recovery, exits, we might process the death of the startup process
	 * first. We don't want to go back to recovery in that case.
	 */
	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) &&
		pmState == PM_STARTUP && Shutdown == NoShutdown)
	{
		/* WAL redo has started. We're out of reinitialization. */
		FatalError = false;
		AbortStartTime = 0;

		/*
		 * Start the archiver if we're responsible for (re-)archiving received
		 * files.
		 */
		Assert(PgArchPID == 0);
		if (XLogArchivingAlways())
			PgArchPID = StartChildProcess(B_ARCHIVER);

		/*
		 * If we aren't planning to enter hot standby mode later, treat
		 * RECOVERY_STARTED as meaning we're out of startup, and report status
		 * accordingly.
		 */
		if (!EnableHotStandby)
		{
			AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STANDBY);
#ifdef USE_SYSTEMD
			sd_notify(0, "READY=1");
#endif
		}

		pmState = PM_RECOVERY;
	}

	if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
		pmState == PM_RECOVERY && Shutdown == NoShutdown)
	{
		ereport(LOG,
				(errmsg("database system is ready to accept read-only connections")));

		/* Report status */
		AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_READY);
#ifdef USE_SYSTEMD
		sd_notify(0, "READY=1");
#endif

		pmState = PM_HOT_STANDBY;
		connsAllowed = true;

		/* Some workers may be scheduled to start now */
		StartWorkerNeeded = true;
	}

	/* Process background worker state changes. */
	if (CheckPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE))
	{
		/* Accept new worker requests only if not stopping. */
		BackgroundWorkerStateChange(pmState < PM_STOP_BACKENDS);
		StartWorkerNeeded = true;
	}

	/*
	 * Tell syslogger to rotate logfile if requested.  A request found by
	 * CheckLogrotateSignal() additionally gets its signal files removed; a
	 * PMSIGNAL_ROTATE_LOGFILE request is only looked at when the first check
	 * found nothing.
	 */
	if (SysLoggerPID != 0)
	{
		if (CheckLogrotateSignal())
		{
			signal_child(SysLoggerPID, SIGUSR1);
			RemoveLogrotateSignalFiles();
		}
		else if (CheckPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE))
		{
			signal_child(SysLoggerPID, SIGUSR1);
		}
	}

	if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) &&
		Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS)
	{
		/*
		 * Start one iteration of the autovacuum daemon, even if autovacuuming
		 * is nominally not enabled.  This is so we can have an active defense
		 * against transaction ID wraparound.  We set a flag for the main loop
		 * to do it rather than trying to do it here --- this is because the
		 * autovac process itself may send the signal, and we want to handle
		 * that by launching another iteration as soon as the current one
		 * completes.
		 */
		start_autovac_launcher = true;
	}

	if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) &&
		Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS)
	{
		/* The autovacuum launcher wants us to start a worker process. */
		StartAutovacuumWorker();
	}

	if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER))
	{
		/* Startup Process wants us to start the walreceiver process. */
		WalReceiverRequested = true;
	}

	/*
	 * Try to advance postmaster's state machine, if a child requests it.
	 *
	 * Be careful about the order of this action relative to this function's
	 * other actions.  Generally, this should be after other actions, in case
	 * they have effects PostmasterStateMachine would need to know about.
	 * However, we should do it before the CheckPromoteSignal step, which
	 * cannot have any (immediate) effect on the state machine, but does
	 * depend on what state we're in now.
	 */
	if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE))
	{
		PostmasterStateMachine();
	}

	/*
	 * The promote request is only looked for while a startup process exists
	 * and pmState is one of PM_STARTUP, PM_RECOVERY or PM_HOT_STANDBY;
	 * CheckPromoteSignal() is the last operand, so it is not called at all
	 * otherwise.
	 */
	if (StartupPID != 0 &&
		(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
		 pmState == PM_HOT_STANDBY) &&
		CheckPromoteSignal())
	{
		/*
		 * Tell startup process to finish recovery.
		 *
		 * Leave the promote signal file in place and let the Startup process
		 * do the unlink.
		 */
		signal_child(StartupPID, SIGUSR2);
	}
}

/*
 * Dummy signal handler
 *
 * We use this for signals that we don't actually use in the postmaster,
 * but we do use in backends.  If we were to SIG_IGN such signals in the
 * postmaster, then a newly started backend might drop a signal that arrives
 * before it's able to reconfigure its signal processing.  (See notes in
 * tcop/postgres.c.)
 *
 * The body is intentionally empty.
 */
static void
dummy_handler(SIGNAL_ARGS)
{
}

/*
 * Count up number of child processes of specified types (dead_end children
 * are always excluded).
*/ static int CountChildren(int target) { dlist_iter iter; int cnt = 0; dlist_foreach(iter, &BackendList) { Backend *bp = dlist_container(Backend, elem, iter.cur); if (bp->dead_end) continue; /* * Since target == BACKEND_TYPE_ALL is the most common case, we test * it first and avoid touching shared memory for every child. */ if (target != BACKEND_TYPE_ALL) { /* * Assign bkend_type for any recently announced WAL Sender * processes. */ if (bp->bkend_type == BACKEND_TYPE_NORMAL && IsPostmasterChildWalSender(bp->child_slot)) bp->bkend_type = BACKEND_TYPE_WALSND; if (!(target & bp->bkend_type)) continue; } cnt++; } return cnt; } /* * StartChildProcess -- start an auxiliary process for the postmaster * * "type" determines what kind of child will be started. All child types * initially go to AuxiliaryProcessMain, which will handle common setup. * * Return value of StartChildProcess is subprocess' PID, or 0 if failed * to start subprocess. */ static pid_t StartChildProcess(BackendType type) { pid_t pid; pid = postmaster_child_launch(type, NULL, 0, NULL); if (pid < 0) { /* in parent, fork failed */ ereport(LOG, (errmsg("could not fork \"%s\" process: %m", PostmasterChildName(type)))); /* * fork failure is fatal during startup, but there's no need to choke * immediately if starting other child types fails. */ if (type == B_STARTUP) ExitPostmaster(1); return 0; } /* * in parent, successful fork */ return pid; } /* * StartAutovacuumWorker * Start an autovac worker process. * * This function is here because it enters the resulting PID into the * postmaster's private backends list. * * NB -- this code very roughly matches BackendStartup. */ static void StartAutovacuumWorker(void) { Backend *bn; /* * If not in condition to run a process, don't try, but handle it like a * fork failure. 
This does not normally happen, since the signal is only * supposed to be sent by autovacuum launcher when it's OK to do it, but * we have to check to avoid race-condition problems during DB state * changes. */ if (canAcceptConnections(BACKEND_TYPE_AUTOVAC) == CAC_OK) { bn = (Backend *) palloc_extended(sizeof(Backend), MCXT_ALLOC_NO_OOM); if (bn) { /* Autovac workers are not dead_end and need a child slot */ bn->dead_end = false; bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); bn->bgworker_notify = false; bn->rw = NULL; bn->pid = StartChildProcess(B_AUTOVAC_WORKER); if (bn->pid > 0) { bn->bkend_type = BACKEND_TYPE_AUTOVAC; dlist_push_head(&BackendList, &bn->elem); /* all OK */ return; } /* * fork failed, fall through to report -- actual error message was * logged by StartChildProcess */ (void) ReleasePostmasterChildSlot(bn->child_slot); pfree(bn); } else ereport(LOG, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } /* * Report the failure to the launcher, if it's running. (If it's not, we * might not even be connected to shared memory, so don't try to call * AutoVacWorkerFailed.) Note that we also need to signal it so that it * responds to the condition, but we don't do that here, instead waiting * for ServerLoop to do it. This way we avoid a ping-pong signaling in * quick succession between the autovac launcher and postmaster in case * things get ugly. 
*/ if (AutoVacPID != 0) { AutoVacWorkerFailed(); avlauncher_needs_signal = true; } } /* * Create the opts file */ static bool CreateOptsFile(int argc, char *argv[], char *fullprogname) { FILE *fp; int i; #define OPTS_FILE "postmaster.opts" if ((fp = fopen(OPTS_FILE, "w")) == NULL) { ereport(LOG, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", OPTS_FILE))); return false; } fprintf(fp, "%s", fullprogname); for (i = 1; i < argc; i++) fprintf(fp, " \"%s\"", argv[i]); fputs("\n", fp); if (fclose(fp)) { ereport(LOG, (errcode_for_file_access(), errmsg("could not write file \"%s\": %m", OPTS_FILE))); return false; } return true; } /* * MaxLivePostmasterChildren * * This reports the number of entries needed in the per-child-process array * (PMChildFlags). It includes regular backends, autovac workers, walsenders * and background workers, but not special children nor dead_end children. * This allows the array to have a fixed maximum size, to wit the same * too-many-children limit enforced by canAcceptConnections(). The exact value * isn't too critical as long as it's more than MaxBackends. */ int MaxLivePostmasterChildren(void) { return 2 * (MaxConnections + autovacuum_max_workers + 1 + max_wal_senders + max_worker_processes); } /* * Start a new bgworker. * Starting time conditions must have been checked already. * * Returns true on success, false on failure. * In either case, update the RegisteredBgWorker's state appropriately. * * This code is heavily based on autovacuum.c, q.v. */ static bool do_start_bgworker(RegisteredBgWorker *rw) { Backend *bn; pid_t worker_pid; Assert(rw->rw_pid == 0); /* * Allocate and assign the Backend element. Note we must do this before * forking, so that we can handle failures (out of memory or child-process * slots) cleanly. * * Treat failure as though the worker had crashed. 
That way, the * postmaster will wait a bit before attempting to start it again; if we * tried again right away, most likely we'd find ourselves hitting the * same resource-exhaustion condition. */ bn = assign_backendlist_entry(); if (bn == NULL) { rw->rw_crashed_at = GetCurrentTimestamp(); return false; } bn->rw = rw; ereport(DEBUG1, (errmsg_internal("starting background worker process \"%s\"", rw->rw_worker.bgw_name))); worker_pid = postmaster_child_launch(B_BG_WORKER, (char *) &rw->rw_worker, sizeof(BackgroundWorker), NULL); if (worker_pid == -1) { /* in postmaster, fork failed ... */ ereport(LOG, (errmsg("could not fork background worker process: %m"))); /* undo what assign_backendlist_entry did */ ReleasePostmasterChildSlot(bn->child_slot); pfree(bn); /* mark entry as crashed, so we'll try again later */ rw->rw_crashed_at = GetCurrentTimestamp(); return false; } /* in postmaster, fork successful ... */ rw->rw_pid = worker_pid; bn->pid = rw->rw_pid; ReportBackgroundWorkerPID(rw); /* add new worker to lists of backends */ dlist_push_head(&BackendList, &bn->elem); return true; } /* * Does the current postmaster state require starting a worker with the * specified start_time? */ static bool bgworker_should_start_now(BgWorkerStartTime start_time) { switch (pmState) { case PM_NO_CHILDREN: case PM_WAIT_DEAD_END: case PM_SHUTDOWN_2: case PM_SHUTDOWN: case PM_WAIT_BACKENDS: case PM_STOP_BACKENDS: break; case PM_RUN: if (start_time == BgWorkerStart_RecoveryFinished) return true; /* fall through */ case PM_HOT_STANDBY: if (start_time == BgWorkerStart_ConsistentState) return true; /* fall through */ case PM_RECOVERY: case PM_STARTUP: case PM_INIT: if (start_time == BgWorkerStart_PostmasterStart) return true; /* fall through */ } return false; } /* * Allocate the Backend struct for a connected background worker, but don't * add it to the list of backends just yet. * * On failure, return NULL. 
*/ static Backend * assign_backendlist_entry(void) { Backend *bn; /* * Check that database state allows another connection. Currently the * only possible failure is CAC_TOOMANY, so we just log an error message * based on that rather than checking the error code precisely. */ if (canAcceptConnections(BACKEND_TYPE_BGWORKER) != CAC_OK) { ereport(LOG, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), errmsg("no slot available for new background worker process"))); return NULL; } bn = palloc_extended(sizeof(Backend), MCXT_ALLOC_NO_OOM); if (bn == NULL) { ereport(LOG, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); return NULL; } bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); bn->bkend_type = BACKEND_TYPE_BGWORKER; bn->dead_end = false; bn->bgworker_notify = false; return bn; } /* * If the time is right, start background worker(s). * * As a side effect, the bgworker control variables are set or reset * depending on whether more workers may need to be started. * * We limit the number of workers started per call, to avoid consuming the * postmaster's attention for too long when many such requests are pending. * As long as StartWorkerNeeded is true, ServerLoop will not block and will * call this function again after dealing with any other issues. */ static void maybe_start_bgworkers(void) { #define MAX_BGWORKERS_TO_LAUNCH 100 int num_launched = 0; TimestampTz now = 0; dlist_mutable_iter iter; /* * During crash recovery, we have no need to be called until the state * transition out of recovery. 
*/ if (FatalError) { StartWorkerNeeded = false; HaveCrashedWorker = false; return; } /* Don't need to be called again unless we find a reason for it below */ StartWorkerNeeded = false; HaveCrashedWorker = false; dlist_foreach_modify(iter, &BackgroundWorkerList) { RegisteredBgWorker *rw; rw = dlist_container(RegisteredBgWorker, rw_lnode, iter.cur); /* ignore if already running */ if (rw->rw_pid != 0) continue; /* if marked for death, clean up and remove from list */ if (rw->rw_terminate) { ForgetBackgroundWorker(rw); continue; } /* * If this worker has crashed previously, maybe it needs to be * restarted (unless on registration it specified it doesn't want to * be restarted at all). Check how long ago did a crash last happen. * If the last crash is too recent, don't start it right away; let it * be restarted once enough time has passed. */ if (rw->rw_crashed_at != 0) { if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) { int notify_pid; notify_pid = rw->rw_worker.bgw_notify_pid; ForgetBackgroundWorker(rw); /* Report worker is gone now. */ if (notify_pid != 0) kill(notify_pid, SIGUSR1); continue; } /* read system time only when needed */ if (now == 0) now = GetCurrentTimestamp(); if (!TimestampDifferenceExceeds(rw->rw_crashed_at, now, rw->rw_worker.bgw_restart_time * 1000)) { /* Set flag to remember that we have workers to start later */ HaveCrashedWorker = true; continue; } } if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) { /* reset crash time before trying to start worker */ rw->rw_crashed_at = 0; /* * Try to start the worker. * * On failure, give up processing workers for now, but set * StartWorkerNeeded so we'll come back here on the next iteration * of ServerLoop to try again. (We don't want to wait, because * there might be additional ready-to-run workers.) We could set * HaveCrashedWorker as well, since this worker is now marked * crashed, but there's no need because the next run of this * function will do that. 
*/ if (!do_start_bgworker(rw)) { StartWorkerNeeded = true; return; } /* * If we've launched as many workers as allowed, quit, but have * ServerLoop call us again to look for additional ready-to-run * workers. There might not be any, but we'll find out the next * time we run. */ if (++num_launched >= MAX_BGWORKERS_TO_LAUNCH) { StartWorkerNeeded = true; return; } } } } /* * When a backend asks to be notified about worker state changes, we * set a flag in its backend entry. The background worker machinery needs * to know when such backends exit. */ bool PostmasterMarkPIDForWorkerNotify(int pid) { dlist_iter iter; Backend *bp; dlist_foreach(iter, &BackendList) { bp = dlist_container(Backend, elem, iter.cur); if (bp->pid == pid) { bp->bgworker_notify = true; return true; } } return false; } #ifdef WIN32 /* * Subset implementation of waitpid() for Windows. We assume pid is -1 * (that is, check all child processes) and options is WNOHANG (don't wait). */ static pid_t waitpid(pid_t pid, int *exitstatus, int options) { win32_deadchild_waitinfo *childinfo; DWORD exitcode; DWORD dwd; ULONG_PTR key; OVERLAPPED *ovl; /* Try to consume one win32_deadchild_waitinfo from the queue. */ if (!GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0)) { errno = EAGAIN; return -1; } childinfo = (win32_deadchild_waitinfo *) key; pid = childinfo->procId; /* * Remove handle from wait - required even though it's set to wait only * once */ UnregisterWaitEx(childinfo->waitHandle, NULL); if (!GetExitCodeProcess(childinfo->procHandle, &exitcode)) { /* * Should never happen. Inform user and set a fixed exitcode. */ write_stderr("could not read exit code for process\n"); exitcode = 255; } *exitstatus = exitcode; /* * Close the process handle. Only after this point can the PID can be * recycled by the kernel. */ CloseHandle(childinfo->procHandle); /* * Free struct that was allocated before the call to * RegisterWaitForSingleObject() */ pfree(childinfo); return pid; } /* * Note! 
Code below executes on a thread pool! All operations must * be thread safe! Note that elog() and friends must *not* be used. */ static void WINAPI pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) { /* Should never happen, since we use INFINITE as timeout value. */ if (TimerOrWaitFired) return; /* * Post the win32_deadchild_waitinfo object for waitpid() to deal with. If * that fails, we leak the object, but we also leak a whole process and * get into an unrecoverable state, so there's not much point in worrying * about that. We'd like to panic, but we can't use that infrastructure * from this thread. */ if (!PostQueuedCompletionStatus(win32ChildQueue, 0, (ULONG_PTR) lpParameter, NULL)) write_stderr("could not post child completion status\n"); /* Queue SIGCHLD signal. */ pg_queue_signal(SIGCHLD); } /* * Queue a waiter to signal when this child dies. The wait will be handled * automatically by an operating system thread pool. The memory and the * process handle will be freed by a later call to waitpid(). */ void pgwin32_register_deadchild_callback(HANDLE procHandle, DWORD procId) { win32_deadchild_waitinfo *childinfo; childinfo = palloc(sizeof(win32_deadchild_waitinfo)); childinfo->procHandle = procHandle; childinfo->procId = procId; if (!RegisterWaitForSingleObject(&childinfo->waitHandle, procHandle, pgwin32_deadchild_callback, childinfo, INFINITE, WT_EXECUTEONLYONCE | WT_EXECUTEINWAITTHREAD)) ereport(FATAL, (errmsg_internal("could not register process for wait: error code %lu", GetLastError()))); } #endif /* WIN32 */ /* * Initialize one and only handle for monitoring postmaster death. * * Called once in the postmaster, so that child processes can subsequently * monitor if their parent is dead. */ static void InitPostmasterDeathWatchHandle(void) { #ifndef WIN32 /* * Create a pipe. Postmaster holds the write end of the pipe open * (POSTMASTER_FD_OWN), and children hold the read end. 
Children can pass * the read file descriptor to select() to wake up in case postmaster * dies, or check for postmaster death with a (read() == 0). Children must * close the write end as soon as possible after forking, because EOF * won't be signaled in the read end until all processes have closed the * write fd. That is taken care of in ClosePostmasterPorts(). */ Assert(MyProcPid == PostmasterPid); if (pipe(postmaster_alive_fds) < 0) ereport(FATAL, (errcode_for_file_access(), errmsg_internal("could not create pipe to monitor postmaster death: %m"))); /* Notify fd.c that we've eaten two FDs for the pipe. */ ReserveExternalFD(); ReserveExternalFD(); /* * Set O_NONBLOCK to allow testing for the fd's presence with a read() * call. */ if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) ereport(FATAL, (errcode_for_socket_access(), errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); #else /* * On Windows, we use a process handle for the same purpose. */ if (DuplicateHandle(GetCurrentProcess(), GetCurrentProcess(), GetCurrentProcess(), &PostmasterHandle, 0, TRUE, DUPLICATE_SAME_ACCESS) == 0) ereport(FATAL, (errmsg_internal("could not duplicate postmaster handle: error code %lu", GetLastError()))); #endif /* WIN32 */ }