aboutsummaryrefslogtreecommitdiff
path: root/src/include/storage/aio_internal.h
blob: 2d37a243abe525866d6ccfd74a1062be32d422e8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
/*-------------------------------------------------------------------------
 *
 * aio_internal.h
 *    AIO related declarations that should only be used by the AIO subsystem
 *    internally.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/aio_internal.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef AIO_INTERNAL_H
#define AIO_INTERNAL_H


#include "lib/ilist.h"
#include "port/pg_iovec.h"
#include "storage/aio.h"
#include "storage/condition_variable.h"


/*
 * The maximum number of IOs that can be batch submitted at once.
 *
 * This is the size of PgAioBackend->staged_ios and the upper bound for the
 * num_staged_ios argument passed to IoMethodOps->submit().
 */
#define PGAIO_SUBMIT_BATCH_SIZE 32



/*
 * Lifecycle states of an AIO handle. Apart from the exceptions called out
 * below, a handle advances through the states strictly in the order listed.
 *
 * Every state transition is expected to be made via pgaio_io_update_state().
 *
 * The externally visible functions that start IO (e.g. FileStartReadV(),
 * which goes through pgaio_io_start_readv()) take an IO from
 * PGAIO_HS_HANDED_OUT to PGAIO_HS_STAGED at the least, and to
 * PGAIO_HS_COMPLETED_LOCAL at the most (at which point the handle will be
 * reused).
 *
 * The numeric values are spelled out only to make the ordering explicit; they
 * are the same values the compiler would assign implicitly.
 */
typedef enum PgAioHandleState
{
	/* handle is unused */
	PGAIO_HS_IDLE = 0,

	/*
	 * Handle was returned by pgaio_io_acquire(). From here it either becomes
	 * DEFINED (when pgaio_io_start_*() is called) or goes back to IDLE (when
	 * pgaio_io_release() is called).
	 */
	PGAIO_HS_HANDED_OUT = 1,

	/*
	 * pgaio_io_start_*() was called, but the IO has not been staged yet. The
	 * handle now carries all the information needed to execute the IO.
	 */
	PGAIO_HS_DEFINED = 2,

	/*
	 * The stage() callbacks have run and the handle is ready to be submitted
	 * for execution. Outside of batchmode (see pgaio_enter_batchmode()) the
	 * IO is submitted immediately afterwards.
	 */
	PGAIO_HS_STAGED = 3,

	/* the IO was passed to the IO method for execution */
	PGAIO_HS_SUBMITTED = 4,

	/* the IO has finished, but its result has not been processed yet */
	PGAIO_HS_COMPLETED_IO = 5,

	/*
	 * The IO has completed and the shared completion has been called.
	 *
	 * When completion happens in the issuing backend, the local callbacks
	 * are called right away. Otherwise the handle remains in
	 * COMPLETED_SHARED until the issuing backend waits for the IO to
	 * complete.
	 */
	PGAIO_HS_COMPLETED_SHARED = 6,

	/*
	 * The IO has completed and the local completion has been called.
	 *
	 * Afterwards the handle is made reusable and returns to the IDLE state.
	 */
	PGAIO_HS_COMPLETED_LOCAL = 7,
} PgAioHandleState;


struct ResourceOwnerData;

/*
 * Bookkeeping for a single IO: its state, what it operates on, which
 * operation it performs, its callbacks and its result.
 *
 * typedef is in aio_types.h
 */
struct PgAioHandle
{
	/*
	 * Current PgAioHandleState, stored in 8 bits.
	 *
	 * all state updates should go through pgaio_io_update_state()
	 */
	PgAioHandleState state:8;

	/* what are we operating on (stored in 8 bits) */
	PgAioTargetID target:8;

	/* which IO operation (stored in 8 bits) */
	PgAioOp		op:8;

	/* bitfield of PgAioHandleFlags */
	uint8		flags;

	/*
	 * NOTE(review): presumably the number of entries in use in callbacks[]
	 * and callbacks_data[] - confirm.
	 */
	uint8		num_callbacks;

	/* using the proper type here would use more space */
	uint8		callbacks[PGAIO_HANDLE_MAX_CALLBACKS];

	/* data forwarded to each callback */
	uint8		callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS];

	/*
	 * Length of data associated with handle using
	 * pgaio_io_set_handle_data_*().
	 */
	uint8		handle_data_len;

	/* XXX: could be optimized out with some pointer math */
	int32		owner_procno;

	/* raw result of the IO operation */
	int32		result;

	/*
	 * In which list the handle is registered, depends on the state:
	 * - IDLE, in per-backend list
	 * - HANDED_OUT - not in a list
	 * - DEFINED - not in a list
	 * - STAGED - in per-backend staged array
	 * - SUBMITTED - in issuer's in_flight list
	 * - COMPLETED_IO - in issuer's in_flight list
	 * - COMPLETED_SHARED - in issuer's in_flight list
	 *
	 * NOTE(review): COMPLETED_LOCAL is not covered by this overview -
	 * confirm which list, if any, the handle is on in that state.
	 */
	dlist_node	node;

	/*
	 * NOTE(review): presumably the resource owner the handle is registered
	 * with, and the handle's link in that owner's list - confirm.
	 */
	struct ResourceOwnerData *resowner;
	dlist_node	resowner_node;

	/* incremented every time the IO handle is reused */
	uint64		generation;

	/*
	 * To wait for the IO to complete other backends can wait on this CV. Note
	 * that, if in SUBMITTED state, a waiter first needs to check if it needs
	 * to do work via IoMethodOps->wait_one().
	 */
	ConditionVariable cv;

	/* result of shared callback, passed to issuer callback */
	PgAioResult distilled_result;

	/*
	 * Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
	 *
	 * At the moment there's no need to differentiate between the two, but
	 * that won't necessarily stay that way.
	 */
	uint32		iovec_off;

	/*
	 * If not NULL, this memory location will be updated with information
	 * about the IO's completion iff the issuing backend learns about the
	 * IO's completion.
	 */
	PgAioReturn *report_return;

	/* Data necessary for the IO to be performed */
	PgAioOpData op_data;

	/*
	 * Data necessary to identify the object undergoing IO to higher-level
	 * code. Needs to be sufficient to allow another backend to reopen the
	 * file.
	 */
	PgAioTargetData target_data;
};


/*
 * AIO state kept for one backend: its idle handles, the handle currently
 * handed out, the IOs staged for submission and the IOs it has in flight.
 * An array of these is reachable via PgAioCtl->backend_state.
 */
typedef struct PgAioBackend
{
	/* index into PgAioCtl->io_handles */
	uint32		io_handle_off;

	/* IO Handles that currently are not used */
	dclist_head idle_ios;

	/*
	 * Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
	 * without having been either defined (by actually associating it with IO)
	 * or released (with pgaio_io_release()). This restriction is necessary to
	 * guarantee that we always can acquire an IO. ->handed_out_io is used to
	 * enforce that rule.
	 */
	PgAioHandle *handed_out_io;

	/* Are we currently in batchmode? See pgaio_enter_batchmode(). */
	bool		in_batchmode;

	/*
	 * IOs that are staged (PGAIO_HS_STAGED), but not yet submitted. The
	 * array holds at most PGAIO_SUBMIT_BATCH_SIZE handles.
	 *
	 * NOTE(review): num_staged_ios is presumably the number of entries of
	 * staged_ios currently in use - confirm.
	 */
	uint16		num_staged_ios;
	PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE];

	/*
	 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
	 * in-flight anymore, but have been waited-for and completed by another
	 * backend. Once this backend sees such an IO it'll be reclaimed.
	 *
	 * The list is ordered by submission time, with more recently submitted
	 * IOs being appended at the end.
	 */
	dclist_head in_flight_ios;
} PgAioBackend;


/*
 * Top-level AIO bookkeeping, reachable through the pgaio_ctl pointer. It
 * holds the per-backend state array plus the arrays of iovecs, handle data
 * and IO handles that the per-backend / per-handle offsets index into.
 */
typedef struct PgAioCtl
{
	/*
	 * Per-backend AIO state.
	 *
	 * NOTE(review): backend_state_count is presumably the number of entries
	 * in backend_state - confirm.
	 */
	int			backend_state_count;
	PgAioBackend *backend_state;

	/*
	 * Array of iovec structs. Each iovec is owned by a specific backend. The
	 * allocation is in PgAioCtl to allow the maximum number of iovecs for
	 * individual IOs to be configurable with PGC_POSTMASTER GUC.
	 *
	 * A handle's entries are found via PgAioHandle->iovec_off.
	 */
	uint32		iovec_count;
	struct iovec *iovecs;

	/*
	 * For, e.g., an IO covering multiple buffers in shared / temp buffers, we
	 * need to get Buffer IDs during completion to be able to change the
	 * BufferDesc state accordingly. This space can be used to store e.g.
	 * Buffer IDs.  Note that the actual iovec might be shorter than this,
	 * because we combine neighboring pages into one larger iovec entry.
	 *
	 * Indexed with PgAioHandle->iovec_off, just like iovecs.
	 */
	uint64	   *handle_data;

	/*
	 * Array of all IO handles; PgAioBackend->io_handle_off is an index into
	 * it.
	 *
	 * NOTE(review): io_handle_count is presumably the number of entries in
	 * io_handles - confirm.
	 */
	uint32		io_handle_count;
	PgAioHandle *io_handles;
} PgAioCtl;



/*
 * Callbacks used to implement an IO method.
 *
 * Each IO method exposes one constant instance of this struct (see the
 * pgaio_*_ops declarations in this header). Members marked "Optional" may be
 * left unset.
 */
typedef struct IoMethodOps
{
	/* properties */

	/*
	 * If an FD is about to be closed, do we need to wait for all in-flight
	 * IOs referencing that FD?
	 */
	bool		wait_on_fd_before_close;


	/* global initialization */

	/*
	 * Amount of additional shared memory to reserve for the io_method. Called
	 * just like a normal ipci.c style *Size() function. Optional.
	 */
	size_t		(*shmem_size) (void);

	/*
	 * Initialize shared memory. first_time is true if AIO's shared memory was
	 * just initialized, false otherwise. Optional.
	 */
	void		(*shmem_init) (bool first_time);

	/*
	 * Per-backend initialization. Optional.
	 */
	void		(*init_backend) (void);


	/* handling of IOs */

	/*
	 * Optional. If this returns true for an IO, ->submit() will not be called
	 * for it.
	 */
	bool		(*needs_synchronous_execution) (PgAioHandle *ioh);

	/*
	 * Start executing passed in IOs.
	 *
	 * Shall advance state to at least PGAIO_HS_SUBMITTED.  (By the time this
	 * returns, other backends might have advanced the state further.)
	 *
	 * Will not be called if ->needs_synchronous_execution() returned true.
	 *
	 * num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
	 *
	 * Always called in a critical section.
	 *
	 * NOTE(review): the meaning of the return value is not documented here -
	 * presumably the number of IOs submitted; confirm against the callers.
	 */
	int			(*submit) (uint16 num_staged_ios, PgAioHandle **staged_ios);

	/* ---
	 * Wait for the IO to complete. Optional.
	 *
	 * On return, state shall be one of
	 * - PGAIO_HS_COMPLETED_IO
	 * - PGAIO_HS_COMPLETED_SHARED
	 * - PGAIO_HS_COMPLETED_LOCAL
	 *
	 * The callback must not block if the handle is already in one of those
	 * states, or has been reused (see pgaio_io_was_recycled()).  If, on
	 * return, the state is PGAIO_HS_COMPLETED_IO, state will reach
	 * PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
	 * method.
	 *
	 * If not provided, it needs to be guaranteed that the IO method calls
	 * pgaio_io_process_completion() without further interaction by the
	 * issuing backend.
	 *
	 * NOTE(review): ref_generation is presumably the handle generation the
	 * caller's reference was taken at, used to detect reuse of the handle -
	 * confirm.
	 * ---
	 */
	void		(*wait_one) (PgAioHandle *ioh,
							 uint64 ref_generation);
} IoMethodOps;


/*
 * Functions shared between the files of the AIO subsystem, grouped by the
 * file named in each section comment.
 */

/* aio.c */

/* referenced by IoMethodOps->wait_one() implementations to detect handle reuse */
extern bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state);
extern void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op);
/* to be called by the IO method once an IO has finished, see IoMethodOps */
extern void pgaio_io_process_completion(PgAioHandle *ioh, int result);
extern void pgaio_io_prepare_submit(PgAioHandle *ioh);
extern bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh);
/* used by pgaio_debug_io() for the "state" column of its message prefix */
extern const char *pgaio_io_get_state_name(PgAioHandle *ioh);
extern const char *pgaio_result_status_string(PgAioResultStatus rs);
extern void pgaio_shutdown(int code, Datum arg);

/* aio_callback.c */
extern void pgaio_io_call_stage(PgAioHandle *ioh);
extern void pgaio_io_call_complete_shared(PgAioHandle *ioh);
extern PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh);

/* aio_io.c */
extern void pgaio_io_perform_synchronously(PgAioHandle *ioh);
/* used by pgaio_debug_io() for the "op" column of its message prefix */
extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
extern int	pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov);

/* aio_target.c */
extern bool pgaio_io_can_reopen(PgAioHandle *ioh);
extern void pgaio_io_reopen(PgAioHandle *ioh);
/* used by pgaio_debug_io() for the "target" column of its message prefix */
extern const char *pgaio_io_get_target_name(PgAioHandle *ioh);


/*
 * The AIO subsystem has fairly verbose debug logging support. This can be
 * enabled/disabled at build time. The reason for this is that
 * a) the verbosity can make debugging things on higher levels hard
 * b) even if logging can be skipped due to elevel checks, it still causes a
 *    measurable slowdown
 *
 * The macro is tested by value in pgaio_debug(), so logging is disabled by
 * defining it as 0, not by leaving it undefined.
 *
 * XXX: This likely should eventually be disabled by default, at least in
 * non-assert builds.
 */
#define PGAIO_VERBOSE		1

/*
 * Simple ereport() wrapper that only logs if PGAIO_VERBOSE is nonzero.
 *
 * This intentionally still compiles the code, guarded by a constant if (0),
 * if verbose logging is disabled, to make it less likely that debug logging
 * is silently broken.
 *
 * The message is emitted via errmsg_internal(), with the statement and the
 * context hidden (errhidestmt(true), errhidecontext(true)).
 *
 * The current definition requires passing at least one argument after msg,
 * because __VA_ARGS__ is unconditionally preceded by a comma.
 */
#define pgaio_debug(elevel, msg, ...)  \
	do { \
		if (PGAIO_VERBOSE) \
			ereport(elevel, \
					errhidestmt(true), errhidecontext(true), \
					errmsg_internal(msg, \
									__VA_ARGS__)); \
	} while(0)

/*
 * pgaio_debug() wrapper for messages about a specific IO. The message is
 * prefixed with the IO's id, operation name, target name and state name.
 *
 * msg has to be a string literal, as it is concatenated with the prefix at
 * compile time. ioh is evaluated several times, so it should not have side
 * effects.
 *
 * Note that the definition requires passing at least one argument after msg.
 */
#define pgaio_debug_io(elevel, ioh, msg, ...)  \
	pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
				pgaio_io_get_id(ioh), \
				pgaio_io_get_op_name(ioh), \
				pgaio_io_get_target_name(ioh), \
				pgaio_io_get_state_name(ioh), \
				__VA_ARGS__)

/*
 * Declarations for the tables of function pointers exposed by each IO method.
 * The io_uring table is only declared if IOMETHOD_IO_URING_ENABLED is
 * defined.
 */
extern PGDLLIMPORT const IoMethodOps pgaio_sync_ops;
extern PGDLLIMPORT const IoMethodOps pgaio_worker_ops;
#ifdef IOMETHOD_IO_URING_ENABLED
extern PGDLLIMPORT const IoMethodOps pgaio_uring_ops;
#endif

/*
 * Global AIO state.
 *
 * NOTE(review): pgaio_method_ops presumably points at the ops table of the
 * IO method in use, and pgaio_my_backend at the current backend's
 * PgAioBackend - confirm against the definitions.
 */
extern PGDLLIMPORT const IoMethodOps *pgaio_method_ops;
extern PGDLLIMPORT PgAioCtl *pgaio_ctl;
extern PGDLLIMPORT PgAioBackend *pgaio_my_backend;



#endif							/* AIO_INTERNAL_H */