diff options
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 8 | ||||
-rw-r--r-- | src/backend/storage/buffer/localbuf.c | 3 | ||||
-rw-r--r-- | src/include/storage/buf_internals.h | 7 | ||||
-rw-r--r-- | src/test/modules/Makefile | 1 | ||||
-rw-r--r-- | src/test/modules/meson.build | 1 | ||||
-rw-r--r-- | src/test/modules/test_aio/.gitignore | 2 | ||||
-rw-r--r-- | src/test/modules/test_aio/Makefile | 26 | ||||
-rw-r--r-- | src/test/modules/test_aio/meson.build | 37 | ||||
-rw-r--r-- | src/test/modules/test_aio/t/001_aio.pl | 1503 | ||||
-rw-r--r-- | src/test/modules/test_aio/t/002_io_workers.pl | 125 | ||||
-rw-r--r-- | src/test/modules/test_aio/test_aio--1.0.sql | 108 | ||||
-rw-r--r-- | src/test/modules/test_aio/test_aio.c | 806 | ||||
-rw-r--r-- | src/test/modules/test_aio/test_aio.control | 3 |
13 files changed, 2622 insertions, 8 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f9681d09e1e..1c37d7dfe2f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -518,10 +518,6 @@ static uint32 WaitBufHdrUnlocked(BufferDesc *buf); static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context); static void WaitIO(BufferDesc *buf); -static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); -static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, - uint32 set_flag_bits, bool forget_owner, - bool release_aio); static void AbortBufferIO(Buffer buffer); static void shared_buffer_write_error_callback(void *arg); static void local_buffer_write_error_callback(void *arg); @@ -5962,7 +5958,7 @@ WaitIO(BufferDesc *buf) * find out if they can perform the I/O as part of a larger operation, without * waiting for the answer or distinguishing the reasons why not. */ -static bool +bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) { uint32 buf_state; @@ -6019,7 +6015,7 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) * resource owner. (forget_owner=false is used when the resource owner itself * is being released) */ -static void +void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio) { diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index bf89076bb10..ed56202af14 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -57,7 +57,6 @@ static int NLocalPinnedBuffers = 0; static void InitLocalBuffers(void); static Block GetLocalBufferStorage(void); static Buffer GetLocalVictimBuffer(void); -static void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); /* @@ -597,7 +596,7 @@ TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bit * * See also InvalidateBuffer(). */ -static void +void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) { Buffer buffer = BufferDescriptorGetBuffer(bufHdr); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 72b36a4af26..0dec7d93b3b 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -434,6 +434,12 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag); +/* solely to make it easier to write tests */ +extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); +extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, + bool forget_owner, bool release_aio); + + /* freelist.c */ extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, @@ -478,6 +484,7 @@ extern void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio); extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait); extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln); +extern void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); extern void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock); diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 4e4be3fa511..aa1d27bbed3 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -14,6 +14,7 @@ SUBDIRS = \ oauth_validator \ plsample \ spgist_name_ops \ + test_aio \ test_bloomfilter \ test_copy_callbacks \ test_custom_rmgrs \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 2b057451473..9de0057bd1d 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -13,6 +13,7 @@ subdir('oauth_validator') subdir('plsample') subdir('spgist_name_ops') subdir('ssl_passphrase_callback') +subdir('test_aio') subdir('test_bloomfilter') subdir('test_copy_callbacks') subdir('test_custom_rmgrs') diff --git a/src/test/modules/test_aio/.gitignore b/src/test/modules/test_aio/.gitignore new file mode 100644 index 00000000000..716e17f5a2a --- /dev/null +++ b/src/test/modules/test_aio/.gitignore @@ -0,0 +1,2 @@ +# Generated subdirectories +/tmp_check/ diff --git a/src/test/modules/test_aio/Makefile b/src/test/modules/test_aio/Makefile new file mode 100644 index 00000000000..f53cc64671a --- /dev/null +++ b/src/test/modules/test_aio/Makefile @@ -0,0 +1,26 @@ +# src/test/modules/test_aio/Makefile + +PGFILEDESC = "test_aio - test code for AIO" + +MODULE_big = test_aio +OBJS = \ + $(WIN32RES) \ + test_aio.o + +EXTENSION = test_aio +DATA = test_aio--1.0.sql + +TAP_TESTS = 1 + +export enable_injection_points + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_aio +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_aio/meson.build b/src/test/modules/test_aio/meson.build new file mode 100644 index 00000000000..73d2fd68eaa --- /dev/null +++ b/src/test/modules/test_aio/meson.build @@ -0,0 +1,37 @@ +# Copyright (c) 2024-2025, PostgreSQL Global Development Group + +test_aio_sources = files( + 'test_aio.c', +) + +if host_system == 'windows' + test_aio_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_aio', + '--FILEDESC', 'test_aio - test code for AIO',]) +endif + +test_aio = shared_module('test_aio', + test_aio_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_aio + +test_install_data += files( + 'test_aio.control', + 'test_aio--1.0.sql', +) + +tests += { + 'name': 'test_aio', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_aio.pl', + 't/002_io_workers.pl', + ], + }, +} diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl new file mode 100644 index 00000000000..93fe5b116df --- /dev/null +++ b/src/test/modules/test_aio/t/001_aio.pl @@ -0,0 +1,1503 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + + +### +# Test io_method=worker +### +my $node_worker = create_node('worker'); +$node_worker->start(); + +test_generic('worker', $node_worker); +SKIP: +{ + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject_worker('worker', $node_worker); +} + +$node_worker->stop(); + + +### +# Test io_method=io_uring +### + +if (have_io_uring()) +{ + my $node_uring = create_node('io_uring'); + $node_uring->start(); + test_generic('io_uring', $node_uring); + $node_uring->stop(); +} + + +### +# Test io_method=sync +### + +my $node_sync = create_node('sync'); + +# just to have one test not use the default auto-tuning + +$node_sync->append_conf( + 'postgresql.conf', qq( +io_max_concurrency=4 +)); + +$node_sync->start(); +test_generic('sync', $node_sync); +$node_sync->stop(); + +done_testing(); + + +### +# Test Helpers +### + +sub create_node +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my $io_method = shift; + + my $node = PostgreSQL::Test::Cluster->new($io_method); + + # Want to test initdb for each IO method, otherwise we could just reuse + # the cluster. + # + # Unfortunately Cluster::init() puts PG_TEST_INITDB_EXTRA_OPTS after the + # options specified by ->extra, if somebody puts -c io_method=xyz in + # PG_TEST_INITDB_EXTRA_OPTS it would break this test. Fix that up if we + # detect it. + local $ENV{PG_TEST_INITDB_EXTRA_OPTS} = $ENV{PG_TEST_INITDB_EXTRA_OPTS}; + if (defined $ENV{PG_TEST_INITDB_EXTRA_OPTS} + && $ENV{PG_TEST_INITDB_EXTRA_OPTS} =~ m/io_method=/) + { + $ENV{PG_TEST_INITDB_EXTRA_OPTS} .= " -c io_method=$io_method"; + } + + $node->init(extra => [ '-c', "io_method=$io_method" ]); + + $node->append_conf( + 'postgresql.conf', qq( +shared_preload_libraries=test_aio +log_min_messages = 'DEBUG3' +log_statement=all +log_error_verbosity=default +restart_after_crash=false +temp_buffers=100 +)); + + ok(1, "$io_method: initdb"); + + return $node; +} + +sub have_io_uring +{ + # To detect if io_uring is supported, we look at the error message for + # assigning an invalid value to an enum GUC, which lists all the valid + # options. We need to use -C to deal with running as administrator on + # windows, the superuser check is omitted if -C is used. + my ($stdout, $stderr) = + run_command [qw(postgres -C invalid -c io_method=invalid)]; + die "can't determine supported io_method values" + unless $stderr =~ m/Available values: ([^\.]+)\./; + my $methods = $1; + note "supported io_method values are: $methods"; + + return ($methods =~ m/io_uring/) ? 1 : 0; +} + +sub psql_like +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $expected_stdout = shift; + my $expected_stderr = shift; + my ($cmdret, $output); + + ($output, $cmdret) = $psql->query($sql); + + like($output, $expected_stdout, "$io_method: $name: expected stdout"); + like($psql->{stderr}, $expected_stderr, + "$io_method: $name: expected stderr"); + $psql->{stderr} = ''; + + return $output; +} + +sub query_wait_block +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $node = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $waitfor = shift; + + my $pid = $psql->query_safe('SELECT pg_backend_pid()'); + + $psql->{stdin} .= qq($sql;\n); + $psql->{run}->pump_nb(); + ok(1, "$io_method: $name: issued sql"); + + $node->poll_query_until('postgres', + qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid), + $waitfor); + ok(1, "$io_method: $name: observed $waitfor wait event"); +} + +# Returns count of checksum failures for the specified database or for shared +# relations, if $datname is undefined. +sub checksum_failures +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $psql = shift; + my $datname = shift; + my $checksum_count; + my $checksum_last_failure; + + if (defined $datname) + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname = '$datname'; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname = '$datname'; +)); + } + else + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname IS NULL; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname IS NULL; +)); + } + + return $checksum_count, $checksum_last_failure; +} + +### +# Sub-tests +### + +# Sanity checks for the IO handle API +sub test_handle +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning: implicit xact + psql_like( + $io_method, + $psql, + "handle_get() leak in implicit xact", + qq(SELECT handle_get()), + qr/^$/, + qr/leaked AIO handle/, + "$io_method: leaky handle_get() warns"); + + # leak warning: explicit xact + psql_like( + $io_method, $psql, + "handle_get() leak in explicit xact", + qq(BEGIN; SELECT handle_get(); COMMIT), + qr/^$/, qr/leaked AIO handle/); + + + # leak warning: explicit xact, rollback + psql_like( + $io_method, + $psql, + "handle_get() leak in explicit xact, rollback", + qq(BEGIN; SELECT handle_get(); ROLLBACK;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning: subtrans + psql_like( + $io_method, + $psql, + "handle_get() leak in subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get(); COMMIT;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning + error: released in different command (thus resowner) + psql_like( + $io_method, + $psql, + "handle_release() in different command", + qq(BEGIN; SELECT handle_get(); SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/leaked AIO handle.*release in unexpected state/ms); + + # no leak, release in same command + psql_like( + $io_method, + $psql, + "handle_release() in same command", + qq(BEGIN; SELECT handle_get() UNION ALL SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/^$/); + + # normal handle use + psql_like($io_method, $psql, "handle_get_release()", + qq(SELECT handle_get_release()), + qr/^$/, qr/^$/); + + # should error out, API violation + psql_like( + $io_method, + $psql, + "handle_get_twice()", + qq(SELECT handle_get_twice()), + qr/^$/, + qr/ERROR: API violation: Only one IO can be handed out$/); + + # recover after error in implicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in implicit xact", + qq(SELECT handle_get_and_error(); SELECT 'ok', handle_get_release()), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in implicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in explicit xact", + qq(BEGIN; SELECT handle_get_and_error(); SELECT handle_get_release(), 'ok'; COMMIT;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in subtrans + psql_like( + $io_method, + $psql, + "handle error recovery in explicit subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get_and_error(); ROLLBACK TO SAVEPOINT foo; SELECT handle_get_release(); ROLLBACK;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + $psql->quit(); +} + +# Sanity checks for the batchmode API +sub test_batchmode +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning & recovery: implicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in implicit xact", + qq(SELECT batch_start()), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + # leak warning & recovery: explicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in explicit xact", + qq(BEGIN; SELECT batch_start(); COMMIT;), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + + # leak warning & recovery: explicit xact, rollback + # + # XXX: This doesn't fail right now, due to not getting a chance to do + # something at transaction command commit. That's not a correctness issue, + # it just means it's a bit harder to find buggy code. + #psql_like($io_method, $psql, + # "batch_start() leak & cleanup after abort", + # qq(BEGIN; SELECT batch_start(); ROLLBACK;), + # qr/^$/, + # qr/open AIO batch at end/, "$io_method: leaky batch_start() warns"); + + # no warning, batch closed in same command + psql_like( + $io_method, + $psql, + "batch_start(), batch_end() works", + qq(SELECT batch_start() UNION ALL SELECT batch_end()), + qr/^$/, + qr/^$/, + "$io_method: batch_start(), batch_end()"); + + $psql->quit(); +} + +# Test that simple cases of invalid pages are reported +sub test_io_error +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $psql->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_corr(data int not null); +INSERT INTO tmp_corr SELECT generate_series(1, 10000); +SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true); +)); + + foreach my $tblname (qw(tbl_corr tmp_corr)) + { + my $invalid_page_re = + $tblname eq 'tbl_corr' + ? qr/invalid page in block 1 of relation base\/\d+\/\d+/ + : qr/invalid page in block 1 of relation base\/\d+\/t\d+_\d+/; + + # verify the error is reported in custom C code + psql_like( + $io_method, + $psql, + "read_rel_block_ll() of $tblname page", + qq(SELECT read_rel_block_ll('$tblname', 1)), + qr/^$/, + $invalid_page_re); + + # verify the error is reported for bufmgr reads, seq scan + psql_like( + $io_method, $psql, + "sequential scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname), + qr/^$/, $invalid_page_re); + + # verify the error is reported for bufmgr reads, tid scan + psql_like( + $io_method, + $psql, + "tid scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname WHERE ctid = '(1, 1)'), + qr/^$/, + $invalid_page_re); + } + + $psql->quit(); +} + +# Test interplay between StartBufferIO and TerminateBufferIO +sub test_startwait_io +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + + ### Verify behavior for normal tables + + # create a buffer we can play around with + my $buf_id = psql_like( + $io_method, $psql_a, + "creation of toy buffer succeeds", + qq(SELECT buffer_create_toy('tbl_ok', 1)), + qr/^\d+$/, qr/^$/); + + # check that one backend can perform StartBufferIO + psql_like( + $io_method, + $psql_a, + "first StartBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # but not twice on the same buffer (non-waiting) + psql_like( + $io_method, + $psql_a, + "second StartBufferIO fails, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + psql_like( + $io_method, + $psql_b, + "second StartBufferIO fails, other session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO, without marking it as success, this should trigger the + # waiting session to be able to start the io + psql_like( + $io_method, + $psql_a, + "blocking start buffer io, terminating io, not valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + + # Because the IO was terminated, but not marked as valid, second session should get the right to start io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/t/); + ok(1, "$io_method: blocking start buffer io, can start io"); + + # terminate the IO again + $psql_b->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + + + # same as the above scenario, but mark IO as having succeeded + psql_like( + $io_method, + $psql_a, + "blocking buffer io w/ success: first start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO, marking it as success + psql_like( + $io_method, + $psql_a, + "blocking start buffer io, terminating io, valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + # Because the IO was terminated, and marked as valid, second session should complete but not need io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/f/); + ok(1, "$io_method: blocking start buffer io, no need to start io"); + + # buffer is valid now, make it invalid again + $psql_a->query_safe(qq(SELECT buffer_create_toy('tbl_ok', 1);)); + + + ### Verify behavior for temporary tables + + # Can't unfortunately share the code with the normal table case, there are + # too many behavioral differences. + + # create a buffer we can play around with + $psql_a->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_ok(data int not null); +INSERT INTO tmp_ok SELECT generate_series(1, 10000); +)); + $buf_id = $psql_a->query_safe(qq(SELECT buffer_create_toy('tmp_ok', 3);)); + + # check that one backend can perform StartLocalBufferIO + psql_like( + $io_method, + $psql_a, + "first StartLocalBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Because local buffers don't use IO_IN_PROGRESS, a second StartLocalBufer + # succeeds as well. This test mostly serves as a documentation of that + # fact. If we had actually started IO, it'd be different. + psql_like( + $io_method, + $psql_a, + "second StartLocalBufferIO succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, without marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after not marking valid succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false);) + ); + + # Now another StartLocalBufferIO should fail, this time because the buffer + # is already valid. + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after marking valid fails", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^f$/, + qr/^$/); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that if the backend issuing a read doesn't wait for the IO's +# completion, another backend can complete the IO +sub test_complete_foreign +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + # Issue IO without waiting for completion, then sleep + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql_b, + "completing read started by sleeping backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Issue IO without waiting for completion, then exit. + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + $psql_a->reconnect_and_clear(); + + # Check that another backend can read the relevant block. This verifies + # that the exiting backend left the AIO in a sane state. + psql_like( + $io_method, + $psql_b, + "read buffer started by exited backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Read a tbl_corr block, then sleep. The other session will retry the IO + # and also fail. The easiest thing to verify that seems to be to check + # that both are in the log. + my $log_location = -s $node->logfile; + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_corr', 1, wait_complete=>false);)); + + psql_like( + $io_method, + $psql_b, + "completing read of tbl_corr block started by other backend", + qq(SELECT count(*) FROM tbl_corr WHERE ctid = '(1,1)' LIMIT 1), + qr/^$/, + qr/invalid page in block/); + + # The log message issued for the read_rel_block_ll() should be logged as a LOG + $node->wait_for_log(qr/LOG[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: LOG message for background read" + ); + + # But for the SELECT, it should be an ERROR + $log_location = + $node->wait_for_log(qr/ERROR[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: ERROR message for foreground read" + ); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we deal correctly with FDs being closed while IO is in progress +sub test_close_fd +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + psql_like( + $io_method, + $psql, + "close all FDs after read, waiting for results", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>true, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>false, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting, query works", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + $psql->quit(); +} + +# Tests using injection points. Mostly to exercise hard IO errors that are +# hard to trigger without using injection points. +sub test_inject +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # injected what we'd expect + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, $psql, + "injection point not triggering failure", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^1$/, qr/^$/); + + # injected a read shorter than a single block, expecting error + $psql->query_safe(qq(SELECT inj_io_short_read_attach(17);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, + $psql, + "single block short read fails", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file "base\/.*": read only 0 of 8192 bytes/ + ); + + # shorten multi-block read to a single block, should retry + my $inval_query = qq(SELECT invalidate_rel_block('tbl_ok', 0); +SELECT invalidate_rel_block('tbl_ok', 1); +SELECT invalidate_rel_block('tbl_ok', 2); +SELECT invalidate_rel_block('tbl_ok', 3); +/* gap */ +SELECT invalidate_rel_block('tbl_ok', 5); +SELECT invalidate_rel_block('tbl_ok', 6); +SELECT invalidate_rel_block('tbl_ok', 7); +SELECT invalidate_rel_block('tbl_ok', 8);); + + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + psql_like( + $io_method, $psql, + "multi block short read (1 block) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # shorten multi-block read to two blocks, should retry + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192*2);)); + + psql_like( + $io_method, $psql, + "multi block short read (2 blocks) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # verify that page verification errors are detected even as part of a + # shortened multi-block read (tbl_corr, block 1 is corrupted) + $psql->query_safe( + qq( +SELECT invalidate_rel_block('tbl_corr', 0); +SELECT invalidate_rel_block('tbl_corr', 1); +SELECT invalidate_rel_block('tbl_corr', 2); +SELECT inj_io_short_read_attach(8192); + )); + + psql_like( + $io_method, + $psql, + "shortened multi-block read detects invalid page", + qq(SELECT count(*) FROM tbl_corr WHERE ctid < '(2, 1)'), + qr/^$/, + qr/ERROR:.*invalid page in block 1 of relation base\/.*/); + + # trigger a hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EIO')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + + psql_like( + $io_method, + $psql, + "first hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + psql_like( + $io_method, + $psql, + "second hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + # now the IO should be ok. + psql_like( + $io_method, $psql, + "recovers after hard error", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # trigger a different hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EROFS')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + psql_like( + $io_method, + $psql, + "different hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Read-only file system/ + ); + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + $psql->quit(); +} + +# Tests using injection points, only for io_method=worker. +# +# io_method=worker has the special case of needing to reopen files. That can +# in theory fail, because the file could be gone. That's a hard path to test +# for real, so we use an injection point to trigger it. +sub test_inject_worker +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # trigger a failure to reopen, should error out, but should recover + $psql->query_safe( + qq( +SELECT inj_io_reopen_attach(); +SELECT invalidate_rel_block('tbl_ok', 1); + )); + + psql_like( + $io_method, + $psql, + "failure to open: detected", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 1\.\.1 in file "base\/.*": No such file or directory/ + ); + + $psql->query_safe(qq(SELECT inj_io_reopen_detach();)); + + # check that we indeed recover + psql_like( + $io_method, $psql, + "failure to open: recovers", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + $psql->quit(); +} + +# Verify that we handle a relation getting removed (due to a rollback or a +# DROP TABLE) while IO is ongoing for that table. +sub test_invalidate +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal unlogged temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + my $tblname = $persistency . '_transactional'; + + my $create_sql = qq( +CREATE $sql_persistency TABLE $tblname (id int not null, data text not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO $tblname(id, data) SELECT generate_series(1, 10000) as id, repeat('a', 200); +); + + # Verify that outstanding read IO does not cause problems with + # AbortTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. + $psql->query_safe("BEGIN; $create_sql;"); + $psql->query_safe( + qq( +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + psql_like( + $io_method, + $psql, + "rollback of newly created $persistency table with outstanding IO", + qq(ROLLBACK), + qr/^$/, + qr/^$/); + + # Verify that outstanding read IO does not cause problems with + # CommitTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. + $psql->query_safe("BEGIN; $create_sql; COMMIT;"); + $psql->query_safe( + qq( +BEGIN; +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + + psql_like( + $io_method, $psql, + "drop $persistency table with outstanding IO", + qq(DROP TABLE $tblname), + qr/^$/, qr/^$/); + + psql_like($io_method, $psql, + "commit after drop $persistency table with outstanding IO", + qq(COMMIT), qr/^$/, qr/^$/); + } + + $psql->quit(); +} + +# Test behavior related to ZERO_ON_ERROR and zero_damaged_pages +sub test_zero +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + + $psql_a->query_safe( + qq( +CREATE $sql_persistency TABLE tbl_zero(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_zero SELECT generate_series(1, 10000); +)); + + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 0, corrupt_header=>true); +)); + + # Check that page validity errors are detected + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 0 of relation base\/.*\/.*$/ + ); + + # Check that page validity errors are zeroed + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>true)), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 0 of relation base\/.*\/.*; zeroing out page$/ + ); + + # And that once the corruption is fixed, we can read again + $psql_a->query( + qq( +SELECT modify_rel_block('tbl_zero', 0, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: test re-read of block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^$/); + + # Check a page validity error in another block, to ensure we report + # the correct block number + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 3", + qq(SELECT read_rel_block_ll('tbl_zero', 3, zero_on_error=>true);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 3 of relation base\/.*\/.*; zeroing out page$/ + ); + + + # Check one read reporting multiple invalid blocks + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 2, corrupt_header=>true); +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + # First test error + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 2,3 in larger read", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing via ZERO_ON_ERROR flag + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, ZERO_ON_ERROR", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>true)), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first zeroed page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing via zero_damaged_pages + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, zero_damaged_pages", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false) +COMMIT; +), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first zeroed page\.\nHINT:[^\n]+$/ + ); + + $psql_a->query_safe(qq(COMMIT)); + + + # Verify that bufmgr.c IO detects page validity errors + $psql_a->query( + qq( +SELECT invalidate_rel_block('tbl_zero', g.i) +FROM generate_series(0, 15) g(i); +SELECT modify_rel_block('tbl_zero', 3, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: verify reading zero_damaged_pages=off", + qq( +SELECT count(*) FROM tbl_zero), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Verify that bufmgr.c IO zeroes out pages with page validity errors + psql_like( + $io_method, + $psql_a, + "$persistency: verify zero_damaged_pages=on", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT count(*) FROM tbl_zero; +COMMIT; +), + qr/^\d+$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Check that warnings/errors about page validity in an IO started by + # session A that session B might complete aren't logged visibly to + # session B. + # + # This will only ever trigger for io_method's like io_uring, that can + # complete IO's in a client backend. But it doesn't seem worth + # restricting to that. + # + # This requires cross-session access to the same relation, hence the + # restriction to non-temporary table. + if ($sql_persistency ne 'temporary') + { + # Create a corruption and then read the block without waiting for + # completion. + $psql_a->query(qq( +SELECT modify_rel_block('tbl_zero', 1, corrupt_header=>true); +SELECT read_rel_block_ll('tbl_zero', 1, wait_complete=>false, zero_on_error=>true) +)); + + psql_like( + $io_method, + $psql_b, + "$persistency: test completing read by other session doesn't generate warning", + qq(SELECT count(*) > 0 FROM tbl_zero;), + qr/^t$/, qr/^$/); + } + + # Clean up + $psql_a->query_safe( + qq( +DROP TABLE tbl_zero; +)); + } + + $psql_a->{stderr} = ''; + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we detect checksum failures and report them +sub test_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + + $psql_a->query_safe( + qq( +CREATE TABLE tbl_normal(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_normal SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_normal', 3, corrupt_checksum=>true); + +CREATE TEMPORARY TABLE tbl_temp(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_temp SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_temp', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_temp', 4, corrupt_checksum=>true); +)); + + # To be able to test checksum failures on shared rels we need a shared rel + # with invalid pages - which is a bit scary. pg_shseclabel seems like a + # good bet, as it's not accessed in a default configuration. + $psql_a->query_safe( + qq( +SELECT grow_rel('pg_shseclabel', 4); +SELECT modify_rel_block('pg_shseclabel', 2, corrupt_checksum=>true); +SELECT modify_rel_block('pg_shseclabel', 3, corrupt_checksum=>true); +)); + + + # Check that page validity errors are detected, checksums stats increase, normal rel + my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "normal rel: test reading of invalid block 3", + qq( +SELECT read_rel_block_ll('tbl_normal', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation base\/\d+\/\d+$/ + ); + + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: normal rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: normal rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksums stats increase, temp rel + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "temp rel: test reading of invalid block 4, valid block 5", + qq( +SELECT read_rel_block_ll('tbl_temp', 4, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/t\d+_\d+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: temp rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: temp rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksums stats increase, shared rel + ($cs_count_before, $cs_ts_after) = checksum_failures($psql_a); + psql_like( + $io_method, + $psql_a, + "shared rel: reading of invalid blocks 2+3", + qq( +SELECT read_rel_block_ll('pg_shseclabel', 2, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: 2 invalid pages among blocks 2..3 of relation global\/\d+\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: shared rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: shared rel: checksum timestamp is not null"); + + + # and restore sanity + $psql_a->query( + qq( +SELECT modify_rel_block('pg_shseclabel', 1, zero=>true); +DROP TABLE tbl_normal; +)); + $psql_a->{stderr} = ''; + + $psql_a->quit(); +} + +# Verify checksum handling when creating database from a database with an +# invalid block. This also serves as a minimal check that cross-database IO is +# handled reasonably. +sub test_checksum_createdb +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $node->safe_psql('postgres', + 'CREATE DATABASE regression_createdb_source'); + + $node->safe_psql( + 'regression_createdb_source', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_cs_fail(data int not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 1000); +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +)); + + my $createdb_sql = qq( +CREATE DATABASE regression_createdb_target +TEMPLATE regression_createdb_source +STRATEGY wal_log; +); + + # Verify that CREATE DATABASE of an invalid database fails and is + # accounted for accurately. + my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql, 'regression_createdb_source'); + psql_like( + $io_method, + $psql, + "create database w/ wal strategy, invalid source", + $createdb_sql, + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 1 of relation base\/\d+\/\d+$/ + ); + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql, 'regression_createdb_source'); + cmp_ok($cs_count_before + 1, '<=', $cs_count_after, + "$io_method: create database w/ wal strategy, invalid source: checksum count increased" + ); + + # Verify that CREATE DATABASE of the fixed database succeeds. + $node->safe_psql( + 'regression_createdb_source', qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +)); + psql_like($io_method, $psql, + "create database w/ wal strategy, valid source", + $createdb_sql, qr/^$/, qr/^$/); + + $psql->quit(); +} + +# Test that we detect checksum failures and report them +# +# In several places we make sure that the server log actually contains +# individual information for each block involved in the IO. +sub test_ignore_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # Test setup + $psql->query_safe( + qq( +CREATE TABLE tbl_cs_fail(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 10000); +)); + + my $count_sql = "SELECT count(*) FROM tbl_cs_fail"; + my $invalidate_sql = qq( +SELECT invalidate_rel_block('tbl_cs_fail', g.i) +FROM generate_series(0, 6) g(i); +); + + my $expect = $psql->query_safe($count_sql); + + + # Very basic tests for ignore_checksum_failure=off / on + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_checksum=>true); +)); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=off fails", + $count_sql, qr/^$/, qr/ERROR: invalid page in block/); + + $psql->query_safe("SET ignore_checksum_failure=on"); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=off succeeds", + $count_sql, + qr/^$expect$/, + qr/WARNING: ignoring (checksum failure|\d checksum failures)/); + + + # Verify that ignore_checksum_failure=off works in multi-block reads + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 2, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +)); + + my $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of checksum failed block 3, with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: ignoring checksum failure in block 3/ + ); + + # Check that the log contains a LOG message about the failure + $log_location = + $node->wait_for_log(qr/LOG: ignoring checksum failure/, $log_location); + + # check that we error + psql_like( + $io_method, + $psql, + "test reading of valid block 2, checksum failed 3, invalid 4, zero=false with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 2, nblocks=>3, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/\d+$/ + ); + + # Test multi-block read with different problems in different blocks + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 2, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of valid block 1, checksum failed 2, 3, invalid 3-5, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 1, nblocks=>5, zero_on_error=>true);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: zeroing 3 page\(s\) and ignoring 2 checksum failure\(s\) among blocks 1..5 of relation/ + ); + + + # Unfortunately have to scan the whole log since determining $log_location + # above in each of the tests, as wait_for_log() returns the size of the + # file. + + $node->wait_for_log(qr/LOG: ignoring checksum failure in block 2/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 2"); + + $node->wait_for_log(qr/LOG: invalid page in block 3 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 3"); + + $node->wait_for_log(qr/LOG: invalid page in block 4 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 4"); + + $node->wait_for_log(qr/LOG: invalid page in block 5 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 5"); + + + # Reading a page with both an invalid header and an invalid checksum + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + psql_like( + $io_method, + $psql, + "test reading of block with both invalid header and invalid checksum, zero=false", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation/ + ); + + psql_like( + $io_method, + $psql, + "test reading of block 3 with both invalid header and invalid checksum, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>true);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 3 of relation base\/.*; zeroing out page/ + ); + + + $psql->quit(); +} + + +# Run all tests that are supported for all io_methods +sub test_generic +{ + my $io_method = shift; + my $node = shift; + + is($node->safe_psql('postgres', 'SHOW io_method'), + $io_method, "$io_method: io_method set correctly"); + + $node->safe_psql( + 'postgres', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_corr(data int not null) WITH (AUTOVACUUM_ENABLED = false); +CREATE TABLE tbl_ok(data int not null) WITH (AUTOVACUUM_ENABLED = false); + +INSERT INTO tbl_corr SELECT generate_series(1, 10000); +INSERT INTO tbl_ok SELECT generate_series(1, 10000); +SELECT grow_rel('tbl_corr', 16); +SELECT grow_rel('tbl_ok', 16); + +SELECT modify_rel_block('tbl_corr', 1, corrupt_header=>true); +CHECKPOINT; +)); + + test_handle($io_method, $node); + test_io_error($io_method, $node); + test_batchmode($io_method, $node); + test_startwait_io($io_method, $node); + test_complete_foreign($io_method, $node); + test_close_fd($io_method, $node); + test_invalidate($io_method, $node); + test_zero($io_method, $node); + test_checksum($io_method, $node); + test_ignore_checksum($io_method, $node); + test_checksum_createdb($io_method, $node); + + SKIP: + { + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject($io_method, $node); + } +} diff --git a/src/test/modules/test_aio/t/002_io_workers.pl b/src/test/modules/test_aio/t/002_io_workers.pl new file mode 100644 index 00000000000..af5fae15ea7 --- /dev/null +++ b/src/test/modules/test_aio/t/002_io_workers.pl @@ -0,0 +1,125 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use List::Util qw(shuffle); + + +my $node = PostgreSQL::Test::Cluster->new('worker'); +$node->init(); +$node->append_conf( + 'postgresql.conf', qq( +io_method=worker +)); + +$node->start(); + +# Test changing the number of I/O worker processes while also evaluating the +# handling of their termination. +test_number_of_io_workers_dynamic($node); + +$node->stop(); + +done_testing(); + + +sub test_number_of_io_workers_dynamic +{ + my $node = shift; + + my $prev_worker_count = $node->safe_psql('postgres', 'SHOW io_workers'); + + # Verify that worker count can't be set to 0 + change_number_of_io_workers($node, 0, $prev_worker_count, 1); + + # Verify that worker count can't be set to 33 (above the max) + change_number_of_io_workers($node, 33, $prev_worker_count, 1); + + # Try changing IO workers to a random value and verify that the worker + # count ends up as expected. Always test the min/max of workers. + # + # Valid range for io_workers is [1, 32]. 8 tests in total seems + # reasonable. + my @io_workers_range = shuffle(1 ... 32); + foreach my $worker_count (1, 32, @io_workers_range[ 0, 6 ]) + { + $prev_worker_count = + change_number_of_io_workers($node, $worker_count, + $prev_worker_count, 0); + } +} + +sub change_number_of_io_workers +{ + my $node = shift; + my $worker_count = shift; + my $prev_worker_count = shift; + my $expect_failure = shift; + my ($result, $stdout, $stderr); + + ($result, $stdout, $stderr) = + $node->psql('postgres', "ALTER SYSTEM SET io_workers = $worker_count"); + $node->safe_psql('postgres', 'SELECT pg_reload_conf()'); + + if ($expect_failure) + { + ok( $stderr =~ + /$worker_count is outside the valid range for parameter "io_workers"/, + "updating number of io_workers to $worker_count failed, as expected" + ); + + return $prev_worker_count; + } + else + { + is( $node->safe_psql('postgres', 'SHOW io_workers'), + $worker_count, + "updating number of io_workers from $prev_worker_count to $worker_count" + ); + + check_io_worker_count($node, $worker_count); + terminate_io_worker($node, $worker_count); + check_io_worker_count($node, $worker_count); + + return $worker_count; + } +} + +sub terminate_io_worker +{ + my $node = shift; + my $worker_count = shift; + my ($pid, $ret); + + # Select a random io worker + $pid = $node->safe_psql( + 'postgres', + qq(SELECT pid FROM pg_stat_activity WHERE + backend_type = 'io worker' ORDER BY RANDOM() LIMIT 1)); + + # terminate IO worker with SIGINT + is(PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'INT', $pid), + 0, "random io worker process signalled with INT"); + + # Check that worker exits + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE pid = $pid), '0'), + "random io worker process exited after signal"); +} + +sub check_io_worker_count +{ + my $node = shift; + my $worker_count = shift; + + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'io worker'), + $worker_count), + "io worker count is $worker_count"); +} diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql new file mode 100644 index 00000000000..e495481c41e --- /dev/null +++ b/src/test/modules/test_aio/test_aio--1.0.sql @@ -0,0 +1,108 @@ +/* src/test/modules/test_aio/test_aio--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_aio" to load this file. \quit + + +CREATE FUNCTION errno_from_string(sym text) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION grow_rel(rel regclass, nblocks int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION modify_rel_block(rel regclass, blockno int, + zero bool DEFAULT false, + corrupt_header bool DEFAULT false, + corrupt_checksum bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION read_rel_block_ll( + rel regclass, + blockno int, + nblocks int DEFAULT 1, + wait_complete bool DEFAULT true, + batchmode_enter bool DEFAULT false, + smgrreleaseall bool DEFAULT false, + batchmode_exit bool DEFAULT false, + zero_on_error bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_create_toy(rel regclass, blockno int4) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_start_io(buffer int, for_input bool, nowait bool) +RETURNS pg_catalog.bool STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_terminate_io(buffer int, for_input bool, succeed bool, io_error bool, release_aio bool) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Handle related functions + */ +CREATE FUNCTION handle_get_and_error() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_twice() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_release() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_release_last() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +/* + * Batchmode related functions + */ +CREATE FUNCTION batch_start() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION batch_end() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Injection point related functions + */ +CREATE FUNCTION inj_io_short_read_attach(result int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_short_read_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_attach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c new file mode 100644 index 00000000000..bef0ecd9007 --- /dev/null +++ b/src/test/modules/test_aio/test_aio.c @@ -0,0 +1,806 @@ +/*------------------------------------------------------------------------- + * + * test_aio.c + * Helpers to write tests for AIO + * + * This module provides interface functions for C functionality to SQL, to + * make it possible to test AIO related behavior in a targeted way from SQL. + * It'd not generally be safe to export these functions to SQL, but for a test + * that's fine. + * + * Copyright (c) 2020-2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_aio/test_aio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "fmgr.h" +#include "storage/aio.h" +#include "storage/aio_internal.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/builtins.h" +#include "utils/injection_point.h" +#include "utils/rel.h" + + +PG_MODULE_MAGIC; + + +typedef struct InjIoErrorState +{ + bool enabled_short_read; + bool enabled_reopen; + + bool short_read_result_set; + int short_read_result; +} InjIoErrorState; + +static InjIoErrorState * inj_io_error_state; + +/* Shared memory init callbacks */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + + +static PgAioHandle *last_handle; + + + +static void +test_aio_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(sizeof(InjIoErrorState)); +} + +static void +test_aio_shmem_startup(void) +{ + bool found; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + inj_io_error_state = ShmemInitStruct("injection_points", + sizeof(InjIoErrorState), + &found); + + if (!found) + { + /* First time through, initialize */ + inj_io_error_state->enabled_short_read = false; + inj_io_error_state->enabled_reopen = false; + +#ifdef USE_INJECTION_POINTS + InjectionPointAttach("AIO_PROCESS_COMPLETION_BEFORE_SHARED", + "test_aio", + "inj_io_short_read", + NULL, + 0); + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + + InjectionPointAttach("AIO_WORKER_AFTER_REOPEN", + "test_aio", + "inj_io_reopen", + NULL, + 0); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + +#endif + } + else + { + /* + * Pre-load the injection points now, so we can call them in a + * critical section. + */ +#ifdef USE_INJECTION_POINTS + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + elog(LOG, "injection point loaded"); +#endif + } + + LWLockRelease(AddinShmemInitLock); +} + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = test_aio_shmem_request; + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = test_aio_shmem_startup; +} + + +PG_FUNCTION_INFO_V1(errno_from_string); +Datum +errno_from_string(PG_FUNCTION_ARGS) +{ + const char *sym = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (strcmp(sym, "EIO") == 0) + PG_RETURN_INT32(EIO); + else if (strcmp(sym, "EAGAIN") == 0) + PG_RETURN_INT32(EAGAIN); + else if (strcmp(sym, "EINTR") == 0) + PG_RETURN_INT32(EINTR); + else if (strcmp(sym, "ENOSPC") == 0) + PG_RETURN_INT32(ENOSPC); + else if (strcmp(sym, "EROFS") == 0) + PG_RETURN_INT32(EROFS); + + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_internal("%s is not a supported errno value", sym)); + PG_RETURN_INT32(0); +} + +PG_FUNCTION_INFO_V1(grow_rel); +Datum +grow_rel(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + uint32 nblocks = PG_GETARG_UINT32(1); + Relation rel; +#define MAX_BUFFERS_TO_EXTEND_BY 64 + Buffer victim_buffers[MAX_BUFFERS_TO_EXTEND_BY]; + + rel = relation_open(relid, AccessExclusiveLock); + + while (nblocks > 0) + { + uint32 extend_by_pages; + + extend_by_pages = Min(nblocks, MAX_BUFFERS_TO_EXTEND_BY); + + ExtendBufferedRelBy(BMR_REL(rel), + MAIN_FORKNUM, + NULL, + 0, + extend_by_pages, + victim_buffers, + &extend_by_pages); + + nblocks -= extend_by_pages; + + for (uint32 i = 0; i < extend_by_pages; i++) + { + ReleaseBuffer(victim_buffers[i]); + } + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(modify_rel_block); +Datum +modify_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + bool zero = PG_GETARG_BOOL(2); + bool corrupt_header = PG_GETARG_BOOL(3); + bool corrupt_checksum = PG_GETARG_BOOL(4); + Page page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + Relation rel; + Buffer buf; + PageHeader ph; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * copy the page to local memory, seems nicer than to directly modify in + * the buffer pool. + */ + memcpy(page, BufferGetPage(buf), BLCKSZ); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + ReleaseBuffer(buf); + + /* + * Don't want to have a buffer in-memory that's marked valid where the + * on-disk contents are invalid. Particularly not if the in-memory buffer + * could be dirty... + * + * While we hold an AEL on the relation nobody else should be able to read + * the buffer in. + * + * NB: This is probably racy, better don't copy this to non-test code. + */ + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else + EvictUnpinnedBuffer(buf); + + /* + * Now modify the page as asked for by the caller. + */ + if (zero) + memset(page, 0, BufferGetPageSize(buf)); + + if (PageIsEmpty(page) && (corrupt_header || corrupt_checksum)) + PageInit(page, BufferGetPageSize(buf), 0); + + ph = (PageHeader) page; + + if (corrupt_header) + ph->pd_special = BLCKSZ + 1; + + if (corrupt_checksum) + { + bool successfully_corrupted = 0; + + /* + * Any single modification of the checksum could just end up being + * valid again, due to e.g. corrupt_header changing the data in a way + * that'd result in the "corrupted" checksum, or the checksum already + * being invalid. Retry in that, unlikely, case. + */ + for (int i = 0; i < 100; i++) + { + uint16 verify_checksum; + uint16 old_checksum; + + old_checksum = ph->pd_checksum; + ph->pd_checksum = old_checksum + 1; + + elog(LOG, "corrupting checksum of blk %u from %u to %u", + blkno, old_checksum, ph->pd_checksum); + + verify_checksum = pg_checksum_page(page, blkno); + if (verify_checksum != ph->pd_checksum) + { + successfully_corrupted = true; + break; + } + } + + if (!successfully_corrupted) + elog(ERROR, "could not corrupt checksum, what's going on?"); + } + else + { + PageSetChecksumInplace(page, blkno); + } + + smgrwrite(RelationGetSmgr(rel), + MAIN_FORKNUM, blkno, page, true); + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +/* + * Ensures a buffer for rel & blkno is in shared buffers, without actually + * caring about the buffer contents. Used to set up test scenarios. + */ +static Buffer +create_toy_buffer(Relation rel, BlockNumber blkno) +{ + Buffer buf; + BufferDesc *buf_hdr; + uint32 buf_state; + bool was_pinned = false; + + /* place buffer in shared buffers without erroring out */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (RelationUsesLocalBuffers(rel)) + { + buf_hdr = GetLocalBufferDescriptor(-buf - 1); + buf_state = pg_atomic_read_u32(&buf_hdr->state); + } + else + { + buf_hdr = GetBufferDescriptor(buf - 1); + buf_state = LockBufHdr(buf_hdr); + } + + /* + * We should be the only backend accessing this buffer. This is just a + * small bit of belt-and-suspenders defense, none of this code should ever + * run in a cluster with real data. + */ + if (BUF_STATE_GET_REFCOUNT(buf_state) > 1) + was_pinned = true; + else + buf_state &= ~(BM_VALID | BM_DIRTY); + + if (RelationUsesLocalBuffers(rel)) + pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); + else + UnlockBufHdr(buf_hdr, buf_state); + + if (was_pinned) + elog(ERROR, "toy buffer %d was already pinned", + buf); + + return buf; +} + +/* + * A "low level" read. This does similar things to what + * StartReadBuffers()/WaitReadBuffers() do, but provides more control (and + * less sanity). + */ +PG_FUNCTION_INFO_V1(read_rel_block_ll); +Datum +read_rel_block_ll(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + int nblocks = PG_GETARG_INT32(2); + bool wait_complete = PG_GETARG_BOOL(3); + bool batchmode_enter = PG_GETARG_BOOL(4); + bool call_smgrreleaseall = PG_GETARG_BOOL(5); + bool batchmode_exit = PG_GETARG_BOOL(6); + bool zero_on_error = PG_GETARG_BOOL(7); + Relation rel; + Buffer bufs[PG_IOV_MAX]; + BufferDesc *buf_hdrs[PG_IOV_MAX]; + Page pages[PG_IOV_MAX]; + uint8 srb_flags = 0; + PgAioReturn ior; + PgAioHandle *ioh; + PgAioWaitRef iow; + SMgrRelation smgr; + + if (nblocks <= 0 || nblocks > PG_IOV_MAX) + elog(ERROR, "nblocks is out of range"); + + rel = relation_open(relid, AccessExclusiveLock); + + for (int i = 0; i < nblocks; i++) + { + bufs[i] = create_toy_buffer(rel, blkno + i); + pages[i] = BufferGetBlock(bufs[i]); + buf_hdrs[i] = BufferIsLocal(bufs[i]) ? + GetLocalBufferDescriptor(-bufs[i] - 1) : + GetBufferDescriptor(bufs[i] - 1); + } + + smgr = RelationGetSmgr(rel); + + pgstat_prepare_report_checksum_failure(smgr->smgr_rlocator.locator.dbOid); + + ioh = pgaio_io_acquire(CurrentResourceOwner, &ior); + pgaio_io_get_wref(ioh, &iow); + + if (RelationUsesLocalBuffers(rel)) + { + for (int i = 0; i < nblocks; i++) + StartLocalBufferIO(buf_hdrs[i], true, false); + pgaio_io_set_flag(ioh, PGAIO_HF_REFERENCES_LOCAL); + } + else + { + for (int i = 0; i < nblocks; i++) + StartBufferIO(buf_hdrs[i], true, false); + } + + pgaio_io_set_handle_data_32(ioh, (uint32 *) bufs, nblocks); + + if (zero_on_error | zero_damaged_pages) + srb_flags |= READ_BUFFERS_ZERO_ON_ERROR; + if (ignore_checksum_failure) + srb_flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES; + + pgaio_io_register_callbacks(ioh, + RelationUsesLocalBuffers(rel) ? + PGAIO_HCB_LOCAL_BUFFER_READV : + PGAIO_HCB_SHARED_BUFFER_READV, + srb_flags); + + if (batchmode_enter) + pgaio_enter_batchmode(); + + smgrstartreadv(ioh, smgr, MAIN_FORKNUM, blkno, + (void *) pages, nblocks); + + if (call_smgrreleaseall) + smgrreleaseall(); + + if (batchmode_exit) + pgaio_exit_batchmode(); + + for (int i = 0; i < nblocks; i++) + ReleaseBuffer(bufs[i]); + + if (wait_complete) + { + pgaio_wref_wait(&iow); + + if (ior.result.status != PGAIO_RS_OK) + pgaio_result_report(ior.result, + &ior.target_data, + ior.result.status == PGAIO_RS_ERROR ? + ERROR : WARNING); + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(invalidate_rel_block); +Datum +invalidate_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + PrefetchBufferResult pr; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + /* + * This is a gross hack, but there's no other API exposed that allows to + * get a buffer ID without actually reading the block in. + */ + pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno); + buf = pr.recent_buffer; + + if (BufferIsValid(buf)) + { + /* if the buffer contents aren't valid, this'll return false */ + if (ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf)) + { + BufferDesc *buf_hdr = BufferIsLocal(buf) ? + GetLocalBufferDescriptor(-buf - 1) + : GetBufferDescriptor(buf - 1); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (pg_atomic_read_u32(&buf_hdr->state) & BM_DIRTY) + { + if (BufferIsLocal(buf)) + FlushLocalBuffer(buf_hdr, NULL); + else + FlushOneBuffer(buf); + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else if (!EvictUnpinnedBuffer(buf)) + elog(ERROR, "couldn't evict"); + } + } + + relation_close(rel, AccessExclusiveLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(buffer_create_toy); +Datum +buffer_create_toy(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = create_toy_buffer(rel, blkno); + ReleaseBuffer(buf); + + relation_close(rel, NoLock); + + PG_RETURN_INT32(buf); +} + +PG_FUNCTION_INFO_V1(buffer_call_start_io); +Datum +buffer_call_start_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool nowait = PG_GETARG_BOOL(2); + bool can_start; + + if (BufferIsLocal(buf)) + can_start = StartLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + for_input, nowait); + else + can_start = StartBufferIO(GetBufferDescriptor(buf - 1), + for_input, nowait); + + /* + * For tests we don't want the resowner release preventing us from + * orchestrating odd scenarios. + */ + if (can_start && !BufferIsLocal(buf)) + ResourceOwnerForgetBufferIO(CurrentResourceOwner, + buf); + + ereport(LOG, + errmsg("buffer %d after StartBufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_BOOL(can_start); +} + +PG_FUNCTION_INFO_V1(buffer_call_terminate_io); +Datum +buffer_call_terminate_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool succeed = PG_GETARG_BOOL(2); + bool io_error = PG_GETARG_BOOL(3); + bool release_aio = PG_GETARG_BOOL(4); + bool clear_dirty = false; + uint32 set_flag_bits = 0; + + if (io_error) + set_flag_bits |= BM_IO_ERROR; + + if (for_input) + { + clear_dirty = false; + + if (succeed) + set_flag_bits |= BM_VALID; + } + else + { + if (succeed) + clear_dirty = true; + } + + ereport(LOG, + errmsg("buffer %d before Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + if (BufferIsLocal(buf)) + TerminateLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + clear_dirty, set_flag_bits, release_aio); + else + TerminateBufferIO(GetBufferDescriptor(buf - 1), + clear_dirty, set_flag_bits, false, release_aio); + + ereport(LOG, + errmsg("buffer %d after Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get); +Datum +handle_get(PG_FUNCTION_ARGS) +{ + last_handle = pgaio_io_acquire(CurrentResourceOwner, NULL); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_release_last); +Datum +handle_release_last(PG_FUNCTION_ARGS) +{ + if (!last_handle) + elog(ERROR, "no handle"); + + pgaio_io_release(last_handle); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_and_error); +Datum +handle_get_and_error(PG_FUNCTION_ARGS) +{ + pgaio_io_acquire(CurrentResourceOwner, NULL); + + elog(ERROR, "as you command"); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_twice); +Datum +handle_get_twice(PG_FUNCTION_ARGS) +{ + pgaio_io_acquire(CurrentResourceOwner, NULL); + pgaio_io_acquire(CurrentResourceOwner, NULL); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_release); +Datum +handle_get_release(PG_FUNCTION_ARGS) +{ + PgAioHandle *handle; + + handle = pgaio_io_acquire(CurrentResourceOwner, NULL); + pgaio_io_release(handle); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(batch_start); +Datum +batch_start(PG_FUNCTION_ARGS) +{ + pgaio_enter_batchmode(); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(batch_end); +Datum +batch_end(PG_FUNCTION_ARGS) +{ + pgaio_exit_batchmode(); + PG_RETURN_VOID(); +} + +#ifdef USE_INJECTION_POINTS +extern PGDLLEXPORT void inj_io_short_read(const char *name, const void *private_data); +extern PGDLLEXPORT void inj_io_reopen(const char *name, const void *private_data); + +void +inj_io_short_read(const char *name, const void *private_data) +{ + PgAioHandle *ioh; + + ereport(LOG, + errmsg("short read injection point called, is enabled: %d", + inj_io_error_state->enabled_reopen), + errhidestmt(true), errhidecontext(true)); + + if (inj_io_error_state->enabled_short_read) + { + ioh = pgaio_inj_io_get(); + + /* + * Only shorten reads that are actually longer than the target size, + * otherwise we can trigger over-reads. + */ + if (inj_io_error_state->short_read_result_set + && ioh->op == PGAIO_OP_READV + && inj_io_error_state->short_read_result <= ioh->result) + { + struct iovec *iov = &pgaio_ctl->iovecs[ioh->iovec_off]; + int32 old_result = ioh->result; + int32 new_result = inj_io_error_state->short_read_result; + int32 processed = 0; + + ereport(LOG, + errmsg("short read inject point, changing result from %d to %d", + old_result, new_result), + errhidestmt(true), errhidecontext(true)); + + /* + * The underlying IO actually completed OK, and thus the "invalid" + * portion of the IOV actually contains valid data. That can hide + * a lot of problems, e.g. if we were to wrongly mark a buffer, + * that wasn't read according to the shortened-read, IO as valid, + * the contents would look valid and we might miss a bug. + * + * To avoid that, iterate through the IOV and zero out the + * "failed" portion of the IO. + */ + for (int i = 0; i < ioh->op_data.read.iov_length; i++) + { + if (processed + iov[i].iov_len <= new_result) + processed += iov[i].iov_len; + else if (processed <= new_result) + { + uint32 ok_part = new_result - processed; + + memset((char *) iov[i].iov_base + ok_part, 0, iov[i].iov_len - ok_part); + processed += iov[i].iov_len; + } + else + { + memset((char *) iov[i].iov_base, 0, iov[i].iov_len); + } + } + + ioh->result = new_result; + } + } +} + +void +inj_io_reopen(const char *name, const void *private_data) +{ + ereport(LOG, + errmsg("reopen injection point called, is enabled: %d", + inj_io_error_state->enabled_reopen), + errhidestmt(true), errhidecontext(true)); + + if (inj_io_error_state->enabled_reopen) + elog(ERROR, "injection point triggering failure to reopen "); +} +#endif + +PG_FUNCTION_INFO_V1(inj_io_short_read_attach); +Datum +inj_io_short_read_attach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_short_read = true; + inj_io_error_state->short_read_result_set = !PG_ARGISNULL(0); + if (inj_io_error_state->short_read_result_set) + inj_io_error_state->short_read_result = PG_GETARG_INT32(0); +#else + elog(ERROR, "injection points not supported"); +#endif + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_short_read_detach); +Datum +inj_io_short_read_detach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_short_read = false; +#else + elog(ERROR, "injection points not supported"); +#endif + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_reopen_attach); +Datum +inj_io_reopen_attach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_reopen = true; +#else + elog(ERROR, "injection points not supported"); +#endif + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_reopen_detach); +Datum +inj_io_reopen_detach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_reopen = false; +#else + elog(ERROR, "injection points not supported"); +#endif + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_aio/test_aio.control b/src/test/modules/test_aio/test_aio.control new file mode 100644 index 00000000000..cd91c3ed16b --- /dev/null +++ b/src/test/modules/test_aio/test_aio.control @@ -0,0 +1,3 @@ +comment = 'Test code for AIO' +default_version = '1.0' +module_pathname = '$libdir/test_aio' |