diff options
Diffstat (limited to 'src')
-rw-r--r--  src/backend/storage/buffer/bufmgr.c  |   6
-rw-r--r--  src/backend/storage/page/bufpage.c   |  99
-rw-r--r--  src/backend/storage/page/checksum.c  | 146
-rw-r--r--  src/include/storage/checksum.h       |   9
-rw-r--r--  src/include/storage/checksum_impl.h  | 207
5 files changed, 251 insertions, 216 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 43eb7d59f46..c6b033cf417 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1982,9 +1982,13 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) * have been able to write it while we were busy with log flushing because * we have the io_in_progress lock. */ - bufBlock = BufHdrGetBlock(buf); + /* + * Update page checksum if desired. Since we have only shared lock on the + * buffer, other processes might be updating hint bits in it, so we must + * copy the page to private storage if we do checksumming. + */ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); if (track_io_timing) diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index a5594bde64e..36b88c5729b 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -17,13 +17,12 @@ #include "access/htup_details.h" #include "access/xlog.h" #include "storage/checksum.h" +#include "utils/memutils.h" -bool ignore_checksum_failure = false; -static char pageCopyData[BLCKSZ]; /* for checksum calculation */ -static Page pageCopy = pageCopyData; +/* GUC variable */ +bool ignore_checksum_failure = false; -static uint16 PageCalcChecksum16(Page page, BlockNumber blkno); /* ---------------------------------------------------------------- * Page support functions @@ -94,7 +93,7 @@ PageIsVerified(Page page, BlockNumber blkno) { if (DataChecksumsEnabled()) { - checksum = PageCalcChecksum16(page, blkno); + checksum = pg_checksum_page((char *) page, blkno); if (checksum != p->pd_checksum) checksum_failure = true; @@ -885,13 +884,16 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) pfree(itemidbase); } + /* - * Set checksum for page in shared buffers. + * Set checksum for a page in shared buffers. * * If checksums are disabled, or if the page is not initialized, just return - * the input. 
Otherwise, we must make a copy of the page before calculating the - * checksum, to prevent concurrent modifications (e.g. setting hint bits) from - * making the final checksum invalid. + * the input. Otherwise, we must make a copy of the page before calculating + * the checksum, to prevent concurrent modifications (e.g. setting hint bits) + * from making the final checksum invalid. It doesn't matter if we include or + * exclude hints during the copy, as long as we write a valid page and + * associated checksum. * * Returns a pointer to the block-sized data that needs to be written. Uses * statically-allocated memory, so the caller must immediately write the @@ -900,79 +902,38 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) char * PageSetChecksumCopy(Page page, BlockNumber blkno) { + static char *pageCopy = NULL; + + /* If we don't need a checksum, just return the passed-in data */ if (PageIsNew(page) || !DataChecksumsEnabled()) return (char *) page; /* - * We make a copy iff we need to calculate a checksum because other - * backends may set hint bits on this page while we write, which would - * mean the checksum differs from the page contents. It doesn't matter if - * we include or exclude hints during the copy, as long as we write a - * valid page and associated checksum. + * We allocate the copy space once and use it over on each subsequent + * call. The point of palloc'ing here, rather than having a static char + * array, is first to ensure adequate alignment for the checksumming code + * and second to avoid wasting space in processes that never call this. 
*/ - memcpy((char *) pageCopy, (char *) page, BLCKSZ); - PageSetChecksumInplace(pageCopy, blkno); - return (char *) pageCopy; + if (pageCopy == NULL) + pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); + + memcpy(pageCopy, (char *) page, BLCKSZ); + ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + return pageCopy; } /* - * Set checksum for page in private memory. + * Set checksum for a page in private memory. * - * This is a simpler version of PageSetChecksumCopy(). The more explicit API - * allows us to more easily see if we're making the correct call and reduces - * the amount of additional code specific to page verification. + * This must only be used when we know that no other process can be modifying + * the page buffer. */ void PageSetChecksumInplace(Page page, BlockNumber blkno) { - if (PageIsNew(page)) + /* If we don't need a checksum, just return */ + if (PageIsNew(page) || !DataChecksumsEnabled()) return; - if (DataChecksumsEnabled()) - { - PageHeader p = (PageHeader) page; - - p->pd_checksum = PageCalcChecksum16(page, blkno); - } - - return; -} - -/* - * Calculate checksum for a PostgreSQL Page. This includes the block number (to - * detect the case when a page is somehow moved to a different location), the - * page header (excluding the checksum itself), and the page data. - * - * Note that if the checksum validation fails we cannot tell the difference - * between a transposed block and failure from direct on-block corruption, - * though that is better than just ignoring transposed blocks altogether. - */ -static uint16 -PageCalcChecksum16(Page page, BlockNumber blkno) -{ - PageHeader phdr = (PageHeader) page; - uint16 save_checksum; - uint32 checksum; - - /* only calculate the checksum for properly-initialized pages */ - Assert(!PageIsNew(page)); - - /* - * Save pd_checksum and set it to zero, so that the checksum calculation - * isn't affected by the checksum stored on the page. 
We do this to allow - * optimization of the checksum calculation on the whole block in one go. - */ - save_checksum = phdr->pd_checksum; - phdr->pd_checksum = 0; - checksum = checksum_block(page, BLCKSZ); - phdr->pd_checksum = save_checksum; - - /* mix in the block number to detect transposed pages */ - checksum ^= blkno; - - /* - * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of - * one. That avoids checksums of zero, which seems like a good idea. - */ - return (checksum % 65535) + 1; + ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); } diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index 41c8ae784de..f72b70de881 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -6,156 +6,18 @@ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * * IDENTIFICATION * src/backend/storage/page/checksum.c * *------------------------------------------------------------------------- - * - * Checksum algorithm - * - * The algorithm used to checksum pages is chosen for very fast calculation. - * Workloads where the database working set fits into OS file cache but not - * into shared buffers can read in pages at a very fast pace and the checksum - * algorithm itself can become the largest bottleneck. - * - * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand - * for Fowler/Noll/Vo) The primitive of a plain FNV-1a hash folds in data 1 - * byte at a time according to the formula: - * - * hash = (hash ^ value) * FNV_PRIME - * - * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ - * - * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of - * high bits - high order bits in input data only affect high order bits in - * output data. 
To resolve this we xor in the value prior to multiplication - * shifted right by 17 bits. The number 17 was chosen because it doesn't - * have common denominator with set bit positions in FNV_PRIME and empirically - * provides the fastest mixing for high order bits of final iterations quickly - * avalanche into lower positions. For performance reasons we choose to combine - * 4 bytes at a time. The actual hash formula used as the basis is: - * - * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) - * - * The main bottleneck in this calculation is the multiplication latency. To - * hide the latency and to make use of SIMD parallelism multiple hash values - * are calculated in parallel. The page is treated as a 32 column two - * dimensional array of 32 bit values. Each column is aggregated separately - * into a partial checksum. Each partial checksum uses a different initial - * value (offset basis in FNV terminology). The initial values actually used - * were chosen randomly, as the values themselves don't matter as much as that - * they are different and don't match anything in real data. After initializing - * partial checksums each value in the column is aggregated according to the - * above formula. Finally two more iterations of the formula are performed with - * value 0 to mix the bits of the last value added. - * - * The partial checksums are then folded together using xor to form a single - * 32-bit checksum. The caller can safely reduce the value to 16 bits - * using modulo 2^16-1. That will cause a very slight bias towards lower - * values but this is not significant for the performance of the - * checksum. - * - * The algorithm choice was based on what instructions are available in SIMD - * instruction sets. This meant that a fast and good algorithm needed to use - * multiplication as the main mixing operator. The simplest multiplication - * based checksum primitive is the one used by FNV. 
The prime used is chosen - * for good dispersion of values. It has no known simple patterns that result - * in collisions. Test of 5-bit differentials of the primitive over 64bit keys - * reveals no differentials with 3 or more values out of 100000 random keys - * colliding. Avalanche test shows that only high order bits of the last word - * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, - * overwriting page from random position to end with 0 bytes, and overwriting - * random segments of page with 0x00, 0xFF and random data all show optimal - * 2e-16 false positive rate within margin of error. - * - * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer - * multiplication instruction. As of 2013 the corresponding instruction is - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). - * Vectorization requires a compiler to do the vectorization for us. For recent - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough - * to achieve vectorization. - * - * The optimal amount of parallelism to use depends on CPU specific instruction - * latency, SIMD instruction width, throughput and the amount of registers - * available to hold intermediate state. Generally, more parallelism is better - * up to the point that state doesn't fit in registers and extra load-store - * instructions are needed to swap values in/out. The number chosen is a fixed - * part of the algorithm because changing the parallelism changes the checksum - * result. - * - * The parallelism number 32 was chosen based on the fact that it is the - * largest state that fits into architecturally visible x86 SSE registers while - * leaving some free registers for intermediate values. For future processors - * with 256bit vector registers this will leave some performance on the table. 
- * When vectorization is not available it might be beneficial to restructure - * the computation to calculate a subset of the columns at a time and perform - * multiple passes to avoid register spilling. This optimization opportunity - * is not used. Current coding also assumes that the compiler has the ability - * to unroll the inner loop to avoid loop overhead and minimize register - * spilling. For less sophisticated compilers it might be beneficial to manually - * unroll the inner loop. */ #include "postgres.h" #include "storage/checksum.h" -/* number of checksums to calculate in parallel */ -#define N_SUMS 32 -/* prime multiplier of FNV-1a hash */ -#define FNV_PRIME 16777619 - -/* - * Base offsets to initialize each of the parallel FNV hashes into a - * different initial state. - */ -static const uint32 checksumBaseOffsets[N_SUMS] = { - 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, - 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, - 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, - 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, - 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, - 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, - 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, - 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 -}; - /* - * Calculate one round of the checksum. + * The actual code is in storage/checksum_impl.h. This is done so that + * external programs can incorporate the checksum code by #include'ing + * that file from the exported Postgres headers. (Compare our CRC code.) 
*/ -#define CHECKSUM_COMP(checksum, value) do {\ - uint32 __tmp = (checksum) ^ (value);\ - (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17);\ -} while (0) - -uint32 -checksum_block(char *data, uint32 size) -{ - uint32 sums[N_SUMS]; - uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data; - uint32 result = 0; - int i, - j; - - /* ensure that the size is compatible with the algorithm */ - Assert((size % (sizeof(uint32) * N_SUMS)) == 0); - - /* initialize partial checksums to their corresponding offsets */ - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); - - /* main checksum calculation */ - for (i = 0; i < size / sizeof(uint32) / N_SUMS; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], dataArr[i][j]); - - /* finally add in two rounds of zeroes for additional mixing */ - for (i = 0; i < 2; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], 0); - - /* xor fold partial checksums together */ - for (i = 0; i < N_SUMS; i++) - result ^= sums[i]; - - return result; -} +#include "storage/checksum_impl.h" diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index e41fd9804b9..9077c218ffd 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -3,7 +3,6 @@ * checksum.h * Checksum implementation for data pages. * - * * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,10 +13,12 @@ #ifndef CHECKSUM_H #define CHECKSUM_H +#include "storage/block.h" + /* - * Fowler-Noll-Vo 1a block checksum algorithm. The data argument should be - * aligned on a 4-byte boundary. + * Compute the checksum for a Postgres page. The page must be aligned on a + * 4-byte boundary. 
*/ -extern uint32 checksum_block(char *data, uint32 size); +extern uint16 pg_checksum_page(char *page, BlockNumber blkno); #endif /* CHECKSUM_H */ diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h new file mode 100644 index 00000000000..ce1b124fa53 --- /dev/null +++ b/src/include/storage/checksum_impl.h @@ -0,0 +1,207 @@ +/*------------------------------------------------------------------------- + * + * checksum_impl.h + * Checksum implementation for data pages. + * + * This file exists for the benefit of external programs that may wish to + * check Postgres page checksums. They can #include this to get the code + * referenced by storage/checksum.h. (Note: you may need to redefine + * Assert() as empty to compile this successfully externally.) + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum_impl.h + * + *------------------------------------------------------------------------- + */ + +/* + * The algorithm used to checksum pages is chosen for very fast calculation. + * Workloads where the database working set fits into OS file cache but not + * into shared buffers can read in pages at a very fast pace and the checksum + * algorithm itself can become the largest bottleneck. + * + * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand + * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1 + * byte at a time according to the formula: + * + * hash = (hash ^ value) * FNV_PRIME + * + * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ + * + * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of + * high bits - high order bits in input data only affect high order bits in + * output data. To resolve this we xor in the value prior to multiplication + * shifted right by 17 bits. 
The number 17 was chosen because it doesn't + * have common denominator with set bit positions in FNV_PRIME and empirically + * provides the fastest mixing for high order bits of final iterations quickly + * avalanche into lower positions. For performance reasons we choose to combine + * 4 bytes at a time. The actual hash formula used as the basis is: + * + * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) + * + * The main bottleneck in this calculation is the multiplication latency. To + * hide the latency and to make use of SIMD parallelism multiple hash values + * are calculated in parallel. The page is treated as a 32 column two + * dimensional array of 32 bit values. Each column is aggregated separately + * into a partial checksum. Each partial checksum uses a different initial + * value (offset basis in FNV terminology). The initial values actually used + * were chosen randomly, as the values themselves don't matter as much as that + * they are different and don't match anything in real data. After initializing + * partial checksums each value in the column is aggregated according to the + * above formula. Finally two more iterations of the formula are performed with + * value 0 to mix the bits of the last value added. + * + * The partial checksums are then folded together using xor to form a single + * 32-bit checksum. The caller can safely reduce the value to 16 bits + * using modulo 2^16-1. That will cause a very slight bias towards lower + * values but this is not significant for the performance of the + * checksum. + * + * The algorithm choice was based on what instructions are available in SIMD + * instruction sets. This meant that a fast and good algorithm needed to use + * multiplication as the main mixing operator. The simplest multiplication + * based checksum primitive is the one used by FNV. The prime used is chosen + * for good dispersion of values. It has no known simple patterns that result + * in collisions. 
Test of 5-bit differentials of the primitive over 64bit keys + * reveals no differentials with 3 or more values out of 100000 random keys + * colliding. Avalanche test shows that only high order bits of the last word + * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, + * overwriting page from random position to end with 0 bytes, and overwriting + * random segments of page with 0x00, 0xFF and random data all show optimal + * 2e-16 false positive rate within margin of error. + * + * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer + * multiplication instruction. As of 2013 the corresponding instruction is + * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). + * Vectorization requires a compiler to do the vectorization for us. For recent + * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough + * to achieve vectorization. + * + * The optimal amount of parallelism to use depends on CPU specific instruction + * latency, SIMD instruction width, throughput and the amount of registers + * available to hold intermediate state. Generally, more parallelism is better + * up to the point that state doesn't fit in registers and extra load-store + * instructions are needed to swap values in/out. The number chosen is a fixed + * part of the algorithm because changing the parallelism changes the checksum + * result. + * + * The parallelism number 32 was chosen based on the fact that it is the + * largest state that fits into architecturally visible x86 SSE registers while + * leaving some free registers for intermediate values. For future processors + * with 256bit vector registers this will leave some performance on the table. + * When vectorization is not available it might be beneficial to restructure + * the computation to calculate a subset of the columns at a time and perform + * multiple passes to avoid register spilling. This optimization opportunity + * is not used. 
Current coding also assumes that the compiler has the ability + * to unroll the inner loop to avoid loop overhead and minimize register + * spilling. For less sophisticated compilers it might be beneficial to + * manually unroll the inner loop. + */ + +#include "storage/bufpage.h" + +/* number of checksums to calculate in parallel */ +#define N_SUMS 32 +/* prime multiplier of FNV-1a hash */ +#define FNV_PRIME 16777619 + +/* + * Base offsets to initialize each of the parallel FNV hashes into a + * different initial state. + */ +static const uint32 checksumBaseOffsets[N_SUMS] = { + 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, + 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, + 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, + 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, + 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, + 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, + 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, + 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 +}; + +/* + * Calculate one round of the checksum. + */ +#define CHECKSUM_COMP(checksum, value) \ +do { \ + uint32 __tmp = (checksum) ^ (value); \ + (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ +} while (0) + +/* + * Block checksum algorithm. The data argument must be aligned on a 4-byte + * boundary. 
+ */ +static uint32 +pg_checksum_block(char *data, uint32 size) +{ + uint32 sums[N_SUMS]; + uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data; + uint32 result = 0; + int i, + j; + + /* ensure that the size is compatible with the algorithm */ + Assert((size % (sizeof(uint32) * N_SUMS)) == 0); + + /* initialize partial checksums to their corresponding offsets */ + memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + + /* main checksum calculation */ + for (i = 0; i < size / sizeof(uint32) / N_SUMS; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], dataArr[i][j]); + + /* finally add in two rounds of zeroes for additional mixing */ + for (i = 0; i < 2; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + + /* xor fold partial checksums together */ + for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + + return result; +} + +/* + * Compute the checksum for a Postgres page. The page must be aligned on a + * 4-byte boundary. + * + * The checksum includes the block number (to detect the case where a page is + * somehow moved to a different location), the page header (excluding the + * checksum itself), and the page data. + */ +uint16 +pg_checksum_page(char *page, BlockNumber blkno) +{ + PageHeader phdr = (PageHeader) page; + uint16 save_checksum; + uint32 checksum; + + /* We only calculate the checksum for properly-initialized pages */ + Assert(!PageIsNew(page)); + + /* + * Save pd_checksum and temporarily set it to zero, so that the checksum + * calculation isn't affected by the old checksum stored on the page. + * Restore it after, because actually updating the checksum is NOT part of + * the API of this function. + */ + save_checksum = phdr->pd_checksum; + phdr->pd_checksum = 0; + checksum = pg_checksum_block(page, BLCKSZ); + phdr->pd_checksum = save_checksum; + + /* Mix in the block number to detect transposed pages */ + checksum ^= blkno; + + /* + * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of + * one. 
That avoids checksums of zero, which seems like a good idea. + */ + return (checksum % 65535) + 1; +} |