diff options
author | Nathan Bossart <nathan@postgresql.org> | 2025-03-28 14:49:35 -0500 |
---|---|---|
committer | Nathan Bossart <nathan@postgresql.org> | 2025-03-28 14:49:35 -0500 |
commit | 6be53c27673a5fca64a00a684c36c29db6ca33a5 (patch) | |
tree | 6631906bc69ffa8ec6404be8851d37b664e225f8 /src/port/pg_bitutils.c | |
parent | 51a0382e8d8793b5cc89b69285e5ecdffe03c2bf (diff) | |
download | postgresql-6be53c27673a5fca64a00a684c36c29db6ca33a5.tar.gz postgresql-6be53c27673a5fca64a00a684c36c29db6ca33a5.zip |
Optimize popcount functions with ARM Neon intrinsics.

This commit introduces Neon implementations of pg_popcount{32,64},
pg_popcount(), and pg_popcount_masked(). As in simd.h, we assume
that all available AArch64 hardware supports Neon, so we don't need
any new configure-time or runtime checks. Some compilers already
emit Neon instructions for these functions, but our hand-rolled
implementations for pg_popcount() and pg_popcount_masked()
performed better in testing, likely due to better instruction-level
parallelism.
Author: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
Diffstat (limited to 'src/port/pg_bitutils.c')
-rw-r--r-- | src/port/pg_bitutils.c | 22 |
1 files changed, 16 insertions, 6 deletions
```diff
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 82be40e2fb4..61c7388f474 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
 static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+#endif
 
 #ifdef TRY_POPCNT_X86_64
 static bool pg_popcount_available(void);
@@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
 
 #endif /* TRY_POPCNT_X86_64 */
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 
 /*
  * pg_popcount32_slow
@@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
 	return popcnt;
 }
 
-#ifndef TRY_POPCNT_X86_64
+#endif /* ! POPCNT_AARCH64 */
+
+#if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
 
 /*
- * When the POPCNT instruction is not available, there's no point in using
+ * When special CPU instructions are not available, there's no point in using
  * function pointers to vary the implementation between the fast and slow
- * method. We instead just make these actual external functions when
- * TRY_POPCNT_X86_64 is not defined. The compiler should be able to inline
- * the slow versions here.
+ * method. We instead just make these actual external functions. The compiler
+ * should be able to inline the slow versions here.
  */
 int
 pg_popcount32(uint32 word)
@@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
 	return pg_popcount_masked_slow(buf, bytes, mask);
 }
 
-#endif /* !TRY_POPCNT_X86_64 */
+#endif /* ! TRY_POPCNT_X86_64 && ! POPCNT_AARCH64 */
```