aboutsummaryrefslogtreecommitdiff
path: root/src/port/pg_bitutils.c
diff options
context:
space:
mode:
author: Nathan Bossart <nathan@postgresql.org> 2025-03-28 14:49:35 -0500
committer: Nathan Bossart <nathan@postgresql.org> 2025-03-28 14:49:35 -0500
commit: 6be53c27673a5fca64a00a684c36c29db6ca33a5 (patch)
tree: 6631906bc69ffa8ec6404be8851d37b664e225f8 /src/port/pg_bitutils.c
parent: 51a0382e8d8793b5cc89b69285e5ecdffe03c2bf (diff)
download: postgresql-6be53c27673a5fca64a00a684c36c29db6ca33a5.tar.gz
postgresql-6be53c27673a5fca64a00a684c36c29db6ca33a5.zip
Optimize popcount functions with ARM Neon intrinsics.
This commit introduces Neon implementations of pg_popcount{32,64}, pg_popcount(), and pg_popcount_masked(). As in simd.h, we assume that all available AArch64 hardware supports Neon, so we don't need any new configure-time or runtime checks. Some compilers already emit Neon instructions for these functions, but our hand-rolled implementations for pg_popcount() and pg_popcount_masked() performed better in testing, likely due to better instruction-level parallelism.

Author: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
Diffstat (limited to 'src/port/pg_bitutils.c')
-rw-r--r-- src/port/pg_bitutils.c 22
1 files changed, 16 insertions, 6 deletions
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 82be40e2fb4..61c7388f474 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
static inline int pg_popcount32_slow(uint32 word);
static inline int pg_popcount64_slow(uint64 word);
static uint64 pg_popcount_slow(const char *buf, int bytes);
static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+#endif
#ifdef TRY_POPCNT_X86_64
static bool pg_popcount_available(void);
@@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
#endif /* TRY_POPCNT_X86_64 */
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
/*
* pg_popcount32_slow
@@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
return popcnt;
}
-#ifndef TRY_POPCNT_X86_64
+#endif /* ! POPCNT_AARCH64 */
+
+#if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
/*
- * When the POPCNT instruction is not available, there's no point in using
+ * When special CPU instructions are not available, there's no point in using
* function pointers to vary the implementation between the fast and slow
- * method. We instead just make these actual external functions when
- * TRY_POPCNT_X86_64 is not defined. The compiler should be able to inline
- * the slow versions here.
+ * method. We instead just make these actual external functions. The compiler
+ * should be able to inline the slow versions here.
*/
int
pg_popcount32(uint32 word)
@@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
return pg_popcount_masked_slow(buf, bytes, mask);
}
-#endif /* !TRY_POPCNT_X86_64 */
+#endif /* ! TRY_POPCNT_X86_64 && ! POPCNT_AARCH64 */