aboutsummaryrefslogtreecommitdiff
path: root/config
diff options
context:
space:
mode:
author Nathan Bossart <nathan@postgresql.org> 2025-03-28 16:20:20 -0500
committer Nathan Bossart <nathan@postgresql.org> 2025-03-28 16:20:20 -0500
commit 519338ace410d9b1ffb13176b8802b0307ff0531 (patch)
tree cef689c0b92e9678b1b5cf0110b0ba3a37c8ebe0 /config
parent 3c8e463b0d885e0d976f6a13a1fb78187b25c86f (diff)
download postgresql-519338ace410d9b1ffb13176b8802b0307ff0531.tar.gz
postgresql-519338ace410d9b1ffb13176b8802b0307ff0531.zip
Optimize popcount functions with ARM SVE intrinsics.
This commit introduces SVE implementations of pg_popcount{32,64}. Unlike the Neon versions, we need an additional configure-time check to determine if the compiler supports SVE intrinsics, and we need a runtime check to determine if the current CPU supports SVE instructions. Our testing showed that the SVE implementations are much faster for larger inputs and are comparable to the status quo for smaller inputs.

Author: "Devanga.Susmitha@fujitsu.com" <Devanga.Susmitha@fujitsu.com>
Co-authored-by: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com>
Co-authored-by: "Malladi, Rama" <ramamalladi@hotmail.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
Discussion: https://postgr.es/m/OSZPR01MB84990A9A02A3515C6E85A65B8B2A2%40OSZPR01MB8499.jpnprd01.prod.outlook.com
Diffstat (limited to 'config')
-rw-r--r-- config/c-compiler.m4 52
1 files changed, 52 insertions, 0 deletions
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3712e81e38c..e9e54470e66 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -708,3 +708,55 @@ if test x"$Ac_cachevar" = x"yes"; then
fi
undefine([Ac_cachevar])dnl
])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_SVE_POPCNT_INTRINSICS
+# --------------------------
+# Check if the compiler can build (compile and link; the test program is
+# never run here) code using the SVE popcount intrinsics svptrue_b64,
+# svdup_u64, svcntb, svld1_u64, svld1_u8, svadd_u64_x, svcnt_u64_x,
+# svcnt_u8_x, svaddv_u64, svaddv_u8, svwhilelt_b8_s32, svand_n_u64_x, and
+# svand_n_u8_x.  The result is cached in pgac_cv_sve_popcnt_intrinsics.
+# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics=yes.
+AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
+
+ char buf[128]; /* source buffer for the vector loads below */
+
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("arch=armv8-a+sve"))) /* request SVE code generation for popcount_test only */
+ #endif
+ static int popcount_test(void)
+ {
+ svbool_t pred = svptrue_b64();
+ svuint8_t vec8;
+ svuint64_t accum1 = svdup_u64(0),
+ accum2 = svdup_u64(0),
+ vec64;
+ char *p = buf;
+ uint64_t popcnt,
+ mask = 0x5555555555555555;
+
+ vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
+ accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
+ p += svcntb(); /* step past the vector just loaded */
+
+ vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
+ accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
+ p += svcntb();
+
+ popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2)); /* reduce both accumulators to a scalar */
+
+ pred = svwhilelt_b8_s32(0, sizeof(buf)); /* byte-granular predicate for the 8-bit intrinsics */
+ vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
+ return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
+ }]],
+ [return popcount_test();])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+ pgac_sve_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_SVE_POPCNT_INTRINSICS