/*-------------------------------------------------------------------------
 *
 * pg_popcount_aarch64.c
 *	  Holds the AArch64 popcount implementations.
 *
 * Copyright (c) 2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/port/pg_popcount_aarch64.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#include "port/pg_bitutils.h"

#ifdef POPCNT_AARCH64

#include <arm_neon.h>

#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
#include <arm_sve.h>

#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>
#endif
#endif

/*
 * The Neon versions are built regardless of whether we are building the SVE
 * versions.
 */
static uint64 pg_popcount_neon(const char *buf, int bytes);
static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);

#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK

/*
 * These are the SVE implementations of the popcount functions.
 */
static uint64 pg_popcount_sve(const char *buf, int bytes);
static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);

/*
 * The function pointers are initially set to "choose" functions.  These
 * functions will first set the pointers to the right implementations (based
 * on what the current CPU supports) and then will call the pointer to
 * fulfill the caller's request.
 */
static uint64 pg_popcount_choose(const char *buf, int bytes);
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;

static inline bool
pg_popcount_sve_available(void)
{
#ifdef HAVE_ELF_AUX_INFO
	unsigned long value;

	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
		(value & HWCAP_SVE) != 0;
#elif defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
#else
	return false;
#endif
}

static inline void
choose_popcount_functions(void)
{
	if (pg_popcount_sve_available())
	{
		pg_popcount_optimized = pg_popcount_sve;
		pg_popcount_masked_optimized = pg_popcount_masked_sve;
	}
	else
	{
		pg_popcount_optimized = pg_popcount_neon;
		pg_popcount_masked_optimized = pg_popcount_masked_neon;
	}
}

static uint64
pg_popcount_choose(const char *buf, int bytes)
{
	choose_popcount_functions();
	return pg_popcount_optimized(buf, bytes);
}

static uint64
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
{
	choose_popcount_functions();
	return pg_popcount_masked_optimized(buf, bytes, mask);
}

/*
 * pg_popcount_sve
 *		Returns number of 1 bits in buf
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_sve(const char *buf, int bytes)
{
	svbool_t	pred = svptrue_b64();
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;
	uint64		popcnt = 0;
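
	/*
	 * Note: svcntb() returns the SVE vector length in bytes, so vec_len is
	 * only known at run time.  svcnt_u64_x() produces a per-lane popcount
	 * (at most 64 per 64-bit lane), and the counts are kept in 64-bit
	 * accumulator lanes, which cannot overflow for any "bytes" that fits in
	 * an int.
	 */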

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data.
	 */
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svld1_u8(pred, (const uint8 *) buf);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}

/*
 * pg_popcount_masked_sve
 *		Returns number of 1 bits in buf after applying the mask to each byte
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
{
	svbool_t	pred = svptrue_b64();
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask; /* mask in each byte */

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf),
							mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf),
							mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf),
							mask64);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf),
							mask64);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf),
							mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf),
							mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data.
	 */
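	/*
	 * svwhilelt_b8_s32(0, bytes) builds a predicate whose first
	 * Min(bytes, vec_len) lanes are active, so the final partial vector can
	 * be loaded without reading past the end of the buffer; svld1 zeroes the
	 * inactive lanes.
	 */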
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}

#else							/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */

/*
 * When the SVE version isn't available, there's no point in using function
 * pointers to vary the implementation.  We instead just make these actual
 * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
 * The compiler should be able to inline the Neon versions here.
 */
uint64
pg_popcount_optimized(const char *buf, int bytes)
{
	return pg_popcount_neon(buf, bytes);
}

uint64
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
{
	return pg_popcount_masked_neon(buf, bytes, mask);
}

#endif							/* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */

/*
 * pg_popcount32
 *		Return number of 1 bits in word
 */
int
pg_popcount32(uint32 word)
{
	return pg_popcount64((uint64) word);
}

/*
 * pg_popcount64
 *		Return number of 1 bits in word
 */
int
pg_popcount64(uint64 word)
{
	/*
	 * For some compilers, __builtin_popcountl() already emits Neon
	 * instructions.  The line below should compile to the same code on
	 * those systems.
	 */
	return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
}

/*
 * pg_popcount_neon
 *		Returns number of 1 bits in buf
 */
static uint64
pg_popcount_neon(const char *buf, int bytes)
{
	uint8x16_t	vec;
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);
	uint64		popcnt = 0;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.  vcntq_u8() computes a per-byte
	 * popcount, and the pairwise widening adds (vpaddlq_u8, vpaddlq_u16,
	 * vpadalq_u32) fold those byte counts into the 64-bit accumulator lanes.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64(*((uint64 *) buf));
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
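	/*
	 * pg_number_of_ones (from port/pg_bitutils.h) is a 256-entry lookup
	 * table mapping each possible byte value to its popcount.
	 */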
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++];

	return popcnt;
}

/*
 * pg_popcount_masked_neon
 *		Returns number of 1 bits in buf after applying the mask to each byte
 */
static uint64
pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
{
	uint8x16_t	vec,
				maskv = vdupq_n_u8(mask);
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask; /* mask in each byte */

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];

	return popcnt;
}

#endif							/* POPCNT_AARCH64 */
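
/*
 * Note: callers are not expected to invoke pg_popcount_optimized and
 * pg_popcount_masked_optimized directly; they normally go through the
 * pg_popcount() and pg_popcount_masked() wrappers in port/pg_bitutils.h,
 * which handle very small inputs with a simple byte loop before dispatching
 * here.
 */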