Skip to content

Commit 76bd236

Browse files
committed
crypto: implement naive ARM AES backend for simple rounds
1 parent f2ececc commit 76bd236

File tree

5 files changed

+156
-2
lines changed

5 files changed

+156
-2
lines changed

configure.ac

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ dnl https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111843. To work around that, se
535535
dnl -fstack-reuse=none for all gcc builds. (Only gcc understands this flag)
536536
AX_CHECK_COMPILE_FLAG([-fstack-reuse=none], [CORE_CXXFLAGS="$CORE_CXXFLAGS -fstack-reuse=none"])
537537

538+
enable_arm_aes=no
538539
enable_arm_crc=no
539540
enable_arm_shani=no
540541
enable_ssse3=no
@@ -677,7 +678,22 @@ CXXFLAGS="$TEMP_CXXFLAGS"
677678

678679
# ARM
679680
AX_CHECK_COMPILE_FLAG([-march=armv8-a+crc+crypto], [ARM_CRC_CXXFLAGS="-march=armv8-a+crc+crypto"], [], [$CXXFLAG_WERROR])
680-
AX_CHECK_COMPILE_FLAG([-march=armv8-a+crypto], [ARM_SHANI_CXXFLAGS="-march=armv8-a+crypto"], [], [$CXXFLAG_WERROR])
681+
AX_CHECK_COMPILE_FLAG([-march=armv8-a+crypto], [ARM_AES_CXXFLAGS="-march=armv8-a+crypto"; ARM_SHANI_CXXFLAGS="-march=armv8-a+crypto"], [], [$CXXFLAG_WERROR])
682+
683+
TEMP_CXXFLAGS="$CXXFLAGS"
684+
CXXFLAGS="$ARM_AES_CXXFLAGS $CXXFLAGS"
685+
AC_MSG_CHECKING([for ARMv8 AES intrinsics])
686+
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
687+
#include <arm_neon.h>
688+
]],[[
689+
uint8x16_t a, b;
690+
vaesmcq_u8(vaeseq_u8(a, b));
691+
return 0;
692+
]])],
693+
[ AC_MSG_RESULT([yes]); enable_arm_aes=yes; AC_DEFINE([ENABLE_ARM_AES], [1], [Define this symbol to build code that uses ARMv8 AES intrinsics]) ],
694+
[ AC_MSG_RESULT([no])]
695+
)
696+
CXXFLAGS="$TEMP_CXXFLAGS"
681697

682698
TEMP_CXXFLAGS="$CXXFLAGS"
683699
CXXFLAGS="$ARM_CRC_CXXFLAGS $CXXFLAGS"
@@ -1861,6 +1877,7 @@ AM_CONDITIONAL([ENABLE_SSE41], [test "$enable_sse41" = "yes"])
18611877
AM_CONDITIONAL([ENABLE_AVX2], [test "$enable_avx2" = "yes"])
18621878
AM_CONDITIONAL([ENABLE_X86_AESNI], [test "$enable_x86_aesni" = "yes"])
18631879
AM_CONDITIONAL([ENABLE_X86_SHANI], [test "$enable_x86_shani" = "yes"])
1880+
AM_CONDITIONAL([ENABLE_ARM_AES], [test "$enable_arm_aes" = "yes"])
18641881
AM_CONDITIONAL([ENABLE_ARM_CRC], [test "$enable_arm_crc" = "yes"])
18651882
AM_CONDITIONAL([ENABLE_ARM_SHANI], [test "$enable_arm_shani" = "yes"])
18661883
AM_CONDITIONAL([WORDS_BIGENDIAN], [test "$ac_cv_c_bigendian" = "yes"])
@@ -1922,6 +1939,7 @@ AC_SUBST(CLMUL_CXXFLAGS)
19221939
AC_SUBST(AVX2_CXXFLAGS)
19231940
AC_SUBST(X86_AESNI_CXXFLAGS)
19241941
AC_SUBST(X86_SHANI_CXXFLAGS)
1942+
AC_SUBST(ARM_AES_CXXFLAGS)
19251943
AC_SUBST(ARM_CRC_CXXFLAGS)
19261944
AC_SUBST(ARM_SHANI_CXXFLAGS)
19271945
AC_SUBST(LIBTOOL_APP_LDFLAGS)

src/Makefile.am

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ if ENABLE_AVX2
9595
LIBBITCOIN_CRYPTO_AVX2 = crypto/libbitcoin_crypto_avx2.la
9696
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_AVX2)
9797
endif
98+
if ENABLE_ARM_AES
99+
LIBBITCOIN_CRYPTO_ARM_AES = crypto/libbitcoin_crypto_arm_aes.la
100+
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_AES)
101+
endif
98102
if ENABLE_ARM_SHANI
99103
LIBBITCOIN_CRYPTO_ARM_SHANI = crypto/libbitcoin_crypto_arm_shani.la
100104
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_SHANI)
@@ -773,6 +777,16 @@ crypto_libbitcoin_crypto_sph_la_SOURCES = \
773777
crypto/x11/util/consts_aes.hpp \
774778
crypto/x11/util/util.hpp
775779

780+
# See explanation for -static in crypto_libbitcoin_crypto_base_la's LDFLAGS and
781+
# CXXFLAGS above
782+
crypto_libbitcoin_crypto_arm_aes_la_LDFLAGS = $(AM_LDFLAGS) -static
783+
crypto_libbitcoin_crypto_arm_aes_la_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) -static
784+
crypto_libbitcoin_crypto_arm_aes_la_CPPFLAGS = $(AM_CPPFLAGS)
785+
crypto_libbitcoin_crypto_arm_aes_la_CXXFLAGS += $(ARM_AES_CXXFLAGS)
786+
crypto_libbitcoin_crypto_arm_aes_la_CPPFLAGS += -DENABLE_ARM_AES
787+
crypto_libbitcoin_crypto_arm_aes_la_SOURCES = \
788+
crypto/x11/arm_crypto/aes.cpp
789+
776790
# See explanation for -static in crypto_libbitcoin_crypto_base_la's LDFLAGS and
777791
# CXXFLAGS above
778792
crypto_libbitcoin_crypto_ssse3_la_LDFLAGS = $(AM_LDFLAGS) -static

src/crypto/x11/arm_crypto/aes.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
// Copyright (c) 2025 The Dash Core developers
2+
// Distributed under the MIT software license, see the accompanying
3+
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4+
5+
#if defined(ENABLE_ARM_AES)
6+
#include <crypto/x11/util/util.hpp>
7+
8+
#include <cstdint>
9+
10+
#include <arm_neon.h>
11+
12+
namespace sapphire {
13+
namespace arm_crypto_aes {
14+
void Round(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
15+
uint32_t k0, uint32_t k1, uint32_t k2, uint32_t k3,
16+
uint32_t& y0, uint32_t& y1, uint32_t& y2, uint32_t& y3)
17+
{
18+
uint8x16_t block = util::aes_round(util::pack_le(x0, x1, x2, x3), util::pack_le(k0, k1, k2, k3));
19+
util::unpack_le(block, y0, y1, y2, y3);
20+
}
21+
22+
void RoundKeyless(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
23+
uint32_t& y0, uint32_t& y1, uint32_t& y2, uint32_t& y3)
24+
{
25+
uint8x16_t block = util::aes_round_nk(util::pack_le(x0, x1, x2, x3));
26+
util::unpack_le(block, y0, y1, y2, y3);
27+
}
28+
} // namespace arm_crypto_aes
29+
} // namespace sapphire
30+
31+
#endif // ENABLE_ARM_AES

src/crypto/x11/dispatch.cpp

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,39 @@
1010

1111
#if !defined(DISABLE_OPTIMIZED_SHA256)
1212
#include <compat/cpuid.h>
13+
14+
#if defined(ENABLE_ARM_AES)
15+
#if defined(__APPLE__)
16+
#include <sys/sysctl.h>
17+
#include <sys/types.h>
18+
#endif // __APPLE__
19+
20+
#if defined(__linux__)
21+
#include <asm/hwcap.h>
22+
#include <sys/auxv.h>
23+
#endif // __linux__
24+
25+
#if defined(_WIN32)
26+
#include <processthreadsapi.h>
27+
#include <winnt.h>
28+
#endif // _WIN32
29+
#endif // ENABLE_ARM_AES
1330
#endif // !DISABLE_OPTIMIZED_SHA256
1431

15-
#include <cstdint>
32+
#include <cstddef>
1633

1734
namespace sapphire {
1835
#if !defined(DISABLE_OPTIMIZED_SHA256)
36+
#if defined(ENABLE_ARM_AES)
37+
namespace arm_crypto_aes {
38+
void Round(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
39+
uint32_t k0, uint32_t k1, uint32_t k2, uint32_t k3,
40+
uint32_t& y0, uint32_t& y1, uint32_t& y2, uint32_t& y3);
41+
void RoundKeyless(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
42+
uint32_t& y0, uint32_t& y1, uint32_t& y2, uint32_t& y3);
43+
} // namespace arm_crypto_aes
44+
#endif // ENABLE_ARM_AES
45+
1946
#if defined(ENABLE_SSSE3)
2047
namespace ssse3_echo {
2148
void ShiftAndMix(uint64_t W[16][2]);
@@ -92,5 +119,34 @@ void SapphireAutoDetect()
92119
}
93120
#endif // ENABLE_SSSE3
94121
#endif // HAVE_GETCPUID
122+
123+
#if defined(ENABLE_ARM_AES)
124+
bool have_arm_aes = false;
125+
#if defined(__APPLE__)
126+
int val = 0;
127+
size_t len = sizeof(val);
128+
if (::sysctlbyname("hw.optional.arm.FEAT_AES", &val, &len, nullptr, 0) == 0) {
129+
have_arm_aes = val != 0;
130+
}
131+
#endif // __APPLE__
132+
133+
#if defined(__linux__)
134+
#if defined(__arm__)
135+
have_arm_aes = (::getauxval(AT_HWCAP2) & HWCAP2_AES);
136+
#endif // __arm__
137+
#if defined(__aarch64__)
138+
have_arm_aes = (::getauxval(AT_HWCAP) & HWCAP_AES);
139+
#endif // __aarch64__
140+
#endif // __linux__
141+
142+
#if defined(_WIN32)
143+
have_arm_aes = ::IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
144+
#endif // _WIN32
145+
146+
if (have_arm_aes) {
147+
aes_round = sapphire::arm_crypto_aes::Round;
148+
aes_round_nk = sapphire::arm_crypto_aes::RoundKeyless;
149+
}
150+
#endif // ENABLE_ARM_AES
95151
#endif // !DISABLE_OPTIMIZED_SHA256
96152
}

src/crypto/x11/util/util.hpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
#if !defined(DISABLE_OPTIMIZED_SHA256)
1111
#include <attributes.h>
1212

13+
#if defined(ENABLE_ARM_AES)
14+
#include <arm_neon.h>
15+
#endif // ENABLE_ARM_AES
16+
1317
#if defined(ENABLE_SSSE3) || (defined(ENABLE_SSE41) && defined(ENABLE_X86_AESNI))
1418
#include <immintrin.h>
1519
#endif // ENABLE_SSSE3 || (ENABLE_SSE41 && ENABLE_X86_AESNI)
@@ -26,6 +30,37 @@ constexpr inline uint32_t pack_le(uint8_t b3, uint8_t b2, uint8_t b1, uint8_t b0
2630
}
2731

2832
#if !defined(DISABLE_OPTIMIZED_SHA256)
33+
#if defined(ENABLE_ARM_AES)
34+
uint8x16_t ALWAYS_INLINE Xor(const uint8x16_t& x, const uint8x16_t& y) { return veorq_u8(x, y); }
35+
36+
uint8x16_t ALWAYS_INLINE pack_le(const uint32_t& w0, const uint32_t& w1, const uint32_t& w2, const uint32_t& w3)
37+
{
38+
return vreinterpretq_u8_u32(uint32x4_t{w0, w1, w2, w3});
39+
}
40+
41+
void ALWAYS_INLINE unpack_le(const uint8x16_t& i, uint32_t& w0, uint32_t& w1, uint32_t& w2, uint32_t& w3)
42+
{
43+
const uint32x4_t r = vreinterpretq_u32_u8(i);
44+
w0 = vgetq_lane_u32(r, 0);
45+
w1 = vgetq_lane_u32(r, 1);
46+
w2 = vgetq_lane_u32(r, 2);
47+
w3 = vgetq_lane_u32(r, 3);
48+
}
49+
50+
uint8x16_t ALWAYS_INLINE aes_round(const uint8x16_t& input, const uint8x16_t& key)
51+
{
52+
// See "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase for _mm_aesenc_si128
53+
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
54+
return Xor(vaesmcq_u8(vaeseq_u8(input, vmovq_n_u8(0))), key);
55+
}
56+
57+
uint8x16_t ALWAYS_INLINE aes_round_nk(const uint8x16_t& input)
58+
{
59+
// We can skip the XOR when we don't have a key
60+
return vaesmcq_u8(vaeseq_u8(input, vmovq_n_u8(0)));
61+
}
62+
#endif // ENABLE_ARM_AES
63+
2964
#if defined(ENABLE_SSSE3) || (defined(ENABLE_SSE41) && defined(ENABLE_X86_AESNI))
3065
__m128i ALWAYS_INLINE Xor(const __m128i& x, const __m128i& y) { return _mm_xor_si128(x, y); }
3166

0 commit comments

Comments
 (0)