mirror of
https://github.com/monero-project/monero.git
synced 2025-01-18 07:33:40 +02:00
Add ARMv8-A AES support
More than twice as fast as plain C code. Note that both ARMv7 and ARMv8 can be further improved with better use of NEON. Also tweak ARMv7 multiplier
This commit is contained in:
parent
7c899ec33a
commit
69b59186f3
@ -37,6 +37,13 @@
|
||||
#include "hash-ops.h"
|
||||
#include "oaes_lib.h"
|
||||
|
||||
#define MEMORY (1 << 21) // 2MB scratchpad
|
||||
#define ITER (1 << 20)
|
||||
#define AES_BLOCK_SIZE 16
|
||||
#define AES_KEY_SIZE 32
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
|
||||
|
||||
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_WIN64))
|
||||
// Optimised code below, uses x86-specific intrinsics, SSE2, AES-NI
|
||||
// Fall back to more portable code is down at the bottom
|
||||
@ -77,12 +84,6 @@
|
||||
#define ASM __asm
|
||||
#endif
|
||||
|
||||
#define MEMORY (1 << 21) // 2MB scratchpad
|
||||
#define ITER (1 << 20)
|
||||
#define AES_BLOCK_SIZE 16
|
||||
#define AES_KEY_SIZE 32
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
|
||||
#define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE)
|
||||
|
||||
#define U64(x) ((uint64_t *) (x))
|
||||
@ -643,9 +644,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash)
|
||||
extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
|
||||
}
|
||||
|
||||
#elif defined(__arm__)
|
||||
// ND: Some minor optimizations for ARM7 (raspberrry pi 2), effect seems to be ~40-50% faster.
|
||||
// Needs more work.
|
||||
#elif defined(__arm__) || defined(__aarch64__)
|
||||
void slow_hash_allocate_state(void)
|
||||
{
|
||||
// Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c
|
||||
@ -658,13 +657,6 @@ void slow_hash_free_state(void)
|
||||
return;
|
||||
}
|
||||
|
||||
#define MEMORY (1 << 21) /* 2 MiB */
|
||||
#define ITER (1 << 20)
|
||||
#define AES_BLOCK_SIZE 16
|
||||
#define AES_KEY_SIZE 32 /*16*/
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#define RDATA_ALIGN16 __attribute__ ((aligned(16)))
|
||||
#define STATIC static
|
||||
@ -677,6 +669,276 @@ void slow_hash_free_state(void)
|
||||
|
||||
#define U64(x) ((uint64_t *) (x))
|
||||
|
||||
#pragma pack(push, 1)
|
||||
union cn_slow_hash_state
|
||||
{
|
||||
union hash_state hs;
|
||||
struct
|
||||
{
|
||||
uint8_t k[64];
|
||||
uint8_t init[INIT_SIZE_BYTE];
|
||||
};
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
|
||||
|
||||
/* ARMv8-A optimized with NEON and AES instructions.
|
||||
* Copied from the x86-64 AES-NI implementation. It has much the same
|
||||
* characteristics as x86-64: there's no 64x64=128 multiplier for vectors,
|
||||
* and moving between vector and regular registers stalls the pipeline.
|
||||
*/
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE)
|
||||
|
||||
#define state_index(x) (((*((uint64_t *)x) >> 4) & (TOTALBLOCKS - 1)) << 4)
|
||||
#define __mul() __asm__("mul %0, %1, %2\n\t" : "=r"(lo) : "r"(c[0]), "r"(b[0]) ); \
|
||||
__asm__("umulh %0, %1, %2\n\t" : "=r"(hi) : "r"(c[0]), "r"(b[0]) );
|
||||
|
||||
#define pre_aes() \
|
||||
j = state_index(a); \
|
||||
_c = vld1q_u8(&hp_state[j]); \
|
||||
_a = vld1q_u8((const uint8_t *)a); \
|
||||
|
||||
#define post_aes() \
|
||||
vst1q_u8((uint8_t *)c, _c); \
|
||||
_b = veorq_u8(_b, _c); \
|
||||
vst1q_u8(&hp_state[j], _b); \
|
||||
j = state_index(c); \
|
||||
p = U64(&hp_state[j]); \
|
||||
b[0] = p[0]; b[1] = p[1]; \
|
||||
__mul(); \
|
||||
a[0] += hi; a[1] += lo; \
|
||||
p = U64(&hp_state[j]); \
|
||||
p[0] = a[0]; p[1] = a[1]; \
|
||||
a[0] ^= b[0]; a[1] ^= b[1]; \
|
||||
_b = _c; \
|
||||
|
||||
|
||||
/* Note: this was based on a standard 256bit key schedule but
|
||||
* it's been shortened since Cryptonight doesn't use the full
|
||||
* key schedule. Don't try to use this for vanilla AES.
|
||||
*/
|
||||
static void aes_expand_key(const uint8_t *key, uint8_t *expandedKey) {
|
||||
__asm__("mov x2, %1\n\t" : : "r"(key), "r"(expandedKey));
|
||||
__asm__(
|
||||
" adr x3,Lrcon\n"
|
||||
"\n"
|
||||
" eor v0.16b,v0.16b,v0.16b\n"
|
||||
" ld1 {v3.16b},[x0],#16\n"
|
||||
" ld1 {v1.4s,v2.4s},[x3],#32\n"
|
||||
" b L256\n"
|
||||
".align 5\n"
|
||||
"Lrcon:\n"
|
||||
".long 0x01,0x01,0x01,0x01\n"
|
||||
".long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat\n"
|
||||
".long 0x1b,0x1b,0x1b,0x1b\n"
|
||||
"\n"
|
||||
".align 4\n"
|
||||
"L256:\n"
|
||||
" ld1 {v4.16b},[x0]\n"
|
||||
" mov w1,#5\n"
|
||||
" st1 {v3.4s},[x2],#16\n"
|
||||
"\n"
|
||||
"Loop256:\n"
|
||||
" tbl v6.16b,{v4.16b},v2.16b\n"
|
||||
" ext v5.16b,v0.16b,v3.16b,#12\n"
|
||||
" st1 {v4.4s},[x2],#16\n"
|
||||
" aese v6.16b,v0.16b\n"
|
||||
" subs w1,w1,#1\n"
|
||||
"\n"
|
||||
" eor v3.16b,v3.16b,v5.16b\n"
|
||||
" ext v5.16b,v0.16b,v5.16b,#12\n"
|
||||
" eor v3.16b,v3.16b,v5.16b\n"
|
||||
" ext v5.16b,v0.16b,v5.16b,#12\n"
|
||||
" eor v6.16b,v6.16b,v1.16b\n"
|
||||
" eor v3.16b,v3.16b,v5.16b\n"
|
||||
" shl v1.16b,v1.16b,#1\n"
|
||||
" eor v3.16b,v3.16b,v6.16b\n"
|
||||
" st1 {v3.4s},[x2],#16\n"
|
||||
" b.eq Ldone\n"
|
||||
"\n"
|
||||
" dup v6.4s,v3.s[3] // just splat\n"
|
||||
" ext v5.16b,v0.16b,v4.16b,#12\n"
|
||||
" aese v6.16b,v0.16b\n"
|
||||
"\n"
|
||||
" eor v4.16b,v4.16b,v5.16b\n"
|
||||
" ext v5.16b,v0.16b,v5.16b,#12\n"
|
||||
" eor v4.16b,v4.16b,v5.16b\n"
|
||||
" ext v5.16b,v0.16b,v5.16b,#12\n"
|
||||
" eor v4.16b,v4.16b,v5.16b\n"
|
||||
"\n"
|
||||
" eor v4.16b,v4.16b,v6.16b\n"
|
||||
" b Loop256\n"
|
||||
"\n"
|
||||
"Ldone:\n");
|
||||
}
|
||||
|
||||
/* An ordinary AES round is a sequence of SubBytes, ShiftRows, MixColumns, AddRoundKey. There
|
||||
* is also an InitialRound which consists solely of AddRoundKey. The ARM instructions slice
|
||||
* this sequence differently; the aese instruction performs AddRoundKey, SubBytes, ShiftRows.
|
||||
* The aesmc instruction does the MixColumns. Since the aese instruction moves the AddRoundKey
|
||||
* up front, and Cryptonight's hash skips the InitialRound step, we have to kludge it here by
|
||||
* feeding in a vector of zeros for our first step. Also we have to do our own Xor explicitly
|
||||
* at the last step, to provide the AddRoundKey that the ARM instructions omit.
|
||||
*/
|
||||
STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks)
|
||||
{
|
||||
const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0};
|
||||
uint8x16_t tmp;
|
||||
int i;
|
||||
|
||||
for (i=0; i<nblocks; i++)
|
||||
{
|
||||
uint8x16_t tmp = vld1q_u8(in + i * AES_BLOCK_SIZE);
|
||||
tmp = vaeseq_u8(tmp, zero);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[0]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[1]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[2]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[3]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[4]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[5]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[6]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[7]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[8]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = veorq_u8(tmp, k[9]);
|
||||
vst1q_u8(out + i * AES_BLOCK_SIZE, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
STATIC INLINE void aes_pseudo_round_xor(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, const uint8_t *xor, int nblocks)
|
||||
{
|
||||
const uint8x16_t *k = (const uint8x16_t *)expandedKey;
|
||||
const uint8x16_t *x = (const uint8x16_t *)xor;
|
||||
uint8x16_t tmp;
|
||||
int i;
|
||||
|
||||
for (i=0; i<nblocks; i++)
|
||||
{
|
||||
uint8x16_t tmp = vld1q_u8(in + i * AES_BLOCK_SIZE);
|
||||
tmp = vaeseq_u8(tmp, x[i]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[0]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[1]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[2]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[3]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[4]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[5]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[6]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[7]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = vaeseq_u8(tmp, k[8]);
|
||||
tmp = vaesmcq_u8(tmp);
|
||||
tmp = veorq_u8(tmp, k[9]);
|
||||
vst1q_u8(out + i * AES_BLOCK_SIZE, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void cn_slow_hash(const void *data, size_t length, char *hash)
|
||||
{
|
||||
RDATA_ALIGN16 uint8_t expandedKey[240];
|
||||
RDATA_ALIGN16 uint8_t hp_state[MEMORY];
|
||||
|
||||
uint8_t text[INIT_SIZE_BYTE];
|
||||
RDATA_ALIGN16 uint64_t a[2];
|
||||
RDATA_ALIGN16 uint64_t b[2];
|
||||
RDATA_ALIGN16 uint64_t c[2];
|
||||
union cn_slow_hash_state state;
|
||||
uint8x16_t _a, _b, _c, zero = {0};
|
||||
uint64_t hi, lo;
|
||||
|
||||
size_t i, j;
|
||||
uint64_t *p = NULL;
|
||||
|
||||
static void (*const extra_hashes[4])(const void *, size_t, char *) =
|
||||
{
|
||||
hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
|
||||
};
|
||||
|
||||
/* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */
|
||||
|
||||
hash_process(&state.hs, data, length);
|
||||
memcpy(text, state.init, INIT_SIZE_BYTE);
|
||||
|
||||
/* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill
|
||||
* the 2MB large random access buffer.
|
||||
*/
|
||||
|
||||
aes_expand_key(state.hs.b, expandedKey);
|
||||
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
||||
{
|
||||
aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
|
||||
memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
|
||||
}
|
||||
|
||||
U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
|
||||
U64(a)[1] = U64(&state.k[0])[1] ^ U64(&state.k[32])[1];
|
||||
U64(b)[0] = U64(&state.k[16])[0] ^ U64(&state.k[48])[0];
|
||||
U64(b)[1] = U64(&state.k[16])[1] ^ U64(&state.k[48])[1];
|
||||
|
||||
/* CryptoNight Step 3: Bounce randomly 1 million times through the mixing buffer,
|
||||
* using 500,000 iterations of the following mixing function. Each execution
|
||||
* performs two reads and writes from the mixing buffer.
|
||||
*/
|
||||
|
||||
_b = vld1q_u8((const uint8_t *)b);
|
||||
|
||||
|
||||
for(i = 0; i < ITER / 2; i++)
|
||||
{
|
||||
pre_aes();
|
||||
_c = vaeseq_u8(_c, zero);
|
||||
_c = vaesmcq_u8(_c);
|
||||
_c = veorq_u8(_c, _a);
|
||||
post_aes();
|
||||
}
|
||||
|
||||
/* CryptoNight Step 4: Sequentially pass through the mixing buffer and use 10 rounds
|
||||
* of AES encryption to mix the random data back into the 'text' buffer. 'text'
|
||||
* was originally created with the output of Keccak1600. */
|
||||
|
||||
memcpy(text, state.init, INIT_SIZE_BYTE);
|
||||
|
||||
aes_expand_key(&state.hs.b[32], expandedKey);
|
||||
for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
|
||||
{
|
||||
// add the xor to the pseudo round
|
||||
aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
|
||||
}
|
||||
|
||||
/* CryptoNight Step 5: Apply Keccak to the state again, and then
|
||||
* use the resulting data to select which of four finalizer
|
||||
* hash functions to apply to the data (Blake, Groestl, JH, or Skein).
|
||||
* Use this hash to squeeze the state array down
|
||||
* to the final 256 bit hash output.
|
||||
*/
|
||||
|
||||
memcpy(state.init, text, INIT_SIZE_BYTE);
|
||||
hash_permutation(&state.hs);
|
||||
extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
|
||||
}
|
||||
#else /* aarch64 && crypto */
|
||||
|
||||
// ND: Some minor optimizations for ARMv7 (raspberrry pi 2), effect seems to be ~40-50% faster.
|
||||
// Needs more work.
|
||||
#include "aesb.c"
|
||||
|
||||
#ifdef NO_OPTIMIZED_MULTIPLY_ON_ARM
|
||||
@ -714,13 +976,21 @@ void mul(const uint8_t *ca, const uint8_t *cb, uint8_t *cres) {
|
||||
}
|
||||
#else // !NO_OPTIMIZED_MULTIPLY_ON_ARM
|
||||
|
||||
/* Can work as inline, but actually runs slower. Keep it separate */
|
||||
#define mul(a, b, c) cn_mul128(a, b, c)
|
||||
void mul(const uint8_t *ca, const uint8_t *cb, uint8_t *cr)
|
||||
#ifdef __aarch64__ /* ARM64, no crypto */
|
||||
#define mul(a, b, c) cn_mul128((const uint64_t *)a, (const uint64_t *)b, (uint64_t *)c)
|
||||
STATIC void cn_mul128(const uint64_t *a, const uint64_t *b, uint64_t *r)
|
||||
{
|
||||
uint64_t lo, hi;
|
||||
__asm__("mul %0, %1, %2\n\t" : "=r"(lo) : "r"(a[0]), "r"(b[0]) );
|
||||
__asm__("umulh %0, %1, %2\n\t" : "=r"(hi) : "r"(a[0]), "r"(b[0]) );
|
||||
r[0] = hi;
|
||||
r[1] = lo;
|
||||
}
|
||||
#else /* ARM32 */
|
||||
/* Can work as inline, but actually runs slower. Keep it separate */
|
||||
#define mul(a, b, c) cn_mul128((const uint32_t *)a, (const uint32_t *)b, (uint32_t *)c)
|
||||
void cn_mul128(const uint32_t *aa, const uint32_t *bb, uint32_t *r)
|
||||
{
|
||||
const uint32_t *aa = (uint32_t *)ca;
|
||||
const uint32_t *bb = (uint32_t *)cb;
|
||||
uint32_t *r = (uint32_t *)cr;
|
||||
uint32_t t0, t1;
|
||||
__asm__ __volatile__(
|
||||
"umull %[t0], %[t1], %[a], %[b]\n\t"
|
||||
@ -743,10 +1013,11 @@ __asm__ __volatile__(
|
||||
|
||||
"str %[t0], [%[r]]\n\t"
|
||||
"str %[t1], [%[r], #4]\n\t"
|
||||
: [t0]"=&r"(t0), [t1]"=&r"(t1)
|
||||
: [t0]"=&r"(t0), [t1]"=&r"(t1), "=m"(r[0]), "=m"(r[1]), "=m"(r[2]), "=m"(r[3])
|
||||
: [A]"r"(aa[1]), [a]"r"(aa[0]), [B]"r"(bb[1]), [b]"r"(bb[0]), [r]"r"(r)
|
||||
: "cc", "memory");
|
||||
: "cc");
|
||||
}
|
||||
#endif /* !aarch64 */
|
||||
#endif // NO_OPTIMIZED_MULTIPLY_ON_ARM
|
||||
|
||||
STATIC INLINE void sum_half_blocks(uint8_t* a, const uint8_t* b)
|
||||
@ -779,18 +1050,6 @@ STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b)
|
||||
U64(a)[1] ^= U64(b)[1];
|
||||
}
|
||||
|
||||
#pragma pack(push, 1)
|
||||
union cn_slow_hash_state
|
||||
{
|
||||
union hash_state hs;
|
||||
struct
|
||||
{
|
||||
uint8_t k[64];
|
||||
uint8_t init[INIT_SIZE_BYTE];
|
||||
};
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
void cn_slow_hash(const void *data, size_t length, char *hash)
|
||||
{
|
||||
uint8_t long_state[MEMORY];
|
||||
@ -871,6 +1130,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash)
|
||||
hash_permutation(&state.hs);
|
||||
extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
|
||||
}
|
||||
#endif /* !aarch64 || !crypto */
|
||||
|
||||
#else
|
||||
// Portable implementation as a fallback
|
||||
@ -891,13 +1151,6 @@ static void (*const extra_hashes[4])(const void *, size_t, char *) = {
|
||||
hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
|
||||
};
|
||||
|
||||
#define MEMORY (1 << 21) /* 2 MiB */
|
||||
#define ITER (1 << 20)
|
||||
#define AES_BLOCK_SIZE 16
|
||||
#define AES_KEY_SIZE 32 /*16*/
|
||||
#define INIT_SIZE_BLK 8
|
||||
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
|
||||
|
||||
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
|
||||
extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user