forked from wolfSSL/wolfssl
ARMv8 AES: remove extra memcpy during encrypt/decrypt
This commit is contained in:
@ -347,12 +347,8 @@ void wc_AesAsyncFree(Aes* aes)
|
|||||||
#elif defined(WOLFSSL_ARMASM)
|
#elif defined(WOLFSSL_ARMASM)
|
||||||
static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
|
static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
|
||||||
{
|
{
|
||||||
byte* keyPt = (byte*)aes->key;
|
byte* keyPt = (byte*)aes->key;
|
||||||
word32 rounds = aes->rounds;
|
word32 rounds = aes->rounds;
|
||||||
byte out[AES_BLOCK_SIZE];
|
|
||||||
byte* output = out;
|
|
||||||
byte* input = (byte*)inBlock;
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
AESE XORs input with round key
|
AESE XORs input with round key
|
||||||
@ -361,7 +357,7 @@ void wc_AesAsyncFree(Aes* aes)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
"LD1 {v0.16b}, [%[CtrIn]], #16 \n"
|
"LD1 {v0.16b}, [%[CtrIn]] \n"
|
||||||
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
|
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
|
||||||
|
|
||||||
"AESE v0.16b, v1.16b \n"
|
"AESE v0.16b, v1.16b \n"
|
||||||
@ -386,12 +382,12 @@ void wc_AesAsyncFree(Aes* aes)
|
|||||||
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
||||||
"AESE v0.16b, v1.16b \n"
|
"AESE v0.16b, v1.16b \n"
|
||||||
"AESMC v0.16b, v0.16b \n"
|
"AESMC v0.16b, v0.16b \n"
|
||||||
"AESE v0.16b, v2.16b \n"
|
"AESE v0.16b, v2.16b \n"
|
||||||
|
|
||||||
"#subtract rounds done so far and see if should continue\n"
|
"#subtract rounds done so far and see if should continue\n"
|
||||||
"MOV w12, %w[R] \n"
|
"MOV w12, %w[R] \n"
|
||||||
"SUB w12, w12, #10 \n"
|
"SUB w12, w12, #10 \n"
|
||||||
"CBZ w12, final \n"
|
"CBZ w12, final \n"
|
||||||
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
||||||
"AESMC v0.16b, v0.16b \n"
|
"AESMC v0.16b, v0.16b \n"
|
||||||
"AESE v0.16b, v1.16b \n"
|
"AESE v0.16b, v1.16b \n"
|
||||||
@ -399,7 +395,7 @@ void wc_AesAsyncFree(Aes* aes)
|
|||||||
"AESE v0.16b, v2.16b \n"
|
"AESE v0.16b, v2.16b \n"
|
||||||
|
|
||||||
"SUB w12, w12, #2 \n"
|
"SUB w12, w12, #2 \n"
|
||||||
"CBZ w12, final \n"
|
"CBZ w12, final \n"
|
||||||
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
||||||
"AESMC v0.16b, v0.16b \n"
|
"AESMC v0.16b, v0.16b \n"
|
||||||
"AESE v0.16b, v1.16b \n"
|
"AESE v0.16b, v1.16b \n"
|
||||||
@ -408,27 +404,24 @@ void wc_AesAsyncFree(Aes* aes)
|
|||||||
|
|
||||||
"#Final AddRoundKey then store result \n"
|
"#Final AddRoundKey then store result \n"
|
||||||
"final: \n"
|
"final: \n"
|
||||||
"LD1 {v1.16b}, [%[Key]], #16 \n"
|
"LD1 {v1.16b}, [%[Key]], #16 \n"
|
||||||
"EOR v0.16b, v0.16b, v1.16b \n"
|
"EOR v0.16b, v0.16b, v1.16b \n"
|
||||||
"ST1 {v0.16b}, [%[CtrOut]] \n"
|
"ST1 {v0.16b}, [%[CtrOut]] \n"
|
||||||
|
|
||||||
:[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds)
|
:[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (rounds),
|
||||||
:[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "r" (input), "0" (output)
|
"=r" (inBlock)
|
||||||
|
:"0" (outBlock), [Key] "1" (keyPt), [R] "2" (rounds),
|
||||||
|
[CtrIn] "3" (inBlock)
|
||||||
: "cc", "memory", "w12"
|
: "cc", "memory", "w12"
|
||||||
);
|
);
|
||||||
|
|
||||||
XMEMCPY(outBlock, out, AES_BLOCK_SIZE);
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#ifdef HAVE_AES_DECRYPT
|
#ifdef HAVE_AES_DECRYPT
|
||||||
static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
|
static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
|
||||||
{
|
{
|
||||||
byte* keyPt = (byte*)aes->key;
|
byte* keyPt = (byte*)aes->key;
|
||||||
word32 rounds = aes->rounds;
|
word32 rounds = aes->rounds;
|
||||||
byte out[AES_BLOCK_SIZE];
|
|
||||||
byte* output = out;
|
|
||||||
byte* input = (byte*)inBlock;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
AESE XORs input with round key
|
AESE XORs input with round key
|
||||||
@ -437,64 +430,64 @@ void wc_AesAsyncFree(Aes* aes)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
"LD1 {v0.16b}, [%[CtrIn]], #16 \n"
|
"LD1 {v0.16b}, [%[CtrIn]] \n"
|
||||||
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
|
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
|
||||||
|
|
||||||
"AESD v0.16b, v1.16b \n"
|
"AESD v0.16b, v1.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v2.16b \n"
|
"AESD v0.16b, v2.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v3.16b \n"
|
"AESD v0.16b, v3.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v4.16b \n"
|
"AESD v0.16b, v4.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
|
|
||||||
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
|
"LD1 {v1.16b-v4.16b}, [%[Key]], #64 \n"
|
||||||
"AESD v0.16b, v1.16b \n"
|
"AESD v0.16b, v1.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v2.16b \n"
|
"AESD v0.16b, v2.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v3.16b \n"
|
"AESD v0.16b, v3.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v4.16b \n"
|
"AESD v0.16b, v4.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
|
|
||||||
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
||||||
"AESD v0.16b, v1.16b \n"
|
"AESD v0.16b, v1.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v2.16b \n"
|
"AESD v0.16b, v2.16b \n"
|
||||||
|
|
||||||
"#subtract rounds done so far and see if should continue\n"
|
"#subtract rounds done so far and see if should continue\n"
|
||||||
"MOV w12, %w[R] \n"
|
"MOV w12, %w[R] \n"
|
||||||
"SUB w12, w12, #10 \n"
|
"SUB w12, w12, #10 \n"
|
||||||
"CBZ w12, finalDec \n"
|
"CBZ w12, finalDec \n"
|
||||||
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v1.16b \n"
|
"AESD v0.16b, v1.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v2.16b \n"
|
"AESD v0.16b, v2.16b \n"
|
||||||
|
|
||||||
"SUB w12, w12, #2 \n"
|
"SUB w12, w12, #2 \n"
|
||||||
"CBZ w12, finalDec \n"
|
"CBZ w12, finalDec \n"
|
||||||
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
"LD1 {v1.16b-v2.16b}, [%[Key]], #32 \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v1.16b \n"
|
"AESD v0.16b, v1.16b \n"
|
||||||
"AESIMC v0.16b, v0.16b \n"
|
"AESIMC v0.16b, v0.16b \n"
|
||||||
"AESD v0.16b, v2.16b \n"
|
"AESD v0.16b, v2.16b \n"
|
||||||
|
|
||||||
"#Final AddRoundKey then store result \n"
|
"#Final AddRoundKey then store result \n"
|
||||||
"finalDec: \n"
|
"finalDec: \n"
|
||||||
"LD1 {v1.16b}, [%[Key]], #16 \n"
|
"LD1 {v1.16b}, [%[Key]], #16 \n"
|
||||||
"EOR v0.16b, v0.16b, v1.16b \n"
|
"EOR v0.16b, v0.16b, v1.16b \n"
|
||||||
"ST1 {v0.4s}, [%[CtrOut]] \n"
|
"ST1 {v0.4s}, [%[CtrOut]] \n"
|
||||||
|
|
||||||
:[CtrOut] "=r" (output), "=r" (keyPt), "=r" (rounds), "=r" (input)
|
:[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (rounds),
|
||||||
:[Key] "1" (keyPt), [R] "2" (rounds), [CtrIn] "3" (input), "0" (output)
|
"=r" (inBlock)
|
||||||
|
:"0" (outBlock), [Key] "1" (keyPt), [R] "2" (rounds),
|
||||||
|
[CtrIn] "3" (inBlock)
|
||||||
: "cc", "memory", "w12"
|
: "cc", "memory", "w12"
|
||||||
);
|
);
|
||||||
|
|
||||||
XMEMCPY(outBlock, out, AES_BLOCK_SIZE);
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#endif /* HAVE_AES_DECRYPT */
|
#endif /* HAVE_AES_DECRYPT */
|
||||||
|
Reference in New Issue
Block a user