hwcrypto bignum/MPI: Batch safe DPORT reads to improve performance

This commit is contained in:
Angus Gratton
2017-08-16 12:09:40 +10:00
committed by Angus Gratton
parent cb31222e8b
commit 7be002ec0f

View File

@@ -76,16 +76,19 @@ void esp_mpi_acquire_hardware( void )
/* newlib locks lazy initialize on ESP-IDF */ /* newlib locks lazy initialize on ESP-IDF */
_lock_acquire(&mpi_lock); _lock_acquire(&mpi_lock);
DPORT_REG_SET_BIT(DPORT_PERI_CLK_EN_REG, DPORT_PERI_EN_RSA); DPORT_STALL_OTHER_CPU_START();
{
_DPORT_REG_SET_BIT(DPORT_PERI_CLK_EN_REG, DPORT_PERI_EN_RSA);
/* also clear reset on digital signature, otherwise RSA is held in reset */ /* also clear reset on digital signature, otherwise RSA is held in reset */
DPORT_REG_CLR_BIT(DPORT_PERI_RST_EN_REG, _DPORT_REG_CLR_BIT(DPORT_PERI_RST_EN_REG,
DPORT_PERI_EN_RSA DPORT_PERI_EN_RSA
| DPORT_PERI_EN_DIGITAL_SIGNATURE); | DPORT_PERI_EN_DIGITAL_SIGNATURE);
DPORT_REG_CLR_BIT(DPORT_RSA_PD_CTRL_REG, DPORT_RSA_PD); _DPORT_REG_CLR_BIT(DPORT_RSA_PD_CTRL_REG, DPORT_RSA_PD);
}
DPORT_STALL_OTHER_CPU_END();
while(DPORT_REG_READ(RSA_CLEAN_REG) != 1); while(DPORT_REG_READ(RSA_CLEAN_REG) != 1);
// Note: from enabling RSA clock to here takes about 1.3us // Note: from enabling RSA clock to here takes about 1.3us
#ifdef CONFIG_MBEDTLS_MPI_USE_INTERRUPT #ifdef CONFIG_MBEDTLS_MPI_USE_INTERRUPT
@@ -95,11 +98,15 @@ void esp_mpi_acquire_hardware( void )
void esp_mpi_release_hardware( void ) void esp_mpi_release_hardware( void )
{ {
DPORT_REG_SET_BIT(DPORT_RSA_PD_CTRL_REG, DPORT_RSA_PD); DPORT_STALL_OTHER_CPU_START();
{
_DPORT_REG_SET_BIT(DPORT_RSA_PD_CTRL_REG, DPORT_RSA_PD);
/* don't reset digital signature unit, as this resets AES also */ /* don't reset digital signature unit, as this resets AES also */
DPORT_REG_SET_BIT(DPORT_PERI_RST_EN_REG, DPORT_PERI_EN_RSA); _DPORT_REG_SET_BIT(DPORT_PERI_RST_EN_REG, DPORT_PERI_EN_RSA);
DPORT_REG_CLR_BIT(DPORT_PERI_CLK_EN_REG, DPORT_PERI_EN_RSA); _DPORT_REG_CLR_BIT(DPORT_PERI_CLK_EN_REG, DPORT_PERI_EN_RSA);
}
DPORT_STALL_OTHER_CPU_END();
_lock_release(&mpi_lock); _lock_release(&mpi_lock);
} }
@@ -139,6 +146,9 @@ static inline size_t bits_to_hardware_words(size_t num_bits)
If num_words is higher than the number of words in the bignum then If num_words is higher than the number of words in the bignum then
these additional words will be zeroed in the memory buffer. these additional words will be zeroed in the memory buffer.
As this function only writes to DPORT memory, no DPORT_STALL_OTHER_CPU_START()
is required.
*/ */
static inline void mpi_to_mem_block(uint32_t mem_base, const mbedtls_mpi *mpi, size_t num_words) static inline void mpi_to_mem_block(uint32_t mem_base, const mbedtls_mpi *mpi, size_t num_words)
{ {
@@ -146,10 +156,14 @@ static inline void mpi_to_mem_block(uint32_t mem_base, const mbedtls_mpi *mpi, s
uint32_t copy_words = num_words < mpi->n ? num_words : mpi->n; uint32_t copy_words = num_words < mpi->n ? num_words : mpi->n;
/* Copy MPI data to memory block registers */ /* Copy MPI data to memory block registers */
memcpy(pbase, mpi->p, copy_words * 4); for (int i = 0; i < copy_words; i++) {
pbase[i] = mpi->p[i];
}
/* Zero any remaining memory block data */ /* Zero any remaining memory block data */
bzero(pbase + copy_words, (num_words - copy_words) * 4); for (int i = copy_words; i < num_words; i++) {
pbase[i] = 0;
}
/* Note: not executing memw here, can do it before we start a bignum operation */ /* Note: not executing memw here, can do it before we start a bignum operation */
} }
@@ -159,6 +173,8 @@ static inline void mpi_to_mem_block(uint32_t mem_base, const mbedtls_mpi *mpi, s
Reads num_words words from block. Reads num_words words from block.
Can return a failure result if it fails to grow the MPI result. Can return a failure result if it fails to grow the MPI result.
Cannot be called inside DPORT_STALL_OTHER_CPU_START() (as it may allocate memory).
*/ */
static inline int mem_block_to_mpi(mbedtls_mpi *x, uint32_t mem_base, int num_words) static inline int mem_block_to_mpi(mbedtls_mpi *x, uint32_t mem_base, int num_words)
{ {
@@ -167,9 +183,13 @@ static inline int mem_block_to_mpi(mbedtls_mpi *x, uint32_t mem_base, int num_wo
MBEDTLS_MPI_CHK( mbedtls_mpi_grow(x, num_words) ); MBEDTLS_MPI_CHK( mbedtls_mpi_grow(x, num_words) );
/* Copy data from memory block registers */ /* Copy data from memory block registers */
DPORT_STALL_OTHER_CPU_START();
{
for (size_t i = 0; i < num_words; ++i) { for (size_t i = 0; i < num_words; ++i) {
x->p[i] = DPORT_REG_READ(mem_base + i * 4); x->p[i] = _DPORT_REG_READ(mem_base + i * 4);
} }
}
DPORT_STALL_OTHER_CPU_END();
/* Zero any remaining limbs in the bignum, if the buffer is bigger /* Zero any remaining limbs in the bignum, if the buffer is bigger
than num_words */ than num_words */
@@ -238,10 +258,13 @@ static int calculate_rinv(mbedtls_mpi *Rinv, const mbedtls_mpi *M, int num_words
} }
/* Execute RSA operation. op_reg specifies which 'START' register /* Begin an RSA operation. op_reg specifies which 'START' register
to write to. to write to.
Because the only DPORT operations here are writes, this function
does not need protecting via DPORT_STALL_OTHER_CPU_START().
*/ */
static inline void execute_op(uint32_t op_reg) static inline void start_op(uint32_t op_reg)
{ {
/* Clear interrupt status */ /* Clear interrupt status */
DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1); DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1);
@@ -250,7 +273,15 @@ static inline void execute_op(uint32_t op_reg)
to the memory blocks are also complete. */ to the memory blocks are also complete. */
DPORT_REG_WRITE(op_reg, 1); DPORT_REG_WRITE(op_reg, 1);
}
/* Wait for an RSA operation to complete.
This should NOT be called inside a DPORT_STALL_OTHER_CPU_START() section, as it would stall the other CPU for an unacceptably long
period (and - depending on config - may require interrupts to be enabled).
*/
static inline void wait_op_complete(uint32_t op_reg)
{
#ifdef CONFIG_MBEDTLS_MPI_USE_INTERRUPT #ifdef CONFIG_MBEDTLS_MPI_USE_INTERRUPT
if (!xSemaphoreTake(op_complete_sem, 2000 / portTICK_PERIOD_MS)) { if (!xSemaphoreTake(op_complete_sem, 2000 / portTICK_PERIOD_MS)) {
ESP_LOGE(TAG, "Timed out waiting for RSA operation (op_reg 0x%x int_reg 0x%x)", ESP_LOGE(TAG, "Timed out waiting for RSA operation (op_reg 0x%x int_reg 0x%x)",
@@ -258,12 +289,13 @@ static inline void execute_op(uint32_t op_reg)
abort(); /* indicates a fundamental problem with driver */ abort(); /* indicates a fundamental problem with driver */
} }
#else #else
while(REG_READ(RSA_INTERRUPT_REG) != 1) while(DPORT_REG_READ(RSA_INTERRUPT_REG) != 1)
{ } { }
#endif
/* clear the interrupt */ /* clear the interrupt */
DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1); DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1);
#endif
} }
/* Sub-stages of modulo multiplication/exponentiation operations */ /* Sub-stages of modulo multiplication/exponentiation operations */
@@ -287,6 +319,8 @@ int esp_mpi_mul_mpi_mod(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi
esp_mpi_acquire_hardware(); esp_mpi_acquire_hardware();
/* (As the following are all writes to DPORT memory, no DPORT_STALL_OTHER_CPU_START is required.) */
/* Load M, X, Rinv, Mprime (Mprime is mod 2^32) */ /* Load M, X, Rinv, Mprime (Mprime is mod 2^32) */
mpi_to_mem_block(RSA_MEM_M_BLOCK_BASE, M, num_words); mpi_to_mem_block(RSA_MEM_M_BLOCK_BASE, M, num_words);
mpi_to_mem_block(RSA_MEM_X_BLOCK_BASE, X, num_words); mpi_to_mem_block(RSA_MEM_X_BLOCK_BASE, X, num_words);
@@ -297,10 +331,12 @@ int esp_mpi_mul_mpi_mod(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi
DPORT_REG_WRITE(RSA_MULT_MODE_REG, (num_words / 16) - 1); DPORT_REG_WRITE(RSA_MULT_MODE_REG, (num_words / 16) - 1);
/* Execute first stage montgomery multiplication */ /* Execute first stage montgomery multiplication */
execute_op(RSA_MULT_START_REG); start_op(RSA_MULT_START_REG);
wait_op_complete(RSA_MULT_START_REG);
/* execute second stage */ /* execute second stage */
MBEDTLS_MPI_CHK( modular_multiply_finish(Z, X, Y, num_words) ); ret = modular_multiply_finish(Z, X, Y, num_words);
esp_mpi_release_hardware(); esp_mpi_release_hardware();
@@ -366,6 +402,8 @@ int mbedtls_mpi_exp_mod( mbedtls_mpi* Z, const mbedtls_mpi* X, const mbedtls_mpi
esp_mpi_acquire_hardware(); esp_mpi_acquire_hardware();
/* (As the following are all writes to DPORT memory, no DPORT_STALL_OTHER_CPU_START is required.) */
/* "mode" register loaded with number of 512-bit blocks, minus 1 */ /* "mode" register loaded with number of 512-bit blocks, minus 1 */
DPORT_REG_WRITE(RSA_MODEXP_MODE_REG, (num_words / 16) - 1); DPORT_REG_WRITE(RSA_MODEXP_MODE_REG, (num_words / 16) - 1);
@@ -376,10 +414,11 @@ int mbedtls_mpi_exp_mod( mbedtls_mpi* Z, const mbedtls_mpi* X, const mbedtls_mpi
mpi_to_mem_block(RSA_MEM_RB_BLOCK_BASE, Rinv, num_words); mpi_to_mem_block(RSA_MEM_RB_BLOCK_BASE, Rinv, num_words);
DPORT_REG_WRITE(RSA_M_DASH_REG, Mprime); DPORT_REG_WRITE(RSA_M_DASH_REG, Mprime);
execute_op(RSA_START_MODEXP_REG); start_op(RSA_START_MODEXP_REG);
wait_op_complete(RSA_START_MODEXP_REG);
ret = mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, num_words); ret = mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, num_words);
esp_mpi_release_hardware(); esp_mpi_release_hardware();
cleanup: cleanup:
@@ -407,11 +446,14 @@ int mbedtls_mpi_exp_mod( mbedtls_mpi* Z, const mbedtls_mpi* X, const mbedtls_mpi
*/ */
static int modular_multiply_finish(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t num_words) static int modular_multiply_finish(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t num_words)
{ {
int ret; int ret = 0;
/* Load Y to X input memory block, rerun */ /* Load Y to X input memory block, rerun */
mpi_to_mem_block(RSA_MEM_X_BLOCK_BASE, Y, num_words); mpi_to_mem_block(RSA_MEM_X_BLOCK_BASE, Y, num_words);
execute_op(RSA_MULT_START_REG); start_op(RSA_MULT_START_REG);
wait_op_complete(RSA_MULT_START_REG);
/* Read result into Z */ /* Read result into Z */
ret = mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, num_words); ret = mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, num_words);
@@ -429,7 +471,7 @@ static int mpi_mult_mpi_overlong(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbe
/* Z = X * Y */ /* Z = X * Y */
int mbedtls_mpi_mul_mpi( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y ) int mbedtls_mpi_mul_mpi( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y )
{ {
int ret; int ret = 0;
size_t bits_x, bits_y, words_x, words_y, words_mult, words_z; size_t bits_x, bits_y, words_x, words_y, words_mult, words_z;
/* Count words needed for X & Y in hardware */ /* Count words needed for X & Y in hardware */
@@ -511,7 +553,9 @@ int mbedtls_mpi_mul_mpi( mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi
*/ */
DPORT_REG_WRITE(RSA_MULT_MODE_REG, (words_z / 16) + 7); DPORT_REG_WRITE(RSA_MULT_MODE_REG, (words_z / 16) + 7);
execute_op(RSA_MULT_START_REG); start_op(RSA_MULT_START_REG);
wait_op_complete(RSA_MULT_START_REG);
/* Read back the result */ /* Read back the result */
ret = mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, words_z); ret = mem_block_to_mpi(Z, RSA_MEM_Z_BLOCK_BASE, words_z);
@@ -566,14 +610,15 @@ static int mpi_mult_mpi_failover_mod_mult(mbedtls_mpi *Z, const mbedtls_mpi *X,
DPORT_REG_WRITE(RSA_MEM_RB_BLOCK_BASE + i * 4, 0); DPORT_REG_WRITE(RSA_MEM_RB_BLOCK_BASE + i * 4, 0);
} }
execute_op(RSA_MULT_START_REG); start_op(RSA_MULT_START_REG);
wait_op_complete(RSA_MULT_START_REG);
/* finish the modular multiplication */ /* finish the modular multiplication */
MBEDTLS_MPI_CHK( modular_multiply_finish(Z, X, Y, num_words) ); ret = modular_multiply_finish(Z, X, Y, num_words);
esp_mpi_release_hardware(); esp_mpi_release_hardware();
cleanup:
return ret; return ret;
} }
@@ -596,7 +641,7 @@ static int mpi_mult_mpi_failover_mod_mult(mbedtls_mpi *Z, const mbedtls_mpi *X,
*/ */
static int mpi_mult_mpi_overlong(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t bits_y, size_t words_result) static int mpi_mult_mpi_overlong(mbedtls_mpi *Z, const mbedtls_mpi *X, const mbedtls_mpi *Y, size_t bits_y, size_t words_result)
{ {
int ret; int ret = 0;
mbedtls_mpi Ztemp; mbedtls_mpi Ztemp;
const size_t limbs_y = (bits_y + biL - 1) / biL; const size_t limbs_y = (bits_y + biL - 1) / biL;
/* Rather than slicing in two on bits we slice on limbs (32 bit words) */ /* Rather than slicing in two on bits we slice on limbs (32 bit words) */