From 4676d159adc6e92ab90546a618c340b9a5814459 Mon Sep 17 00:00:00 2001
From: Ivan Grokhotkov <ivan@espressif.com>
Date: Wed, 18 Jan 2017 15:07:27 +0800
Subject: [PATCH 1/3] spi_flash: fix race condition when doing operations in
 unpinned tasks

spi_flash_enable_interrupts_caches_and_other_cpu function used to enable
non-IRAM interrupts after giving up flash operation lock, which would
cause problems if another task was waiting on the lock to start a flash
operation. In fact, non-IRAM interrupts should be re-enabled before the
task scheduler is resumed. Otherwise non-pinned task can be moved to the
other CPU due to preemption, causing esp_intr_noniram_enable to be
called on the other CPU, causing an abort to be triggered.

Fixes the issue reported in
https://github.com/espressif/esp-idf/pull/258
---
 components/spi_flash/cache_utils.c         | 32 ++++++++--
 components/spi_flash/test/test_spi_flash.c | 69 ++++++++++------------
 2 files changed, 58 insertions(+), 43 deletions(-)
diff --git a/components/spi_flash/cache_utils.c b/components/spi_flash/cache_utils.c
index 983ff5d256..df9d18c443 100644
--- a/components/spi_flash/cache_utils.c
+++ b/components/spi_flash/cache_utils.c
@@ -41,6 +41,9 @@ static uint32_t s_flash_op_cache_state[2];
 static SemaphoreHandle_t s_flash_op_mutex;
 static volatile bool s_flash_op_can_start = false;
 static volatile bool s_flash_op_complete = false;
+#ifndef NDEBUG
+static volatile int s_flash_op_cpu = -1;
+#endif
 
 void spi_flash_init_lock()
 {
@@ -85,6 +88,11 @@ void IRAM_ATTR spi_flash_disable_interrupts_caches_and_other_cpu()
 
     const uint32_t cpuid = xPortGetCoreID();
     const uint32_t other_cpuid = (cpuid == 0) ? 1 : 0;
+#ifndef NDEBUG
+    // For sanity check later: record the CPU which has started doing flash operation
+    assert(s_flash_op_cpu == -1);
+    s_flash_op_cpu = cpuid;
+#endif
 
     if (xTaskGetSchedulerState() == taskSCHEDULER_NOT_STARTED) {
         // Scheduler hasn't been started yet, it means that spi_flash API is being
@@ -98,12 +106,13 @@ void IRAM_ATTR spi_flash_disable_interrupts_caches_and_other_cpu()
         // disable cache there and block other tasks from executing.
         s_flash_op_can_start = false;
         s_flash_op_complete = false;
-        esp_ipc_call(other_cpuid, &spi_flash_op_block_func, (void*) other_cpuid);
+        esp_err_t ret = esp_ipc_call(other_cpuid, &spi_flash_op_block_func, (void*) other_cpuid);
+        assert(ret == ESP_OK);
         while (!s_flash_op_can_start) {
             // Busy loop and wait for spi_flash_op_block_func to disable cache
             // on the other CPU
         }
-        // Disable scheduler on CPU cpuid
+        // Disable scheduler on the current CPU
         vTaskSuspendAll();
         // This is guaranteed to run on CPU <cpuid> because the other CPU is now
         // occupied by highest priority task
@@ -119,6 +128,11 @@ void IRAM_ATTR spi_flash_enable_interrupts_caches_and_other_cpu()
 {
     const uint32_t cpuid = xPortGetCoreID();
     const uint32_t other_cpuid = (cpuid == 0) ? 1 : 0;
+#ifndef NDEBUG
+    // Sanity check: flash operation ends on the same CPU as it has started
+    assert(cpuid == s_flash_op_cpu);
+    s_flash_op_cpu = -1;
+#endif
 
     // Re-enable cache on this CPU
     spi_flash_restore_cache(cpuid, s_flash_op_cache_state[cpuid]);
@@ -132,13 +146,21 @@ void IRAM_ATTR spi_flash_enable_interrupts_caches_and_other_cpu()
     } else {
         // Signal to spi_flash_op_block_task that flash operation is complete
         s_flash_op_complete = true;
-        // Resume tasks on the current CPU
+    }
+    // Re-enable non-iram interrupts
+    esp_intr_noniram_enable();
+
+    // Resume tasks on the current CPU, if the scheduler has started.
+    // NOTE: enabling non-IRAM interrupts has to happen before this,
+    // because once the scheduler has started, due to preemption the
+    // current task can end up being moved to the other CPU.
+    // But esp_intr_noniram_enable has to be called on the same CPU which
+    // called esp_intr_noniram_disable
+    if (xTaskGetSchedulerState() != taskSCHEDULER_NOT_STARTED) {
         xTaskResumeAll();
     }
     // Release API lock
     spi_flash_op_unlock();
-    // Re-enable non-iram interrupts
-    esp_intr_noniram_enable();
 }
 
 void IRAM_ATTR spi_flash_disable_interrupts_caches_and_other_cpu_no_os()
diff --git a/components/spi_flash/test/test_spi_flash.c b/components/spi_flash/test/test_spi_flash.c
index 330e37bb82..4f3171049e 100644
--- a/components/spi_flash/test/test_spi_flash.c
+++ b/components/spi_flash/test/test_spi_flash.c
@@ -8,26 +8,25 @@
 #include <esp_attr.h>
 
 struct flash_test_ctx {
-    uint32_t offset[2];
-    bool fail[2];
+    uint32_t offset;
+    bool fail;
     SemaphoreHandle_t done;
 };
 
 static void flash_test_task(void *arg)
 {
-    const uint32_t coreid = xPortGetCoreID();
-    ets_printf("t%d\n", coreid);
     struct flash_test_ctx *ctx = (struct flash_test_ctx *) arg;
     vTaskDelay(100 / portTICK_PERIOD_MS);
-    const uint32_t sector = ctx->offset[coreid];
-    ets_printf("es%d\n", coreid);
+    const uint32_t sector = ctx->offset;
+    printf("t%d\n", sector);
+    printf("es%d\n", sector);
     if (spi_flash_erase_sector(sector) != ESP_OK) {
-        ctx->fail[coreid] = true;
-        ets_printf("Erase failed\r\n");
+        ctx->fail = true;
+        printf("Erase failed\r\n");
         xSemaphoreGive(ctx->done);
         vTaskDelete(NULL);
     }
-    ets_printf("ed%d\n", coreid);
+    printf("ed%d\n", sector);
 
     vTaskDelay(0 / portTICK_PERIOD_MS);
 
@@ -35,58 +34,52 @@ static void flash_test_task(void *arg)
     const uint32_t n = 4096;
     for (uint32_t offset = 0; offset < n; offset += 4) {
         if (spi_flash_write(sector * SPI_FLASH_SEC_SIZE + offset, (const uint8_t *) &val, 4) != ESP_OK) {
-            ets_printf("Write failed at offset=%d\r\n", offset);
-            ctx->fail[coreid] = true;
+            printf("Write failed at offset=%d\r\n", offset);
+            ctx->fail = true;
             break;
         }
     }
-    ets_printf("wd%d\n", coreid);
+    printf("wd%d\n", sector);
 
     vTaskDelay(0 / portTICK_PERIOD_MS);
 
     uint32_t val_read;
     for (uint32_t offset = 0; offset < n; offset += 4) {
         if (spi_flash_read(sector * SPI_FLASH_SEC_SIZE + offset, (uint8_t *) &val_read, 4) != ESP_OK) {
-            ets_printf("Read failed at offset=%d\r\n", offset);
-            ctx->fail[coreid] = true;
+            printf("Read failed at offset=%d\r\n", offset);
+            ctx->fail = true;
             break;
         }
         if (val_read != val) {
-            ets_printf("Read invalid value=%08x at offset=%d\r\n", val_read, offset);
-            ctx->fail[coreid] = true;
+            printf("Read invalid value=%08x at offset=%d\r\n", val_read, offset);
+            ctx->fail = true;
             break;
         }
     }
-    ets_printf("td%d\n", coreid);
+    printf("td%d\n", sector);
     xSemaphoreGive(ctx->done);
     vTaskDelete(NULL);
 }
 
 TEST_CASE("flash write and erase work both on PRO CPU and on APP CPU", "[spi_flash]")
 {
-    TaskHandle_t procpu_task;
-    TaskHandle_t appcpu_task;
-    struct flash_test_ctx ctx;
+    SemaphoreHandle_t done = xSemaphoreCreateCounting(4, 0);
+    struct flash_test_ctx ctx[4] = {
+            { .offset = 0x100 + 6, .done = done },
+            { .offset = 0x100 + 7, .done = done },
+            { .offset = 0x100 + 8, .done = done },
+            { .offset = 0x100 + 9, .done = done }
+    };
 
-    ctx.offset[0] = 6;
-    ctx.offset[1] = 7;
-    ctx.fail[0] = 0;
-    ctx.fail[1] = 0;
-    ctx.done = xSemaphoreCreateBinary();
+    xTaskCreatePinnedToCore(flash_test_task, "1", 2048, &ctx[0], 3, NULL, 0);
+    xTaskCreatePinnedToCore(flash_test_task, "2", 2048, &ctx[1], 3, NULL, 1);
+    xTaskCreatePinnedToCore(flash_test_task, "3", 2048, &ctx[2], 3, NULL, tskNO_AFFINITY);
+    xTaskCreatePinnedToCore(flash_test_task, "4", 2048, &ctx[3], 3, NULL, tskNO_AFFINITY);
 
-    xTaskCreatePinnedToCore(flash_test_task, "1", 2048, &ctx, 3, &procpu_task, 0);
-    if (portNUM_PROCESSORS == 2) {
-        xTaskCreatePinnedToCore(flash_test_task, "2", 2048, &ctx, 3, &appcpu_task, 1);
-    }
-
-    xSemaphoreTake(ctx.done, portMAX_DELAY);
-    if (portNUM_PROCESSORS == 2) {
-        xSemaphoreTake(ctx.done, portMAX_DELAY);
-    }
-
-    TEST_ASSERT_EQUAL(false, ctx.fail[0]);
-    if (portNUM_PROCESSORS == 2) {
-        TEST_ASSERT_EQUAL(false, ctx.fail[1]);
+    for (int i = 0; i < 4; ++i) {
+        xSemaphoreTake(done, portMAX_DELAY);
+        TEST_ASSERT_FALSE(ctx[i].fail);
     }
+    vSemaphoreDelete(done);
 }
 

From f7e2e456e41145f6922b5c6a53ee3f003a389aa7 Mon Sep 17 00:00:00 2001
From: Ivan Grokhotkov <ivan@espressif.com>
Date: Wed, 18 Jan 2017 18:31:06 +0800
Subject: [PATCH 2/3] intr_alloc: mark inline asm operand as earlyclobber

When compiling in release mode, compiler was choosing same register for
oldint and intmask variables, so INTENABLE was never modified.
This effectively broke disabling of non-IRAM interrupts during flash
operations, observed in the existing tests if task watchdog is enabled.
This change adds an extra constraint tells the compiler that output
operand should not be placed into the same register as an input one.
---
 components/esp32/intr_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/esp32/intr_alloc.c b/components/esp32/intr_alloc.c
index bfd6c51206..d9cc627e17 100644
--- a/components/esp32/intr_alloc.c
+++ b/components/esp32/intr_alloc.c
@@ -700,7 +700,7 @@ void esp_intr_noniram_disable()
         "and a3,%0,%1\n"        //mask ints that need disabling
         "wsr a3,INTENABLE\n"    //write back
         "rsync\n"
-        :"=r"(oldint):"r"(intmask):"a3");
+        :"=&r"(oldint):"r"(intmask):"a3");
     //Save which ints we did disable
     non_iram_int_disabled[cpu]=oldint&non_iram_int_mask[cpu];
 }

From 31ec0a7c4453ace7bddac6258fd676381b2cc1ae Mon Sep 17 00:00:00 2001
From: Ivan Grokhotkov <ivan@espressif.com>
Date: Wed, 18 Jan 2017 23:14:29 +0800
Subject: [PATCH 3/3] freertos: call tick hooks on CPU1 even if CPU0 scheduler
 is suspended
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The block which dispatches ticks on CPU1 was a copy of the code block
for the normal path (CPU0). It used to check uxPendedTicks, with the
logic that uxPendedTicks can be 0 iff the scheduler is not suspended.

On CPU1 however, uxPendedTicks is not related to the state of the
scheduler (as uxPendedTicks is updated on CPU0). Due to this, if CPU0
scheduler was suspended, and uxPendedTicks happened to be nonzero,
tick hooks on CPU1 didn’t run, even though CPU1 scheduler was working.

This change removes the check for uxPendedTicks in CPU1 code path,
so that the tick hooks on CPU1 always get called (as for the CPU0 code
path).
---
 components/freertos/tasks.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/components/freertos/tasks.c b/components/freertos/tasks.c
index 6caabaa38e..0b5cc42957 100644
--- a/components/freertos/tasks.c
+++ b/components/freertos/tasks.c
@@ -2381,26 +2381,15 @@ BaseType_t xSwitchRequired = pdFALSE;
 	   switch, even when this routine (running on core 0) unblocks a bunch of high-priority
 	   tasks... this is less than optimal -- JD. */
 	if ( xPortGetCoreID()!=0 ) {
+		#if ( configUSE_TICK_HOOK == 1 )
+		vApplicationTickHook();
+		#endif /* configUSE_TICK_HOOK */
+		esp_vApplicationTickHook();
+
 		/*
 		  We can't really calculate what we need, that's done on core 0... just assume we need a switch.
 		  ToDo: Make this more intelligent? -- JD
 		*/
-		{
-			/* Guard against the tick hook being called when the pended tick
-			count is being unwound (when the scheduler is being unlocked). */
-			if( ( uxSchedulerSuspended[ xPortGetCoreID() ] != ( UBaseType_t ) pdFALSE ) || uxPendedTicks == ( UBaseType_t ) 0U )
-			{
-				#if ( configUSE_TICK_HOOK == 1 )
-				vApplicationTickHook();
-				#endif /* configUSE_TICK_HOOK */
-				esp_vApplicationTickHook();
-			}
-			else
-			{
-				mtCOVERAGE_TEST_MARKER();
-			}
-		}
-
 		return pdTRUE;
 	}