diff --git a/components/esp32/intr_alloc.c b/components/esp32/intr_alloc.c
index bfd6c51206..d9cc627e17 100644
--- a/components/esp32/intr_alloc.c
+++ b/components/esp32/intr_alloc.c
@@ -700,7 +700,7 @@ void esp_intr_noniram_disable()
         "and a3,%0,%1\n"        //mask ints that need disabling
         "wsr a3,INTENABLE\n"    //write back
         "rsync\n"
-        :"=r"(oldint):"r"(intmask):"a3");
+        :"=&r"(oldint):"r"(intmask):"a3");
     //Save which ints we did disable
     non_iram_int_disabled[cpu]=oldint&non_iram_int_mask[cpu];
 }
diff --git a/components/freertos/tasks.c b/components/freertos/tasks.c
index 6caabaa38e..0b5cc42957 100644
--- a/components/freertos/tasks.c
+++ b/components/freertos/tasks.c
@@ -2381,26 +2381,15 @@ BaseType_t xSwitchRequired = pdFALSE;
 	   switch, even when this routine (running on core 0) unblocks a bunch of high-priority
 	   tasks... this is less than optimal -- JD. */
 	if ( xPortGetCoreID()!=0 ) {
+		#if ( configUSE_TICK_HOOK == 1 )
+		vApplicationTickHook();
+		#endif /* configUSE_TICK_HOOK */
+		esp_vApplicationTickHook();
+
 		/*
 		  We can't really calculate what we need, that's done on core 0... just assume we need a switch.
 		  ToDo: Make this more intelligent? -- JD
 		*/
-		{
-			/* Guard against the tick hook being called when the pended tick
-			count is being unwound (when the scheduler is being unlocked). */
-			if( ( uxSchedulerSuspended[ xPortGetCoreID() ] != ( UBaseType_t ) pdFALSE ) || uxPendedTicks == ( UBaseType_t ) 0U )
-			{
-				#if ( configUSE_TICK_HOOK == 1 )
-				vApplicationTickHook();
-				#endif /* configUSE_TICK_HOOK */
-				esp_vApplicationTickHook();
-			}
-			else
-			{
-				mtCOVERAGE_TEST_MARKER();
-			}
-		}
-
 		return pdTRUE;
 	}
 
diff --git a/components/spi_flash/cache_utils.c b/components/spi_flash/cache_utils.c
index 983ff5d256..df9d18c443 100644
--- a/components/spi_flash/cache_utils.c
+++ b/components/spi_flash/cache_utils.c
@@ -41,6 +41,9 @@ static uint32_t s_flash_op_cache_state[2];
 static SemaphoreHandle_t s_flash_op_mutex;
 static volatile bool s_flash_op_can_start = false;
 static volatile bool s_flash_op_complete = false;
+#ifndef NDEBUG
+static volatile int s_flash_op_cpu = -1;
+#endif
 
 void spi_flash_init_lock()
 {
@@ -85,6 +88,11 @@ void IRAM_ATTR spi_flash_disable_interrupts_caches_and_other_cpu()
 
     const uint32_t cpuid = xPortGetCoreID();
     const uint32_t other_cpuid = (cpuid == 0) ? 1 : 0;
+#ifndef NDEBUG
+    // For sanity check later: record the CPU which has started doing flash operation
+    assert(s_flash_op_cpu == -1);
+    s_flash_op_cpu = cpuid;
+#endif
 
     if (xTaskGetSchedulerState() == taskSCHEDULER_NOT_STARTED) {
         // Scheduler hasn't been started yet, it means that spi_flash API is being
@@ -98,12 +106,13 @@ void IRAM_ATTR spi_flash_disable_interrupts_caches_and_other_cpu()
         // disable cache there and block other tasks from executing.
         s_flash_op_can_start = false;
         s_flash_op_complete = false;
-        esp_ipc_call(other_cpuid, &spi_flash_op_block_func, (void*) other_cpuid);
+        esp_err_t ret = esp_ipc_call(other_cpuid, &spi_flash_op_block_func, (void*) other_cpuid);
+        assert(ret == ESP_OK);
         while (!s_flash_op_can_start) {
             // Busy loop and wait for spi_flash_op_block_func to disable cache
             // on the other CPU
         }
-        // Disable scheduler on CPU cpuid
+        // Disable scheduler on the current CPU
         vTaskSuspendAll();
         // This is guaranteed to run on CPU <cpuid> because the other CPU is now
         // occupied by highest priority task
@@ -119,6 +128,11 @@ void IRAM_ATTR spi_flash_enable_interrupts_caches_and_other_cpu()
 {
     const uint32_t cpuid = xPortGetCoreID();
     const uint32_t other_cpuid = (cpuid == 0) ? 1 : 0;
+#ifndef NDEBUG
+    // Sanity check: flash operation ends on the same CPU as it has started
+    assert(cpuid == s_flash_op_cpu);
+    s_flash_op_cpu = -1;
+#endif
 
     // Re-enable cache on this CPU
     spi_flash_restore_cache(cpuid, s_flash_op_cache_state[cpuid]);
@@ -132,13 +146,21 @@ void IRAM_ATTR spi_flash_enable_interrupts_caches_and_other_cpu()
     } else {
         // Signal to spi_flash_op_block_task that flash operation is complete
         s_flash_op_complete = true;
-        // Resume tasks on the current CPU
+    }
+    // Re-enable non-iram interrupts
+    esp_intr_noniram_enable();
+
+    // Resume tasks on the current CPU, if the scheduler has started.
+    // NOTE: enabling non-IRAM interrupts has to happen before this,
+    // because once the scheduler has started, due to preemption the
+    // current task can end up being moved to the other CPU.
+    // But esp_intr_noniram_enable has to be called on the same CPU which
+    // called esp_intr_noniram_disable
+    if (xTaskGetSchedulerState() != taskSCHEDULER_NOT_STARTED) {
         xTaskResumeAll();
     }
     // Release API lock
     spi_flash_op_unlock();
-    // Re-enable non-iram interrupts
-    esp_intr_noniram_enable();
 }
 
 void IRAM_ATTR spi_flash_disable_interrupts_caches_and_other_cpu_no_os()
diff --git a/components/spi_flash/test/test_spi_flash.c b/components/spi_flash/test/test_spi_flash.c
index bf05b64db4..ecc472ac0e 100644
--- a/components/spi_flash/test/test_spi_flash.c
+++ b/components/spi_flash/test/test_spi_flash.c
@@ -8,26 +8,25 @@
 #include <esp_attr.h>
 
 struct flash_test_ctx {
-    uint32_t offset[2];
-    bool fail[2];
+    uint32_t offset;
+    bool fail;
     SemaphoreHandle_t done;
 };
 
 static void flash_test_task(void *arg)
 {
-    const uint32_t coreid = xPortGetCoreID();
-    ets_printf("t%d\n", coreid);
     struct flash_test_ctx *ctx = (struct flash_test_ctx *) arg;
     vTaskDelay(100 / portTICK_PERIOD_MS);
-    const uint32_t sector = ctx->offset[coreid];
-    ets_printf("es%d\n", coreid);
+    const uint32_t sector = ctx->offset;
+    printf("t%d\n", sector);
+    printf("es%d\n", sector);
     if (spi_flash_erase_sector(sector) != ESP_OK) {
-        ctx->fail[coreid] = true;
-        ets_printf("Erase failed\r\n");
+        ctx->fail = true;
+        printf("Erase failed\r\n");
         xSemaphoreGive(ctx->done);
         vTaskDelete(NULL);
     }
-    ets_printf("ed%d\n", coreid);
+    printf("ed%d\n", sector);
 
     vTaskDelay(0 / portTICK_PERIOD_MS);
 
@@ -35,58 +34,52 @@ static void flash_test_task(void *arg)
     const uint32_t n = 4096;
     for (uint32_t offset = 0; offset < n; offset += 4) {
         if (spi_flash_write(sector * SPI_FLASH_SEC_SIZE + offset, (const uint8_t *) &val, 4) != ESP_OK) {
-            ets_printf("Write failed at offset=%d\r\n", offset);
-            ctx->fail[coreid] = true;
+            printf("Write failed at offset=%d\r\n", offset);
+            ctx->fail = true;
             break;
         }
     }
-    ets_printf("wd%d\n", coreid);
+    printf("wd%d\n", sector);
 
     vTaskDelay(0 / portTICK_PERIOD_MS);
 
     uint32_t val_read;
     for (uint32_t offset = 0; offset < n; offset += 4) {
         if (spi_flash_read(sector * SPI_FLASH_SEC_SIZE + offset, (uint8_t *) &val_read, 4) != ESP_OK) {
-            ets_printf("Read failed at offset=%d\r\n", offset);
-            ctx->fail[coreid] = true;
+            printf("Read failed at offset=%d\r\n", offset);
+            ctx->fail = true;
             break;
         }
         if (val_read != val) {
-            ets_printf("Read invalid value=%08x at offset=%d\r\n", val_read, offset);
-            ctx->fail[coreid] = true;
+            printf("Read invalid value=%08x at offset=%d\r\n", val_read, offset);
+            ctx->fail = true;
             break;
         }
     }
-    ets_printf("td%d\n", coreid);
+    printf("td%d\n", sector);
     xSemaphoreGive(ctx->done);
     vTaskDelete(NULL);
 }
 
 TEST_CASE("flash write and erase work both on PRO CPU and on APP CPU", "[spi_flash][ignore]")
 {
-    TaskHandle_t procpu_task;
-    TaskHandle_t appcpu_task;
-    struct flash_test_ctx ctx;
+    SemaphoreHandle_t done = xSemaphoreCreateCounting(4, 0);
+    struct flash_test_ctx ctx[4] = {
+            { .offset = 0x100 + 6, .done = done },
+            { .offset = 0x100 + 7, .done = done },
+            { .offset = 0x100 + 8, .done = done },
+            { .offset = 0x100 + 9, .done = done }
+    };
 
-    ctx.offset[0] = 6;
-    ctx.offset[1] = 7;
-    ctx.fail[0] = 0;
-    ctx.fail[1] = 0;
-    ctx.done = xSemaphoreCreateBinary();
+    xTaskCreatePinnedToCore(flash_test_task, "1", 2048, &ctx[0], 3, NULL, 0);
+    xTaskCreatePinnedToCore(flash_test_task, "2", 2048, &ctx[1], 3, NULL, 1);
+    xTaskCreatePinnedToCore(flash_test_task, "3", 2048, &ctx[2], 3, NULL, tskNO_AFFINITY);
+    xTaskCreatePinnedToCore(flash_test_task, "4", 2048, &ctx[3], 3, NULL, tskNO_AFFINITY);
 
-    xTaskCreatePinnedToCore(flash_test_task, "1", 2048, &ctx, 3, &procpu_task, 0);
-    if (portNUM_PROCESSORS == 2) {
-        xTaskCreatePinnedToCore(flash_test_task, "2", 2048, &ctx, 3, &appcpu_task, 1);
-    }
-
-    xSemaphoreTake(ctx.done, portMAX_DELAY);
-    if (portNUM_PROCESSORS == 2) {
-        xSemaphoreTake(ctx.done, portMAX_DELAY);
-    }
-
-    TEST_ASSERT_EQUAL(false, ctx.fail[0]);
-    if (portNUM_PROCESSORS == 2) {
-        TEST_ASSERT_EQUAL(false, ctx.fail[1]);
+    for (int i = 0; i < 4; ++i) {
+        xSemaphoreTake(done, portMAX_DELAY);
+        TEST_ASSERT_FALSE(ctx[i].fail);
     }
+    vSemaphoreDelete(done);
 }