diff --git a/components/esp_hw_support/dma/async_memcpy_gdma.c b/components/esp_hw_support/dma/async_memcpy_gdma.c index b2a2db7574..e4121fee4b 100644 --- a/components/esp_hw_support/dma/async_memcpy_gdma.c +++ b/components/esp_hw_support/dma/async_memcpy_gdma.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2020-2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2020-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -16,69 +16,49 @@ #include "esp_attr.h" #include "esp_err.h" #include "esp_private/gdma.h" +#include "esp_private/gdma_link.h" +#include "esp_private/esp_dma_utils.h" #include "esp_memory_utils.h" +#include "esp_cache.h" #include "esp_async_memcpy.h" #include "esp_async_memcpy_priv.h" -#include "esp_cache.h" -#include "hal/dma_types.h" #include "hal/cache_hal.h" #include "hal/cache_ll.h" +#include "hal/gdma_ll.h" static const char *TAG = "async_mcp.gdma"; -#ifdef CACHE_LL_L2MEM_NON_CACHE_ADDR -#define MCP_GET_NON_CACHE_ADDR(addr) ((addr) ? 
CACHE_LL_L2MEM_NON_CACHE_ADDR(addr) : 0) -#else -#define MCP_GET_NON_CACHE_ADDR(addr) (addr) -#endif - -#if SOC_AXI_GDMA_SUPPORTED -#define MCP_DMA_DESC_ALIGN 8 -typedef dma_descriptor_align8_t mcp_dma_descriptor_t; -#elif SOC_AHB_GDMA_SUPPORTED -#define MCP_DMA_DESC_ALIGN 4 -typedef dma_descriptor_align4_t mcp_dma_descriptor_t; -#else -#error "Unsupported GDMA type" -#endif +#define MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE 4095 /// @brief Transaction object for async memcpy -/// @note - GDMA requires the DMA descriptors to be 4 or 8 bytes aligned -/// @note - The DMA descriptor link list is allocated dynamically from DMA-able memory -/// @note - Because of the eof_node, the transaction object should also be allocated from DMA-able memory typedef struct async_memcpy_transaction_t { - mcp_dma_descriptor_t eof_node; // this is the DMA node which act as the EOF descriptor (RX path only) - mcp_dma_descriptor_t *tx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size - mcp_dma_descriptor_t *tx_desc_nc; // non-cacheable version of tx_desc_link - mcp_dma_descriptor_t *rx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size - mcp_dma_descriptor_t *rx_desc_nc; // non-cacheable version of rx_desc_link - intptr_t tx_start_desc_addr; // TX start descriptor address - intptr_t rx_start_desc_addr; // RX start descriptor address - void *memcpy_dst_addr; // memcpy destination address - size_t memcpy_size; // memcpy size - async_memcpy_isr_cb_t cb; // user callback - void *cb_args; // user callback args + gdma_link_list_handle_t tx_link_list; // DMA link list for TX direction + gdma_link_list_handle_t rx_link_list; // DMA link list for RX direction + dma_buffer_split_array_t rx_buf_array; // Split the destination buffer into cache aligned ones, save the splits in this array + uint8_t* stash_buffer; // Stash buffer for cache aligned buffer + async_memcpy_isr_cb_t cb; // user callback + void *cb_args; // 
user callback args STAILQ_ENTRY(async_memcpy_transaction_t) idle_queue_entry; // Entry for the idle queue STAILQ_ENTRY(async_memcpy_transaction_t) ready_queue_entry; // Entry for the ready queue } async_memcpy_transaction_t; /// @brief Context of async memcpy driver /// @note - It saves two queues, one for idle transaction objects, one for ready transaction objects -/// @note - Transaction objects are allocated from DMA-able memory /// @note - Number of transaction objects are determined by the backlog parameter typedef struct { async_memcpy_context_t parent; // Parent IO interface - size_t rx_int_mem_alignment; // DMA buffer alignment (both in size and address) for internal RX memory - size_t rx_ext_mem_alignment; // DMA buffer alignment (both in size and address) for external RX memory - size_t tx_int_mem_alignment; // DMA buffer alignment (both in size and address) for internal TX memory - size_t tx_ext_mem_alignment; // DMA buffer alignment (both in size and address) for external TX memory - size_t max_single_dma_buffer; // max DMA buffer size by a single descriptor + size_t rx_int_mem_alignment; // Required DMA buffer alignment for internal RX memory + size_t rx_ext_mem_alignment; // Required DMA buffer alignment for external RX memory + size_t tx_int_mem_alignment; // Required DMA buffer alignment for internal TX memory + size_t tx_ext_mem_alignment; // Required DMA buffer alignment for external TX memory int gdma_bus_id; // GDMA bus id (AHB, AXI, etc.) 
gdma_channel_handle_t tx_channel; // GDMA TX channel handle gdma_channel_handle_t rx_channel; // GDMA RX channel handle portMUX_TYPE spin_lock; // spin lock to avoid threads and isr from accessing the same resource simultaneously _Atomic async_memcpy_fsm_t fsm; // driver state machine, changing state should be atomic - async_memcpy_transaction_t *transaction_pool; // transaction object pool + size_t num_trans_objs; // number of transaction objects + async_memcpy_transaction_t *transaction_pool; // transaction object pool + async_memcpy_transaction_t *current_transaction; // current transaction object STAILQ_HEAD(, async_memcpy_transaction_t) idle_queue_head; // Head of the idle queue STAILQ_HEAD(, async_memcpy_transaction_t) ready_queue_head; // Head of the ready queue } async_memcpy_gdma_context_t; @@ -92,9 +72,23 @@ static esp_err_t mcp_new_etm_event(async_memcpy_context_t *ctx, async_memcpy_etm static esp_err_t mcp_gdma_destroy(async_memcpy_gdma_context_t *mcp_gdma) { + // clean up transaction pool if (mcp_gdma->transaction_pool) { + for (size_t i = 0; i < mcp_gdma->num_trans_objs; i++) { + async_memcpy_transaction_t* trans = &mcp_gdma->transaction_pool[i]; + if (trans->tx_link_list) { + gdma_del_link_list(trans->tx_link_list); + } + if (trans->rx_link_list) { + gdma_del_link_list(trans->rx_link_list); + } + if (trans->stash_buffer) { + free(trans->stash_buffer); + } + } free(mcp_gdma->transaction_pool); } + // clean up GDMA channels if (mcp_gdma->tx_channel) { gdma_disconnect(mcp_gdma->tx_channel); gdma_del_channel(mcp_gdma->tx_channel); @@ -108,19 +102,19 @@ static esp_err_t mcp_gdma_destroy(async_memcpy_gdma_context_t *mcp_gdma) } static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp, - esp_err_t (*new_channel)(const gdma_channel_alloc_config_t *, gdma_channel_handle_t *), + esp_err_t (*new_channel_func)(const gdma_channel_alloc_config_t *, gdma_channel_handle_t *), int gdma_bus_id) { esp_err_t 
ret = ESP_OK; async_memcpy_gdma_context_t *mcp_gdma = NULL; ESP_RETURN_ON_FALSE(config && mcp, ESP_ERR_INVALID_ARG, TAG, "invalid argument"); - // allocate memory of driver context from internal memory + + // allocate memory of driver context from internal memory (because it contains atomic variable) mcp_gdma = heap_caps_calloc(1, sizeof(async_memcpy_gdma_context_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); ESP_GOTO_ON_FALSE(mcp_gdma, ESP_ERR_NO_MEM, err, TAG, "no mem for driver context"); uint32_t trans_queue_len = config->backlog ? config->backlog : DEFAULT_TRANSACTION_QUEUE_LENGTH; - // allocate memory for transaction pool from internal memory because transaction structure contains DMA descriptor - mcp_gdma->transaction_pool = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, trans_queue_len, sizeof(async_memcpy_transaction_t), - MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA); + // allocate memory for transaction pool from internal memory + mcp_gdma->transaction_pool = heap_caps_calloc(trans_queue_len, sizeof(async_memcpy_transaction_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); ESP_GOTO_ON_FALSE(mcp_gdma->transaction_pool, ESP_ERR_NO_MEM, err, TAG, "no mem for transaction pool"); // create TX channel and RX channel, they should reside in the same DMA pair @@ -128,29 +122,39 @@ static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_confi .flags.reserve_sibling = 1, .direction = GDMA_CHANNEL_DIRECTION_TX, }; - ESP_GOTO_ON_ERROR(new_channel(&tx_alloc_config, &mcp_gdma->tx_channel), err, TAG, "failed to create GDMA TX channel"); + ESP_GOTO_ON_ERROR(new_channel_func(&tx_alloc_config, &mcp_gdma->tx_channel), err, TAG, "failed to alloc GDMA TX channel"); gdma_channel_alloc_config_t rx_alloc_config = { .direction = GDMA_CHANNEL_DIRECTION_RX, .sibling_chan = mcp_gdma->tx_channel, }; - ESP_GOTO_ON_ERROR(new_channel(&rx_alloc_config, &mcp_gdma->rx_channel), err, TAG, "failed to create GDMA RX channel"); + 
ESP_GOTO_ON_ERROR(new_channel_func(&rx_alloc_config, &mcp_gdma->rx_channel), err, TAG, "failed to alloc GDMA RX channel"); - // initialize GDMA channels - gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0); // get a free DMA trigger ID for memory copy + gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0); uint32_t free_m2m_id_mask = 0; gdma_get_free_m2m_trig_id_mask(mcp_gdma->tx_channel, &free_m2m_id_mask); m2m_trigger.instance_id = __builtin_ctz(free_m2m_id_mask); ESP_GOTO_ON_ERROR(gdma_connect(mcp_gdma->rx_channel, m2m_trigger), err, TAG, "GDMA rx connect failed"); ESP_GOTO_ON_ERROR(gdma_connect(mcp_gdma->tx_channel, m2m_trigger), err, TAG, "GDMA tx connect failed"); + gdma_strategy_config_t strategy_cfg = { + .owner_check = true, + .auto_update_desc = true, + }; + gdma_apply_strategy(mcp_gdma->tx_channel, &strategy_cfg); + gdma_apply_strategy(mcp_gdma->rx_channel, &strategy_cfg); + gdma_transfer_config_t transfer_cfg = { - .max_data_burst_size = config->dma_burst_size ? 
config->dma_burst_size : 16, + .max_data_burst_size = config->dma_burst_size, .access_ext_mem = true, // allow to do memory copy from/to external memory }; ESP_GOTO_ON_ERROR(gdma_config_transfer(mcp_gdma->tx_channel, &transfer_cfg), err, TAG, "config transfer for tx channel failed"); ESP_GOTO_ON_ERROR(gdma_config_transfer(mcp_gdma->rx_channel, &transfer_cfg), err, TAG, "config transfer for rx channel failed"); + // get the buffer alignment required by the GDMA channel + gdma_get_alignment_constraints(mcp_gdma->rx_channel, &mcp_gdma->rx_int_mem_alignment, &mcp_gdma->rx_ext_mem_alignment); + gdma_get_alignment_constraints(mcp_gdma->tx_channel, &mcp_gdma->tx_int_mem_alignment, &mcp_gdma->tx_ext_mem_alignment); + // register rx eof callback gdma_rx_event_callbacks_t cbs = { .on_recv_eof = mcp_gdma_rx_eof_callback, @@ -169,20 +173,14 @@ static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_confi portMUX_INITIALIZE(&mcp_gdma->spin_lock); atomic_init(&mcp_gdma->fsm, MCP_FSM_IDLE); mcp_gdma->gdma_bus_id = gdma_bus_id; + mcp_gdma->num_trans_objs = trans_queue_len; - // get the buffer alignment required by the GDMA channel - gdma_get_alignment_constraints(mcp_gdma->rx_channel, &mcp_gdma->rx_int_mem_alignment, &mcp_gdma->rx_ext_mem_alignment); - gdma_get_alignment_constraints(mcp_gdma->tx_channel, &mcp_gdma->tx_int_mem_alignment, &mcp_gdma->tx_ext_mem_alignment); - - size_t buf_align = MAX(MAX(mcp_gdma->rx_int_mem_alignment, mcp_gdma->rx_ext_mem_alignment), - MAX(mcp_gdma->tx_int_mem_alignment, mcp_gdma->tx_ext_mem_alignment)); - mcp_gdma->max_single_dma_buffer = ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, buf_align); mcp_gdma->parent.del = mcp_gdma_del; mcp_gdma->parent.memcpy = mcp_gdma_memcpy; #if SOC_GDMA_SUPPORT_ETM mcp_gdma->parent.new_etm_event = mcp_new_etm_event; #endif - // return driver object + // return base object *mcp = &mcp_gdma->parent; return ESP_OK; @@ -227,61 +225,6 @@ static esp_err_t mcp_gdma_del(async_memcpy_context_t *ctx) return 
mcp_gdma_destroy(mcp_gdma); } -static void mount_tx_buffer_to_dma(async_memcpy_transaction_t *trans, int num_desc, - uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer) -{ - mcp_dma_descriptor_t *desc_array = trans->tx_desc_link; - mcp_dma_descriptor_t *desc_nc = trans->tx_desc_nc; - uint32_t prepared_length = 0; - size_t len = buf_sz; - for (int i = 0; i < num_desc - 1; i++) { - desc_nc[i].buffer = &buf[prepared_length]; - desc_nc[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - desc_nc[i].dw0.suc_eof = 0; - desc_nc[i].dw0.size = max_single_dma_buffer; - desc_nc[i].dw0.length = max_single_dma_buffer; - desc_nc[i].next = &desc_array[i + 1]; - prepared_length += max_single_dma_buffer; - len -= max_single_dma_buffer; - } - // take special care to the EOF descriptor - desc_nc[num_desc - 1].buffer = &buf[prepared_length]; - desc_nc[num_desc - 1].next = NULL; - desc_nc[num_desc - 1].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - desc_nc[num_desc - 1].dw0.suc_eof = 1; - desc_nc[num_desc - 1].dw0.size = len; - desc_nc[num_desc - 1].dw0.length = len; -} - -static void mount_rx_buffer_to_dma(async_memcpy_transaction_t *trans, int num_desc, - uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer) -{ - mcp_dma_descriptor_t *desc_array = trans->rx_desc_link; - mcp_dma_descriptor_t *desc_nc = trans->rx_desc_nc; - mcp_dma_descriptor_t *eof_desc = &trans->eof_node; - mcp_dma_descriptor_t *eof_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(eof_desc); - uint32_t prepared_length = 0; - size_t len = buf_sz; - if (desc_array) { - assert(num_desc > 0); - for (int i = 0; i < num_desc; i++) { - desc_nc[i].buffer = &buf[prepared_length]; - desc_nc[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - desc_nc[i].dw0.size = max_single_dma_buffer; - desc_nc[i].dw0.length = max_single_dma_buffer; - desc_nc[i].next = &desc_array[i + 1]; - prepared_length += max_single_dma_buffer; - len -= max_single_dma_buffer; - } - desc_nc[num_desc - 1].next = eof_desc; - } - eof_nc->buffer = 
&buf[prepared_length]; - eof_nc->next = NULL; - eof_nc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - eof_nc->dw0.size = len; - eof_nc->dw0.length = len; -} - /// @brief help function to get one transaction from the ready queue /// @note this function is allowed to be called in ISR static async_memcpy_transaction_t *try_pop_trans_from_ready_queue(async_memcpy_gdma_context_t *mcp_gdma) @@ -306,8 +249,9 @@ static void try_start_pending_transaction(async_memcpy_gdma_context_t *mcp_gdma) trans = try_pop_trans_from_ready_queue(mcp_gdma); if (trans) { atomic_store(&mcp_gdma->fsm, MCP_FSM_RUN); - gdma_start(mcp_gdma->rx_channel, trans->rx_start_desc_addr); - gdma_start(mcp_gdma->tx_channel, trans->tx_start_desc_addr); + mcp_gdma->current_transaction = trans; + gdma_start(mcp_gdma->rx_channel, gdma_link_get_head_addr(trans->rx_link_list)); + gdma_start(mcp_gdma->tx_channel, gdma_link_get_head_addr(trans->tx_link_list)); } else { atomic_store(&mcp_gdma->fsm, MCP_FSM_IDLE); } @@ -328,6 +272,7 @@ static async_memcpy_transaction_t *try_pop_trans_from_idle_queue(async_memcpy_gd return trans; } +/// @brief Check if the address and size can meet the requirement of the DMA engine static bool check_buffer_alignment(async_memcpy_gdma_context_t *mcp_gdma, void *src, void *dst, size_t n) { bool valid = true; @@ -355,19 +300,26 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s { esp_err_t ret = ESP_OK; async_memcpy_gdma_context_t *mcp_gdma = __containerof(ctx, async_memcpy_gdma_context_t, parent); + size_t dma_link_item_alignment = 4; // buffer location check -#if SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM +#if SOC_AHB_GDMA_SUPPORTED if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AHB) { +#if !SOC_AHB_GDMA_SUPPORT_PSRAM ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AHB GDMA can only access SRAM"); +#endif // !SOC_AHB_GDMA_SUPPORT_PSRAM + dma_link_item_alignment = GDMA_LL_AHB_DESC_ALIGNMENT; } 
-#endif // SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM -#if SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM +#endif // SOC_AHB_GDMA_SUPPORTED +#if SOC_AXI_GDMA_SUPPORTED if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AXI) { - ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AXI DMA can only access SRAM"); +#if !SOC_AXI_GDMA_SUPPORT_PSRAM + ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AXI GDMA can only access SRAM"); +#endif // !SOC_AXI_GDMA_SUPPORT_PSRAM + dma_link_item_alignment = GDMA_LL_AXI_DESC_ALIGNMENT; } -#endif // SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM +#endif // SOC_AXI_GDMA_SUPPORTED // alignment check - ESP_RETURN_ON_FALSE(check_buffer_alignment(mcp_gdma, src, dst, n), ESP_ERR_INVALID_ARG, TAG, "buffer not aligned: %p -> %p, sz=%zu", src, dst, n); + ESP_RETURN_ON_FALSE(check_buffer_alignment(mcp_gdma, src, dst, n), ESP_ERR_INVALID_ARG, TAG, "address|size not aligned: %p -> %p, sz=%zu", src, dst, n); async_memcpy_transaction_t *trans = NULL; // pick one transaction node from idle queue @@ -375,51 +327,84 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s // check if we get the transaction object successfully ESP_RETURN_ON_FALSE(trans, ESP_ERR_INVALID_STATE, TAG, "no free node in the idle queue"); - // calculate how many descriptors we want - size_t max_single_dma_buffer = mcp_gdma->max_single_dma_buffer; - uint32_t num_desc_per_path = (n + max_single_dma_buffer - 1) / max_single_dma_buffer; - // allocate DMA descriptors from internal memory - trans->tx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path, sizeof(mcp_dma_descriptor_t), - MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA); - ESP_GOTO_ON_FALSE(trans->tx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors"); - trans->tx_desc_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(trans->tx_desc_link); - 
// don't have to allocate the EOF descriptor, we will use trans->eof_node as the RX EOF descriptor - if (num_desc_per_path > 1) { - trans->rx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path - 1, sizeof(mcp_dma_descriptor_t), - MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA); - ESP_GOTO_ON_FALSE(trans->rx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors"); - trans->rx_desc_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(trans->rx_desc_link); - } else { - // small copy buffer, use the trans->eof_node is sufficient - trans->rx_desc_link = NULL; - trans->rx_desc_nc = NULL; + // clean up the transaction configuration comes from the last one + if (trans->tx_link_list) { + gdma_del_link_list(trans->tx_link_list); + trans->tx_link_list = NULL; + } + if (trans->rx_link_list) { + gdma_del_link_list(trans->rx_link_list); + trans->rx_link_list = NULL; + } + if (trans->stash_buffer) { + free(trans->stash_buffer); + trans->stash_buffer = NULL; } - // (preload) mount src data to the TX descriptor - mount_tx_buffer_to_dma(trans, num_desc_per_path, src, n, max_single_dma_buffer); - // (preload) mount dst data to the RX descriptor - mount_rx_buffer_to_dma(trans, num_desc_per_path - 1, dst, n, max_single_dma_buffer); + // allocate gdma TX link + gdma_link_list_config_t tx_link_cfg = { + .buffer_alignment = esp_ptr_internal(src) ? 
mcp_gdma->tx_int_mem_alignment : mcp_gdma->tx_ext_mem_alignment, + .item_alignment = dma_link_item_alignment, + .num_items = n / MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1, + .flags = { + .check_owner = true, + .items_in_ext_mem = false, // TODO: if the memcopy size is too large, we may need to allocate the link list items from external memory + }, + }; + ESP_GOTO_ON_ERROR(gdma_new_link_list(&tx_link_cfg, &trans->tx_link_list), err, TAG, "failed to create TX link list"); + // mount the source buffer to the TX link list + gdma_buffer_mount_config_t tx_buf_mount_config[1] = { + [0] = { + .buffer = src, + .length = n, + .flags = { + .mark_eof = true, // mark the last item as EOF, so the RX channel can also receive an EOF list item + .mark_final = true, // using a singly linked list, so terminate the link here + } + } + }; + ESP_GOTO_ON_ERROR(gdma_link_mount_buffers(trans->tx_link_list, 0, tx_buf_mount_config, 1, NULL), err, TAG, "failed to mount buffer to TX link list"); - // if the data is in the cache, write back, then DMA can see the latest data + // read the cache line size of internal and external memory, we use this information to check if a given memory is behind the cache + // write back the source data if it's behind the cache + size_t int_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA); + size_t ext_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_EXT_MEM, CACHE_TYPE_DATA); bool need_write_back = false; if (esp_ptr_external_ram(src)) { - need_write_back = true; + need_write_back = ext_mem_cache_line_size > 0; } else if (esp_ptr_internal(src)) { -#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE - need_write_back = true; -#endif + need_write_back = int_mem_cache_line_size > 0; } if (need_write_back) { - esp_cache_msync(src, n, ESP_CACHE_MSYNC_FLAG_DIR_C2M); + esp_cache_msync(src, n, ESP_CACHE_MSYNC_FLAG_DIR_C2M | ESP_CACHE_MSYNC_FLAG_UNALIGNED); } + // allocate gdma RX link + gdma_link_list_config_t rx_link_cfg = { + .buffer_alignment = esp_ptr_internal(dst) ? 
mcp_gdma->rx_int_mem_alignment : mcp_gdma->rx_ext_mem_alignment, + .item_alignment = dma_link_item_alignment, + .num_items = n / MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 3, + .flags = { + .check_owner = true, + .items_in_ext_mem = false, // TODO: if the memcopy size is too large, we may need to allocate the link list items from external memory + }, + }; + ESP_GOTO_ON_ERROR(gdma_new_link_list(&rx_link_cfg, &trans->rx_link_list), err, TAG, "failed to create RX link list"); + + // if the destination buffer address is not cache line aligned, we need to split the buffer into cache line aligned ones + ESP_GOTO_ON_ERROR(esp_dma_split_rx_buffer_to_cache_aligned(dst, n, &trans->rx_buf_array, &trans->stash_buffer), + err, TAG, "failed to split RX buffer into aligned ones"); + // mount the destination buffer to the RX link list + gdma_buffer_mount_config_t rx_buf_mount_config[3] = {0}; + for (int i = 0; i < 3; i++) { + rx_buf_mount_config[i].buffer = trans->rx_buf_array.aligned_buffer[i].aligned_buffer; + rx_buf_mount_config[i].length = trans->rx_buf_array.aligned_buffer[i].length; + } + ESP_GOTO_ON_ERROR(gdma_link_mount_buffers(trans->rx_link_list, 0, rx_buf_mount_config, 3, NULL), err, TAG, "failed to mount buffers to RX link list"); + // save other transaction context trans->cb = cb_isr; trans->cb_args = cb_args; - trans->memcpy_size = n; - trans->memcpy_dst_addr = dst; // save the destination buffer address, because we may need to do data cache invalidate later - trans->tx_start_desc_addr = (intptr_t)trans->tx_desc_link; - trans->rx_start_desc_addr = trans->rx_desc_link ? 
(intptr_t)trans->rx_desc_link : (intptr_t)&trans->eof_node; portENTER_CRITICAL(&mcp_gdma->spin_lock); // insert the trans to ready queue @@ -433,14 +418,6 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s err: if (trans) { - if (trans->tx_desc_link) { - free(trans->tx_desc_link); - trans->tx_desc_link = NULL; - } - if (trans->rx_desc_link) { - free(trans->rx_desc_link); - trans->rx_desc_link = NULL; - } // return back the trans to idle queue portENTER_CRITICAL(&mcp_gdma->spin_lock); STAILQ_INSERT_TAIL(&mcp_gdma->idle_queue_head, trans, idle_queue_entry); @@ -453,26 +430,14 @@ static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_ { bool need_yield = false; async_memcpy_gdma_context_t *mcp_gdma = (async_memcpy_gdma_context_t *)user_data; - mcp_dma_descriptor_t *eof_desc = (mcp_dma_descriptor_t *)event_data->rx_eof_desc_addr; - // get the transaction object address by the EOF descriptor address - async_memcpy_transaction_t *trans = __containerof(eof_desc, async_memcpy_transaction_t, eof_node); + async_memcpy_transaction_t *trans = mcp_gdma->current_transaction; + dma_buffer_split_array_t *rx_buf_array = &trans->rx_buf_array; // switch driver state from RUN to IDLE async_memcpy_fsm_t expected_fsm = MCP_FSM_RUN; if (atomic_compare_exchange_strong(&mcp_gdma->fsm, &expected_fsm, MCP_FSM_IDLE_WAIT)) { - void *dst = trans->memcpy_dst_addr; - // if the data is in the cache, invalidate, then CPU can see the latest data - bool need_invalidate = false; - if (esp_ptr_external_ram(dst)) { - need_invalidate = true; - } else if (esp_ptr_internal(dst)) { -#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE - need_invalidate = true; -#endif - } - if (need_invalidate) { - esp_cache_msync(dst, trans->memcpy_size, ESP_CACHE_MSYNC_FLAG_DIR_M2C); - } + // merge the cache aligned buffers to the original buffer + esp_dma_merge_aligned_rx_buffers(rx_buf_array); // invoked callback registered by user async_memcpy_isr_cb_t cb = trans->cb; @@ 
-482,15 +447,6 @@ static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_ }; need_yield = cb(&mcp_gdma->parent, &e, trans->cb_args); } - // recycle descriptor memory - if (trans->tx_desc_link) { - free(trans->tx_desc_link); - trans->tx_desc_link = NULL; - } - if (trans->rx_desc_link) { - free(trans->rx_desc_link); - trans->rx_desc_link = NULL; - } trans->cb = NULL; portENTER_CRITICAL_ISR(&mcp_gdma->spin_lock); diff --git a/components/esp_hw_support/dma/esp_async_memcpy_priv.h b/components/esp_hw_support/dma/esp_async_memcpy_priv.h index bf64f83495..3b85b77a9e 100644 --- a/components/esp_hw_support/dma/esp_async_memcpy_priv.h +++ b/components/esp_hw_support/dma/esp_async_memcpy_priv.h @@ -13,8 +13,6 @@ #include "esp_async_memcpy.h" #include "soc/soc_caps.h" -#define ALIGN_DOWN(val, align) ((val) & ~((align) - 1)) - #define DEFAULT_TRANSACTION_QUEUE_LENGTH 4 #ifdef __cplusplus diff --git a/components/esp_hw_support/dma/esp_dma_utils.c b/components/esp_hw_support/dma/esp_dma_utils.c index 0c1f0df1af..d93ee97ac7 100644 --- a/components/esp_hw_support/dma/esp_dma_utils.c +++ b/components/esp_hw_support/dma/esp_dma_utils.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2023-2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2023-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -13,14 +13,118 @@ #include "esp_heap_caps.h" #include "esp_memory_utils.h" #include "esp_dma_utils.h" +#include "esp_private/esp_dma_utils.h" #include "esp_private/esp_cache_private.h" #include "soc/soc_caps.h" #include "hal/hal_utils.h" +#include "hal/cache_hal.h" +#include "hal/cache_ll.h" +#include "esp_cache.h" static const char *TAG = "dma_utils"; #define ALIGN_UP_BY(num, align) (((num) + ((align) - 1)) & ~((align) - 1)) -#define ALIGN_DOWN_BY(num, align) ((num) & (~((align) - 1))) + +esp_err_t esp_dma_split_rx_buffer_to_cache_aligned(void *rx_buffer, size_t buffer_len, dma_buffer_split_array_t *align_buf_array, 
uint8_t** ret_stash_buffer) +{ + ESP_RETURN_ON_FALSE(rx_buffer && buffer_len && align_buf_array && ret_stash_buffer, ESP_ERR_INVALID_ARG, TAG, "invalid argument"); + + // read the cache line size of internal and external memory, we also use this information to check if a given memory is behind the cache + size_t int_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA); + size_t ext_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_EXT_MEM, CACHE_TYPE_DATA); + + size_t split_line_size = 0; + if (esp_ptr_external_ram(rx_buffer)) { + split_line_size = ext_mem_cache_line_size; + } else if (esp_ptr_internal(rx_buffer)) { + split_line_size = int_mem_cache_line_size; + } + ESP_LOGV(TAG, "split_line_size:%zu", split_line_size); + + // allocate the stash buffer from internal RAM + // Note, the split_line_size can be 0, in this case, the stash_buffer is also NULL, which is fine + uint8_t* stash_buffer = heap_caps_calloc(2, split_line_size, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + ESP_RETURN_ON_FALSE(!(split_line_size && !stash_buffer), ESP_ERR_NO_MEM, TAG, "no mem for stash buffer"); + + // clear align_array to avoid garbage data + memset(align_buf_array, 0, sizeof(dma_buffer_split_array_t)); + bool need_cache_sync[3] = {false}; + + // if split_line_size is non-zero, split the buffer into head, body and tail + if (split_line_size > 0) { + // calculate head_overflow_len + size_t head_overflow_len = (uintptr_t)rx_buffer % split_line_size; + head_overflow_len = head_overflow_len ? 
split_line_size - head_overflow_len : 0; + ESP_LOGV(TAG, "head_addr:%p head_overflow_len:%zu", rx_buffer, head_overflow_len); + // calculate tail_overflow_len + size_t tail_overflow_len = ((uintptr_t)rx_buffer + buffer_len) % split_line_size; + ESP_LOGV(TAG, "tail_addr:%p tail_overflow_len:%zu", rx_buffer + buffer_len - tail_overflow_len, tail_overflow_len); + + uint8_t extra_buf_count = 0; + uint8_t* input_buffer = (uint8_t*)rx_buffer; + align_buf_array->buf.head.recovery_address = input_buffer; + align_buf_array->buf.head.aligned_buffer = stash_buffer + split_line_size * extra_buf_count++; + align_buf_array->buf.head.length = head_overflow_len; + need_cache_sync[0] = int_mem_cache_line_size > 0; + align_buf_array->buf.body.recovery_address = input_buffer + head_overflow_len; + align_buf_array->buf.body.aligned_buffer = input_buffer + head_overflow_len; + align_buf_array->buf.body.length = buffer_len - head_overflow_len - tail_overflow_len; + need_cache_sync[1] = true; + align_buf_array->buf.tail.recovery_address = input_buffer + buffer_len - tail_overflow_len; + align_buf_array->buf.tail.aligned_buffer = stash_buffer + split_line_size * extra_buf_count++; + align_buf_array->buf.tail.length = tail_overflow_len; + need_cache_sync[2] = int_mem_cache_line_size > 0; + + // special handling when input_buffer length is no more than buffer alignment + if (head_overflow_len >= buffer_len || tail_overflow_len >= buffer_len) { + align_buf_array->buf.head.length = buffer_len ; + align_buf_array->buf.body.length = 0 ; + align_buf_array->buf.tail.length = 0 ; + } + } else { + align_buf_array->buf.body.aligned_buffer = rx_buffer; + align_buf_array->buf.body.recovery_address = rx_buffer; + align_buf_array->buf.body.length = buffer_len; + need_cache_sync[1] = false; + } + + for (int i = 0; i < 3; i++) { + if (align_buf_array->aligned_buffer[i].length == 0) { + align_buf_array->aligned_buffer[i].aligned_buffer = NULL; + align_buf_array->aligned_buffer[i].recovery_address = NULL; + 
need_cache_sync[i] = false; + } + } + + // invalidate the aligned buffer if necessary + for (int i = 0; i < 3; i++) { + if (need_cache_sync[i]) { + size_t sync_size = align_buf_array->aligned_buffer[i].length; + if (sync_size < split_line_size) { + // If the size is smaller than the cache line, we need to sync the split buffer (must be cache line sized) + sync_size = split_line_size; + } + esp_cache_msync(align_buf_array->aligned_buffer[i].aligned_buffer, sync_size, ESP_CACHE_MSYNC_FLAG_DIR_M2C); + } + } + + *ret_stash_buffer = stash_buffer; + return ESP_OK; +} + +esp_err_t esp_dma_merge_aligned_rx_buffers(dma_buffer_split_array_t *align_array) +{ + ESP_RETURN_ON_FALSE_ISR(align_array, ESP_ERR_INVALID_ARG, TAG, "invalid argument"); + + // only need to copy the head and tail buffer + if (align_array->buf.head.length) { + memcpy(align_array->buf.head.recovery_address, align_array->buf.head.aligned_buffer, align_array->buf.head.length); + } + if (align_array->buf.tail.length) { + memcpy(align_array->buf.tail.recovery_address, align_array->buf.tail.aligned_buffer, align_array->buf.tail.length); + } + return ESP_OK; +} esp_err_t esp_dma_capable_malloc(size_t size, const esp_dma_mem_info_t *dma_mem_info, void **out_ptr, size_t *actual_size) { diff --git a/components/esp_hw_support/dma/gdma_link.c b/components/esp_hw_support/dma/gdma_link.c index 5381b150e1..6b141e9cda 100644 --- a/components/esp_hw_support/dma/gdma_link.c +++ b/components/esp_hw_support/dma/gdma_link.c @@ -6,14 +6,8 @@ #include #include -#include #include -#include -#include "sdkconfig.h" -#include "freertos/FreeRTOS.h" -#include "freertos/task.h" #include "soc/soc_caps.h" -#include "soc/ext_mem_defs.h" #include "esp_log.h" #include "esp_check.h" #include "esp_memory_utils.h" diff --git a/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h b/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h new file mode 100644 index 0000000000..ac89c287f4 --- /dev/null +++ 
b/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h @@ -0,0 +1,88 @@ +/* + * SPDX-FileCopyrightText: 2023-2025 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include "esp_err.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief DMA buffer information + */ +typedef struct { + void *aligned_buffer; //!< Buffer address + void *recovery_address; //!< Origin buffer address that aligned buffer should be recovered + size_t length; //!< Buffer length +} dma_buffer_split_info_t; + +/** + * @brief DMA buffer aligned array + * The array contains three parts: head, body and tail. + * Length of each part will be >=0, especially, length=0 means that there is no such part. + */ +typedef struct { + union { + struct { + dma_buffer_split_info_t head; //!< Aligned head part. Corresponds to the part of the original buffer where the head is not aligned + dma_buffer_split_info_t body; //!< Aligned body part. Corresponds to the part of the original aligned buffer + dma_buffer_split_info_t tail; //!< Aligned tail part. Corresponds to the part of the original buffer where the tail is not aligned + } buf; + dma_buffer_split_info_t aligned_buffer[3]; //!< DMA aligned buffer array, consist of `head`, `body` and `tail` + }; +} dma_buffer_split_array_t; + +/** + * @brief Split DMA RX buffer to cache aligned buffers + * + * @note After the original RX buffer is split into an array, caller should mount the buffer array to the DMA controller in scatter-gather mode. + * Don't read/write the aligned buffers before the DMA finished using them. 
+ * + * @param[in] rx_buffer The original DMA buffer used for receiving data + * @param[in] buffer_len rx_buffer length + * @param[out] align_buf_array Aligned DMA buffer array + * @param[out] ret_stash_buffer Allocated stash buffer (caller should free it after use) + * @return + * - ESP_OK: Split to aligned buffer successfully + * - ESP_ERR_INVALID_ARG: Split to aligned buffer failed because of invalid argument + * + * Brief sketch: + * cache alignment delimiter cache alignment delimiter + * │ │ + * Origin Buffer │ Origin Buffer │ + * │ │ │ │ + * │ ▼ ▼ ▼ + * │ ...---xxxxx|xxxxxxxxxxxxxxxxxxxxxxxxxxxxx|xxxxx----... + * │ │ │ │ + * │ │ ▼ │ + * │ │ |xxxxxxxxxxxxxxxxxxxxxxxxxxxxx| │ + * │ │ ▲ │ + * ▼ │ │ │ + * Aligned buffers └──► Head Body Tail ◄──────┘ + * │ │ + * ▼ ▼ + * |xxxxx......| |xxxxx......| + */ +esp_err_t esp_dma_split_rx_buffer_to_cache_aligned(void *rx_buffer, size_t buffer_len, dma_buffer_split_array_t *align_buf_array, uint8_t** ret_stash_buffer); + +/** + * @brief Merge the aligned RX buffer array back into the original buffer + * + * @note This function can be used in the ISR context.
+ * + * @param[in] align_buf_array Aligned DMA buffer array + * @return + * - ESP_OK: Merge aligned buffer to origin buffer successfully + * - ESP_ERR_INVALID_ARG: Merge aligned buffer to origin buffer failed because of invalid argument + */ +esp_err_t esp_dma_merge_aligned_rx_buffers(dma_buffer_split_array_t *align_buf_array); + +#ifdef __cplusplus +} +#endif diff --git a/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c b/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c index 7d706491e7..e2c7a370dd 100644 --- a/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c +++ b/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2021-2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2021-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -8,27 +8,21 @@ #include #include #include +#include "unity.h" +#include "soc/soc_caps.h" #include "esp_heap_caps.h" -#include "esp_rom_sys.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include "freertos/semphr.h" -#include "unity.h" #include "ccomp_timer.h" #include "esp_async_memcpy.h" -#include "soc/soc_caps.h" -#include "hal/dma_types.h" +#if SOC_GDMA_SUPPORTED +#include "hal/gdma_ll.h" +#endif #define IDF_LOG_PERFORMANCE(item, value_fmt, value, ...) 
\ printf("[Performance][%s]: " value_fmt "\n", item, value, ##__VA_ARGS__) -#define ALIGN_UP(addr, align) (((addr) + (align)-1) & ~((align)-1)) -#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1)) - -#if CONFIG_IDF_TARGET_ESP32P4 -#define TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE 1 -#endif - typedef struct { uint32_t seed; size_t buffer_size; @@ -37,8 +31,9 @@ typedef struct { uint8_t *dst_buf; uint8_t *from_addr; uint8_t *to_addr; - uint32_t align; - uint32_t offset; + uint32_t align; // alignment required by DMA engine + uint32_t src_offset; + uint32_t dst_offset; bool src_in_psram; bool dst_in_psram; } memcpy_testbench_context_t; @@ -46,7 +41,6 @@ typedef struct { static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_context) { srand(test_context->seed); - printf("allocating memory buffer...\r\n"); size_t buffer_size = test_context->buffer_size; size_t copy_size = buffer_size; uint8_t *src_buf = NULL; @@ -63,13 +57,11 @@ static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_contex TEST_ASSERT_NOT_NULL(dst_buf); // adding extra offset - from_addr = src_buf + test_context->offset; - to_addr = dst_buf; - copy_size -= test_context->offset; - copy_size &= ~(test_context->align - 1); + from_addr = src_buf + test_context->src_offset; + to_addr = dst_buf + test_context->dst_offset; + copy_size -= MAX(test_context->src_offset, test_context->dst_offset); - printf("...to copy size %zu Bytes, from @%p, to @%p\r\n", copy_size, from_addr, to_addr); - printf("fill src buffer with random data\r\n"); + printf("copy @%p --> @%p, %zu Bytes\r\n", from_addr, to_addr, copy_size); for (int i = 0; i < copy_size; i++) { from_addr[i] = rand() % 256; } @@ -82,28 +74,23 @@ static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_contex test_context->to_addr = to_addr; } -static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t copy_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr) 
+static void async_memcpy_verify_and_clear_testbench(uint32_t copy_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr) { - srand(seed); // check if source date has been copied to destination and source data not broken for (int i = 0; i < copy_size; i++) { - TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, from_addr[i], "source data doesn't match generator data"); - } - srand(seed); - for (int i = 0; i < copy_size; i++) { - TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, to_addr[i], "destination data doesn't match source data"); + if (from_addr[i] != to_addr[i]) { + printf("location[%d]:s=%d,d=%d\r\n", i, from_addr[i], to_addr[i]); + TEST_FAIL_MESSAGE("destination data doesn't match source data"); + } } free(src_buf); free(dst_buf); } -TEST_CASE("memory copy the same buffer with different content", "[async mcp]") +static void test_memory_copy_with_same_buffer(async_memcpy_handle_t driver) { - async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG(); - async_memcpy_handle_t driver = NULL; - TEST_ESP_OK(esp_async_memcpy_install(&config, &driver)); - uint8_t *sbuf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); - uint8_t *dbuf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + uint8_t *sbuf = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + uint8_t *dbuf = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); TEST_ASSERT_NOT_NULL(sbuf); TEST_ASSERT_NOT_NULL(dbuf); @@ -119,77 +106,35 @@ TEST_CASE("memory copy the same buffer with different content", "[async mcp]") } } } - TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); free(sbuf); free(dbuf); } -static void test_memory_copy_one_by_one(async_memcpy_handle_t driver) +TEST_CASE("memory copy the same buffer with different content", "[async mcp]") { - uint32_t aligned_test_buffer_size[] = {256, 512, 1024, 2048, 4096}; - memcpy_testbench_context_t 
test_context = { - .align = 4, - }; - - for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) { - test_context.buffer_size = aligned_test_buffer_size[i]; - test_context.seed = i; - test_context.offset = 0; - async_memcpy_setup_testbench(&test_context); - - TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, NULL, NULL)); - vTaskDelay(pdMS_TO_TICKS(10)); - async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, - test_context.dst_buf, test_context.from_addr, test_context.to_addr); - } - -#if !TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE - uint32_t unaligned_test_buffer_size[] = {255, 511, 1023, 2047, 4095, 5011}; - for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) { - // Test different align edge - for (int off = 0; off < 4; off++) { - test_context.buffer_size = unaligned_test_buffer_size[i]; - test_context.seed = i; - test_context.offset = off; - async_memcpy_setup_testbench(&test_context); - - TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, NULL, NULL)); - vTaskDelay(pdMS_TO_TICKS(10)); - async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, - test_context.dst_buf, test_context.from_addr, test_context.to_addr); - } - } -#endif -} - -TEST_CASE("memory copy by DMA one by one", "[async mcp]") -{ - async_memcpy_config_t config = { - .backlog = 4, - }; + async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG(); async_memcpy_handle_t driver = NULL; #if SOC_AHB_GDMA_SUPPORTED - printf("Testing memory by AHB GDMA\r\n"); + printf("Testing memcpy by AHB GDMA\r\n"); TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&config, &driver)); - test_memory_copy_one_by_one(driver); + test_memory_copy_with_same_buffer(driver); TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); #endif // 
SOC_AHB_GDMA_SUPPORTED #if SOC_AXI_GDMA_SUPPORTED - printf("Testing memory by AXI GDMA\r\n"); + printf("Testing memcpy by AXI GDMA\r\n"); TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&config, &driver)); - test_memory_copy_one_by_one(driver); + test_memory_copy_with_same_buffer(driver); TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); #endif // SOC_AXI_GDMA_SUPPORTED #if SOC_CP_DMA_SUPPORTED - printf("Testing memory by CP DMA\r\n"); + printf("Testing memcpy by CP DMA\r\n"); TEST_ESP_OK(esp_async_memcpy_install_cpdma(&config, &driver)); - test_memory_copy_one_by_one(driver); + test_memory_copy_with_same_buffer(driver); TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); #endif // SOC_CP_DMA_SUPPORTED - } static bool test_async_memcpy_cb_v1(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args) @@ -200,208 +145,235 @@ static bool test_async_memcpy_cb_v1(async_memcpy_handle_t mcp_hdl, async_memcpy_ return high_task_wakeup == pdTRUE; } -TEST_CASE("memory copy done callback", "[async mcp]") +static void test_memory_copy_blocking(async_memcpy_handle_t driver) { - async_memcpy_config_t config = { - // all default - }; - async_memcpy_handle_t driver = NULL; - TEST_ESP_OK(esp_async_memcpy_install(&config, &driver)); - - uint8_t *src_buf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); - uint8_t *dst_buf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); - TEST_ASSERT_NOT_NULL(src_buf); - TEST_ASSERT_NOT_NULL(dst_buf); - SemaphoreHandle_t sem = xSemaphoreCreateBinary(); - TEST_ESP_OK(esp_async_memcpy(driver, dst_buf, src_buf, 256, test_async_memcpy_cb_v1, sem)); - TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000))); - TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); - free(src_buf); - free(dst_buf); + const uint32_t test_buffer_size[] = {256, 512, 1024, 2048, 4096, 5012}; + memcpy_testbench_context_t test_context = { + .align = 4, + }; + for (int i = 0; 
i < sizeof(test_buffer_size) / sizeof(test_buffer_size[0]); i++) { + // Test different align edge + for (int off = 0; off < 4; off++) { + test_context.buffer_size = test_buffer_size[i]; + test_context.seed = i; + test_context.src_offset = off; + test_context.dst_offset = off; + async_memcpy_setup_testbench(&test_context); + + TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_cb_v1, sem)); + TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(10))); + async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf, + test_context.from_addr, test_context.to_addr); + } + } vSemaphoreDelete(sem); } -TEST_CASE("memory copy by DMA on the fly", "[async mcp]") +TEST_CASE("memory copy by DMA (blocking)", "[async mcp]") { - async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG(); - async_memcpy_handle_t driver = NULL; - TEST_ESP_OK(esp_async_memcpy_install(&config, &driver)); - - uint32_t aligned_test_buffer_size[] = {512, 1024, 2048, 4096, 4608}; - memcpy_testbench_context_t test_context[5] = { - [0 ... 
4] = { - .align = 4, - } + async_memcpy_config_t config = { + .backlog = 1, + .dma_burst_size = 0, }; + async_memcpy_handle_t driver = NULL; - // Aligned case - for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) { - test_context[i].seed = i; - test_context[i].buffer_size = aligned_test_buffer_size[i]; - async_memcpy_setup_testbench(&test_context[i]); - } - for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) { - TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL)); - } - for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) { - async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr); - } - -#if !TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE - uint32_t unaligned_test_buffer_size[] = {511, 1023, 2047, 4095, 5011}; - // Non-aligned case - for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) { - test_context[i].seed = i; - test_context[i].buffer_size = unaligned_test_buffer_size[i]; - test_context[i].offset = 3; - async_memcpy_setup_testbench(&test_context[i]); - } - for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) { - TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL)); - } - for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) { - async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr); - } -#endif - +#if SOC_AHB_GDMA_SUPPORTED + printf("Testing memcpy by AHB GDMA\r\n"); + 
TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&config, &driver)); + test_memory_copy_blocking(driver); TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AHB_GDMA_SUPPORTED + +#if SOC_AXI_GDMA_SUPPORTED + printf("Testing memcpy by AXI GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&config, &driver)); + test_memory_copy_blocking(driver); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AXI_GDMA_SUPPORTED + +#if SOC_CP_DMA_SUPPORTED + printf("Testing memcpy by CP DMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_cpdma(&config, &driver)); + test_memory_copy_blocking(driver); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_CP_DMA_SUPPORTED } -#define TEST_ASYNC_MEMCPY_BENCH_COUNTS (8) -static int s_count = 0; - -static IRAM_ATTR bool test_async_memcpy_isr_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args) +[[maybe_unused]] static void test_memcpy_with_dest_addr_unaligned(async_memcpy_handle_t driver, bool src_in_psram, bool dst_in_psram) { - SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args; + SemaphoreHandle_t sem = xSemaphoreCreateBinary(); + const uint32_t test_buffer_size[] = {256, 512, 1024, 2048, 4096, 5012}; + memcpy_testbench_context_t test_context = { + .align = 4, + .src_in_psram = src_in_psram, + .dst_in_psram = dst_in_psram, + }; + for (int i = 0; i < sizeof(test_buffer_size) / sizeof(test_buffer_size[0]); i++) { + // Test different alignment + for (int off = 0; off < 4; off++) { + test_context.buffer_size = test_buffer_size[i]; + test_context.seed = i; + test_context.src_offset = off; + test_context.dst_offset = off + 1; + async_memcpy_setup_testbench(&test_context); + + TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_cb_v1, sem)); + TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(10))); + async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, 
test_context.dst_buf, + test_context.from_addr, test_context.to_addr); + } + } + vSemaphoreDelete(sem); +} + +TEST_CASE("memory copy with dest address unaligned", "[async mcp]") +{ + [[maybe_unused]] async_memcpy_config_t driver_config = { + .backlog = 4, + .dma_burst_size = 32, + }; + [[maybe_unused]] async_memcpy_handle_t driver = NULL; + + +#if SOC_CP_DMA_SUPPORTED + printf("Testing memcpy by CP DMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_cpdma(&driver_config, &driver)); + test_memcpy_with_dest_addr_unaligned(driver, false, false); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_CP_DMA_SUPPORTED + +#if SOC_AHB_GDMA_SUPPORTED && !GDMA_LL_AHB_RX_BURST_NEEDS_ALIGNMENT + printf("Testing memcpy by AHB GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver)); + test_memcpy_with_dest_addr_unaligned(driver, false, false); +#if SOC_AHB_GDMA_SUPPORT_PSRAM + test_memcpy_with_dest_addr_unaligned(driver, true, true); +#endif // SOC_AHB_GDMA_SUPPORT_PSRAM + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AHB_GDMA_SUPPORTED + +#if SOC_AXI_GDMA_SUPPORTED + printf("Testing memcpy by AXI GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver)); + test_memcpy_with_dest_addr_unaligned(driver, false, false); +#if SOC_AXI_GDMA_SUPPORT_PSRAM + test_memcpy_with_dest_addr_unaligned(driver, true, true); +#endif // SOC_AXI_GDMA_SUPPORT_PSRAM + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AXI_GDMA_SUPPORTED +} + +#define TEST_ASYNC_MEMCPY_BENCH_COUNTS 16 + +typedef struct { + int perf_count; + SemaphoreHandle_t sem; +} mcp_perf_user_context_t; + +static IRAM_ATTR bool test_async_memcpy_perf_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args) +{ + mcp_perf_user_context_t* user = (mcp_perf_user_context_t*)cb_args; BaseType_t high_task_wakeup = pdFALSE; - s_count++; - if (s_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) { - xSemaphoreGiveFromISR(sem, 
&high_task_wakeup); + user->perf_count++; + if (user->perf_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) { + xSemaphoreGiveFromISR(user->sem, &high_task_wakeup); } return high_task_wakeup == pdTRUE; } -static void memcpy_performance_test(uint32_t buffer_size) +static void test_memcpy_performance(async_memcpy_handle_t driver, uint32_t buffer_size, bool src_in_psram, bool dst_in_psram) { - SemaphoreHandle_t sem = xSemaphoreCreateBinary(); - - async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG(); - config.backlog = (buffer_size / DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1) * TEST_ASYNC_MEMCPY_BENCH_COUNTS; - config.dma_burst_size = 64; // set a big burst size for performance - async_memcpy_handle_t driver = NULL; int64_t elapse_us = 0; float throughput = 0.0; - TEST_ESP_OK(esp_async_memcpy_install(&config, &driver)); - // 1. SRAM->SRAM memcpy_testbench_context_t test_context = { - .align = config.dma_burst_size, + .align = 32, // set alignment same as the burst size, to achieve the best performance .buffer_size = buffer_size, - .src_in_psram = false, - .dst_in_psram = false, + .src_in_psram = src_in_psram, + .dst_in_psram = dst_in_psram, }; async_memcpy_setup_testbench(&test_context); - s_count = 0; - ccomp_timer_start(); - for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { - TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem)); - } - // wait for done semaphore - TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000))); - elapse_us = ccomp_timer_stop(); - throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size); + + // get CPU memcpy performance ccomp_timer_start(); for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { memcpy(test_context.to_addr, test_context.from_addr, 
test_context.buffer_size); } elapse_us = ccomp_timer_stop(); throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size); - async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr); + IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: %s->%s", throughput, src_in_psram ? "PSRAM" : "SRAM", dst_in_psram ? "PSRAM" : "SRAM"); -#if SOC_AHB_GDMA_SUPPORT_PSRAM - // 2. PSRAM->PSRAM - test_context.src_in_psram = true; - test_context.dst_in_psram = true; - async_memcpy_setup_testbench(&test_context); - s_count = 0; + // get DMA memcpy performance ccomp_timer_start(); + mcp_perf_user_context_t user_context = { + .perf_count = 0, + .sem = xSemaphoreCreateBinary() + }; for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { - TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem)); + TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_perf_cb, &user_context)); } // wait for done semaphore - TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000))); + TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(user_context.sem, pdMS_TO_TICKS(1000))); elapse_us = ccomp_timer_stop(); - throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size); - ccomp_timer_start(); - for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { - memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size); - } - elapse_us = ccomp_timer_stop(); - throughput = 
(float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size); - async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr); + async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr); + throughput = (float)buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; + IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: %s->%s", throughput, src_in_psram ? "PSRAM" : "SRAM", dst_in_psram ? "PSRAM" : "SRAM"); - // 3. PSRAM->SRAM - test_context.src_in_psram = true; - test_context.dst_in_psram = false; - async_memcpy_setup_testbench(&test_context); - s_count = 0; - ccomp_timer_start(); - for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { - TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem)); - } - // wait for done semaphore - TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000))); - elapse_us = ccomp_timer_stop(); - throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size); - ccomp_timer_start(); - for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { - memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size); - } - elapse_us = ccomp_timer_stop(); - throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size); - 
async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr); + vSemaphoreDelete(user_context.sem); +} - // 4. SRAM->PSRAM - test_context.src_in_psram = false; - test_context.dst_in_psram = true; - async_memcpy_setup_testbench(&test_context); - s_count = 0; - ccomp_timer_start(); - for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { - TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem)); - } - // wait for done semaphore - TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000))); - elapse_us = ccomp_timer_stop(); - throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size); - ccomp_timer_start(); - for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) { - memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size); - } - elapse_us = ccomp_timer_stop(); - throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us; - IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size); - async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr); -#endif +TEST_CASE("memory copy performance 40KB: SRAM->SRAM", "[async mcp]") +{ + async_memcpy_config_t driver_config = { + .backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS, + .dma_burst_size = 32, + }; + async_memcpy_handle_t driver = NULL; +#if SOC_AHB_GDMA_SUPPORTED + printf("Testing memcpy by AHB GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver)); + test_memcpy_performance(driver, 
40 * 1024, false, false); TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); - vSemaphoreDelete(sem); +#endif // SOC_AHB_GDMA_SUPPORTED + +#if SOC_AXI_GDMA_SUPPORTED + printf("Testing memcpy by AXI GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver)); + test_memcpy_performance(driver, 40 * 1024, false, false); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AXI_GDMA_SUPPORTED + +#if SOC_CP_DMA_SUPPORTED + printf("Testing memcpy by CP DMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_cpdma(&driver_config, &driver)); + test_memcpy_performance(driver, 40 * 1024, false, false); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_CP_DMA_SUPPORTED } -TEST_CASE("memory copy performance test 40KB", "[async mcp]") +#if SOC_SPIRAM_SUPPORTED +TEST_CASE("memory copy performance 40KB: PSRAM->PSRAM", "[async mcp]") { - memcpy_performance_test(40 * 1024); -} + [[maybe_unused]] async_memcpy_config_t driver_config = { + .backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS, + .dma_burst_size = 32, + }; + [[maybe_unused]] async_memcpy_handle_t driver = NULL; -TEST_CASE("memory copy performance test 4KB", "[async mcp]") -{ - memcpy_performance_test(4 * 1024); +#if SOC_AHB_GDMA_SUPPORTED && SOC_AHB_GDMA_SUPPORT_PSRAM + printf("Testing memcpy by AHB GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver)); + test_memcpy_performance(driver, 40 * 1024, true, true); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AHB_GDMA_SUPPORTED && SOC_AHB_GDMA_SUPPORT_PSRAM + +#if SOC_AXI_GDMA_SUPPORTED && SOC_AXI_GDMA_SUPPORT_PSRAM + printf("Testing memcpy by AXI GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver)); + test_memcpy_performance(driver, 40 * 1024, true, true); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AXI_GDMA_SUPPORTED && SOC_AXI_GDMA_SUPPORT_PSRAM } +#endif diff --git a/components/esp_hw_support/test_apps/dma/main/test_gdma.c 
b/components/esp_hw_support/test_apps/dma/main/test_gdma.c index b5638f6f5e..780c6700ed 100644 --- a/components/esp_hw_support/test_apps/dma/main/test_gdma.c +++ b/components/esp_hw_support/test_apps/dma/main/test_gdma.c @@ -14,6 +14,7 @@ #include "esp_heap_caps.h" #include "esp_private/gdma.h" #include "esp_private/gdma_link.h" +#include "esp_private/esp_dma_utils.h" #include "hal/dma_types.h" #include "soc/soc_caps.h" #include "hal/gdma_ll.h" @@ -22,6 +23,9 @@ #include "esp_cache.h" #include "esp_memory_utils.h" +#define ALIGN_UP(num, align) (((num) + ((align) - 1)) & ~((align) - 1)) +#define ALIGN_DOWN(num, align) ((num) & ~((align) - 1)) + TEST_CASE("GDMA channel allocation", "[GDMA]") { gdma_channel_alloc_config_t channel_config = {}; @@ -147,22 +151,9 @@ TEST_CASE("GDMA channel allocation", "[GDMA]") #endif // GDMA_LL_AXI_PAIRS_PER_GROUP >= 2 } -static bool test_gdma_m2m_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data) +static void test_gdma_config_link_list(gdma_channel_handle_t tx_chan, gdma_channel_handle_t rx_chan, + gdma_link_list_handle_t *tx_link_list, gdma_link_list_handle_t *rx_link_list, size_t sram_alignment, bool dma_link_in_ext_mem) { - BaseType_t task_woken = pdFALSE; - SemaphoreHandle_t done_sem = (SemaphoreHandle_t)user_data; - xSemaphoreGiveFromISR(done_sem, &task_woken); - return task_woken == pdTRUE; -} - -static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, gdma_channel_handle_t rx_chan, bool dma_link_in_ext_mem) -{ - size_t sram_alignment = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA); - gdma_rx_event_callbacks_t rx_cbs = { - .on_recv_eof = test_gdma_m2m_rx_eof_callback, - }; - SemaphoreHandle_t done_sem = xSemaphoreCreateBinary(); - TEST_ESP_OK(gdma_register_rx_event_callbacks(rx_chan, &rx_cbs, done_sem)); gdma_strategy_config_t strategy = { .auto_update_desc = true, @@ -189,24 +180,46 @@ static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, 
gdma_channel_handl .check_owner = true, } }; - gdma_link_list_handle_t tx_link_list = NULL; - TEST_ESP_OK(gdma_new_link_list(&tx_link_list_config, &tx_link_list)); - // allocate the source buffer from SRAM - uint8_t *src_data = heap_caps_calloc(1, 128, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); - TEST_ASSERT_NOT_NULL(src_data); - + TEST_ESP_OK(gdma_new_link_list(&tx_link_list_config, tx_link_list)); // create DMA link list for RX channel gdma_link_list_config_t rx_link_list_config = { .buffer_alignment = sram_alignment, // RX buffer should be aligned to the cache line size, because we will do cache invalidate later .item_alignment = 8, // 8-byte alignment required by the AXI-GDMA - .num_items = 1, + .num_items = 5, .flags = { .items_in_ext_mem = dma_link_in_ext_mem, .check_owner = true, }, }; + TEST_ESP_OK(gdma_new_link_list(&rx_link_list_config, rx_link_list)); +} + +static bool test_gdma_m2m_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data) +{ + BaseType_t task_woken = pdFALSE; + SemaphoreHandle_t done_sem = (SemaphoreHandle_t)user_data; + xSemaphoreGiveFromISR(done_sem, &task_woken); + return task_woken == pdTRUE; +} + +static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, gdma_channel_handle_t rx_chan, bool dma_link_in_ext_mem) +{ + size_t sram_alignment = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA); + gdma_rx_event_callbacks_t rx_cbs = { + .on_recv_eof = test_gdma_m2m_rx_eof_callback, + }; + SemaphoreHandle_t done_sem = xSemaphoreCreateBinary(); + TEST_ASSERT_NOT_NULL(done_sem); + TEST_ESP_OK(gdma_register_rx_event_callbacks(rx_chan, &rx_cbs, done_sem)); + + gdma_link_list_handle_t tx_link_list = NULL; gdma_link_list_handle_t rx_link_list = NULL; - TEST_ESP_OK(gdma_new_link_list(&rx_link_list_config, &rx_link_list)); + test_gdma_config_link_list(tx_chan, rx_chan, &tx_link_list, &rx_link_list, sram_alignment, dma_link_in_ext_mem); + + // allocate the source buffer 
from SRAM + uint8_t *src_data = heap_caps_calloc(1, 128, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + TEST_ASSERT_NOT_NULL(src_data); + // allocate the destination buffer from SRAM uint8_t *dst_data = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); TEST_ASSERT_NOT_NULL(dst_data); @@ -270,7 +283,7 @@ static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, gdma_channel_handl TEST_ESP_OK(gdma_start(rx_chan, gdma_link_get_head_addr(rx_link_list))); TEST_ESP_OK(gdma_start(tx_chan, gdma_link_get_head_addr(tx_link_list))); - xSemaphoreTake(done_sem, portMAX_DELAY); + xSemaphoreTake(done_sem, 1000 / portTICK_PERIOD_MS); if (sram_alignment) { // the destination data are not reflected to the cache, so do an invalidate to ask the cache load new data @@ -344,3 +357,133 @@ TEST_CASE("GDMA M2M Mode", "[GDMA][M2M]") TEST_ESP_OK(gdma_del_channel(rx_chan)); #endif // SOC_AXI_GDMA_SUPPORTED } + +typedef struct { + SemaphoreHandle_t done_sem; + dma_buffer_split_array_t *align_array; +} test_gdma_context_t; + +static bool test_gdma_m2m_unaligned_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data) +{ + BaseType_t task_woken = pdFALSE; + test_gdma_context_t *user_ctx = (test_gdma_context_t*)user_data; + TEST_ESP_OK(esp_dma_merge_aligned_rx_buffers(user_ctx->align_array)); + xSemaphoreGiveFromISR(user_ctx->done_sem, &task_woken); + return task_woken == pdTRUE; +} + +static void test_gdma_m2m_unaligned_buffer_test(uint8_t *dst_data, uint8_t *src_data, size_t data_length, size_t offset_len) +{ + TEST_ASSERT_NOT_NULL(src_data); + TEST_ASSERT_NOT_NULL(dst_data); + gdma_channel_handle_t tx_chan = NULL; + gdma_channel_handle_t rx_chan = NULL; + gdma_channel_alloc_config_t tx_chan_alloc_config = {}; + gdma_channel_alloc_config_t rx_chan_alloc_config = {}; + tx_chan_alloc_config = (gdma_channel_alloc_config_t) { + .direction = GDMA_CHANNEL_DIRECTION_TX, + .flags.reserve_sibling = true, + }; + 
TEST_ESP_OK(gdma_new_ahb_channel(&tx_chan_alloc_config, &tx_chan)); + rx_chan_alloc_config = (gdma_channel_alloc_config_t) { + .direction = GDMA_CHANNEL_DIRECTION_RX, + .sibling_chan = tx_chan, + }; + TEST_ESP_OK(gdma_new_ahb_channel(&rx_chan_alloc_config, &rx_chan)); + size_t sram_alignment = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA); + + gdma_link_list_handle_t tx_link_list = NULL; + gdma_link_list_handle_t rx_link_list = NULL; + test_gdma_config_link_list(tx_chan, rx_chan, &tx_link_list, &rx_link_list, sram_alignment, false); + + // prepare the source data + for (int i = 0; i < data_length; i++) { + src_data[i] = i; + } + if (sram_alignment) { + // do write-back for the source data because it's in the cache + TEST_ESP_OK(esp_cache_msync(src_data, ALIGN_UP(data_length, sram_alignment), ESP_CACHE_MSYNC_FLAG_DIR_C2M)); + } + + gdma_buffer_mount_config_t tx_buf_mount_config[] = { + [0] = { + .buffer = src_data, + .length = data_length, + .flags = { + .mark_eof = true, + .mark_final = true, // using singly list, so terminate the link here + } + } + }; + TEST_ESP_OK(gdma_link_mount_buffers(tx_link_list, 0, tx_buf_mount_config, sizeof(tx_buf_mount_config) / sizeof(gdma_buffer_mount_config_t), NULL)); + + dma_buffer_split_array_t align_array = {0}; + gdma_buffer_mount_config_t rx_aligned_buf_mount_config[3] = {0}; + uint8_t* stash_buffer = NULL; + TEST_ESP_OK(esp_dma_split_rx_buffer_to_cache_aligned(dst_data + offset_len, data_length, &align_array, &stash_buffer)); + for (int i = 0; i < 3; i++) { + rx_aligned_buf_mount_config[i].buffer = align_array.aligned_buffer[i].aligned_buffer; + rx_aligned_buf_mount_config[i].length = align_array.aligned_buffer[i].length; + } + TEST_ESP_OK(gdma_link_mount_buffers(rx_link_list, 0, rx_aligned_buf_mount_config, 3, NULL)); + + gdma_rx_event_callbacks_t rx_cbs = { + .on_recv_eof = test_gdma_m2m_unaligned_rx_eof_callback, + }; + SemaphoreHandle_t done_sem = xSemaphoreCreateBinary(); + 
TEST_ASSERT_NOT_NULL(done_sem); + test_gdma_context_t user_ctx = { + .done_sem = done_sem, + .align_array = &align_array, + }; + TEST_ESP_OK(gdma_register_rx_event_callbacks(rx_chan, &rx_cbs, &user_ctx)); + + TEST_ESP_OK(gdma_start(rx_chan, gdma_link_get_head_addr(rx_link_list))); + TEST_ESP_OK(gdma_start(tx_chan, gdma_link_get_head_addr(tx_link_list))); + + xSemaphoreTake(done_sem, 1000 / portTICK_PERIOD_MS); + + // validate the destination data + for (int i = 0; i < data_length; i++) { + TEST_ASSERT_EQUAL(i % 256 , dst_data[i + offset_len]); + } + + TEST_ESP_OK(gdma_del_link_list(tx_link_list)); + TEST_ESP_OK(gdma_del_link_list(rx_link_list)); + TEST_ESP_OK(gdma_del_channel(tx_chan)); + TEST_ESP_OK(gdma_del_channel(rx_chan)); + vSemaphoreDelete(done_sem); + free(stash_buffer); +} + +TEST_CASE("GDMA M2M Unaligned RX Buffer Test", "[GDMA][M2M]") +{ + uint8_t *sbuf = heap_caps_aligned_calloc(64, 1, 10240, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + uint8_t *dbuf = heap_caps_aligned_calloc(64, 1, 10240, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + + // case buffer len less than buffer alignment + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 60, 0); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 60, 4); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 60, 2); + + // case buffer head aligned + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 246, 0); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8182, 0); + + // case buffer tail aligned + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 246, 10); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8182, 10); + + // case buffer unaligned + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 100, 10); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 10, 60); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 256, 10); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8192, 10); + + // case buffer full aligned + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 256, 0); + 
test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8192, 0); + + free(sbuf); + free(dbuf); +}