IDF master 3e370c4296

* Fix build compilation due to changes in the HW_TIMER's structs * Fix compilation warnings and errors with USB * Update USBCDC.cpp * Update CMakeLists.txt * Update HWCDC.cpp
2021-10-01 17:52:29 +03:00
parent 381e88ec75
commit 00214d5c2a
1475 changed files with 88153 additions and 49503 deletions
--- a/tools/sdk/esp32/include/esp-face/face_detection/include/fd_forward.h
+++ b/tools/sdk/esp32/include/esp-face/face_detection/include/fd_forward.h
@ -1,103 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "image_util.h"
-#include "dl_lib_matrix3d.h"
-#include "mtmn.h"
-
-    typedef enum
-    {
-        FAST = 0,            /*!< fast resize type */         
-        NORMAL = 1,          /*!< normal resize type */ 
-    } mtmn_resize_type;
-
-    typedef struct
-    {
-        float score;          /*!< score threshold used to filter candidates */
-        float nms;            /*!< threshold used by the NMS process */
-        int candidate_number; /*!< limit on the number of candidates for each net */
-    } threshold_config_t;
-
-    typedef struct
-    {
-        int w;                        /*!< net width */
-        int h;                        /*!< net height */
-        threshold_config_t threshold; /*!< threshold of net */
-    } net_config_t;
-
-    typedef struct
-    {
-        float min_face;                 /*!< The minimum size of a detectable face */
-        float pyramid;                  /*!< The scale of the gradient scaling for the input images */
-        int pyramid_times;              /*!< The number of pyramid resizing steps */
-        threshold_config_t p_threshold; /*!< The thresholds for P-Net. For details, see the definition of threshold_config_t */
-        threshold_config_t r_threshold; /*!< The thresholds for R-Net. For details, see the definition of threshold_config_t */
-        threshold_config_t o_threshold; /*!< The thresholds for O-Net. For details, see the definition of threshold_config_t */
-        mtmn_resize_type type;          /*!< The image resize type. The 'pyramid' setting has no effect when 'type' == FAST. */
-    } mtmn_config_t;
-
-    /**
-     * @brief Build the default MTMN model configuration
-     *
-     * Defaults: FAST resize, min_face 80, pyramid 0.707 with 4 pyramid steps.
-     * Thresholds (score / nms / candidate_number): P-Net 0.6 / 0.7 / 20,
-     * R-Net 0.7 / 0.7 / 10, O-Net 0.7 / 0.7 / 1.
-     *
-     * @return mtmn_config_t      MTMN configuration filled with the defaults
-     */
-    static inline mtmn_config_t mtmn_init_config()
-    {
-        /* One aggregate per net: {score, nms, candidate_number}. */
-        const threshold_config_t pnet_defaults = {0.6, 0.7, 20};
-        const threshold_config_t rnet_defaults = {0.7, 0.7, 10};
-        const threshold_config_t onet_defaults = {0.7, 0.7, 1};
-
-        mtmn_config_t defaults;
-        defaults.min_face = 80;
-        defaults.pyramid = 0.707;
-        defaults.pyramid_times = 4;
-        defaults.p_threshold = pnet_defaults;
-        defaults.r_threshold = rnet_defaults;
-        defaults.o_threshold = onet_defaults;
-        defaults.type = FAST;
-
-        return defaults;
-    }
-
-    /**
-     * @brief Do MTMN face detection and return box and landmark information.
-     * 
-     * @param image_matrix      Image matrix, rgb888 format
-     * @param config            Configuration of MTMN i.e. score threshold, nms threshold, candidate number threshold, pyramid, min face size
-     * @return box_array_t*     A list of boxes and score.
-     */
-    box_array_t *face_detect(dl_matrix3du_t *image_matrix,
-                             mtmn_config_t *config);
-
-#if __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/face_recognition/include/fr_flash.h
+++ b/tools/sdk/esp32/include/esp-face/face_recognition/include/fr_flash.h
@ -1,82 +0,0 @@
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "fr_forward.h"
-
-#define FR_FLASH_TYPE   32
-#define FR_FLASH_SUBTYPE   32
-#define FR_FLASH_PARTITION_NAME "fr"
-#define FR_FLASH_INFO_FLAG 12138
-        
-     /**
-     * @brief Produce face id according to the input aligned face, and save it to dest_id and flash.
-     * 
-     * @param l                     Face id list
-     * @param aligned_face          An aligned face
-     * @return -2                   Flash partition not found
-     * @return 0                    Enrollment finish
-     * @return >=1                  The left piece of aligned faces should be input
-     */
-    int8_t enroll_face_id_to_flash(face_id_list *l,
-            dl_matrix3du_t *aligned_face);
-
-    /**
-     * @brief Produce face id according to the input aligned face, and save the id-name pairs to dest_id and flash.
-     * 
-     * @param l                     Face id list
-     * @param new_id                A face id that needs to be enrolled
-     * @param name                  name corresponding to face id
-     * @return -2                   Flash partition not found
-     * @return 0                    Enrollment finish
-     * @return >=1                  The left piece of aligned faces should be input
-     */
-    int8_t enroll_face_id_to_flash_with_name(face_id_name_list *l,
-            dl_matrix3d_t *new_id,
-            char *name);
-    /**
-     * @brief Read the enrolled face IDs from the flash.
-     * 
-     * @param l                     Face id list
-     * @return int8_t               The number of IDs remaining in flash
-     */
-    int8_t read_face_id_from_flash(face_id_list *l);
-    
-    /**
-     * @brief Read the enrolled face IDs and their corresponding names from the flash.
-     * 
-     * @param l                     Face id list
-     * @return int8_t               The number of IDs remaining in flash
-     */
-    int8_t read_face_id_from_flash_with_name(face_id_name_list *l);
-
-    /**
-     * @brief Delete the enrolled face IDs in the flash.
-     * 
-     * @param l                     Face id list
-     * @return int8_t               The number of IDs remaining in flash
-     */
-    int8_t delete_face_id_in_flash(face_id_list *l);
-
-    /**
-     * @brief Delete the enrolled face ID corresponding to the name in the flash.
-     * 
-     * @param l                     Face id list
-     * @param name                  The name that needs to be deleted
-     * @return int8_t               The number of IDs remaining in flash
-     */
-    int8_t delete_face_id_in_flash_with_name(face_id_name_list *l, char *name);
-
-    /**
-     * @brief Delete all the enrolled face ID and name pairs in the flash.
-     * 
-     * @param l                     Face id list
-     */
-    void delete_face_all_in_flash_with_name(face_id_name_list *l);
-
-#if __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/face_recognition/include/fr_forward.h
+++ b/tools/sdk/esp32/include/esp-face/face_recognition/include/fr_forward.h
@ -1,194 +0,0 @@
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "image_util.h"
-#include "dl_lib_matrix3d.h"
-#include "frmn.h"
-
-#define FACE_WIDTH 56
-#define FACE_HEIGHT 56
-#define FACE_ID_SIZE 512
-#define FACE_REC_THRESHOLD 0.55
-
-#define LEFT_EYE_X 0
-#define LEFT_EYE_Y 1
-#define RIGHT_EYE_X 6
-#define RIGHT_EYE_Y 7
-#define NOSE_X 4
-#define NOSE_Y 5
-#define LEFT_MOUTH_X 2
-#define LEFT_MOUTH_Y 3
-#define RIGHT_MOUTH_X 8
-#define RIGHT_MOUTH_Y 9
-
-#define EYE_DIST_SET 16.5f
-#define NOSE_EYE_RATIO_THRES_MIN 0.49f
-#define NOSE_EYE_RATIO_THRES_MAX 2.04f
-
-
-#define ENROLL_NAME_LEN 16
-    typedef struct tag_face_id_node
-    {
-        struct tag_face_id_node *next;           /*!< next face id node */
-        char id_name[ENROLL_NAME_LEN];           /*!< name corresponding to the face id  */
-        dl_matrix3d_t *id_vec;                   /*!< face id */
-    } face_id_node;
-
-    typedef struct
-    {
-        face_id_node *head;    /*!< head pointer of the id list */
-        face_id_node *tail;    /*!< tail pointer of the id list */
-        uint8_t count;         /*!< number of enrolled ids */
-        uint8_t confirm_times; /*!< images needed for one enrolling */
-    } face_id_name_list;
-
-    typedef struct
-    {
-        uint8_t head;            /*!< head index of the id list */
-        uint8_t tail;            /*!< tail index of the id list */
-        uint8_t count;           /*!< number of enrolled ids */
-        uint8_t size;            /*!< max len of id list */
-        uint8_t confirm_times;   /*!< images needed for one enrolling */
-        dl_matrix3d_t **id_list; /*!< stores face id vectors */
-    } face_id_list;
-
-    /**
-     * @brief Initialize face id list.
-     * 
-     * @param l                    Face id list
-     * @param size                 Size of list, one list contains one vector
-     * @param confirm_times        Enroll times for one id
-     */
-    void face_id_init(face_id_list *l, uint8_t size, uint8_t confirm_times);
-
-    /**
-     * @brief Initialize face id list with name.
-     * 
-     * @param l                    Face id list
-     * @param size                 Size of list, one list contains one vector
-     * @param confirm_times        Enroll times for one id
-     */
-    void face_id_name_init(face_id_name_list *l, uint8_t size, uint8_t confirm_times);
-
-    /**
-     * @brief Alloc memory for aligned face.
-     * 
-     * @return dl_matrix3du_t*          Size: 1xFACE_WIDTHxFACE_HEIGHTx3
-     */
-    dl_matrix3du_t *aligned_face_alloc();
-
-    /**@{*/
-    /**
-     * @brief Align detected face to average face according to landmark.
-     * 
-     * @param onet_boxes        Output of MTMN with box and landmark
-     * @param src               Image matrix, rgb888 format
-     * @param dest              Output image
-     * @return ESP_OK           Input face is good for recognition
-     * @return ESP_FAIL         Input face is not good for recognition
-     */
-    int8_t align_face_rot(box_array_t *onet_boxes,
-                      dl_matrix3du_t *src,
-                      dl_matrix3du_t *dest);
-    
-    int8_t align_face_sim(box_array_t *onet_boxes,
-                   dl_matrix3du_t *src,
-                   dl_matrix3du_t *dest);
-    
-    /*
-     * Default alignment entry point: forwards to align_face_sim().
-     * Declared static inline because a plain `inline` definition in a C header
-     * provides no external definition, so a call the compiler chooses not to
-     * inline would fail to link.
-     */
-    static inline int8_t align_face(box_array_t *onet_boxes,
-                                    dl_matrix3du_t *src,
-                                    dl_matrix3du_t *dest)
-    {
-        return align_face_sim(onet_boxes, src, dest);
-    }
-    /**@}*/
-
-    /**
-     * @brief Run the face recognition model to get the face feature
-     * 
-     * @param aligned_face      A 56x56x3 image, the variable need to do align_face first
-     * @return face_id          A 512 vector, size (1, 1, 1, 512)
-     */
-    dl_matrix3d_t *get_face_id(dl_matrix3du_t *aligned_face);
-
-    /**
-     * @brief Add src_id to dest_id
-     * 
-     * @param dest_id       Face id after accumulation
-     * @param src_id        Face id to be added
-     */
-    void add_face_id(dl_matrix3d_t *dest_id,
-                     dl_matrix3d_t *src_id);
-
-    /**
-     * @brief Match face with the id_list, and return matched_id.
-     *
-     * @param l                     An ID list 
-     * @param aligned_face          An aligned face
-     * @return int8_t               Matched face id
-     */
-    int8_t recognize_face(face_id_list *l, dl_matrix3du_t *aligned_face);
-
-    /**
-     * @brief Match face id with the id_list, and return matched face id node.
-     * 
-     * @param l 
-     * @param face_id 
-     * @return face_id_node* 
-     */
-    face_id_node *recognize_face_with_name(face_id_name_list *l, dl_matrix3d_t *face_id);
-    
-    /**
-     * @brief Produce face id according to the input aligned face, and save it to dest_id.
-     * 
-     * @param l                     Face id list
-     * @param aligned_face          An aligned face
-     * @note                        This function has no enroll_confirm_times parameter; the confirm times for each face id enrollment are presumably taken from l->confirm_times (TODO confirm)
-     * @return -1                   Wrong input enroll_confirm_times
-     * @return 0                    Enrollment finished
-     * @return >=1                  The number of aligned faces that still need to be input
-     */
-    int8_t enroll_face(face_id_list *l, dl_matrix3du_t *aligned_face);
-
-    /**
-     * @brief Produce face id according to the input aligned face, and save the id-name pairs to dest_id
-     * 
-     * @param l                      Face id list with name 
-     * @param new_id                 A face id that need to be enrolled
-     * @param name                   name corresponding to the face id  
-     * @return int8_t                The left piece of aligned faces should be input
-     */
-    int8_t enroll_face_with_name(face_id_name_list *l,
-                                 dl_matrix3d_t *new_id,
-                                 char *name);
-
-    /**
-     * @brief Delete the enrolled face IDs
-     * 
-     * @param l            Face id list
-     * @return uint8_t     The number of IDs remaining in face id list
-     */
-    uint8_t delete_face(face_id_list *l);
-
-    /**
-     * @brief Delete the enrolled face IDs and associated names
-     * 
-     * @param l             Face id list
-     * @param name          The name that needs to be deleted
-     * @return int8_t       The number of IDs remaining in face id list
-     */
-    int8_t delete_face_with_name(face_id_name_list *l, char *name);
-    
-    /**
-     * @brief               Delete all the enrolled face ID and name pairs
-     * 
-     * @param l             Face id list with names
-     */
-    void delete_face_all_with_name(face_id_name_list *l);
-#if __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/image_util/include/esp_image.hpp
+++ b/tools/sdk/esp32/include/esp-face/image_util/include/esp_image.hpp
@ -1,344 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-#include <stdint.h>
-#include <math.h>
-#include <assert.h>
-
-#ifdef __cplusplus
-}
-#endif
-
-typedef enum
-{
-    IMAGE_RESIZE_BILINEAR = 0, /*!< Resize image by bilinear interpolation of four pixels */
-    IMAGE_RESIZE_MEAN = 1,     /*!< Resize image by taking the mean of four pixels */
-    IMAGE_RESIZE_NEAREST = 2   /*!< Resize image by taking the nearest pixel */
-} image_resize_t;
-
-template <class T>
-class Image
-{
-public:
-    /**
-     * @brief Unpack one RGB565 pixel into three channel values
-     *
-     * Red is taken from bits 3..7 of the input word, green from bits 0..2 and
-     * 13..15, and blue from bits 8..12; each channel ends up in the upper bits
-     * of an 8-bit value.
-     * NOTE(review): this bit layout looks like a byte-swapped RGB565 word - confirm against the image source.
-     *
-     * @param input     Pixel value in RGB565
-     * @param output    Receives red, green and blue in output[0], output[1] and output[2]
-     */
-    static inline void pixel_rgb565_to_rgb888(uint16_t input, T *output)
-    {
-        output[0] = input & 0xF8;                                    //red
-        output[1] = ((input & 0x7) << 5) | ((input & 0xE000) >> 11); //green
-        output[2] = (input & 0x1F00) >> 5;                           //blue
-    }
-
-    /**
-     * @brief Resize a RGB565 image to a RGB888 image
-     *
-     * @param dst_image     Destination image buffer
-     * @param y_start       First destination row of the resized region
-     * @param y_end         End destination row of the resized region
-     * @param x_start       First destination column of the resized region
-     * @param x_end         End destination column of the resized region
-     * @param channel       Number of channels of the image
-     * @param src_image     Source image buffer (RGB565, one uint16_t per pixel)
-     * @param src_h         Height of the source image
-     * @param src_w         Width of the source image
-     * @param dst_w         Width of the destination image
-     * @param shift_left    Number of bits the result is shifted left
-     * @param type          Resize method, see image_resize_t
-     */
-    static void resize_to_rgb888(T *dst_image, int y_start, int y_end, int x_start, int x_end, int channel, uint16_t *src_image, int src_h, int src_w, int dst_w, int shift_left, image_resize_t type);
-
-    /**
-     * @brief Resize a RGB888 image to a RGB888 image
-     *
-     * @param dst_image     Destination image buffer
-     * @param y_start       First destination row of the resized region
-     * @param y_end         End destination row of the resized region
-     * @param x_start       First destination column of the resized region
-     * @param x_end         End destination column of the resized region
-     * @param channel       Number of channels of the image
-     * @param src_image     Source image buffer (one uint8_t per channel value)
-     * @param src_h         Height of the source image
-     * @param src_w         Width of the source image
-     * @param dst_w         Width of the destination image
-     * @param shift_left    Number of bits the result is shifted left
-     * @param type          Resize method, see image_resize_t
-     */
-    static void resize_to_rgb888(T *dst_image, int y_start, int y_end, int x_start, int x_end, int channel, uint8_t *src_image, int src_h, int src_w, int dst_w, int shift_left, image_resize_t type);
-    // static void resize_to_rgb565(uint16_t *dst_image, int y_start, int y_end, int x_start, int x_end, int channel, uint16_t *src_image, int src_h, int src_w, int dst_w, int shift_left, image_resize_t type);
-    // static void resize_to_rgb565(uint16_t *dst_image, int y_start, int y_end, int x_start, int x_end, int channel, uint8_t *src_image, int src_h, int src_w, int dst_w, int shift_left, image_resize_t type);
-};
-
-/**
- * @brief Resize a RGB565 source image into a region of a 3-channel destination image
- *
- * Destination rows y_start..y_end-1 and columns x_start..x_end-1 are written.
- * Each source pixel is unpacked with pixel_rgb565_to_rgb888() before filtering.
- * Every result is shifted left by shift_left bits when shift_left is positive,
- * otherwise shifted right by -shift_left bits. In IMAGE_RESIZE_MEAN mode
- * shift_left is first reduced by 2, which divides the four-sample sum by 4.
- *
- * @param dst_image     Destination image buffer
- * @param y_start       First destination row to write
- * @param y_end         Destination row at which writing stops (not written)
- * @param x_start       First destination column to write
- * @param x_end         Destination column at which writing stops (not written)
- * @param channel       Number of channels; must be 3 (asserted)
- * @param src_image     Source image buffer, one RGB565 word per pixel
- * @param src_h         Height of the source image
- * @param src_w         Width of the source image
- * @param dst_w         Width of the destination image
- * @param shift_left    Shift applied to every stored value (see above)
- * @param type          Resize method; unknown values leave dst_image untouched
- */
-template <class T>
-void Image<T>::resize_to_rgb888(T *dst_image, int y_start, int y_end, int x_start, int x_end, int channel, uint16_t *src_image, int src_h, int src_w, int dst_w, int shift_left, image_resize_t type)
-{
-    assert(channel == 3);
-    float scale_y = (float)src_h / (y_end - y_start);
-    float scale_x = (float)src_w / (x_end - x_start);
-    int temp[13]; // temp[0..11]: four unpacked RGB pixels, temp[12]: scratch for the blended value
-
-    switch (type)
-    {
-    case IMAGE_RESIZE_BILINEAR:
-        for (size_t y = y_start; y < y_end; y++)
-        {
-            float ratio_y[2];
-            ratio_y[0] = (float)((y + 0.5) * scale_y - 0.5); // y
-            int src_y = (int)ratio_y[0];                     // y1
-            ratio_y[0] -= src_y;                             // y - y1
-
-            if (src_y < 0)
-            {
-                ratio_y[0] = 0;
-                src_y = 0;
-            }
-            // Clamp so that row src_y + 1 is still inside the source image.
-            if (src_y > src_h - 2)
-            {
-                ratio_y[0] = 0;
-                src_y = src_h - 2;
-            }
-            ratio_y[1] = 1 - ratio_y[0]; // y2 - y
-
-            int _dst_i = y * dst_w;
-
-            int _src_row_0 = src_y * src_w;
-            int _src_row_1 = _src_row_0 + src_w;
-
-            for (size_t x = x_start; x < x_end; x++)
-            {
-                float ratio_x[2];
-                ratio_x[0] = (float)((x + 0.5) * scale_x - 0.5); // x
-                int src_x = (int)ratio_x[0];                     // x1
-                ratio_x[0] -= src_x;                             // x - x1
-
-                if (src_x < 0)
-                {
-                    ratio_x[0] = 0;
-                    src_x = 0;
-                }
-                // Clamp so that column src_x + 1 is still inside the source image.
-                if (src_x > src_w - 2)
-                {
-                    ratio_x[0] = 0;
-                    src_x = src_w - 2;
-                }
-                ratio_x[1] = 1 - ratio_x[0]; // x2 - x
-
-                int dst_i = (_dst_i + x) * channel;
-
-                int src_row_0 = _src_row_0 + src_x;
-                int src_row_1 = _src_row_1 + src_x;
-
-                // Unpack the 2x2 neighbourhood into temp[0..2], [3..5], [6..8], [9..11].
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_0], temp);
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_0 + 1], temp + 3);
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_1], temp + 6);
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_1 + 1], temp + 9);
-
-                for (int c = 0; c < channel; c++)
-                {
-                    // All four terms come from the unpacked buffer; the bottom-right
-                    // neighbour lives at temp[9 + c], not in the packed source array.
-                    temp[12] = round(temp[c] * ratio_x[1] * ratio_y[1] + temp[channel + c] * ratio_x[0] * ratio_y[1] + temp[channel + channel + c] * ratio_x[1] * ratio_y[0] + temp[channel + channel + channel + c] * ratio_x[0] * ratio_y[0]);
-                    dst_image[dst_i + c] = (shift_left > 0) ? (temp[12] << shift_left) : (temp[12] >> -shift_left);
-                }
-            }
-        }
-        break;
-
-    case IMAGE_RESIZE_MEAN:
-        shift_left -= 2; // the sum of four samples is divided by 4 through the shift
-        // NOTE(review): unlike the bilinear branch, the source indices are not clamped here - confirm callers keep src_row_1 + 1 in range.
-        for (int y = y_start; y < y_end; y++)
-        {
-            int _dst_i = y * dst_w;
-
-            float _src_row_0 = rintf(y * scale_y) * src_w;
-            float _src_row_1 = _src_row_0 + src_w;
-
-            for (int x = x_start; x < x_end; x++)
-            {
-                int dst_i = (_dst_i + x) * channel;
-
-                int src_row_0 = (_src_row_0 + rintf(x * scale_x));
-                int src_row_1 = (_src_row_1 + rintf(x * scale_x));
-
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_0], temp);
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_0 + 1], temp + 3);
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_1], temp + 6);
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_row_1 + 1], temp + 9);
-
-                // Per-channel sums are computed once so both shift directions use the same samples.
-                int sum_r = temp[0] + temp[3] + temp[6] + temp[9];
-                int sum_g = temp[1] + temp[4] + temp[7] + temp[10];
-                int sum_b = temp[2] + temp[5] + temp[8] + temp[11];
-
-                dst_image[dst_i] = (shift_left > 0) ? (sum_r << shift_left) : (sum_r >> -shift_left);
-                dst_image[dst_i + 1] = (shift_left > 0) ? (sum_g << shift_left) : (sum_g >> -shift_left);
-                dst_image[dst_i + 2] = (shift_left > 0) ? (sum_b << shift_left) : (sum_b >> -shift_left);
-            }
-        }
-
-        break;
-
-    case IMAGE_RESIZE_NEAREST:
-        for (size_t y = y_start; y < y_end; y++)
-        {
-            int _dst_i = y * dst_w;
-            float _src_i = rintf(y * scale_y) * src_w;
-
-            for (size_t x = x_start; x < x_end; x++)
-            {
-                int dst_i = (_dst_i + x) * channel;
-                int src_i = _src_i + rintf(x * scale_x);
-
-                Image<int>::pixel_rgb565_to_rgb888(src_image[src_i], temp);
-
-                dst_image[dst_i] = (shift_left > 0) ? (temp[0] << shift_left) : (temp[0] >> -shift_left);
-                dst_image[dst_i + 1] = (shift_left > 0) ? (temp[1] << shift_left) : (temp[1] >> -shift_left);
-                dst_image[dst_i + 2] = (shift_left > 0) ? (temp[2] << shift_left) : (temp[2] >> -shift_left);
-            }
-        }
-        break;
-
-    default:
-        break;
-    }
-}
-
-template <class T> // uint8_t-source overload: resizes an interleaved, channel-values-per-pixel image into dst_image
-void Image<T>::resize_to_rgb888(T *dst_image, int y_start, int y_end, int x_start, int x_end, int channel, uint8_t *src_image, int src_h, int src_w, int dst_w, int shift_left, image_resize_t type)
-{ // writes destination rows y_start..y_end-1 and columns x_start..x_end-1; unknown type values write nothing
-    float scale_y = (float)src_h / (y_end - y_start); // source rows per destination row
-    float scale_x = (float)src_w / (x_end - x_start); // source columns per destination column
-    int temp; // scratch for the filtered value before it is shifted and stored
-
-    switch (type)
-    {
-    case IMAGE_RESIZE_BILINEAR:
-        for (size_t y = y_start; y < y_end; y++)
-        {
-            float ratio_y[2];
-            ratio_y[0] = (float)((y + 0.5) * scale_y - 0.5); // y
-            int src_y = (int)ratio_y[0];                     // y1
-            ratio_y[0] -= src_y;                             // y - y1
-
-            if (src_y < 0)
-            {
-                ratio_y[0] = 0;
-                src_y = 0;
-            }
-            if (src_y > src_h - 2) // clamp so that row src_y + 1 stays inside the source image
-            {
-                ratio_y[0] = 0;
-                src_y = src_h - 2;
-            }
-            ratio_y[1] = 1 - ratio_y[0]; // y2 - y
-
-            int _dst_i = y * dst_w; // pixel index of the start of destination row y
-
-            int _src_row_0 = src_y * src_w; // pixel index of the start of the upper source row
-            int _src_row_1 = _src_row_0 + src_w; // pixel index of the start of the lower source row
-
-            for (size_t x = x_start; x < x_end; x++)
-            {
-                float ratio_x[2];
-                ratio_x[0] = (float)((x + 0.5) * scale_x - 0.5); // x
-                int src_x = (int)ratio_x[0];                     // x1
-                ratio_x[0] -= src_x;                             // x - x1
-
-                if (src_x < 0)
-                {
-                    ratio_x[0] = 0;
-                    src_x = 0;
-                }
-                if (src_x > src_w - 2) // clamp so that column src_x + 1 stays inside the source image
-                {
-                    ratio_x[0] = 0;
-                    src_x = src_w - 2;
-                }
-                ratio_x[1] = 1 - ratio_x[0]; // x2 - x
-
-                int dst_i = (_dst_i + x) * channel; // element index of the first channel of the destination pixel
-
-                int src_row_0 = (_src_row_0 + src_x) * channel; // element index of the upper-left source pixel
-                int src_row_1 = (_src_row_1 + src_x) * channel; // element index of the lower-left source pixel
-
-                for (int c = 0; c < channel; c++)
-                {
-                    temp = round(src_image[src_row_0 + c] * ratio_x[1] * ratio_y[1] + src_image[src_row_0 + channel + c] * ratio_x[0] * ratio_y[1] + src_image[src_row_1 + c] * ratio_x[1] * ratio_y[0] + src_image[src_row_1 + channel + c] * ratio_x[0] * ratio_y[0]); // weighted blend of the 2x2 neighbourhood
-                    dst_image[dst_i + c] = (shift_left > 0) ? (temp << shift_left) : (temp >> -shift_left); // positive shift_left shifts left, otherwise right by -shift_left
-                }
-            }
-        }
-        break;
-
-    case IMAGE_RESIZE_MEAN:
-        shift_left -= 2; // the four-sample sum below is divided by 4 through the shift
-
-        for (size_t y = y_start; y < y_end; y++)
-        {
-            int _dst_i = y * dst_w;
-
-            float _src_row_0 = rintf(y * scale_y) * src_w; // NOTE(review): no clamping here, unlike the bilinear branch - confirm callers keep the lower row in range
-            float _src_row_1 = _src_row_0 + src_w;
-
-            for (size_t x = x_start; x < x_end; x++)
-            {
-                int dst_i = (_dst_i + x) * channel;
-
-                int src_row_0 = (_src_row_0 + rintf(x * scale_x)) * channel;
-                int src_row_1 = (_src_row_1 + rintf(x * scale_x)) * channel;
-
-                for (size_t c = 0; c < channel; c++)
-                {
-                    temp = (int)src_image[src_row_0 + c] + (int)src_image[src_row_0 + channel + c] + (int)src_image[src_row_1 + c] + (int)src_image[src_row_1 + channel + c]; // sum of the 2x2 neighbourhood
-                    dst_image[dst_i + c] = (shift_left > 0) ? (temp << shift_left) : (temp >> -shift_left);
-                }
-            }
-        }
-        break;
-
-    case IMAGE_RESIZE_NEAREST:
-        for (size_t y = y_start; y < y_end; y++)
-        {
-            int _dst_i = y * dst_w;
-            float _src_i = rintf(y * scale_y) * src_w; // rounded source row, as a pixel index
-
-            for (size_t x = x_start; x < x_end; x++)
-            {
-                int dst_i = (_dst_i + x) * channel;
-                int src_i = (_src_i + rintf(x * scale_x)) * channel; // element index of the chosen source pixel
-
-                for (size_t c = 0; c < channel; c++)
-                {
-                    dst_image[dst_i + c] = (shift_left > 0) ? ((T)src_image[src_i + c] << shift_left) : ((T)src_image[src_i + c] >> -shift_left);
-                }
-            }
-        }
-        break;
-
-    default:
-        break;
-    }
-}
--- a/tools/sdk/esp32/include/esp-face/image_util/include/image_util.h
+++ b/tools/sdk/esp32/include/esp-face/image_util/include/image_util.h
@ -1,548 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include <stdint.h>
-#include <math.h>
-#include "mtmn.h"
-
-#define LANDMARKS_NUM (10)
-
-#define MAX_VALID_COUNT_PER_IMAGE (30)
-
-#define DL_IMAGE_MIN(A, B) ((A) < (B) ? (A) : (B))
-#define DL_IMAGE_MAX(A, B) ((A) < (B) ? (B) : (A))
-
-#define RGB565_MASK_RED 0xF800
-#define RGB565_MASK_GREEN 0x07E0
-#define RGB565_MASK_BLUE 0x001F
-
-    typedef enum
-    {
-        BINARY, /*!< binary */
-    } en_threshold_mode;
-
-    typedef struct
-    {
-        fptp_t landmark_p[LANDMARKS_NUM]; /*!< landmark struct */
-    } landmark_t;
-
-    typedef struct
-    {
-        fptp_t box_p[4]; /*!< box struct */
-    } box_t;
-
-    typedef struct tag_box_list
-    {
-        uint8_t *category;    /*!< The category of the corresponding box */
-        fptp_t *score;        /*!< The confidence score of the class corresponding to the box */
-        box_t *box;           /*!< Anchor boxes or predicted boxes*/
-        landmark_t *landmark; /*!< The landmarks corresponding to the box */
-        int len;              /*!< The num of the boxes */
-    } box_array_t;
-
-    typedef struct tag_image_box
-    {
-        struct tag_image_box *next; /*!< Next image_box_t */
-        uint8_t category;
-        fptp_t score;        /*!< The confidence score of the class corresponding to the box */
-        box_t box;           /*!< Anchor boxes or predicted boxes */
-        box_t offset;        /*!< The predicted anchor-based offset */
-        landmark_t landmark; /*!< The landmarks corresponding to the box */
-    } image_box_t;
-
-    typedef struct tag_image_list
-    {
-        image_box_t *head;        /*!< The current head of the image_list */
-        image_box_t *origin_head; /*!< The original head of the image_list */
-        int len;                  /*!< Length of the image_list */
-    } image_list_t;
-
-    /**
-     * @brief Get the width and height of the box.
-     * 
-     * @param box         Input box
-     * @param w           Resulting width of the box
-     * @param h           Resulting height of the box
-     */
-    static inline void image_get_width_and_height(box_t *box, float *w, float *h)
-    {
-        *w = box->box_p[2] - box->box_p[0] + 1;
-        *h = box->box_p[3] - box->box_p[1] + 1;
-    }
-
-    /**
-     * @brief Get the area of the box.
-     * 
-     * @param box         Input box
-     * @param area        Resulting area of the box 
-     */
-    static inline void image_get_area(box_t *box, float *area)
-    {
-        float w, h;
-        image_get_width_and_height(box, &w, &h);
-        *area = w * h;
-    }
-
-    /**
-     * @brief calibrate the boxes by offset
-     * 
-     * @param image_list         Input boxes
-     * @param image_height       Height of the original image
-     * @param image_width        Width of the original image
-     */
-    static inline void image_calibrate_by_offset(image_list_t *image_list, int image_height, int image_width)
-    {
-        for (image_box_t *head = image_list->head; head; head = head->next)
-        {
-            float w, h;
-            image_get_width_and_height(&(head->box), &w, &h);
-            head->box.box_p[0] = DL_IMAGE_MAX(0, head->box.box_p[0] + head->offset.box_p[0] * w);
-            head->box.box_p[1] = DL_IMAGE_MAX(0, head->box.box_p[1] + head->offset.box_p[1] * w);
-            head->box.box_p[2] += head->offset.box_p[2] * w;
-            if (head->box.box_p[2] > image_width)
-            {
-                head->box.box_p[2] = image_width - 1;
-                head->box.box_p[0] = image_width - w;
-            }
-            head->box.box_p[3] += head->offset.box_p[3] * h;
-            if (head->box.box_p[3] > image_height)
-            {
-                head->box.box_p[3] = image_height - 1;
-                head->box.box_p[1] = image_height - h;
-            }
-        }
-    }
-
-    /**
-     * @brief calibrate the landmarks
-     * 
-     * @param image_list     Input landmarks
-     */
-    static inline void image_landmark_calibrate(image_list_t *image_list)
-    {
-        for (image_box_t *head = image_list->head; head; head = head->next)
-        {
-            float w, h;
-            image_get_width_and_height(&(head->box), &w, &h);
-            head->landmark.landmark_p[0] = head->box.box_p[0] + head->landmark.landmark_p[0] * w;
-            head->landmark.landmark_p[1] = head->box.box_p[1] + head->landmark.landmark_p[1] * h;
-
-            head->landmark.landmark_p[2] = head->box.box_p[0] + head->landmark.landmark_p[2] * w;
-            head->landmark.landmark_p[3] = head->box.box_p[1] + head->landmark.landmark_p[3] * h;
-
-            head->landmark.landmark_p[4] = head->box.box_p[0] + head->landmark.landmark_p[4] * w;
-            head->landmark.landmark_p[5] = head->box.box_p[1] + head->landmark.landmark_p[5] * h;
-
-            head->landmark.landmark_p[6] = head->box.box_p[0] + head->landmark.landmark_p[6] * w;
-            head->landmark.landmark_p[7] = head->box.box_p[1] + head->landmark.landmark_p[7] * h;
-
-            head->landmark.landmark_p[8] = head->box.box_p[0] + head->landmark.landmark_p[8] * w;
-            head->landmark.landmark_p[9] = head->box.box_p[1] + head->landmark.landmark_p[9] * h;
-        }
-    }
-
-    /**
-     * @brief Convert a rectangular box into a square box
-     * 
-     * @param boxes    Input box 
-     * @param width    Width of the orignal image
-     * @param height   height of the orignal image
-     */
-    static inline void image_rect2sqr(box_array_t *boxes, int width, int height)
-    {
-        for (int i = 0; i < boxes->len; i++)
-        {
-            box_t *box = &(boxes->box[i]);
-
-            int x1 = round(box->box_p[0]);
-            int y1 = round(box->box_p[1]);
-            int x2 = round(box->box_p[2]);
-            int y2 = round(box->box_p[3]);
-
-            int w = x2 - x1 + 1;
-            int h = y2 - y1 + 1;
-            int l = DL_IMAGE_MAX(w, h);
-
-            box->box_p[0] = DL_IMAGE_MAX(round(DL_IMAGE_MAX(0, x1) + 0.5 * (w - l)), 0);
-            box->box_p[1] = DL_IMAGE_MAX(round(DL_IMAGE_MAX(0, y1) + 0.5 * (h - l)), 0);
-
-            box->box_p[2] = box->box_p[0] + l - 1;
-            if (box->box_p[2] > width)
-            {
-                box->box_p[2] = width - 1;
-                box->box_p[0] = width - l;
-            }
-            box->box_p[3] = box->box_p[1] + l - 1;
-            if (box->box_p[3] > height)
-            {
-                box->box_p[3] = height - 1;
-                box->box_p[1] = height - l;
-            }
-        }
-    }
-
-    /**@{*/
-    /**
-     * @brief Convert RGB565 image to RGB888 image
-     * 
-     * @param in    Input RGB565 image
-     * @param dst   Resulting RGB888 image
-     */
-    static inline void rgb565_to_888(uint16_t in, uint8_t *dst)
-    { /*{{{*/
-        in = (in & 0xFF) << 8 | (in & 0xFF00) >> 8;
-        dst[2] = (in & RGB565_MASK_BLUE) << 3;  // blue
-        dst[1] = (in & RGB565_MASK_GREEN) >> 3; // green
-        dst[0] = (in & RGB565_MASK_RED) >> 8;   // red
-
-        // dst[0] = (in & 0x1F00) >> 5;
-        // dst[1] = ((in & 0x7) << 5) | ((in & 0xE000) >> 11);
-        // dst[2] = in & 0xF8;
-    } /*}}}*/
-
-    static inline void rgb565_to_888_q16(uint16_t in, int16_t *dst)
-    { /*{{{*/
-        in = (in & 0xFF) << 8 | (in & 0xFF00) >> 8;
-        dst[2] = (in & RGB565_MASK_BLUE) << 3;  // blue
-        dst[1] = (in & RGB565_MASK_GREEN) >> 3; // green
-        dst[0] = (in & RGB565_MASK_RED) >> 8;   // red
-
-        // dst[0] = (in & 0x1F00) >> 5;
-        // dst[1] = ((in & 0x7) << 5) | ((in & 0xE000) >> 11);
-        // dst[2] = in & 0xF8;
-    } /*}}}*/
-    /**@}*/
-
-    /**
-     * @brief Convert RGB888 image to RGB565 image
-     * 
-     * @param in      Resulting RGB565 image
-     * @param r       The red channel of the Input RGB888 image 
-     * @param g       The green channel of the Input RGB888 image 
-     * @param b       The blue channel of the Input RGB888 image
-     */
-    static inline void rgb888_to_565(uint16_t *in, uint8_t r, uint8_t g, uint8_t b)
-    { /*{{{*/
-        uint16_t rgb565 = 0;
-        rgb565 = ((r >> 3) << 11);
-        rgb565 |= ((g >> 2) << 5);
-        rgb565 |= (b >> 3);
-        rgb565 = (rgb565 & 0xFF) << 8 | (rgb565 & 0xFF00) >> 8;
-        *in = rgb565;
-    } /*}}}*/
-
-    /**
-     * @brief Filter out the resulting boxes whose confidence score is lower than the threshold and convert the boxes to the actual boxes on the original image.((x, y, w, h) -> (x1, y1, x2, y2))
-     * 
-     * @param score                    Confidence score of the boxes
-     * @param offset                   The predicted anchor-based offset
-     * @param landmark                 The landmarks corresponding to the box
-     * @param width                    Height of the original image
-     * @param height                   Width of the original image
-     * @param anchor_number            Anchor number of the detection output feature map 
-     * @param anchors_size             The anchor size
-     * @param score_threshold          Threshold of the confidence score
-     * @param stride 
-     * @param resized_height_scale 
-     * @param resized_width_scale 
-     * @param do_regression 
-     * @return image_list_t* 
-     */
-    image_list_t *image_get_valid_boxes(fptp_t *score,
-                                        fptp_t *offset,
-                                        fptp_t *landmark,
-                                        int width,
-                                        int height,
-                                        int anchor_number,
-                                        int *anchors_size,
-                                        fptp_t score_threshold,
-                                        int stride,
-                                        fptp_t resized_height_scale,
-                                        fptp_t resized_width_scale,
-                                        bool do_regression);
-    /**
-     * @brief Sort the resulting box lists by their confidence score.
-     * 
-     * @param image_sorted_list     The sorted box list.
-     * @param insert_list           The box list that have not been sorted.
-     */
-    void image_sort_insert_by_score(image_list_t *image_sorted_list, const image_list_t *insert_list);
-
-    /**
-     * @brief Run NMS algorithm 
-     * 
-     * @param image_list         The input boxes list
-     * @param nms_threshold      NMS threshold
-     * @param same_area          The flag of boxes with same area
-     */
-    void image_nms_process(image_list_t *image_list, fptp_t nms_threshold, int same_area);
-
-    /**
-     * @brief Resize an image to half size 
-     * 
-     * @param dimage      The output image
-     * @param dw          Width of the output image
-     * @param dh          Height of the output image
-     * @param dc          Channel of the output image
-     * @param simage      Source image
-     * @param sw          Width of the source image
-     * @param sc          Channel of the source image
-     */
-    void image_zoom_in_twice(uint8_t *dimage,
-                             int dw,
-                             int dh,
-                             int dc,
-                             uint8_t *simage,
-                             int sw,
-                             int sc);
-
-    /**
-     * @brief Resize the image in RGB888 format via bilinear interpolation
-     * 
-     * @param dst_image    The output image
-     * @param src_image    Source image
-     * @param dst_w        Width of the output image
-     * @param dst_h        Height of the output image
-     * @param dst_c        Channel of the output image
-     * @param src_w        Width of the source image
-     * @param src_h        Height of the source image
-     */
-    void image_resize_linear(uint8_t *dst_image, uint8_t *src_image, int dst_w, int dst_h, int dst_c, int src_w, int src_h);
-
-    /**
-     * @brief Crop， rotate and zoom the image in RGB888 format, 
-     * 
-     * @param corp_image       The output image
-     * @param src_image        Source image
-     * @param rotate_angle     Rotate angle
-     * @param ratio            scaling ratio
-     * @param center           Center of rotation
-     */
-    void image_cropper(uint8_t *corp_image, uint8_t *src_image, int dst_w, int dst_h, int dst_c, int src_w, int src_h, float rotate_angle, float ratio, float *center);
-
-    /**
-     * @brief Convert the rgb565 image to the rgb888 image   
-     * 
-     * @param m       The output rgb888 image
-     * @param bmp     The input rgb565 image
-     * @param count   Total pixels of the rgb565 image
-     */
-    void image_rgb565_to_888(uint8_t *m, uint16_t *bmp, int count);
-
-    /**
-     * @brief Convert the rgb888 image to the rgb565 image
-     * 
-     * @param bmp     The output rgb565 image
-     * @param m       The input rgb888 image
-     * @param count   Total pixels of the rgb565 image
-     */
-    void image_rgb888_to_565(uint16_t *bmp, uint8_t *m, int count);
-
-    /**
-     * @brief draw rectangle on the rgb565 image
-     * 
-     * @param buf     Input image
-     * @param boxes   Rectangle Boxes
-     * @param width   Width of the input image
-     */
-    void draw_rectangle_rgb565(uint16_t *buf, box_array_t *boxes, int width);
-
-    /**
-     * @brief draw rectangle on the rgb888 image
-     * 
-     * @param buf     Input image
-     * @param boxes   Rectangle Boxes
-     * @param width   Width of the input image
-     */
-    void draw_rectangle_rgb888(uint8_t *buf, box_array_t *boxes, int width);
-
-    /**
-     * @brief Get the pixel difference of two images
-     * 
-     * @param dst       The output pixel difference
-     * @param src1      Input image 1
-     * @param src2      Input image 2
-     * @param count     Total pixels of the input image
-     */
-    void image_abs_diff(uint8_t *dst, uint8_t *src1, uint8_t *src2, int count);
-
-    /**
-     * @brief Binarize an image to 0 and value. 
-     * 
-     * @param dst           The output image
-     * @param src           Source image
-     * @param threshold     Threshold of binarization
-     * @param value         The value of binarization
-     * @param count         Total pixels of the input image
-     * @param mode          Threshold mode
-     */
-    void image_threshold(uint8_t *dst, uint8_t *src, int threshold, int value, int count, en_threshold_mode mode);
-
-    /**
-     * @brief Erode the image
-     * 
-     * @param dst          The output image
-     * @param src          Source image
-     * @param src_w        Width of the source image
-     * @param src_h        Height of the source image
-     * @param src_c        Channel of the source image
-     */
-    void image_erode(uint8_t *dst, uint8_t *src, int src_w, int src_h, int src_c);
-
-    typedef float matrixType;
-    typedef struct
-    {
-        int w;              /*!< width */
-        int h;              /*!< height */
-        matrixType **array; /*!< array */
-    } Matrix;
-
-    /**
-     * @brief Allocate a 2d matrix
-     * 
-     * @param h                Height of matrix
-     * @param w                Width of matrix
-     * @return Matrix*         2d matrix
-     */
-    Matrix *matrix_alloc(int h, int w);
-
-    /**
-     * @brief Free a 2d matrix
-     * 
-     * @param m    2d matrix 
-     */
-    void matrix_free(Matrix *m);
-
-    /**
-     * @brief Get the similarity matrix of similarity transformation
-     * 
-     * @param srcx          Source x coordinates
-     * @param srcy          Source y coordinates
-     * @param dstx          Destination x coordinates
-     * @param dsty          Destination y coordinates
-     * @param num           The number of the coordinates
-     * @return Matrix*      The resulting transformation matrix
-     */
-    Matrix *get_similarity_matrix(float *srcx, float *srcy, float *dstx, float *dsty, int num);
-
-    /**
-     * @brief Get the affine transformation matrix
-     * 
-     * @param srcx          Source x coordinates
-     * @param srcy          Source y coordinates
-     * @param dstx          Destination x coordinates
-     * @param dsty          Destination y coordinates
-     * @return Matrix*      The resulting transformation matrix
-     */
-    Matrix *get_affine_transform(float *srcx, float *srcy, float *dstx, float *dsty);
-
-    /**
-     * @brief Applies an affine transformation to an image
-     * 
-     * @param img           Input image
-     * @param crop          Dst output image that has the size dsize and the same type as src
-     * @param M             Affine transformation matrix
-     */
-    void warp_affine(dl_matrix3du_t *img, dl_matrix3du_t *crop, Matrix *M);
-
-    /**
-     * @brief Resize the image in RGB888 format via bilinear interpolation, and quantify the output image
-     * 
-     * @param dst_image            Quantized output image
-     * @param src_image            Input image 
-     * @param dst_w                Width of the output image 
-     * @param dst_h                Height of the output image 
-     * @param dst_c                Channel of the output image
-     * @param src_w                Width of the input image 
-     * @param src_h                Height of the input image
-     * @param shift                Shift parameter of quantization.
-     */
-    void image_resize_linear_q(qtp_t *dst_image, uint8_t *src_image, int dst_w, int dst_h, int dst_c, int src_w, int src_h, int shift);
-
-    /**
-     * @brief Preprocess the input image of object detection model. The process is like this: resize -> normalize -> quantify
-     * 
-     * @param image                 Input image, RGB888 format.
-     * @param input_w               Width of the input image.
-     * @param input_h               Height of the input image.
-     * @param target_size           Target size of the model input image.
-     * @param exponent              Exponent of the quantized model input image.
-     * @param process_mode          Process mode. 0: resize with padding to keep height == width. 1: resize without padding, height != width.  
-     * @return dl_matrix3dq_t*      The resulting preprocessed image.
-     */
-    dl_matrix3dq_t *image_resize_normalize_quantize(uint8_t *image, int input_w, int input_h, int target_size, int exponent, int process_mode);
-
-    /**
-     * @brief Resize the image in RGB565 format via mean neighbour interpolation, and quantify the output image
-     * 
-     * @param dimage            Quantized output image. 
-     * @param simage            Input image.  
-     * @param dw                Width of the allocated output image memory.
-     * @param dc                Channel of the allocated output image memory.
-     * @param sw                Width of the input image. 
-     * @param sh                Height of the input image. 
-     * @param tw                Target width of the output image.
-     * @param th                Target height of the output image.
-     * @param shift             Shift parameter of quantization.
-     */
-    void image_resize_shift_fast(qtp_t *dimage, uint16_t *simage, int dw, int dc, int sw, int sh, int tw, int th, int shift);
-
-    /**
-     * @brief Resize the image in RGB565 format via nearest neighbour interpolation, and quantify the output image
-     * 
-     * @param dimage            Quantized output image. 
-     * @param simage            Input image.  
-     * @param dw                Width of the allocated output image memory.
-     * @param dc                Channel of the allocated output image memory.
-     * @param sw                Width of the input image. 
-     * @param sh                Height of the input image. 
-     * @param tw                Target width of the output image.
-     * @param th                Target height of the output image.
-     * @param shift             Shift parameter of quantization.
-     */
-    void image_resize_nearest_shift(qtp_t *dimage, uint16_t *simage, int dw, int dc, int sw, int sh, int tw, int th, int shift);
-
-    /**
-     * @brief Crop the image in RGB565 format and resize it to target size, then quantify the output image 
-     * 
-     * @param dimage            Quantized output image. 
-     * @param simage            Input image.
-     * @param dw                Target size of the output image.
-     * @param sw                Width of the input image. 
-     * @param sh                Height of the input image. 
-     * @param x1                The x coordinate of the upper left corner of the cropped area
-     * @param y1                The y coordinate of the upper left corner of the cropped area
-     * @param x2                The x coordinate of the lower right corner of the cropped area
-     * @param y2                The y coordinate of the lower right corner of the cropped area
-     * @param shift             Shift parameter of quantization.
-     */
-    void image_crop_shift_fast(qtp_t *dimage, uint16_t *simage, int dw, int sw, int sh, int x1, int y1, int x2, int y2, int shift);
-
-#ifdef __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/include/detect/dl_detect_define.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/detect/dl_detect_define.hpp
@ -0,0 +1,17 @@
+#pragma once
+
+#include <vector>
+
+namespace dl
+{
+    namespace detect
+    {
+        typedef struct // one detection result: category, score, box corners and keypoints
+        {
+            int category;              /*!< category index */
+            float score;               /*!< score of the box */
+            std::vector<int> box;      /*!< [left_up_x, left_up_y, right_down_x, right_down_y] */
+            std::vector<int> keypoint; /*!< [x1, y1, x2, y2, ...] */
+        } result_t;
+    } // namespace detect
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/dl_define.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/dl_define.hpp
@ -0,0 +1,90 @@
+#pragma once
+
+#include <climits>
+#include "sdkconfig.h"
+
+#define DL_LOG_LATENCY_UNIT 0  /*!< - 1: latency in cycles */
+                               /*!< - 0: latency in microseconds (us) */
+#define DL_LOG_NN_LATENCY 0    /*!< - 1: print the latency of each part of the nn */
+                               /*!< - 0: mute */
+#define DL_LOG_LAYER_LATENCY 0 /*!< - 1: print the latency of each part of a layer */
+                               /*!< - 0: mute */
+
+#if CONFIG_SPIRAM_SUPPORT || CONFIG_ESP32_SPIRAM_SUPPORT || CONFIG_ESP32S3_SPIRAM_SUPPORT
+#define DL_SPIRAM_SUPPORT 1 /*!< at least one of the SPIRAM sdkconfig options above is enabled */
+#else
+#define DL_SPIRAM_SUPPORT 0 /*!< none of the SPIRAM sdkconfig options above is enabled */
+#endif
+
+#if CONFIG_IDF_TARGET_ESP32
+#define CONFIG_DEFAULT_ASSIGN_CORE \
+    {                              \
+    } // TODO: once the multi-core task is finished, change the default to 0,1
+#elif CONFIG_IDF_TARGET_ESP32S2
+#define CONFIG_DEFAULT_ASSIGN_CORE \
+    {                              \
+    }
+#elif CONFIG_IDF_TARGET_ESP32S3
+#define CONFIG_DEFAULT_ASSIGN_CORE \
+    {                              \
+    } // TODO: once the multi-core task is finished, change the default to 0,1
+#elif CONFIG_IDF_TARGET_ESP32C3
+#define CONFIG_DEFAULT_ASSIGN_CORE \
+    {                              \
+    }
+#else
+#define CONFIG_DEFAULT_ASSIGN_CORE \
+    {                              \
+    }
+#endif // every branch currently expands to an empty { }
+
+#define DL_Q16_MIN (-32768) /*!< lowest value of a signed 16-bit integer */
+#define DL_Q16_MAX (32767) /*!< highest value of a signed 16-bit integer */
+#define DL_Q8_MIN (-128) /*!< lowest value of a signed 8-bit integer */
+#define DL_Q8_MAX (127) /*!< highest value of a signed 8-bit integer */
+
+#ifndef DL_MAX
+#define DL_MAX(x, y) (((x) < (y)) ? (y) : (x)) /*!< larger of x and y; the selected argument is evaluated twice */
+#endif
+
+#ifndef DL_MIN
+#define DL_MIN(x, y) (((x) < (y)) ? (x) : (y)) /*!< smaller of x and y; the selected argument is evaluated twice */
+#endif
+
+#ifndef DL_CLIP
+#define DL_CLIP(x, low, high) (((x) < (low)) ? (low) : (((x) > (high)) ? (high) : (x))) /*!< x limited to [low, high]; fully parenthesised so it is safe inside larger expressions */
+#endif
+
+#ifndef DL_ABS
+#define DL_ABS(x) ((x) < 0 ? (-(x)) : (x)) /*!< absolute value of x; x is evaluated twice */
+#endif
+
+#ifndef DL_RIGHT_SHIFT
+#define DL_RIGHT_SHIFT(x, shift) (((shift) > 0) ? ((x) >> (shift)) : ((x) << -(shift))) /*!< x >> shift when shift > 0, otherwise x << -shift */
+#endif
+
+#ifndef DL_LEFT_SHIFT
+#define DL_LEFT_SHIFT(x, shift) (((shift) > 0) ? ((x) << (shift)) : ((x) >> -(shift))) /*!< x << shift when shift > 0, otherwise x >> -shift */
+#endif
+
+namespace dl
+{
+    typedef enum
+    {
+        Linear,    /*!< Linear */
+        ReLU,      /*!< ReLU */
+        LeakyReLU, /*!< LeakyReLU */
+        PReLU,     /*!< PReLU */
+        // TODO: Sigmoid
+        // TODO: Softmax
+        // TODO: TanH
+        // TODO: ReLU6
+    } activation_type_t;
+
+    typedef enum
+    {
+        PADDING_VALID,     /*!< no padding */
+        PADDING_SAME,      /*!< SAME in TensorFlow style */
+        PADDING_SAME_MXNET /*!< SAME in MXNET style */
+    } padding_type_t;
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/image/dl_image.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/image/dl_image.hpp
@ -0,0 +1,380 @@
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+#include <vector>
+#include "dl_define.hpp"
+#include "dl_variable.hpp"
+#include "dl_math_matrix.hpp"
+
+namespace dl
+{
+    namespace image
+    {
+        /**
+         * @brief Selector for the resize method passed to crop_and_resize().
+         */
+        typedef enum
+        {
+            IMAGE_RESIZE_BILINEAR = 0, /*!< Resize image by bilinear interpolation of four pixels */
+            IMAGE_RESIZE_MEAN = 1,     /*!< Resize image by taking the mean of four pixels */
+            IMAGE_RESIZE_NEAREST = 2   /*!< Resize image by taking the nearest pixel */
+        } resize_type_t;
+
+        /**
+         * @brief Convert an RGB888 pixel to a gray value.
+         *
+         * Computes (red * 38 + green * 75 + blue * 15) >> 7 and limits the result to [0, 255].
+         *
+         * @param red   red value
+         * @param green green value
+         * @param blue  blue value
+         * @return gray value
+         */
+        inline uint8_t convert_pixel_rgb888_to_gray(int red, int green, int blue)
+        {
+            const int weighted_sum = red * 38 + green * 75 + blue * 15;
+            const int gray = weighted_sum >> 7;
+
+            // Clamp to the uint8_t range before the implicit narrowing on return.
+            if (gray < 0)
+            {
+                return 0;
+            }
+            if (gray > 255)
+            {
+                return 255;
+            }
+            return gray;
+        }
+
+        /**
+         * @brief Convert an RGB565 pixel to three 8-bit channel values.
+         *
+         * Bit usage of `input`, as read by this function:
+         *  - bits 7..3   -> red
+         *  - bits 12..8  -> blue
+         *  - bits 2..0 and 15..13 -> upper and lower three bits of green
+         * NOTE(review): this matches RGB565 with its two bytes swapped — confirm against the pixel producer.
+         * The low bits of each output channel are left at zero.
+         *
+         * @tparam T supports all integer types
+         * @param input  pixel value in RGB565
+         * @param output three elements are written, in the order blue, green, red
+         */
+        template <typename T>
+        inline void convert_pixel_rgb565_to_rgb888(uint16_t input, T *output)
+        {
+            const int pixel = input;
+
+            const int blue = (pixel >> 5) & 0xF8;
+            const int green = ((pixel << 5) & 0xE0) | ((pixel >> 11) & 0x1C);
+            const int red = pixel & 0xF8;
+
+            output[0] = blue;
+            output[1] = green;
+            output[2] = red;
+        }
+
+        /**
+         * @brief Convert an RGB565 pixel to a gray value.
+         *
+         * The three channels are extracted from `input` with the same bit layout as
+         * convert_pixel_rgb565_to_rgb888() and then passed to convert_pixel_rgb888_to_gray().
+         *
+         * @param input pixel value in RGB565
+         * @return pixel value in Gray
+         */
+        inline uint8_t convert_pixel_rgb565_to_gray(uint16_t input)
+        {
+            const int pixel = input;
+
+            const int red = pixel & 0xF8;
+            const int green = ((pixel << 5) & 0xE0) | ((pixel >> 11) & 0x1C);
+            const int blue = (pixel >> 5) & 0xF8;
+
+            return convert_pixel_rgb888_to_gray(red, green, blue);
+        }
+
+        /**
+         * @brief Crop a patch from the source image, resize it, and store it in the destination image.
+         * If the cropping box extends beyond the source image, the destination image is padded with edge values.
+         * 
+         * The outer rectangle is the entire output image.
+         * The inner rectangle is where the resized image will be stored.
+         * In other words, this function can pad the image while resizing it.
+         *               ___________________________(dst_w)__________________
+         *              |         ___________________________                |
+         *              |        |(x_start, y_start)         |               | 
+         *              |        |                           |               | 
+         *              |        |                           |               | 
+         *       (dst_h)|        |                           |               | 
+         *              |        |                           |               | 
+         *              |        |                           |               | 
+         *              |        |___________________________|(x_end, y_end) | 
+         *              |____________________________________________________| 
+         * 
+         * @tparam T supports all integer types
+         * @param dst_image     pointer of destination(output) image
+         * @param dst_width     destination image width
+         * @param dst_channel   destination image channel number
+         * @param dst_y_start   start y of resized image in destination image
+         * @param dst_y_end     end y of resized image in destination image
+         * @param dst_x_start   start x of resized image in destination image
+         * @param dst_x_end     end x of resized image in destination image
+         * @param src_image     pointer of source image
+         * @param src_height    source image height
+         * @param src_width     source image width
+         * @param src_channel   source image channel
+         * @param src_y_start   start y of resized image in source image
+         * @param src_y_end     end y of resized image in source image
+         * @param src_x_start   start x of resized image in source image
+         * @param src_x_end     end x of resized image in source image
+         * @param resize_type   one of IMAGE_RESIZE_BILINEAR or IMAGE_RESIZE_MEAN or IMAGE_RESIZE_NEAREST
+         * @param shift_left    bit left shift number implemented on output
+         */
+        template <typename T>
+        void crop_and_resize(T *dst_image,
+                             int dst_width,
+                             int dst_channel,
+                             int dst_y_start, int dst_y_end,
+                             int dst_x_start, int dst_x_end,
+                             uint16_t *src_image,
+                             int src_height,
+                             int src_width,
+                             int src_channel,
+                             int src_y_start, int src_y_end,
+                             int src_x_start, int src_x_end,
+                             resize_type_t resize_type = IMAGE_RESIZE_NEAREST,
+                             int shift_left = 0);
+
+        /**
+         * @brief Crop a patch from the source image, resize it, and store it in the destination image.
+         * If the cropping box extends beyond the source image, the destination image is padded with edge values.
+         * 
+         * The outer rectangle is the entire output image.
+         * The inner rectangle is where the resized image will be stored.
+         * In other words, this function can pad the image while resizing it.
+         *               ___________________________(dst_w)__________________
+         *              |         ___________________________                |
+         *              |        |(x_start, y_start)         |               | 
+         *              |        |                           |               | 
+         *              |        |                           |               | 
+         *       (dst_h)|        |                           |               | 
+         *              |        |                           |               | 
+         *              |        |                           |               | 
+         *              |        |___________________________|(x_end, y_end) | 
+         *              |____________________________________________________| 
+         * 
+         * @tparam T supports all integer types
+         * @param dst_image     pointer of destination(output) image
+         * @param dst_width     destination image width
+         * @param dst_channel   destination image channel number
+         * @param dst_y_start   start y of resized image in destination image
+         * @param dst_y_end     end y of resized image in destination image
+         * @param dst_x_start   start x of resized image in destination image
+         * @param dst_x_end     end x of resized image in destination image
+         * @param src_image     pointer of source image
+         * @param src_height    source image height
+         * @param src_width     source image width
+         * @param src_channel   source image channel
+         * @param src_y_start   start y of resized image in source image
+         * @param src_y_end     end y of resized image in source image
+         * @param src_x_start   start x of resized image in source image
+         * @param src_x_end     end x of resized image in source image
+         * @param resize_type   one of IMAGE_RESIZE_BILINEAR or IMAGE_RESIZE_MEAN or IMAGE_RESIZE_NEAREST
+         * @param shift_left    bit left shift number implemented on output
+         */
+        template <typename T>
+        void crop_and_resize(T *dst_image,
+                             int dst_width,
+                             int dst_channel,
+                             int dst_y_start, int dst_y_end,
+                             int dst_x_start, int dst_x_end,
+                             uint8_t *src_image,
+                             int src_height,
+                             int src_width,
+                             int src_channel,
+                             int src_y_start, int src_y_end,
+                             int src_x_start, int src_x_end,
+                             resize_type_t resize_type = IMAGE_RESIZE_NEAREST,
+                             int shift_left = 0);
+
+        /**
+         * @brief Draw a filled rectangle on RGB888 image.
+         * 
+         * @param image        pointer of input image
+         * @param image_height height of input image
+         * @param image_width  width of input image
+         * @param x1           x of the top-left corner
+         * @param y1           y of the top-left corner
+         * @param x2           x of the bottom-right corner
+         * @param y2           y of the bottom-right corner
+         * @param color        0x    00|       00|       00|       00
+         *                     reserved|channel 0|channel 1|channel 2 
+         */
+        void draw_filled_rectangle(uint8_t *image, const uint32_t image_height, const uint32_t image_width,
+                                   uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2,
+                                   const uint32_t color = 0x00FF0000);
+
+        /**
+         * @brief Draw a filled rectangle on RGB565 image.
+         * 
+         * @param image        pointer of input image
+         * @param image_height height of input image
+         * @param image_width  width of input image
+         * @param x1           x of the top-left corner
+         * @param y1           y of the top-left corner
+         * @param x2           x of the bottom-right corner
+         * @param y2           y of the bottom-right corner
+         * @param color        0b         000|    00000|    00000|           000
+         *                     channel 1[2:0]|channel 0|channel 2|channel 1[5:3] 
+         */
+        void draw_filled_rectangle(uint16_t *image, const uint32_t image_height, const uint32_t image_width,
+                                   uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2,
+                                   const uint16_t color = 0b0001111100000000);
+
+        /**
+         * @brief Draw a point on RGB888 image.
+         * 
+         * @param image        pointer of input image
+         * @param image_height height of input image
+         * @param image_width  width of input image
+         * @param x            point x
+         * @param y            point y
+         * @param size         size of point
+         * @param color        0x    00|       00|       00|       00
+         *                     reserved|channel 0|channel 1|channel 2 
+         */
+        void draw_point(uint8_t *image, const uint32_t image_height, const uint32_t image_width,
+                        const uint32_t x, const uint32_t y, const uint32_t size,
+                        const uint32_t color = 0x00FF0000);
+
+        /**
+         * @brief Draw a point on RGB565 image.
+         * 
+         * @param image        pointer of input image
+         * @param image_height height of input image
+         * @param image_width  width of input image
+         * @param x            point x
+         * @param y            point y
+         * @param size         size of point
+         * @param color        0b         000|    00000|    00000|           000
+         *                     channel 1[2:0]|channel 0|channel 2|channel 1[5:3] 
+         */
+        void draw_point(uint16_t *image, const uint32_t image_height, const uint32_t image_width,
+                        const uint32_t x, const uint32_t y, const uint32_t size,
+                        uint16_t color = 0b0001111100000000);
+
+        /**
+         * @brief Draw a hollow rectangle on RGB888 image.
+         * 
+         * @param image        pointer of input image
+         * @param image_height height of input image
+         * @param image_width  width of input image
+         * @param x1           x of the top-left corner
+         * @param y1           y of the top-left corner
+         * @param x2           x of the bottom-right corner
+         * @param y2           y of the bottom-right corner
+         * @param color        0x    00|       00|       00|       00
+         *                     reserved|channel 0|channel 1|channel 2 
+         */
+        void draw_hollow_rectangle(uint8_t *image, const uint32_t image_height, const uint32_t image_width,
+                                   uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2,
+                                   uint32_t color = 0x00FF0000);
+
+        /**
+         * @brief Draw a hollow rectangle on RGB565 image.
+         * 
+         * @param image        pointer of input image
+         * @param image_height height of input image
+         * @param image_width  width of input image
+         * @param x1           x of the top-left corner
+         * @param y1           y of the top-left corner
+         * @param x2           x of the bottom-right corner
+         * @param y2           y of the bottom-right corner
+         * @param color        0b         000|    00000|    00000|           000
+         *                     channel 1[2:0]|channel 0|channel 2|channel 1[5:3] 
+         */
+        void draw_hollow_rectangle(uint16_t *image, const uint32_t image_height, const uint32_t image_width,
+                                   uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2,
+                                   const uint16_t color = 0b0001111100000000);
+
+        /**
+         * @brief Detect target moving by activated detection point number. Each cross in the figure below is a detection point.
+         * Once abs(frame_1_detection_point[i] - frame_2_detection_point[i]) > threshold, this detection point is activated.
+         * This function will return the number of activated detection point.
+         * 
+         *         __stride__________________________
+         *         |        |        |        |   |
+         *  stride |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________|   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________| height
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________|   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________|___|___
+         *         |                          |
+         *         |__________width___________|
+         *         |                          |
+         * 
+         * Time consumption:
+         * Frame shape = (240, 240)
+         * Both frame are in PSRAM
+         * On ESP32-S3 with CPU 240MHz, QSPI 80MHz
+         * 
+         * stride  latency
+         *      1  28316us
+         *      2   8770us
+         *      4   3622us
+         *      8   1990us
+         *     16    880us
+         *     32    260us
+         * 
+         * 
+         * In an application, a threshold on the number of activated detection points is needed outside this function.
+         * Once the number of activated detection points > number_threshold, the two frames are judged as "target moved".
+         * How to determine number_threshold?
+         * Let's assume that the minimum shape of the target is (target_min_height, target_min_width).
+         * Then number_threshold = [target_min_height / stride] * [target_min_width / stride] * ratio,
+         * where ratio is in (0, 1): the smaller the ratio, the more sensitive the detector and the more false detections.
+         * 
+         * 
+         * @param f1        one frame in RGB565
+         * @param f2        another frame in RGB565
+         * @param height    height of frame
+         * @param width     width of frame
+         * @param stride    stride of detection point, the smaller the stride is, the more reliable the detector is.
+         * @param threshold activation threshold of each detection point
+         * @return activated detection point number 
+         */
+        uint32_t get_moving_point_number(uint16_t *f1, uint16_t *f2, const uint32_t height, const uint32_t width, const uint32_t stride, const uint32_t threshold = 5);
+
+        /**
+         * @brief Detect target moving by activated detection point number. Each cross in the figure below is a detection point.
+         * Once abs(frame_1_detection_point[i] - frame_2_detection_point[i]) > threshold, this detection point is activated.
+         * This function will return the number of activated detection point.
+         * 
+         *         __stride__________________________
+         *         |        |        |        |   |
+         *  stride |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________|   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________| height
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________|   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |        |        |        |   |
+         *         |________|________|________|___|___
+         *         |                          |
+         *         |__________width___________|
+         *         |                          |
+         * 
+         * 
+         * In an application, a threshold on the number of activated detection points is needed outside this function.
+         * Once the number of activated detection points > number_threshold, the two frames are judged as "target moved".
+         * How to determine number_threshold?
+         * Let's assume that the minimum shape of the target is (target_min_height, target_min_width).
+         * Then number_threshold = [target_min_height / stride] * [target_min_width / stride] * ratio,
+         * where ratio is in (0, 1): the smaller the ratio, the more sensitive the detector and the more false detections.
+         * 
+         * 
+         * @param f1        one frame in RGB888
+         * @param f2        another frame in RGB888
+         * @param height    height of frame
+         * @param width     width of frame
+         * @param stride    stride of detection point, the smaller the stride is, the more reliable the detector is.
+         * @param threshold activation threshold of each detection point
+         * @return activated detection point number 
+         */
+        uint32_t get_moving_point_number(uint8_t *f1, uint8_t *f2, const uint32_t height, const uint32_t width, const uint32_t stride, const uint32_t threshold = 5);
+
+
+        /**
+         * @brief Apply an affine warp from a Tensor image into an output Tensor.
+         * NOTE(review): only the declaration is visible here. Judging by its name, M_inv is presumably the
+         * inverse transform matrix (mapping output coordinates back to input coordinates) — confirm against the implementation.
+         *
+         * @tparam T element type of the tensors
+         * @param input  source image
+         * @param output destination image
+         * @param M_inv  transform matrix, see the note above
+         */
+        template <typename T>
+        void warp_affine(dl::Tensor<T> *input, dl::Tensor<T> *output, dl::math::Matrix<float> *M_inv);
+
+        /**
+         * @brief Apply an affine warp from a raw uint16_t image buffer into an output Tensor.
+         * NOTE(review): only the declaration is visible here. The pixel format of `input` (presumably RGB565),
+         * the layout of `shape`, and the meaning of M_inv (presumably the inverse transform matrix) are not
+         * stated in this header — confirm against the implementation.
+         *
+         * @tparam T element type of the output tensor
+         * @param input  source image buffer
+         * @param shape  shape of the source image
+         * @param output destination image
+         * @param M_inv  transform matrix, see the note above
+         */
+        template <typename T>
+        void warp_affine(uint16_t *input, std::vector<int> shape, dl::Tensor<T> *output, dl::math::Matrix<float> *M_inv);
+
+    } // namespace image
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_add2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_add2d.hpp
@ -0,0 +1,127 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_add2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Activation(Add2D(input0, input1)).
+         * NOTE: addition is element-wise, i.e., output[i,j,k] = input0[i,j,k] + input1[i,j,k]
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class Add2D : public Layer
+        {
+        private:
+            const Activation<feature_t> *activation; /*!< activation of add2d; NULL (the default) means no activation is applied */
+            const int output_exponent;               /*!< exponent of output */
+            Tensor<feature_t> *output;               /*!< output ptr of add2d: allocated by build() when not in place,
+                                                          otherwise the address of input0 passed to build() */
+            bool inplace;                            /*!< true: the output is stored to input0
+                                                          false: the output is stored to a separate memory */
+
+        public:
+            /**
+             * @brief Construct a new Add2D object. No output tensor exists until build() is called.
+             *
+             * @param output_exponent exponent of output
+             * @param activation      activation of add2d; NULL (the default) means no activation is applied
+             * @param name            name of add2d
+             * @param inplace         true: the output is stored to input0
+             *                        false: the output is stored to a separate memory
+             */
+            Add2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = NULL, bool inplace = false)
+                : Layer(name),
+                  activation(activation),
+                  output_exponent(output_exponent),
+                  output(NULL),
+                  inplace(inplace)
+            {
+            }
+
+            /**
+             * @brief Destroy the Add2D object.
+             */
+            ~Add2D()
+            {
+                // In in-place mode `output` merely points at the caller's input0, so it is not deleted here.
+                if (!this->inplace)
+                {
+                    delete this->output; // deleting NULL is a no-op
+                }
+            }
+
+            /**
+             * @brief Update output shape.
+             * NOTE: input0.shape must equal to input1.shape.
+             *
+             * In in-place mode the address of input0 is remembered as the output.
+             * Otherwise the output tensor is created on first use, given the configured exponent
+             * and the shape of input0, and free_element() is called on it.
+             *
+             * @param input0 as one input
+             * @param input1 as another input
+             */
+            void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
+            {
+                assert(input0.is_same_shape(input1));
+
+                if (this->inplace)
+                {
+                    this->output = &input0;
+                    return;
+                }
+
+                if (this->output == NULL)
+                {
+                    this->output = new Tensor<feature_t>;
+                }
+                this->output->set_exponent(this->output_exponent);
+                this->output->set_shape(input0.shape);
+                this->output->free_element();
+            }
+
+            /**
+             * @brief Get the output.
+             * NOTE: dereferences the output pointer, so build() must have been called first.
+             *
+             * @return Tensor<feature_t>& Add2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call Add2D operation.
+             * NOTE: dereferences the output pointer set up by build(), so build() must have been called first.
+             *
+             * @param input0      as one input
+             * @param input1      as another input
+             * @param assign_core not effective yet
+             * @return Tensor<feature_t>& added result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input0, Tensor<feature_t> &input1, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (this->inplace)
+                {
+                    // In-place: the output exponent is handed to nn::add2d instead of being set beforehand.
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::add2d(*this->output, input0, input1, this->activation, assign_core, this->output_exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "add2d");
+
+                    return *this->output;
+                }
+
+                // Separate output: prepare the output elements and exponent first.
+                DL_LOG_LAYER_LATENCY_START();
+                this->output->apply_element();
+                this->output->set_exponent(this->output_exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                DL_LOG_LAYER_LATENCY_START();
+                nn::add2d(*this->output, input0, input1, this->activation, assign_core);
+                DL_LOG_LAYER_LATENCY_END(this->name, "add2d");
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_avg_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_avg_pool2d.hpp
@ -0,0 +1,137 @@
+#pragma once
+
+#include <assert.h>
+#include <vector>
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_avg_pool2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief AvgPool2D(input).
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class AvgPool2D : public Layer
+        {
+        private:
+            const int output_exponent;         /*!< exponent of output */
+            std::vector<int> filter_shape;     /*!< filter shape in [filter_height, filter_width] */
+            const int stride_y;                /*!< stride in height */
+            const int stride_x;                /*!< stride in width */
+            const padding_type_t padding_type; /*!< one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET */
+            std::vector<int> padding;          /*!< padding size needed in [top, bottom, left, right] of this operation; filled in by build() */
+            Tensor<feature_t> *output;         /*!< output ptr of AvgPool2D; allocated in the constructor, released in the destructor */
+
+        public:
+            /**
+             * @brief Construct a new AvgPool2D object and allocate its (still empty) output tensor.
+             *
+             * @param output_exponent exponent of output
+             * @param filter_shape    filter shape in [filter_height, filter_width]
+             * @param padding_type    one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+             *                        - PADDING_VALID means no padding
+             *                        PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
+             *                        such that output has the same height/width dimension as the input,
+             *                        - PADDING_SAME results padding in TensorFlow style
+             *                        - PADDING_SAME_MXNET results padding in MXNET style
+             * @param stride_y        stride in height
+             * @param stride_x        stride in width
+             * @param name            name of layer
+             */
+            AvgPool2D(const int output_exponent,
+                      const std::vector<int> filter_shape,
+                      const padding_type_t padding_type = PADDING_VALID,
+                      const int stride_y = 1,
+                      const int stride_x = 1,
+                      const char *name = NULL)
+                : Layer(name),
+                  output_exponent(output_exponent),
+                  filter_shape(filter_shape),
+                  stride_y(stride_y),
+                  stride_x(stride_x),
+                  padding_type(padding_type),
+                  output(new Tensor<feature_t>)
+            {
+            }
+
+            /**
+             * @brief Destroy the AvgPool2D object and release its output tensor.
+             */
+            ~AvgPool2D()
+            {
+                delete this->output; // deleting NULL is a no-op
+            }
+
+            /**
+             * @brief Update output shape and padding.
+             *
+             * Also stores the computed padding on `input` via set_padding_size()
+             * and calls free_element() on the output.
+             *
+             * @param input as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                assert(input.shape[0] > 0);
+                assert(input.shape[1] > 0);
+
+                std::vector<int> out_shape = nn::get_output_shape(input.shape, this->filter_shape, this->stride_y, this->stride_x, this->padding_type);
+                this->output->set_shape(out_shape);
+                this->output->set_exponent(this->output_exponent);
+
+                this->padding = nn::get_pad_size(out_shape, input.shape, this->filter_shape, this->stride_y, this->stride_x, this->padding_type);
+                input.set_padding_size(this->padding);
+
+                this->output->free_element();
+            }
+
+            /**
+             * @brief Get the output.
+             *
+             * @return Tensor<feature_t>& AvgPool2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call AvgPool2D operation.
+             *
+             * @param input           as an input
+             * @param autoload_enable - non-zero: call dl::tool::cache::autoload_func on the output and input buffers before pooling
+             *                          (described upstream as loading input and output from PSRAM to CACHE automatically)
+             *                        - zero: do not
+             * @return AvgPool2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, uint8_t autoload_enable = 0)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                // Prepare the output elements and exponent.
+                DL_LOG_LAYER_LATENCY_START();
+                this->output->apply_element();
+                this->output->set_exponent(this->output_exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                if (autoload_enable)
+                {
+                    // Buffer addresses are passed as 32-bit integers, sizes in bytes.
+                    dl::tool::cache::autoload_func((uint32_t)(this->output->element), this->output->get_size() * sizeof(feature_t),
+                                                   (uint32_t)(input.element), input.get_size() * sizeof(feature_t));
+                }
+
+                DL_LOG_LAYER_LATENCY_START();
+                nn::avg_pool2d(*this->output, input, this->padding, this->filter_shape, this->stride_y, this->stride_x);
+                DL_LOG_LAYER_LATENCY_END(this->name, "avg_pool2d");
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_base.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_base.hpp
@ -0,0 +1,55 @@
+#pragma once
+#include "dl_tool.hpp"
+#include "dl_tool_cache.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Base class for layer.
+         *
+         * Only declared in this header; the constructor and destructor are defined elsewhere.
+         * NOTE(review): the destructor is not virtual, so deleting a derived layer through a
+         * Layer pointer would not run the derived destructor — confirm no caller does that.
+         */
+        class Layer
+        {
+        public:
+            char *name; /*!< name of layer */
+
+            /**
+             * @brief Construct a new Layer object.
+             *
+             * @param name name of layer; defaults to NULL.
+             *             NOTE(review): whether the string is copied or only referenced is not visible here — confirm in the implementation.
+             */
+            Layer(const char *name = NULL);
+
+            /**
+             * @brief Destroy the Layer object. Return resource.
+             *
+             */
+            ~Layer();
+        };
+    } // namespace layer
+} // namespace dl
+
+#if DL_LOG_LAYER_LATENCY
+/**
+ * @brief Initialize.
+ */
+#define DL_LOG_LAYER_LATENCY_INIT() dl::tool::Latency latency
+
+/**
+ * @brief Time starts.
+ */
+#define DL_LOG_LAYER_LATENCY_START() latency.start()
+
+/**
+ * @brief Time ends and printed.
+ */
+#define DL_LOG_LAYER_LATENCY_END(prefix, key) \
+    latency.end();                            \
+    latency.print(prefix, key)
+#else
+#define DL_LOG_LAYER_LATENCY_INIT()
+#define DL_LOG_LAYER_LATENCY_START()
+#define DL_LOG_LAYER_LATENCY_END(prefix, key)
+#endif
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_concat2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_concat2d.hpp
@ -0,0 +1,179 @@
+#pragma once
+
+#include <assert.h>
+#include <vector>
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_tool.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Concat2D(input1, input2, input3, ...).
+         *
+         * Concatenates tensors along shape[2] (channel). No data is copied: the layer owns
+         * one output tensor and points each input's element into it at that input's offset.
+         *
+         * @tparam feature_t support all kinds of integer and float data type
+         */
+        template <typename feature_t>
+        class Concat2D : public Layer
+        {
+        private:
+            std::vector<Tensor<feature_t> *> output_vec; /*!< pointers of concatenated inputs */
+            std::vector<int> offset;                     /*!< offset of each concatenated input in the entire element (sum of the channels before it) */
+            std::vector<int> channel;                    /*!< channel of each concatenated input */
+            Tensor<feature_t> *output;                   /*!< output ptr of Concat2D, owned by this layer */
+            int output_exponent;                         /*!< exponent of output, taken from the first input in build() */
+
+            /**
+             * @brief Point every input's element into the output's element at its offset and
+             *        call set_auto_free(false) on it (presumably because the memory belongs to
+             *        the output tensor - confirm in Tensor).
+             */
+            void deliver_element()
+            {
+                for (size_t i = 0; i < this->offset.size(); i++)
+                {
+                    this->output_vec[i]->element = this->output->element + this->offset[i];
+                    this->output_vec[i]->set_auto_free(false);
+                }
+            }
+
+        public:
+            /**
+             * @brief Construct a new Concat2D object.
+             *
+             * @param name name of layer
+             */
+            Concat2D(const char *name = NULL) : Layer(name),
+                                                output(new Tensor<feature_t>),
+                                                output_exponent(0) // defined value in case apply_element() runs before build()
+            {
+            }
+
+            /**
+             * @brief Destroy the Concat2D object and its output tensor.
+             */
+            ~Concat2D()
+            {
+                delete this->output; // deleting a null pointer is a no-op
+            }
+
+            /**
+             * @brief Collect inputs' channel and memory offset, called in Model.build().
+             *        Sets the output shape (channels summed) and exponent, then calls
+             *        free_element() on the output.
+             *
+             * @param args pointers of concatenated Tensor; must not be empty, and all
+             *             tensors must share shape[0] (height) and shape[1] (width)
+             */
+            void build(std::vector<Tensor<feature_t> *> args)
+            {
+                assert(args.size() > 0);
+
+                this->output_vec = args;
+
+                this->offset = std::vector<int>(args.size());
+                this->channel = std::vector<int>(args.size());
+
+                this->output_exponent = args[0]->exponent;
+                this->offset[0] = 0;
+                this->channel[0] = args[0]->shape[2];
+                std::vector<int> output_shape = args[0]->shape;
+
+                for (size_t i = 1; i < args.size(); i++)
+                {
+                    assert(output_shape[0] == args[i]->shape[0]); // height
+                    assert(output_shape[1] == args[i]->shape[1]); // width
+                    // NOTE: the exponents of the inputs are not checked against each other;
+                    // the output simply takes the exponent of args[0].
+
+                    this->offset[i] = output_shape[2]; // channels accumulated so far
+                    this->channel[i] = args[i]->shape[2];
+                    output_shape[2] += args[i]->shape[2];
+                }
+                this->output->set_shape(output_shape);
+                this->output->set_exponent(this->output_exponent);
+                this->output->free_element();
+            }
+
+            /**
+             * @brief Get the output
+             *
+             * @return Tensor<feature_t>&  Concat2d result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Get the maximum padding among the inputs and the output, then set it on the output and on every input.
+             *        Called at the end of Model.build().
+             * NOTE: Some special situations like C = Concat2D_1(A, B), E = Concat2D_2(C, D), where A, B, C, D, E are Tensor.
+             *         To avoid memory copy, one entire element is applied for E and taken apart for A, B, D.
+             *         A, B, C, D and E become other layers' inputs, which results in different padding sizes.
+             *         To get the maximum padding, call at the end of Model.build(),
+             *              Concat2D_1.backward();  // max_padding_temp = get_max_padding(A, B, C), padding of A, B and C are set to max_padding_temp.
+             *              Concat2D_2.backward();  // max_padding = get_max_padding(max_padding_temp, get_max_padding(D, E)) , padding of C, D and E are set to max_padding.
+             *                                         However, padding of A and B is still max_padding_temp.
+             *              Concat2D_1.backward();  // padding of A and B are set to max_padding.
+             *         Or,
+             *              Concat2D_2.backward();
+             *              Concat2D_1.backward();
+             *              Concat2D_2.backward();
+             */
+            void backward()
+            {
+                std::vector<int> max_padding = this->output->padding;
+                int max_channel_with_padding = this->output->shape_with_padding[2];
+                for (size_t i = 0; i < this->output_vec.size(); i++)
+                {
+                    for (size_t j = 0; j < max_padding.size(); j++)
+                    {
+                        max_padding[j] = DL_MAX(max_padding[j], this->output_vec[i]->padding[j]);
+                    }
+                    max_channel_with_padding = DL_MAX(max_channel_with_padding, this->output_vec[i]->shape_with_padding[2]);
+                }
+
+                this->output->set_padding_size(max_padding);
+                this->output->shape_with_padding[2] = max_channel_with_padding;
+                for (size_t i = 0; i < this->output_vec.size(); i++)
+                {
+                    this->output_vec[i]->set_padding_size(max_padding);
+                    this->output_vec[i]->shape_with_padding[2] = max_channel_with_padding;
+#if CONFIG_DEBUG_MODE
+                    assert(this->output->shape_with_padding[0] == this->output_vec[i]->shape_with_padding[0]);
+                    assert(this->output->shape_with_padding[1] == this->output_vec[i]->shape_with_padding[1]);
+                    assert(this->output->shape_with_padding[2] == this->output_vec[i]->shape_with_padding[2]);
+#endif
+                }
+            }
+
+            /**
+             * @brief Calloc an entire element for the concatenate result. Take the entire element apart and deliver element pointers to the concatenated inputs.
+             * NOTE: For example, C = Concat2D(A, B). We apply an entire element for C and deliver two element pointers to A and B.
+             *       Let's assume that A result is produced first. We should call Concat2D.calloc_element() just before A result is produced
+             *       to make sure the element of A is ready and could be filled.
+             */
+            void calloc_element()
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                DL_LOG_LAYER_LATENCY_START();
+                this->output->calloc_element();
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                DL_LOG_LAYER_LATENCY_START();
+                this->deliver_element();
+                DL_LOG_LAYER_LATENCY_END(this->name, "deliver");
+            }
+
+            /**
+             * @brief Same as calloc_element(), except that the output is prepared with
+             *        apply_element() instead of calloc_element() and the output exponent is set again.
+             *        Element pointers are then delivered to the concatenated inputs.
+             */
+            void apply_element()
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                DL_LOG_LAYER_LATENCY_START();
+                this->output->apply_element();
+                this->output->set_exponent(this->output_exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                DL_LOG_LAYER_LATENCY_START();
+                this->deliver_element();
+                DL_LOG_LAYER_LATENCY_END(this->name, "deliver");
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_conv2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_conv2d.hpp
@ -0,0 +1,157 @@
+#pragma once
+
+#include "dl_nn_conv2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Activation(Conv2D(input, filter) + bias).
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class Conv2D : public Layer
+        {
+        private:
+            const int output_exponent;               /*!< exponent of output */
+            const Filter<feature_t> *filter;         /*!< filter of Conv2D */
+            const int stride_y;                      /*!< stride in height */
+            const int stride_x;                      /*!< stride in width */
+            const padding_type_t padding_type;       /*!< one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET */
+            const Bias<feature_t> *bias;             /*!< bias of Conv2D, NULL means no bias is added */
+            const Activation<feature_t> *activation; /*!< activation of Conv2D, NULL means no activation is applied */
+            std::vector<int> padding;                /*!< padding size needed in [top, bottom, left, right] of this operation */
+            Tensor<feature_t> *output;               /*!< output ptr of Conv2D, owned by this layer */
+
+        public:
+            /**
+             * @brief Construct a new Conv2D object and allocate its (still empty) output tensor.
+             *
+             * @param output_exponent exponent of output
+             * @param filter          filter of Conv2D
+             * @param bias            bias of Conv2D, no bias is added when left as NULL
+             * @param activation      activation of Conv2D, no activation is applied when left as NULL
+             * @param padding_type    one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+             *                        - PADDING_VALID means no padding
+             *                        PADDING_SAME and PADDING_SAME_MXNET pad with zeros evenly to the left/right or up/down of the input
+             *                        such that the output has the same height/width dimension as the input,
+             *                        - PADDING_SAME pads in TensorFlow style
+             *                        - PADDING_SAME_MXNET pads in MXNET style
+             * @param stride_y        stride in height
+             * @param stride_x        stride in width
+             * @param name            name of layer
+             */
+            Conv2D(const int output_exponent,
+                   const Filter<feature_t> *filter,
+                   const Bias<feature_t> *bias = NULL,
+                   const Activation<feature_t> *activation = NULL,
+                   const padding_type_t padding_type = PADDING_VALID,
+                   const int stride_y = 1,
+                   const int stride_x = 1,
+                   const char *name = NULL)
+                : Layer(name),
+                  output_exponent(output_exponent),
+                  filter(filter),
+                  stride_y(stride_y),
+                  stride_x(stride_x),
+                  padding_type(padding_type),
+                  bias(bias),
+                  activation(activation),
+                  output(new Tensor<feature_t>)
+            {
+            }
+
+            /**
+             * @brief Destroy the Conv2D object together with its output tensor.
+             */
+            ~Conv2D()
+            {
+                delete this->output; // deleting a null pointer is a no-op
+            }
+
+            /**
+             * @brief Set the output's shape and exponent, call free_element() on it, then
+             *        compute the padding this operation needs and set it on the input.
+             *
+             * @param input as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                assert(input.shape[0] > 0);
+                assert(input.shape[1] > 0);
+
+                Tensor<feature_t> &result = *this->output;
+
+                std::vector<int> result_shape = nn::get_output_shape(input.shape,
+                                                                     this->filter->shape_with_dilation,
+                                                                     this->stride_y,
+                                                                     this->stride_x,
+                                                                     this->padding_type,
+                                                                     true);
+                result.set_shape(result_shape);
+                result.set_exponent(this->output_exponent);
+                result.free_element();
+
+                this->padding = nn::get_pad_size(result_shape,
+                                                 input.shape,
+                                                 this->filter->shape_with_dilation,
+                                                 this->stride_y,
+                                                 this->stride_x,
+                                                 this->padding_type);
+                input.set_padding_size(this->padding);
+            }
+
+            /**
+             * @brief Get the output
+             *
+             * @return Tensor<feature_t>& Conv2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call Conv2D operation
+             *
+             * @param input           as an input.
+             * @param autoload_enable one of true or false,
+             *                        - true: load input and output from PSRAM to CACHE automatically
+             *                        - false: do not
+             * @param assign_core     not effective yet
+             * @return Conv2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, bool autoload_enable = false, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+                Tensor<feature_t> &result = *this->output;
+
+                // Step 1: prepare the output tensor.
+                DL_LOG_LAYER_LATENCY_START();
+                result.apply_element();
+                result.set_exponent(this->output_exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                // Step 2 (optional): hand output and input address ranges to the cache autoloader.
+                if (autoload_enable)
+                {
+                    dl::tool::cache::autoload_func((uint32_t)(result.element), result.get_size() * sizeof(feature_t),
+                                                   (uint32_t)(input.element), input.get_size() * sizeof(feature_t));
+                }
+
+                // Step 3: run the convolution.
+                DL_LOG_LAYER_LATENCY_START();
+                nn::conv2d(result, input, this->padding, *(this->filter), this->stride_y, this->stride_x, this->bias, this->activation, assign_core);
+                DL_LOG_LAYER_LATENCY_END(this->name, "conv2d");
+                return result;
+            }
+
+            /**
+             * @brief Preload the filter to Cache.
+             * NOTE: Call this layer's preload() before previous layer's call() such that filter could be loaded while previous layer is doing calculation.
+             */
+            void preload()
+            {
+                // Size in bytes = sizeof(feature_t) multiplied by every filter dimension.
+                const int rank = this->filter->shape.size();
+                size_t byte_count = sizeof(feature_t);
+                for (int axis = 0; axis < rank; ++axis)
+                {
+                    byte_count *= this->filter->shape[axis];
+                }
+                dl::tool::cache::preload_func((uint32_t)(this->filter->element), byte_count);
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_depthwise_conv2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_depthwise_conv2d.hpp
@ -0,0 +1,158 @@
+#pragma once
+
+#include "dl_nn_depthwise_conv2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Activation(DepthwiseConv2D(filter, input) + bias).
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class DepthwiseConv2D : public Layer
+        {
+        private:
+            const int output_exponent;               /*!< exponent of output */
+            const Filter<feature_t> *filter;         /*!< filter of DepthwiseConv2D */
+            const int stride_y;                      /*!< stride in height */
+            const int stride_x;                      /*!< stride in width */
+            const padding_type_t padding_type;       /*!< one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET */
+            const Bias<feature_t> *bias;             /*!< bias of DepthwiseConv2D, NULL means no bias is added */
+            const Activation<feature_t> *activation; /*!< activation of DepthwiseConv2D, NULL means no activation is applied */
+            std::vector<int> padding;                /*!< padding size needed in [top, bottom, left, right] of this operation */
+            Tensor<feature_t> *output;               /*!< output ptr of DepthwiseConv2D, owned by this layer */
+
+        public:
+            /**
+             * @brief Construct a new DepthwiseConv2D object and allocate its (still empty) output tensor.
+             *
+             * @param output_exponent exponent of output
+             * @param filter          filter of DepthwiseConv2D
+             * @param bias            bias of DepthwiseConv2D, no bias is added when left as NULL
+             * @param activation      activation of DepthwiseConv2D, no activation is applied when left as NULL
+             * @param padding_type    one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+             *                        - PADDING_VALID means no padding
+             *                        PADDING_SAME and PADDING_SAME_MXNET pad with zeros evenly to the left/right or up/down of the input
+             *                        such that the output has the same height/width dimension as the input
+             *                        - PADDING_SAME pads in TensorFlow style
+             *                        - PADDING_SAME_MXNET pads in MXNET style
+             * @param stride_y        stride in height
+             * @param stride_x        stride in width
+             * @param name            name of layer
+             */
+            DepthwiseConv2D(const int output_exponent,
+                            const Filter<feature_t> *filter,
+                            const Bias<feature_t> *bias = NULL,
+                            const Activation<feature_t> *activation = NULL,
+                            const padding_type_t padding_type = PADDING_VALID,
+                            const int stride_y = 1,
+                            const int stride_x = 1,
+                            const char *name = NULL)
+                : Layer(name),
+                  output_exponent(output_exponent),
+                  filter(filter),
+                  stride_y(stride_y),
+                  stride_x(stride_x),
+                  padding_type(padding_type),
+                  bias(bias),
+                  activation(activation),
+                  output(new Tensor<feature_t>)
+            {
+            }
+
+            /**
+             * @brief Destroy the DepthwiseConv2D object together with its output tensor.
+             */
+            ~DepthwiseConv2D()
+            {
+                delete this->output; // deleting a null pointer is a no-op
+            }
+
+            /**
+             * @brief Set the output's shape and exponent, compute the padding this operation
+             *        needs and set it on the input, then call free_element() on the output.
+             *
+             * @param input as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                assert(input.shape[0] > 0);
+                assert(input.shape[1] > 0);
+
+                Tensor<feature_t> &result = *this->output;
+
+                std::vector<int> result_shape = nn::get_output_shape(input.shape,
+                                                                     this->filter->shape_with_dilation,
+                                                                     this->stride_y,
+                                                                     this->stride_x,
+                                                                     this->padding_type);
+                result.set_shape(result_shape);
+                result.set_exponent(this->output_exponent);
+
+                this->padding = nn::get_pad_size(result_shape,
+                                                 input.shape,
+                                                 this->filter->shape_with_dilation,
+                                                 this->stride_y,
+                                                 this->stride_x,
+                                                 this->padding_type);
+                input.set_padding_size(this->padding);
+                result.free_element();
+            }
+
+            /**
+             * @brief Get the output
+             *
+             * @return Tensor<feature_t>& DepthwiseConv2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call DepthwiseConv2D operation.
+             *
+             * @param input           as an input
+             * @param autoload_enable one of true or false,
+             *                        - true: load input and output from PSRAM to CACHE automatically
+             *                        - false: do not
+             * @param assign_core     not effective yet
+             * @return DepthwiseConv2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, bool autoload_enable = false, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+                Tensor<feature_t> &result = *this->output;
+
+                // Step 1: prepare the output tensor.
+                DL_LOG_LAYER_LATENCY_START();
+                result.apply_element();
+                result.set_exponent(this->output_exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                // Step 2 (optional): hand output and input address ranges to the cache autoloader.
+                if (autoload_enable)
+                {
+                    dl::tool::cache::autoload_func((uint32_t)(result.element), result.get_size() * sizeof(feature_t),
+                                                   (uint32_t)(input.element), input.get_size() * sizeof(feature_t));
+                }
+
+                // Step 3: run the depthwise convolution.
+                DL_LOG_LAYER_LATENCY_START();
+                nn::depthwise_conv2d(result, input, this->padding, *(this->filter), this->stride_y, this->stride_x, this->bias, this->activation, assign_core);
+                DL_LOG_LAYER_LATENCY_END(this->name, "depthwise_conv2d");
+
+                return result;
+            }
+
+            /**
+             * @brief Preload the filter to Cache.
+             * NOTE: Call this layer's preload() before previous layer's call() such that filter could be loaded while previous layer is calculating.
+             */
+            void preload()
+            {
+                // Size in bytes = sizeof(feature_t) multiplied by every filter dimension.
+                const int rank = this->filter->shape.size();
+                size_t byte_count = sizeof(feature_t);
+                for (int axis = 0; axis < rank; ++axis)
+                {
+                    byte_count *= this->filter->shape[axis];
+                }
+                dl::tool::cache::preload_func((uint32_t)(this->filter->element), byte_count);
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_global_avg_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_global_avg_pool2d.hpp
@ -0,0 +1,111 @@
+#pragma once
+
+#include <assert.h>
+#include <vector>
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_global_avg_pool2d.hpp"
+#include "dl_layer_base.hpp" // Layer and the DL_LOG_LAYER_LATENCY_* macros used below
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief GlobalAveragePool2D(input).
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class GlobalAveragePool2D : public Layer
+        {
+        private:
+            const int output_exponent; /*!< exponent of output */
+            Tensor<feature_t> *output; /*!< output ptr of GlobalAveragePool2D, owned by this layer */
+
+        public:
+            /**
+             * @brief Construct a new GlobalAveragePool2D object.
+             *
+             * @param output_exponent exponent of output
+             * @param name            name of layer
+             */
+            GlobalAveragePool2D(const int output_exponent, const char *name = NULL) : Layer(name),
+                                                                                      output_exponent(output_exponent)
+            {
+                this->output = new Tensor<feature_t>;
+            }
+
+            /**
+             * @brief Destroy the GlobalAveragePool2D object and its output tensor.
+             */
+            ~GlobalAveragePool2D()
+            {
+                delete this->output; // deleting a null pointer is a no-op
+            }
+
+            /**
+             * @brief Update output shape and exponent.
+             *        The output keeps the rank of the input with every dimension set to 1,
+             *        except shape[2], which is copied from the input.
+             *
+             * @param input as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                assert(input.shape[0] > 0);
+                assert(input.shape[1] > 0);
+
+                std::vector<int> output_shape(input.shape.size(), 1);
+                output_shape[2] = input.shape[2];
+                this->output->set_shape(output_shape);
+                this->output->set_exponent(this->output_exponent);
+                this->output->free_element();
+            }
+
+            /**
+             * @brief Get the output
+             *
+             * @return Tensor<feature_t>& GlobalAveragePool2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call GlobalAveragePool2D operation
+             *
+             * @param input           as an input
+             * @param autoload_enable zero or non-zero,
+             *                        - non-zero: load input and output from PSRAM to CACHE automatically
+             *                        - zero: do not
+             * @return GlobalAveragePool2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, uint8_t autoload_enable = 0)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                DL_LOG_LAYER_LATENCY_START();
+                this->output->apply_element();
+                this->output->set_exponent(this->output_exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                if (autoload_enable)
+                {
+                    dl::tool::cache::autoload_func((uint32_t)(this->output->element), this->output->get_size() * sizeof(feature_t),
+                                                   (uint32_t)(input.element), input.get_size() * sizeof(feature_t));
+                }
+
+                DL_LOG_LAYER_LATENCY_START();
+                nn::global_avg_pool2d(*this->output, input);
+                DL_LOG_LAYER_LATENCY_END(this->name, "global_avg_pool2d");
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_global_max_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_global_max_pool2d.hpp
@ -0,0 +1,108 @@
+#pragma once
+
+#include <assert.h>
+#include <vector>
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_global_max_pool2d.hpp"
+#include "dl_layer_base.hpp" // Layer and the DL_LOG_LAYER_LATENCY_* macros used below
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief GlobalMaxPool2D(input).
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class GlobalMaxPool2D : public Layer
+        {
+        private:
+            Tensor<feature_t> *output; /*!< output ptr of GlobalMaxPool2D, owned by this layer */
+
+        public:
+            /**
+             * @brief Construct a new GlobalMaxPool2D object.
+             *
+             * @param name         name of layer
+             */
+            GlobalMaxPool2D(const char *name = NULL) : Layer(name)
+            {
+                this->output = new Tensor<feature_t>;
+            }
+
+            /**
+             * @brief Destroy the GlobalMaxPool2D object and its output tensor.
+             */
+            ~GlobalMaxPool2D()
+            {
+                delete this->output; // deleting a null pointer is a no-op
+            }
+
+            /**
+             * @brief Update output shape and exponent.
+             *        The exponent is copied from the input. The output keeps the rank of the
+             *        input with every dimension set to 1, except shape[2], which is copied
+             *        from the input.
+             *
+             * @param input as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                assert(input.shape[0] > 0);
+                assert(input.shape[1] > 0);
+                this->output->set_exponent(input.exponent);
+
+                std::vector<int> output_shape(input.shape.size(), 1);
+                output_shape[2] = input.shape[2];
+                this->output->set_shape(output_shape);
+                this->output->free_element();
+            }
+
+            /**
+             * @brief Get the output
+             *
+             * @return Tensor<feature_t>& GlobalMaxPool2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call GlobalMaxPool2D operation
+             *
+             * @param input           as an input
+             * @param autoload_enable zero or non-zero,
+             *                        - non-zero: load input and output from PSRAM to CACHE automatically
+             *                        - zero: do not
+             * @return GlobalMaxPool2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, uint8_t autoload_enable = 0)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                DL_LOG_LAYER_LATENCY_START();
+                this->output->apply_element();
+                this->output->set_exponent(input.exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                if (autoload_enable)
+                {
+                    dl::tool::cache::autoload_func((uint32_t)(this->output->element), this->output->get_size() * sizeof(feature_t),
+                                                   (uint32_t)(input.element), input.get_size() * sizeof(feature_t));
+                }
+
+                DL_LOG_LAYER_LATENCY_START();
+                nn::global_max_pool2d(*this->output, input);
+                DL_LOG_LAYER_LATENCY_END(this->name, "global_max_pool2d");
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_leakyrelu.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_leakyrelu.hpp
@ -0,0 +1,125 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_LeakyReLU.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief LeakyReLU(input).
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class LeakyReLU : public Layer
+        {
+        private:
+            feature_t activation_alpha; /*<! quantized alpha >*/
+            int activation_exponent;    /*<! exponent of quantized alpha >*/
+            Tensor<feature_t> *output;  /*<! output ptr of leakyrelu >*/
+            bool inplace;               /*<! true: the output is stored in the input tensor
+                                             false: the output is stored in separate memory >*/
+        public:
+
+            /**
+             * @brief Construct a new LeakyReLU object
+             * 
+             * @param activation_alpha     quantized alpha
+             * @param activation_exponent  exponent of quantized alpha
+             * @param name                 name of leakyrelu
+             * @param inplace              true: the output is stored in the input tensor
+             *                             false: the output is stored in separate memory
+             */
+            LeakyReLU(const int activation_alpha, const int activation_exponent, const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
+            {
+                this->activation_alpha = activation_alpha;
+                this->activation_exponent = activation_exponent;
+                this->inplace = inplace;
+            }
+
+            /**
+             * @brief Destroy the LeakyReLU object
+             * 
+             * The output tensor is deleted only when inplace is false, i.e. when build() allocated it.
+             */
+            ~LeakyReLU() 
+            {
+                if ((!this->inplace) && (this->output != NULL))
+                {
+                    delete this->output;
+                }
+            }
+
+            /**
+             * @brief Update output shape and exponent
+             * 
+             * In inplace mode the output pointer is simply set to the address of input.
+             * Otherwise the output tensor is allocated on the first call and reused afterwards.
+             * 
+             * @param input       as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                if (!this->inplace)
+                {
+                    // Allocate only while no output tensor exists yet. The previous "!= NULL" test
+                    // skipped the allocation on the first build and dereferenced NULL right below.
+                    if (this->output == NULL)
+                    {
+                        this->output = new Tensor<feature_t>;
+                    }
+                    this->output->set_shape(input.shape);
+                    this->output->set_exponent(input.exponent);
+                    this->output->free_element();
+                }
+                else
+                {
+                    this->output = &input;
+                }
+            }
+
+            /**
+             * @brief Get the output 
+             * 
+             * @return Tensor<feature_t>& LeakyReLU result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call LeakyReLU operation.
+             * 
+             * build() must have been called before, because the output pointer is dereferenced here.
+             * 
+             * @param input       as an input
+             * @param assign_core not effective yet
+             * @return LeakyReLU result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (!this->inplace)
+                {
+                    // Separate output: apply_element() on it and copy the input exponent first.
+                    DL_LOG_LAYER_LATENCY_START();
+                    this->output->apply_element();
+                    this->output->set_exponent(input.exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::leakyrelu(*this->output, input, this->activation_alpha, this->activation_exponent, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "leakyrelu");
+                }
+                else
+                {
+                    // Inplace: output was pointed at the input by build(), so no apply step is done.
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::leakyrelu<true>(*this->output, input, this->activation_alpha, this->activation_exponent, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "leakyrelu");
+                }
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_max2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_max2d.hpp
@ -0,0 +1,125 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_tool.hpp"
+#include "dl_nn_max2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Max2D(input0, input1).
+         * NOTE: maximum is element-wise, i.e., output[i,j,k] = max(input0[i,j,k], input1[i,j,k])
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class Max2D : public Layer
+        {
+        private:
+            Tensor<feature_t> *output;  /*<! output ptr of max2d >*/
+            bool inplace;               /*<! true: the output is stored in input0
+                                             false: the output is stored in separate memory >*/
+        public:
+            
+            /**
+             * @brief Construct a new Max2D object.
+             * 
+             * @param name            name of max2d
+             * @param inplace         true: the output is stored in input0
+             *                        false: the output is stored in separate memory
+             */
+            Max2D(const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
+            {
+                this->inplace = inplace;
+            }
+
+            /**
+             * @brief Destroy the Max2D object
+             * 
+             * The output tensor is deleted only when inplace is false, i.e. when build() allocated it.
+             */
+            ~Max2D() 
+            {
+                if ((!this->inplace) && (this->output != NULL))
+                {
+                    delete this->output;
+                }
+            }
+
+            /**
+             * @brief Update output shape and exponent
+             * NOTE: input0.shape must equal to input1.shape.
+             *       input0.exponent must equal to input1.exponent.
+             * 
+             * @param input0 as one input
+             * @param input1 as another input
+             */
+            void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
+            {
+                assert(input0.is_same_shape(input1));
+                assert(input0.exponent == input1.exponent);
+
+                if (!this->inplace)
+                {
+                    // Allocate only while no output tensor exists yet. The previous "!= NULL" test
+                    // skipped the allocation on the first build and dereferenced NULL right below.
+                    if (this->output == NULL)
+                    {
+                        this->output = new Tensor<feature_t>;
+                    }
+                    // Max2D declares no output_exponent member; the output uses the (asserted equal)
+                    // exponent of the inputs, the same value call() sets.
+                    this->output->set_exponent(input0.exponent);
+                    this->output->set_shape(input0.shape);
+                    this->output->free_element();
+                }
+                else
+                {
+                    this->output = &input0;
+                }
+            }
+
+            /**
+             * @brief Get the output
+             * 
+             * @return Tensor<feature_t>& Max2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call Max2D operation.
+             * 
+             * build() must have been called before, because the output pointer is dereferenced here.
+             * 
+             * @param input0      as one input
+             * @param input1      as another input
+             * @param assign_core not effective yet
+             * @return Max2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input0, Tensor<feature_t> &input1, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (!this->inplace)
+                {
+                    // Separate output: apply_element() on it and copy the input exponent first.
+                    DL_LOG_LAYER_LATENCY_START();
+                    this->output->apply_element();
+                    this->output->set_exponent(input0.exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::max2d(*this->output, input0, input1, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "max2d");
+                }
+                else
+                {
+                    // Inplace: output was pointed at input0 by build(), so no apply step is done.
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::max2d<true>(*this->output, input0, input1, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "max2d");
+                }
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_max_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_max_pool2d.hpp
@ -0,0 +1,132 @@
+#pragma once
+
+#include <vector>
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_max_pool2d.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief MaxPool2D(input).
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class MaxPool2D : public Layer
+        {
+        private:
+            std::vector<int> filter_shape;     /*<! filter shape in [filter_height, filter_width] >*/
+            const int stride_y;                /*<! stride in height >*/
+            const int stride_x;                /*<! stride in width >*/
+            const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET >*/
+            std::vector<int> padding;          /*<! padding size needed in [top, bottom, left, right] of this operation >*/
+            Tensor<feature_t> *output;         /*<! output ptr of MaxPool2D >*/
+
+        public:
+
+            /**
+             * @brief Construct a new MaxPool2D object. The output tensor is allocated here.
+             * 
+             * @param filter_shape filter shape in [filter_height, filter_width]
+             * @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+             *                     - PADDING_VALID means no padding
+             *                     PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input 
+             *                     such that output has the same height/width dimension as the input,
+             *                     - PADDING_SAME results padding in TensorFlow style
+             *                     - PADDING_SAME_MXNET results padding in MXNET style
+             * @param stride_y     stride in height
+             * @param stride_x     stride in width
+             * @param name         name of layer
+             */
+            MaxPool2D(const std::vector<int> filter_shape,
+                      const padding_type_t padding_type = PADDING_VALID,
+                      const int stride_y = 1,
+                      const int stride_x = 1,
+                      const char *name = NULL)
+                : Layer(name),
+                  filter_shape(filter_shape),
+                  stride_y(stride_y),
+                  stride_x(stride_x),
+                  padding_type(padding_type),
+                  output(new Tensor<feature_t>)
+            {
+            }
+
+            /**
+             * @brief Destroy the MaxPool2D object and the output tensor it allocated.
+             * 
+             */
+            ~MaxPool2D() 
+            {
+                // delete on a NULL pointer is a no-op, so no explicit guard is needed.
+                delete this->output;
+            }
+
+            /**
+             * @brief Update output shape and padding.
+             * 
+             * Also writes the computed padding into the input tensor via set_padding_size().
+             * 
+             * @param input as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                assert(input.shape[0] > 0);
+                assert(input.shape[1] > 0);
+
+                this->output->set_exponent(input.exponent);
+
+                std::vector<int> pooled_shape = nn::get_output_shape(input.shape,
+                                                                     this->filter_shape,
+                                                                     this->stride_y,
+                                                                     this->stride_x,
+                                                                     this->padding_type);
+                this->output->set_shape(pooled_shape);
+
+                this->padding = nn::get_pad_size(pooled_shape,
+                                                 input.shape,
+                                                 this->filter_shape,
+                                                 this->stride_y,
+                                                 this->stride_x,
+                                                 this->padding_type);
+                input.set_padding_size(this->padding);
+
+                this->output->free_element();
+            }
+
+            /**
+             * @brief Access the result tensor of this layer.
+             * 
+             * @return Tensor<feature_t>& MaxPool2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *(this->output);
+            }
+
+            /**
+             * @brief Run the MaxPool2D operation on input.
+             * 
+             * @param input           tensor to pool
+             * @param autoload_enable non-zero: dl::tool::cache::autoload_func() is called on the output and input
+             *                        element buffers before pooling (original note: load input and output
+             *                        from PSRAM to CACHE automatically)
+             *                        zero: that step is skipped
+             * @return reference to the output tensor holding the MaxPool2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, uint8_t autoload_enable = 0)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                // Step 1: apply_element() on the output, then take over the exponent of the input.
+                DL_LOG_LAYER_LATENCY_START();
+                this->output->apply_element();
+                this->output->set_exponent(input.exponent);
+                DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                // Step 2 (optional): hand both element buffers and their byte sizes to the cache autoloader.
+                if (autoload_enable != 0)
+                {
+                    dl::tool::cache::autoload_func((uint32_t)(this->output->element),
+                                                   this->output->get_size() * sizeof(feature_t),
+                                                   (uint32_t)(input.element),
+                                                   input.get_size() * sizeof(feature_t));
+                }
+
+                // Step 3: the pooling itself, using the padding computed by build().
+                Tensor<feature_t> &result = *this->output;
+                DL_LOG_LAYER_LATENCY_START();
+                nn::max_pool2d(result, input, this->padding, this->filter_shape, this->stride_y, this->stride_x);
+                DL_LOG_LAYER_LATENCY_END(this->name, "max_pool2d");
+
+                return result;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_min2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_min2d.hpp
@ -0,0 +1,126 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_tool.hpp"
+#include "dl_nn_min2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Min2D(input0, input1).
+         * NOTE: minimum is element-wise, i.e., output[i,j,k] = min(input0[i,j,k], input1[i,j,k])
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class Min2D : public Layer
+        {
+        private:
+            Tensor<feature_t> *output;  /*<! output ptr of min2d >*/
+            bool inplace;               /*<! true: the output is stored in input0
+                                             false: the output is stored in separate memory >*/
+        public:
+
+            /**
+             * @brief Construct a new Min2D object
+             * 
+             * @param name            name of min2d
+             * @param inplace         true: the output is stored in input0
+             *                        false: the output is stored in separate memory
+             */
+            Min2D(const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
+            {
+                this->inplace = inplace;
+            }
+
+            /**
+             * @brief Destroy the Min2D object
+             * 
+             * The output tensor is deleted only when inplace is false, i.e. when build() allocated it.
+             */
+            ~Min2D() 
+            {
+                if ((!this->inplace) && (this->output != NULL))
+                {
+                    delete this->output;
+                }
+            }
+
+            /**
+             * @brief Update output shape and exponent
+             * NOTE: input0.shape must equal to input1.shape.
+             *       input0.exponent must equal to input1.exponent.
+             * 
+             * @param input0 as one input
+             * @param input1 as another input
+             */
+            void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
+            {
+                assert(input0.is_same_shape(input1));
+                assert(input0.exponent == input1.exponent);
+
+                if (!this->inplace)
+                {
+                    // Allocate only while no output tensor exists yet. The previous "!= NULL" test
+                    // skipped the allocation on the first build and dereferenced NULL right below.
+                    if (this->output == NULL)
+                    {
+                        this->output = new Tensor<feature_t>;
+                    }
+                    this->output->set_shape(input0.shape);
+                    this->output->set_exponent(input0.exponent);
+                    this->output->free_element();
+                }
+                else
+                {
+                    this->output = &input0;
+                }
+            }
+
+            /**
+             * @brief Get the output
+             * 
+             * @return Tensor<feature_t>& Min2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call Min2D operation
+             * 
+             * build() must have been called before, because the output pointer is dereferenced here.
+             * 
+             * @param input0      as one input
+             * @param input1      as another input
+             * @param assign_core not effective yet
+             * @return Min2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input0, Tensor<feature_t> &input1, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (!this->inplace)
+                {
+                    // Separate output: apply_element() on it and copy the input exponent first.
+                    DL_LOG_LAYER_LATENCY_START();
+                    this->output->apply_element();
+                    this->output->set_exponent(input0.exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::min2d(*this->output, input0, input1, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "min2d");
+                }
+                else
+                {
+                    // Inplace: output was pointed at input0 by build(), so no apply step is done.
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::min2d<true>(*this->output, input0, input1, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "min2d");
+                }
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_model.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_model.hpp
@ -0,0 +1,52 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Neural Network Model.
+         * 
+         * Abstract interface: build() and call() are pure virtual and have to be implemented
+         * by the concrete model; forward() is only declared in this header.
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class Model
+        {
+        private:
+            std::vector<int> input_shape; /*<! input shape in [height, width, channel] >*/
+
+        public:
+            /**
+             * @brief Destroy the Model object.
+             * 
+             * Virtual so that a derived model can be destroyed through a Model pointer.
+             */
+            virtual ~Model() {}
+
+            /**
+             * @brief Build a model including update output shape and input padding of each layer.
+             * 
+             * @param input as an input
+             */
+            virtual void build(Tensor<feature_t> &input) = 0;
+
+            /**
+             * @brief Call the model layer by layer.
+             * 
+             * @param input as an input.
+             */
+            virtual void call(Tensor<feature_t> &input) = 0;
+
+            /**
+             * @brief If input.shape changes, call Model.build(), otherwise, do not. Then call Model.call().
+             * 
+             * NOTE(review): the definition is not part of this header; the behaviour described above
+             * is the documented intent and should be confirmed against the implementation.
+             * 
+             * @param input as an input
+             */
+            void forward(Tensor<feature_t> &input);
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_mul2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_mul2d.hpp
@ -0,0 +1,128 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_mul2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Activation(Multiply2D(input0, input1)).
+         * NOTE: multiplication is element-wise, i.e., output[i,j,k] = input0[i,j,k] * input1[i,j,k]
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class Mul2D : public Layer
+        {
+        private:
+            // output_exponent is declared exactly once; a second declaration of the same member
+            // in the public section is a redeclaration error.
+            const int output_exponent;               /*<! exponent of output >*/
+            const Activation<feature_t> *activation; /*<! activation of Mul2D, if you don't specify anything, no activation is applied >*/
+            Tensor<feature_t> *output;               /*<! output ptr of Mul2D >*/
+            bool inplace;                            /*<! true: the output is stored in input0
+                                                          false: the output is stored in separate memory >*/
+        public:
+
+            /**
+             * @brief Construct a new Mul2D object.
+             * 
+             * @param output_exponent exponent of output
+             * @param activation      activation of Mul2D, if you don't specify anything, no activation is applied
+             * @param name            name of layer
+             * @param inplace         true: the output is stored in input0
+             *                        false: the output is stored in separate memory
+             */
+            Mul2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = NULL, bool inplace = false) : Layer(name),
+                                                                                            output_exponent(output_exponent), activation(activation), output(NULL)
+            {
+                this->inplace = inplace;
+            }
+
+            /**
+             * @brief Destroy the Mul2D object.
+             * 
+             * The output tensor is deleted only when inplace is false, i.e. when build() allocated it.
+             */
+            ~Mul2D() 
+            {
+                if ((!this->inplace) && (this->output != NULL))
+                {
+                    delete this->output;
+                }
+            }
+
+            /**
+             * @brief Update output shape.
+             * NOTE: input0.shape must equal to input1.shape.
+             * 
+             * @param input0 as one input
+             * @param input1 as another input
+             */
+            void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
+            {
+                assert(input0.is_same_shape(input1));
+
+                if (!this->inplace)
+                {
+                    // Allocate only while no output tensor exists yet. The previous "!= NULL" test
+                    // skipped the allocation on the first build and dereferenced NULL right below.
+                    if (this->output == NULL)
+                    {
+                        this->output = new Tensor<feature_t>;
+                    }
+                    this->output->set_exponent(this->output_exponent);
+                    this->output->set_shape(input0.shape);
+                    this->output->free_element();
+                }
+                else
+                {
+                    this->output = &input0;
+                }
+            }
+
+            /**
+             * @brief Get the output
+             * 
+             * @return Tensor<feature_t>& Mul2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call Mul2D operation.
+             * 
+             * build() must have been called before, because the output pointer is dereferenced here.
+             * 
+             * @param input0      as one input
+             * @param input1      as another input
+             * @param assign_core not effective yet
+             * @return Mul2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input0, Tensor<feature_t> &input1, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (!this->inplace)
+                {
+                    // Separate output: apply_element() on it and set the configured output exponent first.
+                    DL_LOG_LAYER_LATENCY_START();
+                    this->output->apply_element();
+                    this->output->set_exponent(this->output_exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::mul2d(*this->output, input0, input1, this->activation, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "mul2d");
+                }
+                else
+                {
+                    // Inplace: output was pointed at input0 by build(), so no apply step is done.
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::mul2d<true>(*this->output, input0, input1, this->activation, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "mul2d");
+                }
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_prelu.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_prelu.hpp
@ -0,0 +1,124 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_prelu.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief PReLU(input).
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class PReLU : public Layer
+        {
+        private:
+            // Pointer-to-const: the constructor receives a const feature_t *, which cannot be
+            // stored in a pointer to non-const without a cast. The pointed-to data is not owned here.
+            const feature_t *activation_element; /*<! quantized alpha elements along channel axis >*/
+            int activation_exponent;             /*<! exponent of quantized alpha elements >*/
+            Tensor<feature_t> *output;           /*<! output ptr of prelu >*/
+            bool inplace;                        /*<! true: the output is stored in the input tensor
+                                                      false: the output is stored in separate memory >*/
+        public:
+
+            /**
+             * @brief Construct a new PReLU object
+             * 
+             * @param activation_element   quantized alpha elements along channel axis
+             * @param activation_exponent  exponent of quantized alpha elements
+             * @param name                 name of prelu
+             * @param inplace              true: the output is stored in the input tensor
+             *                             false: the output is stored in separate memory
+             */
+            PReLU(const feature_t *activation_element, const int activation_exponent = 0, const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
+            {
+                this->activation_element = activation_element;
+                this->activation_exponent = activation_exponent;
+                this->inplace = inplace;
+            }
+
+            /**
+             * @brief Destroy the PReLU object
+             * 
+             * The output tensor is deleted only when inplace is false, i.e. when build() allocated it.
+             * activation_element is never freed by this class.
+             */
+            ~PReLU() 
+            {
+                if ((!this->inplace) && (this->output != NULL))
+                {
+                    delete this->output;
+                }
+            }
+
+            /**
+             * @brief Update output shape and exponent
+             * 
+             * In inplace mode the output pointer is simply set to the address of input.
+             * Otherwise the output tensor is allocated on the first call and reused afterwards.
+             * 
+             * @param input       as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                if (!this->inplace)
+                {
+                    // Allocate only while no output tensor exists yet. The previous "!= NULL" test
+                    // skipped the allocation on the first build and dereferenced NULL right below.
+                    if (this->output == NULL)
+                    {
+                        this->output = new Tensor<feature_t>;
+                    }
+                    this->output->set_exponent(input.exponent);
+                    this->output->set_shape(input.shape);
+                    this->output->free_element();
+                }
+                else
+                {
+                    this->output = &input;
+                }
+            }
+
+            /**
+             * @brief Get the output
+             * 
+             * @return Tensor<feature_t>& PReLU result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call PReLU operation.
+             * 
+             * build() must have been called before, because the output pointer is dereferenced here.
+             * 
+             * @param input       as an input
+             * @param assign_core not effective yet
+             * @return PReLU result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (!this->inplace)
+                {
+                    // Separate output: copy the input exponent and apply_element() on it first.
+                    DL_LOG_LAYER_LATENCY_START();
+                    this->output->set_exponent(input.exponent);
+                    this->output->apply_element();
+                    DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::prelu(*this->output, input, this->activation_element, this->activation_exponent, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "prelu");
+                }
+                else
+                {
+                    // Inplace: output was pointed at the input by build(), so no apply step is done.
+                    DL_LOG_LAYER_LATENCY_START();
+                    nn::prelu(*this->output, input, this->activation_element, this->activation_exponent, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "prelu");
+                }
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_relu.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_relu.hpp
@ -0,0 +1,120 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_tool.hpp"
+#include "dl_nn_relu.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief ReLU(input).
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class ReLU : public Layer
+        {
+        private:
+            Tensor<feature_t> *output; /*<! output ptr of relu; allocated by build() unless inplace >*/
+            bool inplace;              /*<! true: the output is stored in the input tensor
+                                            false: the output is stored in separate memory >*/
+
+        public:
+            /**
+             * @brief Construct a new ReLU object
+             *
+             * @param name            name of relu
+             * @param inplace         true: the output is stored in the input tensor
+             *                        false: the output is stored in separate memory
+             */
+            ReLU(const char *name = NULL, bool inplace = false) : Layer(name), output(NULL), inplace(inplace)
+            {
+            }
+
+            /**
+             * @brief Destroy the ReLU object
+             *
+             * The output tensor is deleted only when build() allocated it. In inplace mode
+             * this->output points at the caller's input tensor and is left untouched.
+             */
+            ~ReLU()
+            {
+                if ((!this->inplace) && (this->output != NULL))
+                {
+                    delete this->output;
+                }
+            }
+
+            /**
+             * @brief Update output shape and exponent
+             *
+             * @param input       as an input
+             */
+            void build(Tensor<feature_t> &input)
+            {
+                if (!this->inplace)
+                {
+                    // Allocate the output tensor on the first build only and reuse it afterwards.
+                    // The test used to be "!= NULL", which skipped the allocation on a fresh
+                    // layer and dereferenced a NULL pointer on the next line.
+                    if (this->output == NULL)
+                    {
+                        this->output = new Tensor<feature_t>;
+                    }
+                    this->output->set_exponent(input.exponent);
+                    this->output->set_shape(input.shape);
+                    this->output->free_element();
+                }
+                else
+                {
+                    this->output = &input;
+                }
+            }
+
+            /**
+             * @brief Get the output
+             *
+             * @return Tensor<feature_t>& ReLU result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call ReLU operation.
+             *
+             * @param input       as an input
+             * @param assign_core not effective yet
+             * @return ReLU result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (!this->inplace)
+                {
+                    // Only a separate output tensor needs preparing before the kernel runs.
+                    DL_LOG_LAYER_LATENCY_START();
+                    this->output->apply_element();
+                    this->output->set_exponent(input.exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+                }
+
+                // Same kernel call for both modes; inplace simply has output aliasing input.
+                DL_LOG_LAYER_LATENCY_START();
+                nn::relu(*this->output, input, assign_core);
+                DL_LOG_LAYER_LATENCY_END(this->name, "relu");
+
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_sub2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/layer/dl_layer_sub2d.hpp
@ -0,0 +1,124 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn_sub2d.hpp"
+#include "dl_layer_base.hpp"
+
+namespace dl
+{
+    namespace layer
+    {
+        /**
+         * @brief Activation(Sub2D(input0, input1)).
+         * NOTE: subtraction is element-wise, i.e., output[i,j,k] = input0[i,j,k] - input1[i,j,k]
+         * 
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         */
+        template <typename feature_t>
+        class Sub2D : public Layer
+        {
+        private:
+            const int output_exponent;               /*<! exponent of output >*/
+            const Activation<feature_t> *activation; /*<! activation of Sub2D, if you don't specify anything, no activation is applied >*/
+            Tensor<feature_t> *output;               /*<! output ptr of Sub2D; allocated by build() unless inplace >*/
+            bool inplace;                            /*<! true: the output is stored in input0
+                                                          false: the output is stored in separate memory >*/
+
+        public:
+            /**
+             * @brief Construct a new Sub2D object.
+             *
+             * @param output_exponent exponent of output
+             * @param activation      activation of Sub2D, if you don't specify anything, no activation is applied
+             * @param name            name of layer
+             * @param inplace         true: the output is stored in input0
+             *                        false: the output is stored in separate memory
+             */
+            Sub2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = NULL, bool inplace = false)
+                : Layer(name), output_exponent(output_exponent), activation(activation), output(NULL), inplace(inplace)
+            {
+            }
+
+            /**
+             * @brief Destroy the Sub2D object.
+             *
+             * The output tensor is deleted only when build() allocated it. In inplace mode
+             * this->output points at the caller's input0 and is left untouched.
+             */
+            ~Sub2D()
+            {
+                if ((!this->inplace) && (this->output != NULL))
+                {
+                    delete this->output;
+                }
+            }
+
+            /**
+             * @brief Update output shape.
+             * NOTE: input0.shape must equal to input1.shape.
+             *
+             * @param input0 as one input
+             * @param input1 as another input
+             */
+            void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
+            {
+                assert(input0.is_same_shape(input1));
+                if (!this->inplace)
+                {
+                    // Allocate the output tensor on the first build only and reuse it afterwards.
+                    // The test used to be "!= NULL", which skipped the allocation on a fresh
+                    // layer and dereferenced a NULL pointer on the next line.
+                    if (this->output == NULL)
+                    {
+                        this->output = new Tensor<feature_t>;
+                    }
+                    this->output->set_exponent(this->output_exponent);
+                    this->output->set_shape(input0.shape);
+                    this->output->free_element();
+                }
+                else
+                {
+                    this->output = &input0;
+                }
+            }
+
+            /**
+             * @brief Get the output
+             *
+             * @return Tensor<feature_t>& Sub2D result
+             */
+            Tensor<feature_t> &get_output()
+            {
+                return *this->output;
+            }
+
+            /**
+             * @brief Call Sub2D operation.
+             *
+             * @param input0      as one input
+             * @param input1      as another input
+             * @param assign_core not effective yet
+             * @return Sub2D result
+             */
+            Tensor<feature_t> &call(Tensor<feature_t> &input0, Tensor<feature_t> &input1, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+            {
+                DL_LOG_LAYER_LATENCY_INIT();
+
+                if (!this->inplace)
+                {
+                    DL_LOG_LAYER_LATENCY_START();
+                    // output is a pointer, so it needs "->" (it was written with ".").
+                    this->output->apply_element();
+                    // Keep the exponent requested at construction, as build() does; it used to
+                    // be overwritten with input0.exponent here.
+                    this->output->set_exponent(this->output_exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "apply");
+
+                    DL_LOG_LAYER_LATENCY_START();
+                    // Pass the tensor itself, as the sibling layers do, not the pointer to it.
+                    nn::sub2d(*this->output, input0, input1, this->activation, assign_core);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "sub2d");
+                }
+                else
+                {
+                    DL_LOG_LAYER_LATENCY_START();
+                    // NOTE(review): the explicit <true> template argument is kept from the original;
+                    // confirm it matches an nn::sub2d overload in dl_nn_sub2d.hpp.
+                    nn::sub2d<true>(*this->output, input0, input1, this->activation, assign_core, this->output_exponent);
+                    DL_LOG_LAYER_LATENCY_END(this->name, "sub2d");
+                }
+                return *this->output;
+            }
+        };
+    } // namespace layer
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/math/dl_math.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/math/dl_math.hpp
@ -0,0 +1,188 @@
+#pragma once
+
+#include <string.h>
+
+#include "dl_define.hpp"
+
+namespace dl
+{
+    namespace math
+    {
+        /**
+         * @brief x^a for an integer exponent, computed by repeated multiplication.
+         *
+         * The loop performs the multiplications in the same order as the former recursive
+         * version, so results are unchanged, but it needs no stack frame per exponent step
+         * and does not overflow when a is INT_MIN.
+         *
+         * @param x as a base
+         * @param a as an exponent; a negative value yields 1 / x^(-a), a zero value yields 1
+         * @return x^a
+         */
+        inline float power(float x, int a)
+        {
+            // Take |a| in unsigned arithmetic: negating INT_MIN as a signed int is undefined.
+            unsigned int remaining = (a < 0) ? (0u - (unsigned int)a) : (unsigned int)a;
+
+            float product = 1.f;
+            while (remaining > 0)
+            {
+                product = x * product;
+                --remaining;
+            }
+
+            return (a < 0) ? (1 / product) : product;
+        }
+
+        /**
+         * @brief Fast approximation of sqrt(x).
+         *
+         * Works on the raw bit pattern of the float: the bits are shifted right by one and a
+         * constant is added. No refinement step follows, so the result is only approximate.
+         * Assumes int and float have the same size.
+         *
+         * @param x as a base
+         * @return approximately sqrt(x)
+         */
+        inline float sqrt_quick(float x)
+        {
+            // memcpy instead of *(int *)&x: reading a float through an int pointer violates
+            // the strict-aliasing rule and is undefined behaviour.
+            int bits;
+            memcpy(&bits, &x, sizeof(bits));
+
+            bits = 0x1fbb4000 + (bits >> 1);
+
+            float result;
+            memcpy(&result, &bits, sizeof(result));
+            return result;
+        }
+
+        /**
+         * @brief Fast approximation of 1/sqrt(x).
+         *
+         * An initial guess is derived from the raw bit pattern of the float and then improved
+         * by a single Newton step. Assumes int and float have the same size.
+         *
+         * @param x as a base
+         * @return approximately 1/sqrt(x)
+         */
+        inline float sqrt_reciprocal_quick(float x)
+        {
+            const float xhalf = 0.5f * x;
+
+            // memcpy instead of pointer casts: reading a float through an int pointer (and
+            // back) violates the strict-aliasing rule and is undefined behaviour.
+            int i;
+            memcpy(&i, &x, sizeof(i));  // get bits for floating value
+            i = 0x5f375a86 - (i >> 1);  // gives initial guess y0
+            memcpy(&x, &i, sizeof(x));  // convert bits back to float
+
+            x = x * (1.5f - xhalf * x * x); // Newton step, repeating increases accuracy
+            return x;
+        }
+
+        // Absolute convergence tolerance: the Newton iterations in sqrt_newton() and
+        // root_newton() stop once two successive estimates differ by no more than this.
+        static const float EN = 0.00001f;
+
+        /**
+         * @brief sqrt(x) by Newton iteration.
+         *
+         * Iterates r <- (r + x / r) / 2 starting from r = x until two successive estimates
+         * differ by no more than EN.
+         *
+         * @param x as a base; values <= 0 return 0 (a negative x has no real root and would
+         *          otherwise keep the iteration from ever settling)
+         * @return sqrt(x)
+         */
+        inline float sqrt_newton(float x)
+        {
+            if (x <= 0.f)
+                return 0.f;
+
+            // Safety net: at large magnitudes the float spacing exceeds EN, so two estimates
+            // could alternate forever. Starting from r = x the estimate roughly halves per
+            // step, so the whole float range is covered well within this many steps.
+            const int max_steps = 128;
+
+            float result = x;
+            float last_value;
+            int step = 0;
+            do
+            {
+                last_value = result;
+                result = (last_value + x / last_value) * 0.5f;
+            } while ((DL_ABS(result - last_value) > EN) && (++step < max_steps));
+            return result;
+        }
+
+        /**
+         * @brief n-th root of x by Newton iteration.
+         *
+         * Iterates r <- ((n - 1) / n) * r + x / (n * r^(n - 1)) starting from r = x until two
+         * successive estimates differ by no more than EN.
+         *
+         * @param x as a base
+         * @param n root times; n == 0 returns 1, n == 1 returns x, n == 2 uses sqrt_newton()
+         * @return n-th root of x
+         */
+        inline float root_newton(float x, int n)
+        {
+            if (n == 2)
+                return sqrt_newton(x);
+            if (n == 0)
+                return 1.f;
+            if (n == 1)
+                return x;
+            if (x == 0.f)
+                return 0.f;
+
+            // Safety net for inputs that cannot converge (e.g. negative x with an even n)
+            // or that alternate between neighbouring floats.
+            const int max_steps = 10000;
+
+            // Weight of the previous estimate in the Newton update. This was computed as
+            // (n - 1) * n, which made every step grow the estimate instead of shrinking it.
+            const float keep = (float)(n - 1) / (float)n;
+
+            float result = x;
+            float last_value;
+            int step = 0;
+            do
+            {
+                last_value = result;
+                result = keep * last_value + x / (n * power(last_value, n - 1));
+            } while ((DL_ABS(result - last_value) > EN) && (++step < max_steps));
+            return result;
+        }
+
+        /**
+         * @brief Polynomial approximation of atan(x).
+         *
+         * NOTE(review): the formula returns exactly x * pi/4 at |x| == 1 and looks like the
+         * usual approximation meant for |x| <= 1; atan2() in this file only passes values in
+         * [0, 1]. Confirm before calling it with larger arguments.
+         *
+         * @param x as an input
+         * @return approximately atan(x)
+         */
+        inline float atan(float x)
+        {
+            const float magnitude = DL_ABS(x);
+            return x * (0.78539816 - (magnitude - 1) * (0.2447 + 0.0663 * magnitude));
+            // Alternative odd polynomial, kept for reference:
+            // float s = x*x;
+            // return ((-0.0464964749 * s + 0.15931422) * s - 0.327622764) * s * x + x;
+        }
+
+        // TODO:@yuanjiong
+        /**
+         * @brief Angle of the point (x, y), measured from the positive x axis.
+         *
+         * Note the argument order: x comes first, the reverse of the C library's atan2(y, x).
+         * The smaller of |x| and |y| is divided by the larger, atan() is applied to that ratio,
+         * and the result is then mirrored into the right octant and quadrant.
+         *
+         * @param x horizontal component
+         * @param y vertical component
+         * @return in range [-pi, pi]
+         */
+        inline float atan2(float x, float y)
+        {
+            const float abs_x = DL_ABS(x);
+            const float abs_y = DL_ABS(y);
+            const float eps = 1e-8; // keeps the division defined when both inputs are zero
+            const float ratio = DL_MIN(abs_x, abs_y) / (DL_MAX(abs_x, abs_y) + eps);
+
+            float angle = atan(ratio); // ratio is in [0, 1]
+
+            // |y| > |x|: the point lies nearer the y axis, mirror about the 45 degree line.
+            if (abs_y > abs_x)
+            {
+                angle = 1.57079633 - angle;
+            }
+            // Left half plane: mirror about the y axis.
+            if (x < 0)
+            {
+                angle = 3.14159265 - angle;
+            }
+            // Lower half plane: mirror about the x axis.
+            if (y < 0)
+            {
+                angle = -angle;
+            }
+
+            return angle;
+        }
+
+        /**
+         * @brief acos(x).
+         *
+         * Computed as the angle of the point (x, sqrt(1 - x^2)) via atan2().
+         *
+         * @param x as an input, expected in [-1, 1]
+         * @return acos(x) in range [0, pi]
+         */
+        inline float acos(float x)
+        {
+            const float other_leg = sqrt_newton(1.0 - x * x);
+            return atan2(x, other_leg);
+        }
+
+        /**
+         * @brief asin(x).
+         *
+         * Computed as the angle of the point (sqrt(1 - x^2), x) via atan2().
+         *
+         * @param x as an input, expected in [-1, 1]
+         * @return asin(x) in range [-pi/2, pi/2]
+         */
+        inline float asin(float x)
+        {
+            const float other_leg = sqrt_newton(1.0 - x * x);
+            return atan2(other_leg, x);
+        }
+
+        /**
+         * @brief e^x, approximated as (1 + x / 2^steps)^(2^steps).
+         *
+         * The base is formed once and then squared `steps` times.
+         *
+         * @param x     exponent
+         * @param steps iteration steps (number of squarings)
+         * @return e^x
+         */
+        inline float exp_fast(double x, int steps)
+        {
+            double value = 1.0 + x / (1 << steps);
+            int remaining = steps;
+            while (remaining > 0)
+            {
+                value *= value;
+                --remaining;
+            }
+            return value;
+        }
+    }
+}
--- a/tools/sdk/esp32/include/esp-face/include/math/dl_math_matrix.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/math/dl_math_matrix.hpp
@ -0,0 +1,397 @@
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+#include <vector>
+#include "dl_define.hpp"
+#include "dl_tool.hpp"
+#include "dl_variable.hpp"
+#include "esp_timer.h"
+
+namespace dl
+{
+    namespace math
+    {
+        /**
+         * @brief A dense 2-D matrix stored as an array of individually allocated rows.
+         *
+         * Element (i, j) is array[i][j]. Storage is obtained with calloc() and released with
+         * free(); array is NULL whenever the matrix holds no storage.
+         *
+         * @tparam T element type
+         */
+        template <typename T>
+        class Matrix
+        {
+        public:
+            T **array; /*<! row pointers, NULL when no storage is held >*/
+            int h;     /*<! number of rows >*/
+            int w;     /*<! number of columns >*/
+
+            /**
+             * @brief Construct an empty matrix without storage.
+             */
+            Matrix() : array(NULL), h(0), w(0)
+            {
+            }
+
+            /**
+             * @brief Construct an h x w matrix with zero-initialised elements.
+             *
+             * @param h number of rows
+             * @param w number of columns
+             */
+            Matrix(int h, int w) : array(NULL), h(h), w(w)
+            {
+                this->calloc_element();
+            }
+
+            /**
+             * @brief Construct an h x w matrix with every element set to s.
+             *
+             * @param h number of rows
+             * @param w number of columns
+             * @param s value given to every element
+             */
+            Matrix(int h, int w, T s) : array(NULL), h(h), w(w)
+            {
+                this->calloc_element();
+                this->set_value(s);
+            }
+
+            /**
+             * @brief Construct a deep copy of another matrix.
+             *
+             * @param mat the matrix to copy
+             */
+            Matrix(const Matrix<T> &mat) : array(NULL), h(mat.h), w(mat.w)
+            {
+                this->calloc_element();
+                // Nothing to copy when no storage is held (empty source or failed allocation).
+                if (this->array != NULL)
+                {
+                    this->set_value(mat);
+                }
+            }
+
+            /**
+             * @brief Release every row and the row-pointer array.
+             */
+            virtual ~Matrix()
+            {
+                this->release_element();
+            }
+
+            /**
+             * @brief calloc the matrix element
+             *
+             * Allocates h rows of w zero-initialised elements. When h or w is not positive no
+             * storage is allocated and array is set to NULL. If any allocation fails, everything
+             * allocated so far is freed and the matrix becomes empty (array == NULL, h == w == 0)
+             * so that the element loops of this class do not touch a NULL pointer.
+             *
+             * Storage already referenced by array is not freed here.
+             */
+            void calloc_element()
+            {
+                this->array = NULL;
+                if ((this->h <= 0) || (this->w <= 0))
+                {
+                    return;
+                }
+
+                T **rows = (T **)calloc(this->h, sizeof(T *));
+                bool complete = (rows != NULL);
+                for (int i = 0; complete && (i < this->h); i++)
+                {
+                    rows[i] = (T *)calloc(this->w, sizeof(T));
+                    complete = (rows[i] != NULL);
+                }
+
+                if (!complete)
+                {
+                    if (rows != NULL)
+                    {
+                        // Rows that were never reached are still NULL from calloc(); free(NULL) is a no-op.
+                        for (int i = 0; i < this->h; i++)
+                        {
+                            free(rows[i]);
+                        }
+                        free(rows);
+                    }
+                    this->h = 0;
+                    this->w = 0;
+                    return;
+                }
+
+                this->array = rows;
+            }
+
+            /**
+             * @brief Set the matrix element to random number.
+             *
+             * Re-seeds rand() from esp_timer_get_time() on every call, then assigns each element
+             * rand() / RAND_MAX * thresh, evaluated in type T.
+             *
+             * @param thresh the max abs value of the element.
+             */
+            void set_random(T thresh = 1)
+            {
+                unsigned int seed = esp_timer_get_time();
+                srand(seed);
+                for (int i = 0; i < this->h; i++)
+                {
+                    for (int j = 0; j < this->w; j++)
+                    {
+                        this->array[i][j] = ((T)rand()) / (T)(RAND_MAX)*thresh;
+                    }
+                }
+            }
+
+            /**
+             * @brief Set the small value to zero
+             *
+             * @param thresh elements whose absolute value is below this threshold become 0
+             */
+            void set_zero(T thresh = 1e-8)
+            {
+                for (int i = 0; i < this->h; i++)
+                {
+                    for (int j = 0; j < this->w; j++)
+                    {
+                        if (DL_ABS(this->array[i][j]) < thresh)
+                        {
+                            this->array[i][j] = 0;
+                        }
+                    }
+                }
+            }
+
+            /**
+             * @brief Set the matrix value from a vector
+             *
+             * The vector is read in row-major order and must hold exactly h * w values.
+             *
+             * @tparam TT
+             * @param mat   the input vector
+             */
+            template <typename TT>
+            void set_value(const std::vector<TT> &mat)
+            {
+                assert((this->w * this->h) == (int)mat.size());
+                int index = 0;
+                for (int i = 0; i < this->h; i++)
+                {
+                    for (int j = 0; j < this->w; j++)
+                    {
+                        this->array[i][j] = (T)(mat[index++]);
+                    }
+                }
+            }
+
+            /**
+             * @brief Set the matrix value from another matrix.
+             *
+             * Both matrices must have the same dimensions; elements are cast to T.
+             *
+             * @tparam TT
+             * @param mat the input matrix.
+             */
+            template <typename TT>
+            void set_value(const Matrix<TT> &mat)
+            {
+                assert((this->h == mat.h) && (this->w == mat.w));
+                for (int i = 0; i < this->h; i++)
+                {
+                    for (int j = 0; j < this->w; j++)
+                    {
+                        this->array[i][j] = (T)(mat.array[i][j]);
+                    }
+                }
+            }
+
+            /**
+             * @brief Set a part of the matrix value from another matrix.
+             *
+             * The target region is rows [h_start, h_end) and columns [w_start, w_end); mat must
+             * have exactly that size.
+             *
+             * @param h_start    the start index of height
+             * @param h_end      the end index of height (exclusive)
+             * @param w_start    the start index of width
+             * @param w_end      the end index of width (exclusive)
+             * @param mat        the input matrix
+             */
+            void set_value(int h_start, int h_end, int w_start, int w_end, const Matrix<T> &mat)
+            {
+                const int rows = h_end - h_start;
+                const int cols = w_end - w_start;
+
+                assert((rows == mat.h) && (cols == mat.w));
+                assert((h_end <= this->h) && (w_end <= this->w) && (h_start >= 0) && (w_start >= 0));
+                for (int i = 0; i < rows; i++)
+                {
+                    for (int j = 0; j < cols; j++)
+                    {
+                        this->array[i + h_start][j + w_start] = mat.array[i][j];
+                    }
+                }
+            }
+
+            /**
+             * @brief Set the matrix value to a constant.
+             *
+             * @tparam TT
+             * @param s  the input value.
+             */
+            template <typename TT>
+            void set_value(TT s)
+            {
+                for (int i = 0; i < this->h; i++)
+                {
+                    for (int j = 0; j < this->w; j++)
+                    {
+                        this->array[i][j] = (T)s;
+                    }
+                }
+            }
+
+            /**
+             * @brief print the matrix element.
+             *
+             * Prints the dimensions, then one text line per row with every element cast to float.
+             */
+            void print_value() const
+            {
+                printf("h: %d, w: %d\n", this->h, this->w);
+                for (int i = 0; i < this->h; i++)
+                {
+                    for (int j = 0; j < this->w; j++)
+                    {
+                        printf("%f ", (float)(this->array[i][j]));
+                    }
+                    printf("\n");
+                }
+            }
+
+            /**
+             * @brief  do matrix multiply
+             *
+             * @param input     the input matrix
+             * @return Matrix<T> the output matrix
+             */
+            Matrix<T> matmul(const Matrix<T> &input) const;
+
+            /**
+             * @brief transpose the matrix
+             *
+             * @return Matrix<T> the transposed matrix
+             */
+            Matrix<T> transpose() const;
+
+            /**
+             * @brief get the inverse matrix
+             *
+             * @return Matrix<T> the output matrix
+             */
+            Matrix<T> inverse() const;
+
+            /**
+             * @brief get the diagonal of the matrix
+             *
+             * @return Matrix<T> the diagonal
+             */
+            Matrix<T> diagonal() const;
+
+            /**
+             * @brief slice the matrix
+             *
+             * @param h_start   the start index of height
+             * @param h_end     the end index of height
+             * @param w_start   the start index of width
+             * @param w_end     the end index of width
+             * @return Matrix<T> the output.
+             */
+            Matrix<T> slice(int h_start, int h_end, int w_start, int w_end) const;
+
+            /**
+             * @brief get an identity matrix
+             *
+             * @param n     the dim of the identity matrix
+             * @return Matrix<T>  the output
+             */
+            static Matrix<T> identity(int n)
+            {
+                Matrix<T> A(n, n);
+                // Bounded by the size actually held, so an empty result is never indexed.
+                for (int i = 0; i < A.h; ++i)
+                {
+                    A.array[i][i] = 1;
+                }
+                return A;
+            }
+
+            /**
+             * @brief get a diag matrix
+             *
+             * @param d  the diagonal value, given as a 1 x n row matrix.
+             * @return Matrix<T>  the n x n output with d on its diagonal
+             */
+            static Matrix<T> diag(const Matrix<T> &d)
+            {
+                assert(d.h == 1);
+                Matrix<T> A(d.w, d.w);
+                for (int i = 0; i < A.h; ++i)
+                {
+                    A.array[i][i] = d.array[0][i];
+                }
+                return A;
+            }
+
+            /**
+             * @brief get a 1 x n row matrix holding 0, 1, ..., n - 1
+             *
+             * @param n  the number of elements
+             * @return Matrix<T>  the output
+             */
+            static Matrix<T> arange(uint32_t n)
+            {
+                Matrix<T> A(1, n);
+                for (int i = 0; i < A.w; ++i)
+                {
+                    A.array[0][i] = i;
+                }
+                return A;
+            }
+
+            /**
+             * @brief get a 1 x (n2 - n1) row matrix holding n1, n1 + 1, ..., n2 - 1
+             *
+             * @param n1  the first value
+             * @param n2  the end value (exclusive), must be greater than n1
+             * @return Matrix<T>  the output
+             */
+            static Matrix<T> arange(uint32_t n1, uint32_t n2)
+            {
+                int len = n2 - n1;
+                assert(len > 0);
+                Matrix<T> A(1, len);
+                for (int i = 0; i < A.w; ++i)
+                {
+                    A.array[0][i] = n1 + i;
+                }
+
+                return A;
+            }
+
+            /**
+             * @brief get the F_norm of the matrix
+             *
+             * Sums the squares of all elements and takes the square root with sqrt_newton().
+             *
+             * @return T the output F_norm
+             */
+            T F_norm() const
+            {
+                T f_n = 0.0;
+                for (int i = 0; i < this->h; ++i)
+                {
+                    for (int j = 0; j < this->w; ++j)
+                    {
+                        f_n += (this->array[i][j] * this->array[i][j]);
+                    }
+                }
+                f_n = sqrt_newton(f_n);
+                return f_n;
+            }
+
+            /**
+             * @brief Deep-copy assignment.
+             *
+             * When the dimensions already match, elements are copied into the existing storage;
+             * otherwise the old storage is released and new storage of A's size is allocated.
+             *
+             * @param A the matrix to copy
+             * @return Matrix<T>& this matrix
+             */
+            Matrix<T> &operator=(const Matrix<T> &A)
+            {
+                if ((A.h == this->h) && (A.w == this->w))
+                {
+                    for (int i = 0; i < A.h; ++i)
+                    {
+                        for (int j = 0; j < A.w; ++j)
+                        {
+                            this->array[i][j] = A.array[i][j];
+                        }
+                    }
+                }
+                else
+                {
+                    this->release_element();
+                    this->h = A.h;
+                    this->w = A.w;
+                    if ((A.h > 0) && (A.w > 0))
+                    {
+                        this->calloc_element();
+                        if (this->array != NULL)
+                        {
+                            this->set_value(A);
+                        }
+                    }
+                }
+                return *this;
+            }
+
+        private:
+            /**
+             * @brief Free every row and the row-pointer array, leaving array == NULL.
+             *
+             * h and w are left unchanged; shared by the destructor and operator=.
+             */
+            void release_element()
+            {
+                if (this->array == NULL)
+                {
+                    return;
+                }
+                for (int i = 0; i < this->h; i++)
+                {
+                    free(this->array[i]);
+                }
+                free(this->array);
+                this->array = NULL;
+            }
+        };
+
+        /**
+         * @brief Get the affine transform matrix
+         *
+         * Declaration only; the definition is not part of this header.
+         * NOTE(review): the expected layout of the coordinate matrices (for example one point
+         * per row) is not stated here - confirm against the implementation.
+         *
+         * @param source_coord the source coordinates
+         * @param dest_coord   the target coordinates
+         * @return Matrix<float> the output matrix
+         */
+        Matrix<float> get_affine_transform(Matrix<float> &source_coord, Matrix<float> &dest_coord);
+        
+        /**
+         * @brief Get the similarity transform matrix
+         *
+         * Declaration only; the definition is not part of this header.
+         * NOTE(review): the expected layout of the coordinate matrices (for example one point
+         * per row) is not stated here - confirm against the implementation.
+         *
+         * @param source_coord the source coordinates
+         * @param dest_coord   the target coordinates
+         * @return Matrix<float> the output matrix
+         */
+        Matrix<float> get_similarity_transform(Matrix<float> &source_coord, Matrix<float> &dest_coord);
+        
+        /**
+         * @brief Get the perspective transform matrix
+         *
+         * Declaration only; the definition is not part of this header.
+         * NOTE(review): the expected layout of the coordinate matrices (for example one point
+         * per row) is not stated here - confirm against the implementation.
+         *
+         * @param source_coord the source coordinates
+         * @param dest_coord   the target coordinates
+         * @return Matrix<float> the output matrix
+         */
+        Matrix<float> get_perspective_transform(Matrix<float> &source_coord, Matrix<float> &dest_coord);
+    } // namespace math
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/model_zoo/cat_face_detect_mn03.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/model_zoo/cat_face_detect_mn03.hpp
@ -0,0 +1,47 @@
+#pragma once
+
+#include <stdint.h>
+#include <vector>
+#include <list>
+#include "dl_detect_define.hpp"
+
+/**
+ * @brief Hardware Requirement.
+ *        - flash 310kB
+ */
+
+class CatFaceDetectMN03
+{
+private:
+    void *model; // opaque handle to the underlying model; its concrete type is not exposed in this header
+
+public:
+    /**
+     * @brief Construct a new Cat Face Detect MN03 object.
+     * 
+     * @param score_threshold predicted boxes with score lower than the threshold will be filtered out
+     * @param nms_threshold   predicted boxes with IoU higher than the threshold will be filtered out
+     * @param top_k           only the k highest-scoring boxes are kept
+     * @param resize_scale    resize scale to implement on input image
+     */
+    CatFaceDetectMN03(const float score_threshold, const float nms_threshold, const int top_k, const float resize_scale);
+
+    /**
+     * @brief Destroy the Cat Face Detect MN03 object.
+     * 
+     */
+    ~CatFaceDetectMN03();
+
+    /**
+     * @brief Inference.
+     * 
+     * @tparam T supports uint8_t and uint16_t
+     *         - uint8_t: input image is RGB888
+     *         - uint16_t: input image is RGB565
+     * @param input_element pointer of input image
+     * @param input_shape   shape of input image
+     * @return detection result, returned by reference
+     *         NOTE(review): presumably the list is owned by the detector and only valid until the
+     *         next call - confirm against the implementation.
+     */
+    template <typename T>
+    std::list<dl::detect::result_t> &infer(T *input_element, std::vector<int> input_shape);
+};
--- a/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognition_112_v1_s16.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognition_112_v1_s16.hpp
@ -0,0 +1,30 @@
+#pragma once
+
+#include "dl_variable.hpp"
+#include "face_recognition_tool.hpp"
+#include "face_recognizer.hpp"
+#include <vector>
+
+using namespace dl;
+
+/**
+ * @brief face recognition model v1
+ * input size: 112 x 112 x 3
+ * quantization mode: S16
+ * 
+ * Specialises FaceRecognizer for int16_t features; only the constructor and destructor are
+ * declared here, everything else comes from the base class.
+ */
+class FaceRecognition112V1S16 : public FaceRecognizer<int16_t>
+{       
+    public:
+        /**
+         * @brief Construct a new FaceRecognition112V1S16 object
+         * 
+         */
+        FaceRecognition112V1S16();
+        
+        /**
+         * @brief Destroy the FaceRecognition112V1S16 object
+         * 
+         */
+        ~FaceRecognition112V1S16();
+};
--- a/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognition_112_v1_s8.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognition_112_v1_s8.hpp
@ -0,0 +1,30 @@
+#pragma once
+
+#include "dl_variable.hpp"
+#include "face_recognition_tool.hpp"
+#include "face_recognizer.hpp"
+#include <vector>
+
+using namespace dl;
+
+/**
+ * @brief face recognition model v1
+ * input size: 112 x 112 x 3
+ * quantization mode: S8
+ * 
+ * Specialises FaceRecognizer for int8_t features; only the constructor and destructor are
+ * declared here, everything else comes from the base class.
+ */
+class FaceRecognition112V1S8 : public FaceRecognizer<int8_t>
+{       
+    public:
+        /**
+         * @brief Construct a new FaceRecognition112V1S8 object
+         * 
+         */
+        FaceRecognition112V1S8();
+
+        /**
+         * @brief Destroy the FaceRecognition112V1S8 object
+         * 
+         */
+        ~FaceRecognition112V1S8();
+};
--- a/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognition_tool.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognition_tool.hpp
@ -0,0 +1,162 @@
+#pragma once
+
+#include "dl_variable.hpp"
+#include "dl_define.hpp"
+#include "dl_tool.hpp"
+#include "dl_math.hpp"
+#include "dl_math_matrix.hpp"
+#include <vector>
+#include <list>
+#include <algorithm>
+#include <math.h>
+#include <string>
+
+/**
+ * @brief struct of face similarity
+ * 
+ * Returned by value from FaceRecognizer::recognize().
+ */
+typedef struct
+{
+    int id;           /*<! id index; presumably that of the matched FaceID - confirm >*/
+    std::string name; /*<! id name; presumably that of the matched FaceID - confirm >*/
+    float similarity; /*<! similarity score of the comparison >*/
+} face_info_t;
+
+
+/**
+ * @brief Face ID
+ * 
+ * Bundles an id index, its embedding tensor and a name.
+ * 
+ * @tparam feature_t element type of the embedding tensor
+ */
+template <typename feature_t>
+class FaceID
+{
+public:
+    int id;     /*<! id index >*/
+    dl::Tensor<feature_t> id_emb;   /*<! id embedding >*/
+    std::string name;                /*<! id name >*/
+
+    /**
+     * @brief Construct a new Face ID object
+     * 
+     * @param id        id index
+     * @param id_emb    id embedding
+     * @param name      id name, empty by default
+     */
+    FaceID(int id, dl::Tensor<feature_t> &id_emb, std::string name = "");
+
+    /**
+     * @brief Destroy the Face ID object
+     * 
+     */
+    ~FaceID() {}
+
+    /**
+     * @brief print the face id information
+     * 
+     */
+    void print();
+};
+
+namespace face_recognition_tool
+{
+    /**
+     * @brief l2 normalize the feature
+     * 
+     * @param feature the feature tensor, passed by non-const reference
+     */
+    void l2_norm(dl::Tensor<float> &feature);
+
+    /**
+     * @brief calculate the cosine distance of the input ids
+     * 
+     * @param id_1  id 1
+     * @param id_2  id 2
+     * @param normalized_ids true: the input ids have been normalized.
+     *                       false: the input ids have not been normalized
+     * @param type           0: cos dist: [-1, 1]
+     *                       1: normalized cos dist: [0, 1]
+     * @return float  the cosine distance
+     */
+    float cos_distance(dl::Tensor<float> &id_1, dl::Tensor<float> &id_2, bool normalized_ids = true, int8_t type = 0);
+
+    /**
+     * @brief transform the image to the input of a mfn model 
+     * 
+     * @tparam T element type of the returned tensor
+     * @param image             the input image.
+     * @param free_input        true: free the input image.
+     *                          false: do not free the input image.
+     * @param do_padding        true: pad the result.
+     *                          false: do not pad the result. 
+     * @return dl::Tensor<T>*   the preprocessed image
+     *         NOTE(review): presumably the caller owns the returned tensor - confirm.
+     */
+    template <typename T>
+    dl::Tensor<T> *transform_mfn_input(dl::Tensor<uint8_t> &image, bool free_input = false, bool do_padding = true);
+
+    /**
+     * @brief  transform the image to the input of a mfn model 
+     * 
+     * @tparam T element type of the output tensor
+     * @param image                the input image.
+     * @param output               the preprocessed image.
+     * @param free_input           true: free the input image.
+     *                             false: do not free the input image.
+     * @param do_padding           true: pad the result.
+     *                             false: do not pad the result
+     */
+    template <typename T>
+    void transform_mfn_input(dl::Tensor<uint8_t> &image, dl::Tensor<T> &output, bool free_input = false, bool do_padding = true);
+
+    /**
+     * @brief transform the mfn output embedding to a floating embedding
+     * 
+     * @tparam T element type of the input embedding
+     * @param input the input embedding. 
+     * @param norm   true: normalize the output embedding.
+     *               false: do not normalize the output embedding.
+     * @param free_input true: free the input embedding.
+     *                   false: do not free the input embedding.
+     * @return dl::Tensor<float>* the floating embedding
+     *         NOTE(review): presumably the caller owns the returned tensor - confirm.
+     */
+    template <typename T>
+    dl::Tensor<float> *transform_mfn_output(dl::Tensor<T> &input, bool norm = true, bool free_input = false);
+
+    /**
+     * @brief transform the mfn output embedding to a floating embedding
+     * 
+     * @tparam T element type of the input embedding
+     * @param input         the input embedding. 
+     * @param output        the output embedding.
+     * @param norm          true: normalize the output embedding.
+     *                      false: do not normalize the output embedding.
+     * @param free_input    true: free the input embedding.
+     *                      false: do not free the input embedding.
+     */
+    template <typename T>
+    void transform_mfn_output(dl::Tensor<T> &input, dl::Tensor<float> &output, bool norm = true, bool free_input = false);
+
+    /**
+     * @brief get the aligned face.
+     * 
+     * @tparam T element type of the input and output tensors
+     * @param input     input tensor 
+     * @param output    the output aligned face.
+     * @param landmarks the landmarks of the face.
+     */
+    template <typename T>
+    void align_face(dl::Tensor<T> *input, dl::Tensor<T> *output, std::vector<int> &landmarks);
+
+    /**
+     * @brief get the aligned face.
+     * 
+     * @tparam T element type of the output tensor
+     * @param input     input image with rgb565 format.
+     * @param shape     the shape of the input image.
+     * @param output    the output aligned face.
+     * @param landmarks the landmarks of the face.
+     */
+    template <typename T>
+    void align_face(uint16_t *input, std::vector<int> shape, dl::Tensor<T> *output, std::vector<int> &landmarks);
+
+} // namespace face_recognition_tool
--- a/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognizer.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/model_zoo/face_recognizer.hpp
@ -0,0 +1,220 @@
+#pragma once
+
+#include "dl_variable.hpp"
+#include "face_recognition_tool.hpp"
+#include <vector>
+
+using namespace dl;
+
+/**
+ * @brief Face recognizer: enrolls, deletes and recognizes face ids from images, aligned faces or embeddings.
+ *
+ * @tparam feature_t element type of the model input/output tensors used by forward().
+ */
+template<typename feature_t>
+class FaceRecognizer
+{
+    public:
+        /**
+         * @brief Construct a new Face Recognizer object
+         *
+         */
+        FaceRecognizer();
+
+        /**
+         * @brief Destroy the Face Recognizer object
+         *
+         */
+        virtual ~FaceRecognizer();
+
+        void *model; // untyped model handle; NOTE(review): concrete type and ownership are not visible in this header
+
+        /**
+         * @brief Set the face recognition threshold [-1, 1], default thresh: 0.55
+         * Note: If the similarity of two faces is greater than the threshold, they will be judged as the same person
+         *
+         * @param thresh the new recognition threshold.
+         */
+        void set_thresh(float thresh);
+
+        /**
+         * @brief Get the current threshold of recognizer.
+         *
+         * @return float current threshold.
+         */
+        float get_thresh();
+
+        /**
+         * @brief Get the input shape of the recognizer.
+         *
+         * @return std::vector<int> the input shape of the recognizer.
+         */
+        std::vector<int> get_input_shape();
+
+        /**
+         * @brief do forward
+         *
+         * @param model_input            the input data of the face recognition model.
+         * Note: the input data should have been preprocessed.
+         * @return Tensor<feature_t>&    the output of the face recognition model.
+         */
+        Tensor<feature_t> &forward(Tensor<feature_t> &model_input);
+
+        /**
+         * @brief recognize face
+         *
+         * @param image_input       the pointer of the input image with format bgr565.
+         * @param shape             the shape of the input image
+         * @param landmarks         face landmarks coordinates
+         * @return face_info_t      the recognition result.
+         */
+        face_info_t recognize(uint16_t *image_input, std::vector<int> shape, std::vector<int> &landmarks);
+
+        /**
+         * @brief recognize face
+         *
+         * @param image_input        the pointer of the input image with format bgr565.
+         * @param shape              the shape of the input image
+         * @param aligned_face       the Tensor to store the intermediate aligned face.
+         * @param landmarks          face landmarks coordinates
+         * @return face_info_t       the recognition result.
+         */
+        face_info_t recognize(uint16_t *image_input, std::vector<int> shape, Tensor<uint8_t> &aligned_face, std::vector<int> &landmarks);
+
+        /**
+         * @brief recognize face
+         *
+         * @param image_input         the Tensor of input image with format bgr888.
+         * @param landmarks           face landmarks coordinates
+         * @return face_info_t        the recognition result.
+         */
+        face_info_t recognize(Tensor<uint8_t> &image_input, std::vector<int> &landmarks);
+
+        /**
+         * @brief recognize face
+         *
+         * @param image_input           the Tensor of input image with format bgr888.
+         * @param aligned_face          the Tensor to store the intermediate aligned face.
+         * @param landmarks             face landmarks coordinates
+         * @return face_info_t          the recognition result.
+         */
+        face_info_t recognize(Tensor<uint8_t> &image_input, Tensor<uint8_t> &aligned_face, std::vector<int> &landmarks);
+
+        /**
+         * @brief recognize face
+         *
+         * @param aligned_face          the Tensor of the input aligned face with format bgr888.
+         * @return face_info_t          the recognition result.
+         */
+        face_info_t recognize(Tensor<uint8_t> &aligned_face);
+
+        /**
+         * @brief recognize the face embedding.
+         *
+         * @param emb  the normalized face embedding.
+         * @return face_info_t  the recognition result.
+         */
+        face_info_t recognize(Tensor<float> &emb);
+
+        /**
+         * @brief Get the enrolled face ids
+         *
+         * @return std::vector<face_info_t> a vector with one face_info_t entry per enrolled face id
+         */
+        std::vector<face_info_t> get_enrolled_ids();
+
+        /**
+         * @brief Get the face embedding
+         *
+         * @param id the face id index (default -1)
+         * @return Tensor<float>& the face embedding of the face id index.
+         *                        if there is no matched id return the embedding of last input image.
+         */
+        Tensor<float> &get_face_emb(int id=-1);
+
+        /**
+         * @brief Get the number of enrolled id
+         *
+         * @return int the number of enrolled id
+         */
+        int get_enrolled_id_num();
+
+        /**
+         * @brief enroll face id
+         *
+         * @param image_input       the pointer of the input image with format bgr565.
+         * @param shape             the shape of the input image
+         * @param landmarks         face landmarks coordinates
+         * @param name              name of the face id (default: empty string).
+         * @return  int             the face id index of the enrolled embedding.
+         */
+        int enroll_id(uint16_t *image_input, std::vector<int> shape, std::vector<int> &landmarks, std::string name="");
+
+        /**
+         * @brief enroll face id
+         *
+         * @param image_input        the pointer of the input image with format bgr565.
+         * @param shape              the shape of the input image
+         * @param aligned_face       the Tensor to store the intermediate aligned face.
+         * @param landmarks          face landmarks coordinates
+         * @param name               name of the face id (default: empty string).
+         * @return  int              the face id index of the enrolled embedding.
+         */
+        int enroll_id(uint16_t *image_input, std::vector<int> shape, Tensor<uint8_t> &aligned_face, std::vector<int> &landmarks, std::string name="");
+
+        /**
+         * @brief enroll face id
+         *
+         * @param image_input         the Tensor of input image with format bgr888.
+         * @param landmarks           face landmarks coordinates
+         * @param name                name of the face id (default: empty string).
+         * @return  int               the face id index of the enrolled embedding.
+         */
+        int enroll_id(Tensor<uint8_t> &image_input, std::vector<int> &landmarks, std::string name="");
+
+        /**
+         * @brief enroll face id
+         *
+         * @param image_input           the Tensor of input image with format bgr888.
+         * @param aligned_face          the Tensor to store the intermediate aligned face.
+         * @param landmarks             face landmarks coordinates
+         * @param name                  name of the face id (default: empty string).
+         * @return  int                 the face id index of the enrolled embedding.
+         */
+        int enroll_id(Tensor<uint8_t> &image_input, Tensor<uint8_t> &aligned_face, std::vector<int> &landmarks, std::string name="");
+
+        /**
+         * @brief enroll face id
+         *
+         * @param aligned_face          the Tensor of the input aligned face with format bgr888.
+         * @param name                  name of the face id (default: empty string).
+         * @return  int                 the face id index of the enrolled embedding.
+         */
+        int enroll_id(Tensor<uint8_t> &aligned_face, std::string name="");
+
+        /**
+         * @brief       enroll the normalized face embedding.
+         *
+         * @param emb   the normalized face embedding.
+         * @param name  name of the face id (default: empty string).
+         * @return int  the face id index of the enrolled embedding.
+         */
+        int enroll_id(Tensor<float> &emb, std::string name="");
+
+        /**
+         * @brief       delete the last enrolled face id.
+         *
+         * @return int  the number of remaining face ids.
+         *              if the face ids list is empty, return -1
+         */
+        int delete_id();
+
+        /**
+         * @brief       delete the face id with id index.
+         *
+         * @param id    face id index.
+         * @return int  the number of remaining face ids.
+         *              if there is no matched id return -1
+         */
+        int delete_id(int id);
+};
--- a/tools/sdk/esp32/include/esp-face/include/model_zoo/human_face_detect_mnp01.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/model_zoo/human_face_detect_mnp01.hpp
@ -0,0 +1,41 @@
+#pragma once
+
+#include <vector>
+#include <list>
+#include "dl_detect_define.hpp"
+
+class HumanFaceDetectMNP01
+{
+private:
+    void *model; // untyped handle to the underlying model; concrete type is not visible in this header
+
+public:
+    /**
+     * @brief Construct a new Human Face Detect MNP01 object.
+     *
+     * @param score_threshold predicted boxes with a score lower than the threshold are filtered out
+     * @param nms_threshold   predicted boxes with an IoU higher than the threshold are filtered out
+     * @param top_k           only the k highest-scoring boxes are kept
+     */
+    HumanFaceDetectMNP01(const float score_threshold, const float nms_threshold, const int top_k);
+
+    /**
+     * @brief Destroy the Human Face Detect MNP01 object.
+     *
+     */
+    ~HumanFaceDetectMNP01();
+
+    /**
+     * @brief Inference.
+     *
+     * @tparam T supports uint16_t and uint8_t,
+     *         - uint16_t: input image is RGB565
+     *         - uint8_t: input image is RGB888
+     * @param input_element pointer of input image
+     * @param input_shape   shape of input image
+     * @param candidates    candidate boxes on input image
+     * @return reference to the list of detection results
+     */
+    template <typename T>
+    std::list<dl::detect::result_t> &infer(T *input_element, std::vector<int> input_shape, std::list<dl::detect::result_t> &candidates);
+};
--- a/tools/sdk/esp32/include/esp-face/include/model_zoo/human_face_detect_msr01.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/model_zoo/human_face_detect_msr01.hpp
@ -0,0 +1,40 @@
+#pragma once
+
+#include <list>
+#include <vector>
+#include "dl_detect_define.hpp"
+
+class HumanFaceDetectMSR01
+{
+private:
+    void *model; // untyped handle to the underlying model; concrete type is not visible in this header
+
+public:
+    /**
+     * @brief Construct a new Human Face Detect MSR01 object
+     *
+     * @param score_threshold   predicted boxes with a score lower than the threshold are filtered out
+     * @param nms_threshold     predicted boxes with an IoU higher than the threshold are filtered out
+     * @param top_k             only the k highest-scoring boxes are kept
+     * @param resize_scale      resize scale applied to the input image
+     */
+    HumanFaceDetectMSR01(const float score_threshold, const float nms_threshold, const int top_k, float resize_scale);
+
+    /**
+     * @brief Destroy the Human Face Detect MSR01 object
+     */
+    ~HumanFaceDetectMSR01();
+
+    /**
+     * @brief Inference.
+     *
+     * @tparam T supports uint8_t and uint16_t
+     *         - uint8_t: input image is RGB888
+     *         - uint16_t: input image is RGB565
+     * @param input_element pointer of input image
+     * @param input_shape   shape of input image
+     * @return reference to the list of detection results
+     */
+    template <typename T>
+    std::list<dl::detect::result_t> &infer(T *input_element, std::vector<int> input_shape);
+};
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn.hpp
@ -0,0 +1,61 @@
+#pragma once
+#include <vector>
+#include "dl_define.hpp"
+#include "dl_tool.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief Get the output shape of an operation.
+         *
+         * @param input_shape  input shape
+         * @param filter_shape filter shape with dilation
+         * @param stride_y     stride in height
+         * @param stride_x     stride in width
+         * @param pad_type     one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET
+         * @param is_conv2d    one of true or false (default false),
+         *                     - true: serve for Conv2D
+         *                     - false: serve for other operations
+         * @return output shape
+         */
+        std::vector<int> get_output_shape(const std::vector<int> &input_shape, const std::vector<int> &filter_shape, const int stride_y, const int stride_x, const padding_type_t pad_type, const bool is_conv2d = false);
+
+        /**
+         * @brief Get the padding size of an operation.
+         *
+         * @param output_shape output shape
+         * @param input_shape  input shape
+         * @param filter_shape filter shape with dilation
+         * @param stride_y     stride in height
+         * @param stride_x     stride in width
+         * @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET
+         * @return padding size
+         */
+        std::vector<int> get_pad_size(const std::vector<int> &output_shape, const std::vector<int> &input_shape, const std::vector<int> &filter_shape, const int stride_y, const int stride_x, const padding_type_t padding_type);
+    } // namespace nn
+} // namespace dl
+
+#if DL_LOG_NN_LATENCY
+/**
+ * @brief Initialize.
+ */
+#define DL_LOG_NN_LATENCY_INIT() dl::tool::Latency latency
+
+/**
+ * @brief Time starts.
+ */
+#define DL_LOG_NN_LATENCY_START() latency.start()
+
+/**
+ * @brief Time ends and printed.
+ */
+#define DL_LOG_NN_LATENCY_END(key) \
+    latency.end();                 \
+    latency.print("nn", key)
+#else
+#define DL_LOG_NN_LATENCY_INIT()
+#define DL_LOG_NN_LATENCY_START()
+#define DL_LOG_NN_LATENCY_END(key)
+#endif
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_add2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_add2d.hpp
@ -0,0 +1,91 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief activation(add2d(input0, input1)).
+         *
+         * @param output            as an output
+         * @param input0            as one input
+         * @param input1            as another input
+         * @param activation        activation of add2d, if you don't specify anything (default NULL), no activation is applied
+         * @param assign_core       not effective yet
+         * @param output_exponent   exponent of output; must be specified when, and only when, the operation is in place (default INT_MIN)
+         */
+        void add2d(Tensor<int16_t> &output,
+                   Tensor<int16_t> &input0,
+                   Tensor<int16_t> &input1,
+                   const Activation<int16_t> *const activation = NULL,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
+                   const int output_exponent = INT_MIN);
+
+        /**
+         * @brief activation(add2d(input0, input1)).
+         *
+         * @param output            as an output
+         * @param input0            as one input
+         * @param input1            as another input
+         * @param activation        activation of add2d, if you don't specify anything (default NULL), no activation is applied
+         * @param assign_core       not effective yet
+         * @param output_exponent   exponent of output; must be specified when, and only when, the operation is in place (default INT_MIN)
+         */
+        void add2d(Tensor<int8_t> &output,
+                   Tensor<int8_t> &input0,
+                   Tensor<int8_t> &input1,
+                   const Activation<int8_t> *const activation = NULL,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE, const int output_exponent = INT_MIN);
+
+        /**
+         * @brief activation(add2d(input0, input1))
+         *
+         * @tparam inplace   whether directly store the output to input0 (default false)
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param output_exponent exponent of output
+         * @param input0          as one input; it also receives the result when inplace is true
+         * @param input1          as another input; must have the same shape as input0 (asserted)
+         * @param activation      activation of add2d, forwarded unchanged; it has no default in this overload, pass NULL for none
+         * @param assign_core     not effective yet
+         * @note  inplace is selected as a template argument, e.g. add2d<true>(...); it is not a function parameter
+         * @return add2d result or no return(result store to input0)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto add2d(const int output_exponent,
+                    Tensor<feature_t> &input0,
+                    Tensor<feature_t> &input1,
+                    const Activation<feature_t> *activation,
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            assert(input0.is_same_shape(input1));
+
+            DL_LOG_NN_LATENCY_INIT();
+
+            if constexpr(!inplace)
+            {
+                Tensor<feature_t> output; // only the out-of-place path needs a result tensor
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(output_exponent).set_shape(input0.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                DL_LOG_NN_LATENCY_START();
+                add2d(output, input0, input1, activation, assign_core);
+                DL_LOG_NN_LATENCY_END("add2d");
+                return output;
+            }
+            else
+            {
+                DL_LOG_NN_LATENCY_START();
+                add2d(input0, input0, input1, activation, assign_core, output_exponent);
+                input0.set_exponent(output_exponent);
+                DL_LOG_NN_LATENCY_END("add2d");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_avg_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_avg_pool2d.hpp
@ -0,0 +1,102 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+#include <stdint.h>
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief avg_pool2d(input), int16_t quantization.
+         *
+         * @param output        as an output
+         * @param input         as an input
+         * @param padding       padding size needed in [top, bottom, left, right] of this operation
+         * @param filter_shape  filter shape in [filter_height, filter_width]
+         * @param stride_y      stride in height
+         * @param stride_x      stride in width
+         * @param assign_core   not effective yet
+         */
+        void avg_pool2d(Tensor<int16_t> &output,
+                        Tensor<int16_t> &input,
+                        std::vector<int> &padding,
+                        std::vector<int> &filter_shape,
+                        const int stride_y,
+                        const int stride_x,
+                        const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief avg_pool2d(input), int8_t quantization.
+         *
+         * @param output        as an output
+         * @param input         as an input
+         * @param padding       padding size needed in [top, bottom, left, right] of this operation
+         * @param filter_shape  filter shape in [filter_height, filter_width]
+         * @param stride_y      stride in height
+         * @param stride_x      stride in width
+         * @param assign_core   not effective yet
+         */
+        void avg_pool2d(Tensor<int8_t> &output,
+                        Tensor<int8_t> &input,
+                        std::vector<int> &padding,
+                        std::vector<int> &filter_shape,
+                        const int stride_y,
+                        const int stride_x,
+                        const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief Create the output tensor, set the padding size on the input and run avg_pool2d(input).
+         *
+         * @tparam feature_t element type of the tensors, int16_t or int8_t,
+         *         - int16_t: operation in int16_t quantization
+         *         - int8_t: operation in int8_t quantization
+         * @param output_exponent exponent assigned to the output tensor
+         * @param input           input tensor; its padding size is set for the two SAME padding types
+         * @param filter_shape    filter shape in [filter_height, filter_width]
+         * @param stride_y        stride in height
+         * @param stride_x        stride in width
+         * @param padding_type    one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+         *                        - PADDING_VALID: no padding
+         *                        PADDING_SAME and PADDING_SAME_MXNET pad with zeros evenly on the left/right or up/down of the input
+         *                        so that the output keeps the same height/width dimension as the input,
+         *                        - PADDING_SAME: padding in TensorFlow style
+         *                        - PADDING_SAME_MXNET: padding in MXNET style
+         * @param assign_core     not effective yet
+         * @return the tensor holding the avg_pool2d result
+         */
+        template <typename feature_t>
+        Tensor<feature_t> avg_pool2d(const int output_exponent,
+                                     Tensor<feature_t> &input,
+                                     std::vector<int> filter_shape,
+                                     const int stride_y,
+                                     const int stride_x,
+                                     const padding_type_t padding_type,
+                                     const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+        {
+            DL_LOG_NN_LATENCY_INIT();
+            DL_LOG_NN_LATENCY_START();
+            std::vector<int> out_shape = get_output_shape(input.shape, filter_shape, stride_y, stride_x, padding_type);
+            Tensor<feature_t> pooled;
+            pooled.set_exponent(output_exponent).set_shape(out_shape).apply_element();
+            DL_LOG_NN_LATENCY_END("apply");
+
+            DL_LOG_NN_LATENCY_START();
+            const bool pad_same = (padding_type == PADDING_SAME) || (padding_type == PADDING_SAME_MXNET);
+            if (pad_same)
+            {
+                std::vector<int> pad_size = get_pad_size(out_shape, input.shape, filter_shape, stride_y, stride_x, padding_type);
+                input.set_padding_size(pad_size);
+            }
+            DL_LOG_NN_LATENCY_END("padding");
+
+            DL_LOG_NN_LATENCY_START();
+            avg_pool2d(pooled, input, input.padding, filter_shape, stride_y, stride_x, assign_core);
+            DL_LOG_NN_LATENCY_END("avg_pool2d");
+
+            return pooled;
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_concat2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_concat2d.hpp
@ -0,0 +1,22 @@
+#pragma once
+
+#include <vector>
+#include "dl_variable.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief concat2d(input_1, input_2, ...)
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param output as an output
+         * @param inputs a bundle of inputs to be concatenated (the vector is passed by value)
+         */
+        template <typename feature_t>
+        void concat2d(Tensor<feature_t> &output, std::vector<Tensor<feature_t>> inputs);
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_conv2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_conv2d.hpp
@ -0,0 +1,116 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief activation(conv2d(input, filter) + bias), int16_t quantization.
+         * NOTE: When padding_type is SAME, make sure padding is already added in input.
+         *
+         * @param output      as an output
+         * @param input       as an input
+         * @param padding     padding size needed in [top, bottom, left, right] of this operation
+         * @param filter      filter of conv2d
+         * @param stride_y    stride in height
+         * @param stride_x    stride in width
+         * @param bias        bias of conv2d, if you don't specify anything (default NULL), no bias is added
+         * @param activation  activation of conv2d, if you don't specify anything (default NULL), no activation is applied
+         * @param assign_core not effective yet
+         */
+        void conv2d(Tensor<int16_t> &output,
+                    Tensor<int16_t> &input,
+                    std::vector<int> &padding,
+                    const Filter<int16_t> &filter,
+                    const int stride_y,
+                    const int stride_x,
+                    const Bias<int16_t> *const bias = NULL,
+                    const Activation<int16_t> *const activation = NULL,
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief activation(conv2d(input, filter) + bias), int8_t quantization.
+         * NOTE: When padding_type is SAME, make sure padding is already added in input.
+         *
+         * @param output      as an output
+         * @param input       as an input
+         * @param padding     padding size needed in [top, bottom, left, right] of this operation
+         * @param filter      filter of conv2d
+         * @param stride_y    stride in height
+         * @param stride_x    stride in width
+         * @param bias        bias of conv2d, if you don't specify anything (default NULL), no bias is added
+         * @param activation  activation of conv2d, if you don't specify anything (default NULL), no activation is applied
+         * @param assign_core not effective yet
+         */
+        void conv2d(Tensor<int8_t> &output,
+                    Tensor<int8_t> &input,
+                    std::vector<int> &padding,
+                    const Filter<int8_t> &filter,
+                    const int stride_y,
+                    const int stride_x,
+                    const Bias<int8_t> *const bias = NULL,
+                    const Activation<int8_t> *const activation = NULL,
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief Create the output tensor, set up padding on the input and run activation(conv2d(input, filter) + bias).
+         *
+         * @tparam feature_t element type of the tensors, int16_t or int8_t,
+         *         - int16_t: operation in int16_t quantization
+         *         - int8_t: operation in int8_t quantization
+         * @param output_exponent exponent assigned to the output tensor
+         * @param input           input tensor; its padding size and padding value (0) are set for the two SAME padding types
+         * @param filter          Filter of conv2d
+         * @param stride_y        stride in height
+         * @param stride_x        stride in width
+         * @param padding_type    one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+         *                        - PADDING_VALID: no padding
+         *                        PADDING_SAME and PADDING_SAME_MXNET pad with zeros evenly on the left/right or up/down of the input
+         *                        so that the output keeps the same height/width dimension as the input,
+         *                        - PADDING_SAME: padding in TensorFlow style
+         *                        - PADDING_SAME_MXNET: padding in MXNET style
+         * @param bias            bias of conv2d, forwarded unchanged to the typed conv2d overload
+         * @param activation      activation of conv2d, forwarded unchanged to the typed conv2d overload
+         * @param assign_core     not effective yet
+         * @return the tensor holding the conv2d result
+         */
+        template <typename feature_t>
+        Tensor<feature_t> conv2d(const int output_exponent,
+                                 Tensor<feature_t> &input,
+                                 const Filter<feature_t> &filter,
+                                 const int stride_y,
+                                 const int stride_x,
+                                 const padding_type_t padding_type,
+                                 const Bias<feature_t> *bias,
+                                 const Activation<feature_t> *activation,
+                                 const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+        {
+            DL_LOG_NN_LATENCY_INIT();
+            DL_LOG_NN_LATENCY_START();
+            std::vector<int> out_shape = get_output_shape(input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type, true);
+            Tensor<feature_t> convolved;
+            convolved.set_exponent(output_exponent).set_shape(out_shape).apply_element();
+            DL_LOG_NN_LATENCY_END("apply");
+
+            DL_LOG_NN_LATENCY_START();
+            const bool pad_same = (padding_type == PADDING_SAME) || (padding_type == PADDING_SAME_MXNET);
+            if (pad_same)
+            {
+                std::vector<int> pad_size = get_pad_size(out_shape, input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
+                input.set_padding_size(pad_size);
+                input.set_padding_value(pad_size, 0);
+            }
+            DL_LOG_NN_LATENCY_END("padding");
+
+            DL_LOG_NN_LATENCY_START();
+            conv2d(convolved, input, input.padding, filter, stride_y, stride_x, bias, activation, assign_core);
+            DL_LOG_NN_LATENCY_END("conv2d");
+
+            return convolved;
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_depthwise_conv2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_depthwise_conv2d.hpp
@ -0,0 +1,116 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief activate(depthwise_conv2d(input, filter) + bias), int16_t quantization.
+         * NOTE: When padding_type is SAME, make sure padding is already added in input
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output      as an output
+         * @param input       as an input
+         * @param padding     padding size needed in [top, bottom, left, right] of this operation
+         * @param filter      filter of depthwise_conv2d
+         * @param stride_y    stride in height
+         * @param stride_x    stride in width
+         * @param bias        bias of depthwise_conv2d; defaults to NULL, in which case no bias is added
+         * @param activation  activation of depthwise_conv2d; defaults to NULL, in which case no activation is applied
+         * @param assign_core not effective yet
+         */
+        void depthwise_conv2d(Tensor<int16_t> &output,
+                              Tensor<int16_t> &input,
+                              std::vector<int> &padding,
+                              const Filter<int16_t> &filter,
+                              const int stride_y,
+                              const int stride_x,
+                              const Bias<int16_t> *bias = NULL,
+                              const Activation<int16_t> *activation = NULL,
+                              const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief activate(depthwise_conv2d(input, filter) + bias), int8_t quantization.
+         * NOTE: When padding_type is SAME, make sure padding is already added in input
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output      as an output
+         * @param input       as an input
+         * @param padding     padding size needed in [top, bottom, left, right] of this operation
+         * @param filter      filter of depthwise_conv2d
+         * @param stride_y    stride in height
+         * @param stride_x    stride in width
+         * @param bias        bias of depthwise_conv2d; defaults to NULL, in which case no bias is added
+         * @param activation  activation of depthwise_conv2d; defaults to NULL, in which case no activation is applied
+         * @param assign_core not effective yet
+         */
+        void depthwise_conv2d(Tensor<int8_t> &output,
+                              Tensor<int8_t> &input,
+                              std::vector<int> &padding,
+                              const Filter<int8_t> &filter,
+                              const int stride_y,
+                              const int stride_x,
+                              const Bias<int8_t> *bias = NULL,
+                              const Activation<int8_t> *activation = NULL,
+                              const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief activation(depthwise_conv2d(input, filter) + bias)
+         *
+         * Convenience wrapper: computes the output shape with get_output_shape(), builds the
+         * output tensor (set_exponent / set_shape / apply_element), configures padding on the
+         * input when required, then forwards to the depthwise_conv2d overload that takes an
+         * explicit output tensor.
+         * NOTE: for PADDING_SAME and PADDING_SAME_MXNET the input is modified:
+         * set_padding_size(padding) and set_padding_value(padding, 0) are called on it.
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param output_exponent exponent of output
+         * @param input           as an input
+         * @param filter          filter of depthwise_conv2d
+         * @param stride_y        stride in height
+         * @param stride_x        stride in width
+         * @param padding_type    one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+         *                        - PADDING_VALID means no padding
+         *                        PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
+         *                        such that output has the same height/width dimension as the input,
+         *                        - PADDING_SAME results padding in TensorFlow style
+         *                        - PADDING_SAME_MXNET results padding in MXNET style
+         * @param bias            bias of depthwise_conv2d; this overload has no default, the pointer is forwarded as-is
+         *                        (presumably NULL means no bias, matching the default of the typed overloads - TODO confirm)
+         * @param activation      activation of depthwise_conv2d; this overload has no default, the pointer is forwarded as-is
+         *                        (presumably NULL means no activation, matching the default of the typed overloads - TODO confirm)
+         * @param assign_core     not effective yet
+         * @return depthwise_conv2d result
+         */
+        template <typename feature_t>
+        Tensor<feature_t> depthwise_conv2d(const int output_exponent,
+                                           Tensor<feature_t> &input,
+                                           const Filter<feature_t> &filter,
+                                           const int stride_y,
+                                           const int stride_x,
+                                           const padding_type_t padding_type,
+                                           const Bias<feature_t> *bias,
+                                           const Activation<feature_t> *activation,
+                                           const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+        {
+            DL_LOG_NN_LATENCY_INIT();
+
+            DL_LOG_NN_LATENCY_START();
+            std::vector<int> output_shape = get_output_shape(input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
+            Tensor<feature_t> output;
+            output.set_exponent(output_exponent).set_shape(output_shape).apply_element();
+            DL_LOG_NN_LATENCY_END("apply");
+
+            DL_LOG_NN_LATENCY_START();
+            if (padding_type == PADDING_SAME || padding_type == PADDING_SAME_MXNET)
+            {
+                std::vector<int> padding = get_pad_size(output_shape, input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
+                input.set_padding_size(padding);
+                input.set_padding_value(padding, 0);
+            }
+            DL_LOG_NN_LATENCY_END("padding");
+
+            DL_LOG_NN_LATENCY_START();
+            depthwise_conv2d(output, input, input.padding, filter, stride_y, stride_x, bias, activation, assign_core);
+            DL_LOG_NN_LATENCY_END("depthwise_conv2d");
+
+            return output;
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_global_avg_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_global_avg_pool2d.hpp
@ -0,0 +1,66 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+#include <stdint.h>
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief global_avg_pool2d(input), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output        as an output
+         * @param input         as an input
+         * @param assign_core   not effective yet
+         */
+        void global_avg_pool2d(Tensor<int16_t> &output,
+                               Tensor<int16_t> &input,
+                               const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief global_avg_pool2d(input), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output        as an output
+         * @param input         as an input
+         * @param assign_core   not effective yet
+         */
+        void global_avg_pool2d(Tensor<int8_t> &output,
+                               Tensor<int8_t> &input,
+                               const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief global_avg_pool2d(input).
+         *
+         * Convenience wrapper: builds the output tensor and forwards to the global_avg_pool2d
+         * overload that takes an explicit output tensor.
+         * The output shape has the same number of entries as input.shape; every entry is 1
+         * except index 2, which is copied from input.shape[2] (presumably the channel axis -
+         * TODO confirm). input.shape must therefore hold at least 3 entries, because index 2
+         * is read and written without a size check.
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param output_exponent exponent of output
+         * @param input           as an input
+         * @param assign_core     not effective yet
+         * @return global_avg_pool2d result
+         */
+        template <typename feature_t>
+        Tensor<feature_t> global_avg_pool2d(const int output_exponent,
+                                            Tensor<feature_t> &input,
+                                            const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+        {
+            DL_LOG_NN_LATENCY_INIT();
+
+            DL_LOG_NN_LATENCY_START();
+            std::vector<int> output_shape(input.shape.size(), 1);
+            output_shape[2] = input.shape[2];
+            Tensor<feature_t> output;
+            output.set_exponent(output_exponent).set_shape(output_shape).apply_element();
+            DL_LOG_NN_LATENCY_END("apply");
+
+            DL_LOG_NN_LATENCY_START();
+            global_avg_pool2d(output, input, assign_core);
+            DL_LOG_NN_LATENCY_END("global_avg_pool2d");
+
+            return output;
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_global_max_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_global_max_pool2d.hpp
@ -0,0 +1,64 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+#include <stdint.h>
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief global_max_pool2d(input), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output       as an output
+         * @param input        as an input
+         * @param assign_core  not effective yet
+         */
+        void global_max_pool2d(Tensor<int16_t> &output,
+                               Tensor<int16_t> &input,
+                               const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief global_max_pool2d(input), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output       as an output
+         * @param input        as an input
+         * @param assign_core  not effective yet
+         */
+        void global_max_pool2d(Tensor<int8_t> &output,
+                               Tensor<int8_t> &input,
+                               const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief global_max_pool2d(input).
+         *
+         * Convenience wrapper: builds the output tensor and forwards to the global_max_pool2d
+         * overload that takes an explicit output tensor.
+         * The output reuses input.exponent. Its shape has the same number of entries as
+         * input.shape; every entry is 1 except index 2, which is copied from input.shape[2]
+         * (presumably the channel axis - TODO confirm). input.shape must therefore hold at
+         * least 3 entries, because index 2 is read and written without a size check.
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param input        as an input
+         * @param assign_core  not effective yet
+         * @return global_max_pool2d result
+         */
+        template <typename feature_t>
+        Tensor<feature_t> global_max_pool2d(Tensor<feature_t> &input,
+                                            const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+        {
+            DL_LOG_NN_LATENCY_INIT();
+
+            DL_LOG_NN_LATENCY_START();
+            std::vector<int> output_shape(input.shape.size(), 1);
+            output_shape[2] = input.shape[2];
+            Tensor<feature_t> output;
+            output.set_exponent(input.exponent).set_shape(output_shape).apply_element();
+            DL_LOG_NN_LATENCY_END("apply");
+
+            DL_LOG_NN_LATENCY_START();
+            global_max_pool2d(output, input, assign_core);
+            DL_LOG_NN_LATENCY_END("global_max_pool2d");
+
+            return output;
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_leakyrelu.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_leakyrelu.hpp
@ -0,0 +1,82 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief leakyrelu(input), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output                as an output
+         * @param input                 as an input
+         * @param activation_alpha      quantized alpha
+         * @param activation_exponent   exponent of quantized alpha
+         * @param assign_core           not effective yet
+         */
+        void leakyrelu(Tensor<int16_t> &output,
+                       Tensor<int16_t> &input,
+                       const int16_t activation_alpha,
+                       const int activation_exponent,
+                       const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief leakyrelu(input), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output                as an output
+         * @param input                 as an input
+         * @param activation_alpha      quantized alpha
+         * @param activation_exponent   exponent of quantized alpha
+         * @param assign_core           not effective yet
+         */
+        void leakyrelu(Tensor<int8_t> &output,
+                       Tensor<int8_t> &input,
+                       const int8_t activation_alpha,
+                       const int activation_exponent,
+                       const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief leakyrelu(input)
+         *
+         * Thin wrapper around the leakyrelu overload that takes an explicit output tensor.
+         * - inplace == false: a new tensor with the exponent and shape of input is built,
+         *   filled by leakyrelu and returned.
+         * - inplace == true: input is passed as both output and input, nothing is returned.
+         *
+         * @tparam inplace: whether directly store the output to input
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param input                 as an input
+         * @param activation_alpha      quantized alpha; passed on as an int to an overload that
+         *                              declares it as int16_t / int8_t
+         * @param activation_exponent   exponent of quantized alpha
+         * @param assign_core           not effective yet
+         * @return leakyrelu result or no return(result store to input)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto leakyrelu(Tensor<feature_t> &input,
+                       const int activation_alpha,
+                       const int activation_exponent,
+                       const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            DL_LOG_NN_LATENCY_INIT();
+
+            if constexpr (!inplace)
+            {
+                // The result tensor lives only on this path, so the in-place
+                // instantiation does not construct an unused Tensor.
+                Tensor<feature_t> output;
+
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(input.exponent).set_shape(input.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                DL_LOG_NN_LATENCY_START();
+                leakyrelu(output, input, activation_alpha, activation_exponent, assign_core);
+                DL_LOG_NN_LATENCY_END("leakyrelu");
+
+                return output;
+            }
+            else
+            {
+                DL_LOG_NN_LATENCY_START();
+                leakyrelu(input, input, activation_alpha, activation_exponent, assign_core);
+                DL_LOG_NN_LATENCY_END("leakyrelu");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_max2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_max2d.hpp
@ -0,0 +1,81 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief max2d(input0, input1), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output            as an output
+         * @param input0            as one input
+         * @param input1            as another input
+         * @param assign_core       not effective yet
+         */
+        void max2d(Tensor<int16_t> &output,
+                   Tensor<int16_t> &input0,
+                   Tensor<int16_t> &input1,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief max2d(input0, input1), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         * (This overload takes no output_exponent parameter.)
+         *
+         * @param output            as an output
+         * @param input0            as one input
+         * @param input1            as another input
+         * @param assign_core       not effective yet
+         */
+        void max2d(Tensor<int8_t> &output,
+                   Tensor<int8_t> &input0,
+                   Tensor<int8_t> &input1,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief max2d(input0, input1)
+         *
+         * Thin wrapper around the max2d overload that takes an explicit output tensor.
+         * Both inputs are asserted to have the same shape and the same exponent.
+         * - inplace == false: a new tensor with the exponent and shape of input0 is built,
+         *   filled by max2d and returned.
+         * - inplace == true: input0 is passed as both output and first input, nothing is returned.
+         *
+         * @tparam inplace: whether directly store the output to input0
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param input0          as one input
+         * @param input1          as another input
+         * @param assign_core     not effective yet
+         * @return max2d result or no return(result store to input0)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto max2d(Tensor<feature_t> &input0,
+                   Tensor<feature_t> &input1,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            assert(input0.is_same_shape(input1));
+            assert(input0.exponent == input1.exponent);
+
+            DL_LOG_NN_LATENCY_INIT();
+
+            if constexpr (!inplace)
+            {
+                // The result tensor lives only on this path, so the in-place
+                // instantiation does not construct an unused Tensor.
+                Tensor<feature_t> output;
+
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(input0.exponent).set_shape(input0.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                DL_LOG_NN_LATENCY_START();
+                max2d(output, input0, input1, assign_core);
+                DL_LOG_NN_LATENCY_END("max2d");
+
+                return output;
+            }
+            else
+            {
+                DL_LOG_NN_LATENCY_START();
+                max2d(input0, input0, input1, assign_core);
+                DL_LOG_NN_LATENCY_END("max2d");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_max_pool2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_max_pool2d.hpp
@ -0,0 +1,101 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+#include <stdint.h>
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief max_pool2d(input), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output       as an output
+         * @param input        as an input
+         * @param padding      padding size needed in [top, bottom, left, right] of this operation
+         * @param filter_shape filter shape in [filter_height, filter_width]
+         * @param stride_y     stride in height
+         * @param stride_x     stride in width
+         * @param assign_core  not effective yet
+         */
+        void max_pool2d(Tensor<int16_t> &output,
+                        Tensor<int16_t> &input,
+                        std::vector<int> &padding,
+                        std::vector<int> &filter_shape,
+                        const int stride_y,
+                        const int stride_x,
+                        const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief max_pool2d(input), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output       as an output
+         * @param input        as an input
+         * @param padding      padding size needed in [top, bottom, left, right] of this operation
+         * @param filter_shape filter shape in [filter_height, filter_width]
+         * @param stride_y     stride in height
+         * @param stride_x     stride in width
+         * @param assign_core  not effective yet
+         */
+        void max_pool2d(Tensor<int8_t> &output,
+                        Tensor<int8_t> &input,
+                        std::vector<int> &padding,
+                        std::vector<int> &filter_shape,
+                        const int stride_y,
+                        const int stride_x,
+                        const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief max_pool2d(input).
+         *
+         * Convenience wrapper: computes the output shape with get_output_shape(), builds the
+         * output tensor with input.exponent, configures padding on the input when required,
+         * then forwards to the max_pool2d overload that takes an explicit output tensor.
+         * NOTE: for PADDING_SAME and PADDING_SAME_MXNET the input is modified:
+         * set_padding_size(padding) and set_padding_value(padding, 0) are called on it.
+         * filter_shape is taken by value; that local copy is what is handed to the typed
+         * overloads, which declare it as a non-const reference.
+         *
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param input        as an input
+         * @param filter_shape filter shape in [filter_height, filter_width]
+         * @param stride_y     stride in height
+         * @param stride_x     stride in width
+         * @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
+         *                     - PADDING_VALID: no padding
+         *                     PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
+         *                     such that output has the same height/width dimension as the input,
+         *                     - PADDING_SAME results padding in TensorFlow style
+         *                     - PADDING_SAME_MXNET results padding in MXNET style
+         * @param assign_core  not effective yet
+         * @return max_pool2d result
+         */
+        template <typename feature_t>
+        Tensor<feature_t> max_pool2d(Tensor<feature_t> &input,
+                                     std::vector<int> filter_shape,
+                                     const int stride_y,
+                                     const int stride_x,
+                                     const padding_type_t padding_type,
+                                     const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
+        {
+            DL_LOG_NN_LATENCY_INIT();
+
+            DL_LOG_NN_LATENCY_START();
+            std::vector<int> output_shape = get_output_shape(input.shape, filter_shape, stride_y, stride_x, padding_type);
+            Tensor<feature_t> output;
+            output.set_exponent(input.exponent).set_shape(output_shape).apply_element();
+            DL_LOG_NN_LATENCY_END("apply");
+
+            DL_LOG_NN_LATENCY_START();
+            if (padding_type == PADDING_SAME || padding_type == PADDING_SAME_MXNET)
+            {
+                std::vector<int> padding = get_pad_size(output_shape, input.shape, filter_shape, stride_y, stride_x, padding_type);
+                input.set_padding_size(padding);
+                input.set_padding_value(padding, 0);
+            }
+            DL_LOG_NN_LATENCY_END("padding");
+
+            DL_LOG_NN_LATENCY_START();
+            max_pool2d(output, input, input.padding, filter_shape, stride_y, stride_x, assign_core);
+            DL_LOG_NN_LATENCY_END("max_pool2d");
+
+            return output;
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_min2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_min2d.hpp
@ -0,0 +1,80 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief min2d(input0, input1), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output      as an output
+         * @param input0      as one input
+         * @param input1      as another input
+         * @param assign_core core assignment, defaults to CONFIG_DEFAULT_ASSIGN_CORE
+         *                    (presumably not effective yet - TODO confirm)
+         */
+        void min2d(Tensor<int16_t> &output,
+                   Tensor<int16_t> &input0,
+                   Tensor<int16_t> &input1,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief min2d(input0, input1), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output      as an output
+         * @param input0      as one input
+         * @param input1      as another input
+         * @param assign_core core assignment, defaults to CONFIG_DEFAULT_ASSIGN_CORE
+         *                    (presumably not effective yet - TODO confirm)
+         */
+        void min2d(Tensor<int8_t> &output,
+                   Tensor<int8_t> &input0,
+                   Tensor<int8_t> &input1,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief min2d(input0, input1)
+         *
+         * Thin wrapper around the min2d overload that takes an explicit output tensor.
+         * Both inputs are asserted to have the same shape and the same exponent.
+         * - inplace == false: a new tensor with the exponent and shape of input0 is built,
+         *   filled by min2d and returned.
+         * - inplace == true: input0 is passed as both output and first input, nothing is returned.
+         *
+         * @tparam inplace: whether directly store the output to input0
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param input0          as one input
+         * @param input1          as another input
+         * @param assign_core     not effective yet
+         * @return min2d result or no return(result store to input0)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto min2d(Tensor<feature_t> &input0,
+                   Tensor<feature_t> &input1,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            assert(input0.is_same_shape(input1));
+            assert(input0.exponent == input1.exponent);
+
+            DL_LOG_NN_LATENCY_INIT();
+
+            if constexpr (!inplace)
+            {
+                // The result tensor lives only on this path, so the in-place
+                // instantiation does not construct an unused Tensor.
+                Tensor<feature_t> output;
+
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(input0.exponent).set_shape(input0.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                DL_LOG_NN_LATENCY_START();
+                min2d(output, input0, input1, assign_core);
+                DL_LOG_NN_LATENCY_END("min2d");
+
+                return output;
+            }
+            else
+            {
+                DL_LOG_NN_LATENCY_START();
+                min2d(input0, input0, input1, assign_core);
+                DL_LOG_NN_LATENCY_END("min2d");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_mul2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_mul2d.hpp
@ -0,0 +1,91 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief activation(mul2d(input0, input1)), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output            as an output
+         * @param input0            as one input
+         * @param input1            as another input
+         * @param activation        activation of mul2d; defaults to NULL, in which case no activation is applied
+         * @param assign_core       not effective yet
+         * @param output_exponent   exponent of output; must be specified when, and only when, the operation
+         *                          is in-place. Defaults to INT_MIN.
+         */
+        void mul2d(Tensor<int16_t> &output, 
+                    Tensor<int16_t> &input0, 
+                    Tensor<int16_t> &input1, 
+                    const Activation<int16_t> *const activation = NULL, 
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
+                    const int output_exponent = INT_MIN);
+
+        /**
+         * @brief activation(mul2d(input0, input1)), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output            as an output
+         * @param input0            as one input
+         * @param input1            as another input
+         * @param activation        activation of mul2d; defaults to NULL, in which case no activation is applied
+         * @param assign_core       not effective yet
+         * @param output_exponent   exponent of output; must be specified when, and only when, the operation
+         *                          is in-place. Defaults to INT_MIN.
+         */
+        void mul2d(Tensor<int8_t> &output, 
+                    Tensor<int8_t> &input0, 
+                    Tensor<int8_t> &input1, 
+                    const Activation<int8_t> *const activation = NULL, 
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
+                    const int output_exponent = INT_MIN);
+
+        /**
+         * @brief activation(mul2d(input0, input1)).
+         *
+         * Thin wrapper around the mul2d overload that takes an explicit output tensor.
+         * Both inputs are asserted to have the same shape.
+         * - inplace == false: a new tensor with output_exponent and the shape of input0 is
+         *   built, filled by mul2d and returned; output_exponent is not passed on to the
+         *   typed overload in this case.
+         * - inplace == true: input0 is passed as both output and first input, output_exponent
+         *   is passed on to the typed overload, and nothing is returned.
+         *
+         * @tparam inplace: whether directly store the output to input0
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param output_exponent exponent of output
+         * @param input0          as one input
+         * @param input1          as another input
+         * @param activation      activation of mul2d; this overload has no default, the pointer is forwarded as-is
+         *                        (presumably NULL means no activation, matching the default of the typed overloads - TODO confirm)
+         * @param assign_core     not effective yet
+         * @return mul2d result or no return(result store to input0)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto mul2d(const int output_exponent,
+                   Tensor<feature_t> &input0,
+                   Tensor<feature_t> &input1,
+                   const Activation<feature_t> *activation,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            assert(input0.is_same_shape(input1));
+
+            DL_LOG_NN_LATENCY_INIT();
+
+            if constexpr (!inplace)
+            {
+                // The result tensor lives only on this path, so the in-place
+                // instantiation does not construct an unused Tensor.
+                Tensor<feature_t> output;
+
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(output_exponent).set_shape(input0.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                DL_LOG_NN_LATENCY_START();
+                mul2d(output, input0, input1, activation, assign_core);
+                DL_LOG_NN_LATENCY_END("mul2d");
+
+                return output;
+            }
+            else
+            {
+                DL_LOG_NN_LATENCY_START();
+                mul2d(input0, input0, input1, activation, assign_core, output_exponent);
+                DL_LOG_NN_LATENCY_END("mul2d");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_prelu.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_prelu.hpp
@ -0,0 +1,82 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief prelu(input), int16_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output                as an output
+         * @param input                 as an input
+         * @param activation_element    quantized alpha elements along channel axis
+         * @param activation_exponent   exponent of quantized alpha elements
+         * @param assign_core           not effective yet
+         */
+        void prelu(Tensor<int16_t> &output,
+                   Tensor<int16_t> &input,
+                   const int16_t *activation_element,
+                   const int activation_exponent,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief prelu(input), int8_t quantization.
+         *
+         * Declaration only; the definition is not part of this header.
+         *
+         * @param output                as an output
+         * @param input                 as an input
+         * @param activation_element    quantized alpha elements along channel axis
+         * @param activation_exponent   exponent of quantized alpha elements
+         * @param assign_core           not effective yet
+         */
+        void prelu(Tensor<int8_t> &output,
+                   Tensor<int8_t> &input,
+                   const int8_t *activation_element,
+                   const int activation_exponent,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief prelu(input)
+         *
+         * Thin wrapper around the prelu overload that takes an explicit output tensor.
+         * - inplace == false: a new tensor with the exponent and shape of input is built,
+         *   filled by prelu and returned.
+         * - inplace == true: input is passed as both output and input, nothing is returned.
+         *
+         * @tparam inplace: whether directly store the output to input
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param input                 as an input
+         * @param activation_element    quantized alpha elements along channel axis
+         * @param activation_exponent   exponent of quantized alpha elements
+         * @param assign_core           not effective yet
+         * @return prelu result or no return(result store to input)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto prelu(Tensor<feature_t> &input,
+                   const feature_t *activation_element,
+                   const int activation_exponent,
+                   const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            DL_LOG_NN_LATENCY_INIT();
+
+            if constexpr (!inplace)
+            {
+                // The result tensor lives only on this path, so the in-place
+                // instantiation does not construct an unused Tensor.
+                Tensor<feature_t> output;
+
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(input.exponent).set_shape(input.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                DL_LOG_NN_LATENCY_START();
+                prelu(output, input, activation_element, activation_exponent, assign_core);
+                DL_LOG_NN_LATENCY_END("prelu");
+
+                return output;
+            }
+            else
+            {
+                DL_LOG_NN_LATENCY_START();
+                prelu(input, input, activation_element, activation_exponent, assign_core);
+                DL_LOG_NN_LATENCY_END("prelu");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_relu.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_relu.hpp
@ -0,0 +1,70 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief relu(input), int16_t overload. The result is stored to output.
+         * 
+         * The templated relu<inplace>() wrapper in this header calls this overload with
+         * output and input referring to the same tensor when inplace is true.
+         * 
+         * @param output      as an output
+         * @param input       as an input
+         * @param assign_core not effective yet; defaults to CONFIG_DEFAULT_ASSIGN_CORE
+         */
+        void relu(Tensor<int16_t> &output, 
+                    Tensor<int16_t> &input, 
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief relu(input), int8_t overload. The result is stored to output.
+         * 
+         * The templated relu<inplace>() wrapper in this header calls this overload with
+         * output and input referring to the same tensor when inplace is true.
+         * 
+         * @param output      as an output
+         * @param input       as an input
+         * @param assign_core not effective yet; defaults to CONFIG_DEFAULT_ASSIGN_CORE
+         */
+        void relu(Tensor<int8_t> &output, 
+                    Tensor<int8_t> &input, 
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
+
+        /**
+         * @brief relu(input), convenience wrapper that either returns a new Tensor or works in place.
+         * 
+         * @tparam inplace   whether to store the output directly to input
+         *         - false (default): a new Tensor with the exponent and shape of input is returned
+         *         - true: input is overwritten and the function returns void
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param input           as an input
+         * @param assign_core     not effective yet
+         * @return relu result, or no return value when inplace is true (result is stored to input)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto relu(Tensor<feature_t> &input, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            DL_LOG_NN_LATENCY_INIT();
+            // NOTE(review): output is declared at function scope although only the !inplace branch
+            // uses it; possibly so the returned local stays eligible for NRVO. Confirm before
+            // narrowing its scope.
+            Tensor<feature_t> output;
+            
+            if constexpr(!inplace)
+            {
+                // Out-of-place: output takes the exponent and shape of input, then apply_element()
+                // is called on it (presumably allocating the element buffer -- confirm).
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(input.exponent).set_shape(input.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                // Delegate to the overload that takes separate output and input tensors.
+                DL_LOG_NN_LATENCY_START();
+                relu(output, input, assign_core);
+                DL_LOG_NN_LATENCY_END("relu");
+
+                return output;
+            }
+            else
+            {
+                // In-place: input is passed as both destination and source; nothing is returned.
+                DL_LOG_NN_LATENCY_START();
+                relu(input, input, assign_core);
+                DL_LOG_NN_LATENCY_END("relu");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_sub2d.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/nn/dl_nn_sub2d.hpp
@ -0,0 +1,90 @@
+#pragma once
+
+#include "dl_constant.hpp"
+#include "dl_variable.hpp"
+#include "dl_nn.hpp"
+
+namespace dl
+{
+    namespace nn
+    {
+        /**
+         * @brief activation(sub2d(input0, input1)), int16_t overload.
+         * 
+         * @param output      as an output
+         * @param input0      as one input
+         * @param input1      as another input
+         * @param activation  activation of sub2d; NULL (the default) means no activation is applied
+         * @param assign_core not effective yet
+         * @param output_exponent   exponent of output; must be specified when, and only when, the
+         *                          operation is in place (the templated wrapper in this header passes
+         *                          input0 as output in that case). Defaults to INT_MIN.
+         */
+        void sub2d(Tensor<int16_t> &output, 
+                    Tensor<int16_t> &input0, 
+                    Tensor<int16_t> &input1, 
+                    const Activation<int16_t> *const activation = NULL, 
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
+                    const int output_exponent = INT_MIN);
+
+        /**
+         * @brief activation(sub2d(input0, input1)), int8_t overload.
+         * 
+         * @param output      as an output
+         * @param input0      as one input
+         * @param input1      as another input
+         * @param activation  activation of sub2d; NULL (the default) means no activation is applied
+         * @param assign_core not effective yet
+         * @param output_exponent   exponent of output; must be specified when, and only when, the
+         *                          operation is in place (the templated wrapper in this header passes
+         *                          input0 as output in that case). Defaults to INT_MIN.
+         */
+        void sub2d(Tensor<int8_t> &output, 
+                    Tensor<int8_t> &input0, 
+                    Tensor<int8_t> &input1, 
+                    const Activation<int8_t> *const activation = NULL, 
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
+                    const int output_exponent = INT_MIN);
+
+        /**
+         * @brief activation(sub2d(input0, input1)), convenience wrapper that either returns a new
+         *        Tensor or works in place on input0.
+         * 
+         * input0 and input1 must have the same shape; this is checked with assert().
+         * 
+         * @tparam inplace   whether to store the output directly to input0
+         *         - false (default): a new Tensor with exponent output_exponent and the shape of input0 is returned
+         *         - true: input0 is overwritten and the function returns void
+         * @tparam feature_t supports int16_t and int8_t,
+         *         - int16_t: stands for operation in int16_t quantize
+         *         - int8_t: stands for operation in int8_t quantize
+         * @param output_exponent exponent of output
+         * @param input0          as one input
+         * @param input1          as another input
+         * @param activation      activation of sub2d; this wrapper has no default, so pass NULL explicitly for no activation
+         * @param assign_core     not effective yet
+         * @return sub2d result, or no return value when inplace is true (result is stored to input0)
+         */
+        template <bool inplace = false, typename feature_t>
+        auto sub2d(const int output_exponent, 
+                    Tensor<feature_t> &input0, 
+                    Tensor<feature_t> &input1, 
+                    const Activation<feature_t> *activation, 
+                    const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
+        {
+            assert(input0.is_same_shape(input1));
+
+            DL_LOG_NN_LATENCY_INIT();
+            // NOTE(review): output is declared at function scope although only the !inplace branch
+            // uses it; possibly so the returned local stays eligible for NRVO. Confirm before
+            // narrowing its scope.
+            Tensor<feature_t> output;
+            if constexpr(!inplace)
+            {
+                // Out-of-place: the exponent is set on output here, so the delegate call below
+                // leaves its own output_exponent argument at the default.
+                DL_LOG_NN_LATENCY_START();
+                output.set_exponent(output_exponent).set_shape(input0.shape).apply_element();
+                DL_LOG_NN_LATENCY_END("apply");
+
+                DL_LOG_NN_LATENCY_START();
+                sub2d(output, input0, input1, activation, assign_core);
+                DL_LOG_NN_LATENCY_END("sub2d");
+
+                return output;
+            }
+            else
+            {
+                // In-place: input0 is both destination and first operand, so the requested
+                // output exponent has to be forwarded explicitly.
+                DL_LOG_NN_LATENCY_START();
+                sub2d(input0, input0, input1, activation, assign_core, output_exponent);
+                DL_LOG_NN_LATENCY_END("sub2d");
+            }
+        }
+    } // namespace nn
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/tool/dl_tool.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/tool/dl_tool.hpp
@ -0,0 +1,377 @@
+#pragma once
+
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "esp_system.h"
+#include "esp_timer.h"
+#include "freertos/FreeRTOS.h"
+
+#include "dl_define.hpp"
+
+// C-linkage declarations of the fill routines that dl::tool::set_value() calls when
+// CONFIG_TIE728_BOOST is enabled; set_value() selects one by sizeof(T): 1 -> _8b, 2 -> _16b, 4 -> _32b.
+// NOTE(review): presumably implemented outside this header for the TIE728 instruction set, with
+// n counted in elements rather than bytes -- confirm against the implementation.
+extern "C"
+{
+#if CONFIG_TIE728_BOOST
+    void dl_tie728_memset_8b(void *ptr, const int value, const int n);
+    void dl_tie728_memset_16b(void *ptr, const int value, const int n);
+    void dl_tie728_memset_32b(void *ptr, const int value, const int n);
+#endif
+}
+
+namespace dl
+{
+    namespace tool
+    {
+        /**
+         * @brief Set memory zero.
+         * 
+         * Declaration only; the definition is not part of this header. calloc_aligned() in this
+         * header uses it to clear a freshly allocated buffer.
+         * 
+         * @param ptr pointer of memory
+         * @param n   byte number
+         */
+        void set_zero(void *ptr, const int n);
+
+        /**
+         * @brief Set every element of an array to the same value.
+         * 
+         * @tparam T supports all data types; when CONFIG_TIE728_BOOST is enabled, types whose
+         *           sizeof(T) equals 1, 2 or 4 are dispatched to the dl_tie728_memset_* routines
+         * @param ptr   pointer of array
+         * @param value value to set
+         * @param len   length of array in elements; the generic loop writes nothing when len <= 0
+         */
+        template <typename T>
+        void set_value(T *ptr, const T value, const int len)
+        {
+#if CONFIG_TIE728_BOOST
+            // Copy the object representation into a zeroed int instead of reading *(int *)&value:
+            // that cast read past the end of 1- and 2-byte objects and violated strict aliasing.
+            // The first sizeof(T) bytes of packed are the same bytes the cast used to read.
+            int packed = 0;
+            memcpy(&packed, &value, sizeof(T) < sizeof(packed) ? sizeof(T) : sizeof(packed));
+
+            if (sizeof(T) == 1)
+                dl_tie728_memset_8b(ptr, packed, len);
+            else if (sizeof(T) == 2)
+                dl_tie728_memset_16b(ptr, packed, len);
+            else if (sizeof(T) == 4)
+                dl_tie728_memset_32b(ptr, packed, len);
+            else
+#endif
+                // Signed index: with an unsigned one a negative len would convert to a huge bound.
+                for (int i = 0; i < len; i++)
+                    ptr[i] = value;
+        }
+
+        /**
+         * @brief Copy memory.
+         * 
+         * Declaration only; the definition is not part of this header.
+         * NOTE(review): src is not const-qualified although a copy presumably only reads it, and
+         * whether overlapping regions are supported is not visible here -- confirm with the definition.
+         * 
+         * @param dst pointer of destination
+         * @param src pointer of source
+         * @param n   byte number
+         */
+        void copy_memory(void *dst, void *src, const int n);
+
+        /**
+         * @brief Allocate uninitialized, optionally aligned memory. Must use free_aligned() to free the memory.
+         * 
+         * The requested byte count (number * size) is padded up to a multiple of 16 with at least
+         * 16 spare bytes. Just in front of the returned pointer the function stores the pointer
+         * obtained from the underlying allocator (read back by free_aligned()) and the padded byte
+         * count (read back by calloc_aligned()).
+         * 
+         * @param number number of elements
+         * @param size   size of element
+         * @param align  number of aligned, e.g., 16 means 16-byte aligned; 0 means no extra alignment.
+         *               Must be a power of two, because the address is rounded with the mask -align.
+         * @return pointer of allocated memory. NULL for failed
+         */
+        inline void *malloc_aligned(int number, int size, int align = 0)
+        {
+            // Round the payload down to a multiple of 16, then add two 16-byte units.
+            int n = number * size;
+            n >>= 4;
+            n += 2;
+            n <<= 4;
+            int total_size = n + align + sizeof(void *) + sizeof(int);
+            void *res = malloc(total_size);
+#if DL_SPIRAM_SUPPORT
+            // Fall back to PSRAM when the default heap cannot satisfy the request.
+            if (NULL == res)
+                res = heap_caps_malloc(total_size, MALLOC_CAP_SPIRAM);
+#endif
+            if (NULL == res)
+            {
+                // The free-size queries are cast to int so the arguments match the %d conversions.
+                printf("Fail to malloc %d bytes from DRAM(%d bytes) and PSRAM(%d bytes), PSRAM is %s.\n",
+                       total_size,
+                       (int)heap_caps_get_free_size(MALLOC_CAP_INTERNAL),
+                       (int)heap_caps_get_free_size(MALLOC_CAP_SPIRAM),
+                       DL_SPIRAM_SUPPORT ? "on" : "off");
+                return NULL;
+            }
+            void **data = (void **)res + 2; // 4-byte for pointer, 4-bytes for n
+            void **aligned;
+            if (align)
+                aligned = (void **)(((size_t)data + (align - 1)) & -align);
+            else
+                aligned = data;
+
+            // Header in front of the returned pointer: [-1] original allocation, [-2] (as int) padded size.
+            aligned[-1] = res;
+            int *temp = (int *)aligned;
+            temp[-2] = n;
+
+            return (void *)aligned;
+        }
+
+        /**
+         * @brief Allocate zero-initialized, optionally aligned memory. Must use free_aligned() to free the memory.
+         * 
+         * @param number number of elements
+         * @param size   size of element
+         * @param align  number of aligned, e.g., 16 means 16-byte aligned
+         * @return pointer of allocated memory. NULL for failed
+         */
+        inline void *calloc_aligned(int number, int size, int align = 0)
+        {
+            void *aligned = malloc_aligned(number, size, align);
+            // Propagate allocation failure instead of reading the size header through a NULL pointer.
+            if (NULL == aligned)
+                return NULL;
+
+            // malloc_aligned() stores the padded byte count two ints in front of the returned pointer.
+            int n = *((int *)aligned - 2);
+            set_zero(aligned, n);
+
+            return aligned;
+        }
+
+        /**
+         * @brief Release memory obtained from calloc_aligned() or malloc_aligned().
+         * 
+         * The pointer stored in the slot directly in front of address is the one handed to free().
+         * 
+         * @param address pointer of memory to free; NULL is ignored
+         */
+        inline void free_aligned(void *address)
+        {
+            if (address != NULL)
+            {
+                void **slots = (void **)address;
+                free(slots[-1]);
+            }
+        }
+
+        /**
+         * @brief Clamp the input into the int8_t quantization range [DL_Q8_MIN, DL_Q8_MAX].
+         * 
+         * @tparam T supports all integer types
+         * @param output receives the clamped value
+         * @param input  value to clamp
+         */
+        template <typename T>
+        void truncate(int8_t &output, T input)
+        {
+            if (input >= DL_Q8_MAX)
+            {
+                output = DL_Q8_MAX;
+                return;
+            }
+            if (input <= DL_Q8_MIN)
+            {
+                output = DL_Q8_MIN;
+                return;
+            }
+            output = input;
+        }
+
+        /**
+         * @brief Clamp the input into the int16_t quantization range [DL_Q16_MIN, DL_Q16_MAX].
+         * 
+         * @tparam T supports all integer types
+         * @param output receives the clamped value
+         * @param input  value to clamp
+         */
+        template <typename T>
+        void truncate(int16_t &output, T input)
+        {
+            if (input >= DL_Q16_MAX)
+            {
+                output = DL_Q16_MAX;
+                return;
+            }
+            if (input <= DL_Q16_MIN)
+            {
+                output = DL_Q16_MIN;
+                return;
+            }
+            output = input;
+        }
+
+        /**
+         * @brief Calculate the exponent of quantizing 1/n into max_value range.
+         * 
+         * Returns the largest exp for which (2^exp) / n (integer division) is still below max_value.
+         * 
+         * @param n          1/n: value to be quantized, must be positive
+         * @param max_value  the max_range
+         * @return the exponent; -1 when n <= 0 or when 1/n already reaches max_value
+         */
+        inline int calculate_exponent(int n, int max_value)
+        {
+            // n == 0 would divide by zero and a negative n would never reach max_value.
+            if (n <= 0)
+                return -1;
+
+            int exp = 0;
+            // 64-bit arithmetic: with int, 1 << exp overflows once n * max_value exceeds 2^30.
+            long long tmp = 1 / n;
+            // For positive int arguments n * max_value < 2^62, so the bound on exp is only a safety net.
+            while (tmp < max_value && exp < 62)
+            {
+                exp += 1;
+                tmp = (1LL << exp) / n;
+            }
+            exp -= 1;
+
+            return exp;
+        }
+
+        /**
+         * @brief Print vector in format "[x1, x2, ...]\n".
+         * 
+         * @param array   vector to print; taken by const reference, so const vectors and temporaries are accepted
+         * @param message optional prefix, printed as "{message}: " in front of the list when not NULL
+         */
+        inline void print_vector(const std::vector<int> &array, const char *message = NULL)
+        {
+            if (message)
+                printf("%s: ", message);
+
+            printf("[");
+            for (size_t i = 0; i < array.size(); i++)
+            {
+                // The separator goes in front of every element except the first.
+                if (i > 0)
+                    printf(", ");
+                printf("%d", array[i]);
+            }
+            printf("]\n");
+        }
+
+        /**
+         * @brief Read the ccount special register.
+         * 
+         * Executes a single `rsr` instruction that copies `ccount` into a general register.
+         * NOTE(review): presumably the Xtensa CPU cycle counter, which would wrap at 32 bits -- confirm
+         * against the core documentation. Latency uses the difference of two readings when
+         * DL_LOG_LATENCY_UNIT is set.
+         * 
+         * @return value of ccount at the time of the call
+         */
+        inline uint32_t get_cycle()
+        {
+            uint32_t ccount;
+            // The "memory" clobber together with volatile keeps the compiler from moving memory
+            // accesses across the read or from dropping it.
+            __asm__ __volatile__("rsr %0, ccount"
+                                 : "=a"(ccount)
+                                 :
+                                 : "memory");
+            return ccount;
+        }
+
+        /**
+         * @brief Stopwatch that measures the period between start() and end().
+         * 
+         * The unit is CPU cycles when DL_LOG_LATENCY_UNIT is set and microseconds otherwise.
+         * When constructed with size > 1 it also keeps the last `size` periods in a ring buffer
+         * and reports their average.
+         * 
+         * NOTE(review): the class owns `queue` and frees it in the destructor but defines no copy
+         * constructor, so copy-constructing a Latency with size > 1 would free the buffer twice.
+         */
+        class Latency
+        {
+        private:
+            const uint32_t size; /*!< size of queue */
+            uint32_t *queue;     /*!< queue for storing history period, NULL when no history is kept */
+            uint32_t period;     /*!< current period */
+            uint32_t sum;        /*!< sum of the periods currently in queue */
+            uint32_t count;      /*!< the number of added periods, at most size */
+            uint32_t next;       /*!< index of the next element to overwrite in queue */
+            uint32_t timestamp;  /*!< record the start */
+
+        public:
+            /**
+             * @brief Construct a new Latency object.
+             * 
+             * @param size number of periods to average over; with size <= 1, or if the history
+             *             buffer cannot be allocated, only the latest period is kept
+             */
+            Latency(const uint32_t size = 1) : size(size),
+                                               period(0),
+                                               sum(0),
+                                               count(0),
+                                               next(0),
+                                               timestamp(0)
+            {
+                this->queue = (this->size > 1) ? (uint32_t *)calloc(this->size, sizeof(uint32_t)) : NULL;
+            }
+
+            /**
+             * @brief Destroy the Latency object.
+             * 
+             */
+            ~Latency()
+            {
+                if (this->queue)
+                    free(this->queue);
+            }
+
+            /**
+             * @brief Record the start timestamp.
+             * 
+             */
+            void start()
+            {
+#if DL_LOG_LATENCY_UNIT
+                this->timestamp = get_cycle();
+#else
+                this->timestamp = esp_timer_get_time();
+#endif
+            }
+
+            /**
+             * @brief Record the period since the last start() and, if history is kept, push it into the queue.
+             * 
+             */
+            void end()
+            {
+#if DL_LOG_LATENCY_UNIT
+                this->period = get_cycle() - this->timestamp;
+#else
+                this->period = esp_timer_get_time() - this->timestamp;
+#endif
+                if (this->queue)
+                {
+                    // Replace the oldest slot and keep the running sum in step with the queue content.
+                    this->sum -= this->queue[this->next];
+                    this->queue[this->next] = this->period;
+                    this->sum += this->queue[this->next];
+                    this->next++;
+                    this->next = this->next % this->size;
+                    if (this->count < this->size)
+                    {
+                        this->count++;
+                    }
+                }
+            }
+
+            /**
+             * @brief Return the period.
+             * 
+             * @return period recorded by the most recent end()
+             */
+            uint32_t get_period()
+            {
+                return this->period;
+            }
+
+            /**
+             * @brief Get the average period.
+             * 
+             * @return average of the periods stored in the queue, 0 if none has been recorded yet;
+             *         the latest period when no history is kept
+             */
+            uint32_t get_average_period()
+            {
+                if (NULL == this->queue)
+                    return this->period;
+
+                // count stays 0 until the first end(); dividing by it would trap.
+                return this->count ? (this->sum / this->count) : 0;
+            }
+
+            /**
+             * @brief Clear the period
+             * 
+             */
+            void clear_period()
+            {
+                this->period = 0;
+            }
+
+            /**
+             * @brief Print in format "latency: {average period} {unit}\n".
+             */
+            void print()
+            {
+#if DL_LOG_LATENCY_UNIT
+                printf("latency: %15u cycle\n", this->get_average_period());
+#else
+                printf("latency: %15u us\n", this->get_average_period());
+#endif
+            }
+
+            /**
+             * @brief Print in format "{message}: {average period} {unit}\n".
+             * 
+             * @param message message of print
+             */
+            void print(const char *message)
+            {
+#if DL_LOG_LATENCY_UNIT
+                printf("%s: %15u cycle\n", message, this->get_average_period());
+#else
+                printf("%s: %15u us\n", message, this->get_average_period());
+#endif
+            }
+
+            /**
+             * @brief Print in format "{prefix}::{key}: {average period} {unit}\n".
+             * 
+             * @param prefix prefix of print
+             * @param key    key of print
+             */
+            void print(const char *prefix, const char *key)
+            {
+#if DL_LOG_LATENCY_UNIT
+                printf("%s::%s: %u cycle\n", prefix, key, this->get_average_period());
+#else
+                printf("%s::%s: %u us\n", prefix, key, this->get_average_period());
+#endif
+            }
+        };
+    } // namespace tool
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/tool/dl_tool_cache.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/tool/dl_tool_cache.hpp
@ -0,0 +1,74 @@
+#pragma once
+
+#include <stdint.h>
+
+#if CONFIG_IDF_TARGET_ESP32S3
+#include "esp32s3/rom/cache.h"
+#include "soc/extmem_reg.h"
+#endif
+
+namespace dl
+{
+    namespace tool
+    {
+        // Declarations of the cache preload/autoload helpers; the definitions are not in this header.
+        namespace cache
+        {
+            /**
+             * @brief Initialize preload.
+             * 
+             * @param preload One of 1 or 0,
+             *                - 1: turn on the preload
+             *                - 0: turn off the preload
+             * @return 
+             *         - 1: Initialized successfully
+             *         - 0: Initialized successfully, autoload has been turned off
+             *         - -1: Initialization failed, the chip does not support preload
+             */
+            int8_t preload_init(uint8_t preload = 1);
+
+            /**
+             * @brief Preload memory.
+             * 
+             * @param addr the start address of data to be preloaded
+             * @param size the size of the data in byte to be preloaded
+             */
+            void preload_func(uint32_t addr, uint32_t size);
+
+            /**
+             * @brief Initialize autoload. 
+             * 
+             * @param autoload  One of 1 or 0,
+             *                  - 1: turn on the autoload
+             *                  - 0: turn off the autoload
+             * @param trigger   One of 0 or 1 or 2,
+             *                  - 0: miss, TODO:@yuanjiong
+             *                  - 1: hit, TODO:@yuanjiong
+             *                  - 2: both,TODO:@yuanjiong
+             * @param line_size the number of cache lines to be autoloaded
+             * @return status,
+             *         - 1: Initialized successfully
+             *         - 0: Initialized successfully, preload has been turned off
+             *         - -1: Initialization failed, the chip does not support autoload
+             */
+            int8_t autoload_init(uint8_t autoload = 1, uint8_t trigger = 2, uint8_t line_size = 0);
+
+            /**
+             * @brief Autoload memory, two-region overload.
+             * 
+             * @param addr1 the start address of data1 to be autoloaded
+             * @param size1 the size of the data1 in byte to be autoloaded
+             * @param addr2 the start address of data2 to be autoloaded
+             * @param size2 the size of the data2 in byte to be autoloaded
+             */
+            void autoload_func(uint32_t addr1, uint32_t size1, uint32_t addr2, uint32_t size2);
+
+            /**
+             * @brief Autoload memory, single-region overload.
+             * 
+             * @param addr1 the start address of data1 to be autoloaded
+             * @param size1 the size of the data1 in byte to be autoloaded
+             */
+            void autoload_func(uint32_t addr1, uint32_t size1);
+        }
+    } // namespace tool
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/typedef/dl_constant.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/typedef/dl_constant.hpp
@ -0,0 +1,125 @@
+#pragma once
+
+#include "dl_define.hpp"
+#include <vector>
+
+namespace dl
+{
+    /**
+     * @brief Base class of Filter, Bias, Activation.
+     * 
+     * Holds a pointer to read-only elements together with their exponent and shape; exponent and
+     * shape are const members and cannot be changed after construction.
+     * NOTE(review): only the pointer is stored here; whether the element buffer must outlive the
+     * object is not visible in this header -- confirm with the constructor definition.
+     * 
+     * @tparam T supports int16_t and int8_t,
+     *         - int16_t: stands for operation in int16_t quantize,
+     *         - int8_t: stands for operation in int8_t quantize.
+     */
+    template <typename T>
+    class Constant
+    {
+    public:
+        const T *element;             /*!< point to element. */
+        const int exponent;           /*!< exponent of element. */
+        const std::vector<int> shape; /*!< shape of element. */
+
+        /**
+         * @brief Construct a new Constant object.
+         * 
+         * @param element  point to element.
+         * @param exponent exponent of element.
+         * @param shape    shape of Constant.
+         */
+        Constant(const T *element, const int exponent, const std::vector<int> shape);
+    };
+
+    /**
+     * @brief Filter.
+     * NOTE: The shape format of filter is fixed, but the element sequence depends on optimization method.
+     *       - 1D: reserved
+     *       - 2D: shape format is [filter_height, filter_width, input_channel, output_channel]. dilation format is [height, width]
+     *  
+     * @tparam T supports int16_t and int8_t,
+     *         - int16_t: stands for operation in int16_t quantize,
+     *         - int8_t: stands for operation in int8_t quantize.
+     */
+    template <typename T>
+    class Filter : public Constant<T>
+    {
+    public:
+        const std::vector<int> dilation;      /*!< - 1D: reserved */
+                                              /*!< - 2D: [dilation_in_height, dilation_in_width] */
+        std::vector<int> shape_with_dilation; /*!< - 1D: reserved */
+                                              /*!< - 2D: [filter_height_with_dilation, filter_width_with_dilation, input_channel, output_channel] */
+        std::vector<int> channel_exponent;    /*!< exponent for per-channel */
+
+        /**
+         * @brief Construct a new Filter object with one exponent for all elements.
+         * 
+         * @param element  point to element
+         * @param exponent exponent of element
+         * @param shape    shape of Filter,
+         *                 - 1D: reserved
+         *                 - 2D: [filter_height, filter_width, input_channel, output_channel]
+         * @param dilation dilation of Filter, defaults to {1, 1}
+         *                 - 1D: reserved
+         *                 - 2D: [dilation_in_height, dilation_in_width]
+         */
+        Filter(const T *element, const int exponent, const std::vector<int> shape, const std::vector<int> dilation = {1, 1});
+
+        /**
+         * @brief Construct a new Filter object with per-channel exponents.
+         * 
+         * @param element          point to element
+         * @param channel_exponent exponent for per-channel
+         * @param shape            shape of element
+         * @param dilation         dilation of Filter, defaults to {1, 1}
+         *                         - 1D: reserved
+         *                         - 2D: [dilation_in_height, dilation_in_width]
+         */
+        Filter(const T *element, const std::vector<int> channel_exponent, const std::vector<int> shape, const std::vector<int> dilation = {1, 1});
+
+        /**
+         * @brief Print the n-th filter.
+         * 
+         * @param n       index of output_channel
+         * @param message to print
+         */
+        void print2d_n(const int n, const char *message) const;
+    };
+
+    /**
+     * @brief Bias.
+     * 
+     * Adds no members of its own; it only inherits the constructors of Constant<T>.
+     * 
+     * @tparam T supports int16_t and int8_t
+     *         - int16_t: stands for operation in int16_t quantize
+     *         - int8_t: stands for operation in int8_t quantize
+     */
+    template <typename T>
+    class Bias : public Constant<T>
+    {
+    public:
+        using Constant<T>::Constant; // inherit Constant(element, exponent, shape)
+    };
+
+    /**
+     * @brief Activation.
+     * 
+     * A Constant that additionally carries the activation type.
+     * 
+     * @tparam T supports int16_t and int8_t
+     *         - int16_t: stands for operation in int16_t quantize
+     *         - int8_t: stands for operation in int8_t quantize
+     */
+    template <typename T>
+    class Activation : public Constant<T>
+    {
+    public:
+        const activation_type_t type; /*!< One of Linear or ReLU or LeakyReLU or PReLU */
+
+        /**
+         * @brief Construct a new Activation object.
+         * 
+         * @param type      One of Linear or ReLU or LeakyReLU or PReLU
+         * @param element   point to element of activation, defaults to NULL
+         * @param exponent  exponent of element, defaults to 0
+         * @param shape     shape of element, defaults to {0}
+         */
+        Activation(const activation_type_t type, const T *element = NULL, const int exponent = 0, const std::vector<int> shape = {0});
+    };
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/include/typedef/dl_variable.hpp
+++ b/tools/sdk/esp32/include/esp-face/include/typedef/dl_variable.hpp
@ -0,0 +1,734 @@
+#pragma once
+
+#include <stdio.h>
+#include <vector>
+#include <assert.h>
+
+#include "dl_tool.hpp"
+
+namespace dl
+{
+    /**
+     * @brief Tensor
+     * 
+     * @tparam T support uint8_t, int8_t, int16_t and float.
+     */
+    template <typename T>
+    class Tensor
+    {
+    private:
+        int size;       /*!< size of element including padding; -1 while it has not been computed yet */
+        bool auto_free; /*!< whether element is freed when the object is destroyed */
+
+    public:
+        T *element;                          /*!< pointer to element memory */
+        int exponent;                        /*!< exponent of element */
+        std::vector<int> shape;              /*!< shape of Tensor
+                                                  - 2D: shape is [height, width, channel]
+                                                  - 1D: reserved */
+        std::vector<int> shape_with_padding; /*!< shape with padding of Tensor
+                                                  - 2D: shape_with_padding is [height_with_padding, width_with_padding, channel_with_padding]
+                                                  - 1D: reserved */
+        std::vector<int> padding;            /*!< padding of Tensor
+                                                  - 2D: padding format is [top, bottom, left, right]
+                                                  - 1D: reserved */
+
+        /**
+         * @brief Construct an empty Tensor.
+         *
+         * The Tensor starts without element memory (element is NULL), with
+         * exponent 0, with size marked as not computed (-1) and with auto_free enabled.
+         */
+        Tensor()
+            : size(-1),
+              auto_free(true),
+              element(NULL),
+              exponent(0)
+        {
+        }
+
+        /**
+         * @brief Construct a new Tensor object by copying from input.
+         *
+         * @param input an input Tensor
+         * @param deep  one of true or false
+         *              - true: allocate new memory and copy the values of input.element into it.
+         *                      The new Tensor owns that memory (auto_free is set to true).
+         *                      If input has no element, or the allocation fails, element stays NULL.
+         *              - false: this->element points to the same memory as input.element and
+         *                       auto_free is copied from input.
+         *
+         * @note With deep == false both Tensors refer to the same memory. If input.auto_free is true,
+         *       call set_auto_free(false) on one of them, otherwise the memory is freed twice.
+         */
+        Tensor(Tensor<T> &input, bool deep) : size(input.size),
+                                              auto_free(input.auto_free),
+                                              element(NULL),
+                                              exponent(input.exponent),
+                                              shape(input.shape),
+                                              shape_with_padding(input.shape_with_padding),
+                                              padding(input.padding)
+        {
+            if (deep)
+            {
+                // Only a 3-entry shape ([height, width, channel]) describes allocated memory here.
+                int size_real = input.shape_with_padding.size() >= 3 ? input.shape_with_padding[0] * input.shape_with_padding[1] * input.shape_with_padding[2] : 0;
+
+                if (input.element != NULL && size_real > 0)
+                {
+                    T *new_element = (T *)tool::calloc_aligned(size_real, sizeof(T), 16);
+                    if (new_element != NULL)
+                    {
+                        tool::copy_memory(new_element, input.element, size_real * sizeof(T));
+                        this->element = new_element;
+                        // The copy was allocated here, so this object must release it
+                        // regardless of whether input owned its own buffer.
+                        this->auto_free = true;
+                    }
+                    else
+                    {
+                        printf("Tensor deep copy failed to allocate %d elements.\n", size_real);
+                    }
+                }
+            }
+            else
+            {
+                this->element = input.element;
+            }
+        }
+
+        /**
+         * @brief Destroy the Tensor object.
+         *
+         * The element memory is released through free_element() only when auto_free is true.
+         */
+        ~Tensor()
+        {
+            if (!this->auto_free)
+            {
+                return;
+            }
+
+            this->free_element();
+        }
+
+        /**
+         * @brief Choose whether the element memory is freed together with the object.
+         *
+         * @param auto_free one of true or false
+         *                  - true: free element when the object is destroyed
+         *                  - false: leave element untouched
+         * @return self
+         */
+        Tensor<T> &set_auto_free(const bool auto_free)
+        {
+            Tensor<T> &self = *this;
+            self.auto_free = auto_free;
+            return self;
+        }
+
+        /**
+         * @brief Attach element memory to a Tensor that has none yet.
+         *
+         * Asserts that this->element is NULL before attaching.
+         *
+         * @param element   pointer to element memory
+         * @param auto_free one of true or false
+         *                  - true: free element when the object is destroyed
+         *                  - false: do not (default)
+         * @return self
+         */
+        Tensor<T> &set_element(T *element, const bool auto_free = false)
+        {
+            assert(this->element == NULL);
+
+            this->auto_free = auto_free;
+            this->element = element;
+            return *this;
+        }
+
+        /**
+         * @brief Store the exponent of the element values.
+         *
+         * @param exponent exponent of element
+         * @return self
+         */
+        Tensor<T> &set_exponent(const int exponent)
+        {
+            Tensor<T> &self = *this;
+            self.exponent = exponent;
+            return self;
+        }
+
+        /**
+         * @brief Set the shape of Tensor.
+         *
+         * Also resets shape_with_padding to shape, resets padding to all zeros
+         * (two entries per dimension except the last one) and resets this->size to -1.
+         *
+         * @param shape shape in
+         *              - 2D: [height, width, channel]
+         *              Every entry must be greater than 0 (asserted).
+         * @return self
+         */
+        Tensor<T> &set_shape(const std::vector<int> shape)
+        {
+            for (size_t i = 0; i < shape.size(); ++i)
+            {
+                assert(shape[i] > 0);
+            }
+            this->shape = shape;
+            this->shape_with_padding = shape;
+            this->size = -1;
+
+            // shape.size() is unsigned: "size() - 1" would wrap around for an empty
+            // shape and request an enormous vector, so handle that case explicitly.
+            const size_t padding_count = shape.empty() ? 0 : ((shape.size() - 1) << 1);
+            this->padding = std::vector<int>(padding_count, 0);
+            return *this;
+        }
+
+        /**
+         * @brief Enlarge the padding of the Tensor.
+         *
+         * Each side of the padding only grows: an entry of the input that is not larger than
+         * the current padding on that side is ignored. If nothing grows, the Tensor is unchanged.
+         * If element memory is already attached, a new buffer with the enlarged padding is
+         * allocated, the effective values are copied row by row and the new buffer becomes owned
+         * by this Tensor (auto_free = true). The padding area of the new buffer is NOT initialized.
+         * If that allocation fails, a message is printed and the Tensor is left unchanged.
+         *
+         * @param padding padding size in
+         *                - 2D: [top, bottom, left, right] (at least 4 entries, asserted)
+         * @return self
+         */
+        Tensor &set_padding_size(std::vector<int> &padding)
+        {
+            assert(this->shape.size());      // call Tensor.set_shape() first
+            assert(this->shape.size() == 3); // TODO: || this->shape.size() == 2
+
+            if (this->shape.size() == 3)
+            {
+                assert(padding.size() >= 4); // [top, bottom, left, right]
+
+                std::vector<int> new_padding = this->padding;
+                bool dont_update = true;
+
+                // Keep the larger of the current and the requested padding on every side.
+                for (int side = 0; side < 4; side++)
+                {
+                    if (padding[side] > new_padding[side])
+                    {
+                        new_padding[side] = padding[side];
+                        dont_update = false;
+                    }
+                }
+
+                if (dont_update)
+                {
+                    return *this;
+                }
+
+                std::vector<int> new_shape_with_padding = this->shape;
+
+                new_shape_with_padding[0] += (new_padding[0] + new_padding[1]);
+                new_shape_with_padding[1] += (new_padding[2] + new_padding[3]);
+                int new_size = new_shape_with_padding[0] * new_shape_with_padding[1] * new_shape_with_padding[2];
+
+                if (this->element) // if this->element != NULL, do padding by copy memory
+                {
+                    T *new_element = (T *)tool::malloc_aligned(new_size, sizeof(T), 16);
+                    if (new_element == NULL)
+                    {
+                        // Leave the Tensor consistent instead of writing through a NULL pointer.
+                        printf("Tensor.set_padding_size failed to allocate %d elements.\n", new_size);
+                        return *this;
+                    }
+
+                    // First effective value inside the new buffer: skip the top padding rows and the left padding columns.
+                    T *dst = new_element + ((new_padding[0] * new_shape_with_padding[1]) + new_padding[2]) * new_shape_with_padding[2];
+                    // First effective value inside the old buffer (old padding still in place).
+                    T *src = this->get_element_ptr();
+                    int offset_dst_next_y = new_shape_with_padding[1] * new_shape_with_padding[2];     // width * channel
+                    int src_copy_length = this->shape[1] * this->shape[2];                             // width * channel
+                    int offset_src_next_y = this->shape_with_padding[1] * this->shape_with_padding[2]; // width * channel
+                    for (int y = 0; y < this->shape[0]; y++)
+                    {
+                        tool::copy_memory(dst, src, src_copy_length * sizeof(T));
+                        dst += offset_dst_next_y;
+                        src += offset_src_next_y;
+                    }
+
+                    if (this->auto_free)
+                        tool::free_aligned(this->element);
+                    this->element = new_element;
+                    this->auto_free = true;
+                }
+                this->padding = new_padding;
+                this->shape_with_padding = new_shape_with_padding;
+                this->size = new_size;
+            }
+            else if (this->shape.size() == 2)
+            {
+                printf("Tensor.set_padding_size with this->shape.size() == 2 not implement yet.\n");
+            }
+
+            return *this;
+        }
+
+        /**
+         * @brief Set the value of the padding elements.
+         *
+         * Only declared here; the definition is not part of this class body.
+         *
+         * @param padding padding size in
+         *                - 2D: [top, bottom, left, right]
+         * @param value   value to set
+         * @return self
+         */
+        Tensor<T> &set_padding_value(std::vector<int> &padding, T value);
+
+        /**
+         * @brief Get a pointer into the element memory.
+         *
+         * With the default argument the pointer refers to the first effective (non-padding) value.
+         * A non-zero padding argument moves the pointer back by that many padding rows (top)
+         * and padding columns (left).
+         *
+         * @param padding padding size in
+         *                - 2D: [top, bottom, left, right]
+         * @return pointer to memory with padding, or NULL if the shape is not 3-dimensional
+         */
+        T *get_element_ptr(const std::vector<int> padding = {0, 0, 0, 0})
+        {
+            assert(this->shape.size() == 3); // TODO: || this->shape.size() == 2
+
+            const size_t dims = this->shape.size();
+
+            if (dims == 2)
+            {
+                printf("Tensor.get_element_ptr with this->shape.size() == 2 is not implemented.\n");
+            }
+
+            if (dims != 3)
+            {
+                return NULL;
+            }
+
+            const int rows_to_skip = this->padding[0] - padding[0];
+            const int cols_to_skip = this->padding[2] - padding[2];
+            const int offset = (rows_to_skip * this->shape_with_padding[1] + cols_to_skip) * this->shape_with_padding[2];
+
+            return this->element + offset;
+        }
+
+        /**
+         * @brief Get a reference to one element value.
+         *
+         * @param index        index in
+         *                     - 2D: [y, x, c]
+         * @param with_padding one of true or false,
+         *                     - true: y and x are counted from the padded origin
+         *                     - false: y and x are counted from the first effective value
+         * @return element value (element[0] if the shape is not 3-dimensional)
+         */
+        T &get_element_value(const std::vector<int> index, const bool with_padding = false)
+        {
+            assert(index.size() == this->shape.size());
+            assert(this->shape.size() == 3); // TODO: || this->shape() == 2
+
+            int offset = 0;
+            const size_t dims = this->shape.size();
+
+            if (dims == 3)
+            {
+                int row = index[0];
+                int col = index[1];
+                const int channel = index[2];
+
+                if (!with_padding)
+                {
+                    // shift the index past the top and left padding
+                    row += this->padding[0];
+                    col += this->padding[2];
+                }
+
+                offset = (row * this->shape_with_padding[1] + col) * this->shape_with_padding[2] + channel;
+            }
+            else if (dims == 2)
+            {
+                printf("Tensor.get_element_value with this->shape.size() == 2 is not implemented.\n");
+            }
+
+            return this->element[offset];
+        }
+
+        /**
+         * @brief Get the size of element.
+         *
+         * If the size has not been computed yet (it is -1), it is computed as the
+         * product of all entries of this->shape and cached.
+         *
+         * @return size of element including padding
+         */
+        int get_size()
+        {
+            if (this->size != -1)
+            {
+                return this->size;
+            }
+
+            // didn't call Tensor.set_padding_size() before
+            int total = 1;
+            for (size_t d = 0; d < this->shape.size(); d++)
+            {
+                total *= this->shape[d];
+            }
+            this->size = total;
+
+            return this->size;
+        }
+
+        /**
+         * @brief Allocate zero-initialized element memory, only if this->element is NULL.
+         *
+         * @param auto_free one of true or false
+         *                  - true: free element when object destroyed
+         *                  - false: do not
+         * @return
+         *         - true: on success
+         *         - false: if element was already set or the allocation failed
+         *                  (the Tensor is left unchanged in both cases)
+         */
+        bool calloc_element(const bool auto_free = true)
+        {
+            if (this->element != NULL)
+                return false;
+
+            this->element = (T *)dl::tool::calloc_aligned(this->get_size(), sizeof(T), 16);
+            if (this->element == NULL)
+                return false; // report the failed allocation instead of claiming success
+
+            this->auto_free = auto_free;
+
+            return true;
+        }
+
+        /**
+         * @brief Allocate uninitialized element memory, only if this->element is NULL.
+         *
+         * @param auto_free one of true or false
+         *                  - true: free element when object destroyed
+         *                  - false: do not
+         * @return
+         *         - true: on success
+         *         - false: if element was already set or the allocation failed
+         *                  (the Tensor is left unchanged in both cases)
+         */
+        bool malloc_element(const bool auto_free = true)
+        {
+            if (this->element != NULL)
+                return false;
+
+            this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16);
+            if (this->element == NULL)
+                return false; // report the failed allocation instead of claiming success
+
+            this->auto_free = auto_free;
+
+            return true;
+        }
+
+        /**
+         * @brief If this->element != NULL no memory will be allocated and no value will be set in padding.
+         * Else allocate uninitialized memory and set value to padding.
+         *
+         * @param padding_value value to set in padding
+         * @param auto_free     one of true or false
+         *                      - true: free element when object destroyed
+         *                      - false: do not
+         * @return
+         *         - true: memory allocated and padding value set successfully
+         *         - false: element was already set or the allocation failed;
+         *                  no padding value is set and the Tensor is left unchanged
+         */
+        bool apply_element(const T padding_value = 0, const bool auto_free = true)
+        {
+            if (this->element != NULL)
+                return false;
+
+            this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16);
+            if (this->element == NULL)
+                return false; // never fill padding through a NULL buffer
+
+            this->set_padding_value(this->padding, padding_value);
+            this->auto_free = auto_free;
+
+            return true;
+        }
+
+        /**
+         * @brief Free the element memory and set this->element to NULL.
+         *
+         * Nothing happens unless auto_free is true and this->element is not NULL.
+         */
+        void free_element()
+        {
+            if (!this->auto_free || !this->element)
+            {
+                return;
+            }
+
+            tool::free_aligned(this->element);
+            this->element = NULL;
+        }
+
+        /**
+         * @brief Print the shape of Tensor in format "shape = ({top_padding} + {height} + {bottom_padding}, {left_padding} + {width} + {right_padding}, {channel}({channel_with_padding}))\n".
+         *
+         * Reads three shape entries and four padding entries.
+         */
+        void print_shape()
+        {
+            const std::vector<int> &pad = this->padding;
+            const std::vector<int> &dim = this->shape;
+
+            printf("shape = (%d + %d + %d, %d + %d + %d, %d(%d))\n",
+                   pad[0], dim[0], pad[1],
+                   pad[2], dim[1], pad[3],
+                   dim[2], this->shape_with_padding[2]);
+        }
+
+        /**
+         * @brief Take numpy for example, this function prints Tensor[y_start:y_end, x_start:x_end, c_start:c_end].
+         *
+         * The inner box is the effective value of Tensor, the "0" around it is padding.
+         *
+         * (with padding)
+         *               00000000000000000000000000000000000000000000000000
+         *               00000000000000000000000000000000000000000000000000
+         *               00000000000000000000000000000000000000000000000000
+         *               000000(without padding)                   00000000
+         *               000000                                    00000000
+         *               000000                                    00000000
+         *               000000          effective value           00000000
+         *               000000                                    00000000
+         *               000000                                    00000000
+         *               00000000000000000000000000000000000000000000000000
+         *               00000000000000000000000000000000000000000000000000
+         *               00000000000000000000000000000000000000000000000000
+         *
+         * Start indexes are clamped to 0 and end indexes to the (padded) shape. The clamping
+         * happens after the asserts, so a clamped range can end up empty.
+         *
+         * NOTE(review): every value is printed with "%d"; for T = float the argument would not
+         * match that format - confirm before printing float Tensors.
+         *
+         * @param y_start start index in height
+         * @param y_end   end index in height (exclusive)
+         * @param x_start start index in width
+         * @param x_end   end index in width (exclusive)
+         * @param c_start start index in channel
+         * @param c_end   end index in channel (exclusive)
+         * @param message to print
+         * @param axis    axis along which the output is split into 2D tables (0: per y, 1: per x, otherwise: per c).
+         *                It is effective only if none of y_end - y_start, x_end - x_start and c_end - c_start
+         *                equals 1; if exactly one of them equals 1, that axis is used instead, and if two
+         *                or more equal 1 the remaining range is printed as a single row.
+         * @param with_padding one of true or false,
+         *                     - true: count from (with padding) in upper image
+         *                     - false: count from (without padding) in upper image
+         */
+        void print(int y_start, int y_end,
+                   int x_start, int x_end,
+                   int c_start, int c_end,
+                   const char *message, int axis = 0, const bool with_padding = false)
+        {
+            assert(y_end > y_start);
+            assert(x_end > x_start);
+            assert(c_end > c_start);
+
+            // clamp the requested window to the available data
+            y_start = DL_MAX(y_start, 0);
+            x_start = DL_MAX(x_start, 0);
+            c_start = DL_MAX(c_start, 0);
+            if (with_padding)
+            {
+                y_end = DL_MIN(y_end, this->shape_with_padding[0]);
+                x_end = DL_MIN(x_end, this->shape_with_padding[1]);
+                c_end = DL_MIN(c_end, this->shape_with_padding[2]);
+            }
+            else
+            {
+                y_end = DL_MIN(y_end, this->shape[0]);
+                x_end = DL_MIN(x_end, this->shape[1]);
+                c_end = DL_MIN(c_end, this->shape[2]);
+            }
+
+            printf("%s[%d:%d, %d:%d, %d:%d] | ", message, y_start, y_end, x_start, x_end, c_start, c_end);
+            this->print_shape();
+
+            // When two of the three ranges have length 1, print the remaining range
+            // as one header row of indexes plus one row of values, then return.
+            if (y_end - y_start == 1)
+            {
+                if (x_end - x_start == 1)
+                {
+                    for (int c = c_start; c < c_end; c++)
+                        printf("%7d", c);
+                    printf("\n");
+
+                    for (int c = c_start; c < c_end; c++)
+                        printf("%7d", this->get_element_value({y_start, x_start, c}, with_padding));
+                    printf("\n");
+
+                    return;
+                }
+                else
+                {
+                    if (c_end - c_start == 1)
+                    {
+                        for (int x = x_start; x < x_end; x++)
+                            printf("%7d", x);
+                        printf("\n");
+
+                        for (int x = x_start; x < x_end; x++)
+                            printf("%7d", this->get_element_value({y_start, x, c_start}, with_padding));
+                        printf("\n");
+
+                        return;
+                    }
+                }
+            }
+            else
+            {
+                if (x_end - x_start == 1)
+                {
+                    if (c_end - c_start == 1)
+                    {
+                        for (int y = y_start; y < y_end; y++)
+                            printf("%7d", y);
+                        printf("\n");
+
+                        for (int y = y_start; y < y_end; y++)
+                            printf("%7d", this->get_element_value({y, x_start, c_start}, with_padding));
+                        printf("\n");
+
+                        return;
+                    }
+                }
+            }
+
+            // At most one range has length 1 here; if one does, it overrides the caller's axis.
+            if (y_end - y_start == 1)
+                axis = 0;
+
+            if (x_end - x_start == 1)
+                axis = 1;
+
+            if (c_end - c_start == 1)
+                axis = 2;
+
+            if (axis == 0)
+            {
+                // one table per y:
+                // ______c
+                // |
+                // |
+                // x
+                //
+                for (int y = y_start; y < y_end; y++)
+                {
+                    printf("y = %d\n     ", y);
+
+                    for (int c = c_start; c < c_end; c++)
+                        printf("%7d", c);
+                    printf("\n");
+
+                    for (int x = x_start; x < x_end; x++)
+                    {
+                        printf("%5d", x);
+                        for (int c = c_start; c < c_end; c++)
+                            printf("%7d", this->get_element_value({y, x, c}, with_padding));
+                        printf("\n");
+                    }
+                    printf("\n");
+                }
+            }
+            else if (axis == 1)
+            {
+                // one table per x:
+                // ______c
+                // |
+                // |
+                // y
+                //
+                for (int x = x_start; x < x_end; x++)
+                {
+                    printf("x = %d\n     ", x);
+
+                    for (int c = c_start; c < c_end; c++)
+                        printf("%7d", c);
+                    printf("\n");
+
+                    for (int y = y_start; y < y_end; y++)
+                    {
+                        printf("%5d", y);
+                        for (int c = c_start; c < c_end; c++)
+                            printf("%7d", this->get_element_value({y, x, c}, with_padding));
+                        printf("\n");
+                    }
+                    printf("\n");
+                }
+            }
+            else
+            {
+                // one table per c:
+                // ______x
+                // |
+                // |
+                // y
+                //
+                for (int c = c_start; c < c_end; c++)
+                {
+                    printf("c = %d\n     ", c);
+
+                    for (int x = x_start; x < x_end; x++)
+                        printf("%7d", x);
+                    printf("\n");
+
+                    for (int y = y_start; y < y_end; y++)
+                    {
+                        printf("%5d", y);
+                        for (int x = x_start; x < x_end; x++)
+                            printf("%7d", this->get_element_value({y, x, c}, with_padding));
+                        printf("\n");
+                    }
+                    printf("\n");
+                }
+            }
+
+            return;
+        }
+
+        /**
+         * @brief Print every element of the Tensor on one line, in [y, x, c] order.
+         *
+         * The message and the shape are printed first.
+         *
+         * @param message to print
+         * @param with_padding one of true or false,
+         *                     - true: the padding elements are printed as well
+         *                     - false: only the effective elements are printed
+         */
+        void print_all(const char *message, const bool with_padding = false)
+        {
+            // pick the extent to walk: padded or effective
+            const std::vector<int> &extent = with_padding ? this->shape_with_padding : this->shape;
+            const int height = extent[0];
+            const int width = extent[1];
+            const int channel = extent[2];
+
+            printf("\n%s | ", message);
+            this->print_shape();
+
+            for (int row = 0; row < height; row++)
+            {
+                for (int col = 0; col < width; col++)
+                {
+                    for (int ch = 0; ch < channel; ch++)
+                    {
+                        printf("%d ", this->get_element_value({row, col, ch}, with_padding));
+                    }
+                }
+            }
+            printf("\n");
+        }
+
+        /**
+         * @brief Compare the effective element values with a ground-truth buffer.
+         *
+         * The ground truth is read sequentially in [y, x, c] order without padding.
+         * The comparison stops at the first mismatch, which is printed.
+         *
+         * @param gt_element ground-truth value of element
+         * @param bias permissible absolute error
+         * @param info one of true or false
+         *             - true: print shape and "PASS" on success
+         *             - false: do not
+         * @return
+         *         - true: every value is within the permissible error
+         *         - false: not
+         */
+        bool check_element(T *gt_element, int bias = 2, bool info = true)
+        {
+            if (info)
+            {
+                this->print_shape();
+            }
+
+            const T *expected = gt_element;
+            for (int y = 0; y < this->shape[0]; y++)
+            {
+                for (int x = 0; x < this->shape[1]; x++)
+                {
+                    for (int c = 0; c < this->shape[2]; c++, expected++)
+                    {
+                        int actual = this->get_element_value({y, x, c});
+                        int wanted = *expected;
+                        int difference = DL_ABS(actual - wanted);
+                        if (difference <= bias)
+                        {
+                            continue;
+                        }
+
+                        printf("element[%d, %d, %d]: %d v.s. %d\n", y, x, c, actual, wanted);
+                        return false;
+                    }
+                }
+            }
+
+            if (info)
+            {
+                printf("PASS\n");
+            }
+
+            return true;
+        }
+
+        /**
+         * @brief Check whether the shape (without padding) equals the shape of input.
+         *
+         * @param input an input tensor
+         * @return
+         *         - true: same number of dimensions and same value in every dimension
+         *         - false: not
+         */
+        bool is_same_shape(Tensor<T> &input)
+        {
+            // std::vector equality compares the length first, then every entry
+            return input.shape == this->shape;
+        }
+
+        /**
+         * @brief Deep-copy assignment: copy the description and the element values of input.
+         *
+         * - Assigning a Tensor to itself does nothing.
+         * - If this Tensor already has a buffer of the same padded size, the values are copied
+         *   into it and its ownership (auto_free) is kept as it was.
+         * - Otherwise a new buffer is allocated and owned by this Tensor (auto_free = true);
+         *   the previous buffer is freed only if this Tensor owned it.
+         * - If input has no element memory, this Tensor ends up without element memory as well.
+         * - If the allocation fails, a message is printed and this Tensor is left unchanged.
+         *
+         * @param input Tensor to copy from
+         * @return self
+         */
+        Tensor<T> &operator=(const Tensor<T> &input)
+        {
+            if (this == &input)
+                return *this;
+
+            // Padded sizes must be taken before this->shape_with_padding is overwritten.
+            const int size_real_tmp = this->shape_with_padding.size() >= 3 ? this->shape_with_padding[0] * this->shape_with_padding[1] * this->shape_with_padding[2] : 0;
+            const int size_input_real = input.shape_with_padding.size() >= 3 ? input.shape_with_padding[0] * input.shape_with_padding[1] * input.shape_with_padding[2] : 0;
+
+            const bool has_source = (input.element != NULL) && (size_input_real > 0);
+            const bool reuse_buffer = has_source && (this->element != NULL) && (size_real_tmp == size_input_real);
+
+            if (!reuse_buffer)
+            {
+                T *new_element = NULL;
+                if (has_source)
+                {
+                    new_element = (T *)tool::calloc_aligned(size_input_real, sizeof(T), 16);
+                    if (new_element == NULL)
+                    {
+                        printf("Tensor.operator= failed to allocate %d elements.\n", size_input_real);
+                        return *this;
+                    }
+                }
+
+                // Ownership belongs to the buffer, not to the source object: release the old
+                // buffer only if this Tensor owned it, and always own the freshly allocated one.
+                if (this->auto_free && this->element)
+                    tool::free_aligned(this->element);
+                this->element = new_element;
+                this->auto_free = true;
+            }
+
+            if (has_source)
+                tool::copy_memory(this->element, input.element, size_input_real * sizeof(T));
+
+            this->size = input.size;
+            this->exponent = input.exponent;
+            this->shape = input.shape;
+            this->shape_with_padding = input.shape_with_padding;
+            this->padding = input.padding;
+
+            return *this;
+        }
+    };
+} // namespace dl
--- a/tools/sdk/esp32/include/esp-face/lib/include/cat_face_3.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/cat_face_3.h
@ -1,40 +0,0 @@
-/*
- * ESPRESSIF MIT License
- *
- * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
- *
- * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
- * it is free of charge, to any person_body obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
- * to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or
- * substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
- * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
- * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#pragma once
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "dl_lib_matrix3d.h"
-#include "dl_lib_matrix3dq.h"
-#include "freertos/FreeRTOS.h"
-#include "detection.h"
-
-    extern detection_model_t cat_face_3_model;
-
-#ifdef __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/lib/include/detection.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/detection.h
@ -1,87 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "dl_lib_matrix3d.h"
-#include "dl_lib_matrix3dq.h"
-#include "freertos/FreeRTOS.h"
-
-    typedef enum
-    {
-        Anchor_Point, /*<! Anchor point detection model*/
-        Anchor_Box    /*<! Anchor box detection model */
-    } detection_model_type_t;
-
-    typedef struct
-    {
-        int **anchors_shape; /*<! Anchor shape of this stage */
-        int stride;          /*<! Zoom in stride of this stage */
-        int boundary;        /*<! Detection image low-limit of this stage */
-        int project_offset;  /*<! Project offset of this stage */
-    } detection_stage_config_t;
-
-    typedef struct
-    {
-        dl_matrix3dq_t *score;           /*<! score feature map of this stage*/
-        dl_matrix3dq_t *box_offset;      /*<! box_offset feature map of this stage*/
-        dl_matrix3dq_t *landmark_offset; /*<! landmark_offset feature map of this stage */
-    } detection_stage_result_t;
-
-    typedef struct
-    {
-        int resized_height;    /*<! The height after resized */
-        int resized_width;     /*<! The width after resized */
-        fptp_t y_resize_scale; /*<! resized_height / input_height */
-        fptp_t x_resize_scale; /*<! resized_width / input_width */
-        qtp_t score_threshold; /*<! Score threshold of detection model */
-        fptp_t nms_threshold;  /*<! NMS threshold of detection model */
-        bool with_landmark;    /*<! Whether detection with landmark, true: with, false: without */
-        bool free_image;       /*<! Whether free the resized image */
-        int enabled_top_k;     /*<! The number of enabled stages */
-    } detection_model_config_t;
-
-    typedef struct
-    {
-        detection_stage_config_t *stage_config;                                                                      /*<! Configuration of each stage */
-        int stage_number;                                                                                            /*<! The number of stages */
-        detection_model_type_t model_type;                                                                           /*<! The type of detection model */
-        detection_model_config_t model_config;                                                                       /*<! Configuration of detection model */
-        detection_stage_result_t *(*op)(dl_matrix3dq_t *, detection_model_config_t *);                               /*<! The function of detection inference */
-        void *(*get_boxes)(detection_stage_result_t *, detection_model_config_t *, detection_stage_config_t *, int); /*<! The function of how to get real boxes */
-    } detection_model_t;
-
-    /**
-     * @brief free 'detection_stage_result_t' type value
-     * 
-     * @param value A 'detection_stage_result_t' type value
-     */
-    void free_detection_stage_result(detection_stage_result_t value);
-
-#ifdef __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/lib/include/dl_lib_matrix3d.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/dl_lib_matrix3d.h
@ -1,819 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-
-#if CONFIG_SPIRAM_SUPPORT || CONFIG_ESP32_SPIRAM_SUPPORT
-#include "freertos/FreeRTOS.h"
-#define DL_SPIRAM_SUPPORT 1
-#else
-#define DL_SPIRAM_SUPPORT 0
-#endif
-
-
-#ifndef max /* only defined when the including code has not already provided max */
-#define max(x, y) (((x) < (y)) ? (y) : (x)) /* larger of x and y; the selected argument is evaluated twice, so avoid side effects */
-#endif
-
-#ifndef min /* only defined when the including code has not already provided min */
-#define min(x, y) (((x) < (y)) ? (x) : (y)) /* smaller of x and y; the selected argument is evaluated twice, so avoid side effects */
-#endif
-
-typedef float fptp_t; /*!< Item type of dl_matrix3d_t (floating point) */
-typedef uint8_t uc_t; /*!< Item type of dl_matrix3du_t (unsigned 8-bit) */
-
-typedef enum
-{
-    DL_SUCCESS = 0, /*!< Success status code */
-    DL_FAIL = 1, /*!< Failure status code */
-} dl_error_type;
-
-typedef enum
-{
-    PADDING_VALID = 0,                   /*!< Valid padding */
-    PADDING_SAME = 1,                    /*!< Same padding, from right to left, free input */
-    PADDING_SAME_DONT_FREE_INPUT = 2,    /*!< Same padding, from right to left, do not free input */
-    PADDING_SAME_MXNET = 3,              /*!< Same padding, from left to right */
-} dl_padding_type;
-
-typedef enum
-{
-    DL_POOLING_MAX = 0,        /*!< Max pooling */
-    DL_POOLING_AVG = 1,        /*!< Average pooling */
-} dl_pooling_type; 
-/**
- * @brief Matrix for 3d with float (fptp_t) items
- *
- * @warning The order of the members is fixed and must not be modified, otherwise there will be
- *          errors in esp_dsp_dot_float
- */
-typedef struct
-{
-    int w;        /*!< Width */
-    int h;        /*!< Height */
-    int c;        /*!< Channel */
-    int n;        /*!< Number of filters; must be 1 for input and output */
-    int stride;   /*!< Step between lines (dl_matrix3d_alloc sets it to w * c) */
-    fptp_t *item; /*!< Data */
-} dl_matrix3d_t;
-
-typedef struct
-{
-    int w;      /*!< Width */
-    int h;      /*!< Height */
-    int c;      /*!< Channel */
-    int n;      /*!< Number of filters; must be 1 for input and output */
-    int stride; /*!< Step between lines (dl_matrix3du_alloc sets it to w * c) */
-    uc_t *item; /*!< Data, unsigned 8-bit items */
-} dl_matrix3du_t;
-
-typedef enum
-{
-    UPSAMPLE_NEAREST_NEIGHBOR = 0, /*!< Use nearest neighbor interpolation as the upsample method */
-    UPSAMPLE_BILINEAR = 1,        /*!< Use bilinear interpolation as the upsample method */
-} dl_upsample_type;
-
-typedef struct
-{
-    int stride_x;                    /*!< Strides of width */
-    int stride_y;                    /*!< Strides of height */
-    dl_padding_type padding;         /*!< Padding type */
-} dl_matrix3d_mobilenet_config_t;
-
-/**
- * @brief Allocate a zero-initialized space. Must use 'dl_lib_free' to free the memory.
- *
- * The block is first requested with malloc(). When DL_SPIRAM_SUPPORT is enabled and malloc()
- * fails, a second attempt is made with heap_caps_malloc() using
- * MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM. A message is printed when the allocation fails.
- *
- * The raw block is 'cnt * size + align + sizeof(void *)' bytes. The pointer returned by the
- * allocator is stored in the pointer-sized slot just before the returned address, which is
- * where 'dl_lib_free' reads it back.
- *
- * NOTE(review): 'cnt * size' is multiplied as 'int' and the total is stored in an 'int'; there
- * is no check for overflow or negative arguments. The mask '-align' looks like it expects
- * 'align' to be a power of two -- confirm with callers.
- *
- * @param cnt  Count of units.
- * @param size Size of unit.
- * @param align Align of memory. If not required, set 0.
- * @return Pointer of allocated memory. Null for failed.
- */
-static void *dl_lib_calloc(int cnt, int size, int align)
-{
-    int total_size = cnt * size + align + sizeof(void *); /* payload + alignment slack + slot for the raw pointer */
-    void *res = malloc(total_size);
-    if (NULL == res) /* malloc failed; the #if below selects "retry, then report" or "report directly" */
-    {
-#if DL_SPIRAM_SUPPORT
-        res = heap_caps_malloc(total_size, MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); /* retry with the MALLOC_CAP_SPIRAM capability */
-    }
-    if (NULL == res)
-    {
-        printf("Item psram alloc failed. Size: %d x %d\n", cnt, size);
-#else
-        printf("Item alloc failed. Size: %d x %d, SPIRAM_FLAG: %d\n", cnt, size, DL_SPIRAM_SUPPORT);
-#endif
-        return NULL;
-    }
-    bzero(res, total_size);
-    void **data = (void **)res + 1; /* first address after the slot reserved for the raw pointer */
-    void **aligned;
-    if (align)
-        aligned = (void **)(((size_t)data + (align - 1)) & -align); /* round 'data' up using the mask '-align' */
-    else
-        aligned = data;
-
-    aligned[-1] = res; /* remember the raw pointer so dl_lib_free can release it */
-    return (void *)aligned;
-}
-
-/**
- * @brief Free the memory space allocated by 'dl_lib_calloc'
- *
- * @param d Pointer previously returned by 'dl_lib_calloc'. NULL is ignored.
- */
-static inline void dl_lib_free(void *d)
-{
-    if (NULL == d)
-        return;
-
-    free(((void **)d)[-1]); /* the slot just before 'd' holds the pointer that was actually allocated */
-}
-
-/**
- * @brief Allocate a 3D matrix with float items, the access sequence is NHWC
- *
- * The descriptor and the item buffer are two separate 'dl_lib_calloc' allocations (both
- * zero-initialized, no alignment requested). If the item buffer cannot be allocated, the
- * descriptor is freed again before returning.
- *
- * NOTE(review): the item count n * w * h * c is multiplied as 'int' without an overflow check.
- *
- * @param n     Number of matrix3d, for filters it is out channels, for others it is 1
- * @param w     Width of matrix3d
- * @param h     Height of matrix3d
- * @param c     Channel of matrix3d
- * @return      3d matrix, or NULL if either allocation fails (a message is printed)
- */
-static inline dl_matrix3d_t *dl_matrix3d_alloc(int n, int w, int h, int c)
-{
-    dl_matrix3d_t *r = (dl_matrix3d_t *)dl_lib_calloc(1, sizeof(dl_matrix3d_t), 0);
-    if (NULL == r)
-    {
-        printf("internal r failed.\n");
-        return NULL;
-    }
-    fptp_t *items = (fptp_t *)dl_lib_calloc(n * w * h * c, sizeof(fptp_t), 0);
-    if (NULL == items)
-    {
-        printf("matrix3d item alloc failed.\n");
-        dl_lib_free(r); /* do not leak the descriptor when the items cannot be allocated */
-        return NULL;
-    }
-
-    r->w = w;
-    r->h = h;
-    r->c = c;
-    r->n = n;
-    r->stride = w * c; /* items per line: width times channels */
-    r->item = items;
-
-    return r;
-}
-
-/**
- * @brief Allocate a 3D matrix with 8-bits items, the access sequence is NHWC
- *
- * The descriptor and the item buffer are two separate 'dl_lib_calloc' allocations (both
- * zero-initialized, no alignment requested). If the item buffer cannot be allocated, the
- * descriptor is freed again before returning.
- *
- * NOTE(review): the item count n * w * h * c is multiplied as 'int' without an overflow check.
- *
- * @param n     Number of matrix3d, for filters it is out channels, for others it is 1
- * @param w     Width of matrix3d
- * @param h     Height of matrix3d
- * @param c     Channel of matrix3d
- * @return      3d matrix, or NULL if either allocation fails (a message is printed)
- */
-static inline dl_matrix3du_t *dl_matrix3du_alloc(int n, int w, int h, int c)
-{
-    dl_matrix3du_t *r = (dl_matrix3du_t *)dl_lib_calloc(1, sizeof(dl_matrix3du_t), 0);
-    if (NULL == r)
-    {
-        printf("internal r failed.\n");
-        return NULL;
-    }
-    uc_t *items = (uc_t *)dl_lib_calloc(n * w * h * c, sizeof(uc_t), 0);
-    if (NULL == items)
-    {
-        printf("matrix3du item alloc failed.\n");
-        dl_lib_free(r); /* do not leak the descriptor when the items cannot be allocated */
-        return NULL;
-    }
-
-    r->w = w;
-    r->h = h;
-    r->c = c;
-    r->n = n;
-    r->stride = w * c; /* items per line: width times channels */
-    r->item = items;
-
-    return r;
-}
-
-/**
- * @brief Free a matrix3d together with its item buffer
- *
- * Both the descriptor and the items are released with 'dl_lib_free'. A NULL matrix is
- * ignored; a matrix whose 'item' is NULL has only its descriptor freed.
- *
- * @param m matrix3d with float items
- */
-static inline void dl_matrix3d_free(dl_matrix3d_t *m)
-{
-    if (NULL == m)
-        return;
-    if (NULL == m->item)
-    {
-        dl_lib_free(m);
-        return;
-    }
-    dl_lib_free(m->item); /* items first: 'm' must still be valid to read m->item */
-    dl_lib_free(m);
-}
-
-/**
- * @brief Free a matrix3du together with its item buffer
- *
- * Both the descriptor and the items are released with 'dl_lib_free'. A NULL matrix is
- * ignored; a matrix whose 'item' is NULL has only its descriptor freed.
- *
- * @param m matrix3d with 8-bits items
- */
-static inline void dl_matrix3du_free(dl_matrix3du_t *m)
-{
-    if (NULL == m)
-        return;
-    if (NULL == m->item)
-    {
-        dl_lib_free(m);
-        return;
-    }
-    dl_lib_free(m->item); /* items first: 'm' must still be valid to read m->item */
-    dl_lib_free(m);
-}
-
-
-/**
- * @brief Dot product of a vector and a matrix
- *
- * @param out   Space to put the result
- * @param in    Input vector
- * @param f     Filter matrix
- */
-void dl_matrix3dff_dot_product(dl_matrix3d_t *out, dl_matrix3d_t *in, dl_matrix3d_t *f);
-
-/**
- * @brief Do a softmax operation on a matrix3d
- *
- * @param m         Input matrix3d
- */
-void dl_matrix3d_softmax(dl_matrix3d_t *m);
-
-/**
- * @brief Copy a range of float items from an existing matrix to a preallocated matrix
- *
- * @param dst   The destination slice matrix
- * @param src   The source matrix to slice
- * @param x     X-offset of the origin of the returned matrix within the sliced matrix
- * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
- * @param w     Width of the resulting matrix
- * @param h     Height of the resulting matrix
- */
-void dl_matrix3d_slice_copy(dl_matrix3d_t *dst,
-                            dl_matrix3d_t *src,
-                            int x,
-                            int y,
-                            int w,
-                            int h);
-
-/**
- * @brief Copy a range of 8-bits items from an existing matrix to a preallocated matrix
- *
- * @param dst   The destination slice matrix
- * @param src   The source matrix to slice
- * @param x     X-offset of the origin of the returned matrix within the sliced matrix
- * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
- * @param w     Width of the resulting matrix
- * @param h     Height of the resulting matrix
- */
-void dl_matrix3du_slice_copy(dl_matrix3du_t *dst,
-                             dl_matrix3du_t *src,
-                             int x,
-                             int y,
-                             int w,
-                             int h);
-
-/**
- * @brief Transform a sliced matrix block from nhwc to nchw, the block needs to be continuous in memory.
- *
- * @param out  The destination sliced matrix in nchw
- * @param in   The source sliced matrix in nhwc
- */
-void dl_matrix3d_sliced_transform_nchw(dl_matrix3d_t *out,
-                                       dl_matrix3d_t *in);
-
-/**
- * @brief Do a general CNN layer pass, dimension is (number, width, height, channel)
- *
- * @param in               Input matrix3d
- * @param filter           Weights of the neurons
- * @param bias             Bias for the CNN layer
- * @param stride_x         The step length of the convolution window in x(width) direction
- * @param stride_y         The step length of the convolution window in y(height) direction
- * @param padding          One of VALID or SAME
- * @param mode             Do convolution using the C implementation or the xtensa implementation, 0 or 1, respectively.
- *                         If ESP_PLATFORM is not defined, this value is not used. Default is 0
- * @return dl_matrix3d_t*  The result of CNN layer
- */
-dl_matrix3d_t *dl_matrix3d_conv(dl_matrix3d_t *in,
-                                dl_matrix3d_t *filter,
-                                dl_matrix3d_t *bias,
-                                int stride_x,
-                                int stride_y,
-                                int padding,
-                                int mode);
-
-/**
- * @brief Do a global average pooling layer pass, dimension is (number, width, height, channel)
- *
- * @param in             Input matrix3d
- *
- * @return               The result of global average pooling layer
- */
-dl_matrix3d_t *dl_matrix3d_global_pool(dl_matrix3d_t *in);
-
-/**
- * @brief Calculate pooling layer of a feature map
- *
- * @param in               Input matrix, size (1, w, h, c)
- * @param f_w              Window width
- * @param f_h              Window height 
- * @param stride_x         Stride in horizontal direction
- * @param stride_y         Stride in vertical direction
- * @param padding          Padding type: PADDING_VALID or PADDING_SAME
- * @param pooling_type     Pooling type: DL_POOLING_MAX or DL_POOLING_AVG
- * @return dl_matrix3d_t*  Resulting matrix, size (1, w', h', c)
- */
-dl_matrix3d_t *dl_matrix3d_pooling(dl_matrix3d_t *in,
-                                   int f_w,
-                                   int f_h,
-                                   int stride_x,
-                                   int stride_y,
-                                   dl_padding_type padding,
-                                   dl_pooling_type pooling_type);
-/**
- * @brief Do a batch normalization operation, update the input matrix3d: input = input * scale + offset
- *
- * @param m              Input matrix3d
- * @param scale          scale matrix3d,  scale = gamma/((moving_variance+sigma)^(1/2))
- * @param offset         Offset matrix3d, offset = beta-(moving_mean*gamma/((moving_variance+sigma)^(1/2)))
- */
-void dl_matrix3d_batch_normalize(dl_matrix3d_t *m,
-                                 dl_matrix3d_t *scale,
-                                 dl_matrix3d_t *offset);
-
-/**
- * @brief Add a pair of matrix3d item-by-item: res=in_1+in_2
- *
- * @param in_1             First Floating point input matrix3d
- * @param in_2             Second Floating point input matrix3d
- *
- * @return dl_matrix3d_t*  Added data
- */
-dl_matrix3d_t *dl_matrix3d_add(dl_matrix3d_t *in_1, dl_matrix3d_t *in_2);
-
-/**
- * @brief Concatenate the channels of two matrix3ds into a new matrix3d
- *
- * @param in_1             First Floating point input matrix3d
- * @param in_2             Second Floating point input matrix3d
- *
- * @return dl_matrix3d_t*  A newly allocated matrix3d with values in_1|in_2
- */
-dl_matrix3d_t *dl_matrix3d_concat(dl_matrix3d_t *in_1, dl_matrix3d_t *in_2);
-
-/**
- * @brief Concatenate the channels of four matrix3ds into a new matrix3d
- *
- * @param in_1           First Floating point input matrix3d
- * @param in_2           Second Floating point input matrix3d
- * @param in_3           Third Floating point input matrix3d
- * @param in_4           Fourth Floating point input matrix3d
- *
- * @return               A newly allocated matrix3d with values in_1|in_2|in_3|in_4
- */
-dl_matrix3d_t *dl_matrix3d_concat_4(dl_matrix3d_t *in_1,
-                                    dl_matrix3d_t *in_2,
-                                    dl_matrix3d_t *in_3,
-                                    dl_matrix3d_t *in_4);
-
-/**
- * @brief Concatenate the channels of eight matrix3ds into a new matrix3d
- *
- * @param in_1           First Floating point input matrix3d
- * @param in_2           Second Floating point input matrix3d
- * @param in_3           Third Floating point input matrix3d
- * @param in_4           Fourth Floating point input matrix3d
- * @param in_5           Fifth Floating point input matrix3d
- * @param in_6           Sixth Floating point input matrix3d
- * @param in_7           Seventh Floating point input matrix3d
- * @param in_8           Eighth Floating point input matrix3d
- *
- * @return               A newly allocated matrix3d with values in_1|in_2|in_3|in_4|in_5|in_6|in_7|in_8
- */
-dl_matrix3d_t *dl_matrix3d_concat_8(dl_matrix3d_t *in_1,
-                                    dl_matrix3d_t *in_2,
-                                    dl_matrix3d_t *in_3,
-                                    dl_matrix3d_t *in_4,
-                                    dl_matrix3d_t *in_5,
-                                    dl_matrix3d_t *in_6,
-                                    dl_matrix3d_t *in_7,
-                                    dl_matrix3d_t *in_8);
-
-/**
- * @brief Do a mobilefacenet block forward, dimension is (number, width, height, channel)
- *
- * @param in                    Input matrix3d
- * @param pw                    Weights of the pointwise conv layer
- * @param pw_bn_scale           The scale params of the batch_normalize layer after the pointwise conv layer
- * @param pw_bn_offset          The offset params of the batch_normalize layer after the pointwise conv layer
- * @param dw                    Weights of the depthwise conv layer
- * @param dw_bn_scale           The scale params of the batch_normalize layer after the depthwise conv layer
- * @param dw_bn_offset          The offset params of the batch_normalize layer after the depthwise conv layer
- * @param pw_linear             Weights of the pointwise linear conv layer
- * @param pw_linear_bn_scale    The scale params of the batch_normalize layer after the pointwise linear conv layer
- * @param pw_linear_bn_offset   The offset params of the batch_normalize layer after the pointwise linear conv layer
- * @param stride_x              The step length of the convolution window in x(width) direction
- * @param stride_y              The step length of the convolution window in y(height) direction
- * @param padding               One of VALID or SAME
- * @param mode                  Do convolution using the C implementation or the xtensa implementation, 0 or 1, respectively.
- *                              If ESP_PLATFORM is not defined, this value is not used. Default is 0
- * @return                      The result of a mobilefacenet block
- */
-dl_matrix3d_t *dl_matrix3d_mobilefaceblock(dl_matrix3d_t *in,
-                                           dl_matrix3d_t *pw,
-                                           dl_matrix3d_t *pw_bn_scale,
-                                           dl_matrix3d_t *pw_bn_offset,
-                                           dl_matrix3d_t *dw,
-                                           dl_matrix3d_t *dw_bn_scale,
-                                           dl_matrix3d_t *dw_bn_offset,
-                                           dl_matrix3d_t *pw_linear,
-                                           dl_matrix3d_t *pw_linear_bn_scale,
-                                           dl_matrix3d_t *pw_linear_bn_offset,
-                                           int stride_x,
-                                           int stride_y,
-                                           int padding,
-                                           int mode,
-                                           int shortcut);
-
-/**
- * @brief Do a mobilefacenet block forward with 1x1 split conv, dimension is (number, width, height, channel)
- *
- * @param in                    Input matrix3d
- * @param pw_1                  Weights of the pointwise conv layer 1
- * @param pw_2                  Weights of the pointwise conv layer 2
- * @param pw_bn_scale           The scale params of the batch_normalize layer after the pointwise conv layer
- * @param pw_bn_offset          The offset params of the batch_normalize layer after the pointwise conv layer
- * @param dw                    Weights of the depthwise conv layer
- * @param dw_bn_scale           The scale params of the batch_normalize layer after the depthwise conv layer
- * @param dw_bn_offset          The offset params of the batch_normalize layer after the depthwise conv layer
- * @param pw_linear_1           Weights of the pointwise linear conv layer 1
- * @param pw_linear_2           Weights of the pointwise linear conv layer 2
- * @param pw_linear_bn_scale    The scale params of the batch_normalize layer after the pointwise linear conv layer
- * @param pw_linear_bn_offset   The offset params of the batch_normalize layer after the pointwise linear conv layer
- * @param stride_x              The step length of the convolution window in x(width) direction
- * @param stride_y              The step length of the convolution window in y(height) direction
- * @param padding               One of VALID or SAME
- * @param mode                  Do convolution using the C implementation or the xtensa implementation, 0 or 1, respectively.
- *                              If ESP_PLATFORM is not defined, this value is not used. Default is 0
- * @return                      The result of a mobilefacenet block
- */
-dl_matrix3d_t *dl_matrix3d_mobilefaceblock_split(dl_matrix3d_t *in,
-                                                 dl_matrix3d_t *pw_1,
-                                                 dl_matrix3d_t *pw_2,
-                                                 dl_matrix3d_t *pw_bn_scale,
-                                                 dl_matrix3d_t *pw_bn_offset,
-                                                 dl_matrix3d_t *dw,
-                                                 dl_matrix3d_t *dw_bn_scale,
-                                                 dl_matrix3d_t *dw_bn_offset,
-                                                 dl_matrix3d_t *pw_linear_1,
-                                                 dl_matrix3d_t *pw_linear_2,
-                                                 dl_matrix3d_t *pw_linear_bn_scale,
-                                                 dl_matrix3d_t *pw_linear_bn_offset,
-                                                 int stride_x,
-                                                 int stride_y,
-                                                 int padding,
-                                                 int mode,
-                                                 int shortcut);
-
-/**
- * @brief           Initialize the matrix3d feature map to bias
- * 
- * @param out       The matrix3d feature map needs to be initialized
- * @param bias      The bias of a convolution operation
- */
-void dl_matrix3d_init_bias(dl_matrix3d_t *out, dl_matrix3d_t *bias);
-
-/**
- * @brief  Do an elementwise multiplication of two matrix3ds
- * 
- * @param out  Preallocated matrix3d, size (n, w, h, c)
- * @param in1  Input matrix 1, size (n, w, h, c)
- * @param in2  Input matrix 2, size (n, w, h, c)
- */
-void dl_matrix3d_multiply(dl_matrix3d_t *out, dl_matrix3d_t *in1, dl_matrix3d_t *in2);
-
-//
-// Activation
-//
-
-/**
- * @brief Do a standard relu operation, update the input matrix3d
- *
- * @param m        Floating point input matrix3d
- */
-void dl_matrix3d_relu(dl_matrix3d_t *m);
-
-/**
- * @brief Do a relu (Rectified Linear Unit) operation with an upper clip, update the input matrix3d
- *
- * @param m         Floating point input matrix3d
- * @param clip      If value is higher than this, it will be clipped to this value
- */
-void dl_matrix3d_relu_clip(dl_matrix3d_t *m, fptp_t clip);
-
-/**
- * @brief Do a PReLU (Parametric Rectified Linear Unit) operation, update the input matrix3d
- *
- * @param in        Floating point input matrix3d
- * @param alpha     If value is less than zero, it will be updated by multiplying this factor
- */
-void dl_matrix3d_p_relu(dl_matrix3d_t *in, dl_matrix3d_t *alpha);
-
-/**
- * @brief Do a leaky relu (Rectified Linear Unit) operation, update the input matrix3d
- *
- * @param m         Floating point input matrix3d
- * @param alpha     If value is less than zero, it will be updated by multiplying this factor
- */
-void dl_matrix3d_leaky_relu(dl_matrix3d_t *m, fptp_t alpha);
-
-//
-// Conv 1x1
-//
-/**
- * @brief Do 1x1 convolution with a matrix3d
- * 
- * @param out        Preallocated matrix3d, size (1, w, h, n)
- * @param in         Input matrix, size (1, w, h, c)
- * @param filter     1x1 filter, size (n, 1, 1, c)
- */
-void dl_matrix3dff_conv_1x1(dl_matrix3d_t *out,
-                            dl_matrix3d_t *in,
-                            dl_matrix3d_t *filter);
-
-/**
- * @brief Do 1x1 convolution with a matrix3d, with bias adding
- * 
- * @param out        Preallocated matrix3d, size (1, w, h, n)
- * @param in         Input matrix, size (1, w, h, c)
- * @param filter     1x1 filter, size (n, 1, 1, c)
- * @param bias       Bias, size (1, 1, 1, n)
- */
-void dl_matrix3dff_conv_1x1_with_bias(dl_matrix3d_t *out,
-                                      dl_matrix3d_t *in,
-                                      dl_matrix3d_t *filter,
-                                      dl_matrix3d_t *bias);
-
-/**
- * @brief Do 1x1 convolution with an 8-bit fixed point matrix
- * 
- * @param out        Preallocated matrix3d, size (1, w, h, n)
- * @param in         Input matrix, size (1, w, h, c)
- * @param filter     1x1 filter, size (n, 1, 1, c)
- */
-void dl_matrix3duf_conv_1x1(dl_matrix3d_t *out,
-                            dl_matrix3du_t *in,
-                            dl_matrix3d_t *filter);
-
-/**
- * @brief Do 1x1 convolution with an 8-bit fixed point matrix, with bias adding
- * 
- * @param out        Preallocated matrix3d, size (1, w, h, n)  
- * @param in         Input matrix, size (1, w, h, c)
- * @param filter     1x1 filter, size (n, 1, 1, c)
- * @param bias       Bias, size (1, 1, 1, n)
- */
-void dl_matrix3duf_conv_1x1_with_bias(dl_matrix3d_t *out,
-                                      dl_matrix3du_t *in,
-                                      dl_matrix3d_t *filter,
-                                      dl_matrix3d_t *bias);
-
-//
-// Conv 3x3
-//
-
-/**
- * @brief Do 3x3 convolution with a matrix3d, without padding
- * 
- * @param out        Preallocated matrix3d, size (1, w, h, n)
- * @param in         Input matrix, size (1, w, h, c)
- * @param f          3x3 filter, size (n, 3, 3, c)
- * @param step_x     Stride of width
- * @param step_y     Stride of height
- */
-void dl_matrix3dff_conv_3x3_op(dl_matrix3d_t *out,
-                               dl_matrix3d_t *in,
-                               dl_matrix3d_t *f,
-                               int step_x,
-                               int step_y);
-
-/**
- * @brief Do 3x3 convolution with a matrix3d, with bias adding
- * 
- * @param in                Input matrix, size (1, w, h, c)
- * @param filter            3x3 filter, size (n, 3, 3, c)
- * @param bias              Bias, size (1, 1, 1, n)
- * @param stride_x          Stride of width
- * @param stride_y          Stride of height
- * @param padding           Padding type
- * @return dl_matrix3d_t*   Resulting matrix3d
- */
-dl_matrix3d_t *dl_matrix3dff_conv_3x3(dl_matrix3d_t *in,
-                                      dl_matrix3d_t *filter,
-                                      dl_matrix3d_t *bias,
-                                      int stride_x,
-                                      int stride_y,
-                                      dl_padding_type padding);
-
-//
-// Conv Common
-//
-
-/**
- * @brief Do a general convolution layer pass with an 8-bit fixed point matrix, size is (number, width, height, channel)
- * 
- * @param in                Input image
- * @param filter            Weights of the neurons
- * @param bias              Bias for the CNN layer
- * @param stride_x          The step length of the convolution window in x(width) direction
- * @param stride_y          The step length of the convolution window in y(height) direction
- * @param padding           Padding type
- * @return dl_matrix3d_t*   Resulting matrix3d
- */
-dl_matrix3d_t *dl_matrix3duf_conv_common(dl_matrix3du_t *in,
-                                         dl_matrix3d_t *filter,
-                                         dl_matrix3d_t *bias,
-                                         int stride_x,
-                                         int stride_y,
-                                         dl_padding_type padding);
-
-/**
- * @brief Do a general convolution layer pass, size is (number, width, height, channel)
- * 
- * @param in                Input image
- * @param filter            Weights of the neurons
- * @param bias              Bias for the CNN layer
- * @param stride_x          The step length of the convolution window in x(width) direction
- * @param stride_y          The step length of the convolution window in y(height) direction
- * @param padding           Padding type
- * @return dl_matrix3d_t*   Resulting matrix3d
- */
-dl_matrix3d_t *dl_matrix3dff_conv_common(dl_matrix3d_t *in,
-                                         dl_matrix3d_t *filter,
-                                         dl_matrix3d_t *bias,
-                                         int stride_x,
-                                         int stride_y,
-                                         dl_padding_type padding);
-
-//
-// Depthwise 3x3
-//
-
-/**
- * @brief Do 3x3 depthwise convolution with a float matrix3d
- * 
- * @param in                  Input matrix, size (1, w, h, c)
- * @param filter              3x3 filter, size (1, 3, 3, c)
- * @param stride_x            Stride of width
- * @param stride_y            Stride of height
- * @param padding             Padding type, 0: valid, 1: same
- * @return dl_matrix3d_t*     Resulting float matrix3d
- */
-dl_matrix3d_t *dl_matrix3dff_depthwise_conv_3x3(dl_matrix3d_t *in,
-                                                dl_matrix3d_t *filter,
-                                                int stride_x,
-                                                int stride_y,
-                                                int padding);
-
-/**
- * @brief Do 3x3 depthwise convolution with an 8-bit fixed point matrix
- * 
- * @param in                  Input matrix, size (1, w, h, c)
- * @param filter              3x3 filter, size (1, 3, 3, c)
- * @param stride_x            Stride of width
- * @param stride_y            Stride of height
- * @param padding             Padding type, 0: valid, 1: same
- * @return dl_matrix3d_t*     Resulting float matrix3d
- */
-dl_matrix3d_t *dl_matrix3duf_depthwise_conv_3x3(dl_matrix3du_t *in,
-                                                dl_matrix3d_t *filter,
-                                                int stride_x,
-                                                int stride_y,
-                                                int padding);
-
-/**
- * @brief Do 3x3 depthwise convolution with a float matrix3d, without padding
- * 
- * @param out                 Preallocated matrix3d, size (1, w, h, n)
- * @param in                  Input matrix, size (1, w, h, c)
- * @param f                   3x3 filter, size (1, 3, 3, c)
- * @param step_x              Stride of width
- * @param step_y              Stride of height
- */
-void dl_matrix3dff_depthwise_conv_3x3_op(dl_matrix3d_t *out,
-                                         dl_matrix3d_t *in,
-                                         dl_matrix3d_t *f,
-                                         int step_x,
-                                         int step_y);
-
-//
-// Depthwise Common
-//
-
-/**
- * @brief Do a depthwise CNN layer pass, dimension is (number, width, height, channel)
- *
- * @param in             Input matrix3d
- * @param filter         Weights of the neurons
- * @param stride_x       The step length of the convolution window in x(width) direction
- * @param stride_y       The step length of the convolution window in y(height) direction
- * @param padding        One of VALID or SAME
- * @param mode           Do convolution using C implement or xtensa implement, 0 or 1, with respect
- *                       If ESP_PLATFORM is not defined, this value is not used. Default is 0
- * @return               The result of depthwise CNN layer
- */
-dl_matrix3d_t *dl_matrix3dff_depthwise_conv_common(dl_matrix3d_t *in,
-                                                   dl_matrix3d_t *filter,
-                                                   int stride_x,
-                                                   int stride_y,
-                                                   dl_padding_type padding);
-
-//
-// FC
-//
-/**
- * @brief Do a general fully connected layer pass without bias, dimension is (number, width, height, channel)
- *
- * @param out            The result of fc layer, size is (1, 1, 1, h)
- * @param in             Input matrix3d, size is (1, w, 1, 1)
- * @param filter         Weights of the neurons, size is (1, w, h, 1)
- */
-void dl_matrix3dff_fc(dl_matrix3d_t *out,
-                      dl_matrix3d_t *in,
-                      dl_matrix3d_t *filter);
-
-/**
- * @brief Do fully connected layer forward, with bias adding
- *
- * @param out       Preallocated resulting matrix, size (1, 1, 1, h)
- * @param in        Input matrix, size (1, 1, 1, w)
- * @param filter    Filter matrix, size (1, w, h, 1)
- * @param bias      Bias matrix, size (1, 1, 1, h)
- */
-void dl_matrix3dff_fc_with_bias(dl_matrix3d_t *out,
-                                dl_matrix3d_t *in,
-                                dl_matrix3d_t *filter,
-                                dl_matrix3d_t *bias);
-
-//
-// Mobilenet
-//
-
-/**
- * @brief Do a mobilenet block forward, dimension is (number, width, height, channel)
- *
- * @param in             Input matrix3d
- * @param filter         Weights of the neurons
- * @param stride_x       The step length of the convolution window in x(width) direction
- * @param stride_y       The step length of the convolution window in y(height) direction
- * @param padding        One of VALID or SAME
- * @param mode           Do convolution using C implement or xtensa implement, 0 or 1, with respect
- *                       If ESP_PLATFORM is not defined, this value is not used. Default is 0
- * @return               The result of depthwise CNN layer
- */
-dl_matrix3d_t *dl_matrix3dff_mobilenet(dl_matrix3d_t *in,
-                                       dl_matrix3d_t *dilate_filter,
-                                       dl_matrix3d_t *dilate_prelu,
-                                       dl_matrix3d_t *depthwise_filter,
-                                       dl_matrix3d_t *depthwise_prelu,
-                                       dl_matrix3d_t *compress_filter,
-                                       dl_matrix3d_t *bias,
-                                       dl_matrix3d_mobilenet_config_t config);
-
-/**
- * @brief Do a mobilenet block forward, dimension is (number, width, height, channel)
- *
- * @param in             Input matrix3du
- * @param filter         Weights of the neurons
- * @param stride_x       The step length of the convolution window in x(width) direction
- * @param stride_y       The step length of the convolution window in y(height) direction
- * @param padding        One of VALID or SAME
- * @param mode           Do convolution using C implement or xtensa implement, 0 or 1, with respect
- *                       If ESP_PLATFORM is not defined, this value is not used. Default is 0
- * @return               The result of depthwise CNN layer
- */
-dl_matrix3d_t *dl_matrix3duf_mobilenet(dl_matrix3du_t *in,
-                                       dl_matrix3d_t *dilate_filter,
-                                       dl_matrix3d_t *dilate_prelu,
-                                       dl_matrix3d_t *depthwise_filter,
-                                       dl_matrix3d_t *depthwise_prelu,
-                                       dl_matrix3d_t *compress_filter,
-                                       dl_matrix3d_t *bias,
-                                       dl_matrix3d_mobilenet_config_t config);
--- a/tools/sdk/esp32/include/esp-face/lib/include/dl_lib_matrix3dq.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/dl_lib_matrix3dq.h
--- a/tools/sdk/esp32/include/esp-face/lib/include/frmn.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/frmn.h
@ -1,43 +0,0 @@
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "dl_lib_matrix3d.h"
-#include "dl_lib_matrix3dq.h"
-
-    /**
-     * @brief Run the forward pass of face recognition with the frmn model. Calculated in float.
-     *
-     * @param in    Image matrix, rgb888 format, size is 56x56, normalized
-     * @return dl_matrix3d_t* Face ID feature vector, size is 512
-     */
-    dl_matrix3d_t *frmn(dl_matrix3d_t *in);
-    
-    /**@{*/
-    /**
-     * @brief Run the forward pass of face recognition with the specified model. Calculated in quantization.
-     *
-     * This description is shared by every function in this group.
-     * NOTE(review): each function name presumably identifies the model it runs -- confirm.
-     *
-     * @param in    Image matrix, rgb888 format, size is 56x56, normalized
-     * @param mode  0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return      Face ID feature vector, size is 512
-     */
-    dl_matrix3dq_t *frmn_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    dl_matrix3dq_t *frmn2p_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    dl_matrix3dq_t *mfn56_42m_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    dl_matrix3dq_t *mfn56_72m_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    dl_matrix3dq_t *mfn56_112m_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    dl_matrix3dq_t *mfn56_156m_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    /**@}*/
-
-#if __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/lib/include/hd_model.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/hd_model.h
@ -1,66 +0,0 @@
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "dl_lib_matrix3d.h"
-#include "dl_lib_matrix3dq.h"
-
-    typedef struct
-    {
-        int num;              /*!< The total number of boxes */
-        dl_matrix3d_t *cls;   /*!< The class feature map corresponding to the boxes. Size: (height, width, anchor_num, 1) */
-        dl_matrix3d_t *score; /*!< The confidence score feature map of the class corresponding to each box. Size: (height, width, anchor_num, 1) */
-        dl_matrix3d_t *boxes; /*!< (x, y, w, h) of the boxes; x and y are the center coordinates. Size: (height, width, anchor_num, 4) */
-    } detection_result_t;
-
-    /**
-     * @brief Run the forward pass of hand detection with the hd_nano1 model. Calculated in quantization.
-     *
-     * @param in                      A normalized image matrix in rgb888 format; its width and height must be integer multiples of 16.
-     * @param mode                    0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return detection_result_t**   Detection results
-     *                                (NOTE(review): presumably released with detection_results_free() -- confirm)
-     */
-    detection_result_t **hd_nano1_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    /**
-     * @brief Run the forward pass of hand detection with the hd_lite1 model. Calculated in quantization.
-     *
-     * @param in                      A normalized image matrix in rgb888 format; its width and height must be integer multiples of 32.
-     * @param mode                    0: C implementation; 1: hand-written Xtensa instruction implementation.
-     * @return detection_result_t**   Detection results
-     *                                (NOTE(review): presumably released with detection_results_free() -- confirm)
-     */
-    detection_result_t **hd_lite1_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    /**
-     * @brief Free a single detection result.
-     *
-     * @param m     The single detection result to free.
-     */
-    void detection_result_free(detection_result_t *m);
-
-    /**
-     * @brief Free a group of detection results coming from different feature maps.
-     *
-     * @param m       The detection result group
-     * @param length  The number of detection results in the group
-     */
-    void detection_results_free(detection_result_t **m, int length);
-
-    /**
-     * @brief Test the result of the hand detection model.
-     *
-     */
-    void hd_test();
-
-    /**
-     * @brief Test the forward-pass time of the hand detection model.
-     *
-     */
-    void hd_time_test();
-
-#if __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/lib/include/hp_model.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/hp_model.h
@ -1,43 +0,0 @@
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "dl_lib_matrix3d.h"
-#include "dl_lib_matrix3dq.h"
-
-    /**
-     * @brief Run the forward pass of hand pose estimation with the hp_nano1_ls16 model. Calculated in quantization.
-     *
-     * @param in                 A normalized image matrix in rgb888 format; its size is (1, 128, 128, 3).
-     * @param mode               0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return dl_matrix3d_t*    The resulting hand joint point coordinates; the size is (1, 1, 21, 2)
-     */
-    dl_matrix3d_t *hp_nano1_ls16_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    /**
-     * @brief Run the forward pass of hand pose estimation with the hp_lite1 model. Calculated in quantization.
-     *
-     * @param in                 A normalized image matrix in rgb888 format; its size is (1, 128, 128, 3).
-     * @param mode               0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return dl_matrix3d_t*    The resulting hand joint point coordinates; the size is (1, 1, 21, 2)
-     */
-    dl_matrix3d_t *hp_lite1_q(dl_matrix3dq_t *in, dl_conv_mode mode);
-
-    /**
-     * @brief Test the result of the hand pose estimation model.
-     *
-     */
-    void hp_test();
-
-    /**
-     * @brief Test the forward-pass time of the hand pose estimation model.
-     *
-     */
-    void hp_time_test();
-
-#if __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/lib/include/lssh.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/lssh.h
@ -1,91 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "dl_lib_matrix3d.h"
-#include "dl_lib_matrix3dq.h"
-#include "freertos/FreeRTOS.h"
-
-    typedef struct
-    {
-        int resized_height;     /*!< NOTE(review): presumably the image height after resizing -- confirm */
-        int resized_width;      /*!< NOTE(review): presumably the image width after resizing -- confirm */
-        fptp_t y_resize_scale;  /*!< NOTE(review): presumably the resize scale along y (height) -- confirm */
-        fptp_t x_resize_scale;  /*!< NOTE(review): presumably the resize scale along x (width) -- confirm */
-        int enabled_top_k;      /*!< NOTE(review): presumably the number of top-scoring candidates kept -- confirm */
-        fptp_t score_threshold; /*!< NOTE(review): presumably the score threshold used to filter candidates -- confirm */
-        fptp_t nms_threshold;   /*!< NOTE(review): presumably the NMS threshold used to drop overlapping boxes -- confirm */
-
-        dl_conv_mode mode;      /*!< Convolution mode (dl_conv_mode) */
-    } lssh_config_t;
-
-    typedef struct
-    {
-        int *anchor_size; /*!< NOTE(review): presumably an array of anchor sizes for this module; length not visible here -- confirm */
-        int stride;       /*!< NOTE(review): presumably the stride of this module's feature map -- confirm */
-        int boundary;     /*!< NOTE(review): meaning not evident from this header -- confirm */
-    } lssh_module_config_t;
-
-    typedef struct
-    {
-        lssh_module_config_t *module_config; /*!< Pointer to per-module configuration(s) */
-        int number;                          /*!< NOTE(review): presumably the number of entries in module_config -- confirm */
-    } lssh_modules_config_t;
-
-    typedef struct
-    {
-        dl_matrix3d_t *category;        /*!< NOTE(review): presumably the classification output of the module -- confirm */
-        dl_matrix3d_t *box_offset;      /*!< NOTE(review): presumably the bounding box offsets -- confirm */
-        dl_matrix3d_t *landmark_offset; /*!< NOTE(review): presumably the landmark offsets -- confirm */
-    } lssh_module_result_t;
-
-    /**
-     * @brief Free a single LSSH module result.
-     *
-     * NOTE(review): presumably releases the matrices referenced by the result -- confirm.
-     *
-     * @param value The module result, passed by value
-     */
-    void lssh_module_result_free(lssh_module_result_t value);
-
-    /**
-     * @brief Free a group of LSSH module results.
-     *
-     * @param values Pointer to the module results
-     * @param length NOTE(review): presumably the number of results in values -- confirm
-     */
-    void lssh_module_results_free(lssh_module_result_t *values, int length);
-
-    // sparse_mn_5_q: module configuration plus two forward entry points,
-    // one without and one with landmark output (per the function names).
-    // NOTE(review): free_image presumably lets the callee free `image` -- confirm.
-    extern lssh_modules_config_t sparse_mn_5_modules_config;
-    lssh_module_result_t *sparse_mn_5_q_without_landmark(dl_matrix3du_t *image, bool free_image, int enabled_top_k, dl_conv_mode mode);
-    lssh_module_result_t *sparse_mn_5_q_with_landmark(dl_matrix3du_t *image, bool free_image, int enabled_top_k, dl_conv_mode mode);
-
-#ifdef __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/lib/include/mtmn.h
+++ b/tools/sdk/esp32/include/esp-face/lib/include/mtmn.h
@ -1,142 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "dl_lib_matrix3d.h"
-#include "dl_lib_matrix3dq.h"
-
-    /**
-     * @brief Detection results produced by the MTMN networks.
-     *
-     */
-    typedef struct
-    {
-        dl_matrix3d_t *category;    /*!< Classification result after softmax, channel is 2 */
-        dl_matrix3d_t *offset;      /*!< Bounding box offsets of 2 points: top-left and bottom-right, channel is 4 */
-        dl_matrix3d_t *landmark;    /*!< Offsets of 5 landmarks:
-                                     * - Left eye
-                                     * - Left side of the mouth
-                                     * - Nose
-                                     * - Right eye
-                                     * - Right side of the mouth
-                                     *
-                                     * channel is 10
-                                     * */
-    } mtmn_net_t;
-
-
-    /**
-     * @brief Free a mtmn_net_t
-     *
-     * @param p         Pointer to the mtmn_net_t to free
-     *
-     */
-
-    void mtmn_net_t_free(mtmn_net_t *p);
-
-    /**
-     * @brief Run the forward pass of pnet (coarse detection). Calculated in float.
-     *
-     * @param in        Image matrix, rgb888 format, size is 320x240
-     * @return          Scores for every pixel, and the corresponding box offsets.
-     */
-    mtmn_net_t *pnet_lite_f(dl_matrix3du_t *in);
-
-    /**
-     * @brief Run the forward pass of rnet, which refines the boxes from pnet. Calculated in float.
-     *
-     * @param in        Image matrix, rgb888 format
-     * @param threshold Score threshold to detect human face
-     * @return          Scores for every box, and the corresponding box offsets.
-     */
-    mtmn_net_t *rnet_lite_f_with_score_verify(dl_matrix3du_t *in, float threshold);
-
-    /**
-     * @brief Run the forward pass of onet, which refines the boxes from rnet. Calculated in float.
-     *
-     * @param in        Image matrix, rgb888 format
-     * @param threshold Score threshold to detect human face
-     * @return          Scores for every box, and the corresponding box offsets and landmarks.
-     */
-    mtmn_net_t *onet_lite_f_with_score_verify(dl_matrix3du_t *in, float threshold);
-
-    /**
-     * @brief Run the forward pass of pnet (coarse detection). Calculated in quantization.
-     *
-     * @param in        Image matrix, rgb888 format, size is 320x240
-     * @param mode      0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return          Scores for every pixel, and the corresponding box offsets.
-     */
-    mtmn_net_t *pnet_lite_q(dl_matrix3du_t *in, dl_conv_mode mode);
-
-    /**
-     * @brief Run the forward pass of rnet, which refines the boxes from pnet. Calculated in quantization.
-     *
-     * @param in        Image matrix, rgb888 format
-     * @param threshold Score threshold to detect human face
-     * @param mode      0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return          Scores for every box, and the corresponding box offsets.
-     */
-    mtmn_net_t *rnet_lite_q_with_score_verify(dl_matrix3du_t *in, float threshold, dl_conv_mode mode);
-
-    /**
-     * @brief Run the forward pass of onet, which refines the boxes from rnet. Calculated in quantization.
-     *
-     * @param in        Image matrix, rgb888 format
-     * @param threshold Score threshold to detect human face
-     * @param mode      0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return          Scores for every box, and the corresponding box offsets and landmarks.
-     */
-    mtmn_net_t *onet_lite_q_with_score_verify(dl_matrix3du_t *in, float threshold, dl_conv_mode mode);
-
-    /**
-     * @brief Run the forward pass of pnet (coarse detection), "heavy" variant per the function name. Calculated in quantization.
-     *
-     * @param in        Image matrix, rgb888 format, size is 320x240
-     * @param mode      0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return          Scores for every pixel, and the corresponding box offsets.
-     */
-    mtmn_net_t *pnet_heavy_q(dl_matrix3du_t *in, dl_conv_mode mode);
-
-    /**
-     * @brief Run the forward pass of rnet ("heavy" variant per the function name), which refines the boxes from pnet. Calculated in quantization.
-     *
-     * @param in        Image matrix, rgb888 format
-     * @param threshold Score threshold to detect human face
-     * @param mode      0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return          Scores for every box, and the corresponding box offsets.
-     */
-    mtmn_net_t *rnet_heavy_q_with_score_verify(dl_matrix3du_t *in, float threshold, dl_conv_mode mode);
-
-    /**
-     * @brief Run the forward pass of onet ("heavy" variant per the function name), which refines the boxes from rnet. Calculated in quantization.
-     *
-     * @param in        Image matrix, rgb888 format
-     * @param threshold Score threshold to detect human face
-     * @param mode      0: C implementation; 1: hand-written Xtensa instruction implementation
-     * @return          Scores for every box, and the corresponding box offsets and landmarks.
-     */
-    mtmn_net_t *onet_heavy_q_with_score_verify(dl_matrix3du_t *in, float threshold, dl_conv_mode mode);
-
-#ifdef __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/object_detection/include/object_detection.h
+++ b/tools/sdk/esp32/include/esp-face/object_detection/include/object_detection.h
@ -1,59 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "image_util.h"
-#include "detection.h"
-// Include models
-#include "cat_face_3.h"
-
-    /**
-     * @brief Update the hyperparameters of a detection model
-     *
-     * @param model             The detection model
-     * @param resize_scale      The resize scale of the input image
-     * @param score_threshold   Score threshold, used to filter candidates by score
-     * @param nms_threshold     NMS threshold, used to filter out overlapping boxes
-     * @param image_height      Input image height
-     * @param image_width       Input image width
-     */
-    void update_detection_model(detection_model_t *model, fptp_t resize_scale, fptp_t score_threshold, fptp_t nms_threshold, int image_height, int image_width);
-
-    /**
-     * @brief Detect objects in an image with the given detection model
-     *
-     * @param image             The input image
-     * @param model             A 'detection_model_t' pointer to the detection model
-     * @return box_array_t*     The detection result with boxes and the corresponding scores and categories
-     */
-    box_array_t *detect_object(dl_matrix3du_t *image, detection_model_t *model);
-
-#if __cplusplus
-}
-#endif
--- a/tools/sdk/esp32/include/esp-face/pose_estimation/include/pe_forward.h
+++ b/tools/sdk/esp32/include/esp-face/pose_estimation/include/pe_forward.h
@ -1,153 +0,0 @@
-/*
-  * ESPRESSIF MIT License
-  *
-  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
-  *
-  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
-  * it is free of charge, to any person obtaining a copy of this software and associated
-  * documentation files (the "Software"), to deal in the Software without restriction, including
-  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
-  * to do so, subject to the following conditions:
-  *
-  * The above copyright notice and this permission notice shall be included in all copies or
-  * substantial portions of the Software.
-  *
-  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-  *
-  */
-#pragma once
-
-#if __cplusplus
-extern "C"
-{
-#endif
-
-#include "image_util.h"
-#include "dl_lib_matrix3d.h"
-#include "hd_model.h"
-#include "hp_model.h"
-
-#define INPUT_EXPONENT -10 /* NOTE(review): presumably the fixed-point exponent of the quantized input -- confirm */
-#define SCORE_THRESHOLD 0.5 /* Default score_threshold assigned by hd_init_config() */
-#define NMS_THRESHOLD 0.45 /* Default nms_threshold assigned by hd_init_config() */
-
-#if CONFIG_HD_LITE1 /* NOTE(review): both branches define the same HP_TARGET_SIZE value */
-  #define HP_TARGET_SIZE 128 
-#else
-  #define HP_TARGET_SIZE 128
-#endif
-
-    typedef struct
-    {
-        int target_size;          /*!< The input size of the hand detection network */
-        fptp_t score_threshold;   /*!< Score threshold, used to filter candidates by score */
-        fptp_t nms_threshold;     /*!< NMS threshold, used to filter out overlapping boxes */
-    } hd_config_t;
-
-    /**
-     * @brief Get the default hand detection network configuration
-     *
-     * The defaults are target_size = 96, score_threshold = SCORE_THRESHOLD and
-     * nms_threshold = NMS_THRESHOLD.
-     *
-     * @return hd_config_t The default configuration, returned by value
-     */
-    static inline hd_config_t hd_init_config()
-    {
-        hd_config_t hd_config;
-        hd_config.target_size = 96;
-        hd_config.score_threshold = SCORE_THRESHOLD;
-        hd_config.nms_threshold = NMS_THRESHOLD;
-        return hd_config;
-    }
-    
-    typedef struct tag_od_box_list
-    {
-        fptp_t *score;         /*!< The confidence scores of the classes corresponding to the boxes */
-        qtp_t *cls;            /*!< The classes corresponding to the boxes */
-        box_t *box;            /*!< (x1, y1, x2, y2) of the boxes */
-        int len;               /*!< The number of boxes */
-    } od_box_array_t;
-
-    typedef struct tag_od_image_box
-    {
-        struct tag_od_image_box *next;     /*!< Next od_image_box_t in the list */
-        fptp_t score;                      /*!< The confidence score of the class corresponding to the box */
-        qtp_t cls;                         /*!< The class corresponding to the box */
-        box_t box;                         /*!< (x1, y1, x2, y2) of the box */
-    } od_image_box_t;
-
-    typedef struct tag_od_image_list
-    {
-        od_image_box_t *head;              /*!< The current head of the od_image_list */
-        od_image_box_t *origin_head;       /*!< The original head of the od_image_list */
-        int len;                           /*!< Length of the od_image_list */
-    } od_image_list_t;
-
-    /**
-     * @brief Sort the resulting box lists by their confidence score.
-     *
-     * @param image_sorted_list      The sorted box list.
-     * @param insert_list            The box list that has not been sorted yet.
-     */
-    void od_image_sort_insert_by_score(od_image_list_t *image_sorted_list, const od_image_list_t *insert_list);
-    
-    /**
-     * @brief Filter out the resulting boxes whose confidence score is lower than the threshold, and convert
-     *        the remaining boxes to the actual boxes on the original image ((x, y, w, h) -> (x1, y1, x2, y2)).
-     *
-     * @param score                Confidence score of the boxes.
-     * @param cls                  Class of the boxes.
-     * @param boxes                (x, y, w, h) of the boxes. x and y are the center coordinates.
-     * @param height               Height of the detection output feature map.
-     * @param width                Width of the detection output feature map.
-     * @param anchor_number        Anchor number of the detection output feature map.
-     * @param score_threshold      Threshold of the confidence score.
-     * @param resize_scale         Resize scale: target_size/original_size.
-     * @param padding_w            Width padding in preprocess.
-     * @param padding_h            Height padding in preprocess.
-     * @return od_image_list_t*    Resulting valid boxes.
-     */
-    od_image_list_t *od_image_get_valid_boxes(fptp_t *score,
-                                    fptp_t *cls,
-                                    fptp_t *boxes,
-                                    int height,
-                                    int width,
-                                    int anchor_number,
-                                    fptp_t score_threshold,
-                                    fptp_t resize_scale,
-                                    int padding_w,
-                                    int padding_h);
-    
-    /**
-     * @brief Run the NMS algorithm on a box list
-     *
-     * @param image_list        The input box list
-     * @param nms_threshold     NMS threshold
-     */
-    void od_image_nms_process(od_image_list_t *image_list, fptp_t nms_threshold);
-
-    /**
-     * @brief Do hand detection, return box information.
-     *
-     * @param image              Image matrix, rgb888 format
-     * @param hd_config          Configuration of hand detection
-     * @return od_box_array_t*   A list of boxes, scores and classes.
-     */
-    od_box_array_t *hand_detection_forward(dl_matrix3du_t *image, hd_config_t hd_config);
-
-    /**
-     * @brief Do hand pose estimation, return 21 landmarks for each hand.
-     *
-     * @param image              Image matrix, rgb888 format
-     * @param od_boxes           The output of the hand detection network
-     * @param target_size        The input size of the hand pose estimation network
-     * @return dl_matrix3d_t*    The coordinates of 21 landmarks on the input image for each hand, size (n, 1, 21, 2)
-     */
-    dl_matrix3d_t *handpose_estimation_forward(dl_matrix3du_t *image, od_box_array_t *od_boxes, int target_size);
-
-#if __cplusplus
-}
-#endif