IDF master b86fe0c66c

This commit is contained in:
me-no-dev
2021-10-13 18:21:12 +00:00
parent 2fb2ef54ce
commit 34c81be93b
538 changed files with 17119 additions and 4346 deletions

View File

@ -10,7 +10,7 @@
#define DL_LOG_LAYER_LATENCY 0 /*<! - 1: print the latency of each parts of layer */
/*<! - 0: mute */
#if CONFIG_SPIRAM_SUPPORT || CONFIG_ESP32_SPIRAM_SUPPORT || CONFIG_ESP32S3_SPIRAM_SUPPORT
#if CONFIG_SPIRAM_SUPPORT || CONFIG_ESP32_SPIRAM_SUPPORT || CONFIG_ESP32S2_SPIRAM_SUPPORT || CONFIG_ESP32S3_SPIRAM_SUPPORT
#define DL_SPIRAM_SUPPORT 1
#else
#define DL_SPIRAM_SUPPORT 0
@ -83,8 +83,17 @@ namespace dl
typedef enum
{
PADDING_VALID, /*<! no padding >*/
PADDING_SAME, /*<! SAME in TensorFlow style >*/
PADDING_SAME_MXNET /*<! SAME in MXNET style >*/
PADDING_NOT_SET,
PADDING_VALID, /*<! no padding >*/
PADDING_SAME_BEGIN, /*<! SAME in MXNET style >*/
PADDING_SAME_END, /*<! SAME in TensorFlow style >*/
} padding_type_t;
} // namespace dl
typedef enum
{
CONSTANT,
EDGE,
REFLECT,
SYMMETRIC,
} padding_mode_t;
} // namespace dl

View File

@ -370,11 +370,70 @@ namespace dl
*/
uint32_t get_moving_point_number(uint8_t *f1, uint8_t *f2, const uint32_t height, const uint32_t width, const uint32_t stride, const uint32_t threshold = 5);
/**
* @brief Apply an affine transformation to an image.
*
* @tparam T
* @param input the input image.
* @param output the output image.
* @param M_inv the inverse transformation matrix.
*/
template <typename T>
void warp_affine(dl::Tensor<T> *input, dl::Tensor<T> *output, dl::math::Matrix<float> *M_inv);
/**
* @brief Apply an affine transformation to an image.
*
* @tparam T
* @param input the pointer of the input image.
* @param shape the shape of the input image.
* @param output the output image.
* @param M_inv the inverse transformation matrix.
*/
template <typename T>
void warp_affine(uint16_t *input, std::vector<int> shape, dl::Tensor<T> *output, dl::math::Matrix<float> *M_inv);
/**
* @brief Get the otsu thresh object.
*
* @param image the gray image.
* @return uint8_t the otsu thresh.
*/
uint8_t get_otsu_thresh(Tensor<uint8_t> &image);
/**
* @brief Convert RGB image to gray image
*
* @param image input image
* @param bgr true: the image is in BGR format
* false: the image is in RGB format
* @return Tensor<uint8_t>* output image in gray format
*/
Tensor<uint8_t> *rgb2gray(Tensor<uint8_t> &image, bool bgr = false);
/**
* @brief Convert RGB image to LAB image
*
* @param image input image
* @param bgr true: the image is in BGR format
* false: the image is in RGB format
* @param fast true: use the fast algorithm but the accuracy will be reduced
* false: do not use the fast algorithm
* @return Tensor<uint8_t>* output image in LAB format
*/
Tensor<uint8_t> *rgb2lab(Tensor<uint8_t> &image, bool bgr = false, bool fast = true);
/**
* @brief Convert RGB image to HSV image
*
* @param image input image
* @param bgr true: the image is in BGR format
* false: the image is in RGB format
* @param fast true: use the fast algorithm but the accuracy will be reduced
* false: do not use the fast algorithm
* @return Tensor<uint8_t>* output image in HSV format
*/
Tensor<uint8_t> *rgb2hsv(Tensor<uint8_t> &image, bool bgr = false, bool fast = true);
} // namespace image
} // namespace dl

View File

@ -25,7 +25,8 @@ namespace dl
const int output_exponent; /*<! exponent of output >*/
Tensor<feature_t> *output; /*<! output ptr of add2d >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of add2d >*/
public:
/**
@ -35,19 +36,21 @@ namespace dl
* @param activation activation of add2d, if you don't specify anything, no activation is applied
* @param name name of add2d
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
Add2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = NULL, bool inplace = false) : Layer(name), activation(activation), output_exponent(output_exponent), output(NULL)
{
this->inplace = inplace;
}
Add2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = "Add2D", bool inplace = false) : Layer(name),
activation(activation),
output_exponent(output_exponent),
output(NULL),
inplace(inplace),
output_shape({}) {}
/**
* @brief Destroy the Add2D object
*/
~Add2D()
{
if((!this->inplace) && (this->output != NULL))
if ((!this->inplace) && (this->output != NULL))
{
delete this->output;
}
@ -59,10 +62,12 @@ namespace dl
*
* @param input0 as one input
* @param input1 as another input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1, bool print_shape = false)
{
assert(input0.is_same_shape(input1));
this->output_shape = input0.shape;
if (!this->inplace)
{
@ -78,6 +83,11 @@ namespace dl
{
this->output = &input0;
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -105,7 +115,11 @@ namespace dl
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -116,6 +130,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::add2d(*this->output, input0, input1, this->activation, assign_core, this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "add2d");
}

View File

@ -24,23 +24,26 @@ namespace dl
std::vector<int> filter_shape; /*<! filter shape in [filter_height, filter_width] >*/
const int stride_y; /*<! stride in height >*/
const int stride_x; /*<! stride in width >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN >*/
std::vector<int> padding; /*<! padding size needed in [top, bottom, left, right] of this operation >*/
Tensor<feature_t> *output; /*<! output ptr of AvgPool2D >*/
Tensor<feature_t> *output; /*<! output ptr of AvgPool2D >*/
std::vector<int> output_shape; /*<! output shape of AvgPool2D >*/
public:
/**
* @brief Construct a new AvgPool2D object.
*
* @param output_exponent exponent of output
* @param filter_shape filter shape in [filter_height, filter_width]
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN or PADDING_NOT_SET,
* - PADDING_VALID means no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* - PADDING_NOT_SET means padding with the specific "padding" value below.
* @param padding if padding_type is PADDING_NOT_SET, this value will be used as padding size.
* the shape must be 4, the value of each position is: [padding top, padding bottom, padding left, padding right]
* @param stride_y stride in height
* @param stride_x stride in width
* @param name name of layer
@ -48,16 +51,23 @@ namespace dl
AvgPool2D(const int output_exponent,
const std::vector<int> filter_shape,
const padding_type_t padding_type = PADDING_VALID,
std::vector<int> padding = {},
const int stride_y = 1,
const int stride_x = 1,
const char *name = NULL) : Layer(name),
output_exponent(output_exponent),
filter_shape(filter_shape),
stride_y(stride_y),
stride_x(stride_x),
padding_type(padding_type)
const char *name = "AvgPool2D") : Layer(name),
output_exponent(output_exponent),
filter_shape(filter_shape),
padding_type(padding_type),
padding(padding),
stride_y(stride_y),
stride_x(stride_x),
output_shape({})
{
this->output = new Tensor<feature_t>;
if (this->padding_type == PADDING_NOT_SET)
{
assert(this->padding.size() == 4);
}
}
/**
@ -66,7 +76,7 @@ namespace dl
*/
~AvgPool2D()
{
if(this->output != NULL)
if (this->output != NULL)
{
delete this->output;
}
@ -76,20 +86,31 @@ namespace dl
* @brief Update output shape and padding.
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
assert(input.shape[0] > 0);
assert(input.shape[1] > 0);
std::vector<int> output_shape = nn::get_output_shape(input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type);
this->output->set_shape(output_shape);
assert(input.shape.size() == 3);
this->output_shape = nn::get_output_shape(input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type, false, this->padding);
this->output->set_shape(this->output_shape);
this->output->set_exponent(this->output_exponent);
this->padding = nn::get_pad_size(output_shape, input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type);
input.set_padding_size(this->padding);
this->output->free_element();
}
if (this->padding_type != PADDING_NOT_SET)
{
this->padding = nn::get_pad_size(this->output_shape, input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type);
}
this->output->free_element();
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
* @brief Get the output
@ -108,7 +129,6 @@ namespace dl
* @param autoload_enable one of true or false,
* - true: load input and output from PSRAM to CACHE automatically
* - false: do not
* @param assign_core not effective yet
* @return AvgPool2D result
*/
Tensor<feature_t> &call(Tensor<feature_t> &input, uint8_t autoload_enable = 0)
@ -116,7 +136,11 @@ namespace dl
DL_LOG_LAYER_LATENCY_INIT();
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");

View File

@ -1,6 +1,7 @@
#pragma once
#include "dl_tool.hpp"
#include "dl_tool_cache.hpp"
#include <iostream>
namespace dl
{

View File

@ -0,0 +1,139 @@
#pragma once
#include <assert.h>
#include <vector>
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_tool.hpp"
#include "dl_layer_base.hpp"
#include "dl_nn_concat.hpp"
namespace dl
{
namespace layer
{
/**
 * @brief Concat(input1, input2, input3, ...): join Tensors along one axis.
 *
 * @tparam feature_t support all kinds of integer and float data type
 */
template <typename feature_t>
class Concat : Layer
{
private:
    int output_exponent;           /*<! exponent of output >*/
    int axis;                      /*<! The axis along which the Tensor will be concatenated. >*/
    Tensor<feature_t> *output;     /*<! output ptr of Concat >*/
    std::vector<int> output_shape; /*<! output shape of Concat >*/

public:
    /**
     * @brief Construct a new Concat object.
     *
     * @param axis The axis along which the Tensor will be concatenated.
     * @param name name of layer
     */
    Concat(int axis, const char *name = "Concat") : Layer(name), axis(axis), output_shape({})
    {
        this->output = new Tensor<feature_t>;
    }

    /**
     * @brief Destroy the Concat object
     */
    ~Concat()
    {
        if (this->output != NULL)
        {
            delete this->output;
        }
    }

    /**
     * @brief Collect inputs' channel and memory offset, called in Model.build().
     *
     * @param args        pointers of concatenated Tensor
     * @param print_shape whether to print the output shape.
     */
    void build(std::vector<Tensor<feature_t> *> args, bool print_shape = false)
    {
        assert(args.size() > 1);

        const int rank = args[0]->shape.size();
        // Support negative indexing, e.g. -1 addresses the last dimension.
        if (this->axis < 0)
        {
            this->axis += rank;
        }
        assert((this->axis < rank) && (this->axis > -1));

        // Sum the lengths along the concat axis while verifying that every
        // other dimension (and the quantization exponent) agrees pairwise.
        int concat_axis_len = args[0]->shape[this->axis];
        for (int i = 1; i < args.size(); i++)
        {
            assert(rank == args[i]->shape.size());
            assert(args[i]->exponent == args[i - 1]->exponent);
            concat_axis_len += args[i]->shape[this->axis];
            for (int j = 0; j < rank; j++)
            {
                if (j != this->axis)
                {
                    assert(args[i]->shape[j] == args[i - 1]->shape[j]);
                }
            }
        }

        this->output_exponent = args[0]->exponent;
        this->output_shape = args[0]->shape;
        this->output_shape[this->axis] = concat_axis_len;

        this->output->set_shape(this->output_shape);
        this->output->set_exponent(this->output_exponent);
        this->output->free_element();

        if (print_shape)
        {
            std::cout << this->name << " | ";
            this->output->print_shape();
        }
    }

    /**
     * @brief Call Concat operation
     *
     * @param inputs      the pointers of inputs
     * @param free_inputs true: free the inputs after call
     *                    false: do not free inputs
     * @return Tensor<feature_t>& concat result
     */
    Tensor<feature_t> &call(std::vector<Tensor<feature_t> *> inputs, bool free_inputs = false)
    {
        DL_LOG_LAYER_LATENCY_INIT();

        DL_LOG_LAYER_LATENCY_START();
        // Restore the shape computed in build() in case the output Tensor
        // was reshaped by an intermediate operation.
        if (this->output->shape != this->output_shape)
        {
            this->output->set_shape(this->output_shape);
        }
        this->output->malloc_element();
        this->output->set_exponent(this->output_exponent);
        DL_LOG_LAYER_LATENCY_END(this->name, "apply");

        DL_LOG_LAYER_LATENCY_START();
        nn::concat(*this->output, inputs, this->axis, free_inputs);
        DL_LOG_LAYER_LATENCY_END(this->name, "concat");

        return *this->output;
    }

    /**
     * @brief Get the output
     *
     * @return Tensor<feature_t>& Concat result
     */
    Tensor<feature_t> &get_output()
    {
        return *this->output;
    }
};
} // namespace layer
} // namespace dl

View File

@ -13,8 +13,11 @@ namespace dl
* @tparam feature_t supports int16_t and int8_t,
* - int16_t: stands for operation in int16_t quantize
* - int8_t: stands for operation in int8_t quantize
* @tparam bias_t supports int16_t and int8_t, must specify when using int8 per-channel quantization
* - int16_t: for int16 quantization and int8 per-channel quantization
* - int8_t: for int8 per-tensor quantization
*/
template <typename feature_t>
template <typename feature_t, typename bias_t = feature_t>
class Conv2D : public Layer
{
private:
@ -22,14 +25,14 @@ namespace dl
const Filter<feature_t> *filter; /*<! filter of Conv2D >*/
const int stride_y; /*<! stride in height >*/
const int stride_x; /*<! stride in width >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET >*/
const Bias<feature_t> *bias; /*<! bias of Conv2D, if you don't specify anything, no bias is added >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN >*/
const Bias<bias_t> *bias; /*<! bias of Conv2D, if you don't specify anything, no bias is added >*/
const Activation<feature_t> *activation; /*<! activation of Conv2D, if you don't specify anything, no activation is applied >*/
std::vector<int> padding; /*<! padding size needed in [top, bottom, left, right] of this operation >*/
Tensor<feature_t> *output; /*<! output ptr of Conv2D >*/
Tensor<feature_t> *output; /*<! output ptr of Conv2D >*/
std::vector<int> output_shape; /*<! output shape of Conv2D >*/
public:
/**
* @brief Construct a new Conv2D object.
*
@ -37,33 +40,43 @@ namespace dl
* @param filter filter of Conv2D
* @param bias bias of Conv2D, if you don't specify anything, no bias is added
* @param activation activation of Conv2D, if you don't specify anything, no activation is applied
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN or PADDING_NOT_SET,
* - PADDING_VALID means no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* - PADDING_NOT_SET means padding with the specific "padding" value below.
* @param padding if padding_type is PADDING_NOT_SET, this value will be used as padding size.
* the shape must be 4, the value of each position is: [padding top, padding bottom, padding left, padding right]
* @param stride_y stride in height
* @param stride_x stride in width
* @param name name of layer
*/
Conv2D(const int output_exponent,
const Filter<feature_t> *filter,
const Bias<feature_t> *bias = NULL,
const Bias<bias_t> *bias = NULL,
const Activation<feature_t> *activation = NULL,
const padding_type_t padding_type = PADDING_VALID,
std::vector<int> padding = {},
const int stride_y = 1,
const int stride_x = 1,
const char *name = NULL) : Layer(name),
output_exponent(output_exponent),
filter(filter),
stride_y(stride_y),
stride_x(stride_x),
padding_type(padding_type),
bias(bias),
activation(activation)
const char *name = "Conv2D") : Layer(name),
output_exponent(output_exponent),
filter(filter),
stride_y(stride_y),
stride_x(stride_x),
padding_type(padding_type),
bias(bias),
activation(activation),
padding(padding),
output_shape({})
{
this->output = new Tensor<feature_t>;
if (this->padding_type == PADDING_NOT_SET)
{
assert(this->padding.size() == 4);
}
}
/**
@ -82,19 +95,30 @@ namespace dl
* @brief Update output padding and input padding.
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
assert(input.shape[0] > 0);
assert(input.shape[1] > 0);
assert(input.shape.size() == 3);
assert(this->filter->shape.size() == 4);
assert(input.shape[2] == this->filter->shape[2]);
std::vector<int> output_shape = nn::get_output_shape(input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type, true);
this->output->set_shape(output_shape);
this->output_shape = nn::get_output_shape(input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type, true, this->padding);
this->output->set_shape(this->output_shape);
this->output->set_exponent(this->output_exponent);
this->output->free_element();
if (this->padding_type != PADDING_NOT_SET)
{
this->padding = nn::get_pad_size(this->output_shape, input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type);
}
this->padding = nn::get_pad_size(output_shape, input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type);
input.set_padding_size(this->padding);
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -122,7 +146,11 @@ namespace dl
DL_LOG_LAYER_LATENCY_INIT();
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -153,5 +181,6 @@ namespace dl
dl::tool::cache::preload_func((uint32_t)(this->filter->element), size);
}
};
} // namespace layer
} // namespace dl

View File

@ -13,8 +13,11 @@ namespace dl
* @tparam feature_t supports int16_t and int8_t,
* - int16_t: stands for operation in int16_t quantize
* - int8_t: stands for operation in int8_t quantize
* @tparam bias_t supports int16_t and int8_t, must specify when using int8 per-channel quantization
* - int16_t: for int16 quantization and int8 per-channel quantization
* - int8_t: for int8 per-tensor quantization
*/
template <typename feature_t>
template <typename feature_t, typename bias_t = feature_t>
class DepthwiseConv2D : public Layer
{
private:
@ -22,14 +25,14 @@ namespace dl
const Filter<feature_t> *filter; /*<! filter of DepthwiseConv2D >*/
const int stride_y; /*<! stride in height >*/
const int stride_x; /*<! stride in width >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET >*/
const Bias<feature_t> *bias; /*<! bias of DepthwiseConv2D, if you don't specify anything, no bias is added >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN >*/
const Bias<bias_t> *bias; /*<! bias of DepthwiseConv2D, if you don't specify anything, no bias is added >*/
const Activation<feature_t> *activation; /*<! activation of DepthwiseConv2D, if you don't specify anything, no activation is applied >*/
std::vector<int> padding; /*<! padding size needed in [top, bottom, left, right] of this operation >*/
Tensor<feature_t> *output; /*<! output ptr of DepthwiseConv2D >*/
std::vector<int> output_shape; /*<! output shape of DepthwiseConv2D >*/
public:
/**
* @brief Construct a new DepthwiseConv2D object.
*
@ -37,40 +40,50 @@ namespace dl
* @param filter filter of DepthwiseConv2D
* @param bias bias of DepthwiseConv2D, if you don't specify anything, no bias is added
* @param activation activation of DepthwiseConv2D, if you don't specify anything, no activation is applied
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN or PADDING_NOT_SET,
* - PADDING_VALID means no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* - PADDING_NOT_SET means padding with the specific "padding" value below.
* @param padding if padding_type is PADDING_NOT_SET, this value will be used as padding size.
* the shape must be 4, the value of each position is: [padding top, padding bottom, padding left, padding right]
* @param stride_y - stride in height
* @param stride_x - stride in width
* @param name name of layer
*/
DepthwiseConv2D(const int output_exponent,
const Filter<feature_t> *filter,
const Bias<feature_t> *bias = NULL,
const Bias<bias_t> *bias = NULL,
const Activation<feature_t> *activation = NULL,
const padding_type_t padding_type = PADDING_VALID,
std::vector<int> padding = {},
const int stride_y = 1,
const int stride_x = 1,
const char *name = NULL) : Layer(name),
output_exponent(output_exponent),
filter(filter),
stride_y(stride_y),
stride_x(stride_x),
padding_type(padding_type),
bias(bias),
activation(activation)
const char *name = "DepthwiseConv2D") : Layer(name),
output_exponent(output_exponent),
filter(filter),
stride_y(stride_y),
stride_x(stride_x),
padding_type(padding_type),
bias(bias),
activation(activation),
padding(padding),
output_shape({})
{
this->output = new Tensor<feature_t>;
if (this->padding_type == PADDING_NOT_SET)
{
assert(this->padding.size() == 4);
}
}
/**
* @brief Destroy the DepthwiseConv2D object.
*
*/
~DepthwiseConv2D()
~DepthwiseConv2D()
{
if (this->output != NULL)
{
@ -82,19 +95,31 @@ namespace dl
* @brief Update output shape and padding.
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
assert(input.shape[0] > 0);
assert(input.shape[1] > 0);
assert(input.shape.size() == 3);
assert(this->filter->shape.size() == 4);
assert(input.shape[2] == this->filter->shape[2]);
std::vector<int> output_shape = nn::get_output_shape(input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type);
this->output->set_shape(output_shape);
this->output_shape = nn::get_output_shape(input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type, false, this->padding);
this->output->set_shape(this->output_shape);
this->output->set_exponent(this->output_exponent);
this->padding = nn::get_pad_size(output_shape, input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type);
input.set_padding_size(this->padding);
if (this->padding_type != PADDING_NOT_SET)
{
this->padding = nn::get_pad_size(this->output_shape, input.shape, this->filter->shape_with_dilation, this->stride_y, this->stride_x, this->padding_type);
}
this->output->free_element();
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -122,7 +147,12 @@ namespace dl
DL_LOG_LAYER_LATENCY_INIT();
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");

View File

@ -0,0 +1,128 @@
#pragma once
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_tool.hpp"
#include "dl_layer_base.hpp"
namespace dl
{
namespace layer
{
/**
* @brief
*
* @tparam feature_t
*/
template <typename feature_t>
class ExpandDims : public Layer
{
private:
std::vector<int> output_shape; /*<! output shape of ExpandDims >*/
std::vector<int> axis; /*<! position where the new axis is placed. >*/
Tensor<feature_t> *output; /*<! output ptr of ExpandDims >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
public:
int output_exponent;
/**
* @brief Construct a new ExpandDims object
*
* @param axis position where the new axis is placed.
* @param name name of layer
* @param inplace true: the output will store to input
* false: the output will store to a separate memory
*/
ExpandDims(std::vector<int> axis, const char *name = "ExpandDims", bool inplace = false) : Layer(name),
axis(axis), inplace(inplace), output_shape({})
{
}
/**
* @brief Destroy the ExpandDims object
*
*/
~ExpandDims()
{
if ((!this->inplace) && (this->output != NULL))
{
delete this->output;
}
}
/**
* @brief Update output shape.
*
* @param input as an input.
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input, bool print_shape = false)
{
this->output_exponent = input.exponent;
if (!this->inplace)
{
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(this->output_shape);
this->output->expand_dims(this->axis);
this->output->free_element();
}
else
{
this->output = &input;
this->output->set_shape(this->output_shape);
this->output->expand_dims(this->axis);
}
this->output_shape = this->output->shape;
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
* @brief Get the output
*
* @return Tensor<feature_t>& ExpandDims result
*/
Tensor<feature_t> &get_output()
{
return *this->output;
}
/**
* @brief call ExpandDims opeartion
*
* @param input
* @return Tensor<feature_t>& ExpandDims result
*/
Tensor<feature_t> &call(Tensor<feature_t> &input)
{
DL_LOG_LAYER_LATENCY_INIT();
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->set_exponent(input.exponent);
this->output->set_shape(this->output_shape);
this->output->copy_element(input, true);
DL_LOG_LAYER_LATENCY_END(this->name, "ExpandDims");
}
else
{
DL_LOG_LAYER_LATENCY_START();
this->output->set_shape(this->output_shape);
DL_LOG_LAYER_LATENCY_END(this->name, "ExpandDims");
}
return *this->output;
}
};
} // namespace layer
} // namespace dl

View File

@ -0,0 +1,120 @@
#pragma once
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_tool.hpp"
#include "dl_layer_base.hpp"
namespace dl
{
namespace layer
{
/**
* @brief
*
* @tparam feature_t
*/
template <typename feature_t>
class Flatten : public Layer
{
private:
int output_exponent; /*<! exponent of output >*/
Tensor<feature_t> *output; /*<! output ptr of Flatten >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of Flatten >*/
public:
/**
* @brief Construct a new Flatten object
*
* @param name name of layer
* @param inplace true: the output will store to input0
* false: the output will store to a separate memory
*/
Flatten(const char *name = "Flatten", bool inplace = false) : Layer(name), inplace(inplace), output_shape({})
{}
/**
* @brief Destroy the Flatten object
*
*/
~Flatten()
{
if ((!this->inplace) && (this->output != NULL))
{
delete this->output;
}
}
/**
* @brief Update output shape.
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input, bool print_shape = false)
{
this->output_exponent = input.exponent;
this->output_shape = {input.get_size()};
if (!this->inplace)
{
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
else
{
this->output = &input;
this->output->set_shape(this->output_shape);
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
* @brief Get the output
*
* @return Tensor<feature_t>& Flatten result
*/
Tensor<feature_t> &get_output()
{
return *this->output;
}
/**
* @brief Call Flatten operation.
*
* @param input as an input
* @return Tensor<feature_t>& Flatten result
*/
Tensor<feature_t> &call(Tensor<feature_t> &input)
{
DL_LOG_LAYER_LATENCY_INIT();
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->set_exponent(input.exponent);
this->output->flatten();
this->output->copy_element(input, true);
DL_LOG_LAYER_LATENCY_END(this->name, "flatten");
}
else
{
DL_LOG_LAYER_LATENCY_START();
this->output->flatten();
DL_LOG_LAYER_LATENCY_END(this->name, "flatten");
}
return *this->output;
}
};
} // namespace layer
} // namespace dl

View File

@ -0,0 +1,167 @@
#pragma once
#include "dl_nn_fully_connected.hpp"
#include "dl_layer_base.hpp"
namespace dl
{
namespace layer
{
/**
* @brief Activation(FullyConnected(input, filter) + bias).
*
* @tparam feature_t supports int16_t and int8_t,
* - int16_t: stands for operation in int16_t quantize
* - int8_t: stands for operation in int8_t quantize
* @tparam bias_t supports int16_t and int8_t, must specify when using int8 per-channel quantization
* - int16_t: for int16 quantization and int8 per-channel quantization
* - int8_t: for int8 per-tensor quantization
*/
template <typename feature_t, typename bias_t = feature_t>
class FullyConnected : public Layer
{
private:
    const int output_exponent; /*<! exponent of output >*/
    const bool flatten;        /*<! true: input shape is [x1, x2, ..., xn], filter shape is [1, 1, x1 * x2 * ... * xn, output_dim], output shape is [output_dim]
                                    false: input shape is [x1, x2, ..., xn, input_dim], filter shape is [1, 1, input_dim, output_dim], output shape is [x1, x2, ...., xn, output_dim] >*/
    const Filter<feature_t> *filter;         /*<! filter of FullyConnected >*/
    const Bias<bias_t> *bias;                /*<! bias of FullyConnected, if you don't specify anything, no bias is added >*/
    const Activation<feature_t> *activation; /*<! activation of FullyConnected, if you don't specify anything, no activation is applied >*/
    Tensor<feature_t> *output;               /*<! output ptr of FullyConnected >*/
    std::vector<int> output_shape;           /*<! output shape of FullyConnected >*/

public:
    /**
     * @brief Construct a new FullyConnected object.
     *
     * @param output_exponent exponent of output
     * @param filter          filter of FullyConnected
     * @param bias            bias of FullyConnected, if you don't specify anything, no bias is added
     * @param activation      activation of FullyConnected, if you don't specify anything, no activation is applied
     * @param flatten         true: input shape is [x1, x2, ..., xn], filter shape is [1, 1, x1 * x2 * ... * xn, output_dim], output shape is [output_dim]
     *                        false: input shape is [x1, x2, ..., xn, input_dim], filter shape is [1, 1, input_dim, output_dim], output shape is [x1, x2, ...., xn, output_dim]
     * @param name            name of layer
     */
    FullyConnected(const int output_exponent,
                   const Filter<feature_t> *filter,
                   const Bias<bias_t> *bias = NULL,
                   const Activation<feature_t> *activation = NULL,
                   const bool flatten = true,
                   const char *name = "FullyConnected") : Layer(name),
                                                          output_exponent(output_exponent),
                                                          flatten(flatten),
                                                          filter(filter),
                                                          bias(bias),
                                                          activation(activation),
                                                          output_shape({})
    {
        // the output tensor is owned by this layer and released in the destructor
        this->output = new Tensor<feature_t>;
    }

    /**
     * @brief Destroy the FullyConnected object.
     *
     */
    ~FullyConnected()
    {
        if (this->output != NULL)
        {
            delete this->output;
        }
    }

    /**
     * @brief Update output padding and input padding.
     *
     * @param input       as an input
     * @param print_shape whether to print the output shape.
     */
    void build(Tensor<feature_t> &input, bool print_shape = false)
    {
        // the filter is always stored as [1, 1, input_dim, output_dim]
        assert(this->filter->shape.size() == 4);
        assert(this->filter->shape[0] == 1);
        assert(this->filter->shape[1] == 1);
        if (this->flatten)
        {
            // flatten mode: the whole input is consumed as one vector
            assert(input.get_size() == this->filter->shape[2]);
            this->output_shape = {this->filter->shape[3]};
        }
        else
        {
            // per-last-axis mode: only the innermost dimension is transformed
            assert(input.shape.back() == this->filter->shape[2]);
            this->output_shape = input.shape;
            this->output_shape[this->output_shape.size() - 1] = this->filter->shape[3];
        }
        this->output->set_shape(this->output_shape);
        this->output->set_exponent(this->output_exponent);
        // element buffer is allocated lazily in call(), not here
        this->output->free_element();

        if (print_shape)
        {
            std::cout << this->name << " | ";
            this->output->print_shape();
        }
    }

    /**
     * @brief Get the output
     *
     * @return Tensor<feature_t>& FullyConnected result
     */
    Tensor<feature_t> &get_output()
    {
        return *this->output;
    }

    /**
     * @brief Call FullyConnected operation
     *
     * @param input           as an input.
     * @param autoload_enable one of true or false,
     *                        - true: load input and output from PSRAM to CACHE automatically
     *                        - false: do not
     * @param assign_core     not effective yet
     * @return FullyConnected result
     */
    Tensor<feature_t> &call(Tensor<feature_t> &input, bool autoload_enable = false, const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
    {
        DL_LOG_LAYER_LATENCY_INIT();

        DL_LOG_LAYER_LATENCY_START();
        // re-sync the shape in case the tensor was altered since build()
        if (this->output->shape != this->output_shape)
        {
            this->output->set_shape(this->output_shape);
        }
        this->output->malloc_element();
        this->output->set_exponent(this->output_exponent);
        DL_LOG_LAYER_LATENCY_END(this->name, "apply");

        if (autoload_enable)
        {
            dl::tool::cache::autoload_func((uint32_t)(this->output->element), this->output->get_size() * sizeof(feature_t),
                                           (uint32_t)(input.element), input.get_size() * sizeof(feature_t));
        }

        DL_LOG_LAYER_LATENCY_START();
        nn::fully_connected(*this->output, input, *(this->filter), this->bias, this->activation, this->flatten, assign_core);
        DL_LOG_LAYER_LATENCY_END(this->name, "fully_connected");

        return *this->output;
    }

    /**
     * @brief Preload the filter to Cache.
     *        NOTE: Call this layer's preload() before previous layer's call() such that filter could be loaded while previous layer is doing calculation.
     */
    void preload()
    {
        // total byte count = product of all filter dimensions * element size
        size_t size = sizeof(feature_t);
        int shape_size = this->filter->shape.size();
        for (int i = 0; i < shape_size; ++i)
        {
            size *= filter->shape[i];
        }
        dl::tool::cache::preload_func((uint32_t)(this->filter->element), size);
    }
};
} // namespace layer
} // namespace dl

View File

@ -20,8 +20,9 @@ namespace dl
class GlobalAveragePool2D : public Layer
{
private:
const int output_exponent; /*<! exponent of output >*/
Tensor<feature_t> *output; /*<! output ptr of GlobalAveragePool2D >*/
const int output_exponent; /*<! exponent of output >*/
std::vector<int> output_shape; /*<! output shape of GlobalAveragePool2D >*/
Tensor<feature_t> *output; /*<! output ptr of GlobalAveragePool2D >*/
public:
/**
* @brief Construct a new GlobalAveragePool2D object.
@ -29,8 +30,9 @@ namespace dl
* @param output_exponent exponent of output
* @param name name of layer
*/
GlobalAveragePool2D(const int output_exponent, const char *name = NULL) : Layer(name),
output_exponent(output_exponent)
GlobalAveragePool2D(const int output_exponent, const char *name = "GlobalAveragePool2D") : Layer(name),
output_exponent(output_exponent),
output_shape({})
{
this->output = new Tensor<feature_t>;
@ -52,17 +54,26 @@ namespace dl
* @brief Update output shape.
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
assert(input.shape[0] > 0);
assert(input.shape[1] > 0);
assert(input.shape.size() == 3);
std::vector<int> output_shape(input.shape.size(), 1);
output_shape[2] = input.shape[2];
this->output->set_shape(output_shape);
this->output_shape = output_shape;
this->output->set_shape(this->output_shape);
this->output->set_exponent(this->output_exponent);
this->output->free_element();
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -90,7 +101,11 @@ namespace dl
DL_LOG_LAYER_LATENCY_INIT();
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");

View File

@ -20,15 +20,15 @@ namespace dl
class GlobalMaxPool2D : public Layer
{
private:
Tensor<feature_t> *output; /*<! output ptr of GlobalMaxPool2D >*/
Tensor<feature_t> *output; /*<! output ptr of GlobalMaxPool2D >*/
std::vector<int> output_shape; /*<! output shape of GlobalMaxPool2D >*/
public:
/**
* @brief Construct a new GlobalMaxPool2D object.
*
* @param name name of layer
*/
GlobalMaxPool2D(const char *name = NULL) : Layer(name)
GlobalMaxPool2D(const char *name = "GlobalMaxPool2D") : Layer(name), output_shape({})
{
this->output = new Tensor<feature_t>;
}
@ -49,17 +49,26 @@ namespace dl
* @brief Update output shape and exponent.
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
assert(input.shape[0] > 0);
assert(input.shape[1] > 0);
assert(input.shape.size() == 3);
this->output->set_exponent(input.exponent);
std::vector<int> output_shape(input.shape.size(), 1);
output_shape[2] = input.shape[2];
this->output->set_shape(output_shape);
this->output_shape = output_shape;
this->output->set_shape(this->output_shape);
this->output->free_element();
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -87,7 +96,11 @@ namespace dl
DL_LOG_LAYER_LATENCY_INIT();
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(input.exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");

View File

@ -2,7 +2,7 @@
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_nn_LeakyReLU.hpp"
#include "dl_nn_leakyrelu.hpp"
#include "dl_layer_base.hpp"
namespace dl
@ -20,13 +20,13 @@ namespace dl
class LeakyReLU : public Layer
{
private:
feature_t activation_alpha; /*<! quantized alpha >*/
int activation_exponent; /*<! exponent of quantized alpha >*/
Tensor<feature_t> *output; /*<! output ptr of leakyrelu>*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
feature_t activation_alpha; /*<! quantized alpha >*/
int activation_exponent; /*<! exponent of quantized alpha >*/
Tensor<feature_t> *output; /*<! output ptr of leakyrelu>*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of leakyrelu >*/
public:
/**
* @brief Construct a new LeakyReLU object
*
@ -34,9 +34,9 @@ namespace dl
* @param activation_exponent exponent of quantized alpha
* @param name name of leakyrelu
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
LeakyReLU(const int activation_alpha, const int activation_exponent, const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
LeakyReLU(const int activation_alpha, const int activation_exponent, const char *name = "LeakyReLU", bool inplace = false) : Layer(name), output(NULL), output_shape({})
{
this->activation_alpha = activation_alpha;
this->activation_exponent = activation_exponent;
@ -47,7 +47,7 @@ namespace dl
* @brief Destroy the LeakyReLU object
*
*/
~LeakyReLU()
~LeakyReLU()
{
if ((!this->inplace) && (this->output != NULL))
{
@ -59,24 +59,32 @@ namespace dl
* @brief Update output shape and exponent
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
if(!this->inplace)
this->output_shape = input.shape;
if (!this->inplace)
{
if(this->output != NULL)
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_shape(input.shape);
}
this->output->set_shape(this->output_shape);
this->output->set_exponent(input.exponent);
this->output->free_element();
}
else
{
this->output = &input;
this->output->set_shape(this->output_shape);
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -100,10 +108,14 @@ namespace dl
{
DL_LOG_LAYER_LATENCY_INIT();
if(!this->inplace)
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(input.exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -114,6 +126,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::leakyrelu<true>(*this->output, input, this->activation_alpha, this->activation_exponent, assign_core);
DL_LOG_LAYER_LATENCY_END(this->name, "leakyrelu");
}

View File

@ -22,28 +22,28 @@ namespace dl
class Max2D : public Layer
{
private:
Tensor<feature_t> *output; /*<! output ptr of max2d >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
Tensor<feature_t> *output; /*<! output ptr of max2d >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of max2d >*/
public:
/**
* @brief Construct a new Max2D object.
*
* @param name name of max2d
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
Max2D(const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
Max2D(const char *name = "Max2D", bool inplace = false) : Layer(name),
output(NULL), inplace(inplace), output_shape({})
{
this->inplace = inplace;
}
/**
* @brief Destroy the Max2D object
*
*/
~Max2D()
~Max2D()
{
if ((!this->inplace) && (this->output != NULL))
{
@ -58,24 +58,34 @@ namespace dl
*
* @param input0 as one input
* @param input1 as another input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1, bool print_shape = false)
{
assert(input0.is_same_shape(input1));
assert(input0.exponent == input1.exponent);
this->output_shape = input0.shape;
if(!this->inplace)
if (!this->inplace)
{
if(this->output != NULL)
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(input0.shape);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
else
{
this->output = &input0;
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -100,10 +110,14 @@ namespace dl
{
DL_LOG_LAYER_LATENCY_INIT();
if(!this->inplace)
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(input0.exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -114,6 +128,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::max2d<true>(*this->output, input0, input1, assign_core);
DL_LOG_LAYER_LATENCY_END(this->name, "max2d");
}

View File

@ -23,44 +23,54 @@ namespace dl
std::vector<int> filter_shape; /*<! filter shape in [filter_height, filter_width] >*/
const int stride_y; /*<! stride in height >*/
const int stride_x; /*<! stride in width >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET >*/
const padding_type_t padding_type; /*<! one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN >*/
std::vector<int> padding; /*<! padding size needed in [top, bottom, left, right] of this operation >*/
Tensor<feature_t> *output; /*<! output ptr of MaxPool2D >*/
std::vector<int> output_shape; /*<! output shape of MaxPool2D >*/
public:
/**
* @brief Construct a new MaxPool2D object.
*
* @param filter_shape filter shape in [filter_height, filter_width]
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN or PADDING_NOT_SET,
* - PADDING_VALID means no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* - PADDING_NOT_SET means padding with the specific "padding" value below.
* @param padding if padding_type is PADDING_NOT_SET, this value will be used as padding size.
* the shape must be 4, the value of each position is: [padding top, padding bottom, padding left, padding right]
* @param stride_y stride in height
* @param stride_x stride in width
* @param name name of layer
*/
MaxPool2D(const std::vector<int> filter_shape,
const padding_type_t padding_type = PADDING_VALID,
std::vector<int> padding = {},
const int stride_y = 1,
const int stride_x = 1,
const char *name = NULL) : Layer(name),
filter_shape(filter_shape),
stride_y(stride_y),
stride_x(stride_x),
padding_type(padding_type)
const char *name = "MaxPool2D") : Layer(name),
filter_shape(filter_shape),
padding_type(padding_type),
padding(padding),
stride_y(stride_y),
stride_x(stride_x),
output_shape({})
{
this->output = new Tensor<feature_t>;
if (this->padding_type == PADDING_NOT_SET)
{
assert(this->padding.size() == 4);
}
}
/**
* @brief Destroy the MaxPool2D object.
*
*/
~MaxPool2D()
~MaxPool2D()
{
if (this->output != NULL)
{
@ -72,18 +82,29 @@ namespace dl
* @brief Update output shape and padding.
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
assert(input.shape[0] > 0);
assert(input.shape[1] > 0);
this->output->set_exponent(input.exponent);
std::vector<int> output_shape = nn::get_output_shape(input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type);
this->output->set_shape(output_shape);
assert(input.shape.size() == 3);
this->padding = nn::get_pad_size(output_shape, input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type);
input.set_padding_size(this->padding);
this->output->set_exponent(input.exponent);
this->output_shape = nn::get_output_shape(input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type, false, this->padding);
this->output->set_shape(this->output_shape);
if (this->padding_type != PADDING_NOT_SET)
{
this->padding = nn::get_pad_size(this->output_shape, input.shape, filter_shape, this->stride_y, this->stride_x, this->padding_type);
}
this->output->free_element();
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -111,7 +132,11 @@ namespace dl
DL_LOG_LAYER_LATENCY_INIT();
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(input.exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");

View File

@ -22,28 +22,28 @@ namespace dl
class Min2D : public Layer
{
private:
Tensor<feature_t> *output; /*<! output of ptr min2d>*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
public:
Tensor<feature_t> *output; /*<! output of ptr min2d>*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of min2d >*/
public:
/**
* @brief Construct a new Min2D object
*
* @param name name of min2d
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
Min2D(const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
{
this->inplace = inplace;
}
Min2D(const char *name = "Min2D", bool inplace = false) : Layer(name),
output(NULL),
inplace(inplace),
output_shape({}) {}
/**
* @brief Destroy the Min2D object
*
*/
~Min2D()
~Min2D()
{
if ((!this->inplace) && (this->output != NULL))
{
@ -58,25 +58,34 @@ namespace dl
*
* @param input0 as one input
* @param input1 as another input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1, bool print_shape = false)
{
assert(input0.is_same_shape(input1));
assert(input0.exponent == input1.exponent);
this->output_shape = input0.shape;
if(!this->inplace)
if (!this->inplace)
{
if(this->output != NULL)
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_shape(input0.shape);
this->output->set_shape(this->output_shape);
this->output->set_exponent(input0.exponent);
this->output->free_element();
}
else
{
this->output = &input0;
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -101,10 +110,14 @@ namespace dl
{
DL_LOG_LAYER_LATENCY_INIT();
if(!this->inplace)
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(input0.exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -115,6 +128,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::min2d<true>(*this->output, input0, input1, assign_core);
DL_LOG_LAYER_LATENCY_END(this->name, "min2d");
}

View File

@ -21,14 +21,13 @@ namespace dl
class Mul2D : public Layer
{
private:
const int output_exponent; /*<! exponent of output >*/
const int output_exponent; /*<! exponent of output >*/
const Activation<feature_t> *activation; /*<! activation of Mul2D, if you don't specify anything, no activation is applied >*/
Tensor<feature_t> *output; /*<! output ptr of Mul2D >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
Tensor<feature_t> *output; /*<! output ptr of Mul2D >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of Mul2D >*/
public:
const int output_exponent; /*<! exponent of output >*/
/**
* @brief Construct a new Mul2D object.
*
@ -36,18 +35,24 @@ namespace dl
* @param activation activation of Mul2D, if you don't specify anything, no activation is applied
* @param name name of layer
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
Mul2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = NULL, bool inplace = false) : Layer(name),
output_exponent(output_exponent),activation(activation), output(NULL)
Mul2D(const int output_exponent,
const Activation<feature_t> *activation = NULL,
const char *name = "Mul2D",
bool inplace = false) : Layer(name),
output_exponent(output_exponent),
activation(activation),
output(NULL),
inplace(inplace),
output_shape({})
{
this->inplace = inplace;
}
/**
* @brief Destroy the Multiply2D object.
*/
~Mul2D()
~Mul2D()
{
if ((!this->inplace) && (this->output != NULL))
{
@ -61,24 +66,34 @@ namespace dl
*
* @param input0 as one input
* @param input1 as another input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1, bool print_shape = false)
{
assert(input0.is_same_shape(input1));
this->output_shape = input0.shape;
if (!this->inplace)
{
if(this->output != NULL)
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(input0.shape);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
else
{
this->output = &input0;
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -106,7 +121,11 @@ namespace dl
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -117,6 +136,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::mul2d<true>(*this->output, input0, input1, this->activation, assign_core);
DL_LOG_LAYER_LATENCY_END(this->name, "mul2d");
}

View File

@ -24,9 +24,9 @@ namespace dl
int activation_exponent; /*<! exponent of quantized alpha elements >*/
Tensor<feature_t> *output; /*<! output ptr of prelu >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of prelu >*/
public:
/**
* @brief Construct a new PReLU object
*
@ -34,20 +34,25 @@ namespace dl
* @param activation_exponent exponent of quantized alpha elements
* @param name name of prelu
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
PReLU(const feature_t *activation_element, const int activation_exponent = 0, const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
PReLU(const feature_t *activation_element,
const int activation_exponent = 0,
const char *name = "PReLU",
bool inplace = false) : Layer(name),
activation_element(activation_element),
activation_exponent(activation_exponent),
output(NULL),
inplace(inplace),
output_shape({})
{
this->activation_element = activation_element;
this->activation_exponent = activation_exponent;
this->inplace = inplace;
}
/**
* @brief Destroy the PReLU object
*
*/
~PReLU()
~PReLU()
{
if ((!this->inplace) && (this->output != NULL))
{
@ -59,23 +64,31 @@ namespace dl
* @brief Update output shape and exponent
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
if(!this->inplace)
this->output_shape = input.shape;
if (!this->inplace)
{
if(this->output != NULL)
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(input.exponent);
this->output->set_shape(input.shape);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
else
{
this->output = &input;
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -99,11 +112,15 @@ namespace dl
{
DL_LOG_LAYER_LATENCY_INIT();
if(!this->inplace)
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->set_exponent(input.exponent);
this->output->apply_element();
this->output->malloc_element();
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
DL_LOG_LAYER_LATENCY_START();
@ -113,6 +130,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::prelu(*this->output, input, this->activation_element, this->activation_exponent, assign_core);
DL_LOG_LAYER_LATENCY_END(this->name, "leakyrelu");
}

View File

@ -21,29 +21,28 @@ namespace dl
class ReLU : public Layer
{
private:
Tensor<feature_t> *output; /*<! output ptr of relu >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
Tensor<feature_t> *output; /*<! output ptr of relu >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of relu >*/
public:
/**
* @brief Construct a new ReLU object
*
* @param name name of relu
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
ReLU(const char *name = NULL, bool inplace = false) : Layer(name), output(NULL)
ReLU(const char *name = "ReLU", bool inplace = false) : Layer(name),
output(NULL), inplace(inplace), output_shape({})
{
this->inplace = inplace;
}
/**
* @brief Destroy the ReLU object
*
*/
~ReLU()
~ReLU()
{
if ((!this->inplace) && (this->output != NULL))
{
@ -55,23 +54,31 @@ namespace dl
* @brief Update output shape and exponent
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input)
void build(Tensor<feature_t> &input, bool print_shape = false)
{
if(!this->inplace)
this->output_shape = input.shape;
if (!this->inplace)
{
if(this->output != NULL)
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(input.exponent);
this->output->set_shape(input.shape);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
else
{
this->output = &input;
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -95,10 +102,14 @@ namespace dl
{
DL_LOG_LAYER_LATENCY_INIT();
if(!this->inplace)
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output->malloc_element();
this->output->set_exponent(input.exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -109,6 +120,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::relu(*this->output, input, assign_core);
DL_LOG_LAYER_LATENCY_END(this->name, "relu");
}

View File

@ -0,0 +1,124 @@
#pragma once
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_tool.hpp"
#include "dl_layer_base.hpp"
namespace dl
{
namespace layer
{
/**
* @brief Reshape(input)
*
* @tparam feature_t supports int16_t and int8_t,
* - int16_t: stands for operation in int16_t quantize
* - int8_t: stands for operation in int8_t quantize
*/
template <typename feature_t>
class Reshape : public Layer
{
private:
int output_exponent; /*<! exponent of output >*/
Tensor<feature_t> *output; /*<! output ptr of Reshape >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of Reshape >*/
public:
/**
* @brief Construct a new Reshape object
*
* @param shape the target shape
* @param name name of Reshape layer
* @param inplace true: the output will store to input0
* false: the output will store to a separate memory
*/
Reshape(std::vector<int> shape, const char *name = "Reshape", bool inplace = false) : Layer(name),
output_shape(shape), inplace(inplace)
{
}
/**
* @brief Destroy the Reshape object
*
*/
~Reshape()
{
if ((!this->inplace) && (this->output != NULL))
{
delete this->output;
}
}
/**
* @brief Update output shape and exponent
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input, bool print_shape = false)
{
this->output_exponent = input.exponent;
if (!this->inplace)
{
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
else
{
this->output = &input;
this->output->set_shape(this->output_shape);
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
* @brief Get the output
*
* @return Tensor<feature_t>& Reshape result
*/
Tensor<feature_t> &get_output()
{
return *this->output;
}
/**
* @brief Call Reshape operation.
*
* @param input as an input
* @return Tensor<feature_t>& Reshape result
*/
Tensor<feature_t> &call(Tensor<feature_t> &input)
{
DL_LOG_LAYER_LATENCY_INIT();
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->set_exponent(input.exponent);
this->output->reshape(this->output_shape);
this->output->copy_element(input, true);
DL_LOG_LAYER_LATENCY_END(this->name, "reshape");
}
else
{
DL_LOG_LAYER_LATENCY_START();
this->output->reshape(this->output_shape);
DL_LOG_LAYER_LATENCY_END(this->name, "reshape");
}
return *this->output;
}
};
} // namespace layer
} // namespace dl

View File

@ -0,0 +1,127 @@
#pragma once
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_tool.hpp"
#include "dl_layer_base.hpp"
namespace dl
{
namespace layer
{
/**
* @brief
*
* @tparam feature_t
*/
template <typename feature_t>
class Squeeze : public Layer
{
private:
int output_exponent; /*<! exponent of output >*/
Tensor<feature_t> *output; /*<! output ptr of Squeeze >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
int axis; /*<! the dim to to be remove. make sure the length of the dim is equal to 1.
if axis == INT32_MAX, all the dims with length==1 will be removed. >*/
std::vector<int> output_shape; /*<! output shape of AvgPool2D >*/
public:
/**
* @brief Construct a new Squeeze object
*
* @param axis the dim to to be remove. make sure the length of the dim is equal to 1.
* if axis == INT32_MAX, all the dims with length==1 will be removed.
* @param name name of Squeeze layer
* @param inplace true: the output will store to input0
* false: the output will store to a separate memory
*/
Squeeze(int axis = INT32_MAX, const char *name = "Squeeze", bool inplace = false) : Layer(name), axis(axis), inplace(inplace), output_shape({})
{
}
/**
* @brief Destroy the Squeeze object
*
*/
~Squeeze()
{
if ((!this->inplace) && (this->output != NULL))
{
delete this->output;
}
}
/**
* @brief Update output shape and exponent
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input, bool print_shape = false)
{
this->output_exponent = input.exponent;
if (!this->inplace)
{
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(input.shape);
this->output->squeeze(this->axis);
this->output->free_element();
}
else
{
this->output = &input;
this->output->set_shape(input.shape);
this->output->squeeze(this->axis);
}
this->output_shape = this->output->shape;
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
* @brief Get the output
*
* @return Tensor<feature_t>& Squeeze result
*/
Tensor<feature_t> &get_output()
{
return *this->output;
}
/**
* @brief Call Squeeze operation.
*
* @param input as an input
* @return Tensor<feature_t>& Squeeze result
*/
Tensor<feature_t> &call(Tensor<feature_t> &input)
{
DL_LOG_LAYER_LATENCY_INIT();
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->set_exponent(input.exponent);
this->output->set_shape(this->output_shape);
this->output->copy_element(input, true);
DL_LOG_LAYER_LATENCY_END(this->name, "Squeeze");
}
else
{
DL_LOG_LAYER_LATENCY_START();
this->output->set_shape(this->output_shape);
DL_LOG_LAYER_LATENCY_END(this->name, "Squeeze");
}
return *this->output;
}
};
} // namespace layer
} // namespace dl

View File

@ -21,13 +21,13 @@ namespace dl
class Sub2D : public Layer
{
private:
const int output_exponent; /*<! exponent of output >*/
const Activation<feature_t> *activation; /*<! activation of Mul2D, if you don't specify anything, no activation is applied >*/
Tensor<feature_t> *output; /*<! output ptr of Sub2D >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a seperate memeory >*/
const int output_exponent; /*<! exponent of output >*/
const Activation<feature_t> *activation; /*<! activation of Sub2D, if you don't specify anything, no activation is applied >*/
Tensor<feature_t> *output; /*<! output ptr of Sub2D >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> output_shape; /*<! output shape of Sub2D >*/
public:
/**
* @brief Construct a new Sub2D object.
*
@ -35,18 +35,17 @@ namespace dl
* @param activation activation of Mul2D, if you don't specify anything, no activation is applied
* @param name name of layer
* @param inplace true: the output will store to input0
* false: the output will store to a seperate memeory
* false: the output will store to a separate memory
*/
Sub2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = NULL, bool inplace = false) : Layer(name),
output_exponent(output_exponent), activation(activation), output(NULL)
Sub2D(const int output_exponent, const Activation<feature_t> *activation = NULL, const char *name = "Sub2D", bool inplace = false) : Layer(name),
output_exponent(output_exponent), activation(activation), output(NULL), inplace(inplace), output_shape({})
{
this->inplace = inplace;
}
/**
* @brief Destroy the Sub2D object.
*/
~Sub2D()
~Sub2D()
{
if ((!this->inplace) && (this->output != NULL))
{
@ -60,22 +59,32 @@ namespace dl
*
* @param input0 as one input
* @param input1 as another input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1)
void build(Tensor<feature_t> &input0, Tensor<feature_t> &input1, bool print_shape = false)
{
assert(input0.is_same_shape(input1));
this->output_shape = input0.shape;
if (!this->inplace)
{
if(this->output != NULL)
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(input0.shape);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
}
else
{
this->output = &input0;
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
@ -103,7 +112,11 @@ namespace dl
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output.apply_element();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
this->output.malloc_element();
this->output->set_exponent(input0.exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "apply");
@ -114,6 +127,10 @@ namespace dl
else
{
DL_LOG_LAYER_LATENCY_START();
if (this->output->shape != this->output_shape)
{
this->output->set_shape(this->output_shape);
}
nn::sub2d<true>(this->output, input0, input1, this->activation, assign_core, this->output_exponent);
DL_LOG_LAYER_LATENCY_END(this->name, "sub2d");
}

View File

@ -0,0 +1,126 @@
#pragma once
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_tool.hpp"
#include "dl_layer_base.hpp"
namespace dl
{
namespace layer
{
/**
* @brief
*
* @tparam feature_t
*/
template <typename feature_t>
class Transpose : public Layer
{
private:
int output_exponent; /*<! exponent of output >*/
Tensor<feature_t> *output; /*<! output ptr of Transpose >*/
bool inplace; /*<! true: the output will store to input0
false: the output will store to a separate memory >*/
std::vector<int> perm; /*<! the new arangement of the dims. if perm == {}, the dims arangement will be reversed. >*/
std::vector<int> output_shape; /*<! output shape of Transpose >*/
public:
/**
* @brief Construct a new Transpose object
*
* @param perm the new arangement of the dims. if perm == {}, the dims arangement will be reversed.
* @param name name of Transpose layer
* @param inplace true: the output will store to input
* false: the output will store to a separate memory
*/
Transpose(std::vector<int> perm = {}, const char *name = "Transpose", bool inplace = false) : Layer(name), perm(perm), inplace(inplace), output_shape({})
{
}
/**
* @brief Destroy the Transpose object
*
*/
~Transpose()
{
if ((!this->inplace) && (this->output != NULL))
{
delete this->output;
}
}
/**
* @brief Update output shape and exponent
*
* @param input as an input
* @param print_shape whether to print the output shape.
*/
void build(Tensor<feature_t> &input, bool print_shape = false)
{
this->output_exponent = input.exponent;
this->output_shape = input.shape;
for (int i = 0; i < this->perm.size(); i++)
{
this->output_shape[i] = input.shape[this->perm[i]];
}
if (!this->inplace)
{
if (this->output != NULL)
{
this->output = new Tensor<feature_t>;
}
this->output->set_exponent(this->output_exponent);
this->output->set_shape(this->output_shape);
this->output->free_element();
}
else
{
this->output = &input;
this->output->set_shape(this->output_shape);
}
if (print_shape)
{
std::cout << this->name << " | ";
this->output->print_shape();
}
}
/**
* @brief Get the output
*
* @return Tensor<feature_t>& Transpose result
*/
Tensor<feature_t> &get_output()
{
return *this->output;
}
/**
* @brief Call Transpose operation.
*
* @param input as an input.
* @return Tensor<feature_t>& Transpose result.
*/
Tensor<feature_t> &call(Tensor<feature_t> &input)
{
DL_LOG_LAYER_LATENCY_INIT();
if (!this->inplace)
{
DL_LOG_LAYER_LATENCY_START();
this->output->set_exponent(input.exponent);
this->output->transpose(input, this->perm);
DL_LOG_LAYER_LATENCY_END(this->name, "transpose");
}
else
{
DL_LOG_LAYER_LATENCY_START();
this->output->transpose(this->perm);
DL_LOG_LAYER_LATENCY_END(this->name, "transpose");
}
return *this->output;
}
};
} // namespace layer
} // namespace dl

View File

@ -0,0 +1,68 @@
#pragma once
#include "dl_image.hpp"
typedef struct
{
    int area;                /*!< pixel area of the connected domain >*/
    std::vector<int> center; /*!< centroid of the connected domain: [x, y] >*/
    std::vector<int> box;    /*!< bounding box of the connected domain: [left_up_x, left_up_y, right_down_x, right_down_y] >*/
} components_stats_t;
class ColorDetector
{
private:
    std::vector<std::vector<components_stats_t>> results; /*!< per-color detection results, filled by detect() >*/
public:
    std::vector<std::vector<uint8_t>> color_thresh; /*!< threshold of colors; the threshold of each color is composed of 6 numbers >*/
    std::vector<int> area_thresh;                   /*!< the area threshold of each color;
                                                         connected domains smaller than the threshold are filtered out >*/
    bool bgr;                                       /*!< true: the input image is in BGR format
                                                         false: the input image is in RGB format >*/

    /**
     * @brief Get the color threshold of a rectangular region in the image.
     *
     * @param image the input image
     * @param box the coordinates of the rectangular region: [left_up_x, left_up_y, right_down_x, right_down_y]
     * @return std::vector<uint8_t> the threshold (6 numbers, same layout as one entry of `color_thresh`).
     */
    std::vector<uint8_t> cal_color_thresh(dl::Tensor<uint8_t> &image, std::vector<int> box);

    /**
     * @brief Detect the colors based on the color thresholds.
     *
     * @param image the input image.
     * @return std::vector<std::vector<components_stats_t>>& detection result, one vector of components per configured color.
     */
    std::vector<std::vector<components_stats_t>> &detect(dl::Tensor<uint8_t> &image);

    /**
     * @brief Construct a new Color Detector object.
     *
     * @param color_thresh threshold of colors; the threshold of each color is composed of 6 numbers
     * @param area_thresh the area threshold of each color; components smaller than the threshold are filtered
     * @param bgr true: the input image is in BGR format
     *            false: the input image is in RGB format
     */
    ColorDetector(std::vector<std::vector<uint8_t>> color_thresh, std::vector<int> area_thresh, bool bgr = false) : color_thresh(color_thresh), area_thresh(area_thresh), bgr(bgr)
    {
    }

    /**
     * @brief Destroy the Color Detector object.
     *
     */
    ~ColorDetector() {}

    /**
     * @brief Get the results object.
     *
     * @return std::vector<std::vector<components_stats_t>>& the result of the most recent detect() call.
     */
    std::vector<std::vector<components_stats_t>> &get_results()
    {
        return this->results;
    }
};

View File

@ -92,7 +92,7 @@ namespace face_recognition_tool
* @return dl::Tensor<T>*
*/
template <typename T>
dl::Tensor<T> *transform_mfn_input(dl::Tensor<uint8_t> &image, bool free_input = false, bool do_padding = true);
dl::Tensor<T> *transform_mfn_input(dl::Tensor<uint8_t> &image, bool free_input = false);
/**
* @brief transform the image to the input of a mfn model
@ -106,7 +106,7 @@ namespace face_recognition_tool
* false: do not pad the result
*/
template <typename T>
void transform_mfn_input(dl::Tensor<uint8_t> &image, dl::Tensor<T> &output, bool free_input = false, bool do_padding = true);
void transform_mfn_input(dl::Tensor<uint8_t> &image, dl::Tensor<T> &output, bool free_input = false);
/**
* @brief transform the mfn output embedding to a floating embedding

View File

@ -14,13 +14,13 @@ namespace dl
* @param filter_shape filter shape with dilation
* @param stride_y stride in height
* @param stride_x stride in width
* @param pad_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET
* @param pad_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN
* @param is_conv2d one of true or false,
* - true: serve for Conv2D
* - false: serve for other operations
* @return std::vector<int>
*/
std::vector<int> get_output_shape(const std::vector<int> &input_shape, const std::vector<int> &filter_shape, const int stride_y, const int stride_x, const padding_type_t pad_type, const bool is_conv2d = false);
std::vector<int> get_output_shape(const std::vector<int> &input_shape, const std::vector<int> &filter_shape, const int stride_y, const int stride_x, const padding_type_t pad_type, const bool is_conv2d = false, std::vector<int> padding = {});
/**
* @brief Get the pad size object
@ -30,7 +30,7 @@ namespace dl
* @param filter_shape filter shape with dilation
* @param stride_y stride in height
* @param stride_x stride in width
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN
* @return padding size
*/
std::vector<int> get_pad_size(const std::vector<int> &output_shape, const std::vector<int> &input_shape, const std::vector<int> &filter_shape, const int stride_y, const int stride_x, const padding_type_t padding_type);

View File

@ -58,20 +58,20 @@ namespace dl
*/
template <bool inplace = false, typename feature_t>
auto add2d(const int output_exponent,
Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
{
assert(input0.is_same_shape(input1));
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(output_exponent).set_shape(input0.shape).apply_element();
output.set_exponent(output_exponent).set_shape(input0.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -58,12 +58,12 @@ namespace dl
* @param filter_shape filter_shape in [filter_height, filter_width]
* @param stride_y stride in height
* @param stride_x stride in width
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN,
* - PADDING_VALID: no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* @param assign_core not effective yet
* @return avg_pool2d result
*/
@ -81,19 +81,19 @@ namespace dl
DL_LOG_NN_LATENCY_START();
std::vector<int> output_shape = get_output_shape(input.shape, filter_shape, stride_y, stride_x, padding_type);
Tensor<feature_t> output;
output.set_exponent(output_exponent).set_shape(output_shape).apply_element();
output.set_exponent(output_exponent).set_shape(output_shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
std::vector<int> padding(4, 0);
DL_LOG_NN_LATENCY_START();
if (padding_type == PADDING_SAME || padding_type == PADDING_SAME_MXNET)
if (padding_type == PADDING_SAME_END || padding_type == PADDING_SAME_BEGIN)
{
std::vector<int> padding = get_pad_size(output_shape, input.shape, filter_shape, stride_y, stride_x, padding_type);
input.set_padding_size(padding);
padding = get_pad_size(output_shape, input.shape, filter_shape, stride_y, stride_x, padding_type);
}
DL_LOG_NN_LATENCY_END("padding");
DL_LOG_NN_LATENCY_START();
avg_pool2d(output, input, input.padding, filter_shape, stride_y, stride_x, assign_core);
avg_pool2d(output, input, padding, filter_shape, stride_y, stride_x, assign_core);
DL_LOG_NN_LATENCY_END("avg_pool2d");
return output;

View File

@ -0,0 +1,63 @@
#pragma once
#include <vector>
#include "dl_variable.hpp"
#include "dl_nn.hpp"
namespace dl
{
namespace nn
{
template <typename feature_t>
void concat(Tensor<feature_t> &output, std::vector<Tensor<feature_t> *> &inputs, int axis, bool free_inputs = false);
/**
 * @brief Concatenate a list of tensors along `axis` into a freshly
 *        allocated tensor.
 *
 * All inputs must share the same rank, the same exponent, and identical
 * extents on every dimension except `axis`. A negative `axis` counts from
 * the last dimension.
 *
 * @param inputs tensors to concatenate (at least two)
 * @param axis dimension along which to concatenate
 * @param free_inputs true: release the input tensors after the copy
 * @return Tensor<feature_t> the concatenation result
 */
template <typename feature_t>
Tensor<feature_t> concat(std::vector<Tensor<feature_t> *> &inputs, int axis, bool free_inputs = false)
{
    DL_LOG_NN_LATENCY_INIT();

    DL_LOG_NN_LATENCY_START();
    assert(inputs.size() > 1);
    int ndim = inputs[0]->shape.size();
    if (axis < 0)
    {
        axis = ndim + axis;
    }
    assert((axis < ndim) && (axis > -1));

    // Accumulate the concat-axis extent while validating that every input
    // agrees with its predecessor on rank, exponent, and all other dims.
    int concat_extent = inputs[0]->shape[axis];
    for (size_t i = 1; i < inputs.size(); i++)
    {
        assert(ndim == inputs[i]->shape.size());
        assert(inputs[i]->exponent == inputs[i - 1]->exponent);
        concat_extent += inputs[i]->shape[axis];
        for (int dim = 0; dim < ndim; dim++)
        {
            if (dim != axis)
            {
                assert(inputs[i]->shape[dim] == inputs[i - 1]->shape[dim]);
            }
        }
    }
    DL_LOG_NN_LATENCY_END("assert");

    DL_LOG_NN_LATENCY_START();
    std::vector<int> result_shape = inputs[0]->shape;
    result_shape[axis] = concat_extent;
    Tensor<feature_t> output;
    output.set_shape(result_shape);
    output.set_exponent(inputs[0]->exponent);
    output.malloc_element();
    DL_LOG_NN_LATENCY_END("malloc");

    DL_LOG_NN_LATENCY_START();
    concat(output, inputs, axis, free_inputs);
    DL_LOG_NN_LATENCY_END("concat");

    return output;
}
} // namespace nn
} // namespace dl

View File

@ -10,7 +10,6 @@ namespace dl
{
/**
* @brief activation(conv2d(input, filter) + bias).
* NOTE: When padding_type is SAME, make sure padding is already added in input.
*
* @param output as an output
* @param input as an input
@ -34,7 +33,6 @@ namespace dl
/**
* @brief activation(conv2d(input, filter) + bias).
* NOTE: When padding_type is SAME, make sure padding is already added in input.
*
* @param output as an output
* @param input as an input
@ -56,6 +54,29 @@ namespace dl
const Activation<int8_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief activation(conv2d(input, filter) + bias).
*
* @param output as an output
* @param input as an input
* @param padding padding size needed in [top, bottom, left, right] of this operation
* @param filter filter of conv2d
* @param stride_y stride in height
* @param stride_x stride in width
* @param bias bias of conv2d, if you don't specify anything, no bias is added
* @param activation activation of conv2d, if you don't specify anything, no activation is applied
* @param assign_core not effective yet
*/
void conv2d(Tensor<int8_t> &output,
Tensor<int8_t> &input,
std::vector<int> &padding,
const Filter<int8_t> &filter,
const int stride_y,
const int stride_x,
const Bias<int16_t> *const bias = NULL,
const Activation<int8_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief activation(conv2d(input, filter) + bias).
*
@ -67,25 +88,25 @@ namespace dl
* @param filter Filter of conv2d
* @param stride_y stride in height
* @param stride_x stride in width
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN,
* - PADDING_VALID: no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* @param bias bias of conv2d, if you don't specify anything, no bias is added
* @param activation activation of conv2d, if you don't specify anything, no activation is applied
* @param assign_core not effective yet
* @return conv2d result
*/
template <typename feature_t>
template <typename feature_t, typename bias_t>
Tensor<feature_t> conv2d(const int output_exponent,
Tensor<feature_t> &input,
const Filter<feature_t> &filter,
const int stride_y,
const int stride_x,
const padding_type_t padding_type,
const Bias<feature_t> *bias,
const Bias<bias_t> *bias,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
{
@ -94,20 +115,19 @@ namespace dl
DL_LOG_NN_LATENCY_START();
std::vector<int> output_shape = get_output_shape(input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type, true);
Tensor<feature_t> output;
output.set_exponent(output_exponent).set_shape(output_shape).apply_element();
output.set_exponent(output_exponent).set_shape(output_shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
std::vector<int> padding(4, 0);
DL_LOG_NN_LATENCY_START();
if (padding_type == PADDING_SAME || padding_type == PADDING_SAME_MXNET)
if (padding_type == PADDING_SAME_END || padding_type == PADDING_SAME_BEGIN)
{
std::vector<int> padding = get_pad_size(output_shape, input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
input.set_padding_size(padding);
input.set_padding_value(padding, 0);
padding = get_pad_size(output_shape, input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
}
DL_LOG_NN_LATENCY_END("padding");
DL_LOG_NN_LATENCY_START();
conv2d(output, input, input.padding, filter, stride_y, stride_x, bias, activation, assign_core);
conv2d(output, input, padding, filter, stride_y, stride_x, bias, activation, assign_core);
DL_LOG_NN_LATENCY_END("conv2d");
return output;

View File

@ -10,7 +10,6 @@ namespace dl
{
/**
* @brief activate(depthwise_conv2d(input, filter) + bias)
* NOTE: When padding_type is SAME, make sure padding is already added in input
*
* @param output as an output
* @param input as an input
@ -34,7 +33,6 @@ namespace dl
/**
* @brief activate(depthwise_conv2d(input, filter) + bias)
* NOTE: When padding_type is SAME, make sure padding is already added in input
*
* @param output as an output
* @param input as an input
@ -56,6 +54,29 @@ namespace dl
const Activation<int8_t> *activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief activate(depthwise_conv2d(input, filter) + bias)
*
* @param output as an output
* @param input as an input
* @param padding padding size needed in [top, bottom, left, right] of this operation
* @param filter Filter of depthwise_conv2d
* @param stride_y stride in height
* @param stride_x stride in width
* @param bias bias of depthwise_conv2d, if you don't specify anything, no bias is added
* @param activation activation of depthwise_conv2d, if you don't specify anything, no activation is applied
* @param assign_core not effective yet
*/
void depthwise_conv2d(Tensor<int8_t> &output,
Tensor<int8_t> &input,
std::vector<int> &padding,
const Filter<int8_t> &filter,
const int stride_y,
const int stride_x,
const Bias<int16_t> *bias = NULL,
const Activation<int8_t> *activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief activation(depthwise_conv2d(input, filter) + bias)
*
@ -67,25 +88,25 @@ namespace dl
* @param filter filter of depthwise_conv2d
* @param stride_y stride in height
* @param stride_x stride in width
* @param pad_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param pad_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN,
* - PADDING_VALID means no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* @param bias bias of depthwise_conv2d, if you don't specify anything, no bias is added
* @param activation activation of depthwise_conv2d, if you don't specify anything, no activation is applied
* @param assign_core not effective yet
* @return depthwise_conv2d result
*/
template <typename feature_t>
template <typename feature_t, typename bias_t>
Tensor<feature_t> depthwise_conv2d(const int output_exponent,
Tensor<feature_t> &input,
const Filter<feature_t> &filter,
const int stride_y,
const int stride_x,
const padding_type_t padding_type,
const Bias<feature_t> *bias,
const Bias<bias_t> *bias,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
{
@ -94,20 +115,20 @@ namespace dl
DL_LOG_NN_LATENCY_START();
std::vector<int> output_shape = get_output_shape(input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
Tensor<feature_t> output;
output.set_exponent(output_exponent).set_shape(output_shape).apply_element();
output.set_exponent(output_exponent).set_shape(output_shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
std::vector<int> padding(4, 0);
DL_LOG_NN_LATENCY_START();
if (padding_type == PADDING_SAME || padding_type == PADDING_SAME_MXNET)
if (padding_type == PADDING_SAME_END || padding_type == PADDING_SAME_BEGIN)
{
std::vector<int> padding = get_pad_size(output_shape, input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
input.set_padding_size(padding);
input.set_padding_value(padding, 0);
padding = get_pad_size(output_shape, input.shape, filter.shape_with_dilation, stride_y, stride_x, padding_type);
}
DL_LOG_NN_LATENCY_END("padding");
DL_LOG_NN_LATENCY_START();
depthwise_conv2d(output, input, input.padding, filter, stride_y, stride_x, bias, activation, assign_core);
depthwise_conv2d(output, input, padding, filter, stride_y, stride_x, bias, activation, assign_core);
DL_LOG_NN_LATENCY_END("depthwise_conv2d");
return output;

View File

@ -0,0 +1,126 @@
#pragma once
#include "dl_constant.hpp"
#include "dl_variable.hpp"
#include "dl_nn.hpp"
namespace dl
{
namespace nn
{
/**
* @brief activation(FullyConnected(input, filter) + bias).
*
* @param output as an output
* @param input as an input
* @param filter filter of FullyConnected
* @param bias bias of FullyConnected, if you don't specify anything, no bias is added
* @param activation activation of FullyConnected, if you don't specify anything, no activation is applied
* @param flatten true: input shape is [x1, x2, ..., xn], filter shape is [1, 1, x1 * x2 * ... * xn, output_dim], output shape is [output_dim]
* false: input shape is [x1, x2, ..., xn, input_dim], filter shape is [1, 1, input_dim, output_dim], output shape is [x1, x2, ...., xn, output_dim]
* @param assign_core not effective yet
*/
void fully_connected(Tensor<int16_t> &output,
Tensor<int16_t> &input,
const Filter<int16_t> &filter,
const Bias<int16_t> *const bias = NULL,
const Activation<int16_t> *const activation = NULL,
const bool flatten = true,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief activation(FullyConnected(input, filter) + bias).
*
* @param output as an output
* @param input as an input
* @param filter filter of FullyConnected
* @param bias bias of FullyConnected, if you don't specify anything, no bias is added
* @param activation activation of FullyConnected, if you don't specify anything, no activation is applied
* @param flatten true: input shape is [x1, x2, ..., xn], filter shape is [1, 1, x1 * x2 * ... * xn, output_dim], output shape is [output_dim]
* false: input shape is [x1, x2, ..., xn, input_dim], filter shape is [1, 1, input_dim, output_dim], output shape is [x1, x2, ...., xn, output_dim]
* @param assign_core not effective yet
*/
void fully_connected(Tensor<int8_t> &output,
Tensor<int8_t> &input,
const Filter<int8_t> &filter,
const Bias<int8_t> *const bias = NULL,
const Activation<int8_t> *const activation = NULL,
const bool flatten = true,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief activation(FullyConnected(input, filter) + bias).
*
* @param output as an output
* @param input as an input
* @param filter filter of FullyConnected
* @param bias bias of FullyConnected, if you don't specify anything, no bias is added
* @param activation activation of FullyConnected, if you don't specify anything, no activation is applied
* @param flatten true: input shape is [x1, x2, ..., xn], filter shape is [1, 1, x1 * x2 * ... * xn, output_dim], output shape is [output_dim]
* false: input shape is [x1, x2, ..., xn, input_dim], filter shape is [1, 1, input_dim, output_dim], output shape is [x1, x2, ...., xn, output_dim]
* @param assign_core not effective yet
*/
void fully_connected(Tensor<int8_t> &output,
Tensor<int8_t> &input,
const Filter<int8_t> &filter,
const Bias<int16_t> *const bias = NULL,
const Activation<int8_t> *const activation = NULL,
const bool flatten = true,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief activation(FullyConnected(input, filter) + bias).
*
* @tparam feature_t supports int16_t and int8_t,
* - int16_t: stands for operation in int16_t quantize
* - int8_t: stands for operation in int8_t quantize
* @param output_exponent exponent of output
* @param input as an input
* @param filter Filter of FullyConnected
* @param bias bias of FullyConnected, if you don't specify anything, no bias is added
* @param activation activation of FullyConnected, if you don't specify anything, no activation is applied
* @param flatten true: input shape is [x1, x2, ..., xn], filter shape is [1, 1, x1 * x2 * ... * xn, output_dim], output shape is [output_dim]
* false: input shape is [x1, x2, ..., xn, input_dim], filter shape is [1, 1, input_dim, output_dim], output shape is [x1, x2, ...., xn, output_dim]
* @param assign_core not effective yet
* @return FullyConnected result
*/
template <typename feature_t>
Tensor<feature_t> fully_connected(const int output_exponent,
                                  Tensor<feature_t> &input,
                                  const Filter<feature_t> &filter,
                                  const Bias<feature_t> *bias,
                                  const Activation<feature_t> *activation,
                                  const bool flatten,
                                  const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE)
{
    DL_LOG_NN_LATENCY_INIT();

    DL_LOG_NN_LATENCY_START();
    // The filter is stored as [1, 1, input_dim, output_dim].
    assert(filter.shape.size() == 4);
    assert(filter.shape[0] == 1);
    assert(filter.shape[1] == 1);

    std::vector<int> output_shape;
    if (flatten)
    {
        // Flattened: the whole input feeds the input_dim axis of the filter.
        assert(input.get_size() == filter.shape[2]);
        output_shape = {filter.shape.back()};
    }
    else
    {
        // BUGFIX: `filter` is a reference, not a pointer — `filter->shape`
        // does not compile once this branch is instantiated.
        assert(input.shape.back() == filter.shape[2]);
        output_shape = input.shape;
        output_shape[output_shape.size() - 1] = filter.shape.back();
    }
    Tensor<feature_t> output;
    output.set_exponent(output_exponent).set_shape(output_shape).malloc_element();
    DL_LOG_NN_LATENCY_END("apply");

    DL_LOG_NN_LATENCY_START();
    fully_connected(output, input, filter, bias, activation, flatten, assign_core);
    DL_LOG_NN_LATENCY_END("fully_connected");

    return output;
}
} // namespace nn
} // namespace dl

View File

@ -53,7 +53,7 @@ namespace dl
std::vector<int> output_shape(input.shape.size(), 1);
output_shape[2] = input.shape[2];
Tensor<feature_t> output;
output.set_exponent(output_exponent).set_shape(output_shape).apply_element();
output.set_exponent(output_exponent).set_shape(output_shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -51,7 +51,7 @@ namespace dl
std::vector<int> output_shape(input.shape.size(), 1);
output_shape[2] = input.shape[2];
Tensor<feature_t> output;
output.set_exponent(input.exponent).set_shape(output_shape).apply_element();
output.set_exponent(input.exponent).set_shape(output_shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -52,17 +52,17 @@ namespace dl
* @return leakyrelu result or no return(result store to input)
*/
template <bool inplace = false, typename feature_t>
auto leakyrelu(Tensor<feature_t> &input,
const int activation_alpha,
const int activation_exponent,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
auto leakyrelu(Tensor<feature_t> &input,
const int activation_alpha,
const int activation_exponent,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
{
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(input.exponent).set_shape(input.shape).apply_element();
output.set_exponent(input.exponent).set_shape(input.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -48,20 +48,20 @@ namespace dl
* @return max2d result or no return(result store to input0)
*/
template <bool inplace = false, typename feature_t>
auto max2d(Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
auto max2d(Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
{
assert(input0.is_same_shape(input1));
assert(input0.exponent == input1.exponent);
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(input0.exponent).set_shape(input0.shape).apply_element();
output.set_exponent(input0.exponent).set_shape(input0.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -57,12 +57,12 @@ namespace dl
* @param filter_shape filter shape in [filter_height, filter_width]
* @param stride_y stride in height
* @param stride_x stride in width
* @param padding_type one of PADDING_VALID or PADDING_SAME or PADDING_SAME_MXNET,
* @param padding_type one of PADDING_VALID or PADDING_SAME_END or PADDING_SAME_BEGIN,
* - PADDING_VALID: no padding
* PADDING_SAME and PADDING_SAME_MXNET results in padding with zeros evenly to the left/right or up/down of the input
* PADDING_SAME_END and PADDING_SAME_BEGIN results in padding with zeros evenly to the left/right or up/down of the input
* such that output has the same height/width dimension as the input,
* - PADDING_SAME results padding in TensorFlow style
* - PADDING_SAME_MXNET results padding in MXNET style
* - PADDING_SAME_END results padding in TensorFlow style
* - PADDING_SAME_BEGIN results padding in MXNET style
* @param assign_core not effective yet
* @return max_pool2d result
*/
@ -79,20 +79,20 @@ namespace dl
DL_LOG_NN_LATENCY_START();
std::vector<int> output_shape = get_output_shape(input.shape, filter_shape, stride_y, stride_x, padding_type);
Tensor<feature_t> output;
output.set_exponent(input.exponent).set_shape(output_shape).apply_element();
output.set_exponent(input.exponent).set_shape(output_shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
std::vector<int> padding(4, 0);
DL_LOG_NN_LATENCY_START();
if (padding_type == PADDING_SAME || padding_type == PADDING_SAME_MXNET)
if (padding_type == PADDING_SAME_END || padding_type == PADDING_SAME_BEGIN)
{
std::vector<int> padding = get_pad_size(output_shape, input.shape, filter_shape, stride_y, stride_x, padding_type);
input.set_padding_size(padding);
input.set_padding_value(padding, 0);
padding = get_pad_size(output_shape, input.shape, filter_shape, stride_y, stride_x, padding_type);
}
DL_LOG_NN_LATENCY_END("padding");
DL_LOG_NN_LATENCY_START();
max_pool2d(output, input, input.padding, filter_shape, stride_y, stride_x, assign_core);
max_pool2d(output, input, padding, filter_shape, stride_y, stride_x, assign_core);
DL_LOG_NN_LATENCY_END("max_pool2d");
return output;

View File

@ -47,20 +47,20 @@ namespace dl
* @return min2d result or no return(result store to input0)
*/
template <bool inplace = false, typename feature_t>
auto min2d(Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
auto min2d(Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
{
assert(input0.is_same_shape(input1));
assert(input0.exponent == input1.exponent);
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(input0.exponent).set_shape(input0.shape).apply_element();
output.set_exponent(input0.exponent).set_shape(input0.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -18,12 +18,12 @@ namespace dl
* @param assign_core not effective yet
* @param output_exponent exponent of output, only and must specify if inplace operation happens
*/
void mul2d(Tensor<int16_t> &output,
Tensor<int16_t> &input0,
Tensor<int16_t> &input1,
const Activation<int16_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
void mul2d(Tensor<int16_t> &output,
Tensor<int16_t> &input0,
Tensor<int16_t> &input1,
const Activation<int16_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
/**
* @brief activation(mul2d(input0, input1)).
@ -35,12 +35,12 @@ namespace dl
* @param assign_core not effective yet
* @param output_exponent exponent of output, only and must specify if inplace operation happens
*/
void mul2d(Tensor<int8_t> &output,
Tensor<int8_t> &input0,
Tensor<int8_t> &input1,
const Activation<int8_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
void mul2d(Tensor<int8_t> &output,
Tensor<int8_t> &input0,
Tensor<int8_t> &input1,
const Activation<int8_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
/**
* @brief activation(mul2d(input0, input1)).
@ -57,21 +57,21 @@ namespace dl
* @return mul2d result or no return(result store to input0)
*/
template <bool inplace = false, typename feature_t>
auto mul2d(const int output_exponent,
Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
auto mul2d(const int output_exponent,
Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
{
assert(input0.is_same_shape(input1));
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(output_exponent).set_shape(input0.shape).apply_element();
output.set_exponent(output_exponent).set_shape(input0.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -52,17 +52,17 @@ namespace dl
* @return prelu result or no return(result store to input)
*/
template <bool inplace = false, typename feature_t>
auto prelu(Tensor<feature_t> &input,
const feature_t *activation_element,
const int activation_exponent,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
auto prelu(Tensor<feature_t> &input,
const feature_t *activation_element,
const int activation_exponent,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
{
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(input.exponent).set_shape(input.shape).apply_element();
output.set_exponent(input.exponent).set_shape(input.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();
@ -76,7 +76,7 @@ namespace dl
DL_LOG_NN_LATENCY_START();
prelu(input, input, activation_element, activation_exponent, assign_core);
DL_LOG_NN_LATENCY_END("prelu");
}
}
}
} // namespace nn
} // namespace dl

View File

@ -15,9 +15,9 @@ namespace dl
* @param input as an input
* @param assign_core not effective yet
*/
void relu(Tensor<int16_t> &output,
Tensor<int16_t> &input,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
void relu(Tensor<int16_t> &output,
Tensor<int16_t> &input,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief relu(input).
@ -26,9 +26,9 @@ namespace dl
* @param input as an input
* @param assign_core not effective yet
*/
void relu(Tensor<int8_t> &output,
Tensor<int8_t> &input,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
void relu(Tensor<int8_t> &output,
Tensor<int8_t> &input,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE);
/**
* @brief relu(input)
@ -46,11 +46,11 @@ namespace dl
{
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(input.exponent).set_shape(input.shape).apply_element();
output.set_exponent(input.exponent).set_shape(input.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -18,12 +18,12 @@ namespace dl
* @param assign_core not effective yet
* @param output_exponent exponent of output, only and must specify if inplace operation happens
*/
void sub2d(Tensor<int16_t> &output,
Tensor<int16_t> &input0,
Tensor<int16_t> &input1,
const Activation<int16_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
void sub2d(Tensor<int16_t> &output,
Tensor<int16_t> &input0,
Tensor<int16_t> &input1,
const Activation<int16_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
/**
* @brief activation(sub2d(input0, input1)).
@ -35,12 +35,12 @@ namespace dl
* @param assign_core not effective yet
* @param output_exponent exponent of output, only and must specify if inplace operation happens
*/
void sub2d(Tensor<int8_t> &output,
Tensor<int8_t> &input0,
Tensor<int8_t> &input1,
const Activation<int8_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
void sub2d(Tensor<int8_t> &output,
Tensor<int8_t> &input0,
Tensor<int8_t> &input1,
const Activation<int8_t> *const activation = NULL,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE,
const int output_exponent = INT_MIN);
/**
* @brief activation(sub2d(input0, input1)).
@ -57,20 +57,20 @@ namespace dl
* @return sub2d result or no return(result store to input0)
*/
template <bool inplace = false, typename feature_t>
auto sub2d(const int output_exponent,
Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
auto sub2d(const int output_exponent,
Tensor<feature_t> &input0,
Tensor<feature_t> &input1,
const Activation<feature_t> *activation,
const std::vector<int> &assign_core = CONFIG_DEFAULT_ASSIGN_CORE) -> typename std::conditional<inplace, void, Tensor<feature_t>>::type
{
assert(input0.is_same_shape(input1));
DL_LOG_NN_LATENCY_INIT();
Tensor<feature_t> output;
if constexpr(!inplace)
if constexpr (!inplace)
{
DL_LOG_NN_LATENCY_START();
output.set_exponent(output_exponent).set_shape(input0.shape).apply_element();
output.set_exponent(output_exponent).set_shape(input0.shape).malloc_element();
DL_LOG_NN_LATENCY_END("apply");
DL_LOG_NN_LATENCY_START();

View File

@ -67,62 +67,49 @@ namespace dl
void copy_memory(void *dst, void *src, const int n);
/**
* @brief Apply memory without initialized. Must use free_aligned() to free the memory.
* @brief Apply memory without initialized. Can use free_aligned() to free the memory.
*
* @param number number of elements
* @param size size of element
* @param align number of aligned, e.g., 16 means 16-byte aligned
* @param align number of byte aligned, e.g., 16 means 16-byte aligned
* @return pointer of allocated memory. NULL for failed
*/
inline void *malloc_aligned(int number, int size, int align = 0)
inline void *malloc_aligned(int number, int size, int align = 4)
{
int n = number * size;
n >>= 4;
n += 2;
n <<= 4;
int total_size = n + align + sizeof(void *) + sizeof(int);
void *res = malloc(total_size);
assert((align > 0) && (((align & (align-1)) == 0)));
int total_size = number * size;
void *res = heap_caps_aligned_alloc(align, total_size, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
#if DL_SPIRAM_SUPPORT
if (NULL == res)
res = heap_caps_malloc(total_size, MALLOC_CAP_SPIRAM);
res = heap_caps_aligned_alloc(align, total_size, MALLOC_CAP_SPIRAM);
#endif
if (NULL == res)
{
printf("Fail to malloc %d bytes from DRAM(%d bytyes) and PSRAM(%d bytes), PSRAM is %s.\n",
total_size,
heap_caps_get_free_size(MALLOC_CAP_INTERNAL),
heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL),
heap_caps_get_free_size(MALLOC_CAP_SPIRAM),
DL_SPIRAM_SUPPORT ? "on" : "off");
return NULL;
}
void **data = (void **)res + 2; // 4-byte for pointer, 4-bytes for n
void **aligned;
if (align)
aligned = (void **)(((size_t)data + (align - 1)) & -align);
else
aligned = data;
aligned[-1] = res;
int *temp = (int *)aligned;
temp[-2] = n;
return (void *)aligned;
return (void *)res;
}
/**
* @brief Apply memory with zero-initialized. Must use dl_lib_free() to free the memory.
* @brief Apply memory with zero-initialized. Can use free_aligned() to free the memory.
*
* @param number number of elements
* @param size size of element
* @param align number of aligned, e.g., 16 means 16-byte aligned
* @param align number of byte aligned, e.g., 16 means 16-byte aligned
* @return pointer of allocated memory. NULL for failed
*/
inline void *calloc_aligned(int number, int size, int align = 0)
inline void *calloc_aligned(int number, int size, int align = 4)
{
void *aligned = malloc_aligned(number, size, align);
int n = *((int *)aligned - 2);
set_zero(aligned, n);
set_zero(aligned, number * size);
return (void *)aligned;
}
@ -137,7 +124,70 @@ namespace dl
if (NULL == address)
return;
free(((void **)address)[-1]);
heap_caps_free(address);
}
/**
* @brief Apply memory without initialized in preference order: internal aligned, internal, external aligned
*
* @param number number of elements
* @param size size of element
* @param align number of byte aligned, e.g., 16 means 16-byte aligned
* @return pointer of allocated memory. NULL for failed
*/
inline void *malloc_aligned_prefer(int number, int size, int align = 4)
{
assert((align > 0) && (((align & (align-1)) == 0)));
int total_size = number * size;
void *res = heap_caps_aligned_alloc(align, total_size, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
if (NULL == res){
res = heap_caps_malloc(total_size, MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
}
#if DL_SPIRAM_SUPPORT
if (NULL == res){
res = heap_caps_aligned_alloc(align, total_size, MALLOC_CAP_SPIRAM);
}
#endif
if (NULL == res)
{
printf("Fail to malloc %d bytes from DRAM(%d bytyes) and PSRAM(%d bytes), PSRAM is %s.\n",
total_size,
heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL),
heap_caps_get_free_size(MALLOC_CAP_SPIRAM),
DL_SPIRAM_SUPPORT ? "on" : "off");
return NULL;
}
return res;
}
/**
* @brief Apply memory with zero-initialized in preference order: internal aligned, internal, external aligned
*
* @param number number of elements
* @param size size of element
* @param align number of byte aligned, e.g., 16 means 16-byte aligned
* @return pointer of allocated memory. NULL for failed
*/
inline void *calloc_aligned_prefer(int number, int size, int align = 4)
{
void *res = malloc_aligned_prefer(number, size, align);
set_zero(res, number * size);
return (void *)res;
}
/**
* @brief Free the calloc_aligned_prefer() and malloc_aligned_prefer() memory
*
* @param address pointer of memory to free
*/
inline void free_aligned_prefer(void *address)
{
if (NULL == address)
return;
heap_caps_free(address);
}
/**

View File

@ -57,7 +57,8 @@ namespace dl
* @param exponent exponent of element
* @param shape shape of Filter,
* - 1D: reserved
* - 2D: [filter_height, filter_width, input_channel, output_channel]
* - 2D: for convolution is [filter_height, filter_width, input_channel, output_channel],
* for depthwise convolution is [filter_height, filter_width, input_channel, 1]
* @param dilation dilation of Filter
* - 1D: reserved
* - 2D: [dilation_in_height, dilation_in_width]
@ -97,6 +98,9 @@ namespace dl
{
public:
using Constant<T>::Constant;
std::vector<int> channel_exponent; /*<! exponent for per-channel >*/
Bias(const T *element, const std::vector<int> channel_exponent, const std::vector<int> shape);
};
/**

View File

@ -3,6 +3,7 @@
#include <stdio.h>
#include <vector>
#include <assert.h>
#include <iostream>
#include "dl_tool.hpp"
@ -17,27 +18,20 @@ namespace dl
class Tensor
{
private:
int size; /*<! size of element including padding */
bool auto_free; /*<! free element when object destroy */
int size; /*<! size of element including padding */
bool auto_free; /*<! free element when object destroy */
std::vector<int> axis_offset; /*<! element offset of each axis */
public:
T *element; /*<! point to element */
int exponent; /*<! exponent of element */
std::vector<int> shape; /*<! shape of Tensor */
/*<! 2D: shape is [height, width, channel] */
/*<! 1D: reserved */
std::vector<int> shape_with_padding; /*<! shape with padding of Tensor */
/*<! 2D: shape_with_padding is [height_with_padding, width_with_padding, channel_with_padding] */
/*<! 1D: reserved */
std::vector<int> padding; /*<! padding of Tensor */
/*<!- 2D: padding format is [top, bottom, left, right] */
/*<! - 1D: reserved */
T *element; /*<! point to element */
int exponent; /*<! exponent of element */
std::vector<int> shape; /*<! shape of Tensor */
/**
* @brief Construct a new Tensor object
*
*/
Tensor() : size(-1), auto_free(true), element(NULL), exponent(0) {}
Tensor() : auto_free(true), element(NULL), exponent(0) { this->set_shape({0}); }
/**
* @brief Construct a new Tensor object by copying from input.
@ -49,21 +43,20 @@ namespace dl
*/
Tensor(Tensor<T> &input, bool deep) : size(input.size),
auto_free(input.auto_free),
exponent(input.exponent),
shape(input.shape),
shape_with_padding(input.shape_with_padding),
padding(input.padding)
exponent(input.exponent)
{
if (deep)
this->set_shape(input.shape);
if (deep && (input.element != NULL))
{
int size_real = input.shape_with_padding.size() ? input.shape_with_padding[0] * input.shape_with_padding[1] * input.shape_with_padding[2] : 0;
T *new_element = (T *)tool::calloc_aligned(size_real, sizeof(T), 16);
int size_real = input.get_size();
T *new_element = (T *)tool::calloc_aligned_prefer(size_real, sizeof(T), 16);
tool::copy_memory(new_element, input.element, size_real * sizeof(T));
this->element = new_element;
}
else
{
this->element = input.element;
this->auto_free = false;
}
}
@ -77,6 +70,33 @@ namespace dl
this->free_element();
}
/**
* @brief
*
* @param input an input Tensor
* @param deep one of true or false
* - true: apply a new memory, copy value from input.element to this new memory
* - false: take over input.element to this->element
* @return Tensor<T>& self
*/
Tensor<T> &copy_element(Tensor<T> &input, bool deep)
{
assert(this->get_size() == input.get_size());
assert(input.element != NULL);
this->malloc_element();
if (deep)
{
tool::copy_memory(this->element, input.element, this->get_size() * sizeof(T));
}
else
{
this->element = input.element;
this->auto_free = false;
}
return *this;
}
/**
* @brief Set the auto free object.
*
@ -120,190 +140,144 @@ namespace dl
}
/**
* @brief Set the shape of Tensor. Initial this->padding = {0}. Initial this->size = -1.
* @brief Set the shape of Tensor.
*
* @param shape shape in
* - 2D: [height, width]
* @param shape the target shape
*
* @return self
*/
Tensor<T> &set_shape(const std::vector<int> shape)
Tensor<T> &set_shape(const std::vector<int> shape);
/**
* @brief print the shape of the Tensor
*
*/
void print_shape()
{
for (int i = 0; i < shape.size(); ++i)
if (this->shape.size())
{
assert(shape[i] > 0);
printf("shape = (");
for (int i = 0; i < this->shape.size() - 1; i++)
{
printf("%d, ", this->shape[i]);
}
printf("%d)\n", this->shape.back());
}
else
{
printf("shape = ()\n");
}
this->shape = shape;
this->shape_with_padding = shape;
this->size = -1;
this->padding = std::vector<int>(((this->shape.size() - 1) << 1), 0);
return *this;
}
/**
* @brief Set the padding size object.
* @brief flatten the Tensor
*
* @param padding padding size in
* - 2D: [top, bottom, left, right]
* @return self
* @return Tensor<T>& self
*/
Tensor &set_padding_size(std::vector<int> &padding)
{
assert(this->shape.size()); // call Tensor.set_shape() first
assert(this->shape.size() == 3); // TODO: || this->shape.size() == 2
if (this->shape.size() == 3)
{
std::vector<int> new_padding = this->padding;
bool dont_update = true;
if (padding[0] > this->padding[0])
{
new_padding[0] = padding[0];
dont_update = false;
}
if (padding[1] > this->padding[1])
{
new_padding[1] = padding[1];
dont_update = false;
}
if (padding[2] > this->padding[2])
{
new_padding[2] = padding[2];
dont_update = false;
}
if (padding[3] > this->padding[3])
{
new_padding[3] = padding[3];
dont_update = false;
}
if (dont_update)
{
return *this;
}
std::vector<int> new_shape_with_padding = this->shape;
new_shape_with_padding[0] += (new_padding[0] + new_padding[1]);
new_shape_with_padding[1] += (new_padding[2] + new_padding[3]);
int new_size = new_shape_with_padding[0] * new_shape_with_padding[1] * new_shape_with_padding[2];
if (this->element) // if this->element != NULL, do padding by copy memory
{
T *new_element = (T *)tool::malloc_aligned(new_size, sizeof(T), 16);
T *dst = new_element + ((new_padding[0] * new_shape_with_padding[1]) + new_padding[2]) * new_shape_with_padding[2];
T *src = this->get_element_ptr();
int offset_dst_next_y = new_shape_with_padding[1] * new_shape_with_padding[2]; // width * channel
int src_copy_length = this->shape[1] * this->shape[2]; // width * channel
int offset_src_next_y = this->shape_with_padding[1] * this->shape_with_padding[2]; // width * channel
for (int y = 0; y < this->shape[0]; y++)
{
tool::copy_memory(dst, src, src_copy_length * sizeof(T));
dst += offset_dst_next_y;
src += offset_src_next_y;
}
if (this->auto_free)
tool::free_aligned(this->element);
this->element = new_element;
this->auto_free = true;
}
this->padding = new_padding;
this->shape_with_padding = new_shape_with_padding;
this->size = new_size;
}
else if (this->shape.size() == 2)
{
printf("Tensor.set_padding_size with this->shape.size() == 2 not implement yet.\n");
}
return *this;
}
Tensor<T> &flatten();
/**
* @brief Set the padding value object.
* @brief Change a new shape to the Tensor without changing its data.
*
* @param padding padding size in
* - 2D: [top, bottom, left, right]
* @param value value to set
* @return self
* @param shape the target shape
* @return Tensor<T>& self
*/
Tensor<T> &set_padding_value(std::vector<int> &padding, T value);
Tensor<T> &reshape(std::vector<int> shape);
/**
* @brief Remove dims with length==1 from Tensor
*
* @param axis the dim to to be remove. make sure the length of the dim is equal to 1.
* if axis == INT32_MAX, all the dims with length==1 will be removed.
* @return Tensor<T>& self
*/
Tensor<T> &squeeze(int axis = INT32_MAX);
/**
* @brief Insert a new dim that will appear at the axis position in the expanded Tensor shape.
*
* @param axis the dim to be inserted
* @return Tensor<T>& self
*/
Tensor<T> &expand_dims(int axis);
/**
* @brief Insert a new dim that will appear at the axis position in the expanded Tensor shape.
*
* @param axis the dim to be inserted
* @return Tensor<T>& self
*/
Tensor<T> &expand_dims(std::vector<int> axis);
/**
* @brief Reverse or permute the axes of the Tensor
*
* @param perm the new arangement of the dims. if perm == {}, the dims arangement will be reversed.
* @return Tensor<T>& self
*/
Tensor<T> &transpose(std::vector<int> perm = {});
/**
* @brief Reverse or permute the axes of the input Tensor
*
* @param input the input Tensor
* @param perm the new arangement of the dims. if perm == {}, the dims arangement will be reversed.
* @return Tensor<T>& self
*/
Tensor<T> &transpose(Tensor<T> &input, std::vector<int> perm = {});
/**
* @brief Get the element pointer.
*
* @param padding padding size in
* - 2D: [top, bottom, left, right]
* @return pointer to memory with padding
* @return pointer to memory
*/
T *get_element_ptr(const std::vector<int> padding = {0, 0, 0, 0})
T *get_element_ptr()
{
assert(this->shape.size() == 3); // TODO: || this->shape.size() == 2
if (this->shape.size() == 3)
{
return this->element + ((this->padding[0] - padding[0]) * this->shape_with_padding[1] + (this->padding[2] - padding[2])) * this->shape_with_padding[2];
}
else if (this->shape.size() == 2)
{
printf("Tensor.get_element_ptr with this->shape.size() == 2 is not implemented.\n");
}
return NULL;
return this->element;
}
/**
* @brief Get the element value.
*
* @param index index in
* - 2D: [y, x, c]
* @param with_padding one of true or false,
* - true: make padding size in count
* - false: do not
* @return element value
* @param index the index of each dim.
* @return T element value
*/
T &get_element_value(const std::vector<int> index, const bool with_padding = false)
T get_element_value(const std::vector<int> index)
{
assert(index.size() == this->shape.size());
assert(this->shape.size() == 3); // TODO: || this->shape() == 2
int i = 0;
if (this->shape.size() == 3)
{
int y = index[0];
int x = index[1];
int c = index[2];
i = with_padding ? (y * this->shape_with_padding[1] + x) * this->shape_with_padding[2] + c : ((y + this->padding[0]) * this->shape_with_padding[1] + x + this->padding[2]) * this->shape_with_padding[2] + c;
}
else if (this->shape.size() == 2)
{
printf("Tensor.get_element_value with this->shape.size() == 2 is not implemented.\n");
}
return this->element[i];
return this->element[this->get_element_index(index)];
}
/**
* @brief Get the size of element.
* @brief Get the element value.
*
* @return size of element including padding
* @param index the index of the element.
* @return T element value
*/
T get_element_value(int index)
{
return this->element[index];
}
/**
* @brief Get the size of Tensor.
*
* @return the size of Tensor.
*/
int get_size()
{
if (this->size == -1) // didn't call Tensor.set_padding_size() before
{
this->size = 1;
for (std::vector<int>::iterator d = this->shape.begin(); d != this->shape.end(); d++)
this->size *= *d;
}
return this->size;
}
/**
* @brief Get the axis offset
*
* @return std::vector<int> the axis offset
*/
std::vector<int> get_axis_offset()
{
return this->axis_offset;
}
/**
* @brief Apply memory with zero-initialized only if this->element is NULL.
*
@ -319,7 +293,7 @@ namespace dl
if (this->element != NULL)
return false;
this->element = (T *)dl::tool::calloc_aligned(this->get_size(), sizeof(T), 16);
this->element = (T *)dl::tool::calloc_aligned_prefer(this->get_size(), sizeof(T), 16);
this->auto_free = auto_free;
return true;
@ -340,31 +314,7 @@ namespace dl
if (this->element != NULL)
return false;
this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16);
this->auto_free = auto_free;
return true;
}
/**
* @brief If this->element != NULL no memory will be applied and no value will be set in padding.
* Else apply memory without initialized and set value to padding.
*
* @param padding_value value to set in padding
* @param auto_free one of true of false
* - true: free element when object destroyed
* - false: do not
* @return
* - true: apply memory and set padding value successfully
* - false: no memory applied and no padding value set
*/
bool apply_element(const T padding_value = 0, const bool auto_free = true)
{
if (this->element != NULL)
return false;
this->element = (T *)tool::malloc_aligned(this->get_size(), sizeof(T), 16);
this->set_padding_value(this->padding, padding_value);
this->element = (T *)tool::malloc_aligned_prefer(this->get_size(), sizeof(T), 16);
this->auto_free = auto_free;
return true;
@ -379,258 +329,56 @@ namespace dl
{
if (this->auto_free && this->element)
{
tool::free_aligned(this->element);
tool::free_aligned_prefer(this->element);
this->element = NULL;
}
}
/**
* @brief Print the shape of Tensor in format "shape = ({top_padding} + {height} + {bottom_padding}, {left_padding} + {width} + {right_padding}, {channel}(channel_with_padding))\n".
* @brief print the element of the tensor
*
* @param axis_index_range the element range of each dims to be print. if axis_index_range == {}, all the element will be print.
* @param message to print
*/
void print_shape()
{
printf("shape = (%d + %d + %d, %d + %d + %d, %d(%d))\n",
this->padding[0], this->shape[0], this->padding[1],
this->padding[2], this->shape[1], this->padding[3],
this->shape[2], this->shape_with_padding[2]);
}
void print(std::vector<int> axis_index_range = {}, const char *message = "");
/**
* @brief Take numpy for example, this function print Tensor[y_start:y_end, x_start:x_end, c_start:c_end].
* @brief print all the element of the Tensor.
*
* inner box is effective value of Tensor, "0" around is padding.
*
* (with padding)
* 00000000000000000000000000000000000000000000000000
* 00000000000000000000000000000000000000000000000000
* 00000000000000000000000000000000000000000000000000
* 000000(without padding) 00000000
* 000000 00000000
* 000000 00000000
* 000000 effective value 00000000
* 000000 00000000
* 000000 00000000
* 00000000000000000000000000000000000000000000000000
* 00000000000000000000000000000000000000000000000000
* 00000000000000000000000000000000000000000000000000
*
* @param y_start start index in height
* @param y_end end index in height
* @param x_start start index in width
* @param x_end end index in width
* @param c_start start index in channel
* @param c_end end index in channel
* @param message to print
* @param axis print aligned this axis, effective only if all y_end - y_start, x_end - x_start and c_end - c_start equals to 1
* @param message to print
* @param with_padding one of true or false,
* - true: count from (with padding) in upper image
* - false: count from (without padding) in upper image
* - true: the padding element will also be ed
* - false: the padding element will not be ed
*/
void print(int y_start, int y_end,
int x_start, int x_end,
int c_start, int c_end,
const char *message, int axis = 0, const bool with_padding = false)
void print_all(const char *message = "")
{
assert(y_end > y_start);
assert(x_end > x_start);
assert(c_end > c_start);
y_start = DL_MAX(y_start, 0);
x_start = DL_MAX(x_start, 0);
c_start = DL_MAX(c_start, 0);
if (with_padding)
{
y_end = DL_MIN(y_end, this->shape_with_padding[0]);
x_end = DL_MIN(x_end, this->shape_with_padding[1]);
c_end = DL_MIN(c_end, this->shape_with_padding[2]);
}
else
{
y_end = DL_MIN(y_end, this->shape[0]);
x_end = DL_MIN(x_end, this->shape[1]);
c_end = DL_MIN(c_end, this->shape[2]);
}
printf("%s[%d:%d, %d:%d, %d:%d] | ", message, y_start, y_end, x_start, x_end, c_start, c_end);
std::cout << "\n"
<< message << " | ";
this->print_shape();
if (y_end - y_start == 1)
for (int i = 0; i < this->get_size(); i++)
{
if (x_end - x_start == 1)
{
for (int c = c_start; c < c_end; c++)
printf("%7d", c);
printf("\n");
for (int c = c_start; c < c_end; c++)
printf("%7d", this->get_element_value({y_start, x_start, c}, with_padding));
printf("\n");
return;
}
else
{
if (c_end - c_start == 1)
{
for (int x = x_start; x < x_end; x++)
printf("%7d", x);
printf("\n");
for (int x = x_start; x < x_end; x++)
printf("%7d", this->get_element_value({y_start, x, c_start}, with_padding));
printf("\n");
return;
}
}
std::cout << this->element[i] << " ";
}
else
{
if (x_end - x_start == 1)
{
if (c_end - c_start == 1)
{
for (int y = y_start; y < y_end; y++)
printf("%7d", y);
printf("\n");
for (int y = y_start; y < y_end; y++)
printf("%7d", this->get_element_value({y, x_start, c_start}, with_padding));
printf("\n");
return;
}
}
}
if (y_end - y_start == 1)
axis = 0;
if (x_end - x_start == 1)
axis = 1;
if (c_end - c_start == 1)
axis = 2;
if (axis == 0)
{
// ______c
// |
// |
// x
//
for (int y = y_start; y < y_end; y++)
{
printf("y = %d\n ", y);
for (int c = c_start; c < c_end; c++)
printf("%7d", c);
printf("\n");
for (int x = x_start; x < x_end; x++)
{
printf("%5d", x);
for (int c = c_start; c < c_end; c++)
printf("%7d", this->get_element_value({y, x, c}, with_padding));
printf("\n");
}
printf("\n");
}
}
else if (axis == 1)
{
// ______c
// |
// |
// y
//
for (int x = x_start; x < x_end; x++)
{
printf("x = %d\n ", x);
for (int c = c_start; c < c_end; c++)
printf("%7d", c);
printf("\n");
for (int y = y_start; y < y_end; y++)
{
printf("%5d", y);
for (int c = c_start; c < c_end; c++)
printf("%7d", this->get_element_value({y, x, c}, with_padding));
printf("\n");
}
printf("\n");
}
}
else
{
// ______x
// |
// |
// y
//
for (int c = c_start; c < c_end; c++)
{
printf("c = %d\n ", c);
for (int x = x_start; x < x_end; x++)
printf("%7d", x);
printf("\n");
for (int y = y_start; y < y_end; y++)
{
printf("%5d", y);
for (int x = x_start; x < x_end; x++)
printf("%7d", this->get_element_value({y, x, c}, with_padding));
printf("\n");
}
printf("\n");
}
}
std::cout << "\n";
return;
}
/**
* @brief print all the element of the Tensor.
* @brief Get the index of each dims
*
* @param message to print
* @param with_padding one of true or false,
* - true: the padding element will also be printed
* - false: the padding element will not be printed
* @param element_index the index of the element
* @return std::vector<int> the index of each dims
*/
void print_all(const char *message, const bool with_padding = false)
{
int y_end;
int x_end;
int c_end;
if (with_padding)
{
y_end = this->shape_with_padding[0];
x_end = this->shape_with_padding[1];
c_end = this->shape_with_padding[2];
}
else
{
y_end = this->shape[0];
x_end = this->shape[1];
c_end = this->shape[2];
}
std::vector<int> get_axis_index(int element_index);
printf("\n%s | ", message);
this->print_shape();
for (int y = 0; y < y_end; y++)
{
for (int x = 0; x < x_end; x++)
{
for (int c = 0; c < c_end; c++)
printf("%d ", this->get_element_value({y, x, c}, with_padding));
}
}
printf("\n");
return;
}
        /**
         * @brief Get the index of element
         *
         * Converts a per-dimension index vector into the flat (row-major)
         * element index for this Tensor's shape. Defined out of line;
         * inverse of get_axis_index().
         *
         * @param axis_index the index of each dims
         * @return int the index of element
         */
        int get_element_index(const std::vector<int> axis_index);
/**
* @brief Check the element value with input ground-truth.
@ -638,35 +386,39 @@ namespace dl
* @param gt_element ground-truth value of element
* @param bias permissible error
* @param info one of true or false
* - true: print shape and result
* - true: shape and result
* - false: do not
* @param failed_number maximum number of wrong element that will be printed
*
* @return
* - true: in permissible error
* - false: not
*/
bool check_element(T *gt_element, int bias = 2, bool info = true)
bool check_element(T *gt_element, int bias = 2, bool info = true, int failed_number = 0)
{
int count = 0;
if (info)
this->print_shape();
int i = 0;
for (int y = 0; y < this->shape[0]; y++)
int size = this->get_size();
for (int i = 0; i < size; i++)
{
for (int x = 0; x < this->shape[1]; x++)
if (DL_ABS(this->element[i] - gt_element[i]) > bias)
{
for (int c = 0; c < this->shape[2]; c++)
std::vector<int> index = get_axis_index(i);
std::cout << "element[";
for (int j = 0; j < index.size() - 1; j++)
{
int a = this->get_element_value({y, x, c});
int b = gt_element[i];
int offset = DL_ABS(a - b);
if (offset > bias)
{
printf("element[%d, %d, %d]: %d v.s. %d\n", y, x, c, a, b);
return false;
}
i++;
std::cout << index[j] << ", ";
}
std::cout << index.back() << "]: ";
std::cout << +this->element[i] << " v.s. " << +gt_element[i] << "\n";
count++;
if (count > failed_number)
return false;
}
}
if (count)
return false;
if (info)
printf("PASS\n");
@ -700,35 +452,44 @@ namespace dl
Tensor<T> &operator=(const Tensor<T> &input)
{
this->size = input.size;
this->auto_free = input.auto_free;
this->exponent = input.exponent;
this->shape = input.shape;
this->padding = input.padding;
int size_real_tmp = this->shape_with_padding.size() ? this->shape_with_padding[0] * this->shape_with_padding[1] * this->shape_with_padding[2] : 0;
int size_input_real = input.shape_with_padding.size() ? input.shape_with_padding[0] * input.shape_with_padding[1] * input.shape_with_padding[2] : 0;
this->shape_with_padding = input.shape_with_padding;
if (this->element)
int size_real_tmp = this->size;
int size_input_real = input.size;
this->set_shape(input.shape);
if (input.element)
{
if (size_real_tmp != size_input_real)
if (this->element)
{
tool::free_aligned(this->element);
T *new_element = (T *)tool::calloc_aligned(size_input_real, sizeof(T), 16);
tool::copy_memory(new_element, input.element, size_input_real * sizeof(T));
this->element = new_element;
if (size_real_tmp != size_input_real)
{
tool::free_aligned_prefer(this->element);
T *new_element = (T *)tool::malloc_aligned_prefer(size_input_real, sizeof(T), 16);
tool::copy_memory(new_element, input.element, size_input_real * sizeof(T));
this->element = new_element;
}
else
{
tool::copy_memory(this->element, input.element, size_input_real * sizeof(T));
}
}
else
{
tool::copy_memory(this->element, input.element, size_input_real * sizeof(T));
T *new_element = (T *)tool::malloc_aligned_prefer(size_input_real, sizeof(T), 16);
tool::copy_memory(new_element, input.element, size_input_real * sizeof(T));
this->element = new_element;
}
return *this;
}
else
{
T *new_element = (T *)tool::calloc_aligned(size_input_real, sizeof(T), 16);
tool::copy_memory(new_element, input.element, size_input_real * sizeof(T));
this->element = new_element;
if (this->element)
{
tool::free_aligned_prefer(this->element);
this->element = NULL;
}
return *this;
}
return *this;
}
};
} // namespace dl