Update IDF to e931fe9 and add esp-face (#2291)

* Update IDF to e931fe9 and add esp-face * Fix PIO builds fail because of sketch size * Fix example build failing for Arduino
2019-01-09 19:31:46 +01:00
parent 452c27a74a
commit fa61b3bffe
106 changed files with 2794 additions and 276 deletions
--- a/tools/sdk/include/bt/esp_gap_ble_api.h
+++ b/tools/sdk/include/bt/esp_gap_ble_api.h
@ -1140,7 +1140,7 @@ esp_err_t esp_ble_passkey_reply(esp_bd_addr_t bd_addr, bool accept, uint32_t pas


 /**
-* @brief           Reply the confirm value to the peer device in the legacy connection stage.
+* @brief           Reply the confirm value to the peer device in the secure connection stage.
 *
 * @param[in]       bd_addr : BD address of the peer device
 * @param[in]       accept : numbers to compare are the same or different.
--- a/tools/sdk/include/config/sdkconfig.h
+++ b/tools/sdk/include/config/sdkconfig.h
@ -190,6 +190,7 @@
 #define CONFIG_LWIP_SO_REUSE_RXTOALL 1
 #define CONFIG_MB_CONTROLLER_NOTIFY_TIMEOUT 20
 #define CONFIG_PARTITION_TABLE_SINGLE_APP 1
+#define CONFIG_XTENSA_IMPL 1
 #define CONFIG_UNITY_ENABLE_FLOAT 1
 #define CONFIG_ESP32_WIFI_RX_BA_WIN 6
 #define CONFIG_MBEDTLS_X509_CSR_PARSE_C 1
@ -233,6 +234,7 @@
 #define CONFIG_LOG_BOOTLOADER_LEVEL 0
 #define CONFIG_MBEDTLS_TLS_ENABLED 1
 #define CONFIG_LWIP_MAX_RAW_PCBS 16
+#define CONFIG_BTU_TASK_STACK_SIZE 4096
 #define CONFIG_SMP_ENABLE 1
 #define CONFIG_SPIRAM_SIZE -1
 #define CONFIG_MBEDTLS_SSL_SESSION_TICKETS 1
--- a/tools/sdk/include/esp-face/dl_lib.h
+++ b/tools/sdk/include/esp-face/dl_lib.h
@ -0,0 +1,336 @@
+#ifndef DL_LIB_H
+#define DL_LIB_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrix3d.h"
+#include "dl_lib_matrix3dq.h"
+
+    typedef int padding_state;
+    /**
+     * @brief Does a fast version of the exp() operation on a floating point number.
+     *
+     * As described in https://codingforspeed.com/using-faster-exponential-approximation/
+     * Should be good up to an input of 5 or so with a steps factor of 8.
+     *
+     * @param x Floating point input
+     * @param steps Approximation steps. More is more precise. 8 or 10 should be good enough for most purposes.
+     * @return Approximation of exp(x)
+     */
+    fptp_t fast_exp(double x, int steps);
+
+    /**
+     * @brief Does a softmax operation on a matrix.
+     *
+     * @param in        Input matrix
+     * @param out       Output matrix. Can be the same as the input matrix; if so,
+                                             output results overwrite the input.
+    */
+    void dl_softmax(const dl_matrix2d_t *in,
+                    dl_matrix2d_t *out);
+
+    /**
+     * @brief Does a softmax operation on a quantized matrix.
+     *
+     * @param in        Input matrix
+     * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+     */
+    void dl_softmax_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+    /**
+     * @brief Does a sigmoid operation on a floating point number
+     *
+     * @param in Floating point input
+     * @return Sigmoid output
+     */
+    fptp_t dl_sigmoid_op(fptp_t in);
+
+    /**
+     * @brief Does a sigmoid operation on a matrix.
+     *
+     * @param in        Input matrix
+     * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+     */
+    void dl_sigmoid(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+    /**
+     * @brief Does a tanh operation on a floating point number
+     *
+     * @param v         Floating point input number
+     * @return Tanh value
+     */
+    fptp_t dl_tanh_op(fptp_t v);
+
+    /**
+     * @brief Does a tanh operation on a matrix.
+     *
+     * @param in        Input matrix
+     * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+     */
+    void dl_tanh(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+    /**
+     * @brief Does a relu (Rectifier Linear Unit) operation on a floating point number
+     *
+     * @param in        Floating point input
+     * @param clip      If value is higher than this, it will be clipped to this value
+     * @return Relu output
+     */
+    fptp_t dl_relu_op(fptp_t in, fptp_t clip);
+
+    /**
+     * @brief Does a ReLu operation on a matrix.
+     *
+     * @param in        Input matrix
+     * @param clip      If values are higher than this, they will be clipped to this value
+     * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+     */
+    void dl_relu(const dl_matrix2d_t *in, fptp_t clip, dl_matrix2d_t *out);
+
+    /**
+     * @brief Fully connected layer operation
+     *
+     * @param in        Input vector
+     * @param weight    Weights of the neurons
+     * @param bias      Biases for the neurons. Can be NULL if a bias of 0 is required.
+     * @param out       Output array. Outputs are placed here. Needs to be an initialized, weight->w by in->h in size, matrix.
+     */
+    void dl_fully_connect_layer(const dl_matrix2d_t *in,
+                                const dl_matrix2d_t *weight,
+                                const dl_matrix2d_t *bias,
+                                dl_matrix2d_t *out);
+
+    /**
+     * @brief Pre-calculate the sqrtvari variable for the batch_normalize function.
+     * The sqrtvari matrix depends on the variance and epsilon values, which normally are constant. Hence,
+     * this matrix only needs to be calculated once. This function does that.
+     *
+     * @param variance,epsilon  Variance matrix and epsilon value that the sqrtvari matrix depends on
+     * @param out               Matrix that receives the result (presumably the sqrtvari matrix; confirm against the implementation)
+     */
+    void dl_batch_normalize_get_sqrtvar(const dl_matrix2d_t *variance,
+                                        fptp_t epsilon,
+                                        dl_matrix2d_t *out);
+
+    /**
+     * @brief Batch-normalize a matrix
+     *
+     * @param m         The matrix to normalize
+     * @param offset    Offset matrix
+     * @param scale     Scale matrix
+     * @param mean      Mean matrix
+     * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+     * @note Returns nothing; m is the only non-const argument, so presumably it is normalized in place (TODO confirm)
+     */
+    void dl_batch_normalize(dl_matrix2d_t *m,
+                            const dl_matrix2d_t *offset,
+                            const dl_matrix2d_t *scale,
+                            const dl_matrix2d_t *mean,
+                            const dl_matrix2d_t *sqrtvari);
+
+    /**
+     * @brief Do a basic LSTM layer pass.
+     *
+     * @warning Returns state_h pointer, so do not free result.
+     *
+     * @param in        Input vector
+     * @param state_c   Internal state of the LSTM network
+     * @param state_h   Internal state (previous output values) of the LSTM network
+     * @param weight    Weights for the neurons
+     * @param bias      Bias for the neurons. Can be NULL if no bias is required
+     * @return          Output values of the neurons
+     */
+    dl_matrix2d_t *dl_basic_lstm_layer(const dl_matrix2d_t *in,
+                                       dl_matrix2d_t *state_c,
+                                       dl_matrix2d_t *state_h,
+                                       const dl_matrix2d_t *weight,
+                                       const dl_matrix2d_t *bias);
+
+    /**
+     * @brief Do a basic LSTM layer pass, partial quantized version.
+     * This LSTM function accepts 16-bit fixed-point weights and 32-bit floating-point bias.
+     *
+     * @warning Returns state_h pointer, so do not free result.
+     *
+     * @param in        Input vector
+     * @param state_c   Internal state of the LSTM network
+     * @param state_h   Internal state (previous output values) of the LSTM network
+     * @param weight    Weights for the neurons, need to be quantised
+     * @param bias      Bias for the neurons. Can be NULL if no bias is required
+     * @return          Output values of the neurons
+     */
+    dl_matrix2d_t *dl_basic_lstm_layer_quantised_weights(const dl_matrix2d_t *in,
+                                                         dl_matrix2d_t *state_c,
+                                                         dl_matrix2d_t *state_h,
+                                                         const dl_matrix2dq_t *weight,
+                                                         const dl_matrix2d_t *bias);
+
+    /**
+     * @brief Do a fully-connected layer pass, fully-quantized version.
+     *
+     * @param in        Input vector
+     * @param weight    Weights of the neurons
+     * @param bias      Bias values of the neurons. Can be NULL if no bias is needed.
+     * @param out       Output values of the neurons are placed here
+     * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+     */
+    void dl_fully_connect_layer_q(const dl_matrix2dq_t *in,
+                                  const dl_matrix2dq_t *weight,
+                                  const dl_matrix2dq_t *bias,
+                                  dl_matrix2dq_t *out,
+                                  int shift);
+
+    /**
+     * @brief Do a basic LSTM layer pass, fully-quantized version
+     *
+     * @warning Returns state_h pointer, so do not free result.
+     *
+     * @param in        Input vector
+     * @param state_c   Internal state of the LSTM network
+     * @param state_h   Internal state (previous output values) of the LSTM network
+     * @param weight    Weights for the neurons
+     * @param bias      Bias for the neurons. Can be NULL if no bias is required
+     * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+     * @return          Output values of the neurons
+     */
+    dl_matrix2dq_t *dl_basic_lstm_layer_q(const dl_matrix2dq_t *in,
+                                          dl_matrix2dq_t *state_c,
+                                          dl_matrix2dq_t *state_h,
+                                          const dl_matrix2dq_t *weight,
+                                          const dl_matrix2dq_t *bias,
+                                          int shift);
+
+    /**
+     * @brief Batch-normalize a matrix, fully-quantized version
+     *
+     * @param m         The matrix to normalize
+     * @param offset    Offset matrix
+     * @param scale     Scale matrix
+     * @param mean      Mean matrix
+     * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+     * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+     * @note Returns nothing; m is the only non-const argument, so presumably it is normalized in place (TODO confirm)
+     */
+    void dl_batch_normalize_q(dl_matrix2dq_t *m,
+                              const dl_matrix2dq_t *offset,
+                              const dl_matrix2dq_t *scale,
+                              const dl_matrix2dq_t *mean,
+                              const dl_matrix2dq_t *sqrtvari,
+                              int shift);
+
+    /**
+     * @brief Does a relu (Rectifier Linear Unit) operation on a fixed-point number
+     * This accepts and returns fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+     * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+     *
+     * @param in        Fixed-point input
+     * @param clip      If value is higher than this, it will be clipped to this value
+     * @return Relu output
+     */
+    qtp_t dl_relu_q_op(qtp_t in,
+                       qtp_t clip);
+
+    /**
+     * @brief Does a ReLu operation on a matrix, quantized version
+     *
+     * @param in        Input matrix
+     * @param clip      If values are higher than this, they will be clipped to this value
+     * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+     */
+    void dl_relu_q(const dl_matrix2dq_t *in,
+                   fptp_t clip,
+                   dl_matrix2dq_t *out);
+
+    /**
+     * @brief Does a sigmoid operation on a fixed-point number.
+     * This accepts and returns a fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+     * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+     *
+     * @param in Fixed-point input
+     * @return Sigmoid output
+     */
+    int dl_sigmoid_op_q(const int in);
+
+    /**
+     * @brief Does a sigmoid operation on a matrix, quantized version
+     *
+     * @param in        Input matrix
+     * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+     */
+    void dl_sigmoid_q(const dl_matrix2dq_t *in,
+                      dl_matrix2dq_t *out);
+
+    /**
+     * @brief Does a tanh operation on a matrix, quantized version
+     *
+     * @param in        Input matrix
+     * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+     */
+    void dl_tanh_q(const dl_matrix2dq_t *in,
+                   dl_matrix2dq_t *out);
+
+    /**
+     * @brief Do a basic CNN layer pass.
+     *
+     * @warning This only supports a single-channel input image, and the output is a single-row matrix.
+     *          That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+     *
+     * @param in             Input single channel image
+     * @param weight         Weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height
+     * @param bias           Bias for the CNN layer.
+     * @param filter_width   The width of convolution kernel
+     * @param filter_height  The height of convolution kernel
+     * @param out_channels   The number of output channels of convolution kernel
+     * @param stride_x       The step length of the convolution window in x(width) direction
+     * @param stride_y       The step length of the convolution window in y(height) direction
+     * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+     * @param out            The result of CNN layer, out->h=1.
+     * @return               The result of CNN layer.
+     */
+    dl_matrix2d_t *dl_basic_conv_layer(const dl_matrix2d_t *in,
+                                       const dl_matrix2d_t *weight,
+                                       const dl_matrix2d_t *bias,
+                                       int filter_width,
+                                       int filter_height,
+                                       const int out_channels,
+                                       const int stride_x,
+                                       const int stride_y,
+                                       padding_state pad,
+                                       const dl_matrix2d_t *out);
+
+    /**
+     * @brief Do a basic CNN layer pass, quantised version.
+     *
+     * @warning This only supports a single-channel input image, and the output is a single-row matrix.
+     *          That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+     *
+     * @param in             Input single channel image
+     * @param weight         Quantised weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height
+     * @param bias           Bias of the neurons.
+     * @param filter_width   The width of convolution kernel
+     * @param filter_height  The height of convolution kernel
+     * @param out_channels   The number of output channels of convolution kernel
+     * @param stride_x       The step length of the convolution window in x(width) direction
+     * @param stride_y       The step length of the convolution window in y(height) direction
+     * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+     * @param out            The result of CNN layer, out->h=1
+     * @return               The result of CNN layer
+     */
+    dl_matrix2d_t *dl_basic_conv_layer_quantised_weight(const dl_matrix2d_t *in,
+                                                        const dl_matrix2dq_t *weight,
+                                                        const dl_matrix2d_t *bias,
+                                                        int filter_width,
+                                                        int filter_height,
+                                                        const int out_channels,
+                                                        const int stride_x,
+                                                        const int stride_y,
+                                                        padding_state pad,
+                                                        const dl_matrix2d_t *out);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/tools/sdk/include/esp-face/dl_lib_coefgetter_if.h
+++ b/tools/sdk/include/esp-face/dl_lib_coefgetter_if.h
@ -0,0 +1,47 @@
+#ifndef DL_LIB_COEFGETTER_IF_H
+#define DL_LIB_COEFGETTER_IF_H
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrix3d.h"
+#include "dl_lib_matrix3dq.h"
+
+//Set this if the coefficient requested is a batch-normalization popvar matrix which needs to be preprocessed by
+//dl_batch_normalize_get_sqrtvar first.
+#define COEF_GETTER_HINT_BNVAR (1<<0)
+
+/*
+This struct describes the basic information of model data:
+word_num: the number of wake words or speech commands
+word_list: the name list of wake words or speech commands
+win_list: integer list whose meaning is not described here (presumably one window value per word -- TODO confirm)
+thresh_list: the threshold list of wake words or speech commands
+info_str: the string describing the model data: network architecture, model data version, wake words and their thresholds
+*/
+typedef struct {
+    int word_num;
+    char **word_list;
+    int *win_list;
+    float *thresh_list;
+    char *info_str;
+} model_info_t;
+
+/*
+This struct describes a generic coefficient getter: a way to get the constant coefficients needed for a neural network.
+For the four matrix getters (getter_f, getter_q, getter_3d, getter_3dq), the name describes the name of the coefficient matrix,
+usually the same as the Numpy filename the coefficient was originally stored in. The arg argument can be used to optionally pass
+an additional user-defined argument to the getter (e.g. the directory to look for files in the case of the Numpy file loader
+getter). The hint argument is a bitwise OR of the COEF_GETTER_HINT_* flags or 0 when none is needed. Use the free_f/free_q
+functions to release the memory for the returned 2D matrices, when applicable. getter_info returns a pointer to the model_info_t.
+*/
+typedef struct {
+    const dl_matrix2d_t* (*getter_f)(const char *name, void *arg, int hint);
+    const dl_matrix2dq_t* (*getter_q)(const char *name, void *arg, int hint);
+    const dl_matrix3d_t* (*getter_3d)(const char *name, void *arg, int hint);
+    const dl_matrix3dq_t* (*getter_3dq)(const char *name, void *arg, int hint);
+    void (*free_f)(const dl_matrix2d_t *m);
+    void (*free_q)(const dl_matrix2dq_t *m);
+    const model_info_t* (*getter_info)(void *arg);
+} model_coeff_getter_t;
+
+#endif
--- a/tools/sdk/include/esp-face/dl_lib_matrix.h
+++ b/tools/sdk/include/esp-face/dl_lib_matrix.h
@ -0,0 +1,216 @@
+#ifndef DL_LIB_MATRIX_H
+#define DL_LIB_MATRIX_H
+
+typedef float fptp_t;
+
+
+//Flags for matrices (OR-ed together in dl_matrix2d_t.flags)
+#define DL_MF_FOREIGNDATA (1<<0)  /**< Matrix *item data actually points to another matrix and should not be freed */
+
+//'Normal' float matrix
+typedef struct {
+    int w;          /**< Width */
+    int h;          /**< Height */
+    int stride;     /**< Row stride, essentially how many items to skip to get to the same position in the next row */
+    int flags;      /**< Flags. OR of DL_MF_* values */
+    fptp_t *item;   /**< Pointer to item array */
+} dl_matrix2d_t;
+
+//Macro to quickly access the raw items in a matrix. NOTE(review): m is expanded without parentheses, so pass a plain pointer identifier, not an expression.
+#define DL_ITM(m, x, y) m->item[(x)+(y)*m->stride]
+
+
+//#define DL_ITM3D(m, n, x, y, z) (m)->item[(n) * (m)->stride * (m)->c + (z) * (m)->stride + (y) * (m)->w + (x)]
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_alloc(int w, int h);
+
+
+/**
+ * @brief Free a matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->items space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrix_free(dl_matrix2d_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrix_zero(dl_matrix2d_t *m);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * The data of the new matrix is not allocated/copied; it re-uses a pointer to the existing data, so
+ * changing the data in the resulting matrix also changes the data in the matrix that has been sliced.
+ *
+ * @param src   Existing matrix to slice
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_slice(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief Select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @warning The results are flattened in row-major order.
+ * @param src   Existing matrix to select the items from
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return  The resulting flattened matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_flatten(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief Generate a matrix from existing floating-point data
+ * @param w       Width of resulting matrix
+ * @param h       Height of resulting matrix
+ * @param stride  Presumably the row stride of data, counted in items like dl_matrix2d_t.stride (TODO confirm)
+ * @param data    Data to populate matrix with
+ * @return A newly allocated matrix populated with the given input data, or NULL if out of memory.
+ */
+dl_matrix2d_t *dl_matrix_from_data(int w, int h, int stride, const void *data);
+
+
+/**
+ * @brief Multiply a pair of matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_mul(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two matrices : res=a.b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_dot(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Add a pair of matrices item-by-item: out=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Added data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_add(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Divide a pair of matrices item-by-item: out=a/b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_div(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Subtract a matrix from another, item-by-item: out=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Subtracted data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_sub(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Add a constant to every item of the matrix
+ *
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ */
+void dl_matrix_add_const(dl_matrix2d_t *subj, const fptp_t add);
+
+
+/**
+ * @brief Concatenate the rows of two matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated matrix with as values a|b
+ */
+dl_matrix2d_t *dl_matrix_concat(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+/**
+ * @brief Print the contents of a matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrix(const dl_matrix2d_t *a);
+
+/**
+ * @brief Return the average square error given a correct and a test matrix.
+ *
+ * ...Well, more or less. If anything, it gives an indication of the error between
+ * the two. Check the code for the exact implementation.
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return value indicating the relative difference between matrices
+ */
+float dl_matrix_get_avg_sq_err(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+
+/**
+ * @brief Check if two matrices have the same shape, that is, the same amount of rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrix_same_shape(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+/**
+ * @brief Get a specific item from the matrix
+ *
+ * Please use these for external matrix access instead of DL_ITM. No bounds checking is done on x or y.
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @return Value in that position, i.e. m->item[x + y*m->stride]
+ */
+inline static fptp_t dl_matrix_get(const dl_matrix2d_t *m, const int x, const int y) { 
+    return DL_ITM(m, x, y);
+}
+
+/**
+ * @brief Set a specific item in the matrix to the given value
+ *
+ * Please use these for external matrix access instead of DL_ITM. No bounds checking is done on x or y.
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @param val   Value to write to that position, i.e. to m->item[x + y*m->stride]
+ */
+inline static void dl_matrix_set(dl_matrix2d_t *m, const int x, const int y, fptp_t val) { 
+    DL_ITM(m, x, y)=val;
+}
+
+#endif
+
--- a/tools/sdk/include/esp-face/dl_lib_matrix3d.h
+++ b/tools/sdk/include/esp-face/dl_lib_matrix3d.h
@ -0,0 +1,420 @@
+#pragma once
+
+typedef float fptp_t;   // Item type of the float matrices
+typedef uint8_t uc_t;   // Item type of the 8-bit matrices. NOTE(review): no #include <stdint.h> is visible in this header, so uint8_t must already be declared by the includer
+
+typedef enum
+{
+    DL_C_IMPL = 0,      /**< Do the convolution using the C implementation */
+    DL_XTENSA_IMPL = 1  /**< Do the convolution using the Xtensa implementation */
+} dl_conv_mode;
+
+typedef enum
+{
+    INPUT_UINT8 = 0,    /**< Presumably: input has 8-bit items (dl_matrix3du_t) -- TODO confirm */
+    INPUT_FLOAT = 1,    /**< Presumably: input has float items (dl_matrix3d_t) -- TODO confirm */
+} dl_op_type;
+
+typedef enum
+{
+    PADDING_VALID = 0,  /**< "VALID" padding */
+    PADDING_SAME = 1,   /**< "SAME" padding */
+} dl_padding_type;
+
+/*
+ * 3D matrix with float items
+ * @Warning: the order of the fields is fixed and must not be modified, otherwise there will be errors in esp_dsp_dot_float
+ */
+typedef struct
+{
+    /******* fix start *******/
+    int w; // Width
+    int h; // Height
+    int c; // Channel
+    int n; // Number, to record filter's out_channels. input and output must be 1
+    int stride; // Stride of the item array (unit not documented here -- TODO confirm)
+    fptp_t *item; // Pointer to the float item array
+    /******* fix end *******/
+} dl_matrix3d_t;
+
+typedef struct // 3D matrix with 8-bit (uc_t) items; same fields as dl_matrix3d_t apart from the item type
+{
+    int w; // Width
+    int h; // Height
+    int c; // Channel
+    int n; // Number, to record filter's out_channels. input and output must be 1
+    int stride; // Stride of the item array (unit not documented here -- TODO confirm)
+    uc_t *item; // Pointer to the 8-bit item array
+} dl_matrix3du_t;
+
+typedef struct // Convolution settings bundled in one struct
+{
+    int stride_x; // Presumably the step length of the convolution window in x (width) direction -- TODO confirm
+    int stride_y; // Presumably the step length of the convolution window in y (height) direction -- TODO confirm
+    dl_padding_type padding; // PADDING_VALID or PADDING_SAME
+    dl_conv_mode mode; // DL_C_IMPL or DL_XTENSA_IMPL
+    dl_op_type type; // INPUT_UINT8 or INPUT_FLOAT
+} dl_matrix3d_conv_config_t;
+
+/**
+ * @brief Allocate a 3D matrix with float items, the access sequence is NHWC
+ *
+ * @param n     Number of matrix3d, for filters it is out channels, for others it is 1
+ * @param w     Width of matrix3d
+ * @param h     Height of matrix3d
+ * @param c     Channel of matrix3d
+ * @return      3d matrix
+ */
+dl_matrix3d_t *dl_matrix3d_alloc(int n, int w, int h, int c);
+
+/**
+ * @brief Allocate a 3D matrix with 8-bit items, the access sequence is NHWC
+ *
+ * @param n     Number of matrix3d, for filters it is out channels, for others it is 1
+ * @param w     Width of matrix3d
+ * @param h     Height of matrix3d
+ * @param c     Channel of matrix3d
+ * @return      3d matrix
+ */
+dl_matrix3du_t *dl_matrix3du_alloc(int n, int w, int h, int c);
+
+/**
+ * @brief Free a matrix3d with float items
+ *
+ * @param m matrix3d with float items
+ */
+void dl_matrix3d_free(dl_matrix3d_t *m);
+
+/**
+ * @brief Free a matrix3d with 8-bit items
+ *
+ * @param m matrix3d with 8-bit items
+ */
+void dl_matrix3du_free(dl_matrix3du_t *m);
+
+/**
+ * @brief Do a relu (Rectifier Linear Unit) operation, update the input matrix3d
+ *
+ * @param m         Floating point input matrix3d, updated with the result
+ * @param clip      If value is higher than this, it will be clipped to this value
+ */
+void dl_matrix3d_relu(dl_matrix3d_t *m, fptp_t clip);
+
+/**
+ * @brief Do a leaky relu (Rectifier Linear Unit) operation, update the input matrix3d
+ *
+ * @param m         Floating point input matrix3d, updated with the result
+ * @param clip      If value is higher than this, it will be clipped to this value
+ * @param alpha     If value is less than zero, it will be updated by multiplying this factor
+ */
+void dl_matrix3d_leaky_relu(dl_matrix3d_t *m, fptp_t clip, fptp_t alpha);
+
+/**
+ * @brief Do a softmax operation on a matrix3d
+ *
+ * @param m         Input matrix3d
+ */
+void dl_matrix3d_softmax(dl_matrix3d_t *m);
+
+/**
+ * @brief Do a general fully connected layer pass, dimension is (number, width, height, channel)
+ *
+ * @param in             Input matrix3d, size is (1, w, 1, 1)
+ * @param filter         Weights of the neurons, size is (1, w, h, 1)
+ * @param bias           Bias for the fc layer, size is (1, 1, 1, h)
+ * @return               The result of fc layer, size is (1, 1, 1, h)
+ */
+dl_matrix3d_t *dl_matrix3d_fc(dl_matrix3d_t *in,
+                              dl_matrix3d_t *filter,
+                              dl_matrix3d_t *bias);
+
+/**
+ * @brief Copy a range of float items from an existing matrix to a preallocated matrix
+ *
+ * @param dst   The destination slice matrix
+ * @param src   The source matrix to slice
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ */
+void dl_matrix3d_slice_copy(dl_matrix3d_t *dst,
+                            dl_matrix3d_t *src,
+                            int x,
+                            int y,
+                            int w,
+                            int h);
+
+/**
+ * @brief Copy a range of 8-bits items from an existing matrix to a preallocated matrix
+ *
+ * @param dst   The destination slice matrix
+ * @param src   The source matrix to slice
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ */
+void dl_matrix3du_slice_copy(dl_matrix3du_t *dst,
+                             dl_matrix3du_t *src,
+                             int x,
+                             int y,
+                             int w,
+                             int h);
+
+/**
+ * @brief Do a general CNN layer pass, dimension is (number, width, height, channel)
+ *
+ * @param in             Input matrix3d
+ * @param filter         Weights of the neurons
+ * @param bias           Bias for the CNN layer
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param padding        One of VALID or SAME
+ * @param mode           Convolution implementation to use: 0 for the C implementation, 1 for the xtensa implementation.
+ *                       If ESP_PLATFORM is not defined, this value is not used. Default is 0
+ * @return               The result of CNN layer
+ */
+dl_matrix3d_t *dl_matrix3d_conv(dl_matrix3d_t *in,
+                                dl_matrix3d_t *filter,
+                                dl_matrix3d_t *bias,
+                                int stride_x,
+                                int stride_y,
+                                int padding,
+                                int mode);
+
+/**
+ * @brief Do a general CNN layer pass on a dl_matrix3du_t input, dimension is (number, width, height, channel)
+ *
+ * @param in             Input matrix3du
+ * @param filter         Weights of the neurons
+ * @param bias           Bias for the CNN layer
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param padding        One of VALID or SAME
+ * @param mode           Convolution implementation to use: 0 for the C implementation, 1 for the xtensa implementation.
+ *                       If ESP_PLATFORM is not defined, this value is not used. Default is 0
+ * @return               The result of CNN layer
+ */
+dl_matrix3d_t *dl_matrix3du_conv(dl_matrix3du_t *in,
+                                 dl_matrix3d_t *filter,
+                                 dl_matrix3d_t *bias,
+                                 int stride_x,
+                                 int stride_y,
+                                 int padding,
+                                 int mode);
+
+/**
+ * @brief Do a depthwise CNN layer pass, dimension is (number, width, height, channel)
+ *
+ * @param in             Input matrix3d
+ * @param filter         Weights of the neurons
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param padding        One of VALID or SAME
+ * @param mode           Convolution implementation to use: 0 for the C implementation, 1 for the xtensa implementation.
+ *                       If ESP_PLATFORM is not defined, this value is not used. Default is 0
+ * @return               The result of depthwise CNN layer
+ */
+dl_matrix3d_t *dl_matrix3d_depthwise_conv(dl_matrix3d_t *in,
+                                          dl_matrix3d_t *filter,
+                                          int stride_x,
+                                          int stride_y,
+                                          int padding,
+                                          int mode);
+
+/**
+ * @brief Do a mobilenet block forward, dimension is (number, width, height, channel)
+ *        NOTE(review): the parameter descriptions below are inferred from the parameter names -- confirm.
+ * @param in             Input of the block, passed as void *
+ * @param dilate         Weights of the dilate layer
+ * @param depthwise      Weights of the depthwise layer
+ * @param compress       Weights of the compress layer
+ * @param bias           Bias of the block
+ * @param prelu          PReLU parameters of the block
+ * @param config         Convolution configuration (dl_matrix3d_conv_config_t)
+ * @return               The result of the mobilenet block
+ */
+dl_matrix3d_t *dl_matrix3d_mobilenet(void *in,
+                                     dl_matrix3d_t *dilate,
+                                     dl_matrix3d_t *depthwise,
+                                     dl_matrix3d_t *compress,
+                                     dl_matrix3d_t *bias,
+                                     dl_matrix3d_t *prelu,
+                                     dl_matrix3d_conv_config_t *config);
+
+/**
+ * @brief Do a global average pooling layer pass, dimension is (number, width, height, channel)
+ *
+ * @param in             Input matrix3d
+ *
+ * @return               The result of global average pooling layer
+ */
+dl_matrix3d_t *dl_matrix3d_global_pool(dl_matrix3d_t *in);
+
+/**
+ * @brief Do a batch normalization operation, update the input matrix3d: input = input * scale + offset
+ *
+ * @param m              Input matrix3d
+ * @param scale          scale matrix3d,  scale = gamma/((moving_variance+sigma)^(1/2))
+ * @param offset         Offset matrix3d, offset = beta-(moving_mean*gamma/((moving_variance+sigma)^(1/2)))
+ */
+void dl_matrix3d_batch_normalize(dl_matrix3d_t *m,
+                                 dl_matrix3d_t *scale,
+                                 dl_matrix3d_t *offset);
+
+/**
+ * @brief Add a pair of matrix3d item-by-item: res=in_1+in_2
+ *
+ * @param in_1           First Floating point input matrix3d
+ * @param in_2           Second Floating point input matrix3d
+ *
+ * @return               Added data
+ */
+dl_matrix3d_t *dl_matrix3d_add(dl_matrix3d_t *in_1, dl_matrix3d_t *in_2);
+
+/**
+ * @brief Do a standard relu operation, update the input matrix3d
+ *
+ * @param m        Floating point input matrix3d
+ */
+void dl_matrix3d_relu_std(dl_matrix3d_t *m);
+
+/**
+ * @brief Concatenate the channels of two matrix3ds into a new matrix3d
+ *
+ * @param in_1           First Floating point input matrix3d
+ * @param in_2           Second Floating point input matrix3d
+ *
+ * @return               A newly allocated matrix3d with in_1|in_2 as values
+ */
+dl_matrix3d_t *dl_matrix3d_concat(dl_matrix3d_t *in_1, dl_matrix3d_t *in_2);
+
+/**
+ * @brief Concatenate the channels of four matrix3ds into a new matrix3d
+ *
+ * @param in_1           First Floating point input matrix3d
+ * @param in_2           Second Floating point input matrix3d
+ * @param in_3           Third Floating point input matrix3d
+ * @param in_4           Fourth Floating point input matrix3d
+ *
+ * @return               A newly allocated matrix3d with in_1|in_2|in_3|in_4 as values
+ */
+dl_matrix3d_t *dl_matrix3d_concat_4(dl_matrix3d_t *in_1,
+                                    dl_matrix3d_t *in_2,
+                                    dl_matrix3d_t *in_3,
+                                    dl_matrix3d_t *in_4);
+
+/**
+ * @brief Concatenate the channels of eight matrix3ds into a new matrix3d
+ *
+ * @param in_1           First Floating point input matrix3d
+ * @param in_2           Second Floating point input matrix3d
+ * @param in_3           Third Floating point input matrix3d
+ * @param in_4           Fourth Floating point input matrix3d
+ * @param in_5           Fifth Floating point input matrix3d
+ * @param in_6           Sixth Floating point input matrix3d
+ * @param in_7           Seventh Floating point input matrix3d
+ * @param in_8           Eighth Floating point input matrix3d
+ *
+ * @return               A newly allocated matrix3d with in_1|in_2|in_3|in_4|in_5|in_6|in_7|in_8 as values
+ */
+dl_matrix3d_t *dl_matrix3d_concat_8(dl_matrix3d_t *in_1,
+                                    dl_matrix3d_t *in_2,
+                                    dl_matrix3d_t *in_3,
+                                    dl_matrix3d_t *in_4,
+                                    dl_matrix3d_t *in_5,
+                                    dl_matrix3d_t *in_6,
+                                    dl_matrix3d_t *in_7,
+                                    dl_matrix3d_t *in_8);
+
+/**
+ * @brief Do a mobilefacenet block forward, dimension is (number, width, height, channel)
+ *
+ * @param in                    Input matrix3d
+ * @param pw                    Weights of the pointwise conv layer
+ * @param pw_bn_scale           The scale params of the batch_normalize layer after the pointwise conv layer
+ * @param pw_bn_offset          The offset params of the batch_normalize layer after the pointwise conv layer
+ * @param dw                    Weights of the depthwise conv layer
+ * @param dw_bn_scale           The scale params of the batch_normalize layer after the depthwise conv layer
+ * @param dw_bn_offset          The offset params of the batch_normalize layer after the depthwise conv layer
+ * @param pw_linear             Weights of the pointwise linear conv layer
+ * @param pw_linear_bn_scale    The scale params of the batch_normalize layer after the pointwise linear conv layer
+ * @param pw_linear_bn_offset   The offset params of the batch_normalize layer after the pointwise linear conv layer
+ * @param stride_x              The step length of the convolution window in x(width) direction
+ * @param stride_y              The step length of the convolution window in y(height) direction
+ * @param padding               One of VALID or SAME
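+ * @param mode                  Convolution implementation to use: 0 for the C implementation, 1 for the xtensa implementation.
+ *                              If ESP_PLATFORM is not defined, this value is not used. Default is 0
+ * @param shortcut              Shortcut flag (NOTE(review): presumably non-zero adds the block input to its output -- confirm)
+ * @return                      The result of a mobilefacenet block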
+ */
+dl_matrix3d_t *dl_matrix3d_mobilefaceblock(void *in,
+                                           dl_matrix3d_t *pw,
+                                           dl_matrix3d_t *pw_bn_scale,
+                                           dl_matrix3d_t *pw_bn_offset,
+                                           dl_matrix3d_t *dw,
+                                           dl_matrix3d_t *dw_bn_scale,
+                                           dl_matrix3d_t *dw_bn_offset,
+                                           dl_matrix3d_t *pw_linear,
+                                           dl_matrix3d_t *pw_linear_bn_scale,
+                                           dl_matrix3d_t *pw_linear_bn_offset,
+                                           int stride_x,
+                                           int stride_y,
+                                           int padding,
+                                           int mode,
+                                           int shortcut);
+
+/**
+ * @brief Do a mobilefacenet block forward with 1x1 split conv, dimension is (number, width, height, channel)
+ *
+ * @param in                    Input matrix3d
+ * @param pw_1                  Weights of the pointwise conv layer 1
+ * @param pw_2                  Weights of the pointwise conv layer 2
+ * @param pw_bn_scale           The scale params of the batch_normalize layer after the pointwise conv layer
+ * @param pw_bn_offset          The offset params of the batch_normalize layer after the pointwise conv layer
+ * @param dw                    Weights of the depthwise conv layer
+ * @param dw_bn_scale           The scale params of the batch_normalize layer after the depthwise conv layer
+ * @param dw_bn_offset          The offset params of the batch_normalize layer after the depthwise conv layer
+ * @param pw_linear_1           Weights of the pointwise linear conv layer 1
+ * @param pw_linear_2           Weights of the pointwise linear conv layer 2
+ * @param pw_linear_bn_scale    The scale params of the batch_normalize layer after the pointwise linear conv layer
+ * @param pw_linear_bn_offset   The offset params of the batch_normalize layer after the pointwise linear conv layer
+ * @param stride_x              The step length of the convolution window in x(width) direction
+ * @param stride_y              The step length of the convolution window in y(height) direction
+ * @param padding               One of VALID or SAME
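+ * @param mode                  Convolution implementation to use: 0 for the C implementation, 1 for the xtensa implementation.
+ *                              If ESP_PLATFORM is not defined, this value is not used. Default is 0
+ * @param shortcut              Shortcut flag (NOTE(review): presumably non-zero adds the block input to its output -- confirm)
+ * @return                      The result of a mobilefacenet block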
+ */
+dl_matrix3d_t *dl_matrix3d_mobilefaceblock_split(void *in,
+                                                 dl_matrix3d_t *pw_1,
+                                                 dl_matrix3d_t *pw_2,
+                                                 dl_matrix3d_t *pw_bn_scale,
+                                                 dl_matrix3d_t *pw_bn_offset,
+                                                 dl_matrix3d_t *dw,
+                                                 dl_matrix3d_t *dw_bn_scale,
+                                                 dl_matrix3d_t *dw_bn_offset,
+                                                 dl_matrix3d_t *pw_linear_1,
+                                                 dl_matrix3d_t *pw_linear_2,
+                                                 dl_matrix3d_t *pw_linear_bn_scale,
+                                                 dl_matrix3d_t *pw_linear_bn_offset,
+                                                 int stride_x,
+                                                 int stride_y,
+                                                 int padding,
+                                                 int mode,
+                                                 int shortcut);
+/**
+ * @brief Print the matrix3d items
+ *
+ * @param m              dl_matrix3d_t to be printed
+ * @param message        name of matrix
+ */
+void dl_matrix3d_print(dl_matrix3d_t *m, char *message);
+
+/**
+ * @brief Print the matrix3du items
+ *
+ * @param m              dl_matrix3du_t to be printed
+ * @param message        name of matrix
+ */
+void dl_matrix3du_print(dl_matrix3du_t *m, char *message);
--- a/tools/sdk/include/esp-face/dl_lib_matrix3dq.h
+++ b/tools/sdk/include/esp-face/dl_lib_matrix3dq.h
@ -0,0 +1,119 @@
+#pragma once
+#include "dl_lib_matrix3d.h"
+
+typedef int16_t qtp_t;
+
+/*
+ * Quantized matrix for 3d; the items are stored as qtp_t (int16_t) values.
+ * @Warning: the sequence of variables is fixed, cannot be modified, otherwise there will be errors in esp_dsp_dot_float
+ */
+typedef struct
+{
+    /******* fix start *******/
+    int w;  // Width
+    int h;  // Height
+    int c;  // Channel
+    int n;  // Number, to record filter's out_channels. input and output must be 1
+    int stride;  // NOTE(review): not documented in this header; presumably the item distance between rows -- confirm
+    int exponent;  // Exponent shared by all items (presumably real value = item * 2^exponent, as for dl_matrix2dq_t -- confirm)
+    qtp_t *item;  // Pointer to the quantized items
+    /******* fix end *******/
+} dl_matrix3dq_t;
+
+#define DL_QTP_SHIFT 15 //shift count used to derive DL_QTP_RANGE
+#define DL_QTP_RANGE ((1<<DL_QTP_SHIFT)-1) //32767, the largest value an int16_t qtp_t can hold
+//#define DL_ITMQ(m, x, y) m->itemq[(y)+(x)*m->stride]
+#define DL_QTP_EXP_NA 255 //non-applicable exponent because matrix is null
+
+#define DL_SHIFT_AUTO 32 //NOTE(review): presumably asks a routine to choose the shift itself -- confirm
+
+/*
+ * @brief Allocate a 3D matrix
+ *
+ * @param n,w,h,c   number, width, height, channel
+ * @param e         NOTE(review): not documented; presumably the exponent of the new matrix -- confirm
+ * @return 3d matrix
+ */
+dl_matrix3dq_t *dl_matrix3dq_alloc(int n, int w, int h, int c, int e);
+
+/*
+ * @brief Free a 3D matrix
+ *
+ * @param m matrix
+ */
+void dl_matrix3dq_free(dl_matrix3dq_t *m);
+
+/**
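+ * @brief Convert between floating-point (dl_matrix3d_t) and quantized (dl_matrix3dq_t) 3D matrices
+ *
+ * NOTE(review): this comment used to describe zeroing a matrix ("Sets all entries in the
+ * matrix to 0", with a parameter m), but no zeroing function is declared after it; the
+ * declarations that follow are the conversion functions.
+ *
+ * @param m     Matrix to convert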
+ */
+
+ dl_matrix3d_t *dl_matrix3d_from_matrixq(dl_matrix3dq_t *m);
+ dl_matrix3dq_t *dl_matrixq_from_matrix3d_qmf(dl_matrix3d_t *m,int exponent);
+ dl_matrix3dq_t *dl_matrixq_from_matrix3d(dl_matrix3d_t *m);
+/**
+ * @brief Copy a range of items from an existing matrix to a preallocated matrix
+ *
+ * @param dst   The destination slice matrix
+ * @param src   The source matrix to slice
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ */
+void dl_matrix3dq_slice_copy (dl_matrix3dq_t *dst, dl_matrix3dq_t *src, int x, int y, int w, int h);
+
+
+/**
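+ * @brief Do a general CNN layer pass, dimension is (number, width, height, channel)
+ *
+ * NOTE(review): this comment sits directly above dl_matrix3dq_fc, but stride_x, stride_y and
+ * padding are parameters of dl_matrix3dq_conv and dl_matrix3dq_conv_normal (declared right
+ * after it) only; dl_matrix3dq_fc takes just in, filter, bias, exponent and mode.
+ *
+ * @param in             Input image
+ * @param filter         Weights of the neurons
+ * @param bias           Bias for the CNN layer.
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param padding        One of VALID or SAME
+ * @param exponent       NOTE(review): not documented in this header; presumably the exponent of the quantized result -- confirm
+ * @param mode           Convolution implementation to use: 0 for the C implementation, 1 for the xtensa implementation.
+ *                       If ESP_PLATFORM is not defined, this value is not used.
+ * @return               The result of CNN layer.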
+ */
+dl_matrix3dq_t *dl_matrix3dq_fc (dl_matrix3dq_t *in, dl_matrix3dq_t *filter, dl_matrix3dq_t *bias, int exponent,int mode);
+
+dl_matrix3dq_t *dl_matrix3dq_conv (dl_matrix3dq_t *in, dl_matrix3dq_t *filter, dl_matrix3dq_t *bias,
+                                    int stride_x, int stride_y, int padding, int exponent, int mode);
+dl_matrix3dq_t *dl_matrix3dq_conv_normal (dl_matrix3dq_t *in, dl_matrix3dq_t *filter, dl_matrix3dq_t *bias,
+                                    int stride_x, int stride_y, int padding, int exponent, int mode);
+
+/**
+ * @brief Print the matrix3dq items
+ *
+ * @param m              dl_matrix3dq_t to be printed
+ * @param message        name of matrix
+ */
+void dl_matrix3dq_print (dl_matrix3dq_t *m, char *message);
+
+dl_matrix3dq_t *dl_matrix3dq_depthwise_conv (dl_matrix3dq_t *in, dl_matrix3dq_t *filter,
+                                    int stride_x, int stride_y, int padding, int exponent, int mode);
+
+void dl_matrix3dq_relu (dl_matrix3dq_t *m, fptp_t clip);
+
+
+
+dl_matrix3dq_t *dl_matrix3dq_global_pool (dl_matrix3dq_t *in);
+void dl_matrix3dq_batch_normalize (dl_matrix3dq_t *m, dl_matrix3dq_t *scale, dl_matrix3dq_t *offset);
+dl_matrix3dq_t *dl_matrix3dq_add (dl_matrix3dq_t *in_1, dl_matrix3dq_t *in_2, int exponent);
+void dl_matrix3dq_relu_std (dl_matrix3dq_t *m);
+dl_matrix3dq_t *dl_matrix3dq_mobilefaceblock (void *in, dl_matrix3dq_t *pw, dl_matrix3dq_t *pw_bn_scale,dl_matrix3dq_t *pw_bn_offset,
+                                        dl_matrix3dq_t *dw, dl_matrix3dq_t *dw_bn_scale,dl_matrix3dq_t *dw_bn_offset,
+                                        dl_matrix3dq_t *pw_linear, dl_matrix3dq_t *pw_linear_bn_scale,dl_matrix3dq_t *pw_linear_bn_offset,
+                                        int pw_exponent,int dw_exponent,int pw_linear_exponent,int stride_x, int stride_y, int padding, int mode, int shortcut);
+
+dl_matrix3dq_t *dl_matrix3dq_concat(dl_matrix3dq_t *in_1, dl_matrix3dq_t *in_2);
+dl_matrix3dq_t *dl_matrix3dq_concat_4(dl_matrix3dq_t *in_1, dl_matrix3dq_t *in_2, dl_matrix3dq_t *in_3, dl_matrix3dq_t *in_4);
+dl_matrix3dq_t *dl_matrix3dq_concat_8(dl_matrix3dq_t *in_1, dl_matrix3dq_t *in_2, dl_matrix3dq_t *in_3, dl_matrix3dq_t *in_4, dl_matrix3dq_t *in_5, dl_matrix3dq_t *in_6, dl_matrix3dq_t *in_7, dl_matrix3dq_t *in_8);
+
+dl_matrix3dq_t *dl_matrix3dq_mobilefaceblock_split (void *in, dl_matrix3dq_t *pw_1, dl_matrix3dq_t *pw_2, dl_matrix3dq_t *pw_bn_scale,dl_matrix3dq_t *pw_bn_offset,
+                                        dl_matrix3dq_t *dw, dl_matrix3dq_t *dw_bn_scale,dl_matrix3dq_t *dw_bn_offset,
+                                        dl_matrix3dq_t *pw_linear_1, dl_matrix3dq_t *pw_linear_2, dl_matrix3dq_t *pw_linear_bn_scale,dl_matrix3dq_t *pw_linear_bn_offset,
+                                        int pw_exponent,int dw_exponent,int pw_linear_exponent,int stride_x, int stride_y, int padding, int mode, int shortcut);
--- a/tools/sdk/include/esp-face/dl_lib_matrixq.h
+++ b/tools/sdk/include/esp-face/dl_lib_matrixq.h
@ -0,0 +1,359 @@
+#ifndef DL_LIB_MATRIXQ_H
+#define DL_LIB_MATRIXQ_H
+
+#include <stdint.h>
+#include "dl_lib_matrix.h"
+
+typedef int16_t qtp_t;
+
+//Quantized matrix. Uses fixed-point numbers and has the storage for the rows/columns inverted
+//for easy use as a multiplicand without stressing out the flash cache too much.
+typedef struct {
+    int w;  //Width of the matrix
+    int h;  //Height of the matrix
+    int stride; //Normally equals h, not w!
+    int flags;  //Matrix flags, e.g. DL_MF_FOREIGNDATA (see dl_matrixq_free)
+    int exponent; //The values in items should be multiplied by pow(2,exponent) to get the real values.
+    qtp_t *itemq;  //The quantized items (mantissas); DL_ITMQ indexes into this array
+} dl_matrix2dq_t;
+
+#define DL_QTP_SHIFT 15
+#define DL_QTP_RANGE ((1<<DL_QTP_SHIFT)-1)
+//Access item (x, y) of a quantized matrix; the result is an lvalue, so it can be assigned to.
+//Every argument and the whole expansion are parenthesized so that expression arguments
+//(e.g. `mat + 1` or `&mat`) expand correctly. Note that m is still evaluated twice.
+#define DL_ITMQ(m, x, y) ((m)->itemq[(y)+(x)*(m)->stride])
+#define DL_QTP_EXP_NA 255 //non-applicable exponent because matrix is null
+
+#define DL_SHIFT_AUTO 32
+
+/**
+ * @info About quantized matrices and shift values
+ *
+ * Grab a coffee (or tea, or hot water)  and sit down when you read this for the first 
+ * time. Quantized matrices can speed up your operations, but come with some quirks, and
+ * it's good to understand how they work before using them.
+ *
+ * The data in the quantized matrix type is stored similarly to floating-point types:
+ * when storing a real value, the value is stored as a mantissa (base number) and an
+ * exponent. The 'real' value that can be re-derived from those two numbers is something
+ * similar to mantissa*2^exponent. Up to this point, there's not that much difference from 
+ * the standard floating point implementations like e.g. IEEE-754.
+ *
+ * The difference with respect to quantized matrices is that for a quantized matrix, it is 
+ * assumed all values stored have more-or-less the same order of magnitude. This allows the
+ * matrix to only store all the mantissas, while the exponents are shared; there is only one 
+ * exponent for the entire matrix. This makes it quicker to handle matrix operations - the
+ * logic to fix the exponents only needs to happen once, while the rest can be done in simple
+ * integer arithmetic. It also nets us some memory savings - while normally a floating point
+ * number is 32-bit, storing only 16-bit mantissas as the matrix items almost halves the 
+ * memory requirements.
+ *
+ * While most of the details of handling the intricacies of the quantized matrices are done
+ * transparently by the code in dl_lib_matrixq.c, some implementation details leak out, 
+ * specifically in places where addition/subtraction/division happens.
+ *
+ * The problem is that the routines do not know what the size of the resulting operation is. For
+ * instance, when adding two matrices of numbers, the resulting numbers *could* be large enough
+ * to overflow the mantissa of the result if the exponent is the same. However, if by default we
+ * assume the mantissas need to be scaled back, we may lose precision.
+ *
+ * In order to counter this, all operations that have this issue have a ``shift`` argument. If 
+ * the argument is zero, the routine will be conservative, that is, increase the exponent of 
+ * the result to such an extent it's mathematically impossible a value in the result will exceed
+ * the maximum value that can be stored. However, when this argument is larger than zero, the
+ * algorithm will hold back on this scaling by the indicated amount of bits, preserving precision
+ * but increasing the chance of some of the calculated values not fitting in the mantissa anymore.
+ * If this happens, the value will be clipped to the largest (or, for negative values, smallest)
+ * value possible. (Neural networks usually are okay with this happening for a limited amount
+ * of matrix indices).
+ *
+ * For deciding on these shift values, it is recommended to start with a shift value of one, then
+ * use dl_matrixq_check_sanity on the result. If this indicates clipping, lower the shift value. 
+ * If it indicates bits are under-used, increase it. Note that for adding and subtraction, only
+ * shift values of 0 or 1 make sense; these routines will error out if you try to do something
+ * else.
+ *
+ * For neural networks and other noise-tolerant applications, note that even when 
+ * dl_matrixq_check_sanity does not indicate any problems, twiddling with the shift value may lead
+ * to slightly improved precision. Feel free to experiment.
+ **/
+
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_alloc(int w, int h);
+
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix
+ *
+ * @param m     Floating-point matrix to convert
+ * @param out   Quantized matrix to re-use. If NULL, allocate a new one.
+ * @return The quantized version of the floating-point matrix
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq_t *out);
+
+
+/**
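+ * TODO: DESCRIBE THIS FUNCTION
+ * NOTE(review): judging by the name and parameters, this presumably converts a floating-point
+ * matrix to a quantized one using a fixed Qm.f format (m_bit integer bits, f_bit fractional
+ * bits) instead of an automatically chosen exponent -- confirm against the implementation.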
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d_by_qmf(const dl_matrix2d_t *m, dl_matrix2dq_t *out, int m_bit, int f_bit);
+
+
+/**
+ * @brief Convert a quantized matrix to a floating-point one.
+ *
+ * @param m     Quantized matrix to convert
+ * @param out   Floating-point matrix to re-use. If NULL, allocate a new one.
+ * @return The floating-point version of the quantized matrix
+ **/
+dl_matrix2d_t *dl_matrix2d_from_matrixq(const dl_matrix2dq_t *m, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Free a quantized matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->itemq space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrixq_free(dl_matrix2dq_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrixq_zero(dl_matrix2dq_t *m);
+
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b, Result is a fixed-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices: res=a.b, Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product.
+ *
+ * Result is a fixed-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot calls; this function can be
+ * much slower than dl_matrixq_dot .
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product. 
+ *
+ * Result is a floating-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot_matrix_out calls; this function can be
+ * much slower than dl_matrixq_dot_matrix_out.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of a floating point and a quantized matrix. Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand; float matrix
+ * @param b     Second multiplicand; quantized matrix
+ * @param res   Dotproduct data; float matrix. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_matrixq_dot(const dl_matrix2d_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+
+/**
+ * @brief Print the contents of a quantized matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrixq(const dl_matrix2dq_t *a);
+
+
+/**
+ * @brief Add a pair of quantized matrices item-by-item: res=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Added data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_add(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * When using this, the data of the new matrix is not allocated/copied but it re-uses a pointer
+ * to the existing data. Changing the data in the resulting matrix, as a result, will also change
+ * the data in the existing matrix that has been sliced.
+ *
+ * @Warning In contrast to the floating point equivalent of this function, the fixed-point version
+ * of this has the issue that as soon as the output exponent of one of the slices changes, the data
+ * in the sliced matrix gets corrupted (because the exponent of that matrix is still the same.) If you
+ * use this function, either treat the slices as read-only, or assume the sliced matrix contains
+ * garbage after modifying the data in one of the slices.
+ *
+ * @param src   The matrix to slice
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_slice(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief Select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @Warning The results are flattened in row-major order.
+ *   
+ * @param src   The matrix to select the items from
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting flatten matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_flatten(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief Subtract a quantized matrix from another, item-by-item: res=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Subtracted data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_sub(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Multiply a pair of quantized matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that matrix.
+ */
+void dl_matrixq_mul(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res);
+
+/**
+ * @brief Divide a pair of quantized matrices item-by-item: res=a/b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value; see the notes on shift values near the top of this file.
+ */
+void dl_matrixq_div(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *out, int shift);
+
+/**
+ * @brief Check if two quantized matrices have the same shape, that is, the same amount of 
+ * rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrixq_same_shape(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Concatenate the rows of two quantized matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated quantized matrix with as values a|b
+ */
+dl_matrix2dq_t *dl_matrixq_concat(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Add a constant to every item of the quantized matrix
+ *
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ * @param shift Shift value; see the notes on shift values near the top of this file.
+ */
+void dl_matrixq_add_const(dl_matrix2dq_t *subj, const fptp_t add, int shift);
+
+/**
+ * @brief Check the sanity of a quantized matrix
+ *
+ * Due to the nature of quantized matrices, depending on the calculations a quantized
+ * matrix is the result of and the shift values chosen in those calculations, a quantized
+ * matrix may have an exponent and mantissas that lead to a loss of precision, either because
+ * most significant mantissa bits are unused, or because a fair amount of mantissas are 
+ * clipped. This function checks if this is the case and will report a message to stdout
+ * if significant loss of precision is detected.
+ *
+ * @param m     The quantized matrix to check
+ * @param name  A string to be displayed in the message if the sanity check fails
+ * @return True if matrix is sane, false otherwise
+ **/
+
+int dl_matrixq_check_sanity(dl_matrix2dq_t *m, const char *name);
+
+/**
+ * @brief re-adjust the exponent of the matrix to fit the mantissa better
+ *
+ * This function will shift up all the data in the mantissas so there are no
+ * most-significant bits that are unused in all mantissas. It will also adjust
+ * the exponent to keep the actual values in the matrix the same.
+ *
+ * Some operations done on a matrix, especially operations that re-use the
+ * result of earlier operations done in the same way, can lead to the loss of
+ * data because the exponent of the quantized matrix is never re-adjusted. You
+ * can do that explicitly by calling this function.
+ *
+ * @param m     The matrix to re-adjust
+**/
+void dl_matrixq_readjust_exp(dl_matrix2dq_t *m);
+
+
+
+/**
+ * @brief Get the floating-point value of a specific item from the quantized matrix
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @return Value in that position
+ */
+fptp_t dl_matrixq_get(const dl_matrix2dq_t *m, const int x, const int y);
+
+/**
+ * @brief Set a specific item in the quantized matrix to the given 
+ * floating-point value
+ *
+ * @warning If the given value is more than the exponent in the quantized matrix
+ * allows for, all mantissas in the matrix will be shifted down to make the value
+ * 'fit'. If, however, the exponent is such that the value would result in a
+ * quantized mantissa of 0, nothing is done.
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @param val   Value to write to that position
+ */
+void dl_matrixq_set(dl_matrix2dq_t *m, const int x, const int y, fptp_t val);
+
+#endif
--- a/tools/sdk/include/esp-face/fd_forward.h
+++ b/tools/sdk/include/esp-face/fd_forward.h
@ -0,0 +1,64 @@
+/*
+  * ESPRESSIF MIT License
+  *
+  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
+  *
+  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
+  * it is free of charge, to any person obtaining a copy of this software and associated
+  * documentation files (the "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
+  * to do so, subject to the following conditions:
+  *
+  * The above copyright notice and this permission notice shall be included in all copies or
+  * substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  */
+#pragma once
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+#include "image_util.h"
+#include "dl_lib.h"
+#include "mtmn.h"
+
+    /**
+     * @brief Build the default MTMN configuration.
+     *
+     * The structure is zero-initialized before the defaults are assigned, so a
+     * field that receives no explicit default below
+     * (p_threshold.candidate_number) is 0 instead of holding indeterminate
+     * stack contents when the structure is returned by value.
+     *
+     * @return mtmn_config_t    Configuration filled with the default values
+     */
+    static inline mtmn_config_t mtmn_init_config()
+    {
+        mtmn_config_t mtmn_config = {0};
+        mtmn_config.min_face = 80;
+        mtmn_config.pyramid = 0.7;
+        mtmn_config.p_threshold.score = 0.6;
+        mtmn_config.p_threshold.nms = 0.7;
+        mtmn_config.r_threshold.score = 0.6;
+        mtmn_config.r_threshold.nms = 0.7;
+        mtmn_config.r_threshold.candidate_number = 4;
+        mtmn_config.o_threshold.score = 0.6;
+        mtmn_config.o_threshold.nms = 0.4;
+        mtmn_config.o_threshold.candidate_number = 1;
+
+        return mtmn_config;
+    }
+
+    /**
+     * @brief Do MTMN face detection, return box and landmark information.
+     * 
+     * @param image_matrix      Image matrix, rgb888 format
+     * @param config            Configuration of MTMN i.e. score threshold, nms threshold, candidate number threshold, pyramid, min face size
+     * @return box_array_t*     A list of boxes and score.
+     */
+    box_array_t *face_detect(dl_matrix3du_t *image_matrix,
+                             mtmn_config_t *config);
+
+#if __cplusplus
+}
+#endif
--- a/tools/sdk/include/esp-face/fr_flash.h
+++ b/tools/sdk/include/esp-face/fr_flash.h
@ -0,0 +1,45 @@
+#pragma once
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+#include "fr_forward.h"
+
+#define FR_FLASH_TYPE   32
+#define FR_FLASH_SUBTYPE   32
+#define FR_FLASH_PARTITION_NAME "fr"
+#define FR_FLASH_INFO_FLAG 12138
+
+	 /**
+     * @brief Produce face id according to the input aligned face, and save it to dest_id and flash.
+     * 
+     * @param l                     Face id list
+     * @param aligned_face          An aligned face
+     * @return -2                   Flash partition not found
+     * @return 0                    Enrollment finish
+     * @return >=1                  The left piece of aligned faces should be input
+     */
+    int8_t enroll_face_id_to_flash(face_id_list *l,
+            dl_matrix3du_t *aligned_face);
+
+    /**
+     * @brief Read the enrolled face IDs from the flash.
+     * 
+     * @param l                     Face id list
+     * @return int8_t               The number of IDs remaining in flash
+     */
+    int8_t read_face_id_from_flash(face_id_list *l);
+
+    /**
+     * @brief Delete the enrolled face IDs in the flash.
+     * 
+     * @param l                     Face id list
+     * @return int8_t               The number of IDs remaining in flash
+     */
+    int8_t delete_face_id_in_flash(face_id_list *l);
+
+#if __cplusplus
+}
+#endif
--- a/tools/sdk/include/esp-face/fr_forward.h
+++ b/tools/sdk/include/esp-face/fr_forward.h
@ -0,0 +1,116 @@
+#pragma once
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+#include "image_util.h"
+#include "dl_lib.h"
+#include "frmn.h"
+
+#define FACE_WIDTH 56
+#define FACE_HEIGHT 56
+#define FACE_ID_SIZE 512
+#define FACE_REC_THRESHOLD 0.5
+
+#define LEFT_EYE_X 0
+#define LEFT_EYE_Y 1
+#define RIGHT_EYE_X 6
+#define RIGHT_EYE_Y 7
+#define NOSE_X 4
+#define NOSE_Y 5
+
+#define EYE_DIST_SET 16.5f
+#define NOSE_EYE_RATIO_THRES_MIN 0.49f
+#define NOSE_EYE_RATIO_THRES_MAX 2.04f
+
+#define FLASH_INFO_FLAG 12138
+#define FLASH_PARTITION_NAME "fr"
+
+/**
+ * @brief      Face id list: bookkeeping for the enrolled face id vectors
+ */
+    typedef struct
+    {
+        uint8_t head;               /*!< head index of the id list */
+        uint8_t tail;               /*!< tail index of the id list */
+        uint8_t count;              /*!< number of enrolled ids */
+        uint8_t size;               /*!< max len of id list */
+        uint8_t confirm_times;      /*!< images needed for one enrolling */
+        dl_matrix3d_t **id_list;    /*!< stores face id vectors */
+    } face_id_list;
+
+
+    /**
+     * @brief Initialize face id list
+     * 
+     * @param l                 Face id list
+     * @param size              Size of list, one list contains one vector
+     * @param confirm_times     Enroll times for one id
+     * @return None
+     */
+    void face_id_init(face_id_list *l, uint8_t size, uint8_t confirm_times);
+
+    /**
+     * @brief Alloc memory for aligned face.
+     * 
+     * @return dl_matrix3du_t*          Size: 1xFACE_WIDTHxFACE_HEIGHTx3
+     */
+    dl_matrix3du_t *aligned_face_alloc();
+
+    /**
+     * @brief Align detected face to average face according to landmark
+     * 
+     * @param onet_boxes        Output of MTMN with box and landmark
+     * @param src               Image matrix, rgb888 format
+     * @param dest              Output image
+     * @return ESP_OK           Input face is good for recognition
+     * @return ESP_FAIL         Input face is not good for recognition
+     */
+    int8_t align_face(box_array_t *onet_boxes,
+                      dl_matrix3du_t *src,
+                      dl_matrix3du_t *dest);
+
+    /**
+     * @brief Add src_id to dest_id
+     * 
+     * @param dest_id 
+     * @param src_id 
+     */
+    void add_face_id(dl_matrix3d_t *dest_id,
+                     dl_matrix3d_t *src_id);
+
+    /**
+     * @brief Match face with the id_list, and return matched_id.
+     * 
+     * @param l                     An ID list
+     * @param algined_face          An aligned face
+     * @return int8_t               Matched face id
+     */
+    int8_t recognize_face(face_id_list *l,
+                            dl_matrix3du_t *algined_face);
+
+    /**
+     * @brief Produce face id according to the input aligned face, and save it to dest_id.
+     * 
+     * @param l                     face id list
+     * @param aligned_face          An aligned face
+     * @note                        There is no enroll_confirm_times parameter; presumably the confirm times are taken from l->confirm_times (TODO confirm)
+     * @return -1                   Wrong input enroll_confirm_times
+     * @return 0                    Enrollment finish
+     * @return >=1                  The left piece of aligned faces should be input
+     */
+    int8_t enroll_face(face_id_list *l, 
+                    dl_matrix3du_t *aligned_face);
+
+    /**
+     * @brief Delete a face id from the face id list.
+     * 
+     * @param l                     face id list
+     * @return uint8_t              left count
+     */
+    uint8_t delete_face(face_id_list *l);
+#if __cplusplus
+}
+#endif
--- a/tools/sdk/include/esp-face/frmn.h
+++ b/tools/sdk/include/esp-face/frmn.h
@ -0,0 +1,28 @@
+#pragma once
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+#include "dl_lib.h"
+
+    /**
+     * @brief 
+     * 
+     * @param in 
+     * @return dl_matrix3d_t* 
+     */
+    dl_matrix3d_t *frmn(dl_matrix3d_t *in);
+
+    /**
+     * @brief 
+     * 
+     * @param in 
+     * @return dl_matrix3dq_t* 
+     */
+    dl_matrix3dq_t *frmn_q(dl_matrix3dq_t *in, dl_conv_mode mode);
+
+#if __cplusplus
+}
+#endif
--- a/tools/sdk/include/esp-face/image_util.h
+++ b/tools/sdk/include/esp-face/image_util.h
@ -0,0 +1,275 @@
+/*
+  * ESPRESSIF MIT License
+  *
+  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
+  *
+  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
+  * it is free of charge, to any person obtaining a copy of this software and associated
+  * documentation files (the "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
+  * to do so, subject to the following conditions:
+  *
+  * The above copyright notice and this permission notice shall be included in all copies or
+  * substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  */
+#pragma once
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <stdint.h>
+#include "mtmn.h"
+
+#define MAX_VALID_COUNT_PER_IMAGE (30)
+
+#define DL_IMAGE_MIN(A, B) ((A) < (B) ? (A) : (B))
+#define DL_IMAGE_MAX(A, B) ((A) < (B) ? (B) : (A))
+
+#define IMAGE_WIDTH 320
+#define IMAGE_HEIGHT 240
+
+#define RGB565_MASK_RED 0xF800
+#define RGB565_MASK_GREEN 0x07E0
+#define RGB565_MASK_BLUE 0x001F
+
+    typedef struct /* Landmark coordinates belonging to one box */
+    {
+        fptp_t landmark_p[10]; /*!< five (x, y) pairs: even indices are x values, odd indices are y values */
+    } landmark_t;
+
+    typedef struct /* Box described by four coordinates */
+    {
+        fptp_t box_p[4]; /*!< [0]/[2] are the x extents, [1]/[3] the y extents; width = [2]-[0]+1, height = [3]-[1]+1 */
+    } box_t;
+
+    typedef struct tag_box_list /* Array of boxes together with its length */
+    {
+        box_t *box; /*!< array of boxes, indexed 0..len-1 */
+        landmark_t *landmark; /*!< landmarks; presumably one entry per box - TODO confirm */
+        int len; /*!< number of entries in box */
+    } box_array_t;
+
+    typedef struct tag_image_box /* Node of a singly linked list of candidate boxes */
+    {
+        struct tag_image_box *next; /*!< next node; a NULL pointer ends the list */
+        fptp_t score; /*!< score of this box */
+        box_t box; /*!< box coordinates */
+        box_t offset; /*!< per-coordinate offsets; image_calibrate_by_offset() scales them by the box size and adds them to box */
+        landmark_t landmark; /*!< landmarks; image_landmark_calibrate() rescales them by the box size and shifts them by the box origin */
+    } image_box_t;
+
+    typedef struct tag_image_list /* Linked list of image_box_t nodes */
+    {
+        image_box_t *head; /*!< first node; the inline helpers in this header walk the list from here through next */
+        image_box_t *origin_head; /*!< NOTE(review): purpose not visible in this header - presumably the originally allocated head; confirm */
+        int len; /*!< presumably the number of nodes - TODO confirm */
+    } image_list_t;
+
+    /**
+     * @brief Measure a box, counting both end coordinates.
+     *
+     * @param box   Box to measure
+     * @param w     Output: box_p[2] - box_p[0] + 1
+     * @param h     Output: box_p[3] - box_p[1] + 1
+     */
+    static inline void image_get_width_and_height(box_t *box, float *w, float *h)
+    {
+        const fptp_t *coord = box->box_p;
+
+        *w = coord[2] - coord[0] + 1;
+        *h = coord[3] - coord[1] + 1;
+    }
+
+    /**
+     * @brief Compute the area of a box as width * height.
+     *
+     * @param box   Box to measure
+     * @param area  Output: product of the width and height reported by
+     *              image_get_width_and_height()
+     */
+    static inline void image_get_area(box_t *box, float *area)
+    {
+        float box_w;
+        float box_h;
+
+        image_get_width_and_height(box, &box_w, &box_h);
+        *area = box_w * box_h;
+    }
+
+    /**
+     * @brief Apply the offsets of every box in the list to its coordinates.
+     *
+     * For each node the offsets are scaled by the box size measured before any
+     * adjustment (x offsets by the width, y offsets by the height) and added
+     * to the box coordinates. The first two coordinates are clamped to be
+     * >= 0. If a far coordinate ends up beyond IMAGE_WIDTH / IMAGE_HEIGHT, the
+     * box is moved back so that it ends on the last column / row.
+     *
+     * @param image_list    List whose boxes are updated in place
+     */
+    static inline void image_calibrate_by_offset(image_list_t *image_list)
+    {
+        for (image_box_t *head = image_list->head; head; head = head->next)
+        {
+            float w, h;
+            image_get_width_and_height(&(head->box), &w, &h);
+            head->box.box_p[0] = DL_IMAGE_MAX(0, head->box.box_p[0] + head->offset.box_p[0] * w);
+            // y offsets are relative to the box height, like box_p[3] below
+            head->box.box_p[1] = DL_IMAGE_MAX(0, head->box.box_p[1] + head->offset.box_p[1] * h);
+            head->box.box_p[2] += head->offset.box_p[2] * w;
+            if (head->box.box_p[2] > IMAGE_WIDTH)
+            {
+                head->box.box_p[2] = IMAGE_WIDTH - 1;
+                head->box.box_p[0] = IMAGE_WIDTH - w;
+            }
+            head->box.box_p[3] += head->offset.box_p[3] * h;
+            if (head->box.box_p[3] > IMAGE_HEIGHT)
+            {
+                head->box.box_p[3] = IMAGE_HEIGHT - 1;
+                head->box.box_p[1] = IMAGE_HEIGHT - h;
+            }
+        }
+    }
+
+    /**
+     * @brief Rescale the landmarks of every box in the list by the box size.
+     *
+     * Each of the five (x, y) landmark pairs is replaced by
+     * box_p[0] + x * width and box_p[1] + y * height of its own box.
+     *
+     * @param image_list    List whose landmarks are updated in place
+     */
+    static inline void image_landmark_calibrate(image_list_t *image_list)
+    {
+        image_box_t *node = image_list->head;
+
+        while (node)
+        {
+            float w, h;
+            image_get_width_and_height(&(node->box), &w, &h);
+
+            fptp_t *point = node->landmark.landmark_p;
+            const fptp_t origin_x = node->box.box_p[0];
+            const fptp_t origin_y = node->box.box_p[1];
+
+            // even index: x value, odd index: y value
+            for (int i = 0; i < 10; i += 2)
+            {
+                point[i] = origin_x + point[i] * w;
+                point[i + 1] = origin_y + point[i + 1] * h;
+            }
+
+            node = node->next;
+        }
+    }
+
+    /**
+     * @brief Turn every box of the array into a square.
+     *
+     * The side of the square is the larger of the box width and height. The
+     * first two coordinates are shifted by half the size difference and
+     * clamped to be >= 0. If a far coordinate ends up beyond width / height,
+     * the box is moved back so that it ends on the last column / row.
+     *
+     * @param boxes     Boxes to update in place
+     * @param width     Limit applied to the x coordinates
+     * @param height    Limit applied to the y coordinates
+     */
+    static inline void image_rect2sqr(box_array_t *boxes, int width, int height)
+    {
+        for (int i = 0; i < boxes->len; i++)
+        {
+            fptp_t *p = boxes->box[i].box_p;
+            float w, h;
+            image_get_width_and_height(&(boxes->box[i]), &w, &h);
+            const float side = DL_IMAGE_MAX(w, h);
+
+            p[0] = DL_IMAGE_MAX(0, p[0] + 0.5 * (w - side));
+            p[1] = DL_IMAGE_MAX(0, p[1] + 0.5 * (h - side));
+
+            p[2] = p[0] + side - 1;
+            if (p[2] > width)
+            {
+                p[2] = width - 1;
+                p[0] = width - side;
+            }
+
+            p[3] = p[1] + side - 1;
+            if (p[3] > height)
+            {
+                p[3] = height - 1;
+                p[1] = height - side;
+            }
+        }
+    }
+
+    /**
+     * @brief Expand one RGB565 pixel into three 8-bit channels.
+     *
+     * Every channel is moved to the top bits of its byte; the low bits stay 0.
+     *
+     * @param in    RGB565 pixel
+     * @param dst   Output, 3 bytes: dst[0] = blue, dst[1] = green, dst[2] = red
+     */
+    static inline void rgb565_to_888(uint16_t in, uint8_t *dst)
+    {
+        const uint16_t blue_bits = in & RGB565_MASK_BLUE;
+        const uint16_t green_bits = in & RGB565_MASK_GREEN;
+        const uint16_t red_bits = in & RGB565_MASK_RED;
+
+        dst[0] = blue_bits << 3;
+        dst[1] = green_bits >> 3;
+        dst[2] = red_bits >> 8;
+    }
+
+    /**
+     * @brief Pack three 8-bit channels into one RGB565 pixel.
+     *
+     * The low bits of every channel are dropped (3 for red and blue, 2 for green).
+     *
+     * @param in    Output: receives the packed RGB565 pixel
+     * @param r     Red channel
+     * @param g     Green channel
+     * @param b     Blue channel
+     */
+    static inline void rgb888_to_565(uint16_t *in, uint8_t r, uint8_t g, uint8_t b)
+    {
+        const uint16_t red_field = (r >> 3) << 11;
+        const uint16_t green_field = (g >> 2) << 5;
+        const uint16_t blue_field = b >> 3;
+
+        *in = red_field | green_field | blue_field;
+    }
+
+    /**
+     * @brief 
+     * 
+     * @param score 
+     * @param offset 
+     * @param width 
+     * @param height 
+     * @param p_net_size
+     * @param score_threshold 
+     * @param scale 
+     * @return image_list_t* 
+     */
+    image_list_t *image_get_valid_boxes(fptp_t *score,
+                                        fptp_t *offset,
+                                        int width,
+                                        int height,
+                                        int p_net_size,
+                                        fptp_t score_threshold,
+                                        fptp_t scale);
+    /**
+     * @brief 
+     * 
+     * @param image_sorted_list 
+     * @param insert_list 
+     */
+    void image_sort_insert_by_score(image_list_t *image_sorted_list, const image_list_t *insert_list);
+
+    /**
+     * @brief 
+     * 
+     * @param image_list 
+     * @param nms_threshold 
+     * @param same_area 
+     */
+    void image_nms_process(image_list_t *image_list, fptp_t nms_threshold, int same_area);
+
+    /**
+     * @brief 
+     * 
+     * @param dst_image 
+     * @param src_image 
+     * @param dst_w 
+     * @param dst_h 
+     * @param dst_c 
+     * @param src_w 
+     * @param src_h 
+     */
+    void image_resize_linear(uint8_t *dst_image, uint8_t *src_image, int dst_w, int dst_h, int dst_c, int src_w, int src_h);
+
+    /**
+     * @brief 
+     * 
+     * @param corp_image 
+     * @param src_image 
+     * @param rotate_angle 
+     * @param ratio 
+     * @param center 
+     */
+    void image_cropper(dl_matrix3du_t *corp_image, dl_matrix3du_t *src_image, float rotate_angle, float ratio, float *center);
+
+    /**
+     * @brief 
+     * 
+     * @param m 
+     * @param bmp 
+     * @param count 
+     */
+    void transform_input_image(uint8_t *m, uint16_t *bmp, int count);
+
+    /**
+     * @brief 
+     * 
+     * @param bmp 
+     * @param m 
+     * @param count 
+     */
+    void transform_output_image(uint16_t *bmp, uint8_t *m, int count);
+
+    /**
+     * @brief 
+     * 
+     * @param buf 
+     * @param boxes 
+     * @param width 
+     */
+    void draw_rectangle_rgb565(uint16_t *buf, box_array_t *boxes, int width);
+
+    /**
+     * @brief 
+     * 
+     * @param buf 
+     * @param boxes 
+     * @param width 
+     */
+    void draw_rectangle_rgb888(uint8_t *buf, box_array_t *boxes, int width);
+#ifdef __cplusplus
+}
+#endif
--- a/tools/sdk/include/esp-face/mtmn.h
+++ b/tools/sdk/include/esp-face/mtmn.h
@ -0,0 +1,99 @@
+/*
+  * ESPRESSIF MIT License
+  *
+  * Copyright (c) 2018 <ESPRESSIF SYSTEMS (SHANGHAI) PTE LTD>
+  *
+  * Permission is hereby granted for use on ESPRESSIF SYSTEMS products only, in which case,
+  * it is free of charge, to any person obtaining a copy of this software and associated
+  * documentation files (the "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+  * and/or sell copies of the Software, and to permit persons to whom the Software is furnished
+  * to do so, subject to the following conditions:
+  *
+  * The above copyright notice and this permission notice shall be included in all copies or
+  * substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+  * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *
+  */
+#pragma once
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "dl_lib.h"
+
+    typedef enum
+    {
+        PNET = 0, /// P-Net
+        RNET = 1, /// R-Net
+        ONET = 2, /// O-Net
+    } net_type_en;
+
+    typedef struct
+    {
+        float score;          /// score threshold for filter candidates by score
+        float nms;            /// nms threshold for nms process
+        int candidate_number; /// candidate number limitation for each net
+    } threshold_config_t;
+
+    typedef struct
+    {
+        net_type_en net_type;         /// net type
+        char *file_name;              /// net name
+        int w;                        /// net width
+        int h;                        /// net height
+        threshold_config_t threshold; /// threshold of net
+    } net_config_t;
+
+    typedef struct
+    {
+        float min_face;                 /// the minimum size of face can be detected
+        float pyramid;                  /// the pyramid scale
+        threshold_config_t p_threshold; /// score, nms and candidate threshold of pnet
+        threshold_config_t r_threshold; /// score, nms and candidate threshold of rnet
+        threshold_config_t o_threshold; /// score, nms and candidate threshold of onet
+    } mtmn_config_t;
+
+    typedef struct /// result type returned by pnet(), rnet_with_score_verify() and onet_with_score_verify()
+    {
+        dl_matrix3d_t *category; /// presumably the scores - TODO confirm
+        dl_matrix3d_t *offset; /// presumably the box offsets - TODO confirm
+        dl_matrix3d_t *landmark; /// presumably the landmarks (documented only for onet) - TODO confirm
+    } mtmn_net_t;
+
+    /**
+     * @brief Forward the pnet process, coarse detection
+     *
+     * @param in        Image matrix, rgb888 format, size is 320x240
+     * @return          Scores for every pixel, and box offset with respect.
+     */
+    mtmn_net_t *pnet(dl_matrix3du_t *in);
+
+    /**
+     * @brief Forward the rnet process, fine determine the boxes from pnet
+     *
+     * @param in        Image matrix, rgb888 format
+     * @param threshold Score threshold to detect human face
+     * @return          Scores for every box, and box offset with respect.
+     */
+    mtmn_net_t *rnet_with_score_verify(dl_matrix3du_t *in, float threshold);
+
+    /**
+     * @brief Forward the onet process, fine determine the boxes from rnet
+     *
+     * @param in        Image matrix, rgb888 format
+     * @param threshold Score threshold to detect human face
+     * @return          Scores for every box, box offset, and landmark with respect.
+     */
+    mtmn_net_t *onet_with_score_verify(dl_matrix3du_t *in, float threshold);
+
+#ifdef __cplusplus
+}
+#endif
--- a/tools/sdk/include/esp32/esp_attr.h
+++ b/tools/sdk/include/esp32/esp_attr.h
@ -14,6 +14,8 @@
 #ifndef __ESP_ATTR_H__
 #define __ESP_ATTR_H__

+#include "sdkconfig.h"
+
 #define ROMFN_ATTR

 //Normally, the linker script will put all code and rodata in flash,
--- a/tools/sdk/include/esp_https_server/esp_https_server.h
+++ b/tools/sdk/include/esp_https_server/esp_https_server.h
@ -19,6 +19,10 @@
 #include "esp_err.h"
 #include "esp_http_server.h"

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef enum {
    HTTPD_SSL_TRANSPORT_SECURE,      // SSL Enabled
    HTTPD_SSL_TRANSPORT_INSECURE     // SSL disabled
@ -92,6 +96,10 @@ typedef struct httpd_ssl_config httpd_ssl_config_t;
        .open_fn = NULL,                          \
        .close_fn = NULL,                         \
    },                                            \
+    .cacert_pem = NULL,                           \
+    .cacert_len = 0,                              \
+    .prvtkey_pem = NULL,                          \
+    .prvtkey_len = 0,                             \
    .transport_mode = HTTPD_SSL_TRANSPORT_SECURE, \
    .port_secure = 443,                           \
    .port_insecure = 80,                          \
@ -114,4 +122,8 @@ esp_err_t httpd_ssl_start(httpd_handle_t *handle, httpd_ssl_config_t *config);
 */
 void httpd_ssl_stop(httpd_handle_t handle);

+#ifdef __cplusplus
+}
+#endif
+
 #endif // _ESP_HTTPS_SERVER_H_
--- a/tools/sdk/include/fb_gfx/fb_gfx.h
+++ b/tools/sdk/include/fb_gfx/fb_gfx.h
@ -0,0 +1,44 @@
+// Copyright 2015-2016 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef _FB_GFX_H_
+#define _FB_GFX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    typedef enum {
+        FB_RGB888, FB_BGR888, FB_RGB565, FB_BGR565
+    } fb_format_t;
+
+    typedef struct { /* Frame buffer descriptor passed to the fb_gfx_* functions */
+            int width; /* NOTE(review): unit not stated in this header - presumably pixels */
+            int height; /* NOTE(review): unit not stated in this header - presumably pixels */
+            int bytes_per_pixel;
+            fb_format_t format; /* one of the fb_format_t values */
+            uint8_t * data; /* NOTE(review): uint8_t needs <stdint.h>, which this header does not include itself */
+    } fb_data_t;
+
+    void     fb_gfx_fillRect     (fb_data_t *fb, int32_t x, int32_t y, int32_t w, int32_t h, uint32_t color);
+    void     fb_gfx_drawFastHLine(fb_data_t *fb, int32_t x, int32_t y, int32_t w, uint32_t color);
+    void     fb_gfx_drawFastVLine(fb_data_t *fb, int32_t x, int32_t y, int32_t h, uint32_t color);
+    uint8_t  fb_gfx_putc         (fb_data_t *fb, int32_t x, int32_t y, uint32_t color, unsigned char c);
+    uint32_t fb_gfx_print        (fb_data_t *fb, int32_t x, int32_t y, uint32_t color, const char * str);
+    uint32_t fb_gfx_printf       (fb_data_t *fb, int32_t x, int32_t y, uint32_t color, const char *format, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FB_GFX_H_ */
--- a/tools/sdk/include/lwip/arch/sys_arch.h
+++ b/tools/sdk/include/lwip/arch/sys_arch.h
@ -62,7 +62,25 @@ typedef struct sys_mbox_s {
 #endif

 #define sys_mbox_valid( x ) ( ( ( *x ) == NULL) ? pdFALSE : pdTRUE )
-#define sys_mbox_set_invalid( x ) ( ( *x ) = NULL )
+
+/* Define the sys_mbox_set_invalid() to empty to support lock-free mbox in ESP LWIP.
+ * 
+ * The basic idea about the lock-free mbox is that the mbox should always be valid unless
+ * no socket APIs are using the socket and the socket is closed. ESP LWIP achieves this by
+ * following two changes to official LWIP:
+ * 1. Postpone the deallocation of mbox to netconn_free(), in other words, free the mbox when
+ *    no one is using the socket.
+ * 2. Define the sys_mbox_set_invalid() to empty if the mbox is not actually freed.
+
+ * The second change is necessary. Consider a common scenario: the application task calls 
+ * recv() to receive packets from the socket, the sys_mbox_valid() returns true. Because there
+ * is no lock for the mbox, the LWIP CORE can call sys_mbox_set_invalid() to invalidate the mbox at
+ * any time, and a thread-safety issue may occur.
+ *
+ * However, if the sys_mbox_set_invalid() is not called after sys_mbox_free(), e.g. in netconn_alloc(),
+ * we need to initialize the mbox to invalid explicitly since sys_mbox_set_invalid() now is empty.
+ */
+#define sys_mbox_set_invalid( x ) 

 #define sys_sem_valid( x ) ( ( ( *x ) == NULL) ? pdFALSE : pdTRUE )
 #define sys_sem_set_invalid( x ) ( ( *x ) = NULL )
--- a/tools/sdk/include/lwip/lwip/api.h
+++ b/tools/sdk/include/lwip/lwip/api.h
@ -233,15 +233,6 @@ struct netconn {
      by the application thread */
  sys_mbox_t acceptmbox;
 #endif /* LWIP_TCP */
-
-#if ESP_THREAD_SAFE
-  /** point to the same mbox as recvmbox */
-  sys_mbox_t recvmbox_ref;
-#if LWIP_TCP
-  /** point to the same mbox as acceptmbox */
-  sys_mbox_t acceptmbox_ref;
-#endif
-#endif
  /** only used for socket layer */
 #if LWIP_SOCKET
  int socket;
--- a/tools/sdk/include/lwip/sys_arch.h
+++ b/tools/sdk/include/lwip/sys_arch.h
@ -62,7 +62,25 @@ typedef struct sys_mbox_s {
 #endif

 #define sys_mbox_valid( x ) ( ( ( *x ) == NULL) ? pdFALSE : pdTRUE )
-#define sys_mbox_set_invalid( x ) ( ( *x ) = NULL )
+
+/* Define the sys_mbox_set_invalid() to empty to support lock-free mbox in ESP LWIP.
+ * 
+ * The basic idea about the lock-free mbox is that the mbox should always be valid unless
+ * no socket APIs are using the socket and the socket is closed. ESP LWIP achieves this by
+ * following two changes to official LWIP:
+ * 1. Postpone the deallocation of mbox to netconn_free(), in other words, free the mbox when
+ *    no one is using the socket.
+ * 2. Define the sys_mbox_set_invalid() to empty if the mbox is not actually freed.
+
+ * The second change is necessary. Consider a common scenario: the application task calls 
+ * recv() to receive packets from the socket, the sys_mbox_valid() returns true. Because there
+ * is no lock for the mbox, the LWIP CORE can call sys_mbox_set_invalid() to invalidate the mbox at
+ * any time, and a thread-safety issue may occur.
+ *
+ * However, if the sys_mbox_set_invalid() is not called after sys_mbox_free(), e.g. in netconn_alloc(),
+ * we need to initialize the mbox to invalid explicitly since sys_mbox_set_invalid() now is empty.
+ */
+#define sys_mbox_set_invalid( x ) 

 #define sys_sem_valid( x ) ( ( ( *x ) == NULL) ? pdFALSE : pdTRUE )
 #define sys_sem_set_invalid( x ) ( ( *x ) = NULL )