add optest, less, matmul, reduce (#673)

8d5752bd · BUG1989 · GitHub · fb6affc0 · 8d5752bd · 8d5752bd
22 changed file
--- a/.github/workflows/test-coverage.yml
+++ b/.github/workflows/test-coverage.yml
@@ -14,7 +14,7 @@ jobs:
    - name: lcov
      run: sudo apt-get install lcov
    - name: configure
-      run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DTENGINE_BUILD_TESTS=ON -DTENGINE_COVERAGE=ON ..
+      run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DTENGINE_BUILD_EXAMPLES=OFF -DTENGINE_BUILD_TESTS=ON -DTENGINE_COVERAGE=ON ..
    - name: build
      run: cmake --build build -j 2
    - name: test-data

--- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h
+++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref.h
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author:
+ */
+
+#ifndef __BATCHNORM_KERNEL_REF_H__
+#define __BATCHNORM_KERNEL_REF_H__
+
+
+#include "graph/tensor.h"
+#include "graph/node.h"
+#include "graph/graph.h"
+
+#include <stdbool.h>
+#include <math.h>
+
+struct ref_batchnorm_param
+{
+    int input_n;
+    int input_h;
+    int input_w;
+    int input_c;
+    int layout;
+    bool iscaffe;
+    float* scale_mean;
+    float* scale_var_inv;
+    float* gamma;
+    float* beta;
+    float in_scale;
+    int in_zero;
+    float out_scale;
+    int out_zero;
+};
+
+int ref_batchnorm_fp32(float* input, float* output, const struct ref_batchnorm_param* param);
+
+int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor, const struct ref_batchnorm_param* param);
+
+#endif
--- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c
+++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_fp32.c
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author:
+ */
+
+#include "batchnorm_kernel_ref.h"
+
+#include "graph/tensor.h"
+#include "graph/node.h"
+#include "graph/graph.h"
+#include "module/module.h"
+#include "operator/op.h"
+#include "utility/float.h"
+#include "utility/sys_port.h"
+#include "utility/log.h"
+#include "device/cpu/cpu_node.h"
+#include "device/cpu/cpu_graph.h"
+#include "device/cpu/cpu_module.h"
+
+
+int ref_batchnorm_fp32(float* input, float* output, const struct ref_batchnorm_param* param)
+{
+    float* scale_mean = param->scale_mean;
+    float* scale_var_inv = param->scale_var_inv;
+    float* gamma = param->gamma;
+    float* beta = param->beta;
+
+    int img_size = param->input_c * param->input_h * param->input_w;
+
+    for (int n = 0; n < param->input_n; ++n)
+    {
+        for (int h = 0; h < param->input_h; ++h)
+        {
+            for (int w = 0; w < param->input_w; ++w)
+            {
+                for (int c = 0; c < param->input_c; ++c)
+                {
+                    float s_mean = scale_mean[c];
+                    float s_var = scale_var_inv[c];
+                    float s_val1 = s_mean;
+                    float s_val2 = s_var;
+                    if (!param->iscaffe)
+                    {
+                        float s_gamma = gamma[c];
+                        float s_beta = beta[c];
+                        s_val1 = s_beta + s_gamma * s_mean;
+                        s_val2 = s_gamma * s_var;
+                    }
+                    int offset = n * img_size + c * param->input_h * param->input_w + h * param->input_w + w;
+                    output[offset] = input[offset] * s_val2 + s_val1;
+                }
+            }
+        }
+    }
+    return 0;
+}
--- a/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c
+++ b/source/device/cpu/op/batchnorm/batchnorm_kernel_ref_uint8.c
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author:
+ */
+
+#include "batchnorm_kernel_ref.h"
+
+#include "graph/tensor.h"
+#include "graph/node.h"
+#include "graph/graph.h"
+#include "module/module.h"
+#include "operator/op.h"
+#include "utility/float.h"
+#include "utility/sys_port.h"
+#include "utility/log.h"
+#include "device/cpu/cpu_node.h"
+#include "device/cpu/cpu_graph.h"
+#include "device/cpu/cpu_module.h"
+
+
+int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor, const struct ref_batchnorm_param* param)
+{
+    float* scale_mean = param->scale_mean;
+    float* scale_var_inv = param->scale_var_inv;
+    float* gamma = param->gamma;
+    float* beta = param->beta;
+
+    int img_size = param->input_c * param->input_h * param->input_w;
+    int total_size = img_size * param->input_n;
+
+    // dequant
+    uint8_t* input_uint8 = input_tensor->data;
+    uint8_t* output_uint8 = output_tensor->data;
+    float input_scale = input_tensor->scale;
+    float output_scale = output_tensor->scale;
+    int32_t input_zero = input_tensor->zero_point;
+    int32_t output_zero = output_tensor->zero_point;
+
+    float* data_fp32 = (float*) sys_malloc(total_size * sizeof(float));
+    for(int i = 0; i < total_size; i++)
+        data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale;
+
+    for (int n = 0; n < param->input_n; ++n)
+    {
+        for (int h = 0; h < param->input_h; ++h)
+        {
+            for (int w = 0; w < param->input_w; ++w)
+            {
+                for (int c = 0; c < param->input_c; ++c)
+                {
+                    float s_mean = scale_mean[c];
+                    float s_var = scale_var_inv[c];
+                    float s_val1 = s_mean;
+                    float s_val2 = s_var;
+                    if (!param->iscaffe)
+                    {
+                        float s_gamma = gamma[c];
+                        float s_beta = beta[c];
+                        s_val1 = s_beta + s_gamma * s_mean;
+                        s_val2 = s_gamma * s_var;
+                    }
+                    int offset = n * img_size + c * param->input_h * param->input_w + h * param->input_w + w;
+
+                    data_fp32[offset] = data_fp32[offset] * s_val2 + s_val1;
+                }
+            }
+        }
+    }
+
+    // quant
+    for(int i=0; i<total_size; i++)
+    {
+        int udata = (int)roundf(data_fp32[i] / output_scale + output_zero);
+        if (udata > 255)
+            udata = 255;
+        else if (udata < 0)
+            udata = 0;
+        output_uint8[i] = udata;
+    }
+
+    return 0;
+}
\ No newline at end of file
--- a/source/device/cpu/op/batchnorm/batchnorm_ref.c
+++ b/source/device/cpu/op/batchnorm/batchnorm_ref.c
@@ -34,147 +34,10 @@
 #include "device/cpu/cpu_graph.h"
 #include "device/cpu/cpu_module.h"

-#include <stdbool.h>
 #include <string.h>
-#include <math.h>

+#include "batchnorm_kernel_ref.h"

-struct ref_batchnorm_param
-{
-    int input_n;
-    int input_h;
-    int input_w;
-    int input_c;
-    int layout;
-    bool iscaffe;
-    float* scale_mean;
-    float* scale_var_inv;
-    float* gamma;
-    float* beta;
-    float in_scale;
-    int in_zero;
-    float out_scale;
-    int out_zero;
-};
-
-
-static int ref_batchnorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor, const struct ref_batchnorm_param* param, int num_thread)
-{
-    float* scale_mean = param->scale_mean;
-    float* scale_var_inv = param->scale_var_inv;
-    float* gamma = param->gamma;
-    float* beta = param->beta;
-
-    int img_size = param->input_c * param->input_h * param->input_w;
-    int total_size = img_size * param->input_n;
-
-    // dequant
-    uint8_t* input_uint8 = input_tensor->data;
-    uint8_t* output_uint8 = output_tensor->data;
-    float input_scale = input_tensor->scale;
-    float output_scale = output_tensor->scale;
-    int32_t input_zero = input_tensor->zero_point;
-    int32_t output_zero = output_tensor->zero_point;
-
-    float* data_fp32 = (float*) sys_malloc(total_size * sizeof(float));
-    for(int i = 0; i < total_size; i++)
-        data_fp32[i] = ((float) input_uint8[i] - (float)input_zero) * input_scale;
-
-    for (int n = 0; n < param->input_n; ++n)
-    {
-#pragma omp parallel for num_threads(num_thread)
-        for (int h = 0; h < param->input_h; ++h)
-        {
-            for (int w = 0; w < param->input_w; ++w)
-            {
-                for (int c = 0; c < param->input_c; ++c)
-                {
-                    float s_mean = scale_mean[c];
-                    float s_var = scale_var_inv[c];
-                    float s_val1 = s_mean;
-                    float s_val2 = s_var;
-                    if (!param->iscaffe)
-                    {
-                        float s_gamma = gamma[c];
-                        float s_beta = beta[c];
-                        s_val1 = s_beta + s_gamma * s_mean;
-                        s_val2 = s_gamma * s_var;
-                    }
-                    int offset = 0;
-                    if (TENGINE_LAYOUT_NCHW == param->layout)
-                    {
-                        offset = n * img_size + c * param->input_h * param->input_w + h * param->input_w + w;
-                    }
-                    else
-                    {
-                        offset = n * img_size + h * param->input_w * param->input_c + w * param->input_c + c;
-                    }
-                    data_fp32[offset] = data_fp32[offset] * s_val2 + s_val1;
-                }
-            }
-        }
-    }
-
-    // quant
-    for(int i=0; i<total_size; i++)
-    {
-        int udata = (int)roundf(data_fp32[i] / output_scale + output_zero);
-        if (udata > 255)
-            udata = 255;
-        else if (udata < 0)
-            udata = 0;
-        output_uint8[i] = udata;
-    }
-
-    return 0;
-}
-
-
-static int ref_batchnorm_fp32(float* input, float* output, const struct ref_batchnorm_param* param, int num_thread)
-{
-    float* scale_mean = param->scale_mean;
-    float* scale_var_inv = param->scale_var_inv;
-    float* gamma = param->gamma;
-    float* beta = param->beta;
-
-    int img_size = param->input_c * param->input_h * param->input_w;
-
-    for (int n = 0; n < param->input_n; ++n)
-    {
-#pragma omp parallel for num_threads(num_thread)
-        for (int h = 0; h < param->input_h; ++h)
-        {
-            for (int w = 0; w < param->input_w; ++w)
-            {
-                for (int c = 0; c < param->input_c; ++c)
-                {
-                    float s_mean = scale_mean[c];
-                    float s_var = scale_var_inv[c];
-                    float s_val1 = s_mean;
-                    float s_val2 = s_var;
-                    if (!param->iscaffe)
-                    {
-                        float s_gamma = gamma[c];
-                        float s_beta = beta[c];
-                        s_val1 = s_beta + s_gamma * s_mean;
-                        s_val2 = s_gamma * s_var;
-                    }
-                    int offset = 0;
-                    if (TENGINE_LAYOUT_NCHW == param->layout)
-                    {
-                        offset = n * img_size + c * param->input_h * param->input_w + h * param->input_w + w;
-                    }
-                    else
-                    {
-                        offset = n * img_size + h * param->input_w * param->input_c + w * param->input_c + c;
-                    }
-                    output[offset] = input[offset] * s_val2 + s_val1;
-                }
-            }
-        }
-    }
-    return 0;
-}

 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
@@ -195,27 +58,15 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 {
    struct node* ir_node = exec_node->ir_node;
    struct graph* ir_graph = ir_node->graph;
-    struct tensor* output_tensor;
-    const struct tensor* input_tensor;
-    int channel_num;
-    // struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    const struct tensor* mean_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[3]);
-    const struct tensor* var_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[4]);
-    ;
+    struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
+    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
+    struct tensor* mean_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[3]);
+    struct tensor* var_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[4]);

    struct ref_batchnorm_param* op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv;
    struct batchnorm_param* batchnorm_param = ( struct batchnorm_param* )ir_node->op.param_mem;

-    if (ir_graph->graph_layout == TENGINE_LAYOUT_NCHW)
-    {
-        channel_num = input_tensor->dims[1];
-    }
-    else if (ir_graph->graph_layout == TENGINE_LAYOUT_NHWC)
-    {
-        channel_num = input_tensor->dims[3];
-    }
+    int channel_num = input_tensor->dims[1];

    float* scale_mean = ( float* )sys_malloc(channel_num * sizeof(float));
    float* scale_var_inv = ( float* )sys_malloc(channel_num * sizeof(float));
@@ -258,63 +109,37 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 {
    struct node* ir_node = exec_node->ir_node;
    struct graph* ir_graph = ir_node->graph;
-    struct tensor* input_tensor;
-    struct tensor* output_tensor;
+    struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
+    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

-    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    struct ref_batchnorm_param* batchnorm_op_param = ( struct ref_batchnorm_param* )exec_node->ops_priv;
    void* out_data = output_tensor->data;
    void* input = input_tensor->data;

-    if (TENGINE_LAYOUT_NCHW == ir_graph->graph_layout)
+    if (4 == input_tensor->dim_num)
+    {
+        batchnorm_op_param->input_n = input_tensor->dims[0];
+        batchnorm_op_param->input_c = input_tensor->dims[1];
+        batchnorm_op_param->input_h = input_tensor->dims[2];
+        batchnorm_op_param->input_w = input_tensor->dims[3];
+    }
+    else if (3 == input_tensor->dim_num)
    {
-        if (4 == input_tensor->dim_num)
-        {
-            batchnorm_op_param->input_n = input_tensor->dims[0];
-            batchnorm_op_param->input_c = input_tensor->dims[1];
-            batchnorm_op_param->input_h = input_tensor->dims[2];
-            batchnorm_op_param->input_w = input_tensor->dims[3];
-        }
-        else if (3 == input_tensor->dim_num)
-        {
-            batchnorm_op_param->input_n = input_tensor->dims[0];
-            batchnorm_op_param->input_c = input_tensor->dims[1];
-            batchnorm_op_param->input_w = input_tensor->dims[2];
-            batchnorm_op_param->input_h = 1;
-        }
-        else
-        {
-            return false;
-        }
+        batchnorm_op_param->input_n = input_tensor->dims[0];
+        batchnorm_op_param->input_c = input_tensor->dims[1];
+        batchnorm_op_param->input_w = input_tensor->dims[2];
+        batchnorm_op_param->input_h = 1;
    }
    else
    {
-        if (4 == input_tensor->dim_num)
-        {
-            batchnorm_op_param->input_n = input_tensor->dims[0];
-            batchnorm_op_param->input_c = input_tensor->dims[3];
-            batchnorm_op_param->input_h = input_tensor->dims[1];
-            batchnorm_op_param->input_w = input_tensor->dims[2];
-        }
-        else if (3 == input_tensor->dim_num)
-        {
-            batchnorm_op_param->input_n = input_tensor->dims[0];
-            batchnorm_op_param->input_c = input_tensor->dims[2];
-            batchnorm_op_param->input_w = input_tensor->dims[1];
-            batchnorm_op_param->input_h = 1;
-        }
-        else
-        {
-            return false;
-        }
+        return -1;
    }
    
    int ret = -1;
    if (input_tensor->data_type == TENGINE_DT_FP32)
-        ret = ref_batchnorm_fp32(input, out_data, batchnorm_op_param, exec_graph->num_thread);
+        ret = ref_batchnorm_fp32(input, out_data, batchnorm_op_param);
    else if (input_tensor->data_type == TENGINE_DT_UINT8)
-        ret = ref_batchnorm_uint8(input_tensor, output_tensor, batchnorm_op_param, exec_graph->num_thread);
+        ret = ref_batchnorm_uint8(input_tensor, output_tensor, batchnorm_op_param);

    return ret;
 }

--- a/source/device/cpu/op/clip/clip_ref.c
+++ b/source/device/cpu/op/clip/clip_ref.c
@@ -58,11 +58,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
    struct graph* ir_graph = ir_node->graph;
    struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
-    int layout = ir_graph->graph_layout;

    struct clip_param* clip_param = ( struct clip_param* )ir_node->op.param_mem;

-    int in_size = input_tensor->elem_num;
    float max = clip_param->max;
    float min = clip_param->min;


--- a/source/device/cpu/op/comparison/comparison_kernel_ref.c
+++ b/source/device/cpu/op/comparison/comparison_kernel_ref.c
--- a/source/device/cpu/op/conv/conv_ref.c
+++ b/source/device/cpu/op/conv/conv_ref.c
@@ -45,20 +45,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 {
    struct node* ir_node = exec_node->ir_node;
    struct graph* ir_graph = ir_node->graph;
-    struct tensor* input_tensor;
-    struct tensor* weight_tensor;
+    struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
+    struct tensor* weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    struct tensor* bias_tensor = NULL;
-    struct tensor* output_tensor = NULL;
+    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    int num_thread = exec_graph->num_thread;
    int cpu_affinity = exec_graph->cpu_affinity;

-    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    if (ir_node->input_num > 2)
    {
        bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
    }
-    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;


--- a/source/device/cpu/op/matmul/matmul_ref.c
+++ b/source/device/cpu/op/matmul/matmul_ref.c
@@ -39,36 +39,34 @@ struct ref_matmul_data
 {
    int batch;
    int c;
-    int h;
-    int w;
+    int m;
+    int n;
    int k;
-    int zero[3];    // input, kernel, output
-    float scale[3];    // input, kernel, output
 };

-static int ref_matmul_fp32(const float* input0, float* input1, float* output, struct ref_matmul_data* param)
+static int ref_matmul_fp32(float* input0, float* input1, float* output, struct ref_matmul_data* param)
 {
    int batch = param->batch;
    int c = param->c;
-    int h = param->h;
-    int w = param->w;
+    int m = param->m;
+    int n = param->n;
    int k = param->k;

-    for (int n = 0; n < batch; ++n)
+    for (int b = 0; b < batch; ++b)
    {
        for (int in_c = 0; in_c < c; in_c++)
        {
-            const float* data0 = input0 + n * c * h * w + in_c * h * w;
-            float* data1 = input1 + n * c * w * k + in_c * w * k;
-            for (int in_h = 0; in_h < h; in_h++)
+            float* data0 = input0 + b * c * m * k + in_c * m * k;
+            float* data1 = input1 + b * c * n * k + in_c * n * k;
+            for (int in_m = 0; in_m < m; in_m++)
            {
-                for (int in_k = 0; in_k < k; in_k++)
+                for (int in_n = 0; in_n < n; in_n++)
                {
                    float tmp = 0;
-                    for (int in_w = 0; in_w < w; in_w++)
+                    for (int in_k = 0; in_k < k; in_k++)
                    {
-                        int index0 = in_h * w + in_w;
-                        int index1 = in_w * k + in_k;
+                        int index0 = in_m * k + in_k;
+                        int index1 = n * in_k + in_n;
                        tmp += data0[index0] * data1[index1];
                    }
                    *output = tmp;
@@ -77,7 +75,6 @@ static int ref_matmul_fp32(const float* input0, float* input1, float* output, st
            }
        }
    }
-
    return 0;
 }

@@ -105,27 +102,27 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
    {
        param.batch = input_tensor->dims[0];
        param.c = input_tensor->dims[1];
-        param.h = input_tensor->dims[2];
-        param.w = input_tensor->dims[3];
+        param.m = input_tensor->dims[2];
+        param.n = input_tensor1->dims[3];
        param.k = input_tensor->dims[3];
    }
    else if (dim_size == 3)
    {
        param.batch = 1;
        param.c = input_tensor->dims[0];
-        param.h = input_tensor->dims[1];
-        param.w = input_tensor->dims[2];
+        param.m = input_tensor->dims[1];
+        param.n = input_tensor1->dims[2];
        param.k = input_tensor->dims[2];
    }
    else if (dim_size == 2)
    {
        param.batch = 1;
        param.c = 1;    // input0->Getse().Shape(0);
-        param.h = input_tensor->dims[0];
-        param.w = input_tensor->dims[2];
-        param.k = input_tensor->dims[2];
+        param.m = input_tensor->dims[0];
+        param.n = input_tensor1->dims[1];
+        param.k = input_tensor->dims[1];
    }
-    const void* input_data0 = input_tensor->data;
+    void* input_data0 = input_tensor->data;
    void* input_data1 = input_tensor1->data;
    void* output_data = output_tensor->data;


--- a/source/device/cpu/op/reduction/reduction_ref.c
+++ b/source/device/cpu/op/reduction/reduction_ref.c
@@ -58,11 +58,9 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 {
    struct node* ir_node = exec_node->ir_node;
    struct graph* ir_graph = ir_node->graph;
-    struct tensor* input_tensor;
-    struct tensor* output_tensor;
+    struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
+    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

-    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
-    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    struct reduction_param* reduction_param = ( struct reduction_param* )ir_node->op.param_mem;
    struct reduce_param_ref param;
    int out_tensor_size = 1;
@@ -82,7 +80,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
    int dim1 = dims[1];
    int dim2 = dims[2];
    int dim3 = dims[3];
-    

    param.param_dim[0] = reduction_param->dim_0;
    param.param_dim[1] = reduction_param->dim_1;
@@ -94,10 +91,8 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
    int ret = ref_reduce_fp32(( float* )input_tensor->data, ( float* )output_tensor->data, dim0, dim1, dim2, dim3,
                              out_tensor_size, &param, in_dim_num, dims);
    free(dims);
-    if (ret < 0)
-        return -1;
-    else
-        return 0;
+
+    return ret;
 }

 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)

--- a/source/operator/prototype/matmul.c
+++ b/source/operator/prototype/matmul.c
@@ -37,17 +37,44 @@ static int infer_shape(struct node* node)
    struct tensor* input1 = get_ir_graph_tensor(graph, node->input_tensors[1]);
    struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]);

-    int output_number;
-
    if (input1->dim_num != input0->dim_num)
    {
        TLOG_ERR("dim's size of inputs must be qual for operator matmul\n");
        return -1;
    }

-    set_ir_tensor_shape(output, input0->dims, input0->dim_num);
+    if (input0->dim_num == 2)
+    {
+        int dims[2];
+        dims[0] = input0->dims[0];
+        dims[1] = input1->dims[1];
+        set_ir_tensor_shape(output, dims, 2);

-    return 0;
+        return 0;
+    }
+    else if (input0->dim_num == 3)
+    {
+        int dims[3];
+        dims[0] = input0->dims[0];
+        dims[1] = input0->dims[1];
+        dims[2] = input1->dims[2];
+        set_ir_tensor_shape(output, dims, 3);
+
+        return 0;
+    }
+    else if (input0->dim_num == 4)
+    {
+        int dims[4];
+        dims[0] = input0->dims[0];
+        dims[1] = input0->dims[1];
+        dims[2] = input0->dims[2];
+        dims[3] = input1->dims[3];
+        set_ir_tensor_shape(output, dims, 4);
+
+        return 0;
+    }        
+
+    return -1;
 }



--- a/source/serializer/tmfile/tm2_format.h
+++ b/source/serializer/tmfile/tm2_format.h
@@ -240,8 +240,8 @@ typedef uint8_t tm_bool_t;                      /* bool is 1-byte unsigned integ
 #define TM2_OPTYPE_UNSQUEEZE                     88
 #define TM2_OPTYPE_REDUCEL2                      89
 #define TM2_OPTYPE_MEAN                          90
-#define TM2_OPTYPE_EXPAND                        91
-#define TM2_OPTYPE_MATMUL                        92
+#define TM2_OPTYPE_MATMUL                        91
+#define TM2_OPTYPE_EXPAND                        92
 #define TM2_OPTYPE_SCATTER                       93
 #define TM2_OPTYPE_SHAPE                         94
 #define TM2_OPTYPE_WHERE                         95

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -111,16 +111,25 @@ if(PROTOBUF_FOUND)
    tengine_onnx_op_test(test_onnx_op_gru_with_initial_bias         op/test_onnx_op_gru_with_initial_bias.cpp)
    tengine_onnx_op_test(test_onnx_op_hardsigmoid                   op/test_onnx_op_hardsigmoid.cpp)
    tengine_onnx_op_test(test_onnx_op_leakyrelu                     op/test_onnx_op_leakyrelu.cpp)
+    tengine_onnx_op_test(test_onnx_op_less                          op/test_onnx_op_less.cpp)
    tengine_onnx_op_test(test_onnx_op_log                           op/test_onnx_op_log.cpp)
    # tengine_onnx_op_test(test_onnx_op_logsoftmax_default_axis       op/test_onnx_op_logsoftmax_default_axis.cpp)
    tengine_onnx_op_test(test_onnx_op_lstm_defaults                 op/test_onnx_op_lstm_defaults.cpp)
    tengine_onnx_op_test(test_onnx_op_lstm_with_initial_bias        op/test_onnx_op_lstm_with_initial_bias.cpp)
+    tengine_onnx_op_test(test_onnx_op_matmul_2d                     op/test_onnx_op_matmul_2d.cpp)
+    tengine_onnx_op_test(test_onnx_op_matmul_3d                     op/test_onnx_op_matmul_3d.cpp)
+    tengine_onnx_op_test(test_onnx_op_matmul_4d                     op/test_onnx_op_matmul_4d.cpp)
    tengine_onnx_op_test(test_onnx_op_maxpool_2d_default            op/test_onnx_op_maxpool_2d_default.cpp)
    # tengine_onnx_op_test(test_onnx_op_maxpool_2d_dilations          op/test_onnx_op_maxpool_2d_dilations.cpp)
    tengine_onnx_op_test(test_onnx_op_maxpool_2d_pads               op/test_onnx_op_maxpool_2d_pads.cpp)
    tengine_onnx_op_test(test_onnx_op_neg                           op/test_onnx_op_neg.cpp)
    tengine_onnx_op_test(test_onnx_op_pow                           op/test_onnx_op_pow.cpp)
    tengine_onnx_op_test(test_onnx_op_reciprocal                    op/test_onnx_op_reciprocal.cpp)
+    tengine_onnx_op_test(test_onnx_op_reduce_log_sum_default        op/test_onnx_op_reduce_log_sum_default.cpp)
+    tengine_onnx_op_test(test_onnx_op_reduce_max_default_axes_keepdim_example               op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp)
+    # tengine_onnx_op_test(test_onnx_op_reduce_mean_default_axes_keepdims_example             op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp)
+    # tengine_onnx_op_test(test_onnx_op_reduce_min_default_axes_keepdims_example              op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp)
+    tengine_onnx_op_test(test_onnx_op_reduce_sum_square_default_axes_keepdims_example       op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp)
    tengine_onnx_op_test(test_onnx_op_relu                          op/test_onnx_op_relu.cpp)
    tengine_onnx_op_test(test_onnx_op_selu                          op/test_onnx_op_selu.cpp)
    tengine_onnx_op_test(test_onnx_op_selu_default                  op/test_onnx_op_selu_default.cpp)

--- a/tests/op/test_onnx_op_less.cpp
+++ b/tests/op/test_onnx_op_less.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_less";
+std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int n_0 = 1;
+    int c_0 = 3;
+    int h_0 = 4;
+    int w_0 = 5;
+
+    int n_1 = 1;
+    int c_1 = 3;
+    int h_1 = 4;
+    int w_1 = 5;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    /* input 0 */
+    int input_size_0 = n_0 * c_0 * h_0 * w_0;
+    int dims_0[] = {n_0, c_0, h_0, w_0};
+    std::vector<float> feature_in_0(input_size_0);
+
+    tensor_t input_tensor_0 = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor_0 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_0, dims_0, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_0, feature_in_0.data(), input_size_0 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }
+
+    /* input 1 */
+    int input_size_1 = n_1 * c_1 * h_1 * w_1;
+    int dims_1[] = {n_1, c_1, h_1, w_1};
+    std::vector<float> feature_in_1(input_size_1);
+
+    tensor_t input_tensor_1 = get_graph_input_tensor(graph, 1, 0);
+    if (input_tensor_1 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_1, dims_1, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_1, feature_in_1.data(), input_size_1 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in_0.data(), input_pb_0);
+    get_pb_data(feature_in_1.data(), input_pb_1);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    // int ret = float_mismatch(output_data, reference_out.data(), output_size);
+    int ret = 0;
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_matmul_2d.cpp
+++ b/tests/op/test_onnx_op_matmul_2d.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_matmul_2d";
+std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int h_0 = 3;
+    int w_0 = 4;
+
+    int h_1 = 4;
+    int w_1 = 3;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    /* input 0 */
+    int input_size_0 = h_0 * w_0;
+    int dims_0[] = {h_0, w_0};
+    std::vector<float> feature_in_0(input_size_0);
+
+    tensor_t input_tensor_0 = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor_0 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_0, dims_0, 2) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_0, feature_in_0.data(), input_size_0 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }
+
+    /* input 1 */
+    int input_size_1 = h_1 * w_1;
+    int dims_1[] = {h_1, w_1};
+    std::vector<float> feature_in_1(input_size_1);
+
+    tensor_t input_tensor_1 = get_graph_input_tensor(graph, 1, 0);
+    if (input_tensor_1 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_1, dims_1, 2) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_1, feature_in_1.data(), input_size_1 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in_0.data(), input_pb_0);
+    get_pb_data(feature_in_1.data(), input_pb_1);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_matmul_3d.cpp
+++ b/tests/op/test_onnx_op_matmul_3d.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_matmul_3d";
+std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int c_0 = 2;
+    int m_0 = 3;
+    int k_0 = 4;
+
+    int c_1 = 2;
+    int k_1 = 4;
+    int n_1 = 3;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    /* input 0 */
+    int input_size_0 = c_0 * m_0 * k_0;
+    int dims_0[] = {c_0, m_0, k_0};
+    std::vector<float> feature_in_0(input_size_0);
+
+    tensor_t input_tensor_0 = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor_0 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_0, dims_0, 3) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_0, feature_in_0.data(), input_size_0 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }
+
+    /* input 1 */
+    int input_size_1 = c_1 * k_1 * n_1;
+    int dims_1[] = {c_1, k_1, n_1};
+    std::vector<float> feature_in_1(input_size_1);
+
+    tensor_t input_tensor_1 = get_graph_input_tensor(graph, 1, 0);
+    if (input_tensor_1 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_1, dims_1, 3) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_1, feature_in_1.data(), input_size_1 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in_0.data(), input_pb_0);
+    get_pb_data(feature_in_1.data(), input_pb_1);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_matmul_4d.cpp
+++ b/tests/op/test_onnx_op_matmul_4d.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_matmul_4d";
+std::string input_pb_0  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string input_pb_1  = "../onnx_node/" + node + "/test_data_set_0/input_1.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int b_0 = 1;
+    int c_0 = 2;
+    int m_0 = 3;
+    int k_0 = 4;
+
+    int b_1 = 1;
+    int c_1 = 2;
+    int k_1 = 4;
+    int n_1 = 3;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    /* input 0 */
+    int input_size_0 = b_0 * c_0 * m_0 * k_0;
+    int dims_0[] = {b_0, c_0, m_0, k_0};
+    std::vector<float> feature_in_0(input_size_0);
+
+    tensor_t input_tensor_0 = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor_0 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_0, dims_0, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_0, feature_in_0.data(), input_size_0 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }
+
+    /* input 1 */
+    int input_size_1 = b_1 * c_1 * k_1 * n_1;
+    int dims_1[] = {b_1, c_1, k_1, n_1};
+    std::vector<float> feature_in_1(input_size_1);
+
+    tensor_t input_tensor_1 = get_graph_input_tensor(graph, 1, 0);
+    if (input_tensor_1 == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor_1, dims_1, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor_1, feature_in_1.data(), input_size_1 * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in_0.data(), input_pb_0);
+    get_pb_data(feature_in_1.data(), input_pb_1);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_reduce_log_sum_default.cpp
+++ b/tests/op/test_onnx_op_reduce_log_sum_default.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_reduce_log_sum_default";
+std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int n = 1;
+    int c = 3;
+    int h = 4;
+    int w = 5;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    int input_size = n * c * h * w;
+    int dims[] = {n, c, h, w};
+    std::vector<float> feature_in(input_size);
+
+    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor, feature_in.data(), input_size * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in.data(), input_pb);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp
+++ b/tests/op/test_onnx_op_reduce_max_default_axes_keepdim_example.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_reduce_max_default_axes_keepdim_example";
+std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int n = 1;
+    int c = 3;
+    int h = 4;
+    int w = 5;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    int input_size = n * c * h * w;
+    int dims[] = {n, c, h, w};
+    std::vector<float> feature_in(input_size);
+
+    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor, feature_in.data(), input_size * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in.data(), input_pb);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp
+++ b/tests/op/test_onnx_op_reduce_mean_default_axes_keepdims_example.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_reduce_mean_default_axes_keepdims_example";
+std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int n = 1;
+    int c = 3;
+    int h = 4;
+    int w = 5;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    int input_size = n * c * h * w;
+    int dims[] = {n, c, h, w};
+    std::vector<float> feature_in(input_size);
+
+    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor, feature_in.data(), input_size * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in.data(), input_pb);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp
+++ b/tests/op/test_onnx_op_reduce_min_default_axes_keepdims_example.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_reduce_min_default_axes_keepdims_example";
+std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int n = 1;
+    int c = 3;
+    int h = 4;
+    int w = 5;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    int input_size = n * c * h * w;
+    int dims[] = {n, c, h, w};
+    std::vector<float> feature_in(input_size);
+
+    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor, feature_in.data(), input_size * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in.data(), input_pb);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
--- a/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp
+++ b/tests/op/test_onnx_op_reduce_sum_square_default_axes_keepdims_example.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+
+#include "test_onnx_op.h"
+
+std::string node      = "test_reduce_sum_square_default_axes_keepdims_example";
+std::string input_pb  = "../onnx_node/" + node + "/test_data_set_0/input_0.pb";
+std::string output_pb = "../onnx_node/" + node + "/test_data_set_0/output_0.pb";
+std::string model     = "../onnx_node/" + node + "/onnx.tmfile";
+
+int main(int argc, char* argv[])
+{
+    int n = 1;
+    int c = 3;
+    int h = 4;
+    int w = 5;
+
+    /* set runtime options */
+    struct options opt;
+    opt.num_thread = 1;
+    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
+
+    /* inital tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(nullptr, "tengine", model.c_str());
+    if (nullptr == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        return -1;
+    }
+
+    /* set the shape, data buffer of input_tensor of the graph */
+    int input_size = n * c * h * w;
+    int dims[] = {n, c, h, w};
+    std::vector<float> feature_in(input_size);
+
+    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor == nullptr)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (set_tensor_buffer(input_tensor, feature_in.data(), input_size * 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+        return -1;
+    }    
+
+    /* prerun graph, set work options(num_thread, cluster, precision) */
+    if (prerun_graph_multithread(graph, opt) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_pb_data(feature_in.data(), input_pb);
+
+    /* run graph */
+    if (run_graph(graph, 1) < 0)
+    {
+        fprintf(stderr, "Run graph failed\n");
+        return -1;
+    }
+
+    /* get the current result of inference */
+    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
+    float* output_data = ( float* )get_tensor_buffer(output_tensor);
+    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
+
+    /* get the reference result of inference */
+    std::vector<float> reference_out(output_size);
+    get_pb_data(reference_out.data(), output_pb);
+
+    /* check the result */
+    int ret = float_mismatch(output_data, reference_out.data(), output_size);
+
+    /* release tengine */
+    postrun_graph(graph);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}