add yolact demo (#366)

* update yolact demo * update action for linux-x86 opencv and avx2 * Update workflows.yml

add yolact demo (#366)
* update yolact demo * update action for linux-x86 opencv and avx2 * Update workflows.yml
f2d49d40 · BUG1989 · GitHub · 7561683b · f2d49d40 · f2d49d40
29 changed file
--- a/.github/workflows/workflows.yml
+++ b/.github/workflows/workflows.yml
@@ -7,12 +7,30 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v1
+    - name: update
+      run: sudo apt-get update
+    # OpenCV is required by the examples that are built with OpenCV (tm_yolact)
+    - name: opencv
+      run: sudo apt-get install -y libopencv-dev
    - name: configure
      run: mkdir build && cd build && cmake ..
    - name: build
      run: cmake --build build -j 2


+  # x86 build with the TENGINE_AVX2 option switched on
+  linux-gcc-avx2:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v1
+    - name: update
+      run: sudo apt-get update
+    # OpenCV is required by the examples that are built with OpenCV (tm_yolact)
+    - name: opencv
+      run: sudo apt-get install -y libopencv-dev
+    - name: configure
+      run: mkdir build && cd build && cmake -DTENGINE_AVX2=ON ..
+    - name: build
+      run: cmake --build build -j 2
+
+
  android-armv7-a:
    runs-on: ubuntu-latest
    steps:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ option(TENGINE_BUILD_TESTS "build tests" OFF)
 option(TENGINE_DEBUG_DATA "extract data for every layer" OFF)
 option(TENGINE_DEBUG_TIME "print time information for every layer" OFF)
 option(TENGINE_DEBUG_MEM_STAT "print memory status for library" OFF)
-option(TENGINE_ARCH_X86_AVX "build avx2 for x86" OFF)
+option(TENGINE_AVX2 "build avx2 for x86" OFF)


 # add_definitions(-DCONFIG_DISABLE_PARAM_ACCESS)

--- a/README.md
+++ b/README.md
@@ -78,6 +78,6 @@ Tengine Lite 参考和借鉴了下列项目：

 ## 技术讨论
 - Github issues
- QQ 群: 829565581 (答案：openailab)
+- QQ 群: 829565581
 - Email: Support@openailab.com
 - Tengine 社区: http://www.tengine.org.cn/
--- a/README_EN.md
+++ b/README_EN.md
@@ -81,6 +81,6 @@ Tengine Lite got ideas and developed based on these projects：

 ## Tech Forum
 - Github issues
- QQ groupchat: 829565581 (Answer: openailab)
+- QQ groupchat: 829565581
 - Email: Support@openailab.com
 - Tengine Community: http://www.tengine.org.cn/
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
-# macro for adding test
+# macro for adding examples
 macro (tengine_example name file)
    add_executable(${name}
        "${CMAKE_CURRENT_SOURCE_DIR}/${file}"
@@ -17,3 +17,20 @@ tengine_example(tm_mobilenet_ssd_uint8      tm_mobilenet_ssd_uint8.cpp)
 tengine_example(tm_retinaface               tm_retinaface.cpp)
 tengine_example(tm_yolov3_tiny              tm_yolov3_tiny.cpp)

+# add examples with opencv
+if (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
+    # OpenCV is optional for the examples: search quietly so that a missing
+    # package reaches the warning branch below instead of aborting configure
+    # (with REQUIRED the else() branch could never be taken).
+    find_package(OpenCV QUIET)
+
+    if(OpenCV_FOUND)
+        # only touch the include path once OpenCV_INCLUDE_DIRS is known to be valid
+        include_directories(${OpenCV_INCLUDE_DIRS})
+
+        # macro for adding examples that additionally link against OpenCV
+        macro (tengine_example_cv name file)
+            add_executable(${name}  "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
+            target_link_libraries(${name} ${CMAKE_PROJECT_NAME} ${OpenCV_LIBS})
+        endmacro()
+
+        tengine_example_cv(tm_yolact        tm_yolact.cpp)
+    else()
+        message(WARNING "OpenCV not found, some examples won't be built")
+    endif()
+endif()
--- a/examples/tm_yolact.cpp
+++ b/examples/tm_yolact.cpp
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (c) 2020, OPEN AI LAB
+ * Author: qtang@openailab.com
+ */
+
+/*
+ * Parts of the following code in this file refs to
+ * https://github.com/Tencent/ncnn/blob/master/examples/yolact.cpp
+ * Tencent is pleased to support the open source community by making ncnn
+ * available.
+ *
+ * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this
+ * file except in compliance with the License. You may obtain a copy of the
+ * License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ */
+
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <algorithm>
+#include <cmath>
+#include <vector>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include "tengine_c_api.h"
+
+
+/* One detection: bounding box, class, score and instance mask. */
+struct Object
+{
+    cv::Rect_<float> rect;          // bounding box; detect_yolact() fills it in source-image pixels
+    int label;                      // class index; draw_objects() uses it to index class_names
+    float prob;                     // score of `label`
+    std::vector<float> maskdata;    // mask coefficients copied from the network output (32 values in detect_yolact())
+    cv::Mat mask;                   // CV_8UC1 mask of source-image size, pixels are 0 or 255
+};
+
+/*
+ * Convert an OpenCV image into a planar (channel-major) float buffer.
+ *
+ * sample      source image with 1, 3 or 4 channels
+ * input_data  destination, must have room for 3 * img_h * img_w floats
+ * img_h       height the image is resized to
+ * img_w       width the image is resized to
+ * mean/scale  3 values each; every output is (pixel - mean[c]) * scale[c]
+ * swapRB      when 1, a 3-channel image is converted with COLOR_BGR2RGB first
+ *             (1- and 4-channel inputs are always converted to BGR)
+ */
+void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int img_w, const float* mean, const float* scale, int swapRB = 0)
+{
+    cv::Mat img;
+    if(sample.channels() == 4)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_BGRA2BGR);
+    }
+    else if(sample.channels() == 1)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_GRAY2BGR);
+    }
+    else if(sample.channels() == 3 && swapRB == 1)
+    {
+        cv::cvtColor(sample, img, cv::COLOR_BGR2RGB);
+    }
+    else
+    {
+        img = sample;
+    }
+
+    // cv::Size is (width, height). The previous (img_h, img_w) order produced a
+    // transposed size and only worked because the demo input is square.
+    cv::resize(img, img, cv::Size(img_w, img_h));
+    img.convertTo(img, CV_32FC3);
+
+    // img is interleaved (HWC); scatter every pixel into the planar (CHW) output
+    const float* img_data = ( const float* )img.data;
+    const int hw = img_h * img_w;
+    for(int h = 0; h < img_h; h++)
+    {
+        for(int w = 0; w < img_w; w++)
+        {
+            for(int c = 0; c < 3; c++)
+            {
+                input_data[c * hw + h * img_w + w] = (*img_data - mean[c]) * scale[c];
+                img_data++;
+            }
+        }
+    }
+}
+
+/* Prior (anchor) box in centre/size form; generate_priorbox() fills it with
+ * values normalised by the feature-map size (centre) and by 550 (size). */
+struct Box2f
+{
+    float cx;    // centre x
+    float cy;    // centre y
+    float w;     // width
+    float h;     // height
+};
+
+/*
+ * Build the prior boxes for the five feature maps (69, 35, 18, 9, 5 cells per
+ * side), three aspect ratios per cell.
+ *
+ * num_priores  number of boxes the caller wants. The tables below yield
+ *              3 * (69*69 + 35*35 + 18*18 + 9*9 + 5*5) = 19248 boxes; a smaller
+ *              request is truncated, a larger one leaves the surplus entries
+ *              value-initialised. A non-positive request yields an empty vector.
+ *
+ * Returns the boxes ordered by level, row, column, aspect ratio.
+ */
+static std::vector<Box2f> generate_priorbox(int num_priores)
+{
+    std::vector<Box2f> priorboxs(num_priores > 0 ? num_priores : 0);
+
+    const int conv_ws[5] = {69, 35, 18, 9, 5};
+    const int conv_hs[5] = {69, 35, 18, 9, 5};
+
+    const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
+    const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};
+
+    size_t index = 0;
+
+    for(int i = 0; i < 5; i++)
+    {
+        const int conv_w = conv_ws[i];
+        const int conv_h = conv_hs[i];
+        const float scale = scales[i];
+        for(int ii = 0; ii < conv_h; ii++)
+        {
+            for(int j = 0; j < conv_w; j++)
+            {
+                const float cx = (j + 0.5f) / conv_w;
+                const float cy = (ii + 0.5f) / conv_h;
+
+                for (int k = 0; k < 3; k++)
+                {
+                    // never write past the number of boxes the caller asked for
+                    if (index >= priorboxs.size())
+                        return priorboxs;
+
+                    const float ar = std::sqrt(aspect_ratios[k]);
+                    const float w = scale * ar / 550;
+
+                    // The previous code computed h = scale / ar / 550 and then
+                    // immediately overwrote it with w, so priors are square.
+                    // That behaviour is kept; only the dead store is removed.
+                    const float h = w;
+
+                    Box2f& priorbox = priorboxs[index];
+
+                    priorbox.cx = cx;
+                    priorbox.cy = cy;
+                    priorbox.w = w;
+                    priorbox.h = h;
+
+                    index += 1;
+                }
+            }
+        }
+    }
+
+    return priorboxs;
+}
+
+/* Area shared by the bounding boxes of two detections (0 when they do not overlap). */
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+
+/*
+ * Per-class non-maximum suppression.
+ *
+ * class_candidates  one candidate list per class; each list is sorted in place
+ *                   by descending score and may be truncated to nms_top_k
+ * objects           receives the surviving detections of all classes, sorted
+ *                   by descending score and limited to keep_top_k
+ * iou_thresh        a candidate is dropped when its IoU with ANY higher-scored
+ *                   candidate of the same class exceeds this value (dropped
+ *                   candidates still suppress lower-scored ones)
+ * nms_top_k         per-class limit applied before suppression, <= 0 disables it
+ * keep_top_k        overall limit on the result, negative disables it
+ */
+static void fast_nms(std::vector< std::vector<Object> >& class_candidates, std::vector<Object>& objects, const float iou_thresh, const int nms_top_k, const int keep_top_k)
+{
+    for(size_t i = 0; i < class_candidates.size(); i++)
+    {
+        std::vector<Object>& candidate = class_candidates[i];
+        if (candidate.empty())
+            continue;
+
+        std::sort(candidate.begin(), candidate.end(),
+                  [](const Object& a, const Object& b) {return a.prob > b.prob;});
+
+        // Truncate only when the list is LONGER than nms_top_k. The previous
+        // test was inverted and erased from an iterator past end().
+        if(nms_top_k > 0 && (size_t)nms_top_k < candidate.size())
+        {
+            candidate.erase(candidate.begin() + nms_top_k, candidate.end());
+        }
+
+        const int n = (int)candidate.size();
+        std::vector<float> areas(n);
+        for(int j = 0; j < n; j++)
+        {
+            areas[j] = candidate[j].rect.area();
+        }
+
+        // the best-scored candidate of a class always survives
+        objects.push_back(candidate[0]);
+
+        for(int j = 1; j < n; j++)
+        {
+            // Largest IoU against every higher-scored candidate k in [0, j).
+            // The previous range ended at j - 1 and so skipped candidate j - 1.
+            float max_iou = 0.f;
+            for(int k = 0; k < j; k++)
+            {
+                const float inter_area = intersection_area(candidate[j], candidate[k]);
+                const float union_area = areas[j] + areas[k] - inter_area;
+                if(union_area > 0.f)
+                    max_iou = std::max(max_iou, inter_area / union_area);
+            }
+
+            if(max_iou <= iou_thresh)
+            {
+                objects.push_back(candidate[j]);
+            }
+        }
+    }
+
+    std::sort(objects.begin(), objects.end(),
+              [](const Object& a, const Object& b) {return a.prob > b.prob;});
+    if(keep_top_k >= 0 && objects.size() > (size_t)keep_top_k)
+        objects.resize(keep_top_k);
+}
+
+/*
+ * Decode the four outputs of an already executed YOLACT graph.
+ *
+ * graph    graph on which run_graph() has succeeded
+ * bgr      source image; only its size is used, to scale boxes and masks
+ * objects  cleared, then filled with the detections that survive NMS,
+ *          including a full-resolution binary mask per detection
+ *
+ * Returns 0 on success, -1 when an output tensor or its buffer is missing.
+ */
+static int yolact_decode(graph_t graph, const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    const int num_class = 81;
+    const int num_priors = 19248;
+    const int num_coeffs = 32;     // mask coefficients per prior == prototype channels
+    const int proto_size = 138;    // prototype masks are proto_size x proto_size
+    const float confidence_thresh = 0.05f;
+    const float nms_thresh = 0.5f;
+    const int keep_top_k = 200;
+
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    /* get the result of classification */
+    tensor_t maskmaps_tensor   = get_graph_output_tensor(graph, 0, 0);
+    tensor_t location_tensor   = get_graph_output_tensor(graph, 1, 0);
+    tensor_t mask_tensor       = get_graph_output_tensor(graph, 2, 0);
+    tensor_t confidence_tensor = get_graph_output_tensor(graph, 3, 0);
+    if (maskmaps_tensor == NULL || location_tensor == NULL || mask_tensor == NULL || confidence_tensor == NULL)
+    {
+        fprintf(stderr, "Get output tensor failed\n");
+        return -1;
+    }
+
+    float* maskmaps     = ( float* )get_tensor_buffer(maskmaps_tensor);
+    float* location     = ( float* )get_tensor_buffer(location_tensor);
+    float* mask         = ( float* )get_tensor_buffer(mask_tensor);
+    float* confidence   = ( float* )get_tensor_buffer(confidence_tensor);
+    if (maskmaps == NULL || location == NULL || mask == NULL || confidence == NULL)
+    {
+        fprintf(stderr, "Get output tensor buffer failed\n");
+        return -1;
+    }
+
+    std::vector<Box2f> priorboxes = generate_priorbox(num_priors);
+
+    std::vector< std::vector<Object> > class_candidates;
+    class_candidates.resize(num_class);
+
+    const float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
+
+    for(int i = 0; i < num_priors; i++)
+    {
+        const float* conf = confidence + i * num_class;
+        const float* loc  = location + i * 4;
+        const float* maskdata = mask + i * num_coeffs;
+        const Box2f& priorbox = priorboxes[i];
+
+        // best class, class 0 (background) excluded
+        int label = 0;
+        float score = 0.f;
+        for(int j = 1; j < num_class; j++)
+        {
+            float class_score = conf[j];
+            if(class_score > score)
+            {
+                label = j;
+                score = class_score;
+            }
+        }
+
+        if(label == 0||score <= confidence_thresh)
+            continue;
+
+        // decode the box regression relative to the prior box
+        float bbox_cx = var[0] * loc[0] * priorbox.w + priorbox.cx;
+        float bbox_cy = var[1] * loc[1] * priorbox.h + priorbox.cy;
+        float bbox_w = (float)(exp(var[2] * loc[2]) * priorbox.w);
+        float bbox_h = (float)(exp(var[3] * loc[3]) * priorbox.h);
+
+        float obj_x1 = bbox_cx - bbox_w * 0.5f;
+        float obj_y1 = bbox_cy - bbox_h * 0.5f;
+        float obj_x2 = bbox_cx + bbox_w * 0.5f;
+        float obj_y2 = bbox_cy + bbox_h * 0.5f;
+
+        // scale to source-image pixels and clip to the image
+        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+
+        Object obj;
+        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2-obj_x1+1, obj_y2-obj_y1+1);
+        obj.label = label;
+        obj.prob = score;
+
+        obj.maskdata = std::vector<float>(maskdata, maskdata + num_coeffs);
+
+        class_candidates[label].push_back(obj);
+    }
+
+    objects.clear();
+    fast_nms(class_candidates, objects, nms_thresh, 0, keep_top_k);
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        Object& obj = objects[i];
+
+        // linear combination of the prototype masks with this object's coefficients
+        cv::Mat mask1(proto_size, proto_size, CV_32FC1);
+        mask1 = cv::Scalar(0.f);
+        float* mp = (float*)mask1.data;
+
+        for (int p = 0; p < num_coeffs; p++)
+        {
+            // channel p of the prototypes; channels are interleaved per pixel
+            const float* maskmap = maskmaps + p;
+            const float coeff = obj.maskdata[p];
+
+            // mask += m * coeff
+            for (int j = 0; j < proto_size * proto_size; j++)
+            {
+                mp[j] += maskmap[j * num_coeffs] * coeff;
+            }
+        }
+
+        cv::Mat mask2;
+        cv::resize(mask1, mask2, cv::Size(img_w, img_h));
+
+        // crop obj box and binarize
+        obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
+        obj.mask = cv::Scalar(0);
+
+        for (int y = 0; y < img_h; y++)
+        {
+            if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
+                continue;
+
+            const float* mp2 = mask2.ptr<const float>(y);
+            uchar* bmp = obj.mask.ptr<uchar>(y);
+
+            for (int x = 0; x < img_w; x++)
+            {
+                if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
+                    continue;
+
+                bmp[x] = mp2[x] > 0.5f ? 255 : 0;
+            }
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Shape, prerun, feed and execute `graph` on `bgr`, then decode the outputs.
+ *
+ * input_data   caller-owned buffer of 3 * target_size * target_size floats;
+ *              it is handed to the input tensor and must outlive this call
+ * target_size  side length of the square network input
+ *
+ * postrun_graph() is called on every path once prerun has succeeded.
+ * Returns 0 on success, -1 on any failure.
+ */
+static int yolact_run(graph_t graph, float* input_data, int target_size, const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
+    const float norm_vals[3] = {1.0/58.40f, 1.0/57.12f, 1.0/57.38f};
+
+    /* set the input shape to initial the graph, and prerun graph to infer shape */
+    const int img_size = target_size * target_size * 3;
+    int dims[]         = {1, 3, target_size, target_size};    // nchw
+
+    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
+    if (input_tensor == NULL)
+    {
+        fprintf(stderr, "Get input tensor failed\n");
+        return -1;
+    }
+
+    if (set_tensor_shape(input_tensor, dims, 4) < 0)
+    {
+        fprintf(stderr, "Set input tensor shape failed\n");
+        return -1;
+    }
+
+    if (prerun_graph_multithread(graph, TENGINE_CLUSTER_ALL, 1) < 0)
+    {
+        fprintf(stderr, "Prerun multithread graph failed.\n");
+        return -1;
+    }
+
+    int ret = -1;
+
+    /* prepare process input data, set the data mem to input tensor */
+    get_input_data_cv(bgr, input_data, target_size, target_size, mean_vals, norm_vals, 1);
+    if (set_tensor_buffer(input_tensor, input_data, ( int )(img_size * sizeof(float))) < 0)
+    {
+        fprintf(stderr, "Set input tensor buffer failed\n");
+    }
+    else
+    {
+        dump_graph(graph);
+
+        /* run graph */
+        if (run_graph(graph, 1) < 0)
+            fprintf(stderr, "Run graph failed\n");
+        else
+            ret = yolact_decode(graph, bgr, objects);
+    }
+
+    postrun_graph(graph);
+    return ret;
+}
+
+/*
+ * Run YOLACT instance segmentation on one image.
+ *
+ * bgr      source image as loaded by cv::imread
+ * objects  on success, replaced by the detections (box, label, score, mask);
+ *          failures before decoding leave it untouched
+ *
+ * Loads "./yolact_tm.tmfile" from the working directory. The input buffer, the
+ * graph and the tengine context are released on every path, including errors
+ * (previously each early return leaked all three).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    /* initialize tengine */
+    if (init_tengine() != 0)
+    {
+        fprintf(stderr, "Initial tengine failed.\n");
+        return -1;
+    }
+    fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());
+
+    const int target_size = 550;
+
+    /* create graph, load tengine model xxx.tmfile */
+    graph_t graph = create_graph(NULL, "tengine", "./yolact_tm.tmfile");
+    if (NULL == graph)
+    {
+        fprintf(stderr, "Create graph failed.\n");
+        fprintf(stderr, "errno: %d \n", get_tengine_errno());
+        release_tengine();
+        return -1;
+    }
+
+    int ret = -1;
+    const size_t img_size = ( size_t )target_size * target_size * 3;
+    float* input_data = ( float* )malloc(img_size * sizeof(float));
+    if (input_data == NULL)
+        fprintf(stderr, "Allocate input buffer failed\n");
+    else
+        ret = yolact_run(graph, input_data, target_size, bgr, objects);
+
+    /* release tengine; the input buffer is freed only after postrun_graph() */
+    free(input_data);
+    destroy_graph(graph);
+    release_tengine();
+
+    return ret;
+}
+
+/*
+ * Draw boxes, labels and masks of `objects` on a copy of `bgr` and write the
+ * result to "result.png" in the working directory.
+ *
+ * Detections with prob < 0.15 are skipped. The palette has 19 colours and is
+ * cycled, so any number of detections can be drawn (the previous code indexed
+ * past the end of the palette from the 20th detection on).
+ */
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "person", "bicycle", "car", "motorcycle", "airplane", "bus",
+                                        "train", "truck", "boat", "traffic light", "fire hydrant",
+                                        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
+                                        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
+                                        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+                                        "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                                        "baseball glove", "skateboard", "surfboard", "tennis racket",
+                                        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                                        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
+                                        "hot dog", "pizza", "donut", "cake", "chair", "couch",
+                                        "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
+                                        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
+                                        "toaster", "sink", "refrigerator", "book", "clock", "vase",
+                                        "scissors", "teddy bear", "hair drier", "toothbrush"};
+
+    static const unsigned char colors[19][3] = {
+        {244,  67,  54},
+        {233,  30,  99},
+        {156,  39, 176},
+        {103,  58, 183},
+        { 63,  81, 181},
+        { 33, 150, 243},
+        {  3, 169, 244},
+        {  0, 188, 212},
+        {  0, 150, 136},
+        { 76, 175,  80},
+        {139, 195,  74},
+        {205, 220,  57},
+        {255, 235,  59},
+        {255, 193,   7},
+        {255, 152,   0},
+        {255,  87,  34},
+        {121,  85,  72},
+        {158, 158, 158},
+        { 96, 125, 139}
+    };
+
+    const int num_names  = ( int )(sizeof(class_names) / sizeof(class_names[0]));
+    const int num_colors = ( int )(sizeof(colors) / sizeof(colors[0]));
+
+    cv::Mat image = bgr.clone();
+
+    int color_index = 0;
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        if (obj.prob < 0.15)
+            continue;
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        // wrap around instead of reading past the end of the palette
+        const unsigned char* color = colors[color_index % num_colors];
+        color_index++;
+
+        cv::rectangle(image, obj.rect, cv::Scalar(color[0], color[1], color[2]));
+
+        // guard the name lookup against labels outside the table
+        const char* name = (obj.label >= 0 && obj.label < num_names) ? class_names[obj.label] : "unknown";
+
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the label above the box, kept inside the image
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y),
+                                      cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+
+        // draw mask; skip it when the mask does not cover the image exactly,
+        // otherwise the row pointers below would be read out of bounds
+        if (obj.mask.empty() || obj.mask.rows != image.rows || obj.mask.cols != image.cols)
+            continue;
+
+        for (int row = 0; row < image.rows; row++)
+        {
+            const uchar* mp = obj.mask.ptr(row);
+            uchar* p = image.ptr(row);
+            for (int col = 0; col < image.cols; col++)
+            {
+                if (mp[col] == 255)
+                {
+                    // blend 50/50 with the object colour
+                    p[0] = cv::saturate_cast<uchar>(p[0] * 0.5 + color[0] * 0.5);
+                    p[1] = cv::saturate_cast<uchar>(p[1] * 0.5 + color[1] * 0.5);
+                    p[2] = cv::saturate_cast<uchar>(p[2] * 0.5 + color[2] * 0.5);
+                }
+                p += 3;
+            }
+        }
+    }
+
+    if (!cv::imwrite("result.png", image))
+        fprintf(stderr, "cv::imwrite result.png failed\n");
+}
+
+/*
+ * Usage: <program> [imagepath]
+ * Runs YOLACT on the image and writes the annotated picture via draw_objects().
+ * Returns 0 on success, -1 on bad arguments, unreadable image or failed detection.
+ */
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not report success (or write an un-annotated result) when inference failed
+    if (detect_yolact(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolact failed\n");
+        return -1;
+    }
+    draw_objects(m, objects);
+
+    return 0;
+}
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -42,7 +42,7 @@ endif()

 # X86
 if (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
-    if (TENGINE_ARCH_X86_AVX)
+    if (TENGINE_AVX2)
        add_definitions(-mfma -mf16c)
    endif()
    file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS   "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/*hcl.c"

--- a/src/dev/cpu/op/conv/conv_hcl_kernel.h
+++ b/src/dev/cpu/op/conv/conv_hcl_kernel.h
@@ -8,10 +8,8 @@ struct conv_priv_info
 {
    void* interleave_buffer;    // kernel transform buffer
    void* interleave_buffer_pack4;    // kernel pack4
-    void* p_input_max;
    void* im2col_buffer;    // input data transform buffer
    void* im2col_buffer_pack4;    // input data transform buffer pack4
-    void* p_kernel_max;
    void* input_pad;
    void* dot_block;
    void* transform_input;

--- a/src/dev/cpu/op/conv/conv_ref_kernel.h
+++ b/src/dev/cpu/op/conv/conv_ref_kernel.h
@@ -8,8 +8,6 @@ struct conv_priv_info
 {
    void* interleave_buffer;
    void* im2col_buffer;
-    void* p_input_max;
-    void* p_kernel_max;
    int im2col_buffer_size;
    int interleave_buffer_size;
    int external_im2col_mem;

--- a/src/dev/cpu/op/conv/cortex_a/wino_conv_kernel_arm.c
+++ b/src/dev/cpu/op/conv/cortex_a/wino_conv_kernel_arm.c
@@ -985,9 +985,9 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int

 // trans_input  [block_hw/4][ELEM_SIZE][inc][4]
 // kernel       [out_c/PER_OUT_CHAN][ELEM_SIZE][in_c][PER_OUT_CHAN]
-static void wino_sgemm_set(const float* ker, const float* inp, float* output, const float* bias, int cin, int cout_end,
-                           int block_h, int block_w, int out_h, int out_w, int resi_h, int resi_w, int activation,
-                           int num_thread, int cpu_affinity)
+static void wino_sgemm_set(const float* ker, const float* inp, float* output, const float* bias, int cin,
+                           int cout_end, int block_h, int block_w, int out_h, int out_w, int resi_h,
+                           int resi_w, int activation, int num_thread, int cpu_affinity)
 {
    int flag_outw = 1;
    if (out_w < 16)

--- a/src/dev/cpu/op/conv/ref/conv_kernel_ref.c
+++ b/src/dev/cpu/op/conv/ref/conv_kernel_ref.c
@@ -29,7 +29,10 @@

 static int get_private_mem_size(struct ir_tensor* filter)
 {
-    return filter->elem_num * filter->elem_size;    // caution
+    /* Byte size derived from the filter's element count and element size.
+     * UINT8 filters get four times the raw size; interleave_uint8() writes one
+     * float per filter element, so this presumably sizes that buffer and relies
+     * on elem_size being 1 for UINT8 (TODO confirm against the caller). */
+    if (filter->data_type == TENGINE_DT_UINT8)  // simulator: uint8 inference is emulated with fp32
+        return filter->elem_num * filter->elem_size * 4;
+    else
+        return filter->elem_num * filter->elem_size;    // caution
 }

 static void interleave(struct ir_tensor* filter, struct conv_priv_info* priv_info)
@@ -38,7 +41,21 @@ static void interleave(struct ir_tensor* filter, struct conv_priv_info* priv_inf
    memcpy(priv_info->interleave_buffer, filter->data, filter->elem_num * filter->elem_size);
 }

-static inline void copy_one_element(void* src, void* dst, int src_off, int dst_off, int elem_type, int zero_point)
+/* Dequantize the uint8 filter into the fp32 interleave buffer (simulator path):
+ * every element becomes (value - zero_point) * scale. */
+static void interleave_uint8(struct ir_tensor* filter, struct conv_priv_info* priv_info)
+{
+    uint8_t* quantized = (uint8_t*)filter->data;
+    float* dequantized = (float* )priv_info->interleave_buffer;
+
+    int zp = filter->zero_point;
+    float step = filter->scale;
+
+    for (int idx = 0; idx < filter->elem_num; ++idx)
+        dequantized[idx] = ((float)quantized[idx] - (float)zp) * step;
+}
+
+static inline void copy_one_element(void* src, void* dst, int src_off, int dst_off, int elem_type, int zero_point, float scale)
 {
    switch (elem_type)
    {
@@ -65,11 +82,11 @@ static inline void copy_one_element(void* src, void* dst, int src_off, int dst_o
            int_dst[dst_off] = int_src[src_off] - zero_point;
        }
        break;
-        case TENGINE_DT_UINT8:
+        case TENGINE_DT_UINT8:      // simulator uint8 inference with fp32
        {
-            int8_t* int_dst = dst;
+            float* int_dst = dst;
            uint8_t* int_src = src;
-            int_dst[dst_off] = (int8_t)((int)int_src[src_off] - (int)zero_point);
+            int_dst[dst_off] = ((float)int_src[src_off] - (float)zero_point) * scale;
        }
        break;
    }
@@ -96,14 +113,70 @@ static inline void zero_one_element(void* dst, int dst_off, int elem_type)
        case TENGINE_DT_INT8:
        case TENGINE_DT_UINT8:
        {
-            int8_t* int_dst = dst;
+            float* int_dst = dst;   // simulator uint8 inference with fp32
            int_dst[dst_off] = 0x0;
        }
        break;
    }
 }

-static void im2col(struct ir_tensor* input, struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, int group)
+/*
+ * Expand the input of batch item `n`, channel group `group`, into
+ * priv_info->im2col_buffer.
+ *
+ * For every output position (l, m) the buffer holds kernel_size consecutive
+ * elements (kernel_size = kernel_h * kernel_w * channels-per-group); element i
+ * of that run is the input sample under kernel tap i, or zero when the tap
+ * falls outside the input (padding).
+ */
+static void im2col_fp32(struct ir_tensor* input, struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, int group)
+{
+    int input_chan = param->input_channel / param->group;    // channels per group
+    int image_size = input->dims[1] * input->dims[2] * input->dims[3];    // elements per batch item
+    int group_size = input_chan * input->dims[2] * input->dims[3];    // elements per group
+
+    // start of the selected batch item / group, offset in bytes via elem_size
+    // NOTE(review): if input->data is a void*, this pointer arithmetic relies on a compiler extension
+    void* input_base  = input->data + (n * image_size + group * group_size) * input->elem_size;
+    void* im2col_buf = priv_info->im2col_buffer;
+
+    // quantization parameters, forwarded to copy_one_element()
+    float scale = input->scale;
+    int zero_point = input->zero_point;
+
+    int k_h = param->kernel_h;
+    int k_w = param->kernel_w;
+    int in_c = input_chan;
+    int in_h = input->dims[2];
+    int in_w = input->dims[3];
+    int out_h = output->dims[2];
+    int out_w = output->dims[3];
+    int s_h = param->stride_h;
+    int s_w = param->stride_w;
+    int p_h0 = param->pad_h0;
+    int p_w0 = param->pad_w0;
+    int d_h  = param->dilation_h;
+    int d_w  = param->dilation_w;
+    int data_type = input->data_type;
+    int kernel_size = k_h * k_w * in_c;
+
+    // i enumerates kernel taps in (channel, kernel row, kernel column) order
+    for (int i = 0; i < kernel_size; i++)
+    {
+        int c_off = i / (k_h * k_w);
+        int c_left = i % (k_h * k_w);
+
+        int kh_off = c_left / k_w;
+        int kw_off = c_left % k_w;
+
+        for (int l = 0; l < out_h; l++)
+        {
+            for (int m = 0; m < out_w; m++)
+            {
+                int out_off = (l * out_w + m) * kernel_size + i;
+                // input coordinate hit by this tap (may lie in the padding)
+                int img_h = l * s_h - p_h0 + kh_off * d_h;
+                int img_w = m * s_w - p_w0 + kw_off * d_w;
+
+                if (img_h >= 0 && img_w >= 0 && img_h < in_h && img_w < in_w)
+                {
+                    int in_off = c_off * in_h * in_w + img_h * in_w + img_w;
+                    copy_one_element(input_base, im2col_buf, in_off, out_off, data_type, zero_point, scale);
+                }
+                else
+                    // tap falls into the padding area
+                    zero_one_element(im2col_buf, out_off, data_type);
+            }
+        }
+    }
+}
+
+static void im2col_uint8(struct ir_tensor* input, struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, int group)
 {
    int input_chan = param->input_channel / param->group;
    int image_size = input->dims[1] * input->dims[2] * input->dims[3];
@@ -112,6 +185,7 @@ static void im2col(struct ir_tensor* input, struct ir_tensor* output, struct con
    void* input_base  = input->data + (n * image_size + group * group_size) * input->elem_size;
    void* im2col_buf = priv_info->im2col_buffer;

+    float scale = input->scale;
    int zero_point = input->zero_point;

    int k_h = param->kernel_h;
@@ -149,7 +223,7 @@ static void im2col(struct ir_tensor* input, struct ir_tensor* output, struct con
                if (img_h >= 0 && img_w >= 0 && img_h < in_h && img_w < in_w)
                {
                    int in_off = c_off * in_h * in_w + img_h * in_w + img_w;
-                    copy_one_element(input_base, im2col_buf, in_off, out_off, data_type, zero_point);
+                    copy_one_element(input_base, im2col_buf, in_off, out_off, data_type, zero_point, scale);
                }
                else
                    zero_one_element(im2col_buf, out_off, data_type);
@@ -244,91 +318,115 @@ static void sgemm_fp32(struct ir_tensor* input, struct ir_tensor* filter, struct
    }
 }

-static void sgemm_uint8(struct ir_tensor* input_tensor, struct ir_tensor* filter_tensor, struct ir_tensor* bias_tensor,
-                        struct ir_tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
-                        int n, int group, int num_thread)
+static void sgemm_uint8(struct ir_tensor* input, struct ir_tensor* filter, struct ir_tensor* bias,
+                       struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n,
+                       int group, int num_thread)
 {
    int kernel_size = param->kernel_h * param->kernel_w * param->input_channel / param->group;
    int outchan_g = param->output_channel / param->group;

-    int out_h = output_tensor->dims[2];
-    int out_w = output_tensor->dims[3];
-    int out_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3];
-
-    /* data point */
-    unsigned char* interleave_uint8 = ( unsigned char* )priv_info->interleave_buffer + outchan_g * group * kernel_size;
-    signed char* im2col_int8 = priv_info->im2col_buffer;
-    unsigned char* output_uint8 = ( unsigned char* )output_tensor->data + n * out_image_size + outchan_g * group * out_h * out_w;
-    int* bias_int32 = NULL;
-    if (bias_tensor)
-        bias_int32 = ( int* )bias_tensor->data + outchan_g * group;
-
-    /* quantizaion scale and zero-point */
-    float input_scale = input_tensor->scale;
-    float weight_scale = filter_tensor->scale;
-    float output_scale = output_tensor->scale;
-//    float bias_scale = 0.f;
-//    if (bias_tensor)
-//        bias_scale = bias_tensor->scale;
-
-    unsigned char input_zero = input_tensor->zero_point;
-    unsigned char weight_zero = filter_tensor->zero_point;
-    unsigned char output_zero = output_tensor->zero_point;
-
-    /* int8 sgemm */
+    int out_c = output->dims[1];
+    int out_h = output->dims[2];
+    int out_w = output->dims[3];
+    int out_image_size = out_c * out_h * out_w;
+    int out_group_size = outchan_g * out_h * out_w;
+
+    float* interleave_fp32 = ( float* )priv_info->interleave_buffer + outchan_g * group * kernel_size;
+    float* im2col_fp32 = priv_info->im2col_buffer;
+    float* output_fp32 = (float*)sys_malloc(out_group_size * sizeof(float));
+    uint8_t* output_uint8 = ( uint8_t* )output->data + n * out_image_size + outchan_g * group * out_h * out_w;
+    int32_t* bias_int32 = NULL;
+    float bias_scale = 0.f;
+
+    if (bias)
+    {
+        bias_int32 = ( int32_t* )bias->data + outchan_g * group;
+        bias_scale = input->scale * filter->scale;
+    }
+
    #pragma omp parallel for num_threads(num_thread)
-    for (int i = 0; i < outchan_g; i++)
+    for(int i = 0; i < outchan_g; i++)
    {
-        unsigned char* kernel = interleave_uint8 + i * kernel_size;
-        signed char* input = im2col_int8;
-        unsigned char* output = output_uint8 + i * (out_h * out_w);
+        float* kernel = interleave_fp32 + i * kernel_size;
+        float* input = im2col_fp32;
+        float* output = output_fp32 + i * (out_h * out_w);

        for (int j = 0; j < out_h * out_w; j++)
        {
            int im2col_off = j * kernel_size;
-            int sum_int32 = bias_tensor ? bias_int32[i] : 0;

+            float sum = 0.f;
            for (int k = 0; k < kernel_size; k++)
            {
-                int input_data = input[im2col_off + k];
-                int input_data_u32 = ( unsigned char )input[im2col_off + k];
-                int kernel_data = kernel[k] - weight_zero;
-
-                if (input_zero == 0)
-                    sum_int32 += input_data_u32 * kernel_data;
-                else
-                    sum_int32 += input_data * kernel_data;
+                sum += kernel[k] * input[im2col_off + k];
            }
+            output[0] = sum;
+            output++;
+        }
+    }

-            // dequant sum from int32 to fp32
-            float sum_fp32 = (float)sum_int32 * input_scale * weight_scale;
-
-            // relu
-            if (param->activation > 0)
+    // add bias: each int32 bias value is dequantized with bias_scale (input scale * filter scale)
+    if (bias)
+    {
+        for (int i = 0; i < outchan_g; i++)
+        {
+            for (int j = 0; j < out_h * out_w; j++)
            {
-                if (sum_fp32 < 0)
-                    sum_fp32 = 0;
+                int output_off = i * (out_h * out_w) + j;
+                output_fp32[output_off] += bias_int32[i] * bias_scale;
            }
+        }
+    }

-            // relu6
-            if (param->activation > 0)
+    // ReLU (applied when param->activation == 0): clamp negative values to zero
+    if (param->activation == 0)
+    {
+        for (int i = 0; i < outchan_g; i++)
+        {
+            for (int j = 0; j < out_h * out_w; j++)
            {
-                if (sum_fp32 < 0)
-                    sum_fp32 = 0;
-                if (sum_fp32 > 6)
-                    sum_fp32 = 6;
+                int output_off = i * (out_h * out_w) + j;
+
+                if (output_fp32[output_off] < 0)
+                    output_fp32[output_off] = 0;
            }
+        }
+    }

-            // quant output from fp32 to uint8
-            sum_int32 = round(sum_fp32 / output_scale) + output_zero;
-            if (sum_int32 > 255)
-                sum_int32 = 255;
-            if (sum_int32 < 0)
-                sum_int32 = 0;
-            output[0] = sum_int32;
-            output++;
+    // ReLU6 (applied when param->activation > 0): clamp values to the range [0, 6]
+    if (param->activation > 0)
+    {
+        for (int i = 0; i < outchan_g; i++)
+        {
+            for (int j = 0; j < out_h * out_w; j++)
+            {
+                int output_off = i * (out_h * out_w) + j;
+
+                if (output_fp32[output_off] < 0)
+                    output_fp32[output_off] = 0;
+                if (output_fp32[output_off] > 6)
+                    output_fp32[output_off] = 6;
+            }
        }
    }
+
+    /* quantize from fp32 to uint8, saturating the result to [0, 255] */
+    for (int i = 0; i < outchan_g; i++)
+    {
+        for (int j = 0; j < out_h * out_w; j++)
+        {
+            int output_off = i * (out_h * out_w) + j;
+        
+            int udata = (int)(round(output_fp32[output_off] / output->scale) + output->zero_point);
+            if (udata > 255)
+                udata = 255;
+            else if (udata < 0)
+                udata = 0;
+            output_uint8[output_off] = udata;
+        }
+    }    
+
+    sys_free(output_fp32);
 }

 int conv_kernel_get_shared_mem_size(struct ir_tensor* input, struct ir_tensor* output, struct conv_param* param)
@@ -339,6 +437,10 @@ int conv_kernel_get_shared_mem_size(struct ir_tensor* input, struct ir_tensor* o
    int output_xy = output->dims[2] * output->dims[3];
    int elem_size = input->elem_size;

+    // uint8 inference is simulated in fp32, so each element takes 4 bytes
+    if (input->data_type == TENGINE_DT_UINT8)
+        elem_size = 4;
+
    return elem_size * output_xy * kernel_size;
 }

@@ -361,7 +463,10 @@ int conv_kernel_prerun(struct ir_tensor* input_tensor, struct ir_tensor* filter_
        priv_info->interleave_buffer_size = mem_size;
    }

-    interleave(filter_tensor, priv_info);
+    if (input_tensor->data_type == TENGINE_DT_UINT8)
+        interleave_uint8(filter_tensor, priv_info);
+    else
+        interleave(filter_tensor, priv_info);

    return 0;
 }
@@ -394,11 +499,17 @@ int conv_kernel_run(struct ir_tensor* input_tensor, struct ir_tensor* filter_ten
    {
        for (int j = 0; j < group; j++)
        {
-            im2col(input_tensor, output_tensor, priv_info, param, i, j);
+            
            if (type == TENGINE_DT_FP32)
+            {
+                im2col_fp32(input_tensor, output_tensor, priv_info, param, i, j);
                sgemm_fp32(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, i, j, num_thread);
-            else
+            }
+            else if (type == TENGINE_DT_UINT8)
+            {
+                im2col_uint8(input_tensor, output_tensor, priv_info, param, i, j);
                sgemm_uint8(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, i, j, num_thread);
+            }
        }
    }


--- a/src/dev/cpu/op/conv/x86/conv_kernel_x86.c
+++ b/src/dev/cpu/op/conv/x86/conv_kernel_x86.c
@@ -1255,7 +1255,7 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w)
        return 0;

    if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 ||
-        dilation_w != 1 || input_chan < 16 || output_chan < 16)
+        dilation_w != 1 || input_chan < 16 || output_chan < 16 || output_chan % 16)
        return 0;

    return 1;

--- a/src/dev/cpu/op/detection_output/detection_output_ref.c
+++ b/src/dev/cpu/op/detection_output/detection_output_ref.c
@@ -159,9 +159,9 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,

 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    struct ir_node* ir_node = exec_node->ir_node;
+    struct ir_node* ir_node   = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
-    struct ir_tensor* loc_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
+    struct ir_tensor* loc_tensor  = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    struct ir_tensor* conf_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    struct ir_tensor* priorbox_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
    struct ir_tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
@@ -218,12 +218,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
    }

    const int num_priorx4 = priorbox_tensor->dims[2];
-    const int num_prior = num_priorx4 / 4;
+    const int num_prior   = num_priorx4 / 4;
    const int num_classes = param->num_classes;

    int b = 0;
-    float* loc_ptr = location + b * num_priorx4;
-    float* conf_ptr = confidence + b * num_prior * num_classes;
+    float* loc_ptr   = location + b * num_priorx4;
+    float* conf_ptr  = confidence + b * num_prior * num_classes;
    float* prior_ptr = priorbox + b * num_priorx4 * 2;

    Box_t boxes[num_prior];

--- a/src/dev/cpu/op/eltwise/eltwise_ref.c
+++ b/src/dev/cpu/op/eltwise/eltwise_ref.c
@@ -403,36 +403,71 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
        input1_count4 = input_tensor1->elem_num;
    }

-    int input_chan_0 = 0;
-    int input_hw_0 = 0;
-    int input0_count4 = input_tensor0->elem_num;
-
-    if (layout == TENGINE_LAYOUT_NCHW)
-    {
-        input_chan_0 = input_tensor0->dims[1];
-        input_hw_0 = input_tensor0->dims[2] * input_tensor0->dims[3];
-    }
-    else if (layout == TENGINE_LAYOUT_NHWC)
+    if (input_tensor0->elem_num >= input_tensor1->elem_num)
    {
-        input_chan_0 = input_tensor0->dims[3];
-        input_hw_0 = input_tensor0->dims[1] * input_tensor0->dims[2];
+        int input_chan_0 = 0;
+        int input_hw_0 = 0;
+        int input0_count4 = input_tensor0->elem_num;
+
+        if (layout == TENGINE_LAYOUT_NCHW)
+        {
+            input_chan_0 = input_tensor0->dims[1];
+            input_hw_0 = input_tensor0->dims[2] * input_tensor0->dims[3];
+        }
+        else if (layout == TENGINE_LAYOUT_NHWC)
+        {
+            input_chan_0 = input_tensor0->dims[3];
+            input_hw_0 = input_tensor0->dims[1] * input_tensor0->dims[2];
+        }
+        else
+        {
+            TLOG_ERR("unknown graph layout: %d\n", ir_graph->graph_layout);
+            set_tengine_errno(EFAULT);
+            return -1;
+        }
+
+        int ret = -1;
+        if (input_tensor0->data_type == TENGINE_DT_FP32)
+            ret = ref_eltwise_fp32(output, input0, input1, eltwise_param->type, input0_count4, input_chan_0, input_hw_0,
+                                   input1_count4, exec_graph->num_thread);
+        else
+            ret = ref_eltwise_uint8(output_tensor, input_tensor0, input_tensor1, eltwise_param->type, input0_count4,
+                                    input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread);
+        return ret;
    }
    else
    {
-        TLOG_ERR("unknown graph layout: %d\n", ir_graph->graph_layout);
-        set_tengine_errno(EFAULT);
-        return -1;
+        int input_chan_0 = 0;
+        int input_hw_0 = 0;
+        int input0_count4 = input_tensor1->elem_num;
+        input1_count4 = input_tensor0->elem_num;
+
+        if (layout == TENGINE_LAYOUT_NCHW)
+        {
+            input_chan_0 = input_tensor1->dims[1];
+            input_hw_0 = input_tensor1->dims[2] * input_tensor1->dims[3];
+        }
+        else if (layout == TENGINE_LAYOUT_NHWC)
+        {
+            input_chan_0 = input_tensor1->dims[3];
+            input_hw_0 = input_tensor1->dims[1] * input_tensor1->dims[2];
+        }
+        else
+        {
+            TLOG_ERR("unknown graph layout: %d\n", ir_graph->graph_layout);
+            set_tengine_errno(EFAULT);
+            return -1;
+        }
+
+        int ret = -1;
+        if (input_tensor1->data_type == TENGINE_DT_FP32)
+            ret = ref_eltwise_fp32(output, input1, input0, eltwise_param->type, input0_count4, input_chan_0, input_hw_0,
+                                   input1_count4, exec_graph->num_thread);
+        else
+            ret = ref_eltwise_uint8(output_tensor, input_tensor1, input_tensor0, eltwise_param->type, input0_count4,
+                                    input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread);
+        return ret;
    }
-
-    int ret = -1;
-    if (input_tensor0->data_type == TENGINE_DT_FP32)
-        ret = ref_eltwise_fp32(output, input0, input1, eltwise_param->type, input0_count4, input_chan_0, input_hw_0,
-                               input1_count4, exec_graph->num_thread);
-    else
-        ret = ref_eltwise_uint8(output_tensor, input_tensor0, input_tensor1, eltwise_param->type, input0_count4,
-                                input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread);
-
-    return ret;
 }

 static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct ir_node* exec_node)

--- a/src/dev/cpu/op/interp/interp_ref.c
+++ b/src/dev/cpu/op/interp/interp_ref.c
@@ -19,7 +19,7 @@

 /*
 * Copyright (c) 2020, OPEN AI LAB
- * Author: zpluo@openailab.com
+ * Author: qtang@openailab.com
 */

 #include <math.h>
@@ -32,177 +32,87 @@
 #include "tengine_op.h"
 #include "interp_param.h"

-static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
-{
-    return 0;
-}
-
-static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
-{
-    return 0;
-}
+#define INTERP_MIN(a, b) ((a) < (b) ? (a) : (b))

-static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
-{
-    return 0;
-}
-
-typedef struct __interp_param
+int ref_interp_fp32(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor, struct interp_param* param)
 {
-    float width_scale;
-    float height_scale;
-    int batch_number;
-    int inc;
-    int inh;
-    int inw;
-    int output_width;
-    int output_height;
-    int in_channel_size;
-    int out_channel_size;
-    int* buf;
-
-} _interp_param, *p_interp_param;
-
-void linear_coeffs(int w, int outw, int* xofs, float* alpha)
-{
-    double scale = ( double )w / outw;
-
-    for (int dx = 0; dx < outw; dx++)
+    if (input_tensor->dim_num != 4)
    {
-        float fx = ( float )((dx + 0.5) * scale - 0.5);
-        int sx = floor(fx);
-        fx -= sx;
-
-        if (sx < 0)
-        {
-            sx = 0;
-            fx = 0.f;
-        }
-        if (sx >= w - 1)
-        {
-            sx = w - 2;
-            fx = 1.f;
-        }
-
-        xofs[dx] = sx;
-
-        alpha[dx * 2] = 1.f - fx;
-        alpha[dx * 2 + 1] = fx;
+        printf("interp dim num is not 4\n");
+        return -1;
    }
-}
-
-void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h,
-                           int out_w, int in_h, int in_w)
-{
-    int w = out_w;    // dst.w;
-    int h = out_h;    // dst.h;

-    // loop body
-    float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float));
-    float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float));
-    float* rows0 = rowsbuf0;
-    float* rows1 = rowsbuf1;
+    float* input = input_tensor->data;
+    float* output = output_tensor->data;

-    int prev_sy1 = -2;
+    int batch = input_tensor->dims[0];
+    int channel = input_tensor->dims[1];
+    int in_h = input_tensor->dims[2];
+    int in_w = input_tensor->dims[3];
+    int out_h = output_tensor->dims[2];
+    int out_w = output_tensor->dims[3];    

-    for (int dy = 0; dy < h; dy++)
+    for (int n = 0; n < batch; ++n) 
    {
-        int sy = yofs[dy];
-
-        if (sy == prev_sy1)
-        {
-            // reuse all rows
-        }
-        else if (sy == prev_sy1 + 1)
-        {
-            // hresize one row
-            float* rows0_old = rows0;
-            rows0 = rows1;
-            rows1 = rows0_old;
-            const float* S1 = src + (sy + 1) * in_w;    // src.row(sy+1);
-
-            const float* alphap = alpha;
-            float* rows1p = rows1;
-
-            for (int dx = 0; dx < w; dx++)
-            {
-                int sx = xofs[dx];
-                const float* S1p = S1 + sx;
-
-                float a0 = alphap[0];
-                float a1 = alphap[1];
-                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
-
-                alphap += 2;
-            }
-        }
-        else
+        for (int c = 0; c < channel; ++c) 
        {
-            // hresize two rows
-            const float* S0 = src + sy * in_w;    // src.row(sy);
-            const float* S1 = src + (sy + 1) * in_w;    // src.row(sy+1);
-
-            const float* alphap = alpha;
-            float* rows0p = rows0;
-            float* rows1p = rows1;
-
-            for (int dx = 0; dx < w; dx++)
+            for (int y = 0; y < param->output_height; ++y) 
            {
-                int sx = xofs[dx];
-                const float* S0p = S0 + sx;
-                const float* S1p = S1 + sx;
-
-                float a0 = alphap[0];
-                float a1 = alphap[1];
-                rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
-                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
-
-                alphap += 2;
+                float in_y = INTERP_MIN(y / param->height_scale, (float)(in_h - 1));
+                const int in_y1 = INTERP_MIN((int)(in_y), in_h - 1);
+                const int in_y2 = INTERP_MIN(in_y1 + 1, in_h - 1);
+                float dy1 = fabs(in_y - in_y1);
+                float dy2 = fabs(in_y - in_y2);
+                if (in_y1 == in_y2) 
+                {
+                    dy1 = 0.5f;
+                    dy2 = 0.5f;
+                }
+
+                const int input_width_mul_y1 = in_w * in_y1;
+                const int input_width_mul_y2 = in_w * in_y2;
+
+                for (int x = 0; x < param->output_width; ++x) 
+                {
+                    float in_x = INTERP_MIN(x / param->width_scale, (float)(in_w - 1));
+                    const int in_x1 = INTERP_MIN((int)(in_x), in_w - 1);
+                    const int in_x2 = INTERP_MIN(in_x1 + 1, in_w - 1);
+
+                    float dx1 = fabs(in_x - in_x1);
+                    float dx2 = fabs(in_x - in_x2);
+                    if (in_x1 == in_x2) 
+                    {
+                        dx1 = 0.5f;
+                        dx2 = 0.5f;
+                    }
+
+                    float X11 = input[input_width_mul_y1 + in_x1];
+                    float X21 = input[input_width_mul_y1 + in_x2];
+                    float X12 = input[input_width_mul_y2 + in_x1];
+                    float X22 = input[input_width_mul_y2 + in_x2];
+                    output[param->output_width * y + x] = dx2 * dy2 * X11 +dx1 * dy2 * X21 +dx2 * dy1 * X12 +dx1 * dy1 * X22;
+                }
            }
+            input += in_h * in_w;
+            output += param->output_width * param->output_height;
        }
-
-        prev_sy1 = sy;
-
-        // vresize
-        float b0 = beta[0];
-        float b1 = beta[1];
-
-        float* rows0p = rows0;
-        float* rows1p = rows1;
-        float* Dp = dst + dy * out_w;    // dst.row(dy);
-
-        for (int dx = 0; dx < w; dx++)
-        {
-            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
-        }
-
-        beta += 2;
    }

-    sys_free(rowsbuf0);
-    sys_free(rowsbuf1);
-    rows0 = NULL;
-    rows1 = NULL;
+    return 0;
 }

-int ref_interp_fp32(float* input, float* output, p_interp_param param)
+static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    int* xofs = param->buf;    // new int[ow];
-    int* yofs = param->buf + param->output_width;    // new int[oh];
-
-    float* alpha = ( float* )(param->buf + param->output_width + param->output_height);    // new float[ow * 2];
-    float* beta = ( float* )(param->buf + param->output_width + param->output_height +
-                             param->output_width * 2);    // new float[oh * 2];
-
-    linear_coeffs(param->inw, param->output_width, xofs, alpha);
-    linear_coeffs(param->inh, param->output_height, yofs, beta);
+    return 0;
+}

-    for (int q = 0; q < param->inc; ++q)
-    {
-        resize_bilinear_image(input + param->in_channel_size * q, output + param->out_channel_size * q, alpha, xofs,
-                              beta, yofs, param->output_height, param->output_width, param->inh, param->inw);
-    }
+static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
+{
+    return 0;
+}

+static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
+{
    return 0;
 }

@@ -210,43 +120,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
 {
    struct ir_node* node = exec_node->ir_node;
    struct ir_graph* graph = node->graph;
-
    struct ir_tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
    struct ir_tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
-
-    void* input = input_tensor->data;
-    void* output = output_tensor->data;
-
    struct interp_param* param = ( struct interp_param* )node->op.param_mem;

-    float width_scale = param->width_scale;
-    float height_scale = param->height_scale;
-
-    int batch_number = input_tensor->dims[0];
-    int inc = input_tensor->dims[1];
-    int inh = input_tensor->dims[2];
-    int inw = input_tensor->dims[3];
-
-    int output_width = inh * width_scale;
-    int output_height = inw * height_scale;
-
-    _interp_param op_param;
-
-    op_param.inc = inc;
-    op_param.inh = inh;
-    op_param.inw = inw;
-    op_param.batch_number = batch_number;
-    op_param.output_width = output_width;
-    op_param.output_height = output_height;
-    op_param.width_scale = width_scale;
-    op_param.height_scale = height_scale;
-    op_param.out_channel_size = output_height * output_width;
-    op_param.in_channel_size = inh * inw;
-
-    op_param.buf = ( int* )malloc(sizeof(int) * (param->output_width + param->output_height + param->output_width * 2 +
-                                                 param->output_height * 2));
-    int ret = ref_interp_fp32(input, output, &op_param);
-    free(op_param.buf);
+    int ret = ref_interp_fp32(input_tensor, output_tensor, param);

    return ret;
 }

--- a/src/dev/cpu/op/softmax/softmax_ref.c
+++ b/src/dev/cpu/op/softmax/softmax_ref.c
@@ -180,8 +180,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex

        /* quant to uint8 */
        for (int i = 0; i < out_size; i++)
+        {
            for (int j = 0; j < on_in_size; j++)
-                output[i * on_in_size + j] = round((output_f[i * on_in_size + j] / output_scale) + output_zero);
+            {
+                int udata = (int)(round(output_f[i * on_in_size + j] / output_scale) + output_zero);
+                if (udata > 255)
+                    udata = 255;
+                else if (udata < 0)
+                    udata = 0;
+                output[i * on_in_size + j] = udata;
+            }
+        }

        free(input_f);
        free(output_f);

--- a/src/dev/cpu/op/split/split_ref.c
+++ b/src/dev/cpu/op/split/split_ref.c
@@ -78,18 +78,26 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
        struct ir_tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]);
        float* input_data = input_tensor->data;
        float* output_data = output_tensor->data;
-        int out_slice = 0;

-        out_slice = output_tensor->dims[slice_axis];
-
-        for (int n = 0; n < num_slices; n++)
+        if (split_param->is_caffe)
        {
-            int in_offset = (n * in_slice + slice_index) * slice_size;
-            int out_offset = n * out_slice * slice_size;
-            memcpy(output_data + out_offset, input_data + in_offset, slice_size * out_slice * sizeof(float));
+            memcpy(output_data, input_data, input_tensor->elem_num * sizeof(float));
        }
+        else
+        {
+            int out_slice = 0;
+
+            out_slice = output_tensor->dims[slice_axis];

-        slice_index += out_slice;
+            for (int n = 0; n < num_slices; n++)
+            {
+                int in_offset = (n * in_slice + slice_index) * slice_size;
+                int out_offset = n * out_slice * slice_size;
+                memcpy(output_data + out_offset, input_data + in_offset, slice_size * out_slice * sizeof(float));
+            }
+
+            slice_index += out_slice;
+        }
    }

    return 0;

--- a/src/dev/cpu/op/tanh/tanh_ref.c
+++ b/src/dev/cpu/op/tanh/tanh_ref.c
@@ -33,25 +33,12 @@

 int ref_tanh_fp32(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor, int num_thread)
 {
-    int w = input_tensor->dims[3];
-    int h = output_tensor->dims[2];
-    int channels = input_tensor->dims[1];
-    int size = h * w;
-    int c_step = h * w;
-
    float* input_data = input_tensor->data;
    float* out_data = output_tensor->data;

-#pragma omp parallel for num_threads(num_thread)
-    for (int q = 0; q < channels; q++)
+    for (int i = 0; i < input_tensor->elem_num; i++)
    {
-        float* src = input_data + c_step * q;
-        float* dst = out_data + c_step * q;
-
-        for (int i = 0; i < size; i++)
-        {
-            dst[i] = tanhf(src[i]);
-        }
+        out_data[i] = tanhf(input_data[i]);
    }

    return 0;
@@ -59,15 +46,11 @@ int ref_tanh_fp32(struct ir_tensor* input_tensor, struct ir_tensor* output_tenso

 static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    exec_node->inplace_map[0] = 0;
-    exec_node->inplace_map[1] = 0;
-    exec_node->inplace_map_num = 1;
    return 0;
 }

 static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    exec_node->inplace_map_num = 0;
    return 0;
 }

@@ -81,13 +64,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

-    if (input_tensor->data != output_tensor->data)
-    {
-        TLOG_ERR("input and output are not the same mem\n");
-        set_tengine_errno(EFAULT);
-        return -1;
-    }
-
    ref_tanh_fp32(input_tensor, output_tensor, exec_graph->num_thread);

    return 0;

--- a/src/dev/cpu/op/unary/unary_ref.c
+++ b/src/dev/cpu/op/unary/unary_ref.c
@@ -38,11 +38,7 @@ static int ref_unary_fp32(struct ir_tensor* input_tensor, struct ir_tensor* outp
    float* in_data = input_tensor->data;
    float* out_data = output_tensor->data;

-    int n = input_tensor->dims[0];
-    int c = input_tensor->dims[1];
-    int h = input_tensor->dims[2];
-    int w = input_tensor->dims[3];
-    int size = n * c * h * w;
+    int size = input_tensor->elem_num;

    int type = param->type;


--- a/src/op/concat.c
+++ b/src/op/concat.c
@@ -81,6 +81,14 @@ static int infer_shape(struct ir_node* node)

    for (int i = 0; i < node->input_num; i++)
    {
+        struct ir_tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
+
+        if (axis < 0)
+        {
+            axis = input_tensor->dim_num + axis;
+            concat_param->axis = axis;
+        }
+
        struct ir_tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]);
        concat_shape += input->dims[axis];
    }

--- a/src/op/eltwise.c
+++ b/src/op/eltwise.c
@@ -60,17 +60,20 @@ static int infer_shape(struct ir_node* node)

    int i0_size = input0->elem_num;
    int i1_size = input1->elem_num;
+    int dim_num = 0;

    if (i0_size >= i1_size)
    {
        memcpy(output->dims, input0->dims, input0->dim_num * sizeof(int));
+        dim_num = input0->dim_num;
    }
-    else if (i0_size < i1_size)
+    else
    {
-        memcpy(output->dims, input1->dims, input0->dim_num * sizeof(int));
+        memcpy(output->dims, input1->dims, input1->dim_num * sizeof(int));
+        dim_num = input1->dim_num;
    }

-    set_ir_tensor_shape(output, output->dims, 4);
+    set_ir_tensor_shape(output, output->dims, dim_num);

    return 0;
 }

--- a/src/op/interp.c
+++ b/src/op/interp.c
@@ -53,8 +53,16 @@ static int infer_shape(struct ir_node* node)
        return -1;
    }

-    param->output_height = in_h * param->height_scale;
-    param->output_width = in_w * param->width_scale;
+    if (param->height_scale != 0 && param->width_scale != 0)
+    {
+        param->output_height = in_h * param->height_scale;
+        param->output_width = in_w * param->width_scale;
+    }
+    else
+    {
+        param->height_scale = (float )param->output_height / (float )in_h;
+        param->width_scale = (float )param->output_width / (float )in_w;
+    }

    int dim[4] = {0};


--- a/src/op/reshape.c
+++ b/src/op/reshape.c
@@ -60,7 +60,8 @@ static int infer_shape(struct ir_node* node)
            else
            {
                int temp = 1;
-                push_vector_data(new_shape, ( void* )&temp);
+                if (i == 0)
+                    push_vector_data(new_shape, ( void* )&temp);
            }

            in_idx++;
@@ -146,6 +147,8 @@ static int infer_shape(struct ir_node* node)
        new_shape_temp[i] = *a;
    }

+    output->layout = input->layout;
+
    set_ir_tensor_shape(output, new_shape_temp, get_vector_num(new_shape));

    sys_free(new_shape_temp);

--- a/src/op/softmax.c
+++ b/src/op/softmax.c
@@ -41,6 +41,8 @@ static int infer_shape(struct ir_node* node)
    struct ir_tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
    struct ir_tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);

+    output->layout = input->layout;
+
    set_ir_tensor_shape(output, input->dims, input->dim_num);

    return 0;

--- a/src/op/tanh.c
+++ b/src/op/tanh.c
@@ -34,10 +34,23 @@

 // DEFINE_PARM_PARSE_ENTRY(tanh_param, negative_slope);

+static int infer_shape(struct ir_node* node)
+{
+    struct ir_graph* ir_graph = node->graph;
+    struct ir_tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
+    struct ir_tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
+
+    output->layout = input->layout;
+    
+    set_ir_tensor_shape(output, input->dims, input->dim_num);
+
+    return 0;
+}
+
 static int init_op(struct ir_op* op)
 {
-    op->same_shape = 1;
-    op->infer_shape = NULL;
+    op->same_shape = 0;
+    op->infer_shape = infer_shape;

    return 0;
 }

--- a/src/op/unary.c
+++ b/src/op/unary.c
@@ -40,6 +40,8 @@ static int infer_shape(struct ir_node* node)
    struct ir_tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
    struct ir_tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);

+    output->layout = input->layout;
+
    set_ir_tensor_shape(output, input->dims, input->dim_num);

    return 0;

--- a/src/serializer/tm/op/tm2_const.c
+++ b/src/serializer/tm/op/tm2_const.c
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "sys_port.h"
-#include "module.h"
-#include "tengine_ir.h"
-#include "tengine_errno.h"
-#include "tengine_log.h"
-#include "tengine_serializer.h"
-#include "tm2_serializer.h"
-#include "tengine_op.h"
-
-static int const_op_map(int op)
-{
-    return OP_CONST;
-}
-
-static int tm2_load_const(struct ir_graph* ir_graph, struct ir_node* ir_node, const TM2_Node* tm_node,
-                          const TM2_Operator* tm_op)
-{
-    return 0;
-}
-
-static int reg_tm2_ops(void* arg)
-{
-    struct serializer* tm2_s = find_serializer("tengine");
-
-    if (tm2_s == NULL)
-    {
-        TLOG_ERR("tengine serializer has not been registered yet\n");
-        return -1;
-    }
-
-    tm2_s->register_op_loader(tm2_s, TM2_OPTYPE_CONST, 1, tm2_load_const, const_op_map, NULL);
-
-    return 0;
-}
-
-static int unreg_tm2_ops(void* arg)
-{
-    struct serializer* tm2_s = find_serializer("tengine");
-
-    tm2_s->unregister_op_loader(tm2_s, TM2_OPTYPE_CONST, 1, tm2_load_const);
-
-    return 0;
-}
-
-REGISTER_MODULE_INIT(MOD_OP_LEVEL, "reg_const_ops", reg_tm2_ops);
-REGISTER_MODULE_EXIT(MOD_OP_LEVEL, "unreg_const_ops", unreg_tm2_ops);
--- a/src/serializer/tm/op/tm2_deconv.c
+++ b/src/serializer/tm/op/tm2_deconv.c
@@ -62,10 +62,10 @@ static int tm2_load_deconv(struct ir_graph* ir_graph, struct ir_node* ir_node, c

    deconv_param->dilation_h = tm_param->dilation_h;
    deconv_param->dilation_w = tm_param->dilation_w;
-    /* TODO: get input_channel from tm_param */
-
-    deconv_param->group = deconv_param->group;

+    deconv_param->group = tm_param->group ;
+    deconv_param->num_output = tm_param->num_output ;
+    deconv_param->activation = tm_param->activation ;
    return 0;
 }


--- a/src/serializer/tm/op/tm2_interp.c
+++ b/src/serializer/tm/op/tm2_interp.c
@@ -48,11 +48,11 @@ static int tm2_load_interp(struct ir_graph* ir_graph, struct ir_node* ir_node, c
    const char* mem_base = tm2_priv->base;
    const TM2_InterpParam* tm_param = ( TM2_InterpParam* )(mem_base + tm_op->offset_t_param);

-    param->height_scale = tm_param->height_scale;
-    param->output_height = tm_param->output_height;
-    param->output_width = tm_param->output_width;
    param->resize_type = tm_param->resize_type;
    param->width_scale = tm_param->width_scale;
+    param->height_scale = tm_param->height_scale;
+    param->output_width = tm_param->output_width;
+    param->output_height = tm_param->output_height;

    return 0;
 }