未验证 提交 f2d49d40 编写于 作者: B BUG1989 提交者: GitHub

add yolact demo (#366)

* update yolact demo

* update action for linux-x86 opencv and avx2

* Update workflows.yml
上级 7561683b
......@@ -7,12 +7,30 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: update
run: sudo apt-get update
# Install the OpenCV development package needed by the OpenCV-based examples.
- name: opencv
  run: sudo apt-get install -y libopencv-dev
- name: configure
run: mkdir build && cd build && cmake ..
- name: build
run: cmake --build build -j 2
# x86 Linux build configured with -DTENGINE_AVX2=ON.
linux-gcc-avx2:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v1
    - name: update
      run: sudo apt-get update
    # The step installs OpenCV (not protobuf), so it is named accordingly;
    # -y keeps the install non-interactive on any runner.
    - name: opencv
      run: sudo apt-get install -y libopencv-dev
    - name: configure
      run: mkdir build && cd build && cmake -DTENGINE_AVX2=ON ..
    - name: build
      run: cmake --build build -j 2
android-armv7-a:
runs-on: ubuntu-latest
steps:
......
......@@ -65,7 +65,7 @@ option(TENGINE_BUILD_TESTS "build tests" OFF)
option(TENGINE_DEBUG_DATA "extract data for every layer" OFF)
option(TENGINE_DEBUG_TIME "print time information for every layer" OFF)
option(TENGINE_DEBUG_MEM_STAT "print memory status for library" OFF)
option(TENGINE_ARCH_X86_AVX "build avx2 for x86" OFF)
option(TENGINE_AVX2 "build avx2 for x86" OFF)
# add_definitions(-DCONFIG_DISABLE_PARAM_ACCESS)
......
......@@ -78,6 +78,6 @@ Tengine Lite 参考和借鉴了下列项目:
## 技术讨论
- Github issues
- QQ 群: 829565581 (答案:openailab)
- QQ 群: 829565581
- Email: Support@openailab.com
- Tengine 社区: http://www.tengine.org.cn/
......@@ -81,6 +81,6 @@ Tengine Lite got ideas and developed based on these projects:
## Tech Forum
- Github issues
- QQ groupchat: 829565581 (Answer: openailab)
- QQ groupchat: 829565581
- Email: Support@openailab.com
- Tengine Community: http://www.tengine.org.cn/
# macro for adding test
# macro for adding examples
macro (tengine_example name file)
add_executable(${name}
"${CMAKE_CURRENT_SOURCE_DIR}/${file}"
......@@ -17,3 +17,20 @@ tengine_example(tm_mobilenet_ssd_uint8 tm_mobilenet_ssd_uint8.cpp)
tengine_example(tm_retinaface tm_retinaface.cpp)
tengine_example(tm_yolov3_tiny tm_yolov3_tiny.cpp)
# add examples with opencv
#
# OpenCV is optional: when it is missing the OpenCV-based examples are skipped
# with a warning. find_package() must therefore NOT use REQUIRED, otherwise the
# configure step aborts and the else() branch below can never be reached.
if (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
    find_package(OpenCV QUIET)

    if(OpenCV_FOUND)
        include_directories(${OpenCV_INCLUDE_DIRS})

        # macro for adding examples that link against OpenCV
        macro (tengine_example_cv name file)
            add_executable(${name} "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
            target_link_libraries(${name} ${CMAKE_PROJECT_NAME} ${OpenCV_LIBS})
        endmacro()

        tengine_example_cv(tm_yolact tm_yolact.cpp)
    else()
        message(WARNING "OpenCV not found, some examples won't be built")
    endif()
endif()
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: qtang@openailab.com
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/blob/master/examples/yolact.cpp
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

#include <algorithm>
#include <cmath>
#include <vector>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "tengine_c_api.h"
/* One detection produced by detect_yolact() and consumed by draw_objects(). */
struct Object
{
cv::Rect_<float> rect; // bounding box in source-image pixel coordinates
int label; // class index (used to index class_names when drawing)
float prob; // score of the winning class
std::vector<float> maskdata; // 32 mask coefficients copied from the network's mask output
cv::Mat mask; // CV_8UC1 image-sized mask: 255 inside the object, 0 elsewhere
};
/*
 * Convert an OpenCV image into a planar (channel-major) float buffer.
 *
 * sample      source image with 1, 3 or 4 channels
 * input_data  destination buffer holding at least 3 * img_h * img_w floats
 * img_h/img_w target height / width the image is resized to
 * mean/scale  per-channel values; each output is (pixel - mean[c]) * scale[c]
 * swapRB      when 1, a 3-channel image is converted BGR -> RGB first
 */
void get_input_data_cv(const cv::Mat& sample, float* input_data, int img_h, int img_w, const float* mean, const float* scale, int swapRB = 0)
{
    cv::Mat converted;
    if (sample.channels() == 4)
    {
        cv::cvtColor(sample, converted, cv::COLOR_BGRA2BGR);
    }
    else if (sample.channels() == 1)
    {
        cv::cvtColor(sample, converted, cv::COLOR_GRAY2BGR);
    }
    else if (sample.channels() == 3 && swapRB == 1)
    {
        cv::cvtColor(sample, converted, cv::COLOR_BGR2RGB);
    }
    else
    {
        converted = sample;
    }

    // cv::Size is (width, height). Passing (img_h, img_w) only worked for square
    // targets and transposed the output size for every other shape.
    cv::Mat resized;
    cv::resize(converted, resized, cv::Size(img_w, img_h));

    cv::Mat img_f;
    resized.convertTo(img_f, CV_32FC3);

    // Interleaved HWC pixels -> planar CHW buffer, normalising on the way.
    const int hw = img_h * img_w;
    for (int h = 0; h < img_h; h++)
    {
        const float* row = img_f.ptr<float>(h);
        for (int w = 0; w < img_w; w++)
        {
            for (int c = 0; c < 3; c++)
            {
                input_data[c * hw + h * img_w + w] = (row[w * 3 + c] - mean[c]) * scale[c];
            }
        }
    }
}
/* Prior (anchor) box in centre/size form, normalised by the 550-pixel network input. */
struct Box2f
{
    float cx;
    float cy;
    float w;
    float h;
};

/*
 * Build the prior boxes for the five feature-map levels (69, 35, 18, 9 and 5
 * cells per side, three aspect ratios per cell, 19248 boxes in total).
 *
 * num_priores  size of the returned vector. At most this many boxes are
 *              written; surplus entries (if the caller asks for more than
 *              19248) stay zero-initialised. Values <= 0 give an empty vector.
 */
static std::vector<Box2f> generate_priorbox(int num_priores)
{
    std::vector<Box2f> priorboxs(num_priores > 0 ? num_priores : 0);

    const int conv_ws[5] = {69, 35, 18, 9, 5};
    const int conv_hs[5] = {69, 35, 18, 9, 5};
    const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
    const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};

    const size_t capacity = priorboxs.size();
    size_t index = 0;

    for (int level = 0; level < 5; level++)
    {
        const int conv_w = conv_ws[level];
        const int conv_h = conv_hs[level];
        const float scale = scales[level];

        for (int row = 0; row < conv_h; row++)
        {
            for (int col = 0; col < conv_w; col++)
            {
                const float cx = (col + 0.5f) / conv_w;
                const float cy = (row + 0.5f) / conv_h;

                for (int k = 0; k < 3; k++)
                {
                    // Never write past the vector the caller asked for.
                    if (index >= capacity)
                        return priorboxs;

                    const float ar = std::sqrt(aspect_ratios[k]);
                    const float w = scale * ar / 550;
                    // Priors are deliberately square (h == w), as in the
                    // referenced ncnn example this code is derived from.
                    const float h = w;

                    Box2f& priorbox = priorboxs[index];
                    priorbox.cx = cx;
                    priorbox.cy = cy;
                    priorbox.w = w;
                    priorbox.h = h;
                    index += 1;
                }
            }
        }
    }

    return priorboxs;
}
static inline float intersection_area(const Object& a, const Object& b)
{
cv::Rect_<float> inter = a.rect & b.rect;
return inter.area();
}
static void fast_nms(std::vector< std::vector<Object> >& class_candidates, std::vector<Object>& objects, const float iou_thresh, const int nms_top_k, const int keep_top_k)
{
for(int i = 0; i < (int)class_candidates.size(); i++)
{
std::vector<Object>& candidate = class_candidates[i];
std::sort(candidate.begin(), candidate.end(),
[](const Object& a, const Object& b) {return a.prob > b.prob;});
if (candidate.size() == 0)
continue;
if(nms_top_k != 0&& nms_top_k > candidate.size())
{
candidate.erase(candidate.begin()+nms_top_k, candidate.end());
}
objects.push_back(candidate[0]);
const int n = candidate.size();
std::vector<float> areas(n);
std::vector<int> keep(n);
for(int j = 0; j < n; j++)
{
areas[j] = candidate[j].rect.area();
}
std::vector< std::vector<float> > iou_matrix;
for(int j = 0; j < n; j++)
{
std::vector<float> iou_row(n);
for(int k = 0; k < n; k++)
{
float inter_area = intersection_area(candidate[j], candidate[k]);
float union_area = areas[j] + areas[k] - inter_area;
iou_row[k] = inter_area / union_area;
}
iou_matrix.push_back(iou_row);
}
for(int j = 1; j < n; j++)
{
std::vector<float>::iterator max_value;
max_value = std::max_element(iou_matrix[j].begin(), iou_matrix[j].begin()+j-1);
if(*max_value <= iou_thresh)
{
objects.push_back(candidate[j]);
}
}
}
std::sort(objects.begin(), objects.end(),
[](const Object& a, const Object& b) {return a.prob > b.prob;});
if(objects.size() > keep_top_k)
objects.resize(keep_top_k);
}
/*
 * Run the YOLACT model ("./yolact_tm.tmfile") on `bgr` and fill `objects` with
 * the detections that survive NMS, each with an image-sized binary mask.
 *
 * bgr      input image as loaded by OpenCV (BGR channel order)
 * objects  cleared on entry; on success holds the detections sorted by score
 *
 * Returns 0 on success and -1 on failure. Every exit path releases the input
 * buffer, the graph and the tengine context that were acquired so far.
 */
static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects)
{
    objects.clear();

    /* inital tengine */
    if (init_tengine() != 0)
    {
        fprintf(stderr, "Initial tengine failed.\n");
        return -1;
    }
    fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());

    const int target_size = 550;
    const int num_class = 81;
    const int num_priors = 19248;
    const int num_coeff = 32;    // mask coefficients per prior / channels of the mask maps
    const int proto_size = 138;  // side length of the network's mask maps

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
    const float norm_vals[3] = {1.0 / 58.40f, 1.0 / 57.12f, 1.0 / 57.38f};

    graph_t graph = NULL;
    float* input_data = NULL;
    bool prerun_done = false;

    // Single place that undoes whatever has been set up so far.
    auto cleanup = [&]() {
        free(input_data);
        input_data = NULL;
        if (graph != NULL)
        {
            if (prerun_done)
                postrun_graph(graph);
            destroy_graph(graph);
            graph = NULL;
        }
        release_tengine();
    };
    auto fail = [&](const char* msg) -> int {
        fprintf(stderr, "%s", msg);
        cleanup();
        return -1;
    };

    /* create graph, load tengine model xxx.tmfile */
    graph = create_graph(NULL, "tengine", "./yolact_tm.tmfile");
    if (NULL == graph)
    {
        fprintf(stderr, "Create graph failed.\n");
        fprintf(stderr, "errno: %d \n", get_tengine_errno());
        cleanup();
        return -1;
    }

    /* set the input shape to initial the graph, and prerun graph to infer shape */
    const int img_size = target_size * target_size * 3;
    int dims[] = {1, 3, target_size, target_size};    // nchw

    input_data = ( float* )malloc(img_size * sizeof(float));
    if (input_data == NULL)
        return fail("Allocate input buffer failed\n");

    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
    if (input_tensor == NULL)
        return fail("Get input tensor failed\n");

    if (set_tensor_shape(input_tensor, dims, 4) < 0)
        return fail("Set input tensor shape failed\n");

    if (prerun_graph_multithread(graph, TENGINE_CLUSTER_ALL, 1) < 0)
        return fail("Prerun multithread graph failed.\n");
    prerun_done = true;

    /* prepare process input data, set the data mem to input tensor */
    get_input_data_cv(bgr, input_data, target_size, target_size, mean_vals, norm_vals, 1);
    if (set_tensor_buffer(input_tensor, input_data, ( int )(img_size * sizeof(float))) < 0)
        return fail("Set input tensor buffer failed\n");

    dump_graph(graph);

    /* run graph */
    if (run_graph(graph, 1) < 0)
        return fail("Run graph failed\n");

    /* fetch the four outputs: mask maps, box regression, mask coefficients, class scores */
    float* outputs[4] = {NULL, NULL, NULL, NULL};
    for (int i = 0; i < 4; i++)
    {
        tensor_t out_tensor = get_graph_output_tensor(graph, i, 0);
        if (out_tensor != NULL)
            outputs[i] = ( float* )get_tensor_buffer(out_tensor);
        if (outputs[i] == NULL)
            return fail("Get output tensor failed\n");
    }
    const float* maskmaps = outputs[0];
    const float* location = outputs[1];
    const float* mask = outputs[2];
    const float* confidence = outputs[3];

    std::vector<Box2f> priorboxes = generate_priorbox(num_priors);

    const float confidence_thresh = 0.05f;
    const float nms_thresh = 0.5f;
    const int keep_top_k = 200;

    std::vector< std::vector<Object> > class_candidates;
    class_candidates.resize(num_class);

    for (int i = 0; i < num_priors; i++)
    {
        const float* conf = confidence + i * num_class;
        const float* loc = location + i * 4;
        const float* maskdata = mask + i * num_coeff;
        const Box2f& priorbox = priorboxes[i];

        /* best non-background class (index 0 is background) */
        int label = 0;
        float score = 0.f;
        for (int j = 1; j < num_class; j++)
        {
            const float class_score = conf[j];
            if (class_score > score)
            {
                label = j;
                score = class_score;
            }
        }

        if (label == 0 || score <= confidence_thresh)
            continue;

        /* decode the regression relative to the prior box */
        const float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};

        const float bbox_cx = var[0] * loc[0] * priorbox.w + priorbox.cx;
        const float bbox_cy = var[1] * loc[1] * priorbox.h + priorbox.cy;
        const float bbox_w = std::exp(var[2] * loc[2]) * priorbox.w;
        const float bbox_h = std::exp(var[3] * loc[3]) * priorbox.h;

        float obj_x1 = bbox_cx - bbox_w * 0.5f;
        float obj_y1 = bbox_cy - bbox_h * 0.5f;
        float obj_x2 = bbox_cx + bbox_w * 0.5f;
        float obj_y2 = bbox_cy + bbox_h * 0.5f;

        /* scale to image pixels and clip to the image */
        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, ( float )(bgr.cols - 1)), 0.f);
        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, ( float )(bgr.rows - 1)), 0.f);
        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, ( float )(bgr.cols - 1)), 0.f);
        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, ( float )(bgr.rows - 1)), 0.f);

        Object obj;
        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
        obj.label = label;
        obj.prob = score;
        obj.maskdata = std::vector<float>(maskdata, maskdata + num_coeff);

        class_candidates[label].push_back(obj);
    }

    fast_nms(class_candidates, objects, nms_thresh, 0, keep_top_k);

    for (size_t i = 0; i < objects.size(); i++)
    {
        Object& obj = objects[i];

        /* mask1 = sum over p of maskmaps[..., p] * coefficient[p] */
        cv::Mat mask1(proto_size, proto_size, CV_32FC1);
        mask1 = cv::Scalar(0.f);
        float* mp = ( float* )mask1.data;
        for (int p = 0; p < num_coeff; p++)
        {
            const float* maskmap = maskmaps + p;
            const float coeff = obj.maskdata[p];

            // the mask maps are read with a stride of num_coeff (channel-last)
            for (int j = 0; j < proto_size * proto_size; j++)
            {
                mp[j] += maskmap[j * num_coeff] * coeff;
            }
        }

        cv::Mat mask2;
        cv::resize(mask1, mask2, cv::Size(img_w, img_h));

        // crop obj box and binarize
        obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
        obj.mask = cv::Scalar(0);
        for (int y = 0; y < img_h; y++)
        {
            if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
                continue;

            const float* mp2 = mask2.ptr<const float>(y);
            uchar* bmp = obj.mask.ptr<uchar>(y);

            for (int x = 0; x < img_w; x++)
            {
                if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
                    continue;

                bmp[x] = mp2[x] > 0.5f ? 255 : 0;
            }
        }
    }

    /* release tengine */
    cleanup();

    return 0;
}
/*
 * Draw every detection with prob >= 0.15 (box, "<class> <score>%" label and a
 * half-transparent mask overlay) on a copy of `bgr`, log it to stderr and write
 * the result to "result.png".
 *
 * bgr      3-channel source image (left untouched)
 * objects  detections as produced by detect_yolact()
 */
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "person", "bicycle", "car", "motorcycle", "airplane", "bus",
                                        "train", "truck", "boat", "traffic light", "fire hydrant",
                                        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
                                        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
                                        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
                                        "skis", "snowboard", "sports ball", "kite", "baseball bat",
                                        "baseball glove", "skateboard", "surfboard", "tennis racket",
                                        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
                                        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
                                        "hot dog", "pizza", "donut", "cake", "chair", "couch",
                                        "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
                                        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
                                        "toaster", "sink", "refrigerator", "book", "clock", "vase",
                                        "scissors", "teddy bear", "hair drier", "toothbrush"};
    static const int num_names = ( int )(sizeof(class_names) / sizeof(class_names[0]));

    static const unsigned char colors[19][3] = {
        {244, 67, 54},
        {233, 30, 99},
        {156, 39, 176},
        {103, 58, 183},
        {63, 81, 181},
        {33, 150, 243},
        {3, 169, 244},
        {0, 188, 212},
        {0, 150, 136},
        {76, 175, 80},
        {139, 195, 74},
        {205, 220, 57},
        {255, 235, 59},
        {255, 193, 7},
        {255, 152, 0},
        {255, 87, 34},
        {121, 85, 72},
        {158, 158, 158},
        {96, 125, 139}};
    static const int num_colors = ( int )(sizeof(colors) / sizeof(colors[0]));

    cv::Mat image = bgr.clone();

    int color_index = 0;

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        if (obj.prob < 0.15)
            continue;

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        // Cycle through the palette; indexing with an ever-growing counter read
        // past the table as soon as more than 19 objects were drawn.
        const unsigned char* color = colors[color_index % num_colors];
        color_index++;

        cv::rectangle(image, obj.rect, cv::Scalar(color[0], color[1], color[2]));

        const char* class_name =
            (obj.label >= 0 && obj.label < num_names) ? class_names[obj.label] : "unknown";

        char text[256];
        snprintf(text, sizeof(text), "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        /* keep the label inside the image */
        int label_x = obj.rect.x;
        int label_y = obj.rect.y - label_size.height - baseLine;
        if (label_y < 0)
            label_y = 0;
        if (label_x + label_size.width > image.cols)
            label_x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(label_x, label_y),
                                      cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(label_x, label_y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));

        // draw mask (skipped when the object carries no image-sized mask)
        if (obj.mask.rows != image.rows || obj.mask.cols != image.cols)
            continue;

        for (int y = 0; y < image.rows; y++)
        {
            const uchar* mp = obj.mask.ptr(y);
            uchar* p = image.ptr(y);
            for (int x = 0; x < image.cols; x++)
            {
                if (mp[x] == 255)
                {
                    p[0] = cv::saturate_cast<uchar>(p[0] * 0.5 + color[0] * 0.5);
                    p[1] = cv::saturate_cast<uchar>(p[1] * 0.5 + color[1] * 0.5);
                    p[2] = cv::saturate_cast<uchar>(p[2] * 0.5 + color[2] * 0.5);
                }
                p += 3;
            }
        }
    }

    cv::imwrite("result.png", image);
}
/*
 * Usage: <program> [imagepath]
 * Loads the image, runs YOLACT on it and writes the annotated result.
 * Returns 0 on success, -1 on bad usage, unreadable image or failed inference.
 */
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    // Do not draw (or report success) when inference failed.
    if (detect_yolact(m, objects) != 0)
    {
        fprintf(stderr, "detect_yolact failed\n");
        return -1;
    }

    draw_objects(m, objects);

    return 0;
}
......@@ -42,7 +42,7 @@ endif()
# X86
if (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
if (TENGINE_ARCH_X86_AVX)
if (TENGINE_AVX2)
add_definitions(-mfma -mf16c)
endif()
file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/*hcl.c"
......
......@@ -8,10 +8,8 @@ struct conv_priv_info
{
void* interleave_buffer; // kernel transform buffer
void* interleave_buffer_pack4; // kernel pack4
void* p_input_max;
void* im2col_buffer; // input data transform buffer
void* im2col_buffer_pack4; // input data transform buffer pack4
void* p_kernel_max;
void* input_pad;
void* dot_block;
void* transform_input;
......
......@@ -8,8 +8,6 @@ struct conv_priv_info
{
void* interleave_buffer;
void* im2col_buffer;
void* p_input_max;
void* p_kernel_max;
int im2col_buffer_size;
int interleave_buffer_size;
int external_im2col_mem;
......
......@@ -985,9 +985,9 @@ static inline void transform_output_f43_4tile(float* buffer_ptr, float* out, int
// trans_input [block_hw/4][ELEM_SIZE][inc][4]
// kernel [out_c/PER_OUT_CHAN][ELEM_SIZE][in_c][PER_OUT_CHAN]
static void wino_sgemm_set(const float* ker, const float* inp, float* output, const float* bias, int cin, int cout_end,
int block_h, int block_w, int out_h, int out_w, int resi_h, int resi_w, int activation,
int num_thread, int cpu_affinity)
static void wino_sgemm_set(const float* ker, const float* inp, float* output, const float* bias, int cin,
int cout_end, int block_h, int block_w, int out_h, int out_w, int resi_h,
int resi_w, int activation, int num_thread, int cpu_affinity)
{
int flag_outw = 1;
if (out_w < 16)
......
......@@ -29,7 +29,10 @@
/*
 * Size in bytes of the private (interleave) buffer needed for `filter`.
 *
 * uint8 weights are dequantised to fp32 by interleave_uint8(), so that path
 * needs four times the raw element size. The previous unconditional return in
 * front of this test made the uint8 branch unreachable and under-sized the
 * buffer.
 */
static int get_private_mem_size(struct ir_tensor* filter)
{
    if (filter->data_type == TENGINE_DT_UINT8)    // simulator uint8 inference with fp32
        return filter->elem_num * filter->elem_size * 4;
    else
        return filter->elem_num * filter->elem_size;    // caution
}
static void interleave(struct ir_tensor* filter, struct conv_priv_info* priv_info)
......@@ -38,7 +41,21 @@ static void interleave(struct ir_tensor* filter, struct conv_priv_info* priv_inf
memcpy(priv_info->interleave_buffer, filter->data, filter->elem_num * filter->elem_size);
}
static inline void copy_one_element(void* src, void* dst, int src_off, int dst_off, int elem_type, int zero_point)
/*
 * Expand the quantised uint8 weights of `filter` into fp32 values stored in
 * priv_info->interleave_buffer: value = (q - zero_point) * scale.
 */
static void interleave_uint8(struct ir_tensor* filter, struct conv_priv_info* priv_info)
{
    uint8_t* quantised = ( uint8_t* )filter->data;
    float* dequantised = ( float* )priv_info->interleave_buffer;

    float w_scale = filter->scale;
    int w_zero = filter->zero_point;

    for (int idx = 0; idx < filter->elem_num; idx++)
        dequantised[idx] = (( float )quantised[idx] - ( float )w_zero) * w_scale;
}
static inline void copy_one_element(void* src, void* dst, int src_off, int dst_off, int elem_type, int zero_point, float scale)
{
switch (elem_type)
{
......@@ -65,11 +82,11 @@ static inline void copy_one_element(void* src, void* dst, int src_off, int dst_o
int_dst[dst_off] = int_src[src_off] - zero_point;
}
break;
case TENGINE_DT_UINT8:
case TENGINE_DT_UINT8: // simulator uint8 inference with fp32
{
int8_t* int_dst = dst;
float* int_dst = dst;
uint8_t* int_src = src;
int_dst[dst_off] = (int8_t)((int)int_src[src_off] - (int)zero_point);
int_dst[dst_off] = ((float)int_src[src_off] - (float)zero_point) * scale;
}
break;
}
......@@ -96,14 +113,70 @@ static inline void zero_one_element(void* dst, int dst_off, int elem_type)
case TENGINE_DT_INT8:
case TENGINE_DT_UINT8:
{
int8_t* int_dst = dst;
float* int_dst = dst; // simulator uint8 inference with fp32
int_dst[dst_off] = 0x0;
}
break;
}
}
static void im2col(struct ir_tensor* input, struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, int group)
/*
 * Unfold one (batch n, group) slice of `input` into priv_info->im2col_buffer.
 *
 * The buffer holds one row of kernel_h * kernel_w * channels_per_group entries
 * per output position; taps that fall into the padding are zero-filled via
 * zero_one_element(), all others are copied via copy_one_element().
 */
static void im2col_fp32(struct ir_tensor* input, struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, int group)
{
    /* geometry of the input slice and of the output map */
    int chan_per_group = param->input_channel / param->group;
    int in_h = input->dims[2];
    int in_w = input->dims[3];
    int out_h = output->dims[2];
    int out_w = output->dims[3];
    int batch_stride = input->dims[1] * input->dims[2] * input->dims[3];
    int group_stride = chan_per_group * input->dims[2] * input->dims[3];

    void* src = input->data + (n * batch_stride + group * group_stride) * input->elem_size;
    void* dst = priv_info->im2col_buffer;

    float q_scale = input->scale;
    int q_zero = input->zero_point;
    int dtype = input->data_type;

    int k_w = param->kernel_w;
    int taps_per_chan = param->kernel_h * param->kernel_w;
    int patch_len = param->kernel_h * param->kernel_w * chan_per_group;

    for (int tap = 0; tap < patch_len; tap++)
    {
        /* split the flat tap index into (channel, kernel row, kernel column) */
        int chan = tap / taps_per_chan;
        int within = tap % taps_per_chan;
        int k_row = within / k_w;
        int k_col = within % k_w;

        for (int oy = 0; oy < out_h; oy++)
        {
            int iy = oy * param->stride_h - param->pad_h0 + k_row * param->dilation_h;

            for (int ox = 0; ox < out_w; ox++)
            {
                int ix = ox * param->stride_w - param->pad_w0 + k_col * param->dilation_w;
                int dst_off = (oy * out_w + ox) * patch_len + tap;

                if (iy < 0 || ix < 0 || iy >= in_h || ix >= in_w)
                {
                    /* tap lies in the padding */
                    zero_one_element(dst, dst_off, dtype);
                    continue;
                }

                int src_off = chan * in_h * in_w + iy * in_w + ix;
                copy_one_element(src, dst, src_off, dst_off, dtype, q_zero, q_scale);
            }
        }
    }
}
static void im2col_uint8(struct ir_tensor* input, struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n, int group)
{
int input_chan = param->input_channel / param->group;
int image_size = input->dims[1] * input->dims[2] * input->dims[3];
......@@ -112,6 +185,7 @@ static void im2col(struct ir_tensor* input, struct ir_tensor* output, struct con
void* input_base = input->data + (n * image_size + group * group_size) * input->elem_size;
void* im2col_buf = priv_info->im2col_buffer;
float scale = input->scale;
int zero_point = input->zero_point;
int k_h = param->kernel_h;
......@@ -149,7 +223,7 @@ static void im2col(struct ir_tensor* input, struct ir_tensor* output, struct con
if (img_h >= 0 && img_w >= 0 && img_h < in_h && img_w < in_w)
{
int in_off = c_off * in_h * in_w + img_h * in_w + img_w;
copy_one_element(input_base, im2col_buf, in_off, out_off, data_type, zero_point);
copy_one_element(input_base, im2col_buf, in_off, out_off, data_type, zero_point, scale);
}
else
zero_one_element(im2col_buf, out_off, data_type);
......@@ -244,91 +318,115 @@ static void sgemm_fp32(struct ir_tensor* input, struct ir_tensor* filter, struct
}
}
static void sgemm_uint8(struct ir_tensor* input_tensor, struct ir_tensor* filter_tensor, struct ir_tensor* bias_tensor,
struct ir_tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param,
int n, int group, int num_thread)
static void sgemm_uint8(struct ir_tensor* input, struct ir_tensor* filter, struct ir_tensor* bias,
struct ir_tensor* output, struct conv_priv_info* priv_info, struct conv_param* param, int n,
int group, int num_thread)
{
int kernel_size = param->kernel_h * param->kernel_w * param->input_channel / param->group;
int outchan_g = param->output_channel / param->group;
int out_h = output_tensor->dims[2];
int out_w = output_tensor->dims[3];
int out_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3];
/* data point */
unsigned char* interleave_uint8 = ( unsigned char* )priv_info->interleave_buffer + outchan_g * group * kernel_size;
signed char* im2col_int8 = priv_info->im2col_buffer;
unsigned char* output_uint8 = ( unsigned char* )output_tensor->data + n * out_image_size + outchan_g * group * out_h * out_w;
int* bias_int32 = NULL;
if (bias_tensor)
bias_int32 = ( int* )bias_tensor->data + outchan_g * group;
/* quantizaion scale and zero-point */
float input_scale = input_tensor->scale;
float weight_scale = filter_tensor->scale;
float output_scale = output_tensor->scale;
// float bias_scale = 0.f;
// if (bias_tensor)
// bias_scale = bias_tensor->scale;
unsigned char input_zero = input_tensor->zero_point;
unsigned char weight_zero = filter_tensor->zero_point;
unsigned char output_zero = output_tensor->zero_point;
/* int8 sgemm */
int out_c = output->dims[1];
int out_h = output->dims[2];
int out_w = output->dims[3];
int out_image_size = out_c * out_h * out_w;
int out_group_size = outchan_g * out_h * out_w;
float* interleave_fp32 = ( float* )priv_info->interleave_buffer + outchan_g * group * kernel_size;
float* im2col_fp32 = priv_info->im2col_buffer;
float* output_fp32 = (float*)sys_malloc(out_group_size * sizeof(float));
uint8_t* output_uint8 = ( uint8_t* )output->data + n * out_image_size + outchan_g * group * out_h * out_w;
int32_t* bias_int32 = NULL;
float bias_scale = 0.f;
if (bias)
{
bias_int32 = ( int32_t* )bias->data + outchan_g * group;
bias_scale = input->scale * filter->scale;
}
#pragma omp parallel for num_threads(num_thread)
for (int i = 0; i < outchan_g; i++)
for(int i = 0; i < outchan_g; i++)
{
unsigned char* kernel = interleave_uint8 + i * kernel_size;
signed char* input = im2col_int8;
unsigned char* output = output_uint8 + i * (out_h * out_w);
float* kernel = interleave_fp32 + i * kernel_size;
float* input = im2col_fp32;
float* output = output_fp32 + i * (out_h * out_w);
for (int j = 0; j < out_h * out_w; j++)
{
int im2col_off = j * kernel_size;
int sum_int32 = bias_tensor ? bias_int32[i] : 0;
float sum = 0.f;
for (int k = 0; k < kernel_size; k++)
{
int input_data = input[im2col_off + k];
int input_data_u32 = ( unsigned char )input[im2col_off + k];
int kernel_data = kernel[k] - weight_zero;
if (input_zero == 0)
sum_int32 += input_data_u32 * kernel_data;
else
sum_int32 += input_data * kernel_data;
sum += kernel[k] * input[im2col_off + k];
}
output[0] = sum;
output++;
}
}
// dequant sum from int32 to fp32
float sum_fp32 = (float)sum_int32 * input_scale * weight_scale;
// relu
if (param->activation > 0)
// process bias
if (bias)
{
for (int i = 0; i < outchan_g; i++)
{
for (int j = 0; j < out_h * out_w; j++)
{
if (sum_fp32 < 0)
sum_fp32 = 0;
int output_off = i * (out_h * out_w) + j;
output_fp32[output_off] += bias_int32[i] * bias_scale;
}
}
}
// relu6
if (param->activation > 0)
// process activation relu
if (param->activation == 0)
{
for (int i = 0; i < outchan_g; i++)
{
for (int j = 0; j < out_h * out_w; j++)
{
if (sum_fp32 < 0)
sum_fp32 = 0;
if (sum_fp32 > 6)
sum_fp32 = 6;
int output_off = i * (out_h * out_w) + j;
if (output_fp32[output_off] < 0)
output_fp32[output_off] = 0;
}
}
}
// quant output from fp32 to uint8
sum_int32 = round(sum_fp32 / output_scale) + output_zero;
if (sum_int32 > 255)
sum_int32 = 255;
if (sum_int32 < 0)
sum_int32 = 0;
output[0] = sum_int32;
output++;
// process activation relu6
if (param->activation > 0)
{
for (int i = 0; i < outchan_g; i++)
{
for (int j = 0; j < out_h * out_w; j++)
{
int output_off = i * (out_h * out_w) + j;
if (output_fp32[output_off] < 0)
output_fp32[output_off] = 0;
if (output_fp32[output_off] > 6)
output_fp32[output_off] = 6;
}
}
}
/* quant from fp32 to uint8 */
for (int i = 0; i < outchan_g; i++)
{
for (int j = 0; j < out_h * out_w; j++)
{
int output_off = i * (out_h * out_w) + j;
int udata = (int)(round(output_fp32[output_off] / output->scale) + output->zero_point);
if (udata > 255)
udata = 255;
else if (udata < 0)
udata = 0;
output_uint8[output_off] = udata;
}
}
sys_free(output_fp32);
}
int conv_kernel_get_shared_mem_size(struct ir_tensor* input, struct ir_tensor* output, struct conv_param* param)
......@@ -339,6 +437,10 @@ int conv_kernel_get_shared_mem_size(struct ir_tensor* input, struct ir_tensor* o
int output_xy = output->dims[2] * output->dims[3];
int elem_size = input->elem_size;
// simulator uint8 inference with fp32
if (input->data_type == TENGINE_DT_UINT8)
elem_size = 4;
return elem_size * output_xy * kernel_size;
}
......@@ -361,7 +463,10 @@ int conv_kernel_prerun(struct ir_tensor* input_tensor, struct ir_tensor* filter_
priv_info->interleave_buffer_size = mem_size;
}
interleave(filter_tensor, priv_info);
if (input_tensor->data_type == TENGINE_DT_UINT8)
interleave_uint8(filter_tensor, priv_info);
else
interleave(filter_tensor, priv_info);
return 0;
}
......@@ -394,11 +499,17 @@ int conv_kernel_run(struct ir_tensor* input_tensor, struct ir_tensor* filter_ten
{
for (int j = 0; j < group; j++)
{
im2col(input_tensor, output_tensor, priv_info, param, i, j);
if (type == TENGINE_DT_FP32)
{
im2col_fp32(input_tensor, output_tensor, priv_info, param, i, j);
sgemm_fp32(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, i, j, num_thread);
else
}
else if (type == TENGINE_DT_UINT8)
{
im2col_uint8(input_tensor, output_tensor, priv_info, param, i, j);
sgemm_uint8(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, i, j, num_thread);
}
}
}
......
......@@ -1255,7 +1255,7 @@ static int winograd_support(struct conv_param* param, int in_h, int in_w)
return 0;
if (group != 1 || kernel_h != 3 || kernel_w != 3 || stride_h != 1 || stride_w != 1 || dilation_h != 1 ||
dilation_w != 1 || input_chan < 16 || output_chan < 16)
dilation_w != 1 || input_chan < 16 || output_chan < 16 || output_chan % 16)
return 0;
return 1;
......
......@@ -159,9 +159,9 @@ static int release_node(struct node_ops* node_ops, struct exec_node* exec_node,
static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
struct ir_node* ir_node = exec_node->ir_node;
struct ir_node* ir_node = exec_node->ir_node;
struct ir_graph* ir_graph = ir_node->graph;
struct ir_tensor* loc_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct ir_tensor* loc_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct ir_tensor* conf_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
struct ir_tensor* priorbox_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
struct ir_tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
......@@ -218,12 +218,12 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
}
const int num_priorx4 = priorbox_tensor->dims[2];
const int num_prior = num_priorx4 / 4;
const int num_prior = num_priorx4 / 4;
const int num_classes = param->num_classes;
int b = 0;
float* loc_ptr = location + b * num_priorx4;
float* conf_ptr = confidence + b * num_prior * num_classes;
float* loc_ptr = location + b * num_priorx4;
float* conf_ptr = confidence + b * num_prior * num_classes;
float* prior_ptr = priorbox + b * num_priorx4 * 2;
Box_t boxes[num_prior];
......
......@@ -403,36 +403,71 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
input1_count4 = input_tensor1->elem_num;
}
int input_chan_0 = 0;
int input_hw_0 = 0;
int input0_count4 = input_tensor0->elem_num;
if (layout == TENGINE_LAYOUT_NCHW)
{
input_chan_0 = input_tensor0->dims[1];
input_hw_0 = input_tensor0->dims[2] * input_tensor0->dims[3];
}
else if (layout == TENGINE_LAYOUT_NHWC)
if (input_tensor0->elem_num >= input_tensor1->elem_num)
{
input_chan_0 = input_tensor0->dims[3];
input_hw_0 = input_tensor0->dims[1] * input_tensor0->dims[2];
int input_chan_0 = 0;
int input_hw_0 = 0;
int input0_count4 = input_tensor0->elem_num;
if (layout == TENGINE_LAYOUT_NCHW)
{
input_chan_0 = input_tensor0->dims[1];
input_hw_0 = input_tensor0->dims[2] * input_tensor0->dims[3];
}
else if (layout == TENGINE_LAYOUT_NHWC)
{
input_chan_0 = input_tensor0->dims[3];
input_hw_0 = input_tensor0->dims[1] * input_tensor0->dims[2];
}
else
{
TLOG_ERR("unknown graph layout: %d\n", ir_graph->graph_layout);
set_tengine_errno(EFAULT);
return -1;
}
int ret = -1;
if (input_tensor0->data_type == TENGINE_DT_FP32)
ret = ref_eltwise_fp32(output, input0, input1, eltwise_param->type, input0_count4, input_chan_0, input_hw_0,
input1_count4, exec_graph->num_thread);
else
ret = ref_eltwise_uint8(output_tensor, input_tensor0, input_tensor1, eltwise_param->type, input0_count4,
input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread);
return ret;
}
else
{
TLOG_ERR("unknown graph layout: %d\n", ir_graph->graph_layout);
set_tengine_errno(EFAULT);
return -1;
int input_chan_0 = 0;
int input_hw_0 = 0;
int input0_count4 = input_tensor1->elem_num;
input1_count4 = input_tensor0->elem_num;
if (layout == TENGINE_LAYOUT_NCHW)
{
input_chan_0 = input_tensor1->dims[1];
input_hw_0 = input_tensor1->dims[2] * input_tensor1->dims[3];
}
else if (layout == TENGINE_LAYOUT_NHWC)
{
input_chan_0 = input_tensor1->dims[3];
input_hw_0 = input_tensor1->dims[1] * input_tensor1->dims[2];
}
else
{
TLOG_ERR("unknown graph layout: %d\n", ir_graph->graph_layout);
set_tengine_errno(EFAULT);
return -1;
}
int ret = -1;
if (input_tensor1->data_type == TENGINE_DT_FP32)
ret = ref_eltwise_fp32(output, input1, input0, eltwise_param->type, input0_count4, input_chan_0, input_hw_0,
input1_count4, exec_graph->num_thread);
else
ret = ref_eltwise_uint8(output_tensor, input_tensor1, input_tensor0, eltwise_param->type, input0_count4,
input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread);
return ret;
}
int ret = -1;
if (input_tensor0->data_type == TENGINE_DT_FP32)
ret = ref_eltwise_fp32(output, input0, input1, eltwise_param->type, input0_count4, input_chan_0, input_hw_0,
input1_count4, exec_graph->num_thread);
else
ret = ref_eltwise_uint8(output_tensor, input_tensor0, input_tensor1, eltwise_param->type, input0_count4,
input_chan_0, input_hw_0, input1_count4, exec_graph->num_thread);
return ret;
}
static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct ir_node* exec_node)
......
......@@ -19,7 +19,7 @@
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: zpluo@openailab.com
* Author: qtang@openailab.com
*/
#include <math.h>
......@@ -32,177 +32,87 @@
#include "tengine_op.h"
#include "interp_param.h"
static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
#define INTERP_MIN(a, b) ((a) < (b) ? (a) : (b))
static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
typedef struct __interp_param
int ref_interp_fp32(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor, struct interp_param* param)
{
float width_scale;
float height_scale;
int batch_number;
int inc;
int inh;
int inw;
int output_width;
int output_height;
int in_channel_size;
int out_channel_size;
int* buf;
} _interp_param, *p_interp_param;
void linear_coeffs(int w, int outw, int* xofs, float* alpha)
{
double scale = ( double )w / outw;
for (int dx = 0; dx < outw; dx++)
if (input_tensor->dim_num != 4)
{
float fx = ( float )((dx + 0.5) * scale - 0.5);
int sx = floor(fx);
fx -= sx;
if (sx < 0)
{
sx = 0;
fx = 0.f;
}
if (sx >= w - 1)
{
sx = w - 2;
fx = 1.f;
}
xofs[dx] = sx;
alpha[dx * 2] = 1.f - fx;
alpha[dx * 2 + 1] = fx;
printf("interp dim num is not 4\n");
return -1;
}
}
void resize_bilinear_image(float* src, float* dst, float* alpha, int* xofs, float* beta, int* yofs, int out_h,
int out_w, int in_h, int in_w)
{
int w = out_w; // dst.w;
int h = out_h; // dst.h;
// loop body
float* rowsbuf0 = ( float* )sys_malloc(w * sizeof(float));
float* rowsbuf1 = ( float* )sys_malloc(w * sizeof(float));
float* rows0 = rowsbuf0;
float* rows1 = rowsbuf1;
float* input = input_tensor->data;
float* output = output_tensor->data;
int prev_sy1 = -2;
int batch = input_tensor->dims[0];
int channel = input_tensor->dims[1];
int in_h = input_tensor->dims[2];
int in_w = input_tensor->dims[3];
int out_h = output_tensor->dims[2];
int out_w = output_tensor->dims[3];
for (int dy = 0; dy < h; dy++)
for (int n = 0; n < batch; ++n)
{
int sy = yofs[dy];
if (sy == prev_sy1)
{
// reuse all rows
}
else if (sy == prev_sy1 + 1)
{
// hresize one row
float* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1);
const float* alphap = alpha;
float* rows1p = rows1;
for (int dx = 0; dx < w; dx++)
{
int sx = xofs[dx];
const float* S1p = S1 + sx;
float a0 = alphap[0];
float a1 = alphap[1];
rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
alphap += 2;
}
}
else
for (int c = 0; c < channel; ++c)
{
// hresize two rows
const float* S0 = src + sy * in_w; // src.row(sy);
const float* S1 = src + (sy + 1) * in_w; // src.row(sy+1);
const float* alphap = alpha;
float* rows0p = rows0;
float* rows1p = rows1;
for (int dx = 0; dx < w; dx++)
for (int y = 0; y < param->output_height; ++y)
{
int sx = xofs[dx];
const float* S0p = S0 + sx;
const float* S1p = S1 + sx;
float a0 = alphap[0];
float a1 = alphap[1];
rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
alphap += 2;
float in_y = INTERP_MIN(y / param->height_scale, (float)(in_h - 1));
const int in_y1 = INTERP_MIN((int)(in_y), in_h - 1);
const int in_y2 = INTERP_MIN(in_y1 + 1, in_h - 1);
float dy1 = fabs(in_y - in_y1);
float dy2 = fabs(in_y - in_y2);
if (in_y1 == in_y2)
{
dy1 = 0.5f;
dy2 = 0.5f;
}
const int input_width_mul_y1 = in_w * in_y1;
const int input_width_mul_y2 = in_w * in_y2;
for (int x = 0; x < param->output_width; ++x)
{
float in_x = INTERP_MIN(x / param->width_scale, (float)(in_w - 1));
const int in_x1 = INTERP_MIN((int)(in_x), in_w - 1);
const int in_x2 = INTERP_MIN(in_x1 + 1, in_w - 1);
float dx1 = fabs(in_x - in_x1);
float dx2 = fabs(in_x - in_x2);
if (in_x1 == in_x2)
{
dx1 = 0.5f;
dx2 = 0.5f;
}
float X11 = input[input_width_mul_y1 + in_x1];
float X21 = input[input_width_mul_y1 + in_x2];
float X12 = input[input_width_mul_y2 + in_x1];
float X22 = input[input_width_mul_y2 + in_x2];
output[param->output_width * y + x] = dx2 * dy2 * X11 +dx1 * dy2 * X21 +dx2 * dy1 * X12 +dx1 * dy1 * X22;
}
}
input += in_h * in_w;
output += param->output_width * param->output_height;
}
prev_sy1 = sy;
// vresize
float b0 = beta[0];
float b1 = beta[1];
float* rows0p = rows0;
float* rows1p = rows1;
float* Dp = dst + dy * out_w; // dst.row(dy);
for (int dx = 0; dx < w; dx++)
{
*Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
}
beta += 2;
}
sys_free(rowsbuf0);
sys_free(rowsbuf1);
rows0 = NULL;
rows1 = NULL;
return 0;
}
int ref_interp_fp32(float* input, float* output, p_interp_param param)
static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
int* xofs = param->buf; // new int[ow];
int* yofs = param->buf + param->output_width; // new int[oh];
float* alpha = ( float* )(param->buf + param->output_width + param->output_height); // new float[ow * 2];
float* beta = ( float* )(param->buf + param->output_width + param->output_height +
param->output_width * 2); // new float[oh * 2];
linear_coeffs(param->inw, param->output_width, xofs, alpha);
linear_coeffs(param->inh, param->output_height, yofs, beta);
return 0;
}
for (int q = 0; q < param->inc; ++q)
{
resize_bilinear_image(input + param->in_channel_size * q, output + param->out_channel_size * q, alpha, xofs,
beta, yofs, param->output_height, param->output_width, param->inh, param->inw);
}
static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}
......@@ -210,43 +120,11 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
{
struct ir_node* node = exec_node->ir_node;
struct ir_graph* graph = node->graph;
struct ir_tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
struct ir_tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);
void* input = input_tensor->data;
void* output = output_tensor->data;
struct interp_param* param = ( struct interp_param* )node->op.param_mem;
float width_scale = param->width_scale;
float height_scale = param->height_scale;
int batch_number = input_tensor->dims[0];
int inc = input_tensor->dims[1];
int inh = input_tensor->dims[2];
int inw = input_tensor->dims[3];
int output_width = inh * width_scale;
int output_height = inw * height_scale;
_interp_param op_param;
op_param.inc = inc;
op_param.inh = inh;
op_param.inw = inw;
op_param.batch_number = batch_number;
op_param.output_width = output_width;
op_param.output_height = output_height;
op_param.width_scale = width_scale;
op_param.height_scale = height_scale;
op_param.out_channel_size = output_height * output_width;
op_param.in_channel_size = inh * inw;
op_param.buf = ( int* )malloc(sizeof(int) * (param->output_width + param->output_height + param->output_width * 2 +
param->output_height * 2));
int ret = ref_interp_fp32(input, output, &op_param);
free(op_param.buf);
int ret = ref_interp_fp32(input_tensor, output_tensor, param);
return ret;
}
......
......@@ -180,8 +180,17 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
/* quant to uint8 */
for (int i = 0; i < out_size; i++)
{
for (int j = 0; j < on_in_size; j++)
output[i * on_in_size + j] = round((output_f[i * on_in_size + j] / output_scale) + output_zero);
{
int udata = (int)(round(output_f[i * on_in_size + j] / output_scale) + output_zero);
if (udata > 255)
udata = 255;
else if (udata < 0)
udata = 0;
output[i * on_in_size + j] = udata;
}
}
free(input_f);
free(output_f);
......
......@@ -78,18 +78,26 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
struct ir_tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]);
float* input_data = input_tensor->data;
float* output_data = output_tensor->data;
int out_slice = 0;
out_slice = output_tensor->dims[slice_axis];
for (int n = 0; n < num_slices; n++)
if (split_param->is_caffe)
{
int in_offset = (n * in_slice + slice_index) * slice_size;
int out_offset = n * out_slice * slice_size;
memcpy(output_data + out_offset, input_data + in_offset, slice_size * out_slice * sizeof(float));
memcpy(output_data, input_data, input_tensor->elem_num * sizeof(float));
}
else
{
int out_slice = 0;
out_slice = output_tensor->dims[slice_axis];
slice_index += out_slice;
for (int n = 0; n < num_slices; n++)
{
int in_offset = (n * in_slice + slice_index) * slice_size;
int out_offset = n * out_slice * slice_size;
memcpy(output_data + out_offset, input_data + in_offset, slice_size * out_slice * sizeof(float));
}
slice_index += out_slice;
}
}
return 0;
......
......@@ -33,25 +33,12 @@
int ref_tanh_fp32(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor, int num_thread)
{
int w = input_tensor->dims[3];
int h = output_tensor->dims[2];
int channels = input_tensor->dims[1];
int size = h * w;
int c_step = h * w;
float* input_data = input_tensor->data;
float* out_data = output_tensor->data;
#pragma omp parallel for num_threads(num_thread)
for (int q = 0; q < channels; q++)
for (int i = 0; i < input_tensor->elem_num; i++)
{
float* src = input_data + c_step * q;
float* dst = out_data + c_step * q;
for (int i = 0; i < size; i++)
{
dst[i] = tanhf(src[i]);
}
out_data[i] = tanhf(input_data[i]);
}
return 0;
......@@ -59,15 +46,11 @@ int ref_tanh_fp32(struct ir_tensor* input_tensor, struct ir_tensor* output_tenso
/*
 * Node initialisation hook: fills in the in-place map of the exec node.
 *
 * Writes 0 to inplace_map[0] and inplace_map[1] and sets inplace_map_num to 1.
 * NOTE(review): presumably this declares that input 0 may share memory with
 * output 0 - confirm against the exec_node definition.
 *
 * node_ops and exec_graph are not used.
 *
 * Returns 0.
 */
static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    (void)node_ops;
    (void)exec_graph;

    /* First (and only counted) in-place pair. */
    exec_node->inplace_map[0] = 0;
    exec_node->inplace_map[1] = 0;

    /* Number of in-place entries recorded above. */
    exec_node->inplace_map_num = 1;

    return 0;
}
/*
 * Node release hook: clears the in-place entry count of the exec node.
 *
 * Sets exec_node->inplace_map_num to 0; the inplace_map contents themselves
 * are left untouched. node_ops and exec_graph are not used.
 *
 * Returns 0.
 */
static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    (void)node_ops;
    (void)exec_graph;

    /* Drop any in-place mapping previously recorded for this node. */
    exec_node->inplace_map_num = 0;

    return 0;
}
......@@ -81,13 +64,6 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
if (input_tensor->data != output_tensor->data)
{
TLOG_ERR("input and output are not the same mem\n");
set_tengine_errno(EFAULT);
return -1;
}
ref_tanh_fp32(input_tensor, output_tensor, exec_graph->num_thread);
return 0;
......
......@@ -38,11 +38,7 @@ static int ref_unary_fp32(struct ir_tensor* input_tensor, struct ir_tensor* outp
float* in_data = input_tensor->data;
float* out_data = output_tensor->data;
int n = input_tensor->dims[0];
int c = input_tensor->dims[1];
int h = input_tensor->dims[2];
int w = input_tensor->dims[3];
int size = n * c * h * w;
int size = input_tensor->elem_num;
int type = param->type;
......
......@@ -81,6 +81,14 @@ static int infer_shape(struct ir_node* node)
for (int i = 0; i < node->input_num; i++)
{
struct ir_tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
if (axis < 0)
{
axis = input_tensor->dim_num + axis;
concat_param->axis = axis;
}
struct ir_tensor* input = get_ir_graph_tensor(graph, node->input_tensors[i]);
concat_shape += input->dims[axis];
}
......
......@@ -60,17 +60,20 @@ static int infer_shape(struct ir_node* node)
int i0_size = input0->elem_num;
int i1_size = input1->elem_num;
int dim_num = 0;
if (i0_size >= i1_size)
{
memcpy(output->dims, input0->dims, input0->dim_num * sizeof(int));
dim_num = input0->dim_num;
}
else if (i0_size < i1_size)
else
{
memcpy(output->dims, input1->dims, input0->dim_num * sizeof(int));
memcpy(output->dims, input1->dims, input1->dim_num * sizeof(int));
dim_num = input1->dim_num;
}
set_ir_tensor_shape(output, output->dims, 4);
set_ir_tensor_shape(output, output->dims, dim_num);
return 0;
}
......
......@@ -53,8 +53,16 @@ static int infer_shape(struct ir_node* node)
return -1;
}
param->output_height = in_h * param->height_scale;
param->output_width = in_w * param->width_scale;
if (param->height_scale != 0 && param->width_scale != 0)
{
param->output_height = in_h * param->height_scale;
param->output_width = in_w * param->width_scale;
}
else
{
param->height_scale = (float )param->output_height / (float )in_h;
param->width_scale = (float )param->output_width / (float )in_w;
}
int dim[4] = {0};
......
......@@ -60,7 +60,8 @@ static int infer_shape(struct ir_node* node)
else
{
int temp = 1;
push_vector_data(new_shape, ( void* )&temp);
if (i == 0)
push_vector_data(new_shape, ( void* )&temp);
}
in_idx++;
......@@ -146,6 +147,8 @@ static int infer_shape(struct ir_node* node)
new_shape_temp[i] = *a;
}
output->layout = input->layout;
set_ir_tensor_shape(output, new_shape_temp, get_vector_num(new_shape));
sys_free(new_shape_temp);
......
......@@ -41,6 +41,8 @@ static int infer_shape(struct ir_node* node)
struct ir_tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
struct ir_tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
output->layout = input->layout;
set_ir_tensor_shape(output, input->dims, input->dim_num);
return 0;
......
......@@ -34,10 +34,23 @@
// DEFINE_PARM_PARSE_ENTRY(tanh_param, negative_slope);
/*
 * Shape inference for a shape-preserving operator.
 *
 * Looks up the node's first input tensor and first output tensor in the
 * node's graph, copies the input layout onto the output, and gives the
 * output the same dims / dim_num as the input.
 *
 * The result of set_ir_tensor_shape() is not inspected.
 *
 * Returns 0.
 */
static int infer_shape(struct ir_node* node)
{
    struct ir_graph* graph = node->graph;
    struct ir_tensor* in_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
    struct ir_tensor* out_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);

    /* Output mirrors the input: same layout, same dimensions. */
    out_tensor->layout = in_tensor->layout;
    set_ir_tensor_shape(out_tensor, in_tensor->dims, in_tensor->dim_num);

    return 0;
}
/*
 * Operator initialisation: installs the shape-inference callback.
 *
 * Sets op->same_shape to 0 and points op->infer_shape at this file's
 * infer_shape(). The earlier contradictory assignments (same_shape = 1,
 * infer_shape = NULL) were overwritten immediately and have been dropped;
 * the final state of *op is unchanged.
 *
 * Returns 0.
 */
static int init_op(struct ir_op* op)
{
    op->same_shape = 0;
    op->infer_shape = infer_shape;

    return 0;
}
......
......@@ -40,6 +40,8 @@ static int infer_shape(struct ir_node* node)
struct ir_tensor* input = get_ir_graph_tensor(ir_graph, node->input_tensors[0]);
struct ir_tensor* output = get_ir_graph_tensor(ir_graph, node->output_tensors[0]);
output->layout = input->layout;
set_ir_tensor_shape(output, input->dims, input->dim_num);
return 0;
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: qtang@openailab.com
*/
#include <stdio.h>
#include <stdlib.h>
#include "sys_port.h"
#include "module.h"
#include "tengine_ir.h"
#include "tengine_errno.h"
#include "tengine_log.h"
#include "tengine_serializer.h"
#include "tm2_serializer.h"
#include "tengine_op.h"
/*
 * Op-type mapping callback passed to the serializer's register_op_loader()
 * together with tm2_load_const.
 *
 * The incoming op value is ignored; OP_CONST is returned unconditionally.
 */
static int const_op_map(int op)
{
    (void)op; /* the mapping does not depend on the argument */

    return OP_CONST;
}
/*
 * Loader callback registered for TM2_OPTYPE_CONST.
 *
 * Does no work: none of the arguments are read and nothing is written to
 * ir_node.
 *
 * Returns 0.
 */
static int tm2_load_const(struct ir_graph* ir_graph, struct ir_node* ir_node, const TM2_Node* tm_node,
                          const TM2_Operator* tm_op)
{
    (void)ir_graph;
    (void)ir_node;
    (void)tm_node;
    (void)tm_op;

    return 0;
}
/*
 * Registers the const-op loader with the serializer named "tengine".
 *
 * arg is not used.
 *
 * Returns 0 after calling register_op_loader() (its result is not inspected),
 * or -1 with an error log when find_serializer("tengine") yields NULL.
 */
static int reg_tm2_ops(void* arg)
{
    struct serializer* tengine_s = find_serializer("tengine");

    (void)arg;

    if (!tengine_s)
    {
        TLOG_ERR("tengine serializer has not been registered yet\n");
        return -1;
    }

    /* Bind TM2_OPTYPE_CONST (version argument 1) to the loader and op-map callbacks. */
    tengine_s->register_op_loader(tengine_s, TM2_OPTYPE_CONST, 1, tm2_load_const, const_op_map, NULL);

    return 0;
}
static int unreg_tm2_ops(void* arg)
{
struct serializer* tm2_s = find_serializer("tengine");
tm2_s->unregister_op_loader(tm2_s, TM2_OPTYPE_CONST, 1, tm2_load_const);
return 0;
}
/*
 * Hook the two routines above into the module system: reg_tm2_ops is listed
 * as an init entry named "reg_const_ops" and unreg_tm2_ops as an exit entry
 * named "unreg_const_ops", both at MOD_OP_LEVEL.
 * NOTE(review): when exactly these entries run is determined by the
 * REGISTER_MODULE_* macros, which are not visible here - confirm in module.h.
 */
REGISTER_MODULE_INIT(MOD_OP_LEVEL, "reg_const_ops", reg_tm2_ops);
REGISTER_MODULE_EXIT(MOD_OP_LEVEL, "unreg_const_ops", unreg_tm2_ops);
......@@ -62,10 +62,10 @@ static int tm2_load_deconv(struct ir_graph* ir_graph, struct ir_node* ir_node, c
deconv_param->dilation_h = tm_param->dilation_h;
deconv_param->dilation_w = tm_param->dilation_w;
/* TODO: get input_channel from tm_param */
deconv_param->group = deconv_param->group;
deconv_param->group = tm_param->group ;
deconv_param->num_output = tm_param->num_output ;
deconv_param->activation = tm_param->activation ;
return 0;
}
......
......@@ -48,11 +48,11 @@ static int tm2_load_interp(struct ir_graph* ir_graph, struct ir_node* ir_node, c
const char* mem_base = tm2_priv->base;
const TM2_InterpParam* tm_param = ( TM2_InterpParam* )(mem_base + tm_op->offset_t_param);
param->height_scale = tm_param->height_scale;
param->output_height = tm_param->output_height;
param->output_width = tm_param->output_width;
param->resize_type = tm_param->resize_type;
param->width_scale = tm_param->width_scale;
param->height_scale = tm_param->height_scale;
param->output_width = tm_param->output_width;
param->output_height = tm_param->output_height;
return 0;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册