Commit e06e9c71 authored by W wandongdong

add print info for GPU subgraph

Parent e6ffdeeb
@@ -273,6 +273,7 @@ int LiteSession::Init(Context *context) {
if (context_->device_ctx_.type == DT_GPU) {
auto opencl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
opencl_runtime->Init();
+ MS_LOG(INFO) << "Init OpenCL runtime.";
}
#endif
executor = new Executor();
......
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
- float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
- int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
+ __write_only image2d_t dst_data, int2 kernel_size,
+ int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+ float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
@@ -27,13 +28,14 @@ __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __globa
}
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
- res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+ res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max));
WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res);
}
__kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
- float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
- int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
+ __write_only image2d_t dst_data, int2 kernel_size,
+ int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+ float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
@@ -58,13 +60,14 @@ __kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global
}
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
- res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+ res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max));
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), res);
}
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
- float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
- int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
+ __write_only image2d_t dst_data, int2 kernel_size, int2 stride,
+ int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+ float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
@@ -81,7 +84,6 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 flt_p = filter[fx_c];
- // FLT4 src_p =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
r += TO_FLT4(src_p * flt_p);
}
@@ -89,13 +91,13 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl
}
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
- res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
- // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
+ res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max));
WRITE_IMAGE(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res);
}
__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
- float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
- int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
+ __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+ int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+ float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
@@ -120,13 +122,14 @@ __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4
}
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
- res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+ res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max));
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res;
}
__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
- float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
- int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
+ __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+ int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+ float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
@@ -151,13 +154,14 @@ __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *
}
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
- res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+ res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max));
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
}
__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
- float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
- int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
+ __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+ int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+ float relu_clip_min, float relu_clip_max) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
@@ -181,6 +185,6 @@ __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FL
}
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
- res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
+ res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max));
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
}
@@ -186,8 +186,8 @@ int Conv2dTransposeOpenCLKernel::Run() {
cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
int arg_cnt = 0;
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_);
- ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size);
ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride);
......
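The calls above now tag each kernel argument with an explicit memory type instead of letting the runtime infer it from the pointer. A minimal standalone sketch of that dispatch pattern, under the assumption that `SetKernelArg` branches on the tag; `MockKernel`, `BindBuffer`, and `BindImage` are illustrative stand-ins, not MindSpore APIs:

```cpp
#include <cstdint>
#include <iostream>

enum class MemType { BUF, IMG };

// Illustrative stand-in for a cl::Kernel wrapper; not the MindSpore class.
struct MockKernel {
  void BindBuffer(uint32_t idx) { std::cout << "arg " << idx << " -> cl::Buffer\n"; }
  void BindImage(uint32_t idx) { std::cout << "arg " << idx << " -> cl::Image2D\n"; }
};

// Hypothetical helper mirroring SetKernelArg(kernel_, idx, data, mem_type):
// the explicit tag decides whether the raw pointer is bound as a buffer
// object or an image object.
void SetKernelArg(MockKernel &kernel, uint32_t idx, const void *data,
                  MemType type = MemType::IMG) {
  (void)data;  // a real runtime would wrap `data` in the matching cl object
  if (type == MemType::BUF) {
    kernel.BindBuffer(idx);
  } else {
    kernel.BindImage(idx);
  }
}

int main() {
  MockKernel kernel;
  float weights[8] = {};
  SetKernelArg(kernel, 1, weights, MemType::BUF);  // packed weights: buffer
  SetKernelArg(kernel, 0, weights);                // activations default to image
  return 0;
}
```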
@@ -254,34 +254,34 @@ int ConvolutionOpenCLKernel::Run() {
arg_cn = 0;
cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES};
cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES};
- ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_);
+ ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data(), lite::opencl::MemType::IMG);
+ ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape);
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape);
arg_cn = 0;
cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES};
cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES};
- ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_);
- ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_);
- ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_);
+ ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+ ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+ ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape);
arg_cn = 0;
cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES};
cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES};
- ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_);
- ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_);
+ ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+ ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data(), lite::opencl::MemType::IMG);
+ ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape);
} else {
arg_cn = 0;
- ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_);
- ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_);
+ ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data(), lite::opencl::MemType::IMG);
+ ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data(), lite::opencl::MemType::IMG);
+ ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
+ ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
}
if (use_winograd_) {
@@ -594,9 +594,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
if (param->act_type_ == ActType_Relu) {
code += " acc = max(acc, (float4)(0.0f));\n";
code += " acc = max(acc, (FLT4)(0.0f));\n";
} else if (param->act_type_ == ActType_Relu6) {
code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n";
code += " acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n";
}
code +=
......
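The generated Winograd epilogue switches from the concrete `float4` type to the `FLT4` macro. Since kernels are built with `-DFLT4=half4` when fp16 is enabled (see the build-option change further down), a hard-coded `float4` would mismatch the half-precision accumulator. A small standalone sketch of the corrected codegen step; `EmitActivation` is a hypothetical name, not a MindSpore function:

```cpp
#include <iostream>
#include <string>

// Emits the activation epilogue using the FLT4 macro so the same generated
// source compiles under both -DFLT4=float4 (fp32) and -DFLT4=half4 (fp16).
std::string EmitActivation(bool relu6) {
  if (relu6) {
    return "    acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n";
  }
  return "    acc = max(acc, (FLT4)(0.0f));\n";
}

int main() {
  std::cout << EmitActivation(true);
  return 0;
}
```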
@@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/opencl/kernel/depthwise_conv2d.h"
+ #include <float.h>
#include <string>
#include <set>
#include <utility>
@@ -180,7 +181,8 @@ int DepthwiseConv2dOpenCLKernel::Run() {
std::vector<size_t> local;
GetLocalSize(0, global, &local);
- float relu_clip1 = 6.0;
+ std::map<ActType, std::pair<float, float>> relu_clips{
+     {ActType_No, {FLT_MIN, FLT_MAX}}, {ActType_Relu, {0.0, FLT_MAX}}, {ActType_Relu6, {0, 6.0}}};
cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_};
cl_int2 stride = {parameter->stride_h_, parameter->stride_w_};
cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_};
@@ -189,17 +191,19 @@ int DepthwiseConv2dOpenCLKernel::Run() {
cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
(cl_int)out_tensors_[0]->Batch()};
- ocl_runtime->SetKernelArg(kernel_, 1, packed_weight_);
- ocl_runtime->SetKernelArg(kernel_, 2, bias_data_);
- ocl_runtime->SetKernelArg(kernel_, 3, relu_clip1);
- ocl_runtime->SetKernelArg(kernel_, 5, kernel_size);
- ocl_runtime->SetKernelArg(kernel_, 6, stride);
- ocl_runtime->SetKernelArg(kernel_, 7, padding);
- ocl_runtime->SetKernelArg(kernel_, 8, dilation);
- ocl_runtime->SetKernelArg(kernel_, 9, src_size);
- ocl_runtime->SetKernelArg(kernel_, 10, dst_size);
- ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_, 4, out_tensors_[0]->Data());
+ int arg_cnt = 0;
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->Data());
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->Data());
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padding);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dilation);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, src_size);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dst_size);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first);
+ ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
......
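The single `relu_clip` scalar becomes a pair of clamp bounds, so every supported activation collapses to one `clamp(res, min, max)` in the kernel. A standalone sketch of that mapping; note that `FLT_MIN` is the smallest positive normalized float, not the most negative representable value, so `{FLT_MIN, FLT_MAX}` clamps negative outputs toward zero rather than passing them through:

```cpp
#include <cfloat>
#include <iostream>
#include <map>
#include <utility>

enum ActType { ActType_No, ActType_Relu, ActType_Relu6 };

int main() {
  // Mirrors the map in DepthwiseConv2dOpenCLKernel::Run(). Caveat: FLT_MIN is
  // ~1.18e-38, the smallest positive normal float; a true pass-through lower
  // bound would be -FLT_MAX.
  std::map<ActType, std::pair<float, float>> relu_clips{
      {ActType_No, {FLT_MIN, FLT_MAX}},
      {ActType_Relu, {0.0f, FLT_MAX}},
      {ActType_Relu6, {0.0f, 6.0f}}};
  auto clip = relu_clips[ActType_Relu6];
  std::cout << "min=" << clip.first << " max=" << clip.second << '\n';  // min=0 max=6
  return 0;
}
```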
@@ -163,8 +163,8 @@ int MatMulOpenCLKernel::Run() {
std::vector<size_t> global = {UP_ROUND(sizeCO.s[1], local[0]), 4};
int arg_count = 0;
ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_);
- ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_);
+ ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
+ ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCI);
ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO);
......
@@ -147,8 +147,10 @@ int ToFormatOpenCLKernel::Run() {
cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]};
cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
- ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
- ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
+ auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
+ auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF;
+ ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data(), src_mem_type);
+ ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data(), dst_mem_type);
ocl_runtime->SetKernelArg(kernel_, 2, gsize);
ocl_runtime->SetKernelArg(kernel_, 3, shape);
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
......
@@ -275,20 +275,23 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
// fp16 enable, kernel will use half and read_imageh and write_imageh.
build_options_str =
"-DFLT=half -DFLT4=half4 -DFLT16=half16 "
"-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4";
"-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4 ";
} else {
// fp16 not enable, kernel will use float and read_imagef and write_imagef.
build_options_str =
"-DFLT=float -DFLT4=float4 -DFLT16=float16 "
"-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4";
"-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4 ";
}
- build_options_str = std::accumulate(
-     build_options.begin(), build_options.end(), build_options_str,
-     [](const std::string &options, const std::string &option) -> std::string { return options + " " + option; });
+ auto build_options_ext = std::accumulate(
+     build_options.begin(), build_options.end(), std::string(""),
+     [](const std::string &options, const std::string &option) -> std::string {
+       auto res = options + " " + option;
+       return res;
+     });
build_options_str += default_build_opts_;
// program identifier = program_name + build_options
- std::string build_program_key = program_name + build_options_str;
+ std::string build_program_key = program_name + build_options_str + build_options_ext;
auto build_program_it = program_map_.find(build_program_key);
cl::Program program;
@@ -409,9 +412,7 @@ GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_
bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) {
auto it_source = g_opencl_program_map.find(program_name);
- if (it_source != g_opencl_program_map.end()) {
-   it_source->second = source;
- } else {
+ if (it_source == g_opencl_program_map.end()) {
g_opencl_program_map.emplace(program_name, source);
}
return true;
......
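The trailing space added to both base option strings and the new `build_options_ext` accumulator appear intended to keep the per-kernel options cleanly separated from the defaults when everything is concatenated into the program cache key. A simplified, standalone sketch of the key construction under that reading; names are shortened and this is not the exact MindSpore code:

```cpp
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

int main() {
  // Base options now end with a space so appended options stay separated.
  std::string base = "-DFLT=float -DFLT4=float4 ";
  std::vector<std::string> extra = {"-DRELU"};

  // Per-kernel options are folded into their own string, as in the patch.
  std::string ext = std::accumulate(
      extra.begin(), extra.end(), std::string(""),
      [](const std::string &opts, const std::string &opt) { return opts + " " + opt; });

  // program identifier = program_name + base options + extra options
  std::string key = "conv2d" + base + ext;
  std::cout << key << '\n';  // conv2d-DFLT=float -DFLT4=float4  -DRELU
  return 0;
}
```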
@@ -16,6 +16,8 @@
#include "src/scheduler.h"
#include <vector>
#include <string>
+ #include <algorithm>
#include "include/errorcode.h"
#include "src/kernel_registry.h"
#include "src/common/graph_util.h"
@@ -140,7 +142,7 @@ int Scheduler::InitOp2Kernel(const lite::Model *model, std::vector<tensor::Tenso
outputs.emplace_back(tensors->at(size_t(outIndexes->GetAs<uint32_t>(j))));
}
auto *primitive = model->GetOp(cNode->name()->str());
- auto *kernel = this->ScheduleNode(inputs, outputs, primitive);
+ auto *kernel = this->ScheduleNode(inputs, outputs, primitive, cNode);
if (nullptr == kernel) {
MS_LOG(ERROR) << "ScheduleNode return nullptr, name: " << cNode->name()->str()
<< ", type: " << schema::EnumNamePrimitiveType(cNode->primitive()->value_type());
@@ -176,6 +178,7 @@ void Scheduler::ConstructSubgraphs(std::vector<kernel::LiteKernel *> *kernels) {
}
std::vector<kernel::LiteKernel *> subgraph_kernels;
+ size_t sub_cnt{0};
for (auto temp_kernels : sub_kernels_list) {
kernel::KERNEL_ARCH arch = temp_kernels.front()->desc().arch;
if (arch == kernel::KERNEL_ARCH::kCPU) {
@@ -194,7 +197,10 @@ void Scheduler::ConstructSubgraphs(std::vector<kernel::LiteKernel *> *kernels) {
} else {
auto subgraph_kernel = CreateSubKernel(temp_kernels, arch);
subgraph_kernels.emplace_back(subgraph_kernel);
+ std::string arch_name = (arch == kernel::KERNEL_ARCH::kGPU) ? "GPU" : "NPU";
+ MS_LOG(INFO) << arch_name << " subgraph id" << sub_cnt << " created.";
}
+ ++sub_cnt;
}
kernels->clear();
kernels->insert(kernels->begin(), subgraph_kernels.begin(), subgraph_kernels.end());
@@ -223,7 +229,7 @@ kernel::LiteKernel *Scheduler::CreateSubKernel(const std::vector<kernel::LiteKer
kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<tensor::Tensor *> &in_tensors,
const std::vector<tensor::Tensor *> &out_tensors,
- const mindspore::lite::PrimitiveC *primitive) {
+ const mindspore::lite::PrimitiveC *primitive, const schema::CNode *cnode) {
MS_ASSERT(nullptr != primitive);
auto data_type = in_tensors.front()->data_type();
kernel::KernelKey desc{kernel::KERNEL_ARCH::kCPU, data_type, static_cast<schema::PrimitiveType>(primitive->Type())};
@@ -233,6 +239,10 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<tensor::Tensor *>
if (nullptr != kernel) {
kernel->set_desc(desc);
return kernel;
+ } else {
+   MS_LOG(ERROR) << "Not supported GPU Op "
+                 << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(primitive->Type())) << " "
+                 << (cnode->name()->str());
}
}
......
@@ -35,7 +35,8 @@ class Scheduler {
protected:
kernel::LiteKernel *ScheduleNode(const std::vector<tensor::Tensor *> &in_tensors,
const std::vector<tensor::Tensor *> &out_tensors,
- const mindspore::lite::PrimitiveC *primitive);
+ const mindspore::lite::PrimitiveC *primitive,
+ const schema::CNode *cnode);
private:
int InitOp2Kernel(const lite::Model *model, std::vector<tensor::Tensor *> *tensors,
......
@@ -207,7 +207,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
- DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
+ DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
}
TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
@@ -279,7 +279,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
- DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
+ DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4);
}
TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
@@ -324,7 +324,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686,
2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
- DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
+ DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
+ // delete conv_param;
}
TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
@@ -396,7 +397,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933,
1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
- DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
+ DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4);
}
TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) {
@@ -443,7 +444,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) {
2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988};
lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
- DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
+ DepthWiseTestMain<float16_t, float16_t>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
kNumberTypeFloat16, true, 1e-2);
}
@@ -517,7 +518,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) {
1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203};
lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true);
- DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
+ DepthWiseTestMain<float16_t, float16_t>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4,
kNumberTypeFloat16, true, 1e-2);
}
......
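The tests switch from `conv_param.get()` to `conv_param.release()`, handing ownership of the parameter to `DepthWiseTestMain`, which presumably frees it itself; keeping the object owned by the `unique_ptr` as well would double-free it. A minimal standalone sketch of the ownership transfer, assuming the callee deletes the parameter (`Consume` is an illustrative stand-in):

```cpp
#include <memory>

struct ConvParameter {
  int kernel_h_ = 3;
  int kernel_w_ = 3;
};

// Assumed callee contract (mirrors DepthWiseTestMain): takes ownership of
// param and deletes it when the test finishes.
void Consume(ConvParameter *param) { delete param; }

int main() {
  auto conv_param = std::make_unique<ConvParameter>();
  // release() relinquishes ownership, so the unique_ptr destructor will not
  // delete the object a second time after Consume() has freed it.
  Consume(conv_param.release());
  return 0;
}
```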