Unverified commit 2d4e9c13, authored by xiebaiyuan, committed by GitHub

[LITE][OPENCL] conv2d_1x1_image, choose simple kernel in some cases (#2771)

* [LITE][OPENCL] conv2d_1x1_image: choose the simple kernel in some cases for OpenCL, test=develop

* [LITE][OPENCL] conv2d_1x1_image: add loop test, test=develop

Parent 0197977d
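Summary of the change: the conv2d_1x1 compute kernel now registers one of two OpenCL kernels. When the input channel count is a multiple of 4 (presumably because every 4-channel block of the image layout is then fully populated), the new conv2d_1x1_simple kernel is used; otherwise the original conv2d_1x1 kernel, which has to handle a partial last channel block, is kept. A minimal host-side sketch of that dispatch, paraphrasing the diff below (the standalone helper function is illustrative only; in the real code the check is done inline on param.x->dims()[1]):

    #include <cstdint>
    #include <string>

    // Sketch: pick the OpenCL kernel name based on the input channel count.
    // `input_channels` corresponds to param.x->dims()[1] in the compute class below.
    std::string PickConv1x1Kernel(int64_t input_channels) {
      if (input_channels % 4 == 0) {
        // Every 4-channel block of the image layout is full: use the simple kernel.
        return "conv2d_1x1_simple";
      }
      // Otherwise keep the general kernel, which masks the partial last channel block.
      return "conv2d_1x1";
    }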
@@ -214,3 +214,172 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
      WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
    }
}

__kernel void conv2d_1x1_simple(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
CL_DTYPE4 output0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#elif defined(BIASE_ELE)
CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#else
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
CL_DTYPE4 output2 = 0.0f;
CL_DTYPE4 output3 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
CL_DTYPE4 weight0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
CL_DTYPE4 weight1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
CL_DTYPE4 weight2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
CL_DTYPE4 weight3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
}
#ifdef BATCH_NORM
output0 = output0 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation_type4(output0);
output1 = activation_type4(output1);
output2 = activation_type4(output2);
output3 = activation_type4(output3);
#endif
if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}
if (out_w1 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
}
if (out_w2 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
}
if (out_w3 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
@@ -13,6 +13,7 @@
// limitations under the License.

#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

@@ -45,9 +46,15 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
      build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
    }
    auto& context = ctx_->As<OpenCLContext>();
+   if (param.x->dims()[1] % 4 == 0) {
+     context.cl_context()->AddKernel(kernel_func_name_simple_,
+                                     "image/conv2d_1x1_kernel.cl",
+                                     build_options_);
+   } else {
      context.cl_context()->AddKernel(
          kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
+   }
  }

  void Run() override {
    const auto& param = *param_.get_mutable<param_t>();

@@ -135,7 +142,11 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
+   if (input_dims[1] % 4 == 0) {
+     kernel_key << kernel_func_name_simple_ << build_options_;
+   } else {
      kernel_key << kernel_func_name_ << build_options_;
+   }
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int maped_w = maptofactor(w, 4);

@@ -215,6 +226,7 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
 private:
  std::string kernel_func_name_{"conv2d_1x1"};
+ std::string kernel_func_name_simple_{"conv2d_1x1_simple"};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
@@ -13,12 +13,14 @@
// limitations under the License.

#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
+#include "lite/utils/logging.h"

namespace paddle {
namespace lite {
@@ -106,7 +108,6 @@ static void conv_basic(const Dtype1* din,
      }
    }
  }

TEST(conv2d_1x1, compute) {
  // conv infos
  const int ksize = 1;
@@ -114,30 +115,40 @@ TEST(conv2d_1x1, compute) {
  const int pad = 0;
  const int group = 1;
  const int dilation = 0;
  // int loop_cnt = 0;

- const bool bias_flag = true;
- const bool relu_flag = true;
- const int batch_size = 8;
- const int oc = 64;
- const int ih = 28; /*int iw = ih;*/
- const int iw = 28;
- const int ic = 63;
+ #ifdef LOOP_TEST
+ for (int batch_size = 1; batch_size < 4; ++batch_size) {
+   for (int oc = 4; oc < 10; oc += 1) {        // oc
+     for (int ih = 4; ih < 9; ih += 1) {       // ih
+       for (int iw = 4; iw < 10; iw += 1) {    // iw
+         for (int ic = 4; ic < 10; ic += 1) {  // ic
+           for (bool bias_flag : {true, false}) {
+             for (bool relu_flag : {true, false}) {
+ #else
+ const int batch_size = 1;
+ const int oc = 4;
+ const int ih = 8;
+ const int iw = 8;
+ const int ic = 4;
+ const bool bias_flag = false;
+ const bool relu_flag = false;
+ #endif

  const int oh = ih;
  const int ow = iw;

- LOG(INFO) << "to get kernel ...";
- auto kernels = KernelRegistry::Global().Create("conv2d_1x1",
+ VLOG(4) << "to get kernel ...";
+ auto kernels =
+     KernelRegistry::Global().Create("conv2d_1x1",
                                      TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
- LOG(INFO) << "created conv2d_1x1 kernel";
- LOG(INFO) << "prepare kernel ------";
+ VLOG(4) << "created conv2d_1x1 kernel";
+ VLOG(4) << "prepare kernel ------";

  lite::Tensor input, filter, bias, output;
  operators::ConvParam param;
@@ -159,7 +170,8 @@ TEST(conv2d_1x1, compute) {
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

- std::unique_ptr<KernelContext> conv_1x1_context(new KernelContext);
+ std::unique_ptr<KernelContext> conv_1x1_context(
+     new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(conv_1x1_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(conv_1x1_context));
@@ -195,46 +207,18 @@ TEST(conv2d_1x1, compute) {
  size_t filter_image_width = ksize * ((oc + 3) / 4);
  size_t filter_image_height = ic * ksize;

- auto* input_data = input.mutable_data<float, cl::Image2D>(input_image_width,
-                                                           input_image_height);
- auto* filter_data = filter.mutable_data<float, cl::Image2D>(
-     filter_image_width, filter_image_height);
- bias.mutable_data<float, cl::Image2D>(bias_image_width, bias_image_height);
- auto* bias_data = bias.mutable_data<float, cl::Image2D>(bias_image_width,
-                                                         bias_image_height);
  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};

- LOG(INFO) << "map input ...";
- auto* mapped_input =
-     static_cast<float*>(TargetWrapperCL::MapImage(input_data,
-                                                   input_image_width,
-                                                   input_image_height,
-                                                   cl_image2d_row_pitch,
-                                                   cl_image2d_slice_pitch));
- LOG(INFO) << "map filter ...";
- auto* mapped_filter =
-     static_cast<float*>(TargetWrapperCL::MapImage(filter_data,
-                                                   filter_image_width,
-                                                   filter_image_height,
-                                                   cl_image2d_row_pitch,
-                                                   cl_image2d_slice_pitch));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> gen(-5, 5);

  std::vector<float> input_v(batch_size * ic * ih * iw);
  std::vector<float> filter_v(oc * ic * ksize * ksize);
  std::vector<float> output_v(batch_size * oc * ih * iw);
  std::vector<float> bias_v(oc);

- float* input_v_data = &input_v[0];
- float* filter_v_data = &filter_v[0];
- float* output_v_data = &output_v[0];
- float* bias_v_data = &bias_v[0];
- LOG(INFO) << "gen input and filter ...";
+ VLOG(4) << "gen input and filter ...";

  for (auto& i : input_v) {
    i = gen(engine);
@@ -243,89 +227,138 @@ TEST(conv2d_1x1, compute) {
    f = gen(engine);
  }

- LOG(INFO) << "after gen input and filter ...";
- LOG(INFO) << "input_v.size(): " << input_v.size();
- LOG(INFO) << "filter_v.size(): " << filter_v.size();
- LOG(INFO) << "output_v.size(): " << output_v.size();
- LOG(INFO) << "bias_v.size(): " << bias_v.size();
- LOG(INFO) << "input_dim.production(): " << input_dim.production();
- LOG(INFO) << "filter_dim.production(): " << filter_dim.production();
- LOG(INFO) << "out_dim.production(): " << out_dim.production();
- LOG(INFO) << "bias_dim.production(): " << bias_dim.production();
- LOG(INFO) << "4 * input_image_height * input_image_width: "
+ VLOG(4) << "after gen input and filter ...";
+ VLOG(4) << "input_v.size(): " << input_v.size();
+ VLOG(4) << "filter_v.size(): " << filter_v.size();
+ VLOG(4) << "output_v.size(): " << output_v.size();
+ VLOG(4) << "bias_v.size(): " << bias_v.size();
+ VLOG(4) << "input_dim.production(): " << input_dim.production();
+ VLOG(4) << "filter_dim.production(): "
+         << filter_dim.production();
+ VLOG(4) << "out_dim.production(): " << out_dim.production();
+ VLOG(4) << "bias_dim.production(): " << bias_dim.production();
+ VLOG(4) << "4 * input_image_height * input_image_width: "
          << 4 * input_image_height * input_image_width;
- LOG(INFO) << "4 * filter_image_width * filter_image_height: "
+ VLOG(4) << "4 * filter_image_width * filter_image_height: "
          << 4 * filter_image_width * filter_image_height;

  CHECK(input_dim.production() == input_v.size());
- CHECK_LE(input_dim.production(), 4 * input_image_height * input_image_width);
+ CHECK_LE(input_dim.production(),
+          4 * input_image_height * input_image_width);
  CHECK(filter_dim.production() == filter_v.size());
  CHECK_LE(filter_dim.production(),
           4 * filter_image_width * filter_image_height);

  paddle::lite::CLImageConverterDefault default_convertor;
- LOG(INFO) << "set mapped input ...";
+ VLOG(4) << "set mapped input ...";
- default_convertor.NCHWToImage(input_v_data, mapped_input, input_dim);
- LOG(INFO) << "set mapped filter ...";
+ std::vector<float> x_image_v(
+     input_image_width * input_image_height * 4);  // 4 : RGBA
+ std::vector<float> filter_image_v(
+     filter_image_width * filter_image_height * 4);  // 4 : RGBA
+ std::vector<float> bias_image_v(
+     bias_image_width * bias_image_height * 4);  // 4 : RGBA
+ std::vector<float> out_image_v(
+     out_image_width * out_image_height * 4);  // 4 : RGBA
+
+ default_convertor.NCHWToImage(
+     input_v.data(), x_image_v.data(), input_dim);
+ /* for (int j = 0; j < input_v.size(); j += 1) {
+      // VLOG(4) << "input_v input[" << j << "]: " << input_v.data()[j];
+      std::cout << j << " " << input_v.data()[j] << std::endl;
+    }
+    std::cout << std::endl;
+
+    for (int j = 0; j < x_image_v.size(); j += 1) {
+      // VLOG(4) << "x_image_v input[" << j << "]: " << x_image_v.data()[j];
+      std::cout << j << " " << x_image_v.data()[j] << std::endl;
+    } */
+
+ VLOG(4) << "set mapped filter ...";
  paddle::lite::CLImageConverterNWBlock nw_convertor;
- nw_convertor.NCHWToImage(filter_v_data, mapped_filter, filter_dim);
-
- LOG(INFO) << "resize output ...";
- output.Resize(out_dim);
-
- // cpu conv basic calc
- lite::Tensor out_ref;
- out_ref.Resize(out_dim);
-
- float* mapped_bias = nullptr;
+ nw_convertor.NCHWToImage(
+     filter_v.data(), filter_image_v.data(), filter_dim);
+
+ auto* input_image2d = input.mutable_data<float, cl::Image2D>(
+     input_image_width, input_image_height, x_image_v.data());
+ auto* filter_image2d = filter.mutable_data<float, cl::Image2D>(
+     filter_image_width,
+     filter_image_height,
+     filter_image_v.data());

  if (bias_flag) {
-   mapped_bias =
-       static_cast<float*>(TargetWrapperCL::MapImage(bias_data,
-                                                     bias_image_width,
-                                                     bias_image_height,
-                                                     cl_image2d_row_pitch,
-                                                     cl_image2d_slice_pitch));
+   nw_convertor.NCHWToImage(
+       filter_v.data(), filter_image_v.data(), filter_dim);
    for (int i = 0; i < bias_dim.production(); ++i) {
      bias_v[i] = static_cast<int>(gen(engine));
    }
    CLImageConverterFolder folder_convertor;
-   folder_convertor.NCHWToImage(bias_v_data, mapped_bias, bias_dim);
+   folder_convertor.NCHWToImage(
+       bias_v.data(), bias_image_v.data(), bias_dim);
+   auto* bias_data = bias.mutable_data<float, cl::Image2D>(
+       bias_image_width, bias_image_height, bias_image_v.data());
  }

- LOG(INFO) << "prepare kernel ready";
- LOG(INFO) << "kernel launch ...";
+ VLOG(4) << "resize output ...";
+ output.Resize(out_dim);
+
+ // cpu conv basic calc
+ lite::Tensor out_ref;
+ out_ref.Resize(out_dim);
+
+ VLOG(4) << "prepare kernel ready";
+ VLOG(4) << "kernel launch ...";
  kernel->Launch();

- LOG(INFO) << "mutable output ...";
- auto* output_data = output.mutable_data<float, cl::Image2D>(out_image_width,
-                                                             out_image_height);
+ VLOG(4) << "mutable output ...";
+ auto* output_image2d = output.mutable_data<float, cl::Image2D>(
+     out_image_width, out_image_height);

  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto* out_ptr = param.output->data<float, cl::Image2D>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
-   VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
+   VLOG(4) << "--- Find the sync event for the target cl "
+              "tensor. ---";
    auto& event = *(it->second);
    event.wait();
  } else {
-   LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
+   LOG(FATAL) << "Could not find the sync event for the target "
+                 "cl tensor.";
  }

- auto* mapped_output =
-     static_cast<float*>(TargetWrapperCL::MapImage(output_data,
-                                                   out_image_width,
-                                                   out_image_height,
-                                                   cl_image2d_row_pitch,
-                                                   cl_image2d_slice_pitch));
- LOG(INFO) << "mutable_data out_ref_data: ";
+ TargetWrapperCL::ImgcpySync(out_image_v.data(),
+                             output.data<float, cl::Image2D>(),
+                             out_image_width,
+                             out_image_height,
+                             cl_image2d_row_pitch,
+                             cl_image2d_slice_pitch,
+                             IoDirection::DtoH);
+ DDim out_image_shape =
+     default_convertor.InitImageDimInfoWith(output.dims());
+ default_convertor.ImageToNCHW(out_image_v.data(),
+                               output_v.data(),
+                               out_image_shape,
+                               output.dims());
+ VLOG(4) << "mutable_data out_ref_data: ";

  // run cpu ref
  auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));

- LOG(INFO) << " conv_basic beigin ..... ";
- conv_basic<float, float>(input_v_data,
+ VLOG(4) << " conv_basic beigin ..... ";
+ conv_basic<float, float>(input_v.data(),
                           out_ref_data,
                           batch_size,
                           oc,
@@ -334,8 +367,8 @@ TEST(conv2d_1x1, compute) {
                           ic,
                           ih,
                           iw,
-                          filter_v_data,
-                          bias_v_data,  // mapped_bias,
+                          filter_v.data(),
+                          bias_v.data(),  // mapped_bias,
                           group,
                           ksize,
                           ksize,
@@ -347,29 +380,31 @@ TEST(conv2d_1x1, compute) {
                           pad,
                           bias_flag,
                           relu_flag);
- LOG(INFO) << " conv_basic end ..... ";
- LOG(INFO) << " out_dim: " << out_dim;
- const DDim& out_image_dims = lite::DDim{
-     std::vector<int64_t>({static_cast<int64_t>(out_image_width),
-                           static_cast<int64_t>(out_image_height)})};
- default_convertor.ImageToNCHW(
-     mapped_output, output_v_data, out_image_dims, out_dim);
+ VLOG(4) << " conv_basic end ..... ";
+ VLOG(4) << " out_dim: " << out_dim;
+ const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
+     {static_cast<int64_t>(out_image_width),
+      static_cast<int64_t>(out_image_height)})};

  for (int i = 0; i < out_dim.production(); i++) {
-   EXPECT_NEAR(output_v_data[i], out_ref_data[i], 1e-3);
-   if (abs(output_v_data[i] - out_ref_data[i]) > 1e-3) {
+   EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
+   if (abs(output_v[i] - out_ref_data[i]) > 1e-2) {
      LOG(FATAL) << "error idx:" << i;
    }
  }

- TargetWrapperCL::Unmap(output_data, mapped_output);
- TargetWrapperCL::Unmap(filter_data, mapped_filter);
- TargetWrapperCL::Unmap(input_data, mapped_input);
- if (bias_flag) {
-   if (mapped_bias) {
-     TargetWrapperCL::Unmap(bias_data, mapped_bias);
-   }
- }
+ #ifdef LOOP_TEST
+               }
+             }
+           }
+         }
+       }
+     }
+   }
+ #else
+ // nothing to do.
+ #endif
}

}  // namespace lite
......