Commit cc46d43c, authored by Yuan Shuai, committed by GitHub


[LITE][OPENCL] Add relu image2d kernel unit test, Fix conv2d_1x1, relu, layout using new Image2D Layout (#2564)

* Add 3 layouts for OpenCL image. test=develop

* Add relu image2d test. test=develop
Parent commit: cff0c364
@@ -11,8 +11,8 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
+    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
+    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
add_dependencies(cl_wrapper opencl_clhpp)
@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace lite {
static void CopyImageData(CLContext* context,
                          const CLImage& cl_image,
                          float* out) {
@@ -51,119 +52,5 @@ bool InitOpenCLRuntime(std::string cl_path) {
  return runtime->IsInitSuccess();
}
void elementwise_add(CLContext* context,
const float* in,
const DDim& in_dim,
const float* bias,
const DDim& bias_dim,
float* out,
const DDim& out_dim) {
if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
    LOG(FATAL) << "Error: invalid bias dims";
return;
}
auto kernel = bias_dim.size() == 1 ? context->GetKernel("channel_add")
: context->GetKernel("elementwise_add");
CLImage in_image;
in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(context->GetContext());
  VLOG(3) << " --- Input image: " << in_image << " --- ";
CLImage bias_image;
bias_image.set_tensor_data(bias, bias_dim);
bias_image.InitCLImage(context->GetContext());
VLOG(3) << " --- Bias image: " << bias_image << " --- ";
CLImage out_image;
out_image.InitEmptyImage(context->GetContext(), out_dim);
cl_int status;
status = kernel.setArg(0, *in_image.cl_image());
CL_CHECK_FATAL(status);
status = kernel.setArg(1, *bias_image.cl_image());
CL_CHECK_FATAL(status);
status = kernel.setArg(2, *out_image.cl_image());
CL_CHECK_FATAL(status);
if (bias_dim.size() == 1) {
int tensor_w = in_dim[3];
status = kernel.setArg(3, tensor_w);
CL_CHECK_FATAL(status);
}
size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height};
status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status);
VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(context, out_image, out);
}
void pool(CLContext* context,
const std::string pooling_type,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int ksize_h,
const int ksize_w,
const float* in,
const DDim& in_dim,
float* out,
const DDim& out_dim) {
auto kernel =
context->GetKernel(string_format("pool_%s", pooling_type.c_str()));
CLImage in_image;
in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(context->GetContext());
  VLOG(3) << " --- Input image: " << in_image << " --- ";
CLImage out_image;
out_image.InitEmptyImage(context->GetContext(), out_dim);
auto global_work_size = context->DefaultWorkSize(out_image);
auto* in_converter =
dynamic_cast<CLImageConverterNormal*>(in_image.image_converter());
auto* out_converter =
dynamic_cast<CLImageConverterNormal*>(out_image.image_converter());
const int in_height = in_converter->HeightOfOneBlock();
const int in_width = in_converter->WidthOfOneBlock();
const int out_height = out_converter->HeightOfOneBlock();
const int out_width = out_converter->WidthOfOneBlock();
cl_int status;
status = kernel.setArg(0, in_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, in_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, out_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, out_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(4, pad_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(5, pad_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(6, stride_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(7, stride_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(8, ksize_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(9, ksize_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(10, *in_image.cl_image());
CL_CHECK_FATAL(status);
status = kernel.setArg(11, *out_image.cl_image());
CL_CHECK_FATAL(status);
status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status);
VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(context, out_image, out);
}
} // namespace lite
} // namespace paddle
@@ -23,30 +23,5 @@ namespace lite {
bool InitOpenCLRuntime(std::string cl_path);
/// An elementwise_add method to embed OpenCL logic inside, it is used as a
/// black box so that the framework can remain simple.
/// NOTE Currently, these methods are quite expensive; we will optimize them
/// later.
void elementwise_add(CLContext* context,
const float* in,
const DDim& in_dim,
const float* bias,
const DDim& bias_dim,
float* out,
const DDim& out_dim);
void pool(CLContext* context,
const std::string pooling_type,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int ksize_h,
const int ksize_w,
const float* in,
const DDim& in_dim,
float* out,
const DDim& out_dim);
} // namespace lite
} // namespace paddle
@@ -41,9 +41,10 @@ TEST(cl_test, runtime_test) {
  auto &context = runtime->context();
  auto program = runtime->CreateProgram(
      context,
-     runtime->cl_path() + "/cl_kernel/" + "image/elementwise_add_kernel.cl");
+     runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl");
  auto event = runtime->CreateEvent(context);
- CHECK(runtime->BuildProgram(program.get()));
+ const std::string build_option("-DCL_DTYPE_float");
+ CHECK(runtime->BuildProgram(program.get(), build_option));
}
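Note on the build options used throughout this commit: they move from a valued macro (-DCL_DTYPE=float) to a flag-style macro (-DCL_DTYPE_float). A minimal sketch of how a shared kernel header can dispatch on such flags; the exact macro layout is an assumption, not a quote of cl_common.h:

    /* Sketch: dtype dispatch on flag-style build options. */
    #ifdef CL_DTYPE_float
    #define CL_DTYPE float          /* built with -DCL_DTYPE_float */
    #endif
    #ifdef CL_DTYPE_half
    #pragma OPENCL EXTENSION cl_khr_fp16 : enable
    #define CL_DTYPE half           /* built with -DCL_DTYPE_half */
    #endif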
TEST(cl_test, context_test) {
@@ -51,9 +52,11 @@ TEST(cl_test, context_test) {
  CHECK(runtime->IsInitSuccess());
  runtime->set_cl_path(FLAGS_cl_path);
  CLContext context;
- context.AddKernel("pool_max", "image/pool_kernel.cl", "");
- context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", "");
- context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", "");
+ context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float");
+ context.AddKernel(
+     "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
+ context.AddKernel(
+     "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
}
TEST(cl_test, kernel_test) {
@@ -61,9 +64,11 @@ TEST(cl_test, kernel_test) {
  CHECK(runtime->IsInitSuccess());
  runtime->set_cl_path(FLAGS_cl_path);
  std::unique_ptr<CLContext> context(new CLContext);
- context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
- context->AddKernel("pool_max", "image/pool_kernel.cl");
- context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
+ context->AddKernel(
+     "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
+ context->AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float");
+ context->AddKernel(
+     "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
  auto kernel = context->GetKernel(2);
  std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
@@ -115,203 +120,12 @@ TEST(cl_test, kernel_test) {
  LOG(INFO) << out_image;
}
TEST(cl_test, channel_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});
std::unique_ptr<float[]> bias_data(new float[16]);
for (int i = 0; i < 16; i++) {
bias_data[i] = dist(engine);
}
std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
float b = bias_data[j];
for (int k = 0; k < 256 * 512; k++) {
int index = (i * 16 + j) * 256 * 512 + k;
out_ref[index] = in_data[index] + b;
}
}
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLRuntime(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL runtime.";
std::unique_ptr<CLContext> context(new CLContext);
context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
context->AddKernel("channel_add", "image/channel_add_kernel.cl");
elementwise_add(context.get(),
in_data.get(),
in_dim,
bias_data.get(),
bias_dim,
out.get(),
out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " ";
}
std::cout << std::endl;
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
}
TEST(cl_test, elementwise_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> bias_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
bias_data[i] = dist(engine);
}
std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
out_ref[i] = in_data[i] + bias_data[i];
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLRuntime(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL runtime.";
std::unique_ptr<CLContext> context(new CLContext);
context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
context->AddKernel("channel_add", "image/channel_add_kernel.cl");
elementwise_add(context.get(),
in_data.get(),
in_dim,
bias_data.get(),
bias_dim,
out.get(),
out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " ";
}
std::cout << std::endl;
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
}
void pool_avg(const int padding_height,
const int padding_width,
const int stride_height,
const int stride_width,
const int ksize_height,
const int ksize_width,
const float *input_data,
const DDim &in_dim,
float *output_data,
const DDim &out_dim) {
const int batch_size = in_dim[0];
const int input_height = in_dim[2];
const int input_width = in_dim[3];
const int output_channels = out_dim[1];
const int output_height = out_dim[2];
const int output_width = out_dim[3];
const size_t input_spatial_size = input_height * input_width;
const size_t output_spatial_size = output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
int channel = i * output_channels + c;
const float *input_ptr = input_data + channel * input_spatial_size;
float *output_ptr = output_data + channel * output_spatial_size;
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
float val = 0.f;
int count = 0;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += input_ptr[h * input_width + w];
++count;
}
}
output_ptr[ph * output_width + pw] =
(count > 0) ? val * (1.f / count) : 0.f;
}
}
}
}
}
TEST(cl_test, pool_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 1024, 7, 7});
std::unique_ptr<float[]> in_data(new float[4 * 1024 * 7 * 7]);
for (int i = 0; i < 4 * 1024 * 7 * 7; i++) {
in_data[i] = dist(engine);
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 1024, 1, 1});
std::unique_ptr<float[]> out(new float[4 * 1024 * 1 * 1]);
std::unique_ptr<float[]> out_ref(new float[4 * 1024 * 1 * 1]);
bool status = InitOpenCLRuntime(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL runtime.";
std::unique_ptr<CLContext> context(new CLContext);
context->AddKernel("pool_max", "image/pool_kernel.cl");
context->AddKernel("pool_avg", "image/pool_kernel.cl");
pool(context.get(),
"avg",
0,
0,
1,
1,
7,
7,
in_data.get(),
in_dim,
out.get(),
out_dim);
pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim);
for (int i = 0; i < 4 * 1024 * 1 * 1; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
}
TEST(cl_test, target_wrapper_buffer_test) {
  bool inited = InitOpenCLRuntime(FLAGS_cl_path);
  CHECK(inited) << "Fail to initialize OpenCL runtime.";
  std::unique_ptr<CLContext> context(new CLContext);
  std::string kernel_name = "elementwise_add";
- std::string build_options = "-DCL_DTYPE=float";
+ std::string build_options = "-DCL_DTYPE_float";
  context->AddKernel(
      kernel_name, "buffer/elementwise_add_kernel.cl", build_options);
  std::vector<float> h_a;
@@ -396,10 +210,13 @@ TEST(cl_test, target_wrapper_buffer_test) {
TEST(cl_test, target_wrapper_image_test) {
  const size_t cl_image2d_width = 28;
  const size_t cl_image2d_height = 32;
+ const size_t cl_image2d_elem_size =
+     cl_image2d_width * cl_image2d_height * 4;  // 4 for RGBA channels
  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};
  auto *d_image = static_cast<cl::Image2D *>(
      TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
  // Map/Unmap test
  auto *h_image =
      static_cast<float *>(TargetWrapperCL::MapImage(d_image,
@@ -407,15 +224,11 @@ TEST(cl_test, target_wrapper_image_test) {
                                                     cl_image2d_height,
                                                     cl_image2d_row_pitch,
                                                     cl_image2d_slice_pitch));
- CHECK_EQ(
-     cl_image2d_row_pitch,
-     cl_image2d_width * 4 *
-         4);  // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes)
- CHECK_EQ(cl_image2d_slice_pitch, 0);  // slice_pitch = 0
+ CHECK_EQ(cl_image2d_slice_pitch, 0);
  LOG(INFO) << "cl_image2d_row_pitch = " << cl_image2d_row_pitch
            << ", cl_image2d_slice_pitch " << cl_image2d_slice_pitch;
- for (int i = 0; i < 10; i++) {
+ for (int i = 0; i < cl_image2d_elem_size; i++) {
    h_image[i] = 3.14f * i;
  }
  TargetWrapperCL::Unmap(d_image, h_image);
@@ -426,15 +239,14 @@ TEST(cl_test, target_wrapper_image_test) {
                                                  cl_image2d_height,
                                                  cl_image2d_row_pitch,
                                                  cl_image2d_slice_pitch));
- for (int i = 0; i < 10; i++) {
+ for (int i = 0; i < cl_image2d_elem_size; i++) {
    EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6);
  }
  TargetWrapperCL::Unmap(d_image, h_ptr);
  // Imagecpy test
- std::vector<float> h_image_cpy(cl_image2d_width * 4 *
-                                cl_image2d_height);  // 4 for RGBA channels
- for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
+ std::vector<float> h_image_cpy(cl_image2d_elem_size);
+ for (int i = 0; i < cl_image2d_elem_size; i++) {
    h_image_cpy[i] = 3.14f;
  }
  TargetWrapperCL::ImgcpySync(d_image,
@@ -446,6 +258,8 @@ TEST(cl_test, target_wrapper_image_test) {
                              IoDirection::HtoD);
  auto *d_image_cpy = static_cast<cl::Image2D *>(
      TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
+ // device to device
  TargetWrapperCL::ImgcpySync(d_image_cpy,
                              d_image,
                              cl_image2d_width,
@@ -454,6 +268,8 @@ TEST(cl_test, target_wrapper_image_test) {
                              cl_image2d_slice_pitch,
                              IoDirection::DtoD);
  std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0);
+ // host to device
  TargetWrapperCL::ImgcpySync(h_image_cpy.data(),
                              d_image_cpy,
                              cl_image2d_width,
@@ -461,7 +277,7 @@ TEST(cl_test, target_wrapper_image_test) {
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
- for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
+ for (int i = 0; i < cl_image2d_elem_size; i++) {
    EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6);
  }
......
#include <cl_common.h>
-__kernel void conv_1x1(
+__kernel void conv2d_1x1(
    __private const int global_size_dim0,
    __private const int global_size_dim1,
    __private const int global_size_dim2,
......
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include <cl_common.h>
__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) {
  int x = get_global_id(0);
  int y = get_global_id(1);
......
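The remainder of this kernel is collapsed in the diff. For orientation, an image2d elementwise add typically completes along these lines (an illustrative sketch, not a quote of the file; the sampler flags are the usual texel-exact choice):

    __kernel void elementwise_add(__read_only image2d_t input,
                                  __read_only image2d_t bias,
                                  __write_only image2d_t outputImage) {
      int x = get_global_id(0);
      int y = get_global_id(1);
      const sampler_t sampler =
          CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
      int2 coords = (int2)(x, y);
      /* one work-item adds one RGBA texel from each operand */
      float4 in = read_imagef(input, sampler, coords);
      float4 b = read_imagef(bias, sampler, coords);
      write_imagef(outputImage, coords, in + b);
    }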
@@ -103,6 +103,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
  std::string build_option = options + " -cl-fast-relaxed-math -I " +
                             CLRuntime::Global()->cl_path() + "/cl_kernel";
+ VLOG(4) << "OpenCL build_option: " << build_option;
  status_ = program->build({*device_}, build_option.c_str());
  CL_CHECK_ERROR(status_);
......
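For orientation, with the new flag style the final string handed to program->build() looks like "-DCL_DTYPE_float -cl-fast-relaxed-math -I <cl_path>/cl_kernel" (illustrative; the include path comes from --cl_path). The -I entry is what lets the .cl sources above write #include <cl_common.h>.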
@@ -40,9 +40,9 @@ lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc
    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu
-#lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
-#    DEPS relu_opencl op_registry program context
-#    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
+    DEPS relu_opencl layout_opencl op_registry program context
+    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
    DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter
......
@@ -26,13 +26,13 @@ namespace kernels {
namespace opencl {
#define USE_BUFFER_FOR_CONV1x1_BIAS
-class Conv2d1x1Image2DCompute
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
+class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
+                                                  PRECISION(kFloat),
+                                                  DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ConvParam;
  void PrepareForRun() override {
-   LOG(INFO) << "PrepareForRun ...";
    const auto& param = *param_.get_mutable<param_t>();
    if (param.fuse_relu) {
      build_options_ += " -DRELU";
@@ -46,43 +46,38 @@ class Conv2d1x1Image2DCompute
    }
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
-       kernel_func_name_, "image/conv_1x1_kernel.cl", build_options_);
-   LOG(INFO) << "PrepareForRun Ready";
+       kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
  }
  void Run() override {
-   LOG(INFO) << "opencl conv 1x1 run begin ...";
-   LOG(INFO) << "param_.get_mutable<param_t> ...";
    const auto& param = *param_.get_mutable<param_t>();
-   LOG(INFO) << "get param dims ...";
    auto input_dims = param.x->dims();
-   CHECK_GE(input_dims.size(), 4);
-   LOG(INFO) << "input_dims: " << input_dims;
-   int input_width = input_dims[3];
-   int input_height = input_dims[2];
+   auto paddings = *param.paddings;
+   auto strides = param.strides;
+   auto* input_image = param.x->data<float, cl::Image2D>();
+   auto* filter_image = param.filter->data<float, cl::Image2D>();
    auto filter_dims = param.filter->dims();
-   LOG(INFO) << "filter_dims: " << filter_dims;
    auto output_dims = param.output->dims();
-   LOG(INFO) << "output_dims: " << output_dims;
+   int input_width = input_dims[3];
+   int input_height = input_dims[2];
    int output_width = output_dims[3];
    int output_height = output_dims[2];
+   // mute output image
    auto out_image_shape = InitImageDimInfoWith(output_dims);
-   LOG(INFO) << "out_image_shape: " << out_image_shape["width"] << ", "
-             << out_image_shape["height"];
    auto* out_image = param.output->mutable_data<float, cl::Image2D>(
        out_image_shape["width"], out_image_shape["height"]);
-   // gen default_work_size
+   const bool has_bias = param.bias != nullptr;
+   const bool is_element_wise_bias =
+       has_bias && param.output->dims() == param.bias->dims();
+   int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
+                static_cast<int>(paddings[0]);
+   // calc input_c_block
+   auto input_image_shape = InitImageDimInfoWith(input_dims);
+   int input_c_block = input_image_shape["width"] / input_dims[3];
+   int input_c = input_dims[1];
+   auto dilations = *param.dilations;
    const std::vector<size_t>& default_work_size =
        DefaultWorkSize(output_dims,
@@ -93,28 +88,38 @@ class Conv2d1x1Image2DCompute
    int c_block = default_work_size[0];
    int w = default_work_size[1];
    int nh = default_work_size[2];
-   LOG(INFO) << "default work size: "
+   VLOG(4) << "============ conv2d_1x1 params ============";
+   VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
+           << input_image_shape["height"];
+   VLOG(4) << "input_c_block: " << input_c_block;
+   VLOG(4) << "input_c: " << input_c;
+   VLOG(4) << "input_image: " << input_image;
+   VLOG(4) << "filter_dims: " << filter_dims;
+   VLOG(4) << "filter_image: " << filter_image;
+   VLOG(4) << "output_dims: " << output_dims;
+   VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
+           << out_image_shape["height"];
+   VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
+   VLOG(4) << "has bias: " << has_bias;
+   VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
+   VLOG(4) << "strides: " << strides[0] << "," << strides[1];
+   VLOG(4) << "offset: " << offset;
+   VLOG(4) << "dilations.size : " << dilations.size();
+   VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
+   VLOG(4) << "default work size{c_block, w, nh}: "
            << "{" << c_block << ", " << w << ", " << nh << ""
            << "}";
-   auto paddings = *param.paddings;
-   LOG(INFO) << "paddings: " << paddings[0] << "," << paddings[1];
-   auto strides = param.strides;
-   auto& context = ctx_->As<OpenCLContext>();
-   CHECK(context.cl_context() != nullptr);
-   auto* input_image = param.x->data<float, cl::Image2D>();
-   auto* filter_image = param.filter->data<float, cl::Image2D>();
+   CHECK_GE(dilations.size(), 2);
+   CHECK(dilations[0] == dilations[1]);
+   CHECK_GE(input_dims.size(), 4);
+   CHECK_GE(paddings.size(), 2);
+   CHECK(paddings[0] == paddings[1]);
+   CHECK_GE(strides.size(), 2);
+   CHECK(strides[0] == strides[1]);
    // handle bias use buffer for channel wise , use image for element wise
-   const bool has_bias = param.bias != nullptr;
-   const bool is_element_wise_bias =
-       has_bias && param.output->dims() == param.bias->dims();
-   LOG(INFO) << "has bias: " << has_bias;
-   LOG(INFO) << "is_element_wise_bias : " << is_element_wise_bias;
-   LOG(INFO) << "get kernel ...";
    const cl::Buffer* bias_buf = nullptr;
    const cl::Image2D* bias_image = nullptr;
    if (has_bias) {
@@ -126,48 +131,37 @@ class Conv2d1x1Image2DCompute
      bias_image = param.bias->data<float, cl::Image2D>();
#endif
    }
+   auto& context = ctx_->As<OpenCLContext>();
+   CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
-   LOG(INFO) << "kernel_key: " << kernel_key.str();
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+   int maped_w = maptofactor(w, 4);
+   VLOG(4) << "kernel_key: " << kernel_key.str();
+   VLOG(4) << "kernel ready ... " << kernel_key.str();
+   VLOG(4) << "maped_w: " << maped_w;
-   LOG(INFO) << "kernel ready ... " << kernel_key.str();
    cl_int status;
-   auto numel = output_dims.production();
    int arg_idx = 0;
    status = kernel.setArg(arg_idx, c_block);
    CL_CHECK_FATAL(status);
-   int maped_w = maptofactor(w, 4);
-   LOG(INFO) << "maped_w: " << maped_w;
    status = kernel.setArg(++arg_idx, maped_w);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, nh);
-   LOG(INFO) << "nh: " << nh;
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *input_image);
-   LOG(INFO) << "input_image: ";
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *filter_image);
-   LOG(INFO) << "filter_image: ";
    CL_CHECK_FATAL(status);
    if (has_bias) {
#ifndef USE_BUFFER_FOR_CONV1x1_BIAS
      if (is_element_wise_bias != 0) {
-       LOG(INFO) << "set bias_image: ";
+       VLOG(4) << "set bias_image: ";
        status = kernel.setArg(++arg_idx, *bias_image);
      } else {
-       LOG(INFO) << "set bias_buf: ";
+       VLOG(4) << "set bias_buf: ";
        status = kernel.setArg(++arg_idx, *bias_buf);
      }
#else
@@ -175,77 +169,38 @@ class Conv2d1x1Image2DCompute
#endif
      CL_CHECK_FATAL(status);
    }
    status = kernel.setArg(++arg_idx, *out_image);
-   LOG(INFO) << "out_image: ";
    CL_CHECK_FATAL(status);
-   CHECK_GE(strides.size(), 2);
-   CHECK(strides[0] == strides[1]);
    status = kernel.setArg(++arg_idx, strides[0]);
-   LOG(INFO) << "strides: " << strides[0] << "," << strides[1];
    CL_CHECK_FATAL(status);
-   CHECK_GE(paddings.size(), 2);
-   CHECK(paddings[0] == paddings[1]);
-   int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
-                static_cast<int>(paddings[0]);
-   LOG(INFO) << "offset: " << offset;
    status = kernel.setArg(++arg_idx, offset);
    CL_CHECK_FATAL(status);
-   // calc input_c_block
-   auto input_image_shape = InitImageDimInfoWith(input_dims);
-   LOG(INFO) << "input_image_shape: " << input_image_shape["width"] << ","
-             << input_image_shape["height"];
-   int input_c_block = input_image_shape["width"] / input_dims[3];
-   LOG(INFO) << "input_c_block: " << input_c_block;
    status = kernel.setArg(++arg_idx, input_c_block);
    CL_CHECK_FATAL(status);
-   int input_c = input_dims[1];
-   LOG(INFO) << "input_c: " << input_c;
    status = kernel.setArg(++arg_idx, input_c);
    CL_CHECK_FATAL(status);
-   auto dilations = *param.dilations;
-   LOG(INFO) << "dilations.size : " << dilations.size();
-   LOG(INFO) << "dilations: " << dilations[0] << ", " << dilations[1];
-   CHECK_GE(dilations.size(), 2);
-   CHECK(dilations[0] == dilations[1]);
    status = kernel.setArg(++arg_idx, dilations[0]);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, input_width);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, input_height);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, output_width);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, output_height);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, w);
    CL_CHECK_FATAL(status);
-   // clac gloabl_work_size
    auto global_work_size =
        cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                    static_cast<size_t>(maped_w),
                    static_cast<size_t>(default_work_size.data()[2])};
-   LOG(INFO) << "global_work_size :" << global_work_size;
+   VLOG(4) << "out_image: " << out_image;
+   VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
+           << global_work_size[1] << "," << global_work_size[2] << "}";
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
@@ -259,8 +214,8 @@ class Conv2d1x1Image2DCompute
  }
 private:
- std::string kernel_func_name_{"conv_1x1"};
- std::string build_options_{"-DCL_DTYPE=float "};
+ std::string kernel_func_name_{"conv2d_1x1"};
+ std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
@@ -272,7 +227,7 @@ class Conv2d1x1Image2DCompute
REGISTER_LITE_KERNEL(conv2d_1x1,
                     kOpenCL,
                     kFloat,
-                    kNHWC,
+                    kImageDefault,
                     paddle::lite::kernels::opencl::Conv2d1x1Image2DCompute,
                     image2d)
    .BindInput("Input",
......
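A note on the work-size math in this kernel: maped_w = maptofactor(w, 4) packs four output columns per work-item, shrinking the global work size along that axis, while the original w is still passed as a kernel argument to bound the last partial group. maptofactor's body is not shown in this diff; ceil-division is the natural reading (a sketch under that assumption):

    // Presumed behavior of maptofactor: ceil-divide i by factor.
    int maptofactor_sketch(int i, int factor) { return (i + factor - 1) / factor; }
    // global_work_size = {c_block, maptofactor(w, 4), nh}, where
    // c_block = ceil(C_out / 4), w = output width, nh = N * output height.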
@@ -128,8 +128,10 @@ TEST(conv2d_1x1, compute) {
  const int ow = iw;
  LOG(INFO) << "to get kernel ...";
- auto kernels = KernelRegistry::Global().Create(
-     "conv2d_1x1", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC));
+ auto kernels = KernelRegistry::Global().Create("conv2d_1x1",
+                                                TARGET(kOpenCL),
+                                                PRECISION(kFloat),
+                                                DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
@@ -373,4 +375,4 @@ TEST(conv2d_1x1, compute) {
} // namespace lite
} // namespace paddle
-USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kNHWC, image2d);
+USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kImageDefault, image2d);
@@ -41,7 +41,7 @@ class ElementwiseAddCompute
  size_t num_{1};
  param_t* ele_param_{nullptr};
  std::string kernel_func_name_{"elementwise_add"};
- std::string build_options_{"-DCL_DTYPE=float"};
+ std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
@@ -28,8 +28,11 @@ namespace lite {
namespace kernels {
namespace opencl {
-class LayoutComputeBufferChwToImage2DHwc
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC)> {
+// [NCHW] -> [ImageDefault]
+class LayoutComputeBufferChwToImageDefault
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kAny),
+                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::LayoutParam;
@@ -117,7 +120,8 @@ class LayoutComputeBufferChwToImage2DHwc
  }
  std::string doc() const override {
-   return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(RGBA)";
+   return "Trans Layout from cl::Buffer(NCHW) to "
+          "cl::Image2D(ImageDefault/RGBA)";
  }
 private:
@@ -126,11 +130,9 @@ class LayoutComputeBufferChwToImage2DHwc
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
-// buffer chw 2 image2d nw
-class LayoutComputeBufferChwToImage2DNw
-    : public KernelLite<TARGET(kOpenCL),
-                        PRECISION(kFloat),
-                        DATALAYOUT(kImageNW)> {
+// [ImageDefault] -> [NCHW]
+class LayoutComputeImageDefaultToBufferChw
+    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::LayoutParam;
@@ -142,31 +144,29 @@ class LayoutComputeBufferChwToImage2DNw
  void Run() override {
    auto& param = Param<param_t>();
-   auto* x_data = param.x->data<float, cl::Buffer>();
-   auto x_dims = param.x->dims();
-   CHECK(x_dims.size() == 4) << " Tensor dim is not 4.";
-   size_t image_width = x_dims[3] * ((x_dims[0] + 3) / 4);
-   size_t image_height = x_dims[1] * x_dims[2];
-   auto* y_data =
-       param.y->mutable_data<float, cl::Image2D>(image_width, image_height);
+   auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
    auto y_dims = param.y->dims();
+   auto* x_data = param.x->data<float, cl::Image2D>();
+   auto x_dims = param.x->dims();
+   // out info
    std::vector<size_t> new_dims = {1, 1, 1, 1};
-   for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
-     new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
+   for (int j = 0; j < x_dims.size(); ++j) {
+     new_dims[4 - x_dims.size() + j] = x_dims[j];
    }
-   const int out_N = new_dims[0];
-   const int out_C = new_dims[1];
-   const int out_H = new_dims[2];
-   const int out_W = new_dims[3];
-   const int Stride2 = out_C * out_H * out_W;
-   const int Stride1 = out_H * out_W;
-   const int Stride0 = out_W;
+   VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
+           << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
+   VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
+           << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
+   VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
+           << new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
+   size_t C = new_dims[1];
+   size_t in_height = new_dims[2];
+   size_t in_width = new_dims[3];
+   int size_ch = in_height * in_width;
+   int size_block = size_ch * 4;
+   int size_batch = size_ch * C;
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
@@ -177,27 +177,26 @@ class LayoutComputeBufferChwToImage2DNw
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *x_data);
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, *y_data);
+   status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
+   status = kernel.setArg(++arg_idx, *y_data);
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(out_N));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(C));
    CL_CHECK_FATAL(status);
-   VLOG(4) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " "
-           << (out_C * out_H);
+   VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
+           << " " << (new_dims[0] * new_dims[2]);
    auto global_work_size =
-       cl::NDRange{static_cast<cl::size_type>((out_N + 3) / 4),  // N blocks
-                   static_cast<cl::size_type>(out_W),            // w
-                   static_cast<cl::size_type>(out_C * out_H)};   // ch
+       cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
+                   static_cast<cl::size_type>(new_dims[3]),
+                   static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
@@ -209,21 +208,24 @@ class LayoutComputeBufferChwToImage2DNw
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
    // context.cl_wait_list()->emplace(y_data, event_);
    context.cl_context()->GetCommandQueue().finish();
+   // auto image_shape = InitImageDimInfoWith(x_dims);
  }
  std::string doc() const override {
-   return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(CLNW)";
+   return "Trans Layout from cl::Image2D(ImageDefault/RGBA) to "
+          "cl::Buffer(NCHW)";
  }
 private:
- std::string kernel_func_name_{"buffer_to_image2d_nw"};
- std::string build_options_{"-DCL_DTYPE_float "};
+ std::string kernel_func_name_{"image2d_to_buffer"};
+ std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
-class LayoutComputeImage2DHwcToBufferChw
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
+// [NCHW] -> [ImageDW]
+class LayoutComputeBufferChwToImage2DNw
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kFloat),
+                        DATALAYOUT(kImageNW)> {
 public:
  using param_t = operators::LayoutParam;
@@ -235,29 +237,31 @@ class LayoutComputeImage2DHwcToBufferChw
  void Run() override {
    auto& param = Param<param_t>();
-   auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-   auto y_dims = param.y->dims();
-   auto* x_data = param.x->data<float, cl::Image2D>();
+   auto* x_data = param.x->data<float, cl::Buffer>();
    auto x_dims = param.x->dims();
+   CHECK(x_dims.size() == 4) << " Tensor dim is not 4.";
+   size_t image_width = x_dims[3] * ((x_dims[0] + 3) / 4);
+   size_t image_height = x_dims[1] * x_dims[2];
+   auto* y_data =
+       param.y->mutable_data<float, cl::Image2D>(image_width, image_height);
+   auto y_dims = param.y->dims();
-   // out info
    std::vector<size_t> new_dims = {1, 1, 1, 1};
-   for (int j = 0; j < x_dims.size(); ++j) {
-     new_dims[4 - x_dims.size() + j] = x_dims[j];
+   for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
+     new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
    }
-   VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
-           << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
-   VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
-           << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
-   VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
-           << new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
-   size_t C = new_dims[1];
-   size_t in_height = new_dims[2];
-   size_t in_width = new_dims[3];
-   int size_ch = in_height * in_width;
-   int size_block = size_ch * 4;
-   int size_batch = size_ch * C;
+   const int out_N = new_dims[0];
+   const int out_C = new_dims[1];
+   const int out_H = new_dims[2];
+   const int out_W = new_dims[3];
+   const int Stride2 = out_C * out_H * out_W;
+   const int Stride1 = out_H * out_W;
+   const int Stride0 = out_W;
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
@@ -268,26 +272,27 @@ class LayoutComputeImage2DHwcToBufferChw
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *x_data);
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
+   status = kernel.setArg(++arg_idx, *y_data);
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, *y_data);
+   status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(out_N));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
    CL_CHECK_FATAL(status);
-   status = kernel.setArg(++arg_idx, static_cast<const int>(C));
+   status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
    CL_CHECK_FATAL(status);
-   VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
-           << " " << (new_dims[0] * new_dims[2]);
+   VLOG(4) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " "
+           << (out_C * out_H);
    auto global_work_size =
-       cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
-                   static_cast<cl::size_type>(new_dims[3]),
-                   static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
+       cl::NDRange{static_cast<cl::size_type>((out_N + 3) / 4),  // N blocks
+                   static_cast<cl::size_type>(out_W),            // w
+                   static_cast<cl::size_type>(out_C * out_H)};   // ch
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
@@ -299,15 +304,16 @@ class LayoutComputeImage2DHwcToBufferChw
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
    // context.cl_wait_list()->emplace(y_data, event_);
    context.cl_context()->GetCommandQueue().finish();
-   // auto image_shape = InitImageDimInfoWith(x_dims);
  }
  std::string doc() const override {
-   return "Trans Layout from cl::Image2D(RGBA) to cl::Buffer(NCHW)";
+   return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(ImageDW/CLNW)";
  }
 private:
- std::string kernel_func_name_{"image2d_to_buffer"};
- std::string build_options_{"-DCL_DTYPE_float"};
+ std::string kernel_func_name_{"buffer_to_image2d_nw"};
+ std::string build_options_{"-DCL_DTYPE_float "};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
@@ -316,15 +322,14 @@ class LayoutComputeImage2DHwcToBufferChw
} // namespace lite
} // namespace paddle
-// BufferChwToImage2DHwc
-// [chw] -> [hwc]
+// [NCHW] -> [ImageDefault]
REGISTER_LITE_KERNEL(
    layout,
    kOpenCL,
    kAny,
-   kNHWC,
-   paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
-   buffer_chw_to_image2d_hwc_opencl_fp32)
+   kImageDefault,
+   paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
+   NCHW_to_ImageDefault)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
@@ -332,17 +337,16 @@ REGISTER_LITE_KERNEL(
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
-                                      DATALAYOUT(kNHWC))})
+                                      DATALAYOUT(kImageDefault))})
    .Finalize();
-// [chw] -> [hwc]
REGISTER_LITE_KERNEL(
    layout_once,
    kOpenCL,
    kAny,
-   kNHWC,
-   paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
-   buffer_chw_to_image2d_hwc_opencl_fp32)
+   kImageDefault,
+   paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
+   NCHW_to_ImageDefault)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
@@ -350,54 +354,52 @@ REGISTER_LITE_KERNEL(
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
-                                      DATALAYOUT(kNHWC))})
+                                      DATALAYOUT(kImageDefault))})
    .Finalize();
-// Image2DHwcBufferChw
-// [hwc] -> [chw]
+// [ImageDefault] -> [NCHW]
REGISTER_LITE_KERNEL(
    layout,
    kOpenCL,
    kAny,
    kNCHW,
-   paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
-   image2d_hwc_to_buffer_chw_opencl_fp32)
+   paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
+   ImageDefault_to_NCHW)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
-                                     DATALAYOUT(kNHWC))})
+                                     DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
-// [hwc] -> [chw]
REGISTER_LITE_KERNEL(
    layout_once,
    kOpenCL,
    kAny,
    kNCHW,
-   paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
-   image2d_hwc_to_buffer_chw_opencl_fp32)
+   paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
+   ImageDefault_to_NCHW)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
-                                     DATALAYOUT(kNHWC))})
+                                     DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
-// [hwc] -> [chw]
+// [NCHW] -> [ImageNW]
REGISTER_LITE_KERNEL(
    layout_once,
    kOpenCL,
    kFloat,
    kImageNW,
    paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DNw,
-   buffer_chw_to_image2d_nw_opencl_fp32)
+   NCHW_to_ImageNW)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
......
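For reference, the image shapes implied by the two layouts in this file, as derived from the code above (a summary, not a quote of image_helper.h): ImageDefault packs channels four per RGBA texel, ImageNW packs batches four per texel.

    // ImageDefault: NCHW buffer -> image2d of ceil(C/4) * W x N * H texels.
    // ImageNW:      NCHW buffer -> image2d of ceil(N/4) * W x C * H texels
    //               (see image_width/image_height in BufferChwToImage2DNw).
    // e.g. {N=1, C=6, H=3, W=4}: ImageDefault -> 8 x 3, ImageNW -> 4 x 18.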
@@ -24,7 +24,7 @@ namespace lite {
// #define LOOP_TEST
// #define PRINT_RESULT
-TEST(layout, compute) {
+TEST(layout_ImageDefault, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> layout(img2buf) "
               "-> device";
@@ -43,8 +43,11 @@ TEST(layout, compute) {
          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
                    << h << " " << w << " ========";
          // set layout kernels
-         auto buf_to_img_kernels = KernelRegistry::Global().Create(
-             "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC));
+         auto buf_to_img_kernels =
+             KernelRegistry::Global().Create("layout",
+                                             TARGET(kOpenCL),
+                                             PRECISION(kAny),
+                                             DATALAYOUT(kImageDefault));
          auto img_to_buf_kernels = KernelRegistry::Global().Create(
              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
          ASSERT_FALSE(buf_to_img_kernels.empty());
@@ -144,7 +147,8 @@ TEST(layout, compute) {
  // nothing to do.
#endif
}
-TEST(layout, compute_buffer2image2dnw) {
+TEST(layout_ImageNW, compute) {
#ifdef LOOP_TEST
  for (int n = 1; n <= 100; n += 21) {
    for (auto c : {1, 3}) {
@@ -282,12 +286,6 @@ TEST(layout, compute_buffer2image2dnw) {
} // namespace lite
} // namespace paddle
-USE_LITE_KERNEL(
-   layout, kOpenCL, kAny, kNHWC, buffer_chw_to_image2d_hwc_opencl_fp32);
-USE_LITE_KERNEL(
-   layout, kOpenCL, kAny, kNCHW, image2d_hwc_to_buffer_chw_opencl_fp32);
-USE_LITE_KERNEL(layout_once,
-               kOpenCL,
-               kFloat,
-               kImageNW,
-               buffer_chw_to_image2d_nw_opencl_fp32);
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
+USE_LITE_KERNEL(layout_once, kOpenCL, kFloat, kImageNW, NCHW_to_ImageNW);
...@@ -29,6 +29,7 @@ class ReluCompute ...@@ -29,6 +29,7 @@ class ReluCompute
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { return "Relu using cl::Buffer"; }
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
...@@ -72,15 +73,21 @@ class ReluCompute
private:
std::string kernel_func_name_{"relu"};
-std::string build_options_{"-DCL_DTYPE=float -DRELU"};
+std::string build_options_{"-DCL_DTYPE_float -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
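Note the build-option change from -DCL_DTYPE=float to -DCL_DTYPE_float. A plausible reason — hedged; check the .cl headers under lite/backends/opencl/cl_kernel — is that the kernels select the compute type with an #ifdef ladder, which lets one define switch several related macros at once:

// Hedged sketch of how a kernel header might consume -DCL_DTYPE_float /
// -DCL_DTYPE_half; the shipped header may differ.
#ifdef CL_DTYPE_float
#define CL_DTYPE float
#define CL_DTYPE4 float4
#endif

#ifdef CL_DTYPE_half
#define CL_DTYPE half
#define CL_DTYPE4 half4
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif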
-class ReluComputeFloatImage
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
+class ReluComputeFloatImageDefault
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kFloat),
+                        DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Relu using cl::Image2D(ImageDefault/RGBA)";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
...@@ -154,18 +161,19 @@ class ReluComputeFloatImage
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
-REGISTER_LITE_KERNEL(relu,
-                     kOpenCL,
-                     kFloat,
-                     kNHWC,
-                     paddle::lite::kernels::opencl::ReluComputeFloatImage,
-                     image2d)
+REGISTER_LITE_KERNEL(
+    relu,
+    kOpenCL,
+    kFloat,
+    kImageDefault,
+    paddle::lite::kernels::opencl::ReluComputeFloatImageDefault,
+    ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
-DATALAYOUT(kNHWC))})
+DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
-DATALAYOUT(kNHWC))})
+DATALAYOUT(kImageDefault))})
.Finalize();
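For orientation, an ImageDefault relu kernel plausibly looks like the following — a sketch only, embedded here as a C++ raw string; the shipped kernel under lite/backends/opencl/cl_kernel is authoritative. One work-item handles one RGBA pixel, i.e. four channels at once:

// Hedged sketch of the device code for an ImageDefault relu.
const char* kReluImage2DSketch = R"CLC(
__kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output) {
  const int x = get_global_id(0);  // image x: (c/4)*W + w
  const int y = get_global_id(1);  // image y: n*H + h
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  float4 v = read_imagef(input, sampler, (int2)(x, y));
  write_imagef(output, (int2)(x, y), fmax(v, (float4)(0.0f)));
}
)CLC";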
...@@ -17,6 +17,7 @@
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
...@@ -28,7 +29,8 @@ void relu_compute_ref(const dtype *x_data, const DDim &x_dim, dtype *out_data) {
}
}
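The body of relu_compute_ref is elided in this hunk; from the signature it is presumably the straightforward elementwise max(x, 0):

// Hedged reconstruction of the elided reference: signature from the hunk
// header above, body assumed.
template <typename dtype>
void relu_compute_ref(const dtype *x_data, const DDim &x_dim, dtype *out_data) {
  for (int i = 0; i < x_dim.production(); ++i) {
    out_data[i] = x_data[i] > 0 ? x_data[i] : 0;
  }
}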
-TEST(opencl_relu, compute) {
+#if 0  // relu_buffer
+TEST(opencl_relu_buffer, compute) {
// prepare data
const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
lite::Tensor x, out;
...@@ -87,8 +89,171 @@ TEST(opencl_relu, compute) {
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x_data, mapped_x);
}
#endif // relu_buffer
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(relu_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto relu_img_kernels =
KernelRegistry::Global().Create("relu",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(img_to_buf_kernels.empty());
ASSERT_FALSE(relu_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto relu_img_kernel = std::move(relu_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> relu_in
// relu(img): relu_in -> relu_out
// layout(img->buf): relu_out -> y
lite::Tensor x, y, relu_in, relu_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &relu_in;
ImageToBufferParam.x = &relu_out;
ImageToBufferParam.y = &y;
operators::ActivationParam ReluParam;
ReluParam.X = &relu_in;
ReluParam.Out = &relu_out;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y.Resize(x_dim);
relu_in.Resize(x_dim);
relu_out.Resize(x_dim);
y_ref.Resize(x_dim);
auto relu_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
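// Presumably InitImageDimInfoWith returns {"width": ((c+3)/4)*w,
// "height": n*h} for the ImageDefault layout, so the {1, 2, 3, 4} tensor
// below maps to a 4 x 3 image (hedged; see lite/kernels/opencl/image_helper.h).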
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_y[i] = static_cast<int>(0);
}
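// Note: the inputs above straddle zero on purpose — roughly half the
// elements are negative, so the relu actually has something to clamp.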
auto *relu_in_data = relu_in.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
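// mutable_data<float, cl::Image2D>(w, h) presumably allocates an RGBA float
// image of the dimensions computed above, rather than a linear cl::Buffer.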
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
relu_img_kernel->SetParam(ReluParam);
std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_img_context->As<OpenCLContext>()));
relu_img_kernel->SetContext(std::move(relu_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel";
relu_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (std::abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
-USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
+// relu buffer
+// USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
// relu image2d
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kImageDefault, ImageDefault);
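With these registrations in place, the new test can be run on its own. A hypothetical invocation — the actual lite_cc_test target name should be checked in the OpenCL kernels' CMakeLists.txt, and --cl_path follows the ARGS convention used by the backend tests:

./test_relu_opencl --gtest_filter='relu_image2d.*' --cl_path=<path-to-lite/backends/opencl>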