diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt
index 1acb98321844191832fd55b640a9b56d3d51b400..dd7f6b417e0d6416eec9bb3e60ef088432776112 100644
--- a/lite/backends/opencl/CMakeLists.txt
+++ b/lite/backends/opencl/CMakeLists.txt
@@ -11,8 +11,8 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
 lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
 lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
 lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
+    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
 lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
+    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
 add_dependencies(cl_wrapper opencl_clhpp)
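Note on the `--cl_path` arguments: the OpenCL tests load their `.cl` kernel sources from this directory at runtime, so the flag simply follows the source-tree move from `paddle/fluid/lite/` to `lite/`. A minimal sketch of how the tests consume it (this mirrors `InitOpenCLRuntime` and the `FLAGS_cl_path` gflag already used in `cl_functions_test.cc` below; nothing new is introduced):

    // Sketch: every OpenCL test initializes the runtime from --cl_path first.
    bool ok = paddle::lite::InitOpenCLRuntime(FLAGS_cl_path);
    CHECK(ok) << "Fail to initialize OpenCL runtime.";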
context->GetKernel("channel_add") - : context->GetKernel("elementwise_add"); - CLImage in_image; - in_image.set_tensor_data(in, in_dim); - in_image.InitNormalCLImage(context->GetContext()); - VLOG(3) << " --- Inpu image: " << in_image << " --- "; - CLImage bias_image; - bias_image.set_tensor_data(bias, bias_dim); - bias_image.InitCLImage(context->GetContext()); - VLOG(3) << " --- Bias image: " << bias_image << " --- "; - CLImage out_image; - out_image.InitEmptyImage(context->GetContext(), out_dim); - cl_int status; - status = kernel.setArg(0, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *bias_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - if (bias_dim.size() == 1) { - int tensor_w = in_dim[3]; - status = kernel.setArg(3, tensor_w); - CL_CHECK_FATAL(status); - } - size_t width = in_image.ImageWidth(); - size_t height = in_image.ImageHeight(); - auto global_work_size = cl::NDRange{width, height}; - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - VLOG(3) << " --- Out image: " << out_image << " --- "; - CopyImageData(context, out_image, out); -} - -void pool(CLContext* context, - const std::string pooling_type, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int ksize_h, - const int ksize_w, - const float* in, - const DDim& in_dim, - float* out, - const DDim& out_dim) { - auto kernel = - context->GetKernel(string_format("pool_%s", pooling_type.c_str())); - CLImage in_image; - in_image.set_tensor_data(in, in_dim); - in_image.InitNormalCLImage(context->GetContext()); - VLOG(3) << " --- Inpu image: " << in_image << " --- "; - CLImage out_image; - out_image.InitEmptyImage(context->GetContext(), out_dim); - auto global_work_size = context->DefaultWorkSize(out_image); - auto* in_converter = - dynamic_cast(in_image.image_converter()); - auto* out_converter = - dynamic_cast(out_image.image_converter()); - const int in_height = in_converter->HeightOfOneBlock(); - const int in_width = in_converter->WidthOfOneBlock(); - const int out_height = out_converter->HeightOfOneBlock(); - const int out_width = out_converter->WidthOfOneBlock(); - cl_int status; - status = kernel.setArg(0, in_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, in_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, out_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(3, out_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(4, pad_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(5, pad_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(6, stride_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(7, stride_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(8, ksize_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(9, ksize_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(10, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(11, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - VLOG(3) << " --- Out image: " << out_image << " --- "; - CopyImageData(context, out_image, 
-}
-
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h
index ed5c9153d3cedf140cbf0570b7f71393fb918bf9..1817db9f6bd6d9ecf21978b8293bd9534328de0f 100644
--- a/lite/backends/opencl/cl_caller.h
+++ b/lite/backends/opencl/cl_caller.h
@@ -23,30 +23,5 @@ namespace lite {
 
 bool InitOpenCLRuntime(std::string cl_path);
 
-/// An elementwise_add method to embed OpenCL logic inside, it is used as a
-/// black box so that the framework can remain simple.
-/// NOTE Currently, these methods are quite expensive, we will optimize them
-/// latter.
-void elementwise_add(CLContext* context,
-                     const float* in,
-                     const DDim& in_dim,
-                     const float* bias,
-                     const DDim& bias_dim,
-                     float* out,
-                     const DDim& out_dim);
-
-void pool(CLContext* context,
-          const std::string pooling_type,
-          const int pad_h,
-          const int pad_w,
-          const int stride_h,
-          const int stride_w,
-          const int ksize_h,
-          const int ksize_w,
-          const float* in,
-          const DDim& in_dim,
-          float* out,
-          const DDim& out_dim);
-
 }  // namespace lite
 }  // namespace paddle
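Note: with the `elementwise_add`/`pool` "black box" helpers gone from `cl_caller`, the only remaining way to reach these kernels is the regular kernel registry, which is what the updated tests below do. A hedged sketch of that replacement path (the `Create` signature is the one used by the tests in this patch; the precision/layout arguments depend on the variant wanted):

    // Sketch: fetch a registered OpenCL kernel instead of calling the
    // removed cl_caller helpers directly.
    auto kernels = KernelRegistry::Global().Create(
        "elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
    CHECK(!kernels.empty());
    auto kernel = std::move(kernels.front());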
diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc
index b9f6648c9956e1952b65f66abfa40d912a99ee67..70f47b47946641edf4d023437b48d46cae93ca6e 100644
--- a/lite/backends/opencl/cl_functions_test.cc
+++ b/lite/backends/opencl/cl_functions_test.cc
@@ -41,9 +41,10 @@ TEST(cl_test, runtime_test) {
   auto &context = runtime->context();
   auto program = runtime->CreateProgram(
       context,
-      runtime->cl_path() + "/cl_kernel/" + "image/elementwise_add_kernel.cl");
+      runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl");
   auto event = runtime->CreateEvent(context);
-  CHECK(runtime->BuildProgram(program.get()));
+  const std::string build_option("-DCL_DTYPE_float");
+  CHECK(runtime->BuildProgram(program.get(), build_option));
 }
 
 TEST(cl_test, context_test) {
@@ -51,9 +52,11 @@ TEST(cl_test, context_test) {
   CHECK(runtime->IsInitSuccess());
   runtime->set_cl_path(FLAGS_cl_path);
   CLContext context;
-  context.AddKernel("pool_max", "image/pool_kernel.cl", "");
-  context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", "");
-  context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", "");
+  context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float");
+  context.AddKernel(
+      "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
+  context.AddKernel(
+      "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
 }
 
 TEST(cl_test, kernel_test) {
@@ -61,9 +64,11 @@ TEST(cl_test, kernel_test) {
   CHECK(runtime->IsInitSuccess());
   runtime->set_cl_path(FLAGS_cl_path);
   std::unique_ptr<CLContext> context(new CLContext);
-  context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
-  context->AddKernel("pool_max", "image/pool_kernel.cl");
-  context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
+  context->AddKernel(
+      "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
+  context->AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float");
+  context->AddKernel(
+      "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
   auto kernel = context->GetKernel(2);
 
   std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
@@ -115,203 +120,12 @@ TEST(cl_test, kernel_test) {
   LOG(INFO) << out_image;
 }
 
-TEST(cl_test, channel_add_test) {
-  std::default_random_engine engine;
-  std::uniform_real_distribution<float> dist(-5, 5);
-
-  const DDim in_dim = DDim(std::vector<int64_t>{4, 16, 256, 512});
-  std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
-  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
-    in_data[i] = dist(engine);
-  }
-
-  const DDim bias_dim = DDim(std::vector<int64_t>{16});
-  std::unique_ptr<float[]> bias_data(new float[16]);
-  for (int i = 0; i < 16; i++) {
-    bias_data[i] = dist(engine);
-  }
-
-  std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
-  for (int i = 0; i < 4; i++) {
-    for (int j = 0; j < 16; j++) {
-      float b = bias_data[j];
-      for (int k = 0; k < 256 * 512; k++) {
-        int index = (i * 16 + j) * 256 * 512 + k;
-        out_ref[index] = in_data[index] + b;
-      }
-    }
-  }
-
-  const DDim out_dim = DDim(std::vector<int64_t>{4, 16, 256, 512});
-  std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
-
-  bool status = InitOpenCLRuntime(FLAGS_cl_path);
-  CHECK(status) << "Fail to initialize OpenCL runtime.";
-  std::unique_ptr<CLContext> context(new CLContext);
-  context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
-  context->AddKernel("channel_add", "image/channel_add_kernel.cl");
-  elementwise_add(context.get(),
-                  in_data.get(),
-                  in_dim,
-                  bias_data.get(),
-                  bias_dim,
-                  out.get(),
-                  out_dim);
-
-  int stride = 4 * 16 * 256 * 512 / 20;
-  for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
-    std::cout << out[i] << " ";
-  }
-  std::cout << std::endl;
-
-  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
-    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
-  }
-}
-
-TEST(cl_test, elementwise_add_test) {
-  std::default_random_engine engine;
-  std::uniform_real_distribution<float> dist(-5, 5);
-
-  const DDim in_dim = DDim(std::vector<int64_t>{4, 16, 256, 512});
-  std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
-  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
-    in_data[i] = dist(engine);
-  }
-
-  const DDim bias_dim = DDim(std::vector<int64_t>{4, 16, 256, 512});
-  std::unique_ptr<float[]> bias_data(new float[4 * 16 * 256 * 512]);
-  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
-    bias_data[i] = dist(engine);
-  }
-
-  std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
-  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
-    out_ref[i] = in_data[i] + bias_data[i];
-  }
-
-  const DDim out_dim = DDim(std::vector<int64_t>{4, 16, 256, 512});
-  std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
-
-  bool status = InitOpenCLRuntime(FLAGS_cl_path);
-  CHECK(status) << "Fail to initialize OpenCL runtime.";
-  std::unique_ptr<CLContext> context(new CLContext);
-  context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
-  context->AddKernel("channel_add", "image/channel_add_kernel.cl");
-  elementwise_add(context.get(),
-                  in_data.get(),
-                  in_dim,
-                  bias_data.get(),
-                  bias_dim,
-                  out.get(),
-                  out_dim);
-
-  int stride = 4 * 16 * 256 * 512 / 20;
-  for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
-    std::cout << out[i] << " ";
-  }
-  std::cout << std::endl;
-
-  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
-    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
-  }
-}
-
-void pool_avg(const int padding_height,
-              const int padding_width,
-              const int stride_height,
-              const int stride_width,
-              const int ksize_height,
-              const int ksize_width,
-              const float *input_data,
-              const DDim &in_dim,
-              float *output_data,
-              const DDim &out_dim) {
-  const int batch_size = in_dim[0];
-  const int input_height = in_dim[2];
-  const int input_width = in_dim[3];
-  const int output_channels = out_dim[1];
-  const int output_height = out_dim[2];
-  const int output_width = out_dim[3];
-
-  const size_t input_spatial_size = input_height * input_width;
-  const size_t output_spatial_size = output_height * output_width;
-
-  for (int i = 0; i < batch_size; i++) {
-    for (int c = 0; c < output_channels; ++c) {
-      int channel = i * output_channels + c;
-      const float *input_ptr = input_data + channel * input_spatial_size;
-      float *output_ptr = output_data + channel * output_spatial_size;
-
-      for (int ph = 0; ph < output_height; ++ph) {
-        int hstart = ph * stride_height - padding_height;
-        int hend = std::min(hstart + ksize_height, input_height);
-        hstart = std::max(hstart, 0);
-        for (int pw = 0; pw < output_width; ++pw) {
-          int wstart = pw * stride_width - padding_width;
-          int wend = std::min(wstart + ksize_width, input_width);
-          wstart = std::max(wstart, 0);
-
-          float val = 0.f;
-          int count = 0;
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              val += input_ptr[h * input_width + w];
-              ++count;
-            }
-          }
-          output_ptr[ph * output_width + pw] =
-              (count > 0) ? val * (1.f / count) : 0.f;
-        }
-      }
-    }
-  }
-}
-
-TEST(cl_test, pool_test) {
-  std::default_random_engine engine;
-  std::uniform_real_distribution<float> dist(-5, 5);
-
-  const DDim in_dim = DDim(std::vector<int64_t>{4, 1024, 7, 7});
-  std::unique_ptr<float[]> in_data(new float[4 * 1024 * 7 * 7]);
-  for (int i = 0; i < 4 * 1024 * 7 * 7; i++) {
-    in_data[i] = dist(engine);
-  }
-
-  const DDim out_dim = DDim(std::vector<int64_t>{4, 1024, 1, 1});
-  std::unique_ptr<float[]> out(new float[4 * 1024 * 1 * 1]);
-  std::unique_ptr<float[]> out_ref(new float[4 * 1024 * 1 * 1]);
-
-  bool status = InitOpenCLRuntime(FLAGS_cl_path);
-  CHECK(status) << "Fail to initialize OpenCL runtime.";
-  std::unique_ptr<CLContext> context(new CLContext);
-  context->AddKernel("pool_max", "image/pool_kernel.cl");
-  context->AddKernel("pool_avg", "image/pool_kernel.cl");
-  pool(context.get(),
-       "avg",
-       0,
-       0,
-       1,
-       1,
-       7,
-       7,
-       in_data.get(),
-       in_dim,
-       out.get(),
-       out_dim);
-  pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim);
-
-  for (int i = 0; i < 4 * 1024 * 1 * 1; i++) {
-    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
-  }
-}
-
 TEST(cl_test, target_wrapper_buffer_test) {
   bool inited = InitOpenCLRuntime(FLAGS_cl_path);
   CHECK(inited) << "Fail to initialize OpenCL runtime.";
   std::unique_ptr<CLContext> context(new CLContext);
   std::string kernel_name = "elementwise_add";
-  std::string build_options = "-DCL_DTYPE=float";
+  std::string build_options = "-DCL_DTYPE_float";
   context->AddKernel(
       kernel_name, "buffer/elementwise_add_kernel.cl", build_options);
   std::vector<float> h_a;
@@ -396,10 +210,13 @@ TEST(cl_test, target_wrapper_buffer_test) {
 
 TEST(cl_test, target_wrapper_image_test) {
   const size_t cl_image2d_width = 28;
   const size_t cl_image2d_height = 32;
+  const size_t cl_image2d_elem_size =
+      cl_image2d_width * cl_image2d_height * 4;  // 4 for RGBA channels
   const size_t cl_image2d_row_pitch{0};
   const size_t cl_image2d_slice_pitch{0};
   auto *d_image = static_cast<cl::Image2D *>(
       TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
+  // Map/Unmap test
   auto *h_image =
       static_cast<float *>(TargetWrapperCL::MapImage(d_image,
@@ -407,15 +224,11 @@ TEST(cl_test, target_wrapper_image_test) {
                                                      cl_image2d_height,
                                                      cl_image2d_row_pitch,
                                                      cl_image2d_slice_pitch));
-  CHECK_EQ(
-      cl_image2d_row_pitch,
-      cl_image2d_width * 4 *
-          4);  // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes)
-  CHECK_EQ(cl_image2d_slice_pitch, 0);  // slice_pitch = 0
+  CHECK_EQ(cl_image2d_slice_pitch, 0);
 
   LOG(INFO) << "cl_image2d_row_pitch = " << cl_image2d_row_pitch
             << ", cl_image2d_slice_pitch " << cl_image2d_slice_pitch;
 
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < cl_image2d_elem_size; i++) {
     h_image[i] = 3.14f * i;
   }
   TargetWrapperCL::Unmap(d_image, h_image);
@@ -426,15 +239,14 @@ TEST(cl_test, target_wrapper_image_test) {
                                                    cl_image2d_height,
                                                    cl_image2d_row_pitch,
                                                    cl_image2d_slice_pitch));
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < cl_image2d_elem_size; i++) {
     EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6);
   }
   TargetWrapperCL::Unmap(d_image, h_ptr);
 
   // Imagecpy test
-  std::vector<float> h_image_cpy(cl_image2d_width * 4 *
-                                 cl_image2d_height);  // 4 for RGBA channels
-  for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
+  std::vector<float> h_image_cpy(cl_image2d_elem_size);
+  for (int i = 0; i < cl_image2d_elem_size; i++) {
     h_image_cpy[i] = 3.14f;
   }
   TargetWrapperCL::ImgcpySync(d_image,
@@ -446,6 +258,8 @@ TEST(cl_test, target_wrapper_image_test) {
                               IoDirection::HtoD);
   auto *d_image_cpy = static_cast<cl::Image2D *>(
       TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
+
+  // device to device
   TargetWrapperCL::ImgcpySync(d_image_cpy,
                               d_image,
                               cl_image2d_width,
@@ -454,6 +268,8 @@ TEST(cl_test, target_wrapper_image_test) {
                               cl_image2d_slice_pitch,
                               IoDirection::DtoD);
   std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0);
+
+  // device to host
   TargetWrapperCL::ImgcpySync(h_image_cpy.data(),
                               d_image_cpy,
                               cl_image2d_width,
@@ -461,7 +277,7 @@ TEST(cl_test, target_wrapper_image_test) {
                               cl_image2d_row_pitch,
                               cl_image2d_slice_pitch,
                               IoDirection::DtoH);
-  for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
+  for (int i = 0; i < cl_image2d_elem_size; i++) {
     EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6);
   }
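Note on the dropped row-pitch assertion: the pitch returned by `clEnqueueMapImage` is implementation-defined (drivers may pad each row), so pinning it to 448 bytes only held on one particular device. If a check is wanted at all, only a lower bound is portable; a hedged sketch, with the constants taken from the removed comment (28 px * 4 RGBA floats * 4 bytes):

    // Sketch: a portable sanity check on the mapped row pitch.
    CHECK_GE(cl_image2d_row_pitch,
             cl_image2d_width * 4 /* RGBA */ * sizeof(float));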
diff --git a/lite/backends/opencl/cl_kernel/image/conv_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
similarity index 99%
rename from lite/backends/opencl/cl_kernel/image/conv_1x1_kernel.cl
rename to lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
index 361d55531e2157c5070eaf50e302265933a93f73..5dc264f7e5b0e276c37566393acc355d83c4fed7 100644
--- a/lite/backends/opencl/cl_kernel/image/conv_1x1_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
@@ -1,6 +1,6 @@
 #include <cl_common.h>
 
-__kernel void conv_1x1(
+__kernel void conv2d_1x1(
     __private const int global_size_dim0,
     __private const int global_size_dim1,
     __private const int global_size_dim2,
@@ -200,4 +200,4 @@ __kernel void conv_1x1(
   if (out_w3 < old_w) {
     write_imagef(output_image, output_pos3, output3);
   }
-}
\ No newline at end of file
+}
diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl
index ecf719ae9316ed14743e872a1c2cde4b254b35ff..a95c6c6897944c9c943f65b72e51a2ced94befa6 100644
--- a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <cl_common.h>
+
 __kernel void elementwise_add(__read_only image2d_t input,
                               __read_only image2d_t bias,
                               __write_only image2d_t outputImage) {
   int x = get_global_id(0);
   int y = get_global_id(1);
diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc
index c2504ab611e93399c70169f3f123d4a0514c07ad..0c7b2f8575a88082f6d79a5392c4468715a701b9 100644
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -103,6 +103,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
 bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
   std::string build_option = options + " -cl-fast-relaxed-math -I " +
                              CLRuntime::Global()->cl_path() + "/cl_kernel";
+  VLOG(4) << "OpenCL build_option: " << build_option;
   status_ = program->build({*device_}, build_option.c_str());
   CL_CHECK_ERROR(status_);
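Note on the build-option change: `-DCL_DTYPE=float` (a valued macro) becomes `-DCL_DTYPE_float` (a flag-style macro) everywhere in this patch. Presumably the flag is expanded into a concrete type once, inside the shared kernel header included above, rather than at every call site; a hedged sketch of that dispatch (only `CL_DTYPE_float` appears in this diff, the `half` branch is an assumed counterpart):

    // Sketch of kernel-side dtype dispatch for flag-style defines.
    #ifdef CL_DTYPE_float
    #define CL_DTYPE float
    #endif
    #ifdef CL_DTYPE_half  // assumption: half-precision counterpart
    #define CL_DTYPE half
    #endif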
diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt
index c1b9798fc660d7026e8f60400fa394ca6e9a131c..13f527a2200198569054f2314e190b958973004e 100644
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -40,9 +40,9 @@ lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc
     ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
 
 #TODO(ysh329): comment buffer-impl relu
-#lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
-#  DEPS relu_opencl op_registry program context
-#  ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
+  DEPS relu_opencl layout_opencl op_registry program context
+  ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
 
 lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
   DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter
diff --git a/lite/kernels/opencl/conv2d_1x1_compute.cc b/lite/kernels/opencl/conv2d_1x1_compute.cc
index e02b37b09610b99298ce505df19c922d65763f2f..3f313f542cbb22629b17db5dae197fe682d00f47 100644
--- a/lite/kernels/opencl/conv2d_1x1_compute.cc
+++ b/lite/kernels/opencl/conv2d_1x1_compute.cc
@@ -26,13 +26,13 @@ namespace kernels {
 namespace opencl {
 
 #define USE_BUFFER_FOR_CONV1x1_BIAS
-class Conv2d1x1Image2DCompute
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
+class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
+                                                  PRECISION(kFloat),
+                                                  DATALAYOUT(kImageDefault)> {
  public:
   using param_t = operators::ConvParam;
 
   void PrepareForRun() override {
-    LOG(INFO) << "PrepareForRun ...";
     const auto& param = *param_.get_mutable<param_t>();
     if (param.fuse_relu) {
       build_options_ += " -DRELU";
@@ -46,43 +38,38 @@ class Conv2d1x1Image2DCompute
     }
     auto& context = ctx_->As<OpenCLContext>();
     context.cl_context()->AddKernel(
-        kernel_func_name_, "image/conv_1x1_kernel.cl", build_options_);
-    LOG(INFO) << "PrepareForRun Ready";
+        kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
   }
 
   void Run() override {
-    LOG(INFO) << "opencl conv 1x1 run begin ...";
-    LOG(INFO) << "param_.get_mutable ...";
     const auto& param = *param_.get_mutable<param_t>();
-
-    LOG(INFO) << "get param dims ...";
     auto input_dims = param.x->dims();
-    CHECK_GE(input_dims.size(), 4);
-    LOG(INFO) << "input_dims: " << input_dims;
-
-    int input_width = input_dims[3];
-    int input_height = input_dims[2];
-
+    auto paddings = *param.paddings;
+    auto strides = param.strides;
+    auto* input_image = param.x->data<float, cl::Image2D>();
+    auto* filter_image = param.filter->data<float, cl::Image2D>();
     auto filter_dims = param.filter->dims();
-
-    LOG(INFO) << "filter_dims: " << filter_dims;
-
     auto output_dims = param.output->dims();
-    LOG(INFO) << "output_dims: " << output_dims;
-
+    int input_width = input_dims[3];
+    int input_height = input_dims[2];
     int output_width = output_dims[3];
     int output_height = output_dims[2];
-    // mute output image
     auto out_image_shape = InitImageDimInfoWith(output_dims);
-
-    LOG(INFO) << "out_image_shape: " << out_image_shape["width"] << ", "
-              << out_image_shape["height"];
-
     auto* out_image = param.output->mutable_data<float, cl::Image2D>(
         out_image_shape["width"], out_image_shape["height"]);
-    // gen default_work_size
+    const bool has_bias = param.bias != nullptr;
+    const bool is_element_wise_bias =
+        has_bias && param.output->dims() == param.bias->dims();
+    int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
+                 static_cast<int>(paddings[0]);
+
+    // calc input_c_block
+    auto input_image_shape = InitImageDimInfoWith(input_dims);
+    int input_c_block = input_image_shape["width"] / input_dims[3];
+    int input_c = input_dims[1];
+    auto dilations = *param.dilations;
     const std::vector<size_t>& default_work_size =
         DefaultWorkSize(output_dims,
@@ -93,28 +88,38 @@ class Conv2d1x1Image2DCompute
     int c_block = default_work_size[0];
     int w = default_work_size[1];
     int nh = default_work_size[2];
-    LOG(INFO) << "default work size: "
-              << "{" << c_block << ", " << w << ", " << nh << ""
-              << "}";
-
-    auto paddings = *param.paddings;
-    LOG(INFO) << "paddings: " << paddings[0] << "," << paddings[1];
-    auto strides = param.strides;
+    VLOG(4) << "============ conv2d_1x1 params ============";
+    VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
+            << input_image_shape["height"];
+    VLOG(4) << "input_c_block: " << input_c_block;
+    VLOG(4) << "input_c: " << input_c;
+    VLOG(4) << "input_image: " << input_image;
+    VLOG(4) << "filter_dims: " << filter_dims;
+    VLOG(4) << "filter_image: " << filter_image;
+    VLOG(4) << "output_dims: " << output_dims;
+    VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
+            << out_image_shape["height"];
+    VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
+    VLOG(4) << "has bias: " << has_bias;
+    VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
+    VLOG(4) << "strides: " << strides[0] << "," << strides[1];
+    VLOG(4) << "offset: " << offset;
+    VLOG(4) << "dilations.size : " << dilations.size();
+    VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
+    VLOG(4) << "default work size{c_block, w, nh}: "
+            << "{" << c_block << ", " << w << ", " << nh << ""
+            << "}";
 
-    auto& context = ctx_->As<OpenCLContext>();
-    CHECK(context.cl_context() != nullptr);
-    auto* input_image = param.x->data<float, cl::Image2D>();
-    auto* filter_image = param.filter->data<float, cl::Image2D>();
+    CHECK_GE(dilations.size(), 2);
+    CHECK(dilations[0] == dilations[1]);
+    CHECK_GE(input_dims.size(), 4);
+    CHECK_GE(paddings.size(), 2);
+    CHECK(paddings[0] == paddings[1]);
+    CHECK_GE(strides.size(), 2);
+    CHECK(strides[0] == strides[1]);
 
     // handle bias use buffer for channel wise , use image for element wise
-    const bool has_bias = param.bias != nullptr;
-    const bool is_element_wise_bias =
-        has_bias && param.output->dims() == param.bias->dims();
-
-    LOG(INFO) << "has bias: " << has_bias;
-    LOG(INFO) << "is_element_wise_bias : " << is_element_wise_bias;
-    LOG(INFO) << "get kernel ...";
     const cl::Buffer* bias_buf = nullptr;
     const cl::Image2D* bias_image = nullptr;
     if (has_bias) {
@@ -126,48 +131,37 @@ class Conv2d1x1Image2DCompute
       bias_image = param.bias->data<float, cl::Image2D>();
 #endif
     }
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
     STL::stringstream kernel_key;
     kernel_key << kernel_func_name_ << build_options_;
-
-    LOG(INFO) << "kernel_key: " << kernel_key.str();
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+    int maped_w = maptofactor(w, 4);
+
+    VLOG(4) << "kernel_key: " << kernel_key.str();
+    VLOG(4) << "kernel ready ... " << kernel_key.str();
+    VLOG(4) << "maped_w: " << maped_w;
 
-    LOG(INFO) << "kernel ready ... " << kernel_key.str();
     cl_int status;
-    auto numel = output_dims.production();
     int arg_idx = 0;
-
     status = kernel.setArg(arg_idx, c_block);
     CL_CHECK_FATAL(status);
-
-    int maped_w = maptofactor(w, 4);
-    LOG(INFO) << "maped_w: " << maped_w;
-
     status = kernel.setArg(++arg_idx, maped_w);
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, nh);
-    LOG(INFO) << "nh: " << nh;
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, *input_image);
-    LOG(INFO) << "input_image: ";
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, *filter_image);
-    LOG(INFO) << "filter_image: ";
     CL_CHECK_FATAL(status);
-
     if (has_bias) {
 #ifndef USE_BUFFER_FOR_CONV1x1_BIAS
       if (is_element_wise_bias != 0) {
-        LOG(INFO) << "set bias_image: ";
+        VLOG(4) << "set bias_image: ";
         status = kernel.setArg(++arg_idx, *bias_image);
       } else {
-        LOG(INFO) << "set bias_buf: ";
+        VLOG(4) << "set bias_buf: ";
         status = kernel.setArg(++arg_idx, *bias_buf);
       }
 #else
@@ -175,77 +169,38 @@ class Conv2d1x1Image2DCompute
 #endif
       CL_CHECK_FATAL(status);
     }
-
     status = kernel.setArg(++arg_idx, *out_image);
-    LOG(INFO) << "out_image: ";
-    CL_CHECK_FATAL(status);
-
-    CHECK_GE(strides.size(), 2);
-    CHECK(strides[0] == strides[1]);
+    CL_CHECK_FATAL(status);
     status = kernel.setArg(++arg_idx, strides[0]);
-    LOG(INFO) << "strides: " << strides[0] << "," << strides[1];
-    CL_CHECK_FATAL(status);
-
-    CHECK_GE(paddings.size(), 2);
-    CHECK(paddings[0] == paddings[1]);
-
-    int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
-                 static_cast<int>(paddings[0]);
-    LOG(INFO) << "offset: " << offset;
-
+    CL_CHECK_FATAL(status);
     status = kernel.setArg(++arg_idx, offset);
     CL_CHECK_FATAL(status);
-
-    // calc input_c_block
-    auto input_image_shape = InitImageDimInfoWith(input_dims);
-    LOG(INFO) << "input_image_shape: " << input_image_shape["width"] << ","
-              << input_image_shape["height"];
-
-    int input_c_block = input_image_shape["width"] / input_dims[3];
-    LOG(INFO) << "input_c_block: " << input_c_block;
-
     status = kernel.setArg(++arg_idx, input_c_block);
     CL_CHECK_FATAL(status);
-
-    int input_c = input_dims[1];
-    LOG(INFO) << "input_c: " << input_c;
-
     status = kernel.setArg(++arg_idx, input_c);
     CL_CHECK_FATAL(status);
-
-    auto dilations = *param.dilations;
-    LOG(INFO) << "dilations.size : " << dilations.size();
-    LOG(INFO) << "dilations: " << dilations[0] << ", " << dilations[1];
-
-    CHECK_GE(dilations.size(), 2);
-    CHECK(dilations[0] == dilations[1]);
     status = kernel.setArg(++arg_idx, dilations[0]);
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, input_width);
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, input_height);
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, output_width);
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, output_height);
     CL_CHECK_FATAL(status);
-
     status = kernel.setArg(++arg_idx, w);
     CL_CHECK_FATAL(status);
 
-    // clac gloabl_work_size
     auto global_work_size =
         cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                     static_cast<size_t>(maped_w),
                     static_cast<size_t>(default_work_size.data()[2])};
-    LOG(INFO) << "global_work_size :" << global_work_size;
+    VLOG(4) << "out_image: " << out_image;
+    VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
+            << global_work_size[1] << "," << global_work_size[2] << "}";
 
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
@@ -259,8 +214,8 @@ class Conv2d1x1Image2DCompute
   }
 
  private:
-  std::string kernel_func_name_{"conv_1x1"};
-  std::string build_options_{"-DCL_DTYPE=float "};
+  std::string kernel_func_name_{"conv2d_1x1"};
+  std::string build_options_{"-DCL_DTYPE_float"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
 
@@ -272,7 +227,7 @@ class Conv2d1x1Image2DCompute
 REGISTER_LITE_KERNEL(conv2d_1x1,
                      kOpenCL,
                      kFloat,
-                     kNHWC,
+                     kImageDefault,
                      paddle::lite::kernels::opencl::Conv2d1x1Image2DCompute,
                      image2d)
     .BindInput("Input",
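Note on `maped_w`: the `conv2d_1x1` kernel writes four horizontally adjacent output pixels per work-item (`output0..output3`, `output_pos0..output_pos3` in the renamed `.cl` file), so the global work size along W shrinks to a quarter, rounded up. A hedged sketch of the helper this relies on (`maptofactor` comes from `image_helper.h`; the body shown is the obvious ceil-division, an assumption rather than a line from this diff):

    // Sketch: ceil(v / factor), used as maptofactor(w, 4) above.
    inline int maptofactor(int v, int factor) {
      return (v + factor - 1) / factor;
    }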
diff --git a/lite/kernels/opencl/conv2d_1x1_compute_test.cc b/lite/kernels/opencl/conv2d_1x1_compute_test.cc
index 591e9ad795b96c832a5b169570b4773646276695..c35e73449277aad3de3b3112624f0b1b0d26a4ae 100644
--- a/lite/kernels/opencl/conv2d_1x1_compute_test.cc
+++ b/lite/kernels/opencl/conv2d_1x1_compute_test.cc
@@ -128,8 +128,10 @@ TEST(conv2d_1x1, compute) {
   const int ow = iw;
 
   LOG(INFO) << "to get kernel ...";
-  auto kernels = KernelRegistry::Global().Create(
-      "conv2d_1x1", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC));
+  auto kernels = KernelRegistry::Global().Create("conv2d_1x1",
+                                                 TARGET(kOpenCL),
+                                                 PRECISION(kFloat),
+                                                 DATALAYOUT(kImageDefault));
   ASSERT_FALSE(kernels.empty());
 
   auto kernel = std::move(kernels.front());
@@ -373,4 +375,4 @@ TEST(conv2d_1x1, compute) {
 }  // namespace lite
 }  // namespace paddle
 
-USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kNHWC, image2d);
+USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kImageDefault, image2d);
diff --git a/lite/kernels/opencl/elementwise_add_compute.h b/lite/kernels/opencl/elementwise_add_compute.h
index 2f41dfaa2b2f88977a5f56b3c33b556fb06c9125..bd0398ca3f286abca369910a649947d112b40b9a 100644
--- a/lite/kernels/opencl/elementwise_add_compute.h
+++ b/lite/kernels/opencl/elementwise_add_compute.h
@@ -41,7 +41,7 @@ class ElementwiseAddCompute
   size_t num_{1};
   param_t* ele_param_{nullptr};
   std::string kernel_func_name_{"elementwise_add"};
-  std::string build_options_{"-DCL_DTYPE=float"};
+  std::string build_options_{"-DCL_DTYPE_float"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
 
diff --git a/lite/kernels/opencl/layout_compute.cc b/lite/kernels/opencl/layout_compute.cc
index e2e1530ba62010fdb930ccdf852cf2fc2ebc39a5..f3393fcaa14f4eeae1672eaaf2973efe6b5c1de5 100644
--- a/lite/kernels/opencl/layout_compute.cc
+++ b/lite/kernels/opencl/layout_compute.cc
@@ -28,8 +28,11 @@ namespace lite {
 namespace kernels {
 namespace opencl {
 
-class LayoutComputeBufferChwToImage2DHwc
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC)> {
+// [NCHW] -> [ImageDefault]
+class LayoutComputeBufferChwToImageDefault
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kAny),
+                        DATALAYOUT(kImageDefault)> {
  public:
   using param_t = operators::LayoutParam;
 
@@ -117,7 +120,8 @@ class LayoutComputeBufferChwToImage2DHwc
   }
 
   std::string doc() const override {
-    return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(RGBA)";
+    return "Trans Layout from cl::Buffer(NCHW) to "
+           "cl::Image2D(ImageDefault/RGBA)";
   }
 
  private:
@@ -126,11 +130,9 @@ class LayoutComputeBufferChwToImage2DHwc
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
 
-// buffer chw 2 image2d nw
-class LayoutComputeBufferChwToImage2DNw
-    : public KernelLite<TARGET(kOpenCL),
-                        PRECISION(kFloat),
-                        DATALAYOUT(kImageNW)> {
+// [ImageDefault] -> [NCHW]
+class LayoutComputeImageDefaultToBufferChw
+    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
  public:
   using param_t = operators::LayoutParam;
 
@@ -142,31 +144,29 @@ class LayoutComputeBufferChwToImage2DNw
 
   void Run() override {
     auto& param = Param<param_t>();
-    auto* x_data = param.x->data<float, cl::Buffer>();
-    auto x_dims = param.x->dims();
-
-    CHECK(x_dims.size() == 4) << " Tensor dim is not 4.";
-    size_t image_width = x_dims[3] * ((x_dims[0] + 3) / 4);
-    size_t image_height = x_dims[1] * x_dims[2];
-
-    auto* y_data =
-        param.y->mutable_data<float, cl::Image2D>(image_width, image_height);
+    auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
     auto y_dims = param.y->dims();
+    auto* x_data = param.x->data<float, cl::Image2D>();
+    auto x_dims = param.x->dims();
 
-    // out info
     std::vector<size_t> new_dims = {1, 1, 1, 1};
-    for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
-      new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
+    for (int j = 0; j < x_dims.size(); ++j) {
+      new_dims[4 - x_dims.size() + j] = x_dims[j];
     }
 
-    const int out_N = new_dims[0];
-    const int out_C = new_dims[1];
-    const int out_H = new_dims[2];
-    const int out_W = new_dims[3];
+    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
+            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
+    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
+            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
+    VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
+            << new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
 
-    const int Stride2 = out_C * out_H * out_W;
-    const int Stride1 = out_H * out_W;
-    const int Stride0 = out_W;
+    size_t C = new_dims[1];
+    size_t in_height = new_dims[2];
+    size_t in_width = new_dims[3];
+    int size_ch = in_height * in_width;
+    int size_block = size_ch * 4;
+    int size_batch = size_ch * C;
 
     auto& context = ctx_->As<OpenCLContext>();
     CHECK(context.cl_context() != nullptr);
@@ -177,27 +177,26 @@ class LayoutComputeBufferChwToImage2DNw
     int arg_idx = 0;
     cl_int status = kernel.setArg(arg_idx, *x_data);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *y_data);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
+    status = kernel.setArg(++arg_idx, *y_data);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(out_N));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(C));
     CL_CHECK_FATAL(status);
-
-    VLOG(4) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " "
-            << (out_C * out_H);
+    VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
+            << " " << (new_dims[0] * new_dims[2]);
     auto global_work_size =
-        cl::NDRange{static_cast<size_t>((out_N + 3) / 4),  // N blocks
-                    static_cast<size_t>(out_W),            // w
-                    static_cast<size_t>(out_C * out_H)};   // ch
+        cl::NDRange{static_cast<size_t>((new_dims[1] + 3) / 4),
+                    static_cast<size_t>(new_dims[3]),
+                    static_cast<size_t>(new_dims[0] * new_dims[2])};
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
@@ -209,21 +208,24 @@ class LayoutComputeBufferChwToImage2DNw
     // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
     // context.cl_wait_list()->emplace(y_data, event_);
     context.cl_context()->GetCommandQueue().finish();
-    // auto image_shape = InitImageDimInfoWith(x_dims);
   }
 
   std::string doc() const override {
-    return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(CLNW)";
+    return "Trans Layout from cl::Image2D(ImageDefault/RGBA) to "
+           "cl::Buffer(NCHW)";
   }
 
  private:
-  std::string kernel_func_name_{"buffer_to_image2d_nw"};
-  std::string build_options_{"-DCL_DTYPE_float "};
+  std::string kernel_func_name_{"image2d_to_buffer"};
+  std::string build_options_{"-DCL_DTYPE_float"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
 
-class LayoutComputeImage2DHwcToBufferChw
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
+// [NCHW] -> [ImageNW]
+class LayoutComputeBufferChwToImage2DNw
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kFloat),
+                        DATALAYOUT(kImageNW)> {
  public:
   using param_t = operators::LayoutParam;
 
@@ -235,29 +237,31 @@ class LayoutComputeImage2DHwcToBufferChw
 
   void Run() override {
     auto& param = Param<param_t>();
-    auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-    auto y_dims = param.y->dims();
-    auto* x_data = param.x->data<float, cl::Image2D>();
+    auto* x_data = param.x->data<float, cl::Buffer>();
     auto x_dims = param.x->dims();
 
+    CHECK(x_dims.size() == 4) << " Tensor dim is not 4.";
+    size_t image_width = x_dims[3] * ((x_dims[0] + 3) / 4);
+    size_t image_height = x_dims[1] * x_dims[2];
+
+    auto* y_data =
+        param.y->mutable_data<float, cl::Image2D>(image_width, image_height);
+    auto y_dims = param.y->dims();
+
+    // out info
     std::vector<size_t> new_dims = {1, 1, 1, 1};
-    for (int j = 0; j < x_dims.size(); ++j) {
-      new_dims[4 - x_dims.size() + j] = x_dims[j];
+    for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
+      new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
     }
 
-    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
-            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
-    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
-            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
-    VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
-            << new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
+    const int out_N = new_dims[0];
+    const int out_C = new_dims[1];
+    const int out_H = new_dims[2];
+    const int out_W = new_dims[3];
 
-    size_t C = new_dims[1];
-    size_t in_height = new_dims[2];
-    size_t in_width = new_dims[3];
-    int size_ch = in_height * in_width;
-    int size_block = size_ch * 4;
-    int size_batch = size_ch * C;
+    const int Stride2 = out_C * out_H * out_W;
+    const int Stride1 = out_H * out_W;
+    const int Stride0 = out_W;
 
     auto& context = ctx_->As<OpenCLContext>();
     CHECK(context.cl_context() != nullptr);
@@ -268,26 +272,27 @@ class LayoutComputeImage2DHwcToBufferChw
     int arg_idx = 0;
     cl_int status = kernel.setArg(arg_idx, *x_data);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
+    status = kernel.setArg(++arg_idx, *y_data);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *y_data);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_N));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(C));
+    status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
     CL_CHECK_FATAL(status);
-    VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
-            << " " << (new_dims[0] * new_dims[2]);
+
+    VLOG(4) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " "
+            << (out_C * out_H);
     auto global_work_size =
-        cl::NDRange{static_cast<size_t>((new_dims[1] + 3) / 4),
-                    static_cast<size_t>(new_dims[3]),
-                    static_cast<size_t>(new_dims[0] * new_dims[2])};
+        cl::NDRange{static_cast<size_t>((out_N + 3) / 4),  // N blocks
+                    static_cast<size_t>(out_W),            // w
+                    static_cast<size_t>(out_C * out_H)};   // ch
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
@@ -299,15 +304,16 @@ class LayoutComputeImage2DHwcToBufferChw
     // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
     // context.cl_wait_list()->emplace(y_data, event_);
     context.cl_context()->GetCommandQueue().finish();
+    // auto image_shape = InitImageDimInfoWith(x_dims);
   }
 
   std::string doc() const override {
-    return "Trans Layout from cl::Image2D(RGBA) to cl::Buffer(NCHW)";
+    return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(ImageDW/CLNW)";
   }
 
  private:
-  std::string kernel_func_name_{"image2d_to_buffer"};
-  std::string build_options_{"-DCL_DTYPE_float"};
+  std::string kernel_func_name_{"buffer_to_image2d_nw"};
+  std::string build_options_{"-DCL_DTYPE_float "};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
 
@@ -316,15 +322,14 @@ class LayoutComputeImage2DHwcToBufferChw
 }  // namespace lite
 }  // namespace paddle
 
-// BufferChwToImage2DHwc
-// [chw] -> [hwc]
+// [NCHW] -> [ImageDefault]
 REGISTER_LITE_KERNEL(
     layout,
     kOpenCL,
     kAny,
-    kNHWC,
-    paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
-    buffer_chw_to_image2d_hwc_opencl_fp32)
+    kImageDefault,
+    paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
+    NCHW_to_ImageDefault)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
@@ -332,17 +337,16 @@ REGISTER_LITE_KERNEL(
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
-                                      DATALAYOUT(kNHWC))})
+                                      DATALAYOUT(kImageDefault))})
    .Finalize();
 
-// [chw] -> [hwc]
 REGISTER_LITE_KERNEL(
     layout_once,
     kOpenCL,
     kAny,
-    kNHWC,
-    paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
-    buffer_chw_to_image2d_hwc_opencl_fp32)
+    kImageDefault,
+    paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
+    NCHW_to_ImageDefault)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
@@ -350,54 +354,52 @@ REGISTER_LITE_KERNEL(
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
-                                      DATALAYOUT(kNHWC))})
+                                      DATALAYOUT(kImageDefault))})
    .Finalize();
 
-// Image2DHwcBufferChw
-// [hwc] -> [chw]
+// [ImageDefault] -> [NCHW]
 REGISTER_LITE_KERNEL(
     layout,
     kOpenCL,
     kAny,
     kNCHW,
-    paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
-    image2d_hwc_to_buffer_chw_opencl_fp32)
+    paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
+    ImageDefault_to_NCHW)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
-                                     DATALAYOUT(kNHWC))})
+                                     DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
 
-// [hwc] -> [chw]
 REGISTER_LITE_KERNEL(
     layout_once,
     kOpenCL,
     kAny,
     kNCHW,
-    paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
-    image2d_hwc_to_buffer_chw_opencl_fp32)
+    paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
+    ImageDefault_to_NCHW)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kAny),
-                                     DATALAYOUT(kNHWC))})
+                                     DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kAny),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
 
-// [hwc] -> [chw]
+// [NCHW] -> [ImageNW]
 REGISTER_LITE_KERNEL(
     layout_once,
     kOpenCL,
     kFloat,
     kImageNW,
     paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DNw,
-    buffer_chw_to_image2d_nw_opencl_fp32)
+    NCHW_to_ImageNW)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
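Note on the three layouts this file now converts between. The `ImageNW` math is explicit in the code above (`image_width = W * ceil(N/4)`, `image_height = C * H`); the `ImageDefault` shape comes from `InitImageDimInfoWith` and, by the same RGBA packing logic, should be `width = ceil(C/4) * W`, `height = N * H`. That second pair is hedged, since the helper's body is not part of this diff:

    // Sketch: 2D image extents for an NCHW tensor, 4 values per RGBA texel.
    size_t image_default_w = ((C + 3) / 4) * W, image_default_h = N * H;
    size_t image_nw_w      = ((N + 3) / 4) * W, image_nw_h      = C * H;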
diff --git a/lite/kernels/opencl/layout_compute_test.cc b/lite/kernels/opencl/layout_compute_test.cc
index 3968e23d6be10bf050bcfc478d278398bf16fd7e..34fdce98251f6056cb4c5ff23393b7b6f12fe89f 100644
--- a/lite/kernels/opencl/layout_compute_test.cc
+++ b/lite/kernels/opencl/layout_compute_test.cc
@@ -24,7 +24,7 @@ namespace lite {
 // #define LOOP_TEST
 // #define PRINT_RESULT
 
-TEST(layout, compute) {
+TEST(layout_ImageDefault, compute) {
   LOG(INFO) << "main steps of test: host -> layout(buf2img) -> layout(img2buf) "
                "-> device";
 
@@ -43,8 +43,11 @@ TEST(layout, compute) {
           LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
                     << h << " " << w << " ========";
           // set layout kernels
-          auto buf_to_img_kernels = KernelRegistry::Global().Create(
-              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC));
+          auto buf_to_img_kernels =
+              KernelRegistry::Global().Create("layout",
+                                              TARGET(kOpenCL),
+                                              PRECISION(kAny),
+                                              DATALAYOUT(kImageDefault));
           auto img_to_buf_kernels = KernelRegistry::Global().Create(
               "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
           ASSERT_FALSE(buf_to_img_kernels.empty());
@@ -144,7 +147,8 @@ TEST(layout, compute) {
 // nothing to do.
 #endif
 }
-TEST(layout, compute_buffer2image2dnw) {
+
+TEST(layout_ImageNW, compute) {
 #ifdef LOOP_TEST
   for (int n = 1; n <= 100; n += 21) {
     for (auto c : {1, 3}) {
@@ -282,12 +286,6 @@ TEST(layout, compute_buffer2image2dnw) {
 }  // namespace lite
 }  // namespace paddle
 
-USE_LITE_KERNEL(
-    layout, kOpenCL, kAny, kNHWC, buffer_chw_to_image2d_hwc_opencl_fp32);
-USE_LITE_KERNEL(
-    layout, kOpenCL, kAny, kNCHW, image2d_hwc_to_buffer_chw_opencl_fp32);
-USE_LITE_KERNEL(layout_once,
-                kOpenCL,
-                kFloat,
-                kImageNW,
-                buffer_chw_to_image2d_nw_opencl_fp32);
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
+USE_LITE_KERNEL(layout_once, kOpenCL, kFloat, kImageNW, NCHW_to_ImageNW);
diff --git a/lite/kernels/opencl/relu_compute.cc b/lite/kernels/opencl/relu_compute.cc
index c7b89c939b0bf571f27ac1dfdd272a9324f8e89f..addf628bb0f0bd5c5031f2a8ee19ba167d7032ac 100644
--- a/lite/kernels/opencl/relu_compute.cc
+++ b/lite/kernels/opencl/relu_compute.cc
@@ -29,6 +29,7 @@ class ReluCompute
  public:
   using param_t = operators::ActivationParam;
 
+  std::string doc() const override { return "Relu using cl::Buffer"; }
   void PrepareForRun() override {
     auto& context = ctx_->As<OpenCLContext>();
     context.cl_context()->AddKernel(
@@ -72,15 +73,21 @@ class ReluCompute
 
  private:
   std::string kernel_func_name_{"relu"};
-  std::string build_options_{"-DCL_DTYPE=float -DRELU"};
+  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
 
-class ReluComputeFloatImage
-    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
+class ReluComputeFloatImageDefault
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kFloat),
+                        DATALAYOUT(kImageDefault)> {
  public:
   using param_t = operators::ActivationParam;
 
+  std::string doc() const override {
+    return "Relu using cl::Image2D(ImageDefault/RGBA)";
+  }
+
   void PrepareForRun() override {
     auto& context = ctx_->As<OpenCLContext>();
     context.cl_context()->AddKernel(
@@ -154,18 +161,19 @@ class ReluComputeFloatImage
 //     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
 //     .Finalize();
 
-REGISTER_LITE_KERNEL(relu,
-                     kOpenCL,
-                     kFloat,
-                     kNHWC,
-                     paddle::lite::kernels::opencl::ReluComputeFloatImage,
-                     image2d)
+REGISTER_LITE_KERNEL(
+    relu,
+    kOpenCL,
+    kFloat,
+    kImageDefault,
+    paddle::lite::kernels::opencl::ReluComputeFloatImageDefault,
+    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
-                                     DATALAYOUT(kNHWC))})
+                                     DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
-                                      DATALAYOUT(kNHWC))})
+                                      DATALAYOUT(kImageDefault))})
    .Finalize();
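Note on the test plumbing in the rewritten relu test below: one `KernelContext` is initialized once, and its OpenCL state is copied into a per-kernel context for each of the three chained kernels (layout buf2img, relu, layout img2buf). A condensed sketch of that pattern, with names taken from the test code that follows (`per_kernel_ctx` is a placeholder):

    // Sketch: share one OpenCL runtime across several kernels under test.
    std::unique_ptr<KernelContext> context(new KernelContext);
    context->As<OpenCLContext>().InitOnce();
    std::unique_ptr<KernelContext> per_kernel_ctx(new KernelContext);
    context->As<OpenCLContext>().CopySharedTo(
        &(per_kernel_ctx->As<OpenCLContext>()));
    kernel->SetContext(std::move(per_kernel_ctx));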
diff --git a/lite/kernels/opencl/relu_compute_test.cc b/lite/kernels/opencl/relu_compute_test.cc
index d2f0812bae608324a8ab31756981271fe1c334e4..45a60f81e11d2f1e3cdac3e58b950368d90aeb80 100644
--- a/lite/kernels/opencl/relu_compute_test.cc
+++ b/lite/kernels/opencl/relu_compute_test.cc
@@ -17,6 +17,7 @@
 #include "lite/backends/opencl/target_wrapper.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
+#include "lite/kernels/opencl/image_helper.h"
 
 namespace paddle {
 namespace lite {
@@ -28,7 +29,8 @@ void relu_compute_ref(const dtype *x_data,
                       const DDim &x_dim,
                       dtype *out_data) {
   for (int i = 0; i < x_dim.production(); ++i) {
     out_data[i] = (x_data[i] > 0) ? x_data[i] : 0;
   }
 }
 
-TEST(opencl_relu, compute) {
+#if 0  // relu_buffer
+TEST(opencl_relu_buffer, compute) {
   // prepare data
   const DDim x_dim = DDim(std::vector<int64_t>{3, 6, 10, 10});
   lite::Tensor x, out;
@@ -87,8 +89,171 @@ TEST(opencl_relu, compute) {
   TargetWrapperCL::Unmap(out_data, mapped_out);
   TargetWrapperCL::Unmap(x_data, mapped_x);
 }
+#endif  // relu_buffer
+
+// #define LOOP_TEST
+// #define PRINT_RESULT
+TEST(relu_image2d, compute) {
+  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
+               "layout(img2buf) "
+               "-> host";
+
+#ifdef LOOP_TEST
+  for (int n = 1; n <= 100; n += 33) {
+    for (auto c : {1, 3}) {
+      for (int h = 12; h <= 100; h += 13) {
+        for (int w = 12; w <= 100; w += 25) {
+#else
+  const int n = 1;
+  const int c = 2;
+  const int h = 3;
+  const int w = 4;
+#endif  // LOOP_TEST
+
+          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
+                    << h << " " << w << " ========";
+          // set layout kernels
+          auto buf_to_img_kernels =
+              KernelRegistry::Global().Create("layout",
+                                              TARGET(kOpenCL),
+                                              PRECISION(kAny),
+                                              DATALAYOUT(kImageDefault));
+          auto img_to_buf_kernels = KernelRegistry::Global().Create(
+              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
+          auto relu_img_kernels =
+              KernelRegistry::Global().Create("relu",
+                                              TARGET(kOpenCL),
+                                              PRECISION(kFloat),
+                                              DATALAYOUT(kImageDefault));
+          ASSERT_FALSE(buf_to_img_kernels.empty());
+          ASSERT_FALSE(buf_to_img_kernels.empty());
+          ASSERT_FALSE(relu_img_kernels.empty());
+
+          auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
+          auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
+          auto relu_img_kernel = std::move(relu_img_kernels.front());
+          LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
+          LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
+          LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();
+
+          // set tensors about op param
+          LOG(INFO) << "set tensors about op param";
+          // layout(buf->img): x -> relu_in
+          // relu(img): relu_in -> relu_out
+          // layout(img->buf): relu_out -> y
+          lite::Tensor x, y, relu_in, relu_out, y_ref;
+          operators::LayoutParam BufferToImageParam;
+          operators::LayoutParam ImageToBufferParam;
+          BufferToImageParam.x = &x;
+          BufferToImageParam.y = &relu_in;
+          ImageToBufferParam.x = &relu_out;
+          ImageToBufferParam.y = &y;
+          operators::ActivationParam ReluParam;
+          ReluParam.X = &relu_in;
+          ReluParam.Out = &relu_out;
+
+          const DDim x_dim = DDim(std::vector<int64_t>{n, c, h, w});
+          x.Resize(x_dim);
+          y.Resize(x_dim);
+          relu_in.Resize(x_dim);
+          relu_out.Resize(x_dim);
+          y_ref.Resize(x_dim);
+          auto relu_image2d_shape =
+              paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
+
+          // initialize tensors
+          LOG(INFO) << "initialize tensors";
+          auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+          auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+          auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
+          auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
+              x_data, 0, sizeof(float) * x_dim.production()));
+          auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
+              y_data, 0, sizeof(float) * x_dim.production()));
+          for (int i = 0; i < x_dim.production(); ++i) {
+            mapped_x[i] = static_cast<float>(i) - x_dim.production() / 2;
+            mapped_y[i] = static_cast<float>(0);
+          }
+          auto *relu_in_data = relu_in.mutable_data<float, cl::Image2D>(
+              relu_image2d_shape["width"], relu_image2d_shape["height"]);
+          auto *relu_out_data = relu_out.mutable_data<float, cl::Image2D>(
+              relu_image2d_shape["width"], relu_image2d_shape["height"]);
+
+          // set context and kernel args
+          LOG(INFO) << "set context and kernel args";
+          std::unique_ptr<KernelContext> context(new KernelContext);
+          context->As<OpenCLContext>().InitOnce();
+
+          buf_to_img_kernel->SetParam(BufferToImageParam);
+          std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
+          context->As<OpenCLContext>().CopySharedTo(
+              &(buf_to_img_context->As<OpenCLContext>()));
+          buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
+
+          img_to_buf_kernel->SetParam(ImageToBufferParam);
+          std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
+          context->As<OpenCLContext>().CopySharedTo(
+              &(img_to_buf_context->As<OpenCLContext>()));
+          img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
+
+          relu_img_kernel->SetParam(ReluParam);
+          std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
+          context->As<OpenCLContext>().CopySharedTo(
+              &(relu_img_context->As<OpenCLContext>()));
+          relu_img_kernel->SetContext(std::move(relu_img_context));
+
+          // run kernels
+          LOG(INFO) << "run kernel: buf_to_img_kernel";
+          buf_to_img_kernel->Launch();
+          LOG(INFO) << "run kernel: relu_img_kernel";
+          relu_img_kernel->Launch();
+          LOG(INFO) << "run kernel: img_to_buf_kernel";
+          img_to_buf_kernel->Launch();
+
+          // compute ref cpu
+          relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
+// result
+#ifdef PRINT_RESULT
+          LOG(INFO) << "---- print kernel result (input -> output) ----";
+          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
+            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
+                      << std::endl;
+          }
+#endif  // PRINT_RESULT
+
+          // check result: compare kernel output and cpu output(y_data_ref)
+          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
+            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
+            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
+              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
+                        << " / " << x_dim.production() << ", y_data_ref["
+                        << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
+                        << eidx << "]:" << mapped_y[eidx];
+              break;
+            }
+          }
+
+          // free
+          LOG(INFO) << "free: unmap x, y";
+          TargetWrapperCL::Unmap(x_data, mapped_x);
+          TargetWrapperCL::Unmap(y_data, mapped_y);
+#ifdef LOOP_TEST
+        }  // w
+      }    // h
+    }      // c
+  }        // n
+#else
+// nothing to do.
+#endif
+}
 
 }  // namespace lite
 }  // namespace paddle
 
-USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
+// relu buffer
+// USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
+
+// relu image2d
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
+USE_LITE_KERNEL(relu, kOpenCL, kFloat, kImageDefault, ImageDefault);
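Closing note: every `USE_LITE_KERNEL(op, target, precision, layout, alias)` above must match a `REGISTER_LITE_KERNEL` from this patch, alias included, which is why the aliases were renamed in lockstep (`NCHW_to_ImageDefault`, `ImageDefault_to_NCHW`, `NCHW_to_ImageNW`, `ImageDefault`). For reference, the CPU oracle the image2d test checks against boils down to the following (a restatement of `relu_compute_ref`, not new behavior):

    // Sketch: elementwise ReLU reference on the host.
    template <typename dtype>
    void relu_ref(const dtype *x, int64_t n, dtype *out) {
      for (int64_t i = 0; i < n; ++i) out[i] = (x[i] > 0) ? x[i] : 0;
    }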