Commit cc46d43c authored by Yuan Shuai, committed by GitHub

[LITE][OPENCL] Add relu image2d kernel unit test, Fix conv2d_1x1, relu, layout using new Image2D Layout (#2564)

* add 3 layouts for opencl image (the ImageDefault/ImageNW sizing is sketched just below). test=develop

* add relu image2d test. test=develop
Parent cff0c364
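For context (this note is not part of the diff): the "Image2D Layout" in the title packs an NCHW tensor into a 2-D RGBA image. Below is a minimal sketch of the width/height bookkeeping, mirroring how `InitImageDimInfoWith` is used by the kernels in this diff and the ImageNW sizing in layout_compute.cc; the helper names are illustrative, not the library's API:

```cpp
#include <cstddef>
#include <iostream>

struct Image2DShape { size_t width; size_t height; };

// ImageDefault: 4 channels per RGBA pixel, so width = W * ceil(C/4), height = N * H.
// (Assumption based on how InitImageDimInfoWith is used in this diff.)
Image2DShape ImageDefaultShape(size_t n, size_t c, size_t h, size_t w) {
  return {w * ((c + 3) / 4), n * h};
}

// ImageNW: 4 batch entries per pixel, so width = W * ceil(N/4), height = C * H,
// matching the sizing done on the buffer_to_image2d_nw path below.
Image2DShape ImageNwShape(size_t n, size_t c, size_t h, size_t w) {
  return {w * ((n + 3) / 4), c * h};
}

int main() {
  Image2DShape d = ImageDefaultShape(1, 16, 256, 512);
  Image2DShape nw = ImageNwShape(4, 1024, 7, 7);
  std::cout << d.width << "x" << d.height << std::endl;    // 2048x256
  std::cout << nw.width << "x" << nw.height << std::endl;  // 7x7168
  return 0;
}
```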
......@@ -11,8 +11,8 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper
ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper
ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
add_dependencies(cl_wrapper opencl_clhpp)
......@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle {
namespace lite {
static void CopyImageData(CLContext* context,
const CLImage& cl_image,
float* out) {
......@@ -51,119 +52,5 @@ bool InitOpenCLRuntime(std::string cl_path) {
return runtime->IsInitSuccess();
}
void elementwise_add(CLContext* context,
const float* in,
const DDim& in_dim,
const float* bias,
const DDim& bias_dim,
float* out,
const DDim& out_dim) {
if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
LOG(FATAL) << "Error: bias dims is error";
return;
}
auto kernel = bias_dim.size() == 1 ? context->GetKernel("channel_add")
: context->GetKernel("elementwise_add");
CLImage in_image;
in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(context->GetContext());
VLOG(3) << " --- Inpu image: " << in_image << " --- ";
CLImage bias_image;
bias_image.set_tensor_data(bias, bias_dim);
bias_image.InitCLImage(context->GetContext());
VLOG(3) << " --- Bias image: " << bias_image << " --- ";
CLImage out_image;
out_image.InitEmptyImage(context->GetContext(), out_dim);
cl_int status;
status = kernel.setArg(0, *in_image.cl_image());
CL_CHECK_FATAL(status);
status = kernel.setArg(1, *bias_image.cl_image());
CL_CHECK_FATAL(status);
status = kernel.setArg(2, *out_image.cl_image());
CL_CHECK_FATAL(status);
if (bias_dim.size() == 1) {
int tensor_w = in_dim[3];
status = kernel.setArg(3, tensor_w);
CL_CHECK_FATAL(status);
}
size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height};
status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status);
VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(context, out_image, out);
}
void pool(CLContext* context,
const std::string pooling_type,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int ksize_h,
const int ksize_w,
const float* in,
const DDim& in_dim,
float* out,
const DDim& out_dim) {
auto kernel =
context->GetKernel(string_format("pool_%s", pooling_type.c_str()));
CLImage in_image;
in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(context->GetContext());
VLOG(3) << " --- Inpu image: " << in_image << " --- ";
CLImage out_image;
out_image.InitEmptyImage(context->GetContext(), out_dim);
auto global_work_size = context->DefaultWorkSize(out_image);
auto* in_converter =
dynamic_cast<CLImageConverterNormal*>(in_image.image_converter());
auto* out_converter =
dynamic_cast<CLImageConverterNormal*>(out_image.image_converter());
const int in_height = in_converter->HeightOfOneBlock();
const int in_width = in_converter->WidthOfOneBlock();
const int out_height = out_converter->HeightOfOneBlock();
const int out_width = out_converter->WidthOfOneBlock();
cl_int status;
status = kernel.setArg(0, in_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, in_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, out_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, out_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(4, pad_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(5, pad_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(6, stride_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(7, stride_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(8, ksize_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(9, ksize_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(10, *in_image.cl_image());
CL_CHECK_FATAL(status);
status = kernel.setArg(11, *out_image.cl_image());
CL_CHECK_FATAL(status);
status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status);
VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(context, out_image, out);
}
} // namespace lite
} // namespace paddle
......@@ -23,30 +23,5 @@ namespace lite {
bool InitOpenCLRuntime(std::string cl_path);
/// An elementwise_add method that embeds the OpenCL logic; it is used as a
/// black box so that the framework can remain simple.
/// NOTE Currently, these methods are quite expensive; we will optimize them
/// later.
void elementwise_add(CLContext* context,
const float* in,
const DDim& in_dim,
const float* bias,
const DDim& bias_dim,
float* out,
const DDim& out_dim);
void pool(CLContext* context,
const std::string pooling_type,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int ksize_h,
const int ksize_w,
const float* in,
const DDim& in_dim,
float* out,
const DDim& out_dim);
} // namespace lite
} // namespace paddle
......@@ -41,9 +41,10 @@ TEST(cl_test, runtime_test) {
auto &context = runtime->context();
auto program = runtime->CreateProgram(
context,
runtime->cl_path() + "/cl_kernel/" + "image/elementwise_add_kernel.cl");
runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl");
auto event = runtime->CreateEvent(context);
CHECK(runtime->BuildProgram(program.get()));
const std::string build_option("-DCL_DTYPE_float");
CHECK(runtime->BuildProgram(program.get(), build_option));
}
TEST(cl_test, context_test) {
......@@ -51,9 +52,11 @@ TEST(cl_test, context_test) {
CHECK(runtime->IsInitSuccess());
runtime->set_cl_path(FLAGS_cl_path);
CLContext context;
context.AddKernel("pool_max", "image/pool_kernel.cl", "");
context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", "");
context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", "");
context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float");
context.AddKernel(
"elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
context.AddKernel(
"elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
}
TEST(cl_test, kernel_test) {
......@@ -61,9 +64,11 @@ TEST(cl_test, kernel_test) {
CHECK(runtime->IsInitSuccess());
runtime->set_cl_path(FLAGS_cl_path);
std::unique_ptr<CLContext> context(new CLContext);
context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
context->AddKernel("pool_max", "image/pool_kernel.cl");
context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
context->AddKernel(
"elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
context->AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float");
context->AddKernel(
"elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float");
auto kernel = context->GetKernel(2);
std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
......@@ -115,203 +120,12 @@ TEST(cl_test, kernel_test) {
LOG(INFO) << out_image;
}
TEST(cl_test, channel_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});
std::unique_ptr<float[]> bias_data(new float[16]);
for (int i = 0; i < 16; i++) {
bias_data[i] = dist(engine);
}
std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
float b = bias_data[j];
for (int k = 0; k < 256 * 512; k++) {
int index = (i * 16 + j) * 256 * 512 + k;
out_ref[index] = in_data[index] + b;
}
}
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLRuntime(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL runtime.";
std::unique_ptr<CLContext> context(new CLContext);
context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
context->AddKernel("channel_add", "image/channel_add_kernel.cl");
elementwise_add(context.get(),
in_data.get(),
in_dim,
bias_data.get(),
bias_dim,
out.get(),
out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " ";
}
std::cout << std::endl;
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
}
TEST(cl_test, elementwise_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> bias_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
bias_data[i] = dist(engine);
}
std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
out_ref[i] = in_data[i] + bias_data[i];
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLRuntime(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL runtime.";
std::unique_ptr<CLContext> context(new CLContext);
context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");
context->AddKernel("channel_add", "image/channel_add_kernel.cl");
elementwise_add(context.get(),
in_data.get(),
in_dim,
bias_data.get(),
bias_dim,
out.get(),
out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " ";
}
std::cout << std::endl;
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
}
void pool_avg(const int padding_height,
const int padding_width,
const int stride_height,
const int stride_width,
const int ksize_height,
const int ksize_width,
const float *input_data,
const DDim &in_dim,
float *output_data,
const DDim &out_dim) {
const int batch_size = in_dim[0];
const int input_height = in_dim[2];
const int input_width = in_dim[3];
const int output_channels = out_dim[1];
const int output_height = out_dim[2];
const int output_width = out_dim[3];
const size_t input_spatial_size = input_height * input_width;
const size_t output_spatial_size = output_height * output_width;
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
int channel = i * output_channels + c;
const float *input_ptr = input_data + channel * input_spatial_size;
float *output_ptr = output_data + channel * output_spatial_size;
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
float val = 0.f;
int count = 0;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += input_ptr[h * input_width + w];
++count;
}
}
output_ptr[ph * output_width + pw] =
(count > 0) ? val * (1.f / count) : 0.f;
}
}
}
}
}
TEST(cl_test, pool_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 1024, 7, 7});
std::unique_ptr<float[]> in_data(new float[4 * 1024 * 7 * 7]);
for (int i = 0; i < 4 * 1024 * 7 * 7; i++) {
in_data[i] = dist(engine);
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 1024, 1, 1});
std::unique_ptr<float[]> out(new float[4 * 1024 * 1 * 1]);
std::unique_ptr<float[]> out_ref(new float[4 * 1024 * 1 * 1]);
bool status = InitOpenCLRuntime(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL runtime.";
std::unique_ptr<CLContext> context(new CLContext);
context->AddKernel("pool_max", "image/pool_kernel.cl");
context->AddKernel("pool_avg", "image/pool_kernel.cl");
pool(context.get(),
"avg",
0,
0,
1,
1,
7,
7,
in_data.get(),
in_dim,
out.get(),
out_dim);
pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim);
for (int i = 0; i < 4 * 1024 * 1 * 1; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
}
TEST(cl_test, target_wrapper_buffer_test) {
bool inited = InitOpenCLRuntime(FLAGS_cl_path);
CHECK(inited) << "Fail to initialize OpenCL runtime.";
std::unique_ptr<CLContext> context(new CLContext);
std::string kernel_name = "elementwise_add";
std::string build_options = "-DCL_DTYPE=float";
std::string build_options = "-DCL_DTYPE_float";
context->AddKernel(
kernel_name, "buffer/elementwise_add_kernel.cl", build_options);
std::vector<float> h_a;
......@@ -396,10 +210,13 @@ TEST(cl_test, target_wrapper_buffer_test) {
TEST(cl_test, target_wrapper_image_test) {
const size_t cl_image2d_width = 28;
const size_t cl_image2d_height = 32;
const size_t cl_image2d_elem_size =
cl_image2d_width * cl_image2d_height * 4; // 4 for RGBA channels
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
auto *d_image = static_cast<cl::Image2D *>(
TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
// Map/Unmap test
auto *h_image =
static_cast<float *>(TargetWrapperCL::MapImage(d_image,
......@@ -407,15 +224,11 @@ TEST(cl_test, target_wrapper_image_test) {
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch));
CHECK_EQ(
cl_image2d_row_pitch,
cl_image2d_width * 4 *
4); // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes)
CHECK_EQ(cl_image2d_slice_pitch, 0); // slice_pitch = 0
CHECK_EQ(cl_image2d_slice_pitch, 0);
LOG(INFO) << "cl_image2d_row_pitch = " << cl_image2d_row_pitch
<< ", cl_image2d_slice_pitch " << cl_image2d_slice_pitch;
for (int i = 0; i < 10; i++) {
for (int i = 0; i < cl_image2d_elem_size; i++) {
h_image[i] = 3.14f * i;
}
TargetWrapperCL::Unmap(d_image, h_image);
......@@ -426,15 +239,14 @@ TEST(cl_test, target_wrapper_image_test) {
cl_image2d_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch));
for (int i = 0; i < 10; i++) {
for (int i = 0; i < cl_image2d_elem_size; i++) {
EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6);
}
TargetWrapperCL::Unmap(d_image, h_ptr);
// Imagecpy test
std::vector<float> h_image_cpy(cl_image2d_width * 4 *
cl_image2d_height); // 4 for RGBA channels
for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
std::vector<float> h_image_cpy(cl_image2d_elem_size);
for (int i = 0; i < cl_image2d_elem_size; i++) {
h_image_cpy[i] = 3.14f;
}
TargetWrapperCL::ImgcpySync(d_image,
......@@ -446,6 +258,8 @@ TEST(cl_test, target_wrapper_image_test) {
IoDirection::HtoD);
auto *d_image_cpy = static_cast<cl::Image2D *>(
TargetWrapperCL::MallocImage<float>(cl_image2d_width, cl_image2d_height));
// device to device
TargetWrapperCL::ImgcpySync(d_image_cpy,
d_image,
cl_image2d_width,
......@@ -454,6 +268,8 @@ TEST(cl_test, target_wrapper_image_test) {
cl_image2d_slice_pitch,
IoDirection::DtoD);
std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0);
// host to device
TargetWrapperCL::ImgcpySync(h_image_cpy.data(),
d_image_cpy,
cl_image2d_width,
......@@ -461,7 +277,7 @@ TEST(cl_test, target_wrapper_image_test) {
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) {
for (int i = 0; i < cl_image2d_elem_size; i++) {
EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6);
}
......
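A quick sanity check on the constants used in target_wrapper_image_test above (not part of the diff; it assumes MallocImage<float> allocates a CL_RGBA/CL_FLOAT image, which is what the 4-floats-per-pixel math in the test implies):

```cpp
#include <cassert>
#include <cstddef>

int main() {
  const size_t width = 28, height = 32;
  // 4 floats (RGBA) per pixel.
  const size_t elem_size = width * height * 4;               // 3584 mapped floats
  const size_t row_pitch_bytes = width * 4 * sizeof(float);  // 28 * 4 * 4 = 448
  assert(elem_size == 3584);
  assert(row_pitch_bytes == 448);  // matches the removed CHECK_EQ comment above
  return 0;
}
```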
#include <cl_common.h>
__kernel void conv_1x1(
__kernel void conv2d_1x1(
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
......@@ -200,4 +200,4 @@ __kernel void conv_1x1(
if (out_w3 < old_w) {
write_imagef(output_image, output_pos3, output3);
}
}
\ No newline at end of file
}
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
......
......@@ -103,6 +103,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
std::string build_option = options + " -cl-fast-relaxed-math -I " +
CLRuntime::Global()->cl_path() + "/cl_kernel";
VLOG(4) << "OpenCL build_option: " << build_option;
status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERROR(status_);
......
......@@ -40,9 +40,9 @@ lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu
#lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
# DEPS relu_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
DEPS relu_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter
......
......@@ -26,13 +26,13 @@ namespace kernels {
namespace opencl {
#define USE_BUFFER_FOR_CONV1x1_BIAS
class Conv2d1x1Image2DCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
void PrepareForRun() override {
LOG(INFO) << "PrepareForRun ...";
const auto& param = *param_.get_mutable<param_t>();
if (param.fuse_relu) {
build_options_ += " -DRELU";
......@@ -46,43 +46,38 @@ class Conv2d1x1Image2DCompute
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/conv_1x1_kernel.cl", build_options_);
LOG(INFO) << "PrepareForRun Ready";
kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
}
void Run() override {
LOG(INFO) << "opencl conv 1x1 run begin ...";
LOG(INFO) << "param_.get_mutable<param_t> ...";
const auto& param = *param_.get_mutable<param_t>();
LOG(INFO) << "get param dims ...";
auto input_dims = param.x->dims();
CHECK_GE(input_dims.size(), 4);
LOG(INFO) << "input_dims: " << input_dims;
int input_width = input_dims[3];
int input_height = input_dims[2];
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = param.filter->data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
LOG(INFO) << "filter_dims: " << filter_dims;
auto output_dims = param.output->dims();
LOG(INFO) << "output_dims: " << output_dims;
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
// mutable output image
auto out_image_shape = InitImageDimInfoWith(output_dims);
LOG(INFO) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// gen default_work_size
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
......@@ -93,28 +88,38 @@ class Conv2d1x1Image2DCompute
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
LOG(INFO) << "default work size: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
auto paddings = *param.paddings;
LOG(INFO) << "paddings: " << paddings[0] << "," << paddings[1];
auto strides = param.strides;
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = param.filter->data<float, cl::Image2D>();
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
// handle bias: use buffer for channel-wise bias, use image for element-wise bias
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
LOG(INFO) << "has bias: " << has_bias;
LOG(INFO) << "is_element_wise_bias : " << is_element_wise_bias;
LOG(INFO) << "get kernel ...";
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
......@@ -126,48 +131,37 @@ class Conv2d1x1Image2DCompute
bias_image = param.bias->data<float, cl::Image2D>();
#endif
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
LOG(INFO) << "kernel_key: " << kernel_key.str();
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
LOG(INFO) << "kernel ready ... " << kernel_key.str();
cl_int status;
auto numel = output_dims.production();
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
int maped_w = maptofactor(w, 4);
LOG(INFO) << "maped_w: " << maped_w;
status = kernel.setArg(++arg_idx, maped_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
LOG(INFO) << "nh: " << nh;
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
LOG(INFO) << "input_image: ";
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
LOG(INFO) << "filter_image: ";
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef USE_BUFFER_FOR_CONV1x1_BIAS
if (is_element_wise_bias != 0) {
LOG(INFO) << "set bias_image: ";
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
} else {
LOG(INFO) << "set bias_buf: ";
VLOG(4) << "set bias_buf: ";
status = kernel.setArg(++arg_idx, *bias_buf);
}
#else
......@@ -175,77 +169,38 @@ class Conv2d1x1Image2DCompute
#endif
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
LOG(INFO) << "out_image: ";
CL_CHECK_FATAL(status);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
status = kernel.setArg(++arg_idx, strides[0]);
LOG(INFO) << "strides: " << strides[0] << "," << strides[1];
CL_CHECK_FATAL(status);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
LOG(INFO) << "offset: " << offset;
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
LOG(INFO) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
int input_c_block = input_image_shape["width"] / input_dims[3];
LOG(INFO) << "input_c_block: " << input_c_block;
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
int input_c = input_dims[1];
LOG(INFO) << "input_c: " << input_c;
status = kernel.setArg(++arg_idx, input_c);
CL_CHECK_FATAL(status);
auto dilations = *param.dilations;
LOG(INFO) << "dilations.size : " << dilations.size();
LOG(INFO) << "dilations: " << dilations[0] << ", " << dilations[1];
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
// calc global_work_size
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
LOG(INFO) << "global_work_size :" << global_work_size;
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......@@ -259,8 +214,8 @@ class Conv2d1x1Image2DCompute
}
private:
std::string kernel_func_name_{"conv_1x1"};
std::string build_options_{"-DCL_DTYPE=float "};
std::string kernel_func_name_{"conv2d_1x1"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......@@ -272,7 +227,7 @@ class Conv2d1x1Image2DCompute
REGISTER_LITE_KERNEL(conv2d_1x1,
kOpenCL,
kFloat,
kNHWC,
kImageDefault,
paddle::lite::kernels::opencl::Conv2d1x1Image2DCompute,
image2d)
.BindInput("Input",
......
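A note on the work-size math in Conv2d1x1Image2DCompute above (illustrative, not from the diff): `maped_w = maptofactor(w, 4)` is consistent with each work-item writing four output columns, with the `out_w3 < old_w` guard shown earlier in conv2d_1x1_kernel.cl handling the tail. A sketch under the assumption that maptofactor is a ceiling division:

```cpp
#include <cassert>

// Assumption: maptofactor(v, factor) rounds v up to whole groups of `factor`.
inline int maptofactor(int v, int factor) { return (v + factor - 1) / factor; }

int main() {
  const int w = 10;                      // work-size W dimension
  const int maped_w = maptofactor(w, 4);
  assert(maped_w == 3);                  // 3 work-items cover columns 0..11; the
                                         // kernel skips writes past column w-1
  return 0;
}
```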
......@@ -128,8 +128,10 @@ TEST(conv2d_1x1, compute) {
const int ow = iw;
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
"conv2d_1x1", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC));
auto kernels = KernelRegistry::Global().Create("conv2d_1x1",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
......@@ -373,4 +375,4 @@ TEST(conv2d_1x1, compute) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kNHWC, image2d);
USE_LITE_KERNEL(conv2d_1x1, kOpenCL, kFloat, kImageDefault, image2d);
......@@ -41,7 +41,7 @@ class ElementwiseAddCompute
size_t num_{1};
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE=float"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -28,8 +28,11 @@ namespace lite {
namespace kernels {
namespace opencl {
class LayoutComputeBufferChwToImage2DHwc
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC)> {
// [NCHW] -> [ImageDefault]
class LayoutComputeBufferChwToImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::LayoutParam;
......@@ -117,7 +120,8 @@ class LayoutComputeBufferChwToImage2DHwc
}
std::string doc() const override {
return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(RGBA)";
return "Trans Layout from cl::Buffer(NCHW) to "
"cl::Image2D(ImageDefault/RGBA)";
}
private:
......@@ -126,11 +130,9 @@ class LayoutComputeBufferChwToImage2DHwc
std::shared_ptr<cl::Event> event_{new cl::Event};
};
// buffer chw 2 image2d nw
class LayoutComputeBufferChwToImage2DNw
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageNW)> {
// [ImageDefault] -> [NCHW]
class LayoutComputeImageDefaultToBufferChw
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::LayoutParam;
......@@ -142,31 +144,29 @@ class LayoutComputeBufferChwToImage2DNw
void Run() override {
auto& param = Param<param_t>();
auto* x_data = param.x->data<float, cl::Buffer>();
auto x_dims = param.x->dims();
CHECK(x_dims.size() == 4) << " Tensor dim is not 4.";
size_t image_width = x_dims[3] * ((x_dims[0] + 3) / 4);
size_t image_height = x_dims[1] * x_dims[2];
auto* y_data =
param.y->mutable_data<float, cl::Image2D>(image_width, image_height);
auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto y_dims = param.y->dims();
auto* x_data = param.x->data<float, cl::Image2D>();
auto x_dims = param.x->dims();
// out info
std::vector<size_t> new_dims = {1, 1, 1, 1};
for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
for (int j = 0; j < x_dims.size(); ++j) {
new_dims[4 - x_dims.size() + j] = x_dims[j];
}
const int out_N = new_dims[0];
const int out_C = new_dims[1];
const int out_H = new_dims[2];
const int out_W = new_dims[3];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
<< new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
size_t C = new_dims[1];
size_t in_height = new_dims[2];
size_t in_width = new_dims[3];
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
......@@ -177,27 +177,26 @@ class LayoutComputeBufferChwToImage2DNw
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_data);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_data);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
status = kernel.setArg(++arg_idx, *y_data);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_N));
status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
status = kernel.setArg(++arg_idx, static_cast<const int>(C));
CL_CHECK_FATAL(status);
VLOG(4) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " "
<< (out_C * out_H);
VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
<< " " << (new_dims[0] * new_dims[2]);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((out_N + 3) / 4), // N blocks
static_cast<cl::size_type>(out_W), // w
static_cast<cl::size_type>(out_C * out_H)}; // ch
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
static_cast<cl::size_type>(new_dims[3]),
static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......@@ -209,21 +208,24 @@ class LayoutComputeBufferChwToImage2DNw
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
// auto image_shape = InitImageDimInfoWith(x_dims);
}
std::string doc() const override {
return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(CLNW)";
return "Trans Layout from cl::Image2D(ImageDefault/RGBA) to "
"cl::Buffer(NCHW)";
}
private:
std::string kernel_func_name_{"buffer_to_image2d_nw"};
std::string build_options_{"-DCL_DTYPE_float "};
std::string kernel_func_name_{"image2d_to_buffer"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class LayoutComputeImage2DHwcToBufferChw
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> {
// [NCHW] -> [ImageNW]
class LayoutComputeBufferChwToImage2DNw
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageNW)> {
public:
using param_t = operators::LayoutParam;
......@@ -235,29 +237,31 @@ class LayoutComputeImage2DHwcToBufferChw
void Run() override {
auto& param = Param<param_t>();
auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto y_dims = param.y->dims();
auto* x_data = param.x->data<float, cl::Image2D>();
auto* x_data = param.x->data<float, cl::Buffer>();
auto x_dims = param.x->dims();
CHECK(x_dims.size() == 4) << " Tensor dim is not 4.";
size_t image_width = x_dims[3] * ((x_dims[0] + 3) / 4);
size_t image_height = x_dims[1] * x_dims[2];
auto* y_data =
param.y->mutable_data<float, cl::Image2D>(image_width, image_height);
auto y_dims = param.y->dims();
// out info
std::vector<size_t> new_dims = {1, 1, 1, 1};
for (int j = 0; j < x_dims.size(); ++j) {
new_dims[4 - x_dims.size() + j] = x_dims[j];
for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
}
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "new_dims[" << new_dims.size() << "D]:" << new_dims[0] << " "
<< new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
const int out_N = new_dims[0];
const int out_C = new_dims[1];
const int out_H = new_dims[2];
const int out_W = new_dims[3];
size_t C = new_dims[1];
size_t in_height = new_dims[2];
size_t in_width = new_dims[3];
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
......@@ -268,26 +272,27 @@ class LayoutComputeImage2DHwcToBufferChw
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_data);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
status = kernel.setArg(++arg_idx, *y_data);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_data);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
status = kernel.setArg(++arg_idx, static_cast<const int>(out_N));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(C));
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
CL_CHECK_FATAL(status);
VLOG(4) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
<< " " << (new_dims[0] * new_dims[2]);
VLOG(4) << "gws:[3D]" << ((out_N + 3) / 4) << " " << out_W << " "
<< (out_C * out_H);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
static_cast<cl::size_type>(new_dims[3]),
static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
cl::NDRange{static_cast<cl::size_type>((out_N + 3) / 4), // N blocks
static_cast<cl::size_type>(out_W), // w
static_cast<cl::size_type>(out_C * out_H)}; // ch
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......@@ -299,15 +304,16 @@ class LayoutComputeImage2DHwcToBufferChw
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(y_data, event_);
context.cl_context()->GetCommandQueue().finish();
// auto image_shape = InitImageDimInfoWith(x_dims);
}
std::string doc() const override {
return "Trans Layout from cl::Image2D(RGBA) to cl::Buffer(NCHW)";
return "Trans Layout from cl::Buffer(NCHW) to cl::Image2D(ImageDW/CLNW)";
}
private:
std::string kernel_func_name_{"image2d_to_buffer"};
std::string build_options_{"-DCL_DTYPE_float"};
std::string kernel_func_name_{"buffer_to_image2d_nw"};
std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......@@ -316,15 +322,14 @@ class LayoutComputeImage2DHwcToBufferChw
} // namespace lite
} // namespace paddle
// BufferChwToImage2DHwc
// [chw] -> [hwc]
// [NCHW] -> [ImageDefault]
REGISTER_LITE_KERNEL(
layout,
kOpenCL,
kAny,
kNHWC,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
buffer_chw_to_image2d_hwc_opencl_fp32)
kImageDefault,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
NCHW_to_ImageDefault)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
......@@ -332,17 +337,16 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
DATALAYOUT(kImageDefault))})
.Finalize();
// [chw] -> [hwc]
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kAny,
kNHWC,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DHwc,
buffer_chw_to_image2d_hwc_opencl_fp32)
kImageDefault,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
NCHW_to_ImageDefault)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
......@@ -350,54 +354,52 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
DATALAYOUT(kImageDefault))})
.Finalize();
// Image2DHwcBufferChw
// [hwc] -> [chw]
// [ImageDefault] -> [NCHW]
REGISTER_LITE_KERNEL(
layout,
kOpenCL,
kAny,
kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
image2d_hwc_to_buffer_chw_opencl_fp32)
paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
ImageDefault_to_NCHW)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();
// [hwc] -> [chw]
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kAny,
kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImage2DHwcToBufferChw,
image2d_hwc_to_buffer_chw_opencl_fp32)
paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
ImageDefault_to_NCHW)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNHWC))})
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();
// [hwc] -> [chw]
// [NCHW] -> [ImageNW]
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kFloat,
kImageNW,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DNw,
buffer_chw_to_image2d_nw_opencl_fp32)
NCHW_to_ImageNW)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
......
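For readers following the layout kernels above, here is a CPU reference of the NCHW buffer → ImageDefault image packing (not part of the diff and not the actual buffer_to_image2d kernel; it assumes the width = W*ceil(C/4), height = N*H convention with four consecutive channels packed into one RGBA pixel and zero padding when C is not a multiple of 4):

```cpp
#include <cstddef>
#include <vector>

// Pack an NCHW float tensor into an "ImageDefault" RGBA float image laid out as
// width = W * ceil(C/4), height = N * H, 4 floats per pixel (one channel group).
std::vector<float> NCHWToImageDefault(const std::vector<float>& nchw,
                                      size_t N, size_t C, size_t H, size_t W) {
  const size_t c_blocks = (C + 3) / 4;
  const size_t img_w = W * c_blocks;
  const size_t img_h = N * H;
  std::vector<float> image(img_w * img_h * 4, 0.f);  // zero-pad tail channels
  for (size_t n = 0; n < N; ++n) {
    for (size_t c = 0; c < C; ++c) {
      for (size_t h = 0; h < H; ++h) {
        for (size_t w = 0; w < W; ++w) {
          const size_t x = (c / 4) * W + w;  // channel block column
          const size_t y = n * H + h;        // batch row
          const size_t chan = c % 4;         // R/G/B/A slot inside the pixel
          image[(y * img_w + x) * 4 + chan] =
              nchw[((n * C + c) * H + h) * W + w];
        }
      }
    }
  }
  return image;
}

int main() {
  // 1x2x2x2 tensor -> image of width 2 (one channel block) x height 2.
  std::vector<float> t = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> img = NCHWToImageDefault(t, 1, 2, 2, 2);
  // pixel (0,0) holds {c0, c1, pad, pad} = {1, 5, 0, 0}
  return img.size() == 2 * 2 * 4 ? 0 : 1;
}
```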
......@@ -24,7 +24,7 @@ namespace lite {
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(layout, compute) {
TEST(layout_ImageDefault, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> layout(img2buf) "
"-> device";
......@@ -43,8 +43,11 @@ TEST(layout, compute) {
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNHWC));
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
ASSERT_FALSE(buf_to_img_kernels.empty());
......@@ -144,7 +147,8 @@ TEST(layout, compute) {
// nothing to do.
#endif
}
TEST(layout, compute_buffer2image2dnw) {
TEST(layout_ImageNW, compute) {
#ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 21) {
for (auto c : {1, 3}) {
......@@ -282,12 +286,6 @@ TEST(layout, compute_buffer2image2dnw) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(
layout, kOpenCL, kAny, kNHWC, buffer_chw_to_image2d_hwc_opencl_fp32);
USE_LITE_KERNEL(
layout, kOpenCL, kAny, kNCHW, image2d_hwc_to_buffer_chw_opencl_fp32);
USE_LITE_KERNEL(layout_once,
kOpenCL,
kFloat,
kImageNW,
buffer_chw_to_image2d_nw_opencl_fp32);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(layout_once, kOpenCL, kFloat, kImageNW, NCHW_to_ImageNW);
......@@ -29,6 +29,7 @@ class ReluCompute
public:
using param_t = operators::ActivationParam;
std::string doc() const override { return "Relu using cl::Buffer"; }
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
......@@ -72,15 +73,21 @@ class ReluCompute
private:
std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE=float -DRELU"};
std::string build_options_{"-DCL_DTYPE_float -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class ReluComputeFloatImage
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
class ReluComputeFloatImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ActivationParam;
std::string doc() const override {
return "Relu using cl::Image2D(ImageDefault/RGBA)";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
......@@ -154,18 +161,19 @@ class ReluComputeFloatImage
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(relu,
kOpenCL,
kFloat,
kNHWC,
paddle::lite::kernels::opencl::ReluComputeFloatImage,
image2d)
REGISTER_LITE_KERNEL(
relu,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ReluComputeFloatImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
DATALAYOUT(kImageDefault))})
.Finalize();
......@@ -17,6 +17,7 @@
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
......@@ -28,7 +29,8 @@ void relu_compute_ref(const dtype *x_data, const DDim &x_dim, dtype *out_data) {
}
}
TEST(opencl_relu, compute) {
#if 0 // relu_buffer
TEST(opencl_relu_buffer, compute) {
// prepare data
const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
lite::Tensor x, out;
......@@ -87,8 +89,171 @@ TEST(opencl_relu, compute) {
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x_data, mapped_x);
}
#endif // relu_buffer
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(relu_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto relu_img_kernels =
KernelRegistry::Global().Create("relu",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(relu_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto relu_img_kernel = std::move(relu_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> relu_in
// relu(img): relu_in -> relu_out
// layout(img->buf): relu_out -> y
lite::Tensor x, y, relu_in, relu_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &relu_in;
ImageToBufferParam.x = &relu_out;
ImageToBufferParam.y = &y;
operators::ActivationParam ReluParam;
ReluParam.X = &relu_in;
ReluParam.Out = &relu_out;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y.Resize(x_dim);
relu_in.Resize(x_dim);
relu_out.Resize(x_dim);
y_ref.Resize(x_dim);
auto relu_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_y[i] = static_cast<int>(0);
}
auto *relu_in_data = relu_in.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
relu_img_kernel->SetParam(ReluParam);
std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_img_context->As<OpenCLContext>()));
relu_img_kernel->SetContext(std::move(relu_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel";
relu_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
// relu buffer
// USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
// relu image2d
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kImageDefault, ImageDefault);