// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" namespace paddle { namespace lite { #define A(i, j) a[i * lda + j] #define B(i, j) cur_b[i * ldb + j] #define C(i, j) cur_c[i * ldc + j] template static void conv_basic(const Dtype1* din, Dtype2* dout, int num, int chout, int hout, int wout, int chin, int hin, int win, const Dtype1* weights, const Dtype2* bias, int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, int pad_w, int pad_h, bool flag_bias, std::string flag_relu) { Dtype2 beta = 0; auto src_data = din; auto dst_data_ref = dout; auto weights_data = weights; auto with_bias = flag_bias; auto bias_data = bias; int in_num = num; int out_channels = chout; int out_h = hout; int out_w = wout; int in_channel = chin; int in_h = hin; int in_w = win; int out_c_group = out_channels / group; int in_c_group = in_channel / group; for (int n = 0; n < in_num; ++n) { for (int g = 0; g < group; ++g) { for (int oc = 0; oc < out_c_group; ++oc) { for (int oh = 0; oh < out_h; ++oh) { for (int ow = 0; ow < out_w; ++ow) { int out_idx = n * group * out_c_group * out_h * out_w + g * out_c_group * out_h * out_w + oc * out_h * out_w + oh * out_w + ow; Dtype2 bias_d = with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0; dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta; for (int ic = 0; ic < in_c_group; ++ic) { for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { int iw = ow * stride_w - pad_w + kw * (dila_w); int ih = oh * stride_h - pad_h + kh * (dila_h); if (iw < 0 || iw >= in_w) continue; if (ih < 0 || ih >= in_h) continue; int iidx = n * in_channel * in_h * in_w + g * in_c_group * in_h * in_w + ic * in_h * in_w + ih * in_w + iw; int widx = g * out_c_group * in_c_group * kernel_h * kernel_w + oc * in_c_group * kernel_h * kernel_w + ic * kernel_h * kernel_w + kh * kernel_w + kw; dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx]; } } } if (flag_relu == "relu") { dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 ? dst_data_ref[out_idx] : (Dtype2)0; } else if (flag_relu == "relu6") { auto dst_tmp = (dst_data_ref[out_idx] > (Dtype2)0) ? dst_data_ref[out_idx] : (Dtype2)0; dst_data_ref[out_idx] = (dst_tmp < 6.f) ? dst_tmp : 6.f; } } } } } } } template void gemm_batch_bias(const int batch_size, const T* a, const int M, const int K, const T* b, const int K_, const int N, T* biases, T* c) { EXPECT_TRUE(K_ == K && M > 0 && N > 0 && K > 0); for (int bidx = 0; bidx < batch_size; ++bidx) { const T* cur_b = b + K * N * bidx; T* cur_c = c + M * N * bidx; EXPECT_TRUE(a && cur_b && cur_c); const int lda = K; const int ldb = N; const int ldc = N; for (int m = 0; m < M; ++m) { for (int n = 0; n < N; ++n) { C(m, n) = 0.0f; for (int k = 0; k < K; ++k) { C(m, n) += A(m, k) * B(k, n); } } } if (biases) { for (int m = 0; m < M; ++m) { for (int n = 0; n < N; ++n) { C(m, n) += biases[m]; } } } } } void PrintData(std::string name, float* a, const int rows, const int cols, const int batch_size = 1) { std::cout << "==== " << name << " ====" << std::endl; for (int b = 0; b < batch_size; ++b) { std::cout << "-- bidx = " << b << " --" << std::endl; for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { std::cout << " " << a[b * rows * cols + r * cols + c]; } std::cout << std::endl; } } } // #define PRINT_RESULT #define LOOP_TEST TEST(conv2d, compute_conv2d_1x1) { // conv2d 1x1 note // kernel/filter size = 1x1, group = 1, pad = 0, stride = 1, dilation = 1 // gemm implement // a: filter_d ==> <=> // b: x_d ==> <=> // c: output_d ==> <=> std::unique_ptr context(new KernelContext); context->As().InitOnce(); const int ksize = 1; const int stride = 1; const int pad = 0; const int group = 1; const int dilation = 1; int loop_cnt = 0; #ifdef LOOP_TEST for (int batch_size = 1; batch_size < 4; ++batch_size) { for (int oc = 1; oc < 10; oc += 1) { // m for (int ih = 1; ih < 9; ih += 1) { // ih /*int iw = ih;*/ for (int iw = 1; iw < 10; iw += 1) { // iw for (int ic = 1; ic < 10; ic += 1) { // k for (bool bias_flag : {true /*, false*/}) { for (std::string relu_flag : {"relu" /*, "relu6", "None"*/}) { #else // groups:1 stride_h:1 stride_w:1 pad_h:0 pad_w:0 kernel_h:1 kernel_h:1 // x_dims:1 32 112 112 // output_dims:1 64 112 112 // filter_dims:64 32 1 1 const bool bias_flag = true; const bool relu_flag = true; const int batch_size = 8; const int oc = 64; const int ih = 112; const int iw = 112; const int ic = 32; #endif const int oh = ih; const int ow = iw; const int m = oc; const int n = ih * iw; const int k = ic; LOG(INFO) << "loop_cnt=" << ++loop_cnt << " bs=" << batch_size << " m=oc=" << m << " n=ih*iw=" << n << " k=ic=" << ic << " relu_fuse=" << relu_flag << " bias_flag=" << bias_flag; auto kernels = KernelRegistry::Global().Create("conv2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); ASSERT_FALSE(kernels.empty()); auto kernel = std::move(kernels.front()); lite::Tensor x, filter, bias, out, out_ref; operators::ConvParam param; param.x = &x; param.filter = &filter; param.bias = bias_flag ? &bias : nullptr; param.output = &out; param.strides = {stride, stride}; std::vector paddings = {pad, pad, pad, pad}; param.groups = group; std::vector dilations = {dilation, dilation}; if (relu_flag == "relu") { param.fuse_relu = true; } else if (relu_flag == "None") { param.fuse_relu = false; } else if (relu_flag == "relu6") { param.activation_param.Relu_clipped_coef = 6.f; param.activation_param.has_active = true; param.activation_param.active_type = lite_api::ActivationType::kRelu6; } param.paddings = std::make_shared>(paddings); param.dilations = std::make_shared>(dilations); kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); context->As().CopySharedTo( &(conv_context->As())); kernel->SetContext(std::move(conv_context)); // a: filter_d ==> <=> // b: x_d ==> <=> // c: output_d ==> <=> const DDim x_dim = DDim(std::vector{batch_size, ic, ih, iw}); const DDim filter_dim = DDim(std::vector{oc, ic, ksize, ksize}); const DDim bias_dim = DDim(std::vector{oc}); const DDim out_dim = DDim(std::vector{batch_size, oc, oh, ow}); x.Resize(x_dim); filter.Resize(filter_dim); bias.Resize(bias_dim); out.Resize(out_dim); out_ref.Resize(out_dim); auto* x_data = x.mutable_data(TARGET(kOpenCL)); auto* filter_data = filter.mutable_data(TARGET(kOpenCL)); auto* bias_data = bias.mutable_data(TARGET(kOpenCL)); std::default_random_engine engine; std::uniform_real_distribution dist(-5, 5); auto* mapped_x = static_cast(TargetWrapperCL::Map( x_data, 0, sizeof(float) * x_dim.production())); for (int i = 0; i < x_dim.production(); ++i) { mapped_x[i] = static_cast(dist(engine)); } auto* mapped_filter = static_cast(TargetWrapperCL::Map( filter_data, 0, sizeof(float) * filter_dim.production())); for (int i = 0; i < filter_dim.production(); ++i) { mapped_filter[i] = static_cast(dist(engine)); } auto* mapped_bias = static_cast(TargetWrapperCL::Map( bias_data, 0, sizeof(float) * bias_dim.production())); float* bias_cpu_data = bias_flag ? reinterpret_cast(calloc( sizeof(float), bias_dim.production())) : nullptr; for (int i = 0; i < bias_dim.production(); ++i) { mapped_bias[i] = static_cast(dist(engine)); bias_cpu_data[i] = mapped_bias[i]; } // run opencl kernel kernel->Launch(); auto* wait_list = context->As().cl_wait_list(); auto* out_ptr = param.output->data(); auto it = wait_list->find(out_ptr); if (it != wait_list->end()) { VLOG(4) << "--- Find the sync event for the target cl " "tensor. ---"; auto& event = *(it->second); event.wait(); double start_nanos = event.getProfilingInfo(); double stop_nanos = event.getProfilingInfo(); double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; } else { LOG(FATAL) << "Could not find the sync event for the target " "cl tensor."; } // run cpu ref auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); conv_basic(mapped_x, out_ref_data, batch_size, oc, oh, ow, ic, ih, iw, mapped_filter, bias_cpu_data, // mapped_bias, group, ksize, ksize, stride, stride, dilation, dilation, pad, pad, bias_flag, relu_flag); auto* out_data = out.mutable_data(); auto* mapped_out = static_cast(TargetWrapperCL::Map( out_data, 0, sizeof(float) * out_dim.production())); #ifdef PRINT_RESULT // a: filter_d ==> <=> // b: x_d ==> <=> // c: output_d ==> <=> PrintData( "mapped_filter", static_cast(mapped_filter), m, k); PrintData("mapped_x", static_cast(mapped_x), k, n, batch_size); PrintData( "mapped_bias", static_cast(mapped_bias), m, 1); std::cout << "mapped_bias[0]:" << mapped_bias[0] << std::endl; PrintData("out_ref_data", static_cast(out_ref_data), m, n, batch_size); PrintData("mapped_out", static_cast(mapped_out), m, n, batch_size); #endif for (int i = 0; i < out_dim.production(); i++) { EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6); if (abs(mapped_out[i] - out_ref_data[i]) > 1e-6) { LOG(FATAL) << "error idx:" << i; } } free(bias_cpu_data); TargetWrapperCL::Unmap(x_data, mapped_x); TargetWrapperCL::Unmap(filter_data, mapped_filter); TargetWrapperCL::Unmap(bias_data, mapped_bias); TargetWrapperCL::Unmap(out_data, mapped_out); #ifdef LOOP_TEST } // relu } // bias } // ic } // iw } // ih } // oc } // batch_size #endif } #undef LOOP_TEST #undef PRINT_RESULT // #define PRINT_RESULT // #define LOOP_TEST TEST(conv2d, compute_conv2d_gemm) { std::unique_ptr context(new KernelContext); context->As().InitOnce(); // x_dims:1 3 224 224 // output_dims:1 32 112 112 // filter_dims:32 3 3 3 int ksize = 3; const int stride = 2; const int pad = 1; const int group = 1; const int dilation = 1; int loop_cnt = 0; #ifdef LOOP_TEST for (int batch_size = 1; batch_size < 3; ++batch_size) { for (int oc = 1; oc < 10; oc += 1) { // m for (int ih = 1; ih < 10; ih += 1) { // ih for (int iw = 1; iw < 10; iw += 1) { // iw for (int ic = 1; ic < 10; ic += 1) { // k for (bool bias_flag : {true, false}) { for (std::string relu_flag : {"relu", "relu6", "None"}) { #else const int batch_size = 8; const int oc = 32; const int ih = 224; const int iw = 224; const int ic = 3; const bool bias_flag = true; const std::string relu_flag = "relu6"; // "relu", "relu6", "None" #endif const int oh = (ih + 2 * pad - ksize) / stride + 1; const int ow = (iw + 2 * pad - ksize) / stride + 1; // a: filter_d ==> <=> // b: x_d ==> <=> // c: output_d ==> <=> int m = oc; int k = ic * ksize * ksize; int n = oc; LOG(INFO) << "bs=" << batch_size << " oc=" << oc << " ic=" << ic << " ih=" << ih << " iw=" << iw << " oh=" << oh << " ow=" << ow << " bias_flag=" << bias_flag << " relu_flag=" << relu_flag; LOG(INFO) << "m=oc=" << oc << " k=ic*ksize*ksize=" << ic * ksize * ksize << " n=oc=" << oc; auto kernels = KernelRegistry::Global().Create("conv2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); ASSERT_FALSE(kernels.empty()); auto kernel = std::move(kernels.front()); lite::Tensor x, filter, bias, out, out_ref; operators::ConvParam param; param.x = &x; param.filter = &filter; param.bias = bias_flag ? &bias : nullptr; param.output = &out; param.strides = {stride, stride}; std::vector paddings = {pad, pad, pad, pad}; param.groups = group; std::vector dilations = {dilation, dilation}; if (relu_flag == "relu") { param.fuse_relu = true; } else if (relu_flag == "None") { param.fuse_relu = false; } else if (relu_flag == "relu6") { param.activation_param.Relu_clipped_coef = 6.f; param.activation_param.has_active = true; param.activation_param.active_type = lite_api::ActivationType::kRelu6; } param.paddings = std::make_shared>(paddings); param.dilations = std::make_shared>(dilations); kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); context->As().CopySharedTo( &(conv_context->As())); kernel->SetContext(std::move(conv_context)); const DDim x_dim = DDim(std::vector{batch_size, ic, ih, iw}); const DDim filter_dim = DDim(std::vector{oc, ic, ksize, ksize}); const DDim bias_dim = DDim(std::vector{oc}); const DDim out_dim = DDim(std::vector{batch_size, oc, oh, ow}); x.Resize(x_dim); filter.Resize(filter_dim); bias.Resize(bias_dim); out.Resize(out_dim); out_ref.Resize(out_dim); auto* x_data = x.mutable_data(TARGET(kOpenCL)); auto* filter_data = filter.mutable_data(TARGET(kOpenCL)); auto* bias_data = bias.mutable_data(TARGET(kOpenCL)); std::default_random_engine engine; std::uniform_real_distribution dist(-5, 5); auto* mapped_x = static_cast(TargetWrapperCL::Map( x_data, 0, sizeof(float) * x_dim.production())); for (int i = 0; i < x_dim.production(); ++i) { mapped_x[i] = static_cast(dist(engine)); } auto* mapped_filter = static_cast(TargetWrapperCL::Map( filter_data, 0, sizeof(float) * filter_dim.production())); for (int i = 0; i < filter_dim.production(); ++i) { mapped_filter[i] = static_cast(dist(engine)); } auto* mapped_bias = static_cast(TargetWrapperCL::Map( bias_data, 0, sizeof(float) * bias_dim.production())); for (int i = 0; i < bias_dim.production(); ++i) { mapped_bias[i] = static_cast(dist(engine)); } // run opencl kernel kernel->Launch(); auto* wait_list = context->As().cl_wait_list(); auto* out_ptr = param.output->data(); auto it = wait_list->find(out_ptr); if (it != wait_list->end()) { VLOG(4) << "--- Find the sync event for the target cl " "tensor. ---"; auto& event = *(it->second); event.wait(); double start_nanos = event.getProfilingInfo(); double stop_nanos = event.getProfilingInfo(); double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; } else { LOG(FATAL) << "Could not find the sync event for the target " "cl tensor."; } // run cpu ref auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); conv_basic(mapped_x, out_ref_data, batch_size, oc, oh, ow, ic, ih, iw, mapped_filter, mapped_bias, group, ksize, ksize, stride, stride, dilation, dilation, pad, pad, bias_flag, relu_flag); auto* out_data = out.mutable_data(); auto* mapped_out = static_cast(TargetWrapperCL::Map( out_data, 0, sizeof(float) * out_dim.production())); #ifdef PRINT_RESULT PrintData( "mapped_filter", static_cast(mapped_filter), k, n); PrintData("mapped_x", static_cast(mapped_x), batch_size * m, k); PrintData( "mapped_bias", static_cast(mapped_bias), 1, n); PrintData("out_ref_data", static_cast(out_ref_data), batch_size * m, n); PrintData("mapped_out", static_cast(mapped_out), batch_size * m, n); #endif for (int i = 0; i < out_dim.production(); i++) { EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6); if (abs(mapped_out[i] - out_ref_data[i]) > 1e-6) { LOG(FATAL) << "error output idx:" << i; } } TargetWrapperCL::Unmap(x_data, mapped_x); TargetWrapperCL::Unmap(filter_data, mapped_filter); TargetWrapperCL::Unmap(bias_data, mapped_bias); TargetWrapperCL::Unmap(out_data, mapped_out); #ifdef LOOP_TEST } // with_relu } // with_bias } // ic } // iw } // ih } // oc } // batch_size #endif } } // namespace lite } // namespace paddle USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def);