add depthwise_conv2d&relu6 gpu op (#1719)

a0655313 · zp7 · Jiaying Zhao · f4aabbdb · a0655313 · a0655313
7 changed files
--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -142,7 +142,7 @@ LOAD_OP2(softmax, CPU, GPU_CL);
 LOAD_OP1(shape, CPU);
 #endif
 #ifdef DEPTHWISECONV_OP
-LOAD_OP1(depthwise_conv2d, CPU);
+LOAD_OP2(depthwise_conv2d, CPU, GPU_CL);
 #endif
 #ifdef CONV_TRANSPOSE_OP
 LOAD_OP1(conv2d_transpose, CPU);
@@ -200,7 +200,7 @@ LOAD_OP1(norm, CPU);
 #endif
 #ifdef RELU_OP
 LOAD_OP2(relu, CPU, GPU_CL);
-LOAD_OP1(relu6, CPU);
+LOAD_OP2(relu6, CPU, GPU_CL);
 #endif
 #ifdef IM2SEQUENCE_OP
 LOAD_OP1(im2sequence, CPU);

--- a/src/operators/activation_op.cpp
+++ b/src/operators/activation_op.cpp
@@ -63,6 +63,7 @@ REGISTER_OPERATOR_FPGA(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(relu, ops::ReluOp);
+REGISTER_OPERATOR_CL(relu6, ops::Relu6Op);
 #endif
 #endif  // RELU_OP

--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
@@ -56,5 +56,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(depthwise_conv2d, ops::DepthwiseConvOp);
+#endif
 #endif
--- a/src/operators/kernel/cl/cl_kernel/relu6.cl
+++ b/src/operators/kernel/cl/cl_kernel/relu6.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+// Element-wise ReLU6 on a half-precision image: every component of each texel
+// is clamped to the range [0, 6]. Each work-item processes the texel at
+// (get_global_id(0), get_global_id(1)) and writes it to the same position in
+// `output`.
+__kernel void relu6(__read_only image2d_t input,
+                   __write_only image2d_t output){
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  // read_imageh is called with integer texel coordinates below, so the sampler
+  // must use unnormalized coordinates: with CLK_NORMALIZED_COORDS_TRUE the
+  // values returned for integer coordinates are undefined per the OpenCL C
+  // specification.
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  half4 in = read_imageh(input, sampler, (int2)(x, y));
+  // Lower bound (0) first, then upper bound (6).
+  in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in);
+  in = min((half4)(6.0f, 6.0f, 6.0f, 6.0f), in);
+  write_imageh(output, (int2)(x, y), in);
+}
--- a/src/operators/kernel/cl/relu6_kernel.cpp
+++ b/src/operators/kernel/cl/relu6_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RELU_OP
+#include "operators/kernel/activation_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Registers the "relu6" kernel from relu6.cl with this kernel's CL helper.
+// `param` is not used here. Always returns true.
+template <>
+bool Relu6Kernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
+  auto& helper = this->cl_helper_;
+  helper.AddKernel("relu6", "relu6.cl");
+  return true;
+}
+// Runs the relu6 OpenCL kernel: binds the input image as kernel argument 0 and
+// the output image as argument 1, then enqueues a 2-D NDRange whose global
+// size is the input's image width x image height.
+template <>
+void Relu6Kernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
+  auto& helper = this->cl_helper_;
+  // Index 0 is presumably the "relu6" kernel added in Init — the only
+  // AddKernel call visible for this class.
+  auto relu6_kernel = helper.KernelAt(0);
+  const auto* x = param.InputX();
+  auto* out = param.Out();
+  // NOTE(review): this value is never read below; the call is kept in case
+  // DefaultWorkSize validates the output tensor — confirm before removing.
+  auto default_work_size = helper.DefaultWorkSize(*out);
+  auto src_image = x->GetCLImage();
+  auto dst_image = out->GetCLImage();
+
+  // NOTE(review): the status codes returned by clSetKernelArg and
+  // clEnqueueNDRangeKernel are ignored here — consider checking them.
+  clSetKernelArg(relu6_kernel, 0, sizeof(cl_mem), &src_image);
+  clSetKernelArg(relu6_kernel, 1, sizeof(cl_mem), &dst_image);
+
+  // One work-item per texel of the input image; the local work size is left
+  // to the OpenCL implementation.
+  const size_t global_size[2] = {x->ImageWidth(), x->ImageHeight()};
+  clEnqueueNDRangeKernel(helper.CLCommandQueue(), relu6_kernel, 2, nullptr,
+                         global_size, nullptr, 0, nullptr, nullptr);
+}
+template class Relu6Kernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/test/net/test_net.cpp
+++ b/test/net/test_net.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <fstream>
 #include <iostream>
 #include <string>
-#include <fstream>
 #include "../test_helper.h"
 #include "../test_include.h"
@@ -44,8 +44,14 @@ void test(int argc, char *argv[]) {
  // out_file.write(out_data, len);
  // out_file.close();
+#ifdef PADDLE_MOBILE_CL
+  //  config.load_when_predict = true;
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
+  paddle_mobile.SetCLPath("/data/local/tmp/bin");
+#else
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
  paddle_mobile.SetThreadNum(1);
+#endif
  int dim_count = std::stoi(argv[arg_index]);
  arg_index++;

--- a/test/test_include.h
+++ b/test/test_include.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "./test_helper.h"
 #include "common/enforce.h"
-#include "common/util.h"
 #include "common/log.h"
+#include "common/util.h"
 #include "executor_for_test.h"
 #include "framework/ddim.h"
 #include "framework/lod_tensor.h"