Unverified commit 2d4e9c13, authored by xiebaiyuan, committed by GitHub

[LITE][OPENCL] conv2d_1x1_image, choose simple kernel in some cases (#2771)

* [LITE][OPENCL] conv2d_1x1_image: choose the simple kernel in some cases for OpenCL, test=develop

* [LITE][OPENCL] conv2d_1x1_image: add loop test, test=develop

Parent 0197977d
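Summary of the change: the conv2d_1x1 compute kernel now registers one of two OpenCL kernels. When the input channel count is a multiple of 4 (presumably because every 4-channel block of the image layout is then fully populated), the new conv2d_1x1_simple kernel is used; otherwise the original conv2d_1x1 kernel, which has to handle a partial last channel block, is kept. A minimal host-side sketch of that dispatch, paraphrasing the diff below (the standalone helper function is illustrative only; in the real code the check is done inline on param.x->dims()[1]):

    #include <cstdint>
    #include <string>

    // Sketch: pick the OpenCL kernel name based on the input channel count.
    // `input_channels` corresponds to param.x->dims()[1] in the compute class below.
    std::string PickConv1x1Kernel(int64_t input_channels) {
      if (input_channels % 4 == 0) {
        // Every 4-channel block of the image layout is full: use the simple kernel.
        return "conv2d_1x1_simple";
      }
      // Otherwise keep the general kernel, which masks the partial last channel block.
      return "conv2d_1x1";
    }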
@@ -214,3 +214,172 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
      WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
    }
}

__kernel void conv2d_1x1_simple(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
CL_DTYPE4 output0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#elif defined(BIASE_ELE)
CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#else
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
CL_DTYPE4 output2 = 0.0f;
CL_DTYPE4 output3 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
CL_DTYPE4 weight0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
CL_DTYPE4 weight1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
CL_DTYPE4 weight2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
CL_DTYPE4 weight3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
}
#ifdef BATCH_NORM
output0 = output0 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation_type4(output0);
output1 = activation_type4(output1);
output2 = activation_type4(output2);
output3 = activation_type4(output3);
#endif
if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}
if (out_w1 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
}
if (out_w2 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
}
if (out_w3 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
@@ -13,6 +13,7 @@
// limitations under the License.

#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

@@ -45,9 +46,15 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
      build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
    }
    auto& context = ctx_->As<OpenCLContext>();
+   if (param.x->dims()[1] % 4 == 0) {
+     context.cl_context()->AddKernel(kernel_func_name_simple_,
+                                     "image/conv2d_1x1_kernel.cl",
+                                     build_options_);
+   } else {
      context.cl_context()->AddKernel(
          kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
+   }
  }

  void Run() override {
    const auto& param = *param_.get_mutable<param_t>();

@@ -135,7 +142,11 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
+   if (input_dims[1] % 4 == 0) {
+     kernel_key << kernel_func_name_simple_ << build_options_;
+   } else {
      kernel_key << kernel_func_name_ << build_options_;
+   }
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int maped_w = maptofactor(w, 4);

@@ -215,6 +226,7 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
 private:
  std::string kernel_func_name_{"conv2d_1x1"};
+ std::string kernel_func_name_simple_{"conv2d_1x1_simple"};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
@@ -13,12 +13,14 @@
// limitations under the License.

#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
+#include "lite/utils/logging.h"

namespace paddle {
namespace lite {
@@ -106,7 +108,6 @@ static void conv_basic(const Dtype1* din,
      }
    }
  }

TEST(conv2d_1x1, compute) {
  // conv infos
  const int ksize = 1;
@@ -114,30 +115,40 @@ TEST(conv2d_1x1, compute) {
  const int pad = 0;
  const int group = 1;
  const int dilation = 0;
  // int loop_cnt = 0;

- const bool bias_flag = true;
- const bool relu_flag = true;
- const int batch_size = 8;
- const int oc = 64;
- const int ih = 28; /*int iw = ih;*/
- const int iw = 28;
- const int ic = 63;
+ #ifdef LOOP_TEST
+ for (int batch_size = 1; batch_size < 4; ++batch_size) {
+   for (int oc = 4; oc < 10; oc += 1) {        // oc
+     for (int ih = 4; ih < 9; ih += 1) {       // ih
+       for (int iw = 4; iw < 10; iw += 1) {    // iw
+         for (int ic = 4; ic < 10; ic += 1) {  // ic
+           for (bool bias_flag : {true, false}) {
+             for (bool relu_flag : {true, false}) {
+ #else
+ const int batch_size = 1;
+ const int oc = 4;
+ const int ih = 8;
+ const int iw = 8;
+ const int ic = 4;
+ const bool bias_flag = false;
+ const bool relu_flag = false;
+ #endif

  const int oh = ih;
  const int ow = iw;

- LOG(INFO) << "to get kernel ...";
- auto kernels = KernelRegistry::Global().Create("conv2d_1x1",
+ VLOG(4) << "to get kernel ...";
+ auto kernels =
+     KernelRegistry::Global().Create("conv2d_1x1",
                                      TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
- LOG(INFO) << "created conv2d_1x1 kernel";
- LOG(INFO) << "prepare kernel ------";
+ VLOG(4) << "created conv2d_1x1 kernel";
+ VLOG(4) << "prepare kernel ------";

  lite::Tensor input, filter, bias, output;
  operators::ConvParam param;
@@ -159,7 +170,8 @@ TEST(conv2d_1x1, compute) {
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

- std::unique_ptr<KernelContext> conv_1x1_context(new KernelContext);
+ std::unique_ptr<KernelContext> conv_1x1_context(
+     new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(conv_1x1_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(conv_1x1_context));
@@ -195,46 +207,18 @@ TEST(conv2d_1x1, compute) {
  size_t filter_image_width = ksize * ((oc + 3) / 4);
  size_t filter_image_height = ic * ksize;

- auto* input_data = input.mutable_data<float, cl::Image2D>(input_image_width,
-                                                           input_image_height);
- auto* filter_data = filter.mutable_data<float, cl::Image2D>(
-     filter_image_width, filter_image_height);
- bias.mutable_data<float, cl::Image2D>(bias_image_width, bias_image_height);
- auto* bias_data = bias.mutable_data<float, cl::Image2D>(bias_image_width,
-                                                         bias_image_height);
  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};

- LOG(INFO) << "map input ...";
- auto* mapped_input =
-     static_cast<float*>(TargetWrapperCL::MapImage(input_data,
-                                                   input_image_width,
-                                                   input_image_height,
-                                                   cl_image2d_row_pitch,
-                                                   cl_image2d_slice_pitch));
- LOG(INFO) << "map filter ...";
- auto* mapped_filter =
-     static_cast<float*>(TargetWrapperCL::MapImage(filter_data,
-                                                   filter_image_width,
-                                                   filter_image_height,
-                                                   cl_image2d_row_pitch,
-                                                   cl_image2d_slice_pitch));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> gen(-5, 5);

  std::vector<float> input_v(batch_size * ic * ih * iw);
  std::vector<float> filter_v(oc * ic * ksize * ksize);
  std::vector<float> output_v(batch_size * oc * ih * iw);
  std::vector<float> bias_v(oc);

- float* input_v_data = &input_v[0];
- float* filter_v_data = &filter_v[0];
- float* output_v_data = &output_v[0];
- float* bias_v_data = &bias_v[0];
- LOG(INFO) << "gen input and filter ...";
+ VLOG(4) << "gen input and filter ...";

  for (auto& i : input_v) {
    i = gen(engine);
@@ -243,89 +227,138 @@ TEST(conv2d_1x1, compute) {
    f = gen(engine);
  }

- LOG(INFO) << "after gen input and filter ...";
- LOG(INFO) << "input_v.size(): " << input_v.size();
- LOG(INFO) << "filter_v.size(): " << filter_v.size();
- LOG(INFO) << "output_v.size(): " << output_v.size();
- LOG(INFO) << "bias_v.size(): " << bias_v.size();
- LOG(INFO) << "input_dim.production(): " << input_dim.production();
- LOG(INFO) << "filter_dim.production(): " << filter_dim.production();
- LOG(INFO) << "out_dim.production(): " << out_dim.production();
- LOG(INFO) << "bias_dim.production(): " << bias_dim.production();
- LOG(INFO) << "4 * input_image_height * input_image_width: "
+ VLOG(4) << "after gen input and filter ...";
+ VLOG(4) << "input_v.size(): " << input_v.size();
+ VLOG(4) << "filter_v.size(): " << filter_v.size();
+ VLOG(4) << "output_v.size(): " << output_v.size();
+ VLOG(4) << "bias_v.size(): " << bias_v.size();
+ VLOG(4) << "input_dim.production(): " << input_dim.production();
+ VLOG(4) << "filter_dim.production(): "
+         << filter_dim.production();
+ VLOG(4) << "out_dim.production(): " << out_dim.production();
+ VLOG(4) << "bias_dim.production(): " << bias_dim.production();
+ VLOG(4) << "4 * input_image_height * input_image_width: "
          << 4 * input_image_height * input_image_width;
- LOG(INFO) << "4 * filter_image_width * filter_image_height: "
+ VLOG(4) << "4 * filter_image_width * filter_image_height: "
          << 4 * filter_image_width * filter_image_height;

  CHECK(input_dim.production() == input_v.size());
- CHECK_LE(input_dim.production(), 4 * input_image_height * input_image_width);
+ CHECK_LE(input_dim.production(),
+          4 * input_image_height * input_image_width);
  CHECK(filter_dim.production() == filter_v.size());
  CHECK_LE(filter_dim.production(),
           4 * filter_image_width * filter_image_height);

  paddle::lite::CLImageConverterDefault default_convertor;
- LOG(INFO) << "set mapped input ...";
+ VLOG(4) << "set mapped input ...";
- default_convertor.NCHWToImage(input_v_data, mapped_input, input_dim);
- LOG(INFO) << "set mapped filter ...";
+ std::vector<float> x_image_v(
+     input_image_width * input_image_height * 4);  // 4 : RGBA
+ std::vector<float> filter_image_v(
+     filter_image_width * filter_image_height * 4);  // 4 : RGBA
+ std::vector<float> bias_image_v(
+     bias_image_width * bias_image_height * 4);  // 4 : RGBA
+ std::vector<float> out_image_v(
+     out_image_width * out_image_height * 4);  // 4 : RGBA
+
+ default_convertor.NCHWToImage(
+     input_v.data(), x_image_v.data(), input_dim);
+ /* for (int j = 0; j < input_v.size(); j += 1) {
+      // VLOG(4) << "input_v input[" << j << "]: " << input_v.data()[j];
+      std::cout << j << " " << input_v.data()[j] << std::endl;
+    }
+    std::cout << std::endl;
+
+    for (int j = 0; j < x_image_v.size(); j += 1) {
+      // VLOG(4) << "x_image_v input[" << j << "]: " << x_image_v.data()[j];
+      std::cout << j << " " << x_image_v.data()[j] << std::endl;
+    } */
+
+ VLOG(4) << "set mapped filter ...";
  paddle::lite::CLImageConverterNWBlock nw_convertor;
- nw_convertor.NCHWToImage(filter_v_data, mapped_filter, filter_dim);
-
- LOG(INFO) << "resize output ...";
- output.Resize(out_dim);
-
- // cpu conv basic calc
- lite::Tensor out_ref;
- out_ref.Resize(out_dim);
-
- float* mapped_bias = nullptr;
+ nw_convertor.NCHWToImage(
+     filter_v.data(), filter_image_v.data(), filter_dim);
+
+ auto* input_image2d = input.mutable_data<float, cl::Image2D>(
+     input_image_width, input_image_height, x_image_v.data());
+ auto* filter_image2d = filter.mutable_data<float, cl::Image2D>(
+     filter_image_width,
+     filter_image_height,
+     filter_image_v.data());

  if (bias_flag) {
-   mapped_bias =
-       static_cast<float*>(TargetWrapperCL::MapImage(bias_data,
-                                                     bias_image_width,
-                                                     bias_image_height,
-                                                     cl_image2d_row_pitch,
-                                                     cl_image2d_slice_pitch));
+   nw_convertor.NCHWToImage(
+       filter_v.data(), filter_image_v.data(), filter_dim);
    for (int i = 0; i < bias_dim.production(); ++i) {
      bias_v[i] = static_cast<int>(gen(engine));
    }
    CLImageConverterFolder folder_convertor;
-   folder_convertor.NCHWToImage(bias_v_data, mapped_bias, bias_dim);
+   folder_convertor.NCHWToImage(
+       bias_v.data(), bias_image_v.data(), bias_dim);
+   auto* bias_data = bias.mutable_data<float, cl::Image2D>(
+       bias_image_width, bias_image_height, bias_image_v.data());
  }

- LOG(INFO) << "prepare kernel ready";
- LOG(INFO) << "kernel launch ...";
+ VLOG(4) << "resize output ...";
+ output.Resize(out_dim);
+
+ // cpu conv basic calc
+ lite::Tensor out_ref;
+ out_ref.Resize(out_dim);
+
+ VLOG(4) << "prepare kernel ready";
+ VLOG(4) << "kernel launch ...";
  kernel->Launch();

- LOG(INFO) << "mutable output ...";
- auto* output_data = output.mutable_data<float, cl::Image2D>(out_image_width,
-                                                             out_image_height);
+ VLOG(4) << "mutable output ...";
+ auto* output_image2d = output.mutable_data<float, cl::Image2D>(
+     out_image_width, out_image_height);

  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto* out_ptr = param.output->data<float, cl::Image2D>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
-   VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
+   VLOG(4) << "--- Find the sync event for the target cl "
+              "tensor. ---";
    auto& event = *(it->second);
    event.wait();
  } else {
-   LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
+   LOG(FATAL) << "Could not find the sync event for the target "
+                 "cl tensor.";
  }

- auto* mapped_output =
-     static_cast<float*>(TargetWrapperCL::MapImage(output_data,
-                                                   out_image_width,
-                                                   out_image_height,
-                                                   cl_image2d_row_pitch,
-                                                   cl_image2d_slice_pitch));
- LOG(INFO) << "mutable_data out_ref_data: ";
+ TargetWrapperCL::ImgcpySync(out_image_v.data(),
+                             output.data<float, cl::Image2D>(),
+                             out_image_width,
+                             out_image_height,
+                             cl_image2d_row_pitch,
+                             cl_image2d_slice_pitch,
+                             IoDirection::DtoH);
+ DDim out_image_shape =
+     default_convertor.InitImageDimInfoWith(output.dims());
+ default_convertor.ImageToNCHW(out_image_v.data(),
+                               output_v.data(),
+                               out_image_shape,
+                               output.dims());
+ VLOG(4) << "mutable_data out_ref_data: ";

  // run cpu ref
  auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));

- LOG(INFO) << " conv_basic beigin ..... ";
- conv_basic<float, float>(input_v_data,
+ VLOG(4) << " conv_basic beigin ..... ";
+ conv_basic<float, float>(input_v.data(),
                           out_ref_data,
                           batch_size,
                           oc,
@@ -334,8 +367,8 @@ TEST(conv2d_1x1, compute) {
                           ic,
                           ih,
                           iw,
-                          filter_v_data,
-                          bias_v_data,  // mapped_bias,
+                          filter_v.data(),
+                          bias_v.data(),  // mapped_bias,
                           group,
                           ksize,
                           ksize,
@@ -347,29 +380,31 @@ TEST(conv2d_1x1, compute) {
                           pad,
                           bias_flag,
                           relu_flag);
- LOG(INFO) << " conv_basic end ..... ";
- LOG(INFO) << " out_dim: " << out_dim;
- const DDim& out_image_dims = lite::DDim{
-     std::vector<int64_t>({static_cast<int64_t>(out_image_width),
-                           static_cast<int64_t>(out_image_height)})};
- default_convertor.ImageToNCHW(
-     mapped_output, output_v_data, out_image_dims, out_dim);
+ VLOG(4) << " conv_basic end ..... ";
+ VLOG(4) << " out_dim: " << out_dim;
+ const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
+     {static_cast<int64_t>(out_image_width),
+      static_cast<int64_t>(out_image_height)})};

  for (int i = 0; i < out_dim.production(); i++) {
-   EXPECT_NEAR(output_v_data[i], out_ref_data[i], 1e-3);
-   if (abs(output_v_data[i] - out_ref_data[i]) > 1e-3) {
+   EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
+   if (abs(output_v[i] - out_ref_data[i]) > 1e-2) {
      LOG(FATAL) << "error idx:" << i;
    }
  }

- TargetWrapperCL::Unmap(output_data, mapped_output);
- TargetWrapperCL::Unmap(filter_data, mapped_filter);
- TargetWrapperCL::Unmap(input_data, mapped_input);
- if (bias_flag) {
-   if (mapped_bias) {
-     TargetWrapperCL::Unmap(bias_data, mapped_bias);
-   }
- }
+ #ifdef LOOP_TEST
+               }
+             }
+           }
+         }
+       }
+     }
+   }
+ #else
+ // nothing to do.
+ #endif
}

}  // namespace lite
......