From 936ef61864b3dbbdb5af3acaedcc8fd775ad1ce1 Mon Sep 17 00:00:00 2001
From: Liangliang He <lliang.he@gmail.com>
Date: Wed, 29 Nov 2017 21:07:31 +0800
Subject: [PATCH] Fix conv1x1 opencl tests

---
 mace/kernels/opencl/cl/conv_2d_1x1.cl     | 127 ++++++++++++----------
 mace/kernels/opencl/conv_2d_opencl_1x1.cc |   8 +-
 mace/ops/conv_2d_test.cc                  |  54 ++++++---
 mace/ops/ops_test_util.h                  |   8 +-
 mace/utils/utils.h                        |   8 ++
 5 files changed, 122 insertions(+), 83 deletions(-)

diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index e3e8f2c1..689a7df2 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -19,49 +19,64 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-  DATA_TYPE4 out[4] = {0};
 #ifdef BIAS
-  out[0] =
-      READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
-  out[1] = out[0];
-  out[2] = out[0];
-  out[3] = out[0];
+  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out1 = out0;
+  DATA_TYPE4 out2 = out0;
+  DATA_TYPE4 out3 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
 #endif
 
-  int w[4];
-  w[0] = out_w_blk;
-  w[1] = w[0] + out_w_blks;
-  w[2] = w[1] + out_w_blks;
-  w[3] = w[2] + out_w_blks;
+  int4 w;
+  w.x = out_w_blk;
+  w.y = w.x + out_w_blks;
+  w.z = w.y + out_w_blks;
+  w.w = w.z + out_w_blks;
 
   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in[4];
-    in[0] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[0], out_hb));
-    if (w[1] < width) {
+    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
+    DATA_TYPE4 in1 = 0;
+    DATA_TYPE4 in2 = 0;
+    DATA_TYPE4 in3 = 0;
+    if (w.y < width) {
       // conditional load hurt perf, this branching helps sometimes
-      in[1] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[1], out_hb));
-      in[2] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[2], out_hb));
-      in[3] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[3], out_hb));
+      in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
+      in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
+      in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
     }
 
     const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights[4];
-    #pragma unroll
-    for (int c = 0; c < 4; ++c) {
-      weights[c] = READ_IMAGET(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
-    }
+    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
+    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
+    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
+    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
     // Will prefetch L2 improve performance? How to pretch image data?
 
-    // Interleaving load and mul does not improve performance as expected
-    #pragma unroll
-    for (int wi = 0; wi < 4; ++wi) {
-      out[wi] += in[wi].x * weights[0];
-      out[wi] += in[wi].y * weights[1];
-      out[wi] += in[wi].z * weights[2];
-      out[wi] += in[wi].w * weights[3];
-    }
+    out0 += in0.x * weights0;
+    out0 += in0.y * weights1;
+    out0 += in0.z * weights2;
+    out0 += in0.w * weights3;
+
+    out1 += in1.x * weights0;
+    out1 += in1.y * weights1;
+    out1 += in1.z * weights2;
+    out1 += in1.w * weights3;
+
+    out2 += in2.x * weights0;
+    out2 += in2.y * weights1;
+    out2 += in2.z * weights2;
+    out2 += in2.w * weights3;
+
+    out3 += in3.x * weights0;
+    out3 += in3.y * weights1;
+    out3 += in3.z * weights2;
+    out3 += in3.w * weights3;
 
     in_x_base += width;
   }
@@ -70,42 +85,40 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   // batch norm
   DATA_TYPE4 bn_scale_value =
       READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 scale[4];
-  scale[0] = (DATA_TYPE4)(bn_scale_value.x);
-  scale[1] = (DATA_TYPE4)(bn_scale_value.y);
-  scale[2] = (DATA_TYPE4)(bn_scale_value.z);
-  scale[3] = (DATA_TYPE4)(bn_scale_value.w);
+  DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x);
+  DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y);
+  DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z);
+  DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w);
   DATA_TYPE4 bn_offset_value =
       READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 offset[4];
-  offset[0] = (DATA_TYPE4)(bn_offset_value.x);
-  offset[1] = (DATA_TYPE4)(bn_offset_value.y);
-  offset[2] = (DATA_TYPE4)(bn_offset_value.z);
-  offset[3] = (DATA_TYPE4)(bn_offset_value.w);
-
-  #pragma unroll
-  for (int wi = 0; wi < 4; ++wi) {
-    out[wi] = out[wi] * scale[wi] + offset[wi];
-  }
+  DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x);
+  DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y);
+  DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z);
+  DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w);
+
+  out0 = out0 * scale0 + offset0;
+  out1 = out1 * scale1 + offset1;
+  out2 = out2 * scale2 + offset2;
+  out3 = out3 * scale3 + offset3;
 #endif
 
 #ifdef FUSED_RELU
-  #pragma unroll
-  for (int wi = 0; wi < 4; ++wi) {
-    // TODO relux
-    out[wi] = fmax(out[wi], 0);
-  }
+  // TODO relux
+  out0 = fmax(out0, 0);
+  out1 = fmax(out1, 0);
+  out2 = fmax(out2, 0);
+  out3 = fmax(out3, 0);
 #endif
 
   const int out_x_base = out_ch_blk * width;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[0]);
+  WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
 
-  if (w[1] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[1], out_hb), out[1]);
+  if (w.y >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
 
-  if (w[2] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[2]);
+  if (w.z >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
 
-  if (w[3] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[3]);
+  if (w.w >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
 }
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 4198cf78..e8a88db5 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -5,8 +5,8 @@
 #include "mace/kernels/conv_2d.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/utils/utils.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
@@ -36,8 +36,10 @@ void Conv1x1(const Tensor *input,
   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
   built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
-  built_options.emplace("-DSTRIDE_1");
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  built_options.emplace("-DSTRIDE=" + ToString(stride));
+  if (bias != nullptr) {
+    built_options.emplace("-DBIAS");
+  }
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 864c882a..3ee0dd60 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -398,17 +398,7 @@ TEST_F(Conv2dOpTest, CPUCombined) {
 
 template<DeviceType D>
 void TestConv1x1() {
-  // Construct graph
   OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>(
@@ -425,8 +415,39 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
 
-  // Run
-  net.RunOp(D);
+  // Construct graph
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    OpDefBuilder("Conv2D", "Conv2dTest")
+      .Input("InputImage")
+      .Input("FilterImage")
+      .Input("BiasImage")
+      .Output("OutputImage")
+      .AddIntsArg("strides", {1, 1})
+      .AddIntArg("padding", Padding::VALID)
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+
+  } else {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+      .Input("Input")
+      .Input("Filter")
+      .Input("Bias")
+      .Output("Output")
+      .AddIntsArg("strides", {1, 1})
+      .AddIntArg("padding", Padding::VALID)
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+  }
 
   // Check
   auto expected = CreateTensor<float>(
@@ -445,9 +466,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
 }
 
-//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
-//  TestConv1x1<DeviceType::OPENCL>();
-//}
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
+  TestConv1x1<DeviceType::OPENCL>();
+}
 
 template<DeviceType D>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
@@ -457,6 +478,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     srand(time(NULL));
 
     // generate random input
+    // TODO test all sizes
     index_t batch = 3 + (rand() % 10);
     index_t height = shape[0];
     index_t width = shape[1];
@@ -507,7 +529,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };
 
-  for (int kernel_size : {3}) {
+  for (int kernel_size : {1, 3}) {
     for (int stride : {1}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
       func(kernel_size, kernel_size, stride, stride, SAME);
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 6bdf5db5..3bdb3ca5 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -13,6 +13,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 
@@ -337,13 +338,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   Expector<T>::Near(x, y, abs_err);
 }
 
-template <typename T>
-std::string ToString(const T &input) {
-  std::stringstream ss;
-  ss << input;
-  return ss.str();
-}
-
 template <DeviceType D>
 void BufferToImage(OpsTestNet &net,
                    const std::string &input_name,
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 536a7fb8..a8b13828 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -6,6 +6,7 @@
 #define MACE_UTILS_UTILS_H_
 
 #include <sys/time.h>
+#include <sstream>
 
 namespace mace {
 template <typename Integer>
@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() {
   return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
 }
 
+template <typename T>
+inline std::string ToString(T v) {
+  std::ostringstream ss;
+  ss << v;
+  return ss.str();
+}
+
 }  //  namespace mace
 #endif  //  MACE_UTILS_UTILS_H_
-- 
GitLab