Commit 2e062c5a authored by Liangliang He

Merge branch 'conv1x1-test' into 'master'

Fix conv1x1 opencl tests

See merge request !130
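
Note on the layout the kernel relies on: the comment `[c%4 * w * c/4, h * b]` in the signature below says four consecutive channels are packed into one RGBA texel, channel blocks are tiled along the image x axis, and batches along y. A minimal sketch of that coordinate math follows, with illustrative names that are not MACE APIs:

```cpp
#include <utility>

// Hypothetical helper mirroring the layout implied by the kernel's reads:
// x = in_ch_blk * width + w (see "in_x_base += width" in the loop below),
// y = batch * height + h (the kernel's out_hb).
std::pair<int, int> InputImageCoord(int batch, int channel, int h, int w,
                                    int width, int height) {
  const int ch_blk = channel / 4;    // which block of 4 channels
  const int x = ch_blk * width + w;  // channel blocks tiled along x
  const int y = batch * height + h;  // batches tiled along y
  return {x, y};                     // channel % 4 selects the texel component
}
```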
@@ -19,49 +19,64 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  DATA_TYPE4 out[4] = {0};
 #ifdef BIAS
-  out[0] =
-      READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
-  out[1] = out[0];
-  out[2] = out[0];
-  out[3] = out[0];
+  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out1 = out0;
+  DATA_TYPE4 out2 = out0;
+  DATA_TYPE4 out3 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
 #endif

-  int w[4];
-  w[0] = out_w_blk;
-  w[1] = w[0] + out_w_blks;
-  w[2] = w[1] + out_w_blks;
-  w[3] = w[2] + out_w_blks;
+  int4 w;
+  w.x = out_w_blk;
+  w.y = w.x + out_w_blks;
+  w.z = w.y + out_w_blks;
+  w.w = w.z + out_w_blks;

   // Unrolling this loop hurts performance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in[4];
-    in[0] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[0], out_hb));
-    if (w[1] < width) {
+    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
+    DATA_TYPE4 in1 = 0;
+    DATA_TYPE4 in2 = 0;
+    DATA_TYPE4 in3 = 0;
+    if (w.y < width) {
       // conditional loads hurt perf; this branching helps sometimes
-      in[1] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[1], out_hb));
-      in[2] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[2], out_hb));
-      in[3] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[3], out_hb));
+      in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
+      in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
+      in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
     }

     const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights[4];
-#pragma unroll
-    for (int c = 0; c < 4; ++c) {
-      weights[c] = READ_IMAGET(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
-    }
+    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
+    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
+    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
+    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
     // Will prefetching into L2 improve performance? How to prefetch image data?
     // Interleaving loads and muls does not improve performance as expected
-#pragma unroll
-    for (int wi = 0; wi < 4; ++wi) {
-      out[wi] += in[wi].x * weights[0];
-      out[wi] += in[wi].y * weights[1];
-      out[wi] += in[wi].z * weights[2];
-      out[wi] += in[wi].w * weights[3];
-    }
+    out0 += in0.x * weights0;
+    out0 += in0.y * weights1;
+    out0 += in0.z * weights2;
+    out0 += in0.w * weights3;
+
+    out1 += in1.x * weights0;
+    out1 += in1.y * weights1;
+    out1 += in1.z * weights2;
+    out1 += in1.w * weights3;
+
+    out2 += in2.x * weights0;
+    out2 += in2.y * weights1;
+    out2 += in2.z * weights2;
+    out2 += in2.w * weights3;
+
+    out3 += in3.x * weights0;
+    out3 += in3.y * weights1;
+    out3 += in3.z * weights2;
+    out3 += in3.w * weights3;

     in_x_base += width;
   }
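
For reference: per input-channel block, the unrolled statements above compute, for each of the four output pixels p = 0..3 handled by one work-item, out_p += Σ_{k=0..3} in_p[k] * weights_k, where in_p[k] is channel k of pixel p and each weights_k packs four output channels. Replacing the earlier private arrays with named scalars presumably lets the compiler keep the accumulators in registers instead of spilling an indexed array.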
@@ -70,42 +85,40 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   // batch norm
   DATA_TYPE4 bn_scale_value =
       READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 scale[4];
-  scale[0] = (DATA_TYPE4)(bn_scale_value.x);
-  scale[1] = (DATA_TYPE4)(bn_scale_value.y);
-  scale[2] = (DATA_TYPE4)(bn_scale_value.z);
-  scale[3] = (DATA_TYPE4)(bn_scale_value.w);
+  DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x);
+  DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y);
+  DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z);
+  DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w);
   DATA_TYPE4 bn_offset_value =
       READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 offset[4];
-  offset[0] = (DATA_TYPE4)(bn_offset_value.x);
-  offset[1] = (DATA_TYPE4)(bn_offset_value.y);
-  offset[2] = (DATA_TYPE4)(bn_offset_value.z);
-  offset[3] = (DATA_TYPE4)(bn_offset_value.w);
-#pragma unroll
-  for (int wi = 0; wi < 4; ++wi) {
-    out[wi] = out[wi] * scale[wi] + offset[wi];
-  }
+  DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x);
+  DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y);
+  DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z);
+  DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w);
+  out0 = out0 * scale0 + offset0;
+  out1 = out1 * scale1 + offset1;
+  out2 = out2 * scale2 + offset2;
+  out3 = out3 * scale3 + offset3;
 #endif

 #ifdef FUSED_RELU
-#pragma unroll
-  for (int wi = 0; wi < 4; ++wi) {
-    // TODO relux
-    out[wi] = fmax(out[wi], 0);
-  }
+  out0 = fmax(out0, 0);
+  out1 = fmax(out1, 0);
+  out2 = fmax(out2, 0);
+  out3 = fmax(out3, 0);
 #endif

   const int out_x_base = out_ch_blk * width;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[0]);
+  WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);

-  if (w[1] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[1], out_hb), out[1]);
+  if (w.y >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);

-  if (w[2] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[2]);
+  if (w.z >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);

-  if (w[3] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[3]);
+  if (w.w >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
 }
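
The work-item mapping is not visible here (the kernel prologue sits in the collapsed lines above), but assuming the usual pattern of get_global_id(0/1/2) feeding out_ch_blk / out_w_blk / out_hb, the host would size the NDRange roughly as in this sketch (names illustrative, not taken from this commit):

```cpp
#include <cstddef>

// Assumed host-side sizing: each work-item produces 4 output channels
// times 4 output pixels along the width.
inline size_t RoundUpDiv4(size_t v) { return (v + 3) / 4; }

void GlobalWorkSize(size_t batch, size_t height, size_t width,
                    size_t out_channels, size_t gws[3]) {
  gws[0] = RoundUpDiv4(out_channels);  // out_ch_blk
  gws[1] = RoundUpDiv4(width);         // out_w_blk; out_w_blks = gws[1]
  gws[2] = batch * height;             // out_hb = b * height + h
}
```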
@@ -5,8 +5,8 @@
 #include "mace/kernels/conv_2d.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/utils/utils.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {
@@ -36,8 +36,10 @@ void Conv1x1(const Tensor *input,
   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
   built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
-  built_options.emplace("-DSTRIDE_1");
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  built_options.emplace("-DSTRIDE=" + ToString(stride));
+  if (bias != nullptr) {
+    built_options.emplace("-DBIAS");
+  }
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
......
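
Note on the build-option change above: `"-DSTRIDE=" + ToString(stride)` turns the stride into a compile-time constant instead of the old boolean `-DSTRIDE_1` flag, so the kernel can specialize per stride. A hypothetical kernel-side use, not shown in this diff:

```cpp
// Illustrative only; in practice STRIDE is injected via the build options
// passed to the OpenCL compiler, e.g. -DSTRIDE=1.
#ifndef STRIDE
#define STRIDE 1
#endif

int InputX(int out_x) {
#if STRIDE == 1
  return out_x;           // stride-1 fast path: the multiply folds away
#else
  return out_x * STRIDE;  // general strided mapping
#endif
}
```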
@@ -398,17 +398,7 @@ TEST_F(Conv2dOpTest, CPUCombined) {
 template<DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());

   // Add input data
   net.AddInputFromArray<D, float>(
@@ -425,8 +415,39 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});

-  // Run
+  // Construct graph
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+  }

   // Check
   auto expected = CreateTensor<float>(
@@ -445,9 +466,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
 }

-//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
-//  TestConv1x1<DeviceType::OPENCL>();
-//}
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
+  TestConv1x1<DeviceType::OPENCL>();
+}

 template<DeviceType D>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
@@ -457,6 +478,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   srand(time(NULL));

   // generate random input
+  // TODO test all sizes
   index_t batch = 3 + (rand() % 10);
   index_t height = shape[0];
   index_t width = shape[1];
@@ -507,7 +529,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };

-  for (int kernel_size : {3}) {
+  for (int kernel_size : {1, 3}) {
     for (int stride : {1}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
       func(kernel_size, kernel_size, stride, stride, SAME);
......
@@ -13,6 +13,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"

 namespace mace {
@@ -337,13 +338,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   Expector<T>::Near(x, y, abs_err);
 }

-template <typename T>
-std::string ToString(const T &input) {
-  std::stringstream ss;
-  ss << input;
-  return ss.str();
-}
-
 template <DeviceType D>
 void BufferToImage(OpsTestNet &net,
                    const std::string &input_name,
......
@@ -6,6 +6,7 @@
 #define MACE_UTILS_UTILS_H_

 #include <sys/time.h>
+#include <sstream>

 namespace mace {
@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() {
   return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
 }

+template <typename T>
+inline std::string ToString(T v) {
+  std::ostringstream ss;
+  ss << v;
+  return ss.str();
+}
+
 }  // namespace mace

 #endif  // MACE_UTILS_UTILS_H_
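
The `ToString` helper added here is what the conv_2d_opencl.cc hunk above uses to assemble the `-DSTRIDE=` define. A self-contained usage sketch:

```cpp
#include <iostream>
#include <set>
#include <sstream>
#include <string>

// Same helper as added to mace/utils/utils.h above.
template <typename T>
inline std::string ToString(T v) {
  std::ostringstream ss;
  ss << v;
  return ss.str();
}

int main() {
  std::set<std::string> built_options;
  const int stride = 1;
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  std::cout << *built_options.begin() << "\n";  // prints -DSTRIDE=1
}
```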