Merge branch 'new-fc' into 'master'

Optimize fully connected op for adreno gpu and Support winograd convolution for caffe model. See merge request !300

Merge branch 'new-fc' into 'master'
Optimize fully connected op for adreno gpu and Support winograd convolution for caffe model. See merge request !300
917f19e9 · Liangliang He · ef032272 · bd9157f5 · 917f19e9 · 917f19e9
17 changed file
--- a/mace/core/runtime/opencl/opencl_extension.h
+++ b/mace/core/runtime/opencl/opencl_extension.h
@@ -25,4 +25,6 @@ typedef cl_uint cl_priority_hint;
 #define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
 #define CL_PRIORITY_HINT_LOW_QCOM 0x40CC

+/* Accepted by clGetKernelWorkGroupInfo */
+#define CL_KERNEL_WAVE_SIZE_QCOM 0xAA02
 #endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_EXTENSION_H_
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -331,4 +331,11 @@ uint32_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
  return static_cast<uint32_t>(size);
 }

+// TODO(liuqi): not compatible with mali gpu.
+uint32_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
+  unsigned long long size = 0;
+  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
+  return static_cast<uint32_t>(size);
+}
+
 }  // namespace mace
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -48,6 +48,7 @@ class OpenCLRuntime {
  void GetCallStats(const cl::Event &event, CallStats *stats);
  uint32_t GetDeviceMaxWorkGroupSize();
  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
+  uint32_t GetKernelWaveSize(const cl::Kernel &kernel);
  cl::Kernel BuildKernel(const std::string &program_name,
                         const std::string &kernel_name,
                         const std::set<std::string> &build_options);

--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -9,24 +9,30 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
+#include "mace/kernels/opencl/helper.h"

 namespace mace {
 namespace kernels {

 struct FullyConnectedBase {
-  FullyConnectedBase(const ActivationType activation,
+  FullyConnectedBase(const BufferType weight_type,
+                     const ActivationType activation,
                     const float relux_max_limit)
-      : activation_(activation), relux_max_limit_(relux_max_limit) {}
+      : weight_type_(weight_type),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit) {}

+  const int weight_type_;
  const ActivationType activation_;
  const float relux_max_limit_;
 };

 template <DeviceType D, typename T>
 struct FullyConnectedFunctor : FullyConnectedBase {
-  FullyConnectedFunctor(const ActivationType activation,
+  FullyConnectedFunctor(const BufferType weight_type,
+                        const ActivationType activation,
                        const float relux_max_limit)
-      : FullyConnectedBase(activation, relux_max_limit) {}
+      : FullyConnectedBase(weight_type, activation, relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *weight,
@@ -70,9 +76,10 @@ struct FullyConnectedFunctor : FullyConnectedBase {

 template <typename T>
 struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
-  FullyConnectedFunctor(const ActivationType activation,
+  FullyConnectedFunctor(const BufferType weight_type,
+                        const ActivationType activation,
                        const float relux_max_limit)
-      : FullyConnectedBase(activation, relux_max_limit) {}
+      : FullyConnectedBase(weight_type, activation, relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *weight,
@@ -81,6 +88,8 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  std::vector<uint32_t> gws_;
+  std::vector<uint32_t> lws_;
 };

 }  // namespace kernels

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -49,6 +49,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                         : "in_out_height_buffer_to_image";
      break;
    case IN_OUT_WIDTH:
+    case WEIGHT_WIDTH:
      MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
      kernel_name = "in_out_width_buffer_to_image";
      break;
@@ -88,7 +89,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
  }
  if (type == ARGUMENT) {
    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
-  } else if (type == WEIGHT_HEIGHT) {
+  } else if (type == WEIGHT_HEIGHT || type == WEIGHT_WIDTH) {
    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
    b2f_kernel.setArg(idx++, 1);

--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -318,10 +318,11 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
                                           __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+  const int width_blks = (width + 3) / 4;
  const int batch_idx = h / height;
  const int height_idx = h % height;
-  const int width_idx = (w % width) << 2;
-  const int channel_idx = w / width;
+  const int width_idx = (w % width_blks) << 2;
+  const int channel_idx = w / width_blks;
  const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
      + channel_idx;


--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
@@ -4,7 +4,7 @@
 __kernel void fully_connected(__read_only image2d_t input,
                              __read_only image2d_t weight,
 #ifdef BIAS
-                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
                              __write_only image2d_t output,
                              __private const int input_height,
@@ -55,3 +55,80 @@ __kernel void fully_connected(__read_only image2d_t input,
 #endif
  WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
 }
+
+// output = weight * input + bias
+__kernel void fully_connected_width(__read_only image2d_t input,
+                                    __read_only image2d_t weight,
+#ifdef BIAS
+    __read_only image2d_t bias,
+#endif
+                                    __write_only image2d_t output,
+                                    __local float *intermediate_output,
+                                    __private const int input_height,
+                                    __private const int input_width,
+                                    __private const int in_chan_blks,
+                                    __private const int out_blks,
+                                    __private const float relux_max_limit) {
+  const int inter_out_idx = get_global_id(0);
+  const int width_blk_idx = get_global_id(1);
+  const int width_blk_count = get_global_size(1);
+  const int batch_out_blk_idx = get_global_id(2);
+  const int batch_idx = batch_out_blk_idx / out_blks;
+  const int out_blk_idx = batch_out_blk_idx % out_blks;
+
+  const short in_outer_size = mul24(input_width, in_chan_blks);
+  const short weight_y = mad24(out_blk_idx, 4, inter_out_idx);
+
+  int2 input_coord, weight_coord;
+  DATA_TYPE4 in, w;
+  DATA_TYPE sum = 0.0;
+
+  input_coord = (int2)(0, mul24(batch_idx, input_height));
+
+  for (int h_idx = 0; h_idx < input_height; ++h_idx) {
+    int weight_x_base = mul24(h_idx, in_outer_size);
+    for (int w_idx = width_blk_idx; w_idx < input_width;
+         w_idx += width_blk_count) {
+      int weight_x = mad24(w_idx, in_chan_blks, weight_x_base);
+      weight_coord = (int2)(weight_x, weight_y);
+      input_coord.x = w_idx;
+#pragma unroll
+      for (int chan_idx = 0; chan_idx < in_chan_blks; ++chan_idx) {
+        in = READ_IMAGET(input, SAMPLER, input_coord);
+
+        w = READ_IMAGET(weight, SAMPLER, weight_coord);
+
+        sum += dot(in, w);
+
+        input_coord.x += input_width;
+        weight_coord.x += 1;
+      }
+    }
+    input_coord.y++;
+  }
+
+  const short inter_out_offset = mad24(get_local_id(1), 4, get_local_id(0));
+  const short local_width_blk_size = (short)get_local_size(1);
+  const short local_size = mul24((short)get_local_size(0),
+                                 local_width_blk_size);
+  short inter_idx = mad24((short)get_local_id(2), local_size, inter_out_offset);
+  intermediate_output[inter_idx] = sum;
+
+  if (inter_out_offset == 0) {
+#ifdef BIAS
+    DATA_TYPE4 result = READ_IMAGET(bias, SAMPLER, (int2)(out_blk_idx, 0));
+#else
+    DATA_TYPE4 result = (DATA_TYPE4)(0, 0, 0, 0);
+#endif
+
+    for(short i = 0; i < local_width_blk_size; ++i) {
+      result += vload4(0, intermediate_output+inter_idx);
+      inter_idx += 4;
+    }
+
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
+    result = do_activation(result, relux_max_limit);
+#endif
+    WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
+  }
+}
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -3,31 +3,113 @@
 //

 #include "mace/kernels/fully_connected.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
 #include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {

 template <typename T>
-void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
-    const Tensor *input,
-    const Tensor *weight,
-    const Tensor *bias,
-    Tensor *output,
-    StatsFuture *future) {
-  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
-  output->ResizeImage(output_shape, output_image_shape);
+void FCWXKernel(cl::Kernel *kernel,
+                const Tensor *input,
+                const Tensor *weight,
+                const Tensor *bias,
+                Tensor *output,
+                const ActivationType activation,
+                std::vector<uint32_t> &gws,
+                std::vector<uint32_t> &lws,
+                const float relux_max_limit,
+                StatsFuture *future) {
+  MACE_CHECK(input->dim(3) % 4 == 0)
+    << "FC width kernel only support input with 4x channel.";
+  auto runtime = OpenCLRuntime::Global();

-  const index_t batch = output->dim(0);
-  const index_t output_size = output->dim(3);
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    auto dt = DataTypeToEnum<T>::value;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
+    kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
+    built_options.emplace("-Dfully_connected_width=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (bias != nullptr) {
+      built_options.emplace("-DBIAS");
+    }
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
+
+    *kernel =
+        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+
+    const index_t batch = output->dim(0);
+    const index_t output_size = output->dim(3);
+    const index_t output_blocks = RoundUpDiv4(output_size);
+    const uint32_t wave_size = runtime->GetKernelWaveSize(*kernel);
+
+    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
+
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(*kernel);
+    const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
+    lws = {gws[0], gws[1], inter_local_blks};
+
+    uint32_t idx = 0;
+    kernel->setArg(idx++, *(input->opencl_image()));
+    kernel->setArg(idx++, *(weight->opencl_image()));
+    if (bias != nullptr) {
+      kernel->setArg(idx++, *(bias->opencl_image()));
+    }
+    kernel->setArg(idx++, *(output->opencl_image()));
+    kernel->setArg(idx++, (lws[0] * lws[1] * lws[2] * sizeof(float)), nullptr);
+    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
+    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
+    kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
+    kernel->setArg(idx++, static_cast<int>(output_blocks));
+    kernel->setArg(idx++, relux_max_limit);
+  }
+  cl::Event event;
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        runtime->GetCallStats(event, stats);
+      }
+    };
+  }

-  const index_t output_blocks = RoundUpDiv4(output_size);
+}

-  if (kernel_.get() == nullptr) {
+template <typename T>
+void FCWTXKernel(cl::Kernel *kernel,
+                 const Tensor *input,
+                 const Tensor *weight,
+                 const Tensor *bias,
+                 Tensor *output,
+                 const ActivationType activation,
+                 std::vector<uint32_t> &gws,
+                 std::vector<uint32_t> &lws,
+                 const float relux_max_limit,
+                 StatsFuture *future) {
+  if (kernel->get() == nullptr) {
    auto runtime = OpenCLRuntime::Global();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
@@ -38,7 +120,7 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }
-    switch (activation_) {
+    switch (activation) {
      case NOOP:
        break;
      case RELU:
@@ -54,33 +136,61 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
        built_options.emplace("-DUSE_SIGMOID");
        break;
      default:
-        LOG(FATAL) << "Unknown activation type: " << activation_;
+        LOG(FATAL) << "Unknown activation type: " << activation;
    }
-    kernel_ =
+    *kernel =
        runtime->BuildKernel("fully_connected", kernel_name, built_options);

    uint32_t idx = 0;
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(weight->opencl_image()));
+    kernel->setArg(idx++, *(input->opencl_image()));
+    kernel->setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
-      kernel_.setArg(idx++, *(bias->opencl_image()));
+      kernel->setArg(idx++, *(bias->opencl_image()));
    }
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
-    kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
-    kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
+    kernel->setArg(idx++, *(output->opencl_image()));
+    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
+    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
+    kernel->setArg(idx++, static_cast<int>(input->dim(3)));
    // FIXME handle flexable data type: half not supported
-    kernel_.setArg(idx++, relux_max_limit_);
+    kernel->setArg(idx++, relux_max_limit);
+
+    const index_t batch = output->dim(0);
+    const index_t output_size = output->dim(3);
+
+    const index_t output_blocks = RoundUpDiv4(output_size);
+
+    gws = {
+        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
+    };
+    lws = {16, 64, 1};
  }

-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
-  };
-  const std::vector<uint32_t> lws = {16, 64, 1};
  std::stringstream ss;
  ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
     << output->dim(2) << "_" << output->dim(3);
-  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
+  TuningOrRun2DKernel(*kernel, ss.str(), gws.data(), lws, future);
+
+}
+
+template <typename T>
+void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
+    const Tensor *input,
+    const Tensor *weight,
+    const Tensor *bias,
+    Tensor *output,
+    StatsFuture *future) {
+  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
+  std::vector<size_t> output_image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  output->ResizeImage(output_shape, output_image_shape);
+
+  if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
+    FCWTXKernel<T>(&kernel_, input, weight, bias, output,
+                   activation_, gws_, lws_, relux_max_limit_, future);
+  } else {
+    FCWXKernel<T>(&kernel_, input, weight, bias, output,
+                     activation_, gws_, lws_, relux_max_limit_, future);
+  }
 };

 template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -84,6 +84,15 @@ void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* HW */
  image_shape[1] = RoundUpDiv4(shape[0]);
 }

+// [(W + 3) / 4, H]
+void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
+                              std::vector<size_t> &image_shape) {
+  MACE_CHECK(shape.size() == 2);
+  image_shape.resize(2);
+  image_shape[0] = RoundUpDiv4(shape[1]);
+  image_shape[1] = shape[0];
+}
+
 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                     const BufferType type,
                     std::vector<size_t> &image_shape) {
@@ -112,6 +121,9 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
    case WEIGHT_HEIGHT:
      CalWeightHeightImageShape(shape, image_shape);
      break;
+    case WEIGHT_WIDTH:
+      CalWeightWidthImageShape(shape, image_shape);
+      break;
    default:
      LOG(FATAL) << "Mace not supported yet.";
  }

--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -25,6 +25,7 @@ enum BufferType {
  WINOGRAD_FILTER = 5,
  DW_CONV2D_FILTER = 6,
  WEIGHT_HEIGHT = 7,
+  WEIGHT_WIDTH = 8,
 };

 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */

--- a/mace/ops/fully_connected.h
+++ b/mace/ops/fully_connected.h
@@ -15,7 +15,11 @@ class FullyConnectedOp : public Operator<D, T> {
 public:
  FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<D, T>(operator_def, ws),
-        functor_(kernels::StringToActivationType(
+        functor_(static_cast<kernels::BufferType>(
+                     OperatorBase::GetSingleArgument<int>(
+                         "weight_type", static_cast<int>(
+                             kernels::WEIGHT_WIDTH))),
+                 kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}

--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -22,10 +22,12 @@ static void FCBenchmark(
  net.AddRandomInput<D, float>("Bias", {out_channel});

  if (D == DeviceType::OPENCL) {
+    const int width_size = height * width * channel;
+    kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH;
+    BufferToImage<D, T>(net, "Weight", "WeightImage",
+                        weight_type);
    BufferToImage<D, T>(net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(net, "Weight", "WeightImage",
-                        kernels::BufferType::WEIGHT_HEIGHT);
    BufferToImage<D, T>(net, "Bias", "BiasImage",
                        kernels::BufferType::ARGUMENT);

@@ -34,6 +36,7 @@ static void FCBenchmark(
        .Input("WeightImage")
        .Input("BiasImage")
        .Output("OutputImage")
+        .AddIntArg("weight_type", static_cast<int>(weight_type))
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
  } else {
@@ -78,4 +81,6 @@ static void FCBenchmark(

 BM_FC(1, 16, 16, 32, 32);
 BM_FC(1, 8, 8, 32, 1000);
+BM_FC(1, 2, 2, 512, 2);
+BM_FC(1, 7, 7, 512, 4096);
 }  // namespace mace
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -39,6 +39,7 @@ void Simple(const std::vector<index_t> &input_shape,
        .Input("WeightImage")
        .Input("BiasImage")
        .Output("OutputImage")
+        .AddIntArg("weight_type", kernels::BufferType::WEIGHT_HEIGHT)
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
@@ -147,6 +148,7 @@ void Complex(const index_t batch,
      .Input("WeightImage")
      .Input("BiasImage")
      .Output("OutputImage")
+      .AddIntArg("weight_type", kernels::BufferType::WEIGHT_HEIGHT)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

@@ -183,4 +185,82 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
  Complex<half>(16, 13, 12, 31, 113);
  Complex<half>(31, 21, 11, 23, 103);
 }
+
+template <typename T>
+void TestWXFormat(const index_t batch,
+                  const index_t height,
+                  const index_t width,
+                  const index_t channels,
+                  const index_t out_channel) {
+  srand(time(NULL));
+
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("FC", "FullyConnectedTest")
+      .Input("Input")
+      .Input("Weight")
+      .Input("Bias")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>(
+      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>(
+      "Weight", {out_channel, height * width * channels});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
+
+  // run cpu
+  net.RunOp();
+
+  // Check
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Run on opencl
+  BufferToImage<DeviceType::OPENCL, T>(net, "Input", "InputImage",
+                                       kernels::BufferType::IN_OUT_CHANNEL);
+  BufferToImage<DeviceType::OPENCL, T>(net, "Weight", "WeightImage",
+                                       kernels::BufferType::WEIGHT_WIDTH);
+  BufferToImage<DeviceType::OPENCL, float>(net, "Bias", "BiasImage",
+                                           kernels::BufferType::ARGUMENT);
+
+  OpDefBuilder("FC", "FullyConnectedTest")
+      .Input("InputImage")
+      .Input("WeightImage")
+      .Input("BiasImage")
+      .Output("OutputImage")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Run on opencl
+  net.RunOp(DeviceType::OPENCL);
+
+  ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
+                                           kernels::BufferType::IN_OUT_CHANNEL);
+  if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1);
+  } else {
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
+  }
+}
+
+TEST_F(FullyConnectedOpTest, OPENCLWidthFormatAligned) {
+  TestWXFormat<float>(1, 7, 7, 32, 16);
+  TestWXFormat<float>(1, 7, 7, 512, 128);
+  TestWXFormat<float>(1, 1, 1, 2048, 1024);
+}
+
+TEST_F(FullyConnectedOpTest, OPENCLWidthFormatMultiBatch) {
+  TestWXFormat<float>(11, 7, 7, 32, 16);
+  TestWXFormat<float>(5, 7, 7, 512, 128);
+  TestWXFormat<float>(3, 1, 1, 2048, 1024);
+}
+
+TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
+  TestWXFormat<float>(1, 2, 2, 512, 2);
+  TestWXFormat<half>(1, 11, 11, 32, 16);
+  TestWXFormat<half>(1, 16, 32, 32, 32);
+}
+
 }
--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -148,4 +148,106 @@ TEST_F(WinogradConvlutionTest, BatchConvolution) {
  WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31,
                                                 Padding::SAME);
 }
+
+template <DeviceType D, typename T>
+void WinogradConvolutionWithPad(const index_t batch,
+                                const index_t height,
+                                const index_t width,
+                                const index_t in_channels,
+                                const index_t out_channels,
+                                const int padding) {
+  srand(time(NULL));
+
+  // Construct graph
+  OpsTestNet net;
+  // Add input data
+  std::vector<float> filter_data;
+  std::vector<index_t> filter_shape = {3, 3, out_channels, in_channels};
+  GenerateRandomRealTypeData<float>(filter_shape, filter_data);
+  net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
+  net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
+  net.AddRandomInput<D, T>("Bias", {out_channels});
+
+  BufferToImage<D, T>(net, "Input", "InputImage",
+                      kernels::BufferType::IN_OUT_CHANNEL);
+  BufferToImage<D, T>(net, "Filter", "FilterImage",
+                      kernels::BufferType::CONV2D_FILTER);
+  BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+  OpDefBuilder("Conv2D", "Conv2dTest")
+      .Input("InputImage")
+      .Input("FilterImage")
+      .Input("BiasImage")
+      .Output("OutputImage")
+      .AddIntsArg("strides", {1, 1})
+      .AddIntsArg("padding_values", {padding, padding})
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp(D);
+
+  // Transfer output
+  ImageToBuffer<D, T>(net, "OutputImage", "ConvOutput",
+                      kernels::BufferType::IN_OUT_CHANNEL);
+  Tensor expected;
+  expected.Copy(*net.GetOutput("ConvOutput"));
+  auto output_shape = expected.shape();
+
+  // Winograd convolution
+  // transform filter
+  std::vector<float> wino_filter_data;
+  TransposeFilter(filter_data, filter_shape, wino_filter_data);
+  net.AddInputFromArray<D, float>(
+      "WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data);
+  BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter",
+                      kernels::BufferType::WINOGRAD_FILTER);
+
+  // transform input
+  OpDefBuilder("WinogradTransform", "WinogradTransformTest")
+      .Input("InputImage")
+      .Output("WinoInput")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .AddIntsArg("padding_values", {padding, padding})
+      .Finalize(net.NewOperatorDef());
+
+  // Run on opencl
+  net.RunOp(D);
+
+  // MatMul
+  OpDefBuilder("MatMul", "MatMulTest")
+      .Input("WinoFilter")
+      .Input("WinoInput")
+      .Output("WinoGemm")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+  // Run on opencl
+  net.RunOp(D);
+
+  // Inverse transform
+  OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
+      .Input("WinoGemm")
+      .Input("BiasImage")
+      .AddIntArg("batch", batch)
+      .AddIntArg("height", output_shape[1])
+      .AddIntArg("width", output_shape[2])
+      .Output("WinoOutputImage")
+      .Finalize(net.NewOperatorDef());
+
+  // Run on opencl
+  net.RunOp(D);
+  net.Sync();
+
+  ImageToBuffer<D, float>(net, "WinoOutputImage", "WinoOutput",
+                          kernels::BufferType::IN_OUT_CHANNEL);
+  if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
+    ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-1);
+  } else {
+    ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-3);
+  }
+}
+
+TEST_F(WinogradConvlutionTest, UnAlignedConvolutionPad2) {
+  WinogradConvolutionWithPad<DeviceType::OPENCL, float>(1, 64, 64, 40, 19, 2);
+  WinogradConvolutionWithPad<DeviceType::OPENCL, float>(1, 32, 32, 96, 109, 2);
+}
+
 }
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -19,6 +19,7 @@ buffer_type_map = {
  'WINOGRAD_FILTER' : 5,
  'DW_CONV2D_FILTER' : 6,
  'WEIGHT_HEIGHT' : 7,
+  'WEIGHT_WIDTH' : 8,
 }

 data_type_map = {
@@ -310,24 +311,25 @@ class CaffeConverter(object):
      pad = [param.pad * 2, param.pad * 2]
      kernel = [param.kernel_size, param.kernel_size]

-    strides_arg = op_def.arg.add()
-    strides_arg.name = 'strides'
    if param.HasField("stride_h") or param.HasField("stride_w"):
      stride = [param.stride_h, param.stride_w]
-    strides_arg.ints.extend(stride)
    # Pad
-    padding_arg = op_def.arg.add()
-    padding_arg.name = 'padding_values'
    if param.HasField("pad_h") or param.HasField("pad_w"):
      pad = [param.pad_h * 2, param.pad_w * 2]
-    padding_arg.ints.extend(pad)
-    # kernel
-    if op_def.type == 'Pooling':
-      kernel_arg = op_def.arg.add()
-      kernel_arg.name = 'kernels'
-      if param.HasField("kernel_h") or param.HasField("kernel_w"):
-        kernel = [param.kernel_h, param.kernel_w]
-      kernel_arg.ints.extend(kernel)
+
+    if op_def is not None:
+      strides_arg = op_def.arg.add()
+      strides_arg.name = 'strides'
+      strides_arg.ints.extend(stride)
+
+      padding_arg = op_def.arg.add()
+      padding_arg.name = 'padding_values'
+      padding_arg.ints.extend(pad)
+
+      if op_def.type == 'Pooling':
+        if param.HasField("kernel_h") or param.HasField("kernel_w"):
+          kernel = [param.kernel_h, param.kernel_w]
+
    return pad, stride, kernel

  def convert_conv2d(self, op):
@@ -391,6 +393,125 @@ class CaffeConverter(object):
    self.add_output_shape(op_def, output_shape)
    self.net_def.op.extend([op_def])

+  def check_winograd_conv(self, op):
+    param = op.layer.convolution_param
+    filter_shape = np.asarray(op.data[0].shape)
+    filter_shape = filter_shape[[2, 3, 0, 1]]
+    paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
+
+    dilations = [1, 1]
+    if len(param.dilation) > 0:
+      if len(param.dilation) == 1:
+        dilations = [param.dilation[0], param.dilation[0]]
+      elif len(param.dilation) == 2:
+        dilations = [param.dilation[0], param.dilation[1]]
+
+    output_shape = Shapes.conv_pool_shape(
+      op.get_single_parent().output_shape_map[op.layer.bottom[0]],
+      filter_shape, paddings, strides, dilations, math.floor)
+    width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
+    return self.winograd and self.device == 'gpu' and \
+           filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
+           dilations[0] == 1 and (dilations[0] == dilations[1]) and\
+           (strides[0] == 1) and (strides[0] == strides[1]) and \
+           (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
+           (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
+           (width < OPENCL_IMAGE_MAX_SIZE)
+
+  def convert_winograd_conv(self, op):
+    # Add filter
+    weight_tensor_name = op.name + '_weight:0'
+    self.add_tensor(weight_tensor_name, op.data[0])
+
+    buffer_type = "WINOGRAD_FILTER"
+    filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
+
+    param = op.layer.convolution_param
+    paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
+
+    filter_shape = np.asarray(op.data[0].shape)
+    filter_shape = filter_shape[[2, 3, 0, 1]]
+
+    output_shape = Shapes.conv_pool_shape(
+      op.get_single_parent().output_shape_map[op.layer.bottom[0]],
+      filter_shape, paddings, strides, [1, 1], math.floor)
+
+    # Input transform
+    wt_op = mace_pb2.OperatorDef()
+    arg = wt_op.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    padding_arg = wt_op.arg.add()
+    padding_arg.name = 'padding_values'
+    padding_arg.ints.extend(paddings)
+    wt_op.name = op.name + '_input_transform'
+    wt_op.type = 'WinogradTransform'
+    wt_op.input.extend([name+':0' for name in self.inputs_map[op.name]])
+    wt_output_name = wt_op.name + ":0"
+    wt_op.output.extend([wt_output_name])
+    wt_output_shape = mace_pb2.OutputShape()
+    wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
+    wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
+    wt_op.output_shape.extend([wt_output_shape])
+
+    # MatMul
+    matmul_op = mace_pb2.OperatorDef()
+    arg = matmul_op.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    matmul_op.name = op.name + '_matmul'
+    matmul_op.type = 'MatMul'
+    matmul_op.input.extend([filter_name, wt_output_name])
+    matmul_output_name = matmul_op.name + ":0"
+    matmul_op.output.extend([matmul_output_name])
+    matmul_output_shape = mace_pb2.OutputShape()
+    matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
+    matmul_op.output_shape.extend([matmul_output_shape])
+
+    # Inverse transform
+    iwt_op = mace_pb2.OperatorDef()
+    arg = iwt_op.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    batch_arg = iwt_op.arg.add()
+    batch_arg.name = 'batch'
+    batch_arg.i = output_shape[0]
+    height_arg = iwt_op.arg.add()
+    height_arg.name = 'height'
+    height_arg.i = output_shape[1]
+    width_arg = iwt_op.arg.add()
+    width_arg.name = 'width'
+    width_arg.i = output_shape[2]
+    iwt_op.name = op.name + '_inverse_transform'
+    iwt_op.type = 'WinogradInverseTransform'
+    iwt_op.input.extend([matmul_output_name])
+
+    # Add Bias
+    if len(op.data) == 2:
+      bias_tensor_name = op.name + '_bias:0'
+      bias_data = op.data[1].reshape(-1)
+      self.add_tensor(bias_tensor_name, bias_data)
+      output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT")
+      iwt_op.input.extend([output_name])
+
+    final_op = op
+    final_op.output_shape_map[final_op.layer.top[0]] = output_shape
+    self.resolved_ops.add(op.name)
+
+    if len(self.ops_map[final_op.name].children) == 1 \
+        and self.ops_map[final_op.name].children[0].type in activation_name_map:
+      activation_op = self.ops_map[final_op.name].children[0]
+      fused_act_arg = iwt_op.arg.add()
+      fused_act_arg.name = 'activation'
+      fused_act_arg.s = activation_name_map[activation_op.type]
+      final_op = activation_op
+      final_op.output_shape_map[final_op.layer.top[0]] = output_shape
+      self.resolved_ops.add(activation_op.name)
+
+    iwt_op.output.extend([final_op.name+':0'])
+    self.add_output_shape(iwt_op, output_shape)
+    self.net_def.op.extend([wt_op, matmul_op, iwt_op])
+
  def convert_batchnorm(self, op):
    if len(op.children) != 1 or op.children[0].type != 'Scale':
      raise Exception('Now only support BatchNorm+Scale')
@@ -468,10 +589,21 @@ class CaffeConverter(object):
    self.add_tensor(weight_tensor_name, weight_data)
    if self.device == 'gpu':
      if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
-          or weight_data.shape[1] > OPENCL_IMAGE_MAX_SIZE:
+          and (weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
+        raise Exception('Mace gpu do not support FC with weight shape: '
+                        +str(weight_data.shape))
+      if input_shape[3] % 4 == 0:
+        buffer_type = "WEIGHT_WIDTH"
+      else:
+        buffer_type = "WEIGHT_HEIGHT"
+        weight_type_arg = op_def.arg.add()
+        weight_type_arg.name = 'weight_type'
+        weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT']
+
+      if buffer_type == "WEIGHT_HEIGHT" and \
+                  (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
        raise Exception('Mace gpu do not support FC with weight shape: '
                        +str(weight_data.shape))
-      buffer_type = "WEIGHT_HEIGHT"
      output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
      op_def.input.extend([output_name])
    else:
@@ -521,6 +653,13 @@ class CaffeConverter(object):
    pooling_type_arg.i = pooling_type_mode[pooling_type]

    input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
+    if param.HasField('global_pooling') and param.global_pooling:
+      kernels = [input_shape[1], input_shape[2]]
+
+    kernel_arg = op_def.arg.add()
+    kernel_arg.name = 'kernels'
+    kernel_arg.ints.extend(kernels)
+
    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]]
    output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
                                          paddings, strides, [1, 1], math.ceil)
@@ -684,7 +823,10 @@ class CaffeConverter(object):
      if op.type == 'Input':
        self.resolved_ops.add(op.name)
      elif op.type == 'Convolution':
-        self.convert_conv2d(op)
+        if self.check_winograd_conv(op):
+          self.convert_winograd_conv(op)
+        else:
+          self.convert_conv2d(op)
      elif op.type == 'BatchNorm':
        self.convert_batchnorm(op)
      elif op.type == 'InnerProduct':
@@ -719,7 +861,8 @@ class CaffeConverter(object):
        print 'Unresolve Op: %s with type %s' % (op.name, op.type)


-def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str, output_node_str, data_type, device, winograd):
+def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str,
+                       output_node_str, data_type, device, winograd):
  net_def = mace_pb2.NetDef()
  dt = data_type_map[data_type]


--- a/mace/python/tools/memory_optimizer.py
+++ b/mace/python/tools/memory_optimizer.py
@@ -35,7 +35,7 @@ class MemoryOptimizer(object):

  def get_mem_size(self, op_type, output_shape):
    mem_size = [0, 0]
-    if op_type == 'WinogradTransform' or op_type == 'GEMM':
+    if op_type == 'WinogradTransform' or op_type == 'MatMul':
      mem_size[0] = output_shape[2] * output_shape[3]
      mem_size[1] = output_shape[0] * int((output_shape[1]+3)/4)
    else:

--- a/tools/validate.py
+++ b/tools/validate.py
@@ -5,6 +5,7 @@ import os.path
 import numpy as np
 import re
 from scipy import spatial
+from scipy import stats

 # Validation Flow:
 # 1. Generate input data
@@ -30,7 +31,10 @@ def format_output_name(name):

 def compare_output(output_name, mace_out_value, out_value):
  if mace_out_value.size != 0:
-    similarity = (1 - spatial.distance.cosine(out_value.flat, mace_out_value.flat))
+    out_value = out_value.reshape(-1)
+    mace_out_value = mace_out_value.reshape(-1)
+    assert len(out_value) == len(mace_out_value)
+    similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
    print output_name, 'MACE VS', FLAGS.platform.upper(), 'similarity: ', similarity
    if (FLAGS.mace_runtime == "cpu" and similarity > 0.999) or \
        (FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \
@@ -92,7 +96,10 @@ def validate_caffe_model(input_names, input_shapes, output_names, output_shapes)
  for i in range(len(input_names)):
    input_value = load_data(FLAGS.input_file + "_" + input_names[i])
    input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, 2))
-    net.blobs[input_names[i]].data[0] = input_value
+    input_blob_name = input_names[i]
+    if input_names[i] in net.top_names:
+      input_blob_name = net.top_names[input_names[i]][0]
+    net.blobs[input_blob_name].data[0] = input_value

  net.forward()