Finish atrous convolution: faster than space_to_batch method.

336f35b4 · liuqi · 41a905c9 · 336f35b4 · 336f35b4 · 336f35b4
17 changed file
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -60,7 +60,9 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                      img_format,
                      image_shape[0], image_shape[1],
                      0, nullptr, &error);
-  MACE_CHECK(error == CL_SUCCESS);
+  MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
+                                  << image_shape[0] << ", " << image_shape[1]
+                                  << "]";

  return cl_image;
 }

--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -192,18 +192,9 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
  *program = cl::Program(this->context(), {device()}, {binary});
 #else
  std::string source_filename = kernel_path_ + program_file_name;
-  std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";

  // Create program
-  bool is_binary_filename_exist = std::ifstream(binary_filename).is_open();
-  if (is_binary_filename_exist) {
-    VLOG(1) << "Create program with binary: " << binary_filename;
-    std::vector<unsigned char> binary;
-    MACE_CHECK(ReadFile(binary_filename, true, &binary));
-
-    *program = cl::Program(this->context(), {device()}, {binary});
-
-  } else if (std::ifstream(source_filename).is_open()) {
+  if (std::ifstream(source_filename).is_open()) {
    VLOG(1) << "Create program with source: " << source_filename;
    std::vector<unsigned char> kernel_source;
    MACE_CHECK(ReadFile(source_filename, false, &kernel_source));
@@ -214,8 +205,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
    *program = cl::Program(this->context(), sources);

  } else {
-    LOG(FATAL) << "Failed to open kernel file " << binary_filename << " or "
-               << source_filename;
+    LOG(FATAL) << "Failed to open kernel file " << source_filename;
  }
 #endif

@@ -237,32 +227,31 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,

 #ifndef MACE_EMBED_BINARY_PROGRAM
  // Write binary if necessary
-  if (!is_binary_filename_exist) {
-    size_t device_list_size = 1;
-    std::unique_ptr<size_t[]> program_binary_sizes(
-        new size_t[device_list_size]);
-    cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
-                                  sizeof(size_t) * device_list_size,
-                                  program_binary_sizes.get(), nullptr);
-    MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
-    std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
-        new std::unique_ptr<unsigned char[]>[device_list_size]);
-    for (cl_uint i = 0; i < device_list_size; ++i) {
-      program_binaries[i] = std::unique_ptr<unsigned char[]>(
-          new unsigned char[program_binary_sizes[i]]);
-    }
+  std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
+  size_t device_list_size = 1;
+  std::unique_ptr<size_t[]> program_binary_sizes(
+      new size_t[device_list_size]);
+  cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
+                                sizeof(size_t) * device_list_size,
+                                program_binary_sizes.get(), nullptr);
+  MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
+  std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
+      new std::unique_ptr<unsigned char[]>[device_list_size]);
+  for (cl_uint i = 0; i < device_list_size; ++i) {
+    program_binaries[i] = std::unique_ptr<unsigned char[]>(
+        new unsigned char[program_binary_sizes[i]]);
+  }

-    err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
-                           sizeof(unsigned char *) * device_list_size,
-                           program_binaries.get(), nullptr);
-    MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
-    std::vector<unsigned char> content(
-        reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
-        reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
-            program_binary_sizes[0]);
+  err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
+                         sizeof(unsigned char *) * device_list_size,
+                         program_binaries.get(), nullptr);
+  MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
+  std::vector<unsigned char> content(
+      reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
+      reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
+          program_binary_sizes[0]);

-    MACE_CHECK(WriteFile(binary_filename, true, content));
-  }
+  MACE_CHECK(WriteFile(binary_filename, true, content));
 #endif
 }


--- a/mace/examples/mace_run.cc
+++ b/mace/examples/mace_run.cc
@@ -212,12 +212,14 @@ int main(int argc, char **argv) {
    LOG(INFO) << "Avg duration: " << (t1 - t0) / round << " us";
  }

-  MACE_CHECK(engine.Run(input_data.get(), input_shape_vec, output_data.get()));
  if (output_data != nullptr) {
    ofstream out_file(output_file, ios::binary);
    out_file.write((const char *) (output_data.get()),
                   output_size * sizeof(float));
    out_file.flush();
    out_file.close();
+    LOG(INFO) << "Write output file done.";
+  } else {
+    LOG(ERROR) << "output data is null";
  }
 }
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -14,7 +14,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __private const int filter_height,
                      __private const int filter_width,
                      __private const int padding_top,
-                      __private const int padding_left) {
+                      __private const int padding_left,
+                      __private const int dilation_h,
+                      __private const int dilation_w) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
@@ -57,7 +59,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
    const int in_idx = mul24(in_ch_blk, in_width);
    int filter_x_part0 = in_ch_blk << 2;
    for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
-      int in_hb_value = height_idx + hb_idx;
+      int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));
@@ -66,7 +68,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
      for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
        int in_width_value;
 #define READ_INPUT(i)                                                                \
-        in_width_value = in_width##i + width_idx;                                    \
+        in_width_value = in_width##i + mul24(width_idx, dilation_w);                 \
        in_width_value = select(in_idx + in_width_value,                             \
                                -1,                                                  \
                                (in_width_value < 0 || in_width_value >= in_width)); \

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -12,7 +12,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                          __private const int out_height,
                          __private const int out_width,
                          __private const int padding_top,
-                          __private const int padding_left) {
+                          __private const int padding_left,
+                          __private const int dilation_h,
+                          __private const int dilation_w) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
@@ -55,12 +57,11 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]

  DATA_TYPE4 in0, in1, in2, in3, in4;
  DATA_TYPE4 weights0, weights1, weights2, weights3;
-  int hb_idx, width_idx, in_width_idx;
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    const int in_idx = mul24(in_ch_blk, in_width);
    int filter_x_part0 = in_ch_blk << 2;
    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
-      int in_hb_value = height_idx + hb_idx;
+      int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));
@@ -68,7 +69,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
      for (short width_idx = 0; width_idx < 3; ++width_idx) {
        int in_width_value;
 #define READ_INPUT(i)                                                                \
-        in_width_value = in_width##i + width_idx;                                    \
+        in_width_value = in_width##i + mul24(width_idx, dilation_w);                 \
        in_width_value = select(in_idx + in_width_value,                             \
                                -1,                                                  \
                                (in_width_value < 0 || in_width_value >= in_width)); \

--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -10,29 +10,33 @@ namespace kernels {

 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);

 extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);

 extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);

 extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);

 extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
-                         const DataType dt, Tensor *output,
-                         StatsFuture *future);
+                         const int *dilations, const DataType dt,
+                         Tensor *output, StatsFuture *future);

 template<typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
@@ -42,8 +46,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                      StatsFuture *future) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, const bool fused_relu,
-                                       const int *padding, const DataType dt,
-                                       Tensor *output,
+                                       const int *padding, const int *dilations,
+                                       const DataType dt, Tensor *output,
                                       StatsFuture *future);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
@@ -55,12 +59,14 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

  index_t kernel_h = filter->dim(0);
  index_t kernel_w = filter->dim(1);
-  if (!input->is_image() || strides_[0] != strides_[1] ||
-      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
+  if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
+      (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
-                 << " is not implemented yet, using slow version";
+                 << ",dilations " << dilations_[0] << "x" << dilations_[1]
+                 << " and input image: " << input->is_image()
+                 << " is not implemented yet.";
    MACE_NOT_IMPLEMENTED;
  }

@@ -77,11 +83,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  if (kernel_h == kernel_w && kernel_h <= 5 &&
      selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-    conv2d_func(input, filter, bias, false, paddings.data(),
+    conv2d_func(input, filter, bias, false, paddings.data(), dilations_,
                DataTypeToEnum<T>::value, output, future);
  } else {
    Conv2dOpencl(input, filter, bias, false, strides_[0],
-                 paddings.data(), DataTypeToEnum<T>::value,
+                 paddings.data(), dilations_, DataTypeToEnum<T>::value,
                 output, future);
  }


--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -129,6 +129,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
                               const Tensor *bias,
                               const bool fused_relu,
                               const int *padding,
+                               const int *dilations,
                               const DataType dt,
                               Tensor *output,
                               StatsFuture *future) {
@@ -140,6 +141,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
                               const Tensor *bias,
                               const bool fused_relu,
                               const int *padding,
+                               const int *dilations,
                               const DataType dt,
                               Tensor *output,
                               StatsFuture *future) {

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -15,8 +15,8 @@ namespace kernels {
 static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
-                         const DataType dt, Tensor *output,
-                         StatsFuture *future) {
+                         const int *dilations, const DataType dt,
+                         Tensor *output, StatsFuture *future) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -53,6 +53,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
  conv_2d_kernel.setArg(idx++, padding[0] / 2);
  conv_2d_kernel.setArg(idx++, padding[1] / 2);
+  conv_2d_kernel.setArg(idx++, dilations[0]);
+  conv_2d_kernel.setArg(idx++, dilations[1]);

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
@@ -121,10 +123,11 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
                        const Tensor *bias,
                        const bool fused_relu,
                        const int *padding,
+                        const int *dilations,
                        const DataType dt,
                        Tensor *output,
                        StatsFuture *future) {
-  Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output, future);
+  Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dilations, dt, output, future);
 };

 void Conv2dOpenclK3x3S2(const Tensor *input,
@@ -132,10 +135,11 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
                        const Tensor *bias,
                        const bool fused_relu,
                        const int *padding,
+                        const int *dilations,
                        const DataType dt,
                        Tensor *output,
                        StatsFuture *future) {
-  Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output, future);
+  Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dilations, dt, output, future);
 };

 }  // namespace kernels

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -15,8 +15,8 @@ namespace kernels {
 void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                  const Tensor *bias, const bool fused_relu,
                  const uint32_t stride, const int *padding,
-                  const DataType dt, Tensor *output,
-                  StatsFuture *future) {
+                  const int *dilations, const DataType dt,
+                  Tensor *output, StatsFuture *future) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -55,6 +55,8 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
  conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
  conv_2d_kernel.setArg(idx++, padding[0] / 2);
  conv_2d_kernel.setArg(idx++, padding[1] / 2);
+  conv_2d_kernel.setArg(idx++, dilations[0]);
+  conv_2d_kernel.setArg(idx++, dilations[1]);

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),

--- a/mace/kernels/opencl/fused_conv_2d_opencl.cc
+++ b/mace/kernels/opencl/fused_conv_2d_opencl.cc
@@ -10,33 +10,33 @@ namespace kernels {

 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                               StatsFuture *future);

 extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                               StatsFuture *future);

 extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                               StatsFuture *future);

 extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                               StatsFuture *future);

 extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                         const Tensor *bias, const bool fused_relu,
                         const uint32_t stride, const int *padding,
-                         const DataType dt, Tensor *output,
-                         StatsFuture *future);
+                         const int *dilations, const DataType dt,
+                         Tensor *output, StatsFuture *future);

 template<typename T>
 void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
@@ -46,8 +46,9 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                           StatsFuture *future) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, const bool fused_relu,
-                                       const int *padding, const DataType dt,
-                                       Tensor *output, StatsFuture *future);
+                                       const int *padding, const int *dilations,
+                                       const DataType dt, Tensor *output,
+                                       StatsFuture *future);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
      {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
@@ -57,12 +58,14 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      {nullptr, nullptr}};
  index_t kernel_h = filter->dim(0);
  index_t kernel_w = filter->dim(1);
-  if (!input->is_image() || strides_[0] != strides_[1] ||
-      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
+  if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
+      (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
-                 << " is not implemented yet, using slow version";
+                 << ",dilations " << dilations_[0] << "x" << dilations_[1]
+                 << " and input image: " << input->is_image()
+                 << " is not implemented yet.";
    MACE_NOT_IMPLEMENTED;
  }

@@ -79,11 +82,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  if (kernel_h == kernel_w && kernel_h <= 5 &&
      selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-    conv2d_func(input, filter, bias, true, paddings.data(),
+    conv2d_func(input, filter, bias, true, paddings.data(), dilations_,
                DataTypeToEnum<T>::value, output, future);
  } else {
    Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(),
-                 DataTypeToEnum<T>::value, output, future);
+                 dilations_, DataTypeToEnum<T>::value, output, future);
  }
 }


--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -2,6 +2,7 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

+#include <fstream>
 #include "mace/ops/conv_2d.h"
 #include "mace/ops/ops_test_util.h"

@@ -564,19 +565,20 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {

 template<DeviceType D>
 static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
-                                      const std::vector<index_t> &filter_shape) {
+                                      const std::vector<index_t> &filter_shape,
+                                      const std::vector<int> &dilations) {
  testing::internal::LogToStderr();
  srand(time(NULL));

  auto func = [&](int stride_h, int stride_w, Padding padding) {
    // generate random input
-    index_t batch = 3 + (rand() % 10);
+    index_t batch = 3;
    index_t height = input_shape[0];
    index_t width = input_shape[1];
    index_t kernel_h = filter_shape[0];
    index_t kernel_w = filter_shape[1];
-    index_t input_channels = filter_shape[2] + (rand() % 10);
-    index_t output_channels = filter_shape[3] + (rand() % 10);
+    index_t input_channels = filter_shape[2];
+    index_t output_channels = filter_shape[3];
    // Construct graph
    OpsTestNet net;
    OpDefBuilder("Conv2D", "Conv2dTest")
@@ -586,7 +588,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
        .Output("Output")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", padding)
-        .AddIntsArg("dilations", {1, 1})
+        .AddIntsArg("dilations", {dilations[0], dilations[1]})
        .Finalize(net.NewOperatorDef());

    std::vector<float> float_input_data;
@@ -619,7 +621,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
        .Output("OutputImage")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", padding)
-        .AddIntsArg("dilations", {1, 1})
+        .AddIntsArg("dilations", {dilations[0], dilations[1]})
        .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
        .Finalize(net.NewOperatorDef());
    // Run on device
@@ -630,43 +632,154 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
  };

-  for (int stride : {1, 2}) {
-    func(stride, stride, VALID);
-    func(stride, stride, SAME);
+  func(1, 1, VALID);
+  func(1, 1, SAME);
+  if (dilations[0] == 1) {
+    func(2, 2, VALID);
+    func(2, 2, SAME);
  }
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {1, 1, 32, 64});
+                                                {1, 1, 32, 64},
+                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {3, 3, 32, 64});
+                                                {3, 3, 32, 64},
+                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {15, 1, 256, 2});
+                                                {15, 1, 256, 2},
+                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {1, 15, 256, 2});
+                                                {1, 15, 256, 2},
+                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {7, 7, 3, 64});
+                                                {7, 7, 3, 64},
+                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
-                                                {1, 1, 5, 7});
+                                                {1, 1, 5, 7},
+                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
-                                                {3, 3, 5, 7});
+                                                {3, 3, 5, 7},
+                                                {1, 1});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
+                                                {5, 5, 16, 16},
+                                                {2, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
+                                                {7, 7, 16, 16},
+                                                {2, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67},
+                                                {7, 7, 16, 16},
+                                                {4, 4});
+}
+
+template<DeviceType D, typename T>
+static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dilation_rate) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                  Padding type) {
+    srand(time(NULL));
+
+    // generate random input
+    index_t batch = 1;
+    index_t height = shape[0];
+    index_t width = shape[1];
+    index_t input_channels = shape[2];
+    index_t output_channels = shape[3];
+    // Construct graph
+    OpsTestNet net;
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {dilation_rate, dilation_rate})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+
+    // Add input data
+    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, T>(
+        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+    net.AddRandomInput<D, T>("Bias", {output_channels});
+
+    // run on cpu
+    net.RunOp();
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // run on gpu
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {dilation_rate, dilation_rate})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run on device
+    net.RunOp(D);
+
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
+  };
+
+  for (int kernel_size : {3}) {
+    for (int stride : {1}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
+      func(kernel_size, kernel_size, stride, stride, SAME);
+    }
+  }
 }
+
+TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
+  TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
+                                                 2);
+}
+
+TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
+  TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
+                                                 4);
+}
+
+TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
+  TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
+                                                 4);
+}
+
--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -486,3 +486,160 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
                                                   {15, 1, 32, 64});
 }

+template<DeviceType D, typename T>
+static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilation) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                  Padding type) {
+    srand(time(NULL));
+
+    // generate random input
+    index_t batch = 1;
+    index_t height = shape[0];
+    index_t width = shape[1];
+    index_t input_channels = shape[2];
+    index_t output_channels = shape[3];
+    // Construct graph
+    OpsTestNet net;
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {dilation, dilation})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+
+    // Add input data
+    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, T>(
+        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+    net.AddRandomInput<D, T>("Bias", {output_channels});
+
+    // run on cpu
+    net.RunOp();
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // run on gpu
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {dilation, dilation})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run on device
+    net.RunOp(D);
+
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
+  };
+
+  for (int kernel_size : {3}) {
+    for (int stride : {1}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
+      func(kernel_size, kernel_size, stride, stride, SAME);
+    }
+  }
+}
+
+TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {
+  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
+}
+
+TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN4) {
+  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
+}
+
+TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
+  TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
+}
+
+template<DeviceType D>
+static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
+                                      const std::vector<index_t> &filter_shape,
+                                      const std::vector<int> &dilations) {
+  testing::internal::LogToStderr();
+  auto func = [&](int stride_h, int stride_w, Padding type) {
+    srand(time(NULL));
+
+    // generate random input
+    index_t batch = 1;
+    index_t height = image_shape[0];
+    index_t width = image_shape[1];
+    index_t input_channels = filter_shape[2];
+    index_t output_channels = filter_shape[3];
+    index_t kernel_h = filter_shape[0];
+    index_t kernel_w = filter_shape[1];
+    // Construct graph
+    OpsTestNet net;
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+
+    // Add input data
+    net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+    net.AddRandomInput<D, float>("Bias", {output_channels});
+
+    // run on cpu
+    net.RunOp();
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // run on gpu
+    BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run on device
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
+  };
+
+  func(1, 1, VALID);
+  func(1, 1, SAME);
+}
+
+TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32},
+                                                {7, 7, 3, 16},
+                                                {2, 2});
+}
+
+TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71},
+                                                {15, 15, 16, 16},
+                                                {2, 2});
+}
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -322,18 +322,25 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
    Tensor::MappingGuard y_mapper(&y);
    auto a = x.data<EXP_TYPE>();
    auto b = y.data<RES_TYPE>();
-    for (int n = 0; n < x.dim(0); ++n) {
-      for (int h = 0; h < x.dim(1); ++h) {
-        for (int w = 0; w < x.dim(2); ++w) {
-          for (int c = 0; c < x.dim(3); ++c) {
-            EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
-                                         << n << ", " << h << ", "
-                                         << w << ", " << c << "]";
-            a++;
-            b++;
+    if (x.dim_size() == 4) {
+      for (int n = 0; n < x.dim(0); ++n) {
+        for (int h = 0; h < x.dim(1); ++h) {
+          for (int w = 0; w < x.dim(2); ++w) {
+            for (int c = 0; c < x.dim(3); ++c) {
+              EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
+                                           << n << ", " << h << ", "
+                                           << w << ", " << c << "]";
+              a++;
+              b++;
+            }
          }
        }
      }
+    } else {
+      for (int i = 0; i < x.size(); ++i) {
+        EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
+                                         << " index = " << i;
+      }
    }
  }


--- a/mace/python/tools/tf_converter.py
+++ b/mace/python/tools/tf_converter.py
@@ -21,7 +21,6 @@ def main(unused_args):
    data = f.read()
    input_graph_def.ParseFromString(data)

-  print 'done'
  if FLAGS.runtime == 'dsp':
    output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
      input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
@@ -38,6 +37,7 @@ def main(unused_args):
    with gfile.GFile(FLAGS.output + '_txt', "wb") as f:
      # output_graph_def.ClearField('tensors')
      f.write(str(output_graph_def))
+  print("Model conversion is completed.")


 def parse_args():
@@ -82,7 +82,7 @@ def parse_args():
  parser.add_argument(
    "--output_type",
    type=str,
-    default="source",
+    default="pb",
    help="output type: source/pb")
  parser.add_argument(
    "--template",

--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -44,6 +44,7 @@ class TFConverter(object):
    self.device = device
    self.tf_graph = {}
    self.resolved_ops = {}
+    self.unused_tensor = set()

    for op in tf_ops:
      self.resolved_ops[op.name] = 0
@@ -72,6 +73,23 @@ class TFConverter(object):
    arg.i = self.dt
    return output_name

+  def add_image_to_buffer(self, input_name, input_type):
+    output_name = input_name[:-2] + "_i2b" + input_name[-2:]
+    op_def = self.net_def.op.add()
+    op_def.name = output_name[:-2]
+    op_def.type = 'ImageToBuffer'
+    op_def.input.extend([input_name])
+    op_def.output.extend([output_name])
+
+    arg = op_def.arg.add()
+    arg.name = 'buffer_type'
+    arg.i = buffer_type_map[input_type]
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    return output_name
+
+
  def add_input_transform(self, name):
    new_input_name = MACE_INPUT_NODE_NAME + ":0"
    op_def = self.net_def.op.add()
@@ -111,22 +129,23 @@ class TFConverter(object):
    op.output_shape.extend(output_shapes)

  def convert_tensor(self, op):
-    tensor = self.net_def.tensors.add()
-    tf_tensor = op.outputs[0].eval()
-    tensor.name = op.outputs[0].name
-
-    shape = list(tf_tensor.shape)
-    tensor.dims.extend(shape)
-
-    tf_dt = op.get_attr('dtype')
-    if tf_dt == tf.float32:
-      tensor.data_type = mace_pb2.DT_FLOAT
-      tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
-    elif tf_dt == tf.int32:
-      tensor.data_type = mace_pb2.DT_INT32
-      tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
-    else:
-      raise Exception("Not supported tensor type: " + tf_dt.name)
+    if op.outputs[0].name not in self.unused_tensor:
+      tensor = self.net_def.tensors.add()
+      tf_tensor = op.outputs[0].eval()
+      tensor.name = op.outputs[0].name
+
+      shape = list(tf_tensor.shape)
+      tensor.dims.extend(shape)
+
+      tf_dt = op.get_attr('dtype')
+      if tf_dt == tf.float32:
+        tensor.data_type = mace_pb2.DT_FLOAT
+        tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
+      elif tf_dt == tf.int32:
+        tensor.data_type = mace_pb2.DT_INT32
+        tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
+      else:
+        raise Exception("Not supported tensor type: " + tf_dt.name)
    self.resolved_ops[op.name] = 1

  def convert_conv2d(self, op):
@@ -253,6 +272,7 @@ class TFConverter(object):
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
    data_format_arg.s = 'NHWC'
+    self.unused_tensor.add(get_input_tensor(op, 1).name)

    self.net_def.op.extend([op_def])
    for i in range(0, 7):
@@ -326,6 +346,7 @@ class TFConverter(object):
    axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32)
    self.add_output_shape(op.outputs, op_def)
    self.resolved_ops[op.name] = 1
+    self.unused_tensor.add(get_input_tensor(op, 2).name)

  def convert_resize_bilinear(self, op):
    op_def = self.net_def.op.add()
@@ -344,6 +365,7 @@ class TFConverter(object):
    size_arg.i = op.get_attr('align_corners')
    self.add_output_shape(op.outputs, op_def)
    self.resolved_ops[op.name] = 1
+    self.unused_tensor.add(get_input_tensor(op, 1).name)

  def convert_bias_add(self, op):
    op_def = mace_pb2.OperatorDef()
@@ -383,6 +405,79 @@ class TFConverter(object):
    size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat)
    self.add_output_shape(op.outputs, op_def)
    self.resolved_ops[op.name] = 1
+    self.unused_tensor.add(get_input_tensor(op, 1).name)
+    self.unused_tensor.add(get_input_tensor(op, 2).name)
+
+  def is_atrous_conv2d(self, op):
+    return op.type == 'SpaceToBatchND' and\
+           len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Conv2D'
+
+  def convert_atrous_conv2d(self, op):
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    conv_op = self.tf_graph[op.name][0]
+    op_def.name = conv_op.name
+    op_def.type = conv_op.type
+    if self.device == 'gpu':
+      op_def.input.extend([op.inputs[0].name])
+      output_name = self.add_buffer_to_image(conv_op.inputs[1].name, "FILTER")
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([op.inputs[0].name])
+      op_def.input.extend([conv_op.inputs[1].name])
+
+    dilation_arg = op_def.arg.add()
+    dilation_arg.name = 'dilations'
+    dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
+    padding_arg = op_def.arg.add()
+    padding_arg.name = 'padding'
+    padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
+    if len(padding_values) > 0 and padding_values[0] > 0:
+      padding_arg.i = padding_mode['SAME']
+    else:
+      padding_arg.i = padding_mode['VALID']
+    self.unused_tensor.add(get_input_tensor(op, 1).name)
+    self.unused_tensor.add(get_input_tensor(op, 2).name)
+
+    strides_arg = op_def.arg.add()
+    strides_arg.name = 'strides'
+    strides_arg.ints.extend([1, 1])
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NHWC'
+    final_op = conv_op
+    self.resolved_ops[op.name] = 1
+    self.resolved_ops[conv_op.name] = 1
+
+    if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' :
+      bias_add_op = self.tf_graph[final_op.name][0]
+      if self.device == 'gpu':
+        output_name = self.add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT")
+        op_def.input.extend([output_name])
+      else:
+        op_def.input.extend([bias_add_op.inputs[1].name])
+      final_op = bias_add_op
+      self.resolved_ops[bias_add_op.name] = 1
+
+    if len(self.tf_graph[final_op.name]) == 1 \
+      and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
+      final_op = self.tf_graph[final_op.name][0]
+      self.resolved_ops[final_op.name] = 1
+    else:
+      raise Exception('Convert atrous conv error: no BatchToSpaceND op')
+
+    if len(self.tf_graph[final_op.name]) == 1 \
+        and self.tf_graph[final_op.name][0].type == 'Relu':
+      relu_op = self.tf_graph[final_op.name][0]
+      op_def.type = "FusedConv2D"
+      final_op = relu_op
+      self.resolved_ops[relu_op.name] = 1
+
+    op_def.output.extend([output.name for output in final_op.outputs])
+    self.add_output_shape(final_op.outputs, op_def)
+    self.net_def.op.extend([op_def])

  def convert_normal_op(self, op):
    op_def = self.net_def.op.add()
@@ -407,7 +502,9 @@ class TFConverter(object):
        self.resolved_ops[op.name] = 1
        pass
      elif op.type == 'Const':
-        self.convert_tensor(op)
+        pass
+      elif self.is_atrous_conv2d(op):
+        self.convert_atrous_conv2d(op)
      elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
        self.convert_conv2d(op)
      elif op.type == 'FusedBatchNorm':
@@ -435,6 +532,15 @@ class TFConverter(object):
      else:
        raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))

+
+    for op in self.tf_ops:
+      if self.resolved_ops[op.name] == 1:
+        continue
+      elif op.type == 'Const':
+        self.convert_tensor(op)
+      else:
+        raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
+
    if self.device == 'gpu':
      self.add_output_transform(output_node)


--- a/tools/validate.py
+++ b/tools/validate.py
@@ -10,8 +10,8 @@ from tensorflow import gfile

 # Validation Flow:
 # 1. Generate input data
-#    python validate_icnet.py --generate_data 1 \
-#          --random_seed 1
+#    python validate_icnet.py --generate_data 1
+#
 # 2. Use mace_run to run icnet on phone.
 # 3. adb pull the result.
 # 4. Compare output data of mace and tf
@@ -20,7 +20,7 @@ from tensorflow import gfile
 #        --mace_out_file icnet.out

 def generate_data(shape):
-  np.random.seed(FLAGS.random_seed)
+  np.random.seed()
  data = np.random.random(shape) * -1
  print FLAGS.input_file
  data.astype(np.float32).tofile(FLAGS.input_file)
@@ -122,12 +122,7 @@ def parse_args():
    "--generate_data",
    type='bool',
    default="false",
-    help="Random seed for generate test case.")
-  parser.add_argument(
-    "--random_seed",
-    type=int,
-    default="0",
-    help="Random seed for generate test case.")
+    help="Generate data or not.")

  return parser.parse_known_args()


--- a/tools/validate_gcn.sh
+++ b/tools/validate_gcn.sh
@@ -79,7 +79,8 @@ build_and_run()
 }

 echo "Step 1: Generate input data"
-python tools/validate.py --generate_data true --random_seed 1 \
+rm -rf ${MODEL_DIR}/${INPUT_FILE_NAME}
+python tools/validate.py --generate_data true \
 --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
 --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"