diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index 280b84d38659605c29ee71c6f479747bd506abac..4e14636e00d49b0ac023f1e319818258f2d23000 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -60,7 +60,9 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                       img_format,
                       image_shape[0], image_shape[1],
                       0, nullptr, &error);
-  MACE_CHECK(error == CL_SUCCESS);
+  MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
+                                  << image_shape[0] << ", " << image_shape[1]
+                                  << "]";
 
   return cl_image;
 }
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 70ca43b67efcfdfea95e5d8246f08d61450cc053..21b8d5198b6e57d32c278d09e2bb8cc87aea666c 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -192,18 +192,9 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
   *program = cl::Program(this->context(), {device()}, {binary});
 #else
   std::string source_filename = kernel_path_ + program_file_name;
-  std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
 
   // Create program
-  bool is_binary_filename_exist = std::ifstream(binary_filename).is_open();
-  if (is_binary_filename_exist) {
-    VLOG(1) << "Create program with binary: " << binary_filename;
-    std::vector<unsigned char> binary;
-    MACE_CHECK(ReadFile(binary_filename, true, &binary));
-
-    *program = cl::Program(this->context(), {device()}, {binary});
-
-  } else if (std::ifstream(source_filename).is_open()) {
+  if (std::ifstream(source_filename).is_open()) {
     VLOG(1) << "Create program with source: " << source_filename;
     std::vector<unsigned char> kernel_source;
     MACE_CHECK(ReadFile(source_filename, false, &kernel_source));
@@ -214,8 +205,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
     *program = cl::Program(this->context(), sources);
 
   } else {
-    LOG(FATAL) << "Failed to open kernel file " << binary_filename << " or "
-               << source_filename;
+    LOG(FATAL) << "Failed to open kernel file " << source_filename;
   }
 #endif
 
@@ -237,32 +227,31 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
 
 #ifndef MACE_EMBED_BINARY_PROGRAM
   // Write binary if necessary
-  if (!is_binary_filename_exist) {
-    size_t device_list_size = 1;
-    std::unique_ptr<size_t[]> program_binary_sizes(
-        new size_t[device_list_size]);
-    cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
-                                  sizeof(size_t) * device_list_size,
-                                  program_binary_sizes.get(), nullptr);
-    MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
-    std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
-        new std::unique_ptr<unsigned char[]>[device_list_size]);
-    for (cl_uint i = 0; i < device_list_size; ++i) {
-      program_binaries[i] = std::unique_ptr<unsigned char[]>(
-          new unsigned char[program_binary_sizes[i]]);
-    }
+  std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
+  size_t device_list_size = 1;
+  std::unique_ptr<size_t[]> program_binary_sizes(
+      new size_t[device_list_size]);
+  cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
+                                sizeof(size_t) * device_list_size,
+                                program_binary_sizes.get(), nullptr);
+  MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
+  std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
+      new std::unique_ptr<unsigned char[]>[device_list_size]);
+  for (cl_uint i = 0; i < device_list_size; ++i) {
+    program_binaries[i] = std::unique_ptr<unsigned char[]>(
+        new unsigned char[program_binary_sizes[i]]);
+  }
 
-    err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
-                           sizeof(unsigned char *) * device_list_size,
-                           program_binaries.get(), nullptr);
-    MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
-    std::vector<unsigned char> content(
-        reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
-        reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
-            program_binary_sizes[0]);
+  err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
+                         sizeof(unsigned char *) * device_list_size,
+                         program_binaries.get(), nullptr);
+  MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
+  std::vector<unsigned char> content(
+      reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
+      reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
+          program_binary_sizes[0]);
 
-    MACE_CHECK(WriteFile(binary_filename, true, content));
-  }
+  MACE_CHECK(WriteFile(binary_filename, true, content));
 #endif
 }
 
diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc
index ccdc3c7c1a202fd3f61c3d97c6ab85f5484fa423..3cfbcbf98bf3c215c7c9041eb8d2237f980c0673 100644
--- a/mace/examples/mace_run.cc
+++ b/mace/examples/mace_run.cc
@@ -212,12 +212,14 @@ int main(int argc, char **argv) {
     LOG(INFO) << "Avg duration: " << (t1 - t0) / round << " us";
   }
 
-  MACE_CHECK(engine.Run(input_data.get(), input_shape_vec, output_data.get()));
   if (output_data != nullptr) {
     ofstream out_file(output_file, ios::binary);
     out_file.write((const char *) (output_data.get()),
                    output_size * sizeof(float));
     out_file.flush();
     out_file.close();
+    LOG(INFO) << "Write output file done.";
+  } else {
+    LOG(ERROR) << "output data is null";
   }
 }
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index ccfe86d85c83a24898ce9640747fe46b95be2d1f..8929b0488d2c18cf033fdb3a4ae2fc19e0c24326 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -14,7 +14,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int filter_height,
                       __private const int filter_width,
                       __private const int padding_top,
-                      __private const int padding_left) {
+                      __private const int padding_left,
+                      __private const int dilation_h,
+                      __private const int dilation_w) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
@@ -57,7 +59,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
     const int in_idx = mul24(in_ch_blk, in_width);
     int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
-      int in_hb_value = height_idx + hb_idx;
+      int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
                            (in_hb_value < 0 || in_hb_value >= in_height));
@@ -66,7 +68,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
       for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
         int in_width_value;
 #define READ_INPUT(i)                                                                \
-        in_width_value = in_width##i + width_idx;                                    \
+        in_width_value = in_width##i + mul24(width_idx, dilation_w);                 \
         in_width_value = select(in_idx + in_width_value,                             \
                                 -1,                                                  \
                                 (in_width_value < 0 || in_width_value >= in_width)); \
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 8a5ece6b31d907fc0a564c7407c969d6102b4c3a..019db2378755314bee009dceb084233a7a72db5a 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -12,7 +12,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int out_height,
                           __private const int out_width,
                           __private const int padding_top,
-                          __private const int padding_left) {
+                          __private const int padding_left,
+                          __private const int dilation_h,
+                          __private const int dilation_w) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
@@ -55,12 +57,11 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 
   DATA_TYPE4 in0, in1, in2, in3, in4;
   DATA_TYPE4 weights0, weights1, weights2, weights3;
-  int hb_idx, width_idx, in_width_idx;
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
     const int in_idx = mul24(in_ch_blk, in_width);
     int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
-      int in_hb_value = height_idx + hb_idx;
+      int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
                            (in_hb_value < 0 || in_hb_value >= in_height));
@@ -68,7 +69,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
       for (short width_idx = 0; width_idx < 3; ++width_idx) {
         int in_width_value;
 #define READ_INPUT(i)                                                                \
-        in_width_value = in_width##i + width_idx;                                    \
+        in_width_value = in_width##i + mul24(width_idx, dilation_w);                 \
         in_width_value = select(in_idx + in_width_value,                             \
                                 -1,                                                  \
                                 (in_width_value < 0 || in_width_value >= in_width)); \
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 947f781811356b10a93d61ccdf51b0956ac036e0..5fa46128d9c8f0cb2f7bfe5fd42f6299997613b9 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -10,29 +10,33 @@ namespace kernels {
 
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);
 
 extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);
 
 extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);
 
 extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output, StatsFuture *future);
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
+                               StatsFuture *future);
 
 extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                          const Tensor *bias, const bool fused_relu,
                          const uint32_t stride, const int *padding,
-                         const DataType dt, Tensor *output,
-                         StatsFuture *future);
+                         const int *dilations, const DataType dt,
+                         Tensor *output, StatsFuture *future);
 
 template<typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
@@ -42,8 +46,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                       StatsFuture *future) {
   typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                        const Tensor *bias, const bool fused_relu,
-                                       const int *padding, const DataType dt,
-                                       Tensor *output,
+                                       const int *padding, const int *dilations,
+                                       const DataType dt, Tensor *output,
                                        StatsFuture *future);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5][2] = {
@@ -55,12 +59,14 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   index_t kernel_h = filter->dim(0);
   index_t kernel_w = filter->dim(1);
-  if (!input->is_image() || strides_[0] != strides_[1] ||
-      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
+  if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
+      (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
     LOG(WARNING) << "OpenCL conv2d kernel with "
                  << "filter" << kernel_h << "x" << kernel_w << ","
                  << " stride " << strides_[0] << "x" << strides_[1]
-                 << " is not implemented yet, using slow version";
+                 << ",dilations " << dilations_[0] << "x" << dilations_[1]
+                 << " and input image: " << input->is_image()
+                 << " is not implemented yet.";
     MACE_NOT_IMPLEMENTED;
   }
 
@@ -77,11 +83,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   if (kernel_h == kernel_w && kernel_h <= 5 &&
       selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-    conv2d_func(input, filter, bias, false, paddings.data(),
+    conv2d_func(input, filter, bias, false, paddings.data(), dilations_,
                 DataTypeToEnum<T>::value, output, future);
   } else {
     Conv2dOpencl(input, filter, bias, false, strides_[0],
-                 paddings.data(), DataTypeToEnum<T>::value,
+                 paddings.data(), dilations_, DataTypeToEnum<T>::value,
                  output, future);
   }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index e2146a2e00dcc9ed0b1425a10bd1ee470e13dbb4..d56926a85357978f1857768b55362bc44ed1a5c3 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -129,6 +129,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
                                const Tensor *bias,
                                const bool fused_relu,
                                const int *padding,
+                               const int *dilations,
                                const DataType dt,
                                Tensor *output,
                                StatsFuture *future) {
@@ -140,6 +141,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
                                const Tensor *bias,
                                const bool fused_relu,
                                const int *padding,
+                               const int *dilations,
                                const DataType dt,
                                Tensor *output,
                                StatsFuture *future) {
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 6902448a1888e7cec9adea05ed239321a9b5cc15..f48c7fc3450928a3245da17e09fa20e3aac9c326 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -15,8 +15,8 @@ namespace kernels {
 static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                          const Tensor *bias, const bool fused_relu,
                          const uint32_t stride, const int *padding,
-                         const DataType dt, Tensor *output,
-                         StatsFuture *future) {
+                         const int *dilations, const DataType dt,
+                         Tensor *output, StatsFuture *future) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -53,6 +53,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
   conv_2d_kernel.setArg(idx++, padding[1] / 2);
+  conv_2d_kernel.setArg(idx++, dilations[0]);
+  conv_2d_kernel.setArg(idx++, dilations[1]);
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
@@ -121,10 +123,11 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
                         const Tensor *bias,
                         const bool fused_relu,
                         const int *padding,
+                        const int *dilations,
                         const DataType dt,
                         Tensor *output,
                         StatsFuture *future) {
-  Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output, future);
+  Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dilations, dt, output, future);
 };
 
 void Conv2dOpenclK3x3S2(const Tensor *input,
@@ -132,10 +135,11 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
                         const Tensor *bias,
                         const bool fused_relu,
                         const int *padding,
+                        const int *dilations,
                         const DataType dt,
                         Tensor *output,
                         StatsFuture *future) {
-  Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output, future);
+  Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dilations, dt, output, future);
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 1ad8b194eb9428cc473a36bf0a73cb1cf3f09d62..2bd897f0a7eaf9203466016e9308d344fbe2889e 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -15,8 +15,8 @@ namespace kernels {
 void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                   const Tensor *bias, const bool fused_relu,
                   const uint32_t stride, const int *padding,
-                  const DataType dt, Tensor *output,
-                  StatsFuture *future) {
+                  const int *dilations, const DataType dt,
+                  Tensor *output, StatsFuture *future) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -55,6 +55,8 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
   conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
   conv_2d_kernel.setArg(idx++, padding[1] / 2);
+  conv_2d_kernel.setArg(idx++, dilations[0]);
+  conv_2d_kernel.setArg(idx++, dilations[1]);
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
diff --git a/mace/kernels/opencl/fused_conv_2d_opencl.cc b/mace/kernels/opencl/fused_conv_2d_opencl.cc
index f2bd514967d1359397762f4775a4d498af3b1ea7..2f4e608699042b72af545dda471dcd843a2cfdf5 100644
--- a/mace/kernels/opencl/fused_conv_2d_opencl.cc
+++ b/mace/kernels/opencl/fused_conv_2d_opencl.cc
@@ -10,33 +10,33 @@ namespace kernels {
 
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                                StatsFuture *future);
 
 extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                                StatsFuture *future);
 
 extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                                StatsFuture *future);
 
 extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const bool fused_relu,
-                               const int *padding, const DataType dt,
-                               Tensor *output,
+                               const int *padding, const int *dilations,
+                               const DataType dt, Tensor *output,
                                StatsFuture *future);
 
 extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                          const Tensor *bias, const bool fused_relu,
                          const uint32_t stride, const int *padding,
-                         const DataType dt, Tensor *output,
-                         StatsFuture *future);
+                         const int *dilations, const DataType dt,
+                         Tensor *output, StatsFuture *future);
 
 template<typename T>
 void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
@@ -46,8 +46,9 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                            StatsFuture *future) {
   typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                        const Tensor *bias, const bool fused_relu,
-                                       const int *padding, const DataType dt,
-                                       Tensor *output, StatsFuture *future);
+                                       const int *padding, const int *dilations,
+                                       const DataType dt, Tensor *output,
+                                       StatsFuture *future);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5][2] = {
       {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
@@ -57,12 +58,14 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       {nullptr, nullptr}};
   index_t kernel_h = filter->dim(0);
   index_t kernel_w = filter->dim(1);
-  if (!input->is_image() || strides_[0] != strides_[1] ||
-      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) {
+  if (!input->is_image() || strides_[0] != strides_[1] || strides_[0] > 2 ||
+      (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
     LOG(WARNING) << "OpenCL conv2d kernel with "
                  << "filter" << kernel_h << "x" << kernel_w << ","
                  << " stride " << strides_[0] << "x" << strides_[1]
-                 << " is not implemented yet, using slow version";
+                 << ",dilations " << dilations_[0] << "x" << dilations_[1]
+                 << " and input image: " << input->is_image()
+                 << " is not implemented yet.";
     MACE_NOT_IMPLEMENTED;
   }
 
@@ -79,11 +82,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   if (kernel_h == kernel_w && kernel_h <= 5 &&
       selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-    conv2d_func(input, filter, bias, true, paddings.data(),
+    conv2d_func(input, filter, bias, true, paddings.data(), dilations_,
                 DataTypeToEnum<T>::value, output, future);
   } else {
     Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(),
-                 DataTypeToEnum<T>::value, output, future);
+                 dilations_, DataTypeToEnum<T>::value, output, future);
   }
 }
 
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index e39c9b740049e96af68e04315872d6b5c30e98a3..f137da6e911ff3ffc986393f19c2f8326077e03c 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -2,6 +2,7 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+#include <fstream>
 #include "mace/ops/conv_2d.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -564,19 +565,20 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
 
 template<DeviceType D>
 static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
-                                      const std::vector<index_t> &filter_shape) {
+                                      const std::vector<index_t> &filter_shape,
+                                      const std::vector<int> &dilations) {
   testing::internal::LogToStderr();
   srand(time(NULL));
 
   auto func = [&](int stride_h, int stride_w, Padding padding) {
     // generate random input
-    index_t batch = 3 + (rand() % 10);
+    index_t batch = 3;
     index_t height = input_shape[0];
     index_t width = input_shape[1];
     index_t kernel_h = filter_shape[0];
     index_t kernel_w = filter_shape[1];
-    index_t input_channels = filter_shape[2] + (rand() % 10);
-    index_t output_channels = filter_shape[3] + (rand() % 10);
+    index_t input_channels = filter_shape[2];
+    index_t output_channels = filter_shape[3];
     // Construct graph
     OpsTestNet net;
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -586,7 +588,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
         .Output("Output")
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", padding)
-        .AddIntsArg("dilations", {1, 1})
+        .AddIntsArg("dilations", {dilations[0], dilations[1]})
         .Finalize(net.NewOperatorDef());
 
     std::vector<float> float_input_data;
@@ -619,7 +621,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
         .Output("OutputImage")
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", padding)
-        .AddIntsArg("dilations", {1, 1})
+        .AddIntsArg("dilations", {dilations[0], dilations[1]})
         .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
         .Finalize(net.NewOperatorDef());
     // Run on device
@@ -630,43 +632,154 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
   };
 
-  for (int stride : {1, 2}) {
-    func(stride, stride, VALID);
-    func(stride, stride, SAME);
+  func(1, 1, VALID);
+  func(1, 1, SAME);
+  if (dilations[0] == 1) {
+    func(2, 2, VALID);
+    func(2, 2, SAME);
   }
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {1, 1, 32, 64});
+                                                {1, 1, 32, 64},
+                                                {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {3, 3, 32, 64});
+                                                {3, 3, 32, 64},
+                                                {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {15, 1, 256, 2});
+                                                {15, 1, 256, 2},
+                                                {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {1, 15, 256, 2});
+                                                {1, 15, 256, 2},
+                                                {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {7, 7, 3, 64});
+                                                {7, 7, 3, 64},
+                                                {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
-                                                {1, 1, 5, 7});
+                                                {1, 1, 5, 7},
+                                                {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
   TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
-                                                {3, 3, 5, 7});
+                                                {3, 3, 5, 7},
+                                                {1, 1});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
+                                                {5, 5, 16, 16},
+                                                {2, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
+                                                {7, 7, 16, 16},
+                                                {2, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67},
+                                                {7, 7, 16, 16},
+                                                {4, 4});
+}
+// Checks dilated 3x3, stride-1 Conv2D on device D against the CPU result.
+template<DeviceType D, typename T>
+static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dilation_rate) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                  Padding type) {
+    srand(time(NULL));
+
+    // shape = {height, width, input_channels, output_channels}; batch is 1
+    index_t batch = 1;
+    index_t height = shape[0];
+    index_t width = shape[1];
+    index_t input_channels = shape[2];
+    index_t output_channels = shape[3];
+    // Define the reference Conv2D op on "Input", "Filter" and "Bias"
+    OpsTestNet net;
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {dilation_rate, dilation_rate})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+
+    // Add random input, filter and bias tensors
+    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, T>(
+        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+    net.AddRandomInput<D, T>("Bias", {output_channels});
+
+    // Run on CPU to produce the reference output
+    net.RunOp();
+    // Keep a copy of the reference output for the comparison below
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // Run on GPU: first convert the input buffers to images
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {dilation_rate, dilation_rate})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run on device
+    net.RunOp(D);
+    // Convert the result back to a buffer and compare within 0.001
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
+  };
+
+  for (int kernel_size : {3}) {
+    for (int stride : {1}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
+      func(kernel_size, kernel_size, stride, stride, SAME);
+    }
+  }
 }
+
+TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
+  TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
+                                                 2);
+}
+
+TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
+  TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
+                                                 4);
+}
+
+TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
+  TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
+                                                 4);
+}
+
diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc
index 7ce58e6ce18b34f5c2c4f8b97de3ff2cb3f0e508..f1effb3ee99cb9dd6353c1beae5f581515a87125 100644
--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -486,3 +486,160 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
                                                    {15, 1, 32, 64});
 }
 
+// Checks a dilated 3x3, stride-1 FusedConv2D on device D against the result
+// of net.RunOp() on plain buffers, once with VALID and once with SAME padding.
+template<DeviceType D, typename T>
+static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilation) {
+  testing::internal::LogToStderr();
+  auto run_case = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                      Padding padding) {
+    srand(time(NULL));
+
+    // shape is {height, width, input channels, output channels}; batch is 1.
+    const index_t n = 1;
+    const index_t h = shape[0];
+    const index_t w = shape[1];
+    const index_t in_ch = shape[2];
+    const index_t out_ch = shape[3];
+    const int dtype = static_cast<int>(DataTypeToEnum<T>::value);
+    OpsTestNet net;
+    // Reference op, reading and writing the plain (non-image) tensors.
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {dilation, dilation})
+        .AddIntArg("T", dtype)
+        .Finalize(net.NewOperatorDef());
+
+    net.AddRandomInput<D, T>("Input", {n, h, w, in_ch});
+    net.AddRandomInput<D, T>("Filter", {kernel_h, kernel_w, in_ch, out_ch});
+    net.AddRandomInput<D, T>("Bias", {out_ch});
+
+    // Reference run on the default device; its output is copied and used as
+    // the expected value for the device run below.
+    net.RunOp();
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // Second run on device D: convert each input to an image first.
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {dilation, dilation})
+        .AddIntArg("T", dtype)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
+  };
+
+  for (int k : {3}) {
+    for (int s : {1}) {
+      run_case(k, k, s, s, VALID);
+      run_case(k, k, s, s, SAME);
+    }
+  }
+}
+
+TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {  // dilation 2
+  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
+}  // shape: {height, width, input channels, output channels}
+
+TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN4) {  // dilation 4
+  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
+}  // shape: {height, width, input channels, output channels}
+
+TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {  // dilation 2
+  TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
+}  // shape: {height, width, input channels, output channels}
+
+// Compares a FusedConv2D run on float inputs in plain buffers with a half
+// precision run on device D through image buffers, for stride 1 with VALID and
+// SAME padding. Both runs use the caller's dilations; tolerance is 0.7.
+template<DeviceType D>
+static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
+                                      const std::vector<index_t> &filter_shape,
+                                      const std::vector<int> &dilations) {
+  testing::internal::LogToStderr();
+  auto func = [&](int stride_h, int stride_w, Padding type) {
+    srand(time(NULL));
+
+    // image_shape: {height, width}; filter_shape: {kh, kw, in_ch, out_ch}.
+    index_t batch = 1;
+    index_t height = image_shape[0];
+    index_t width = image_shape[1];
+    index_t input_channels = filter_shape[2];
+    index_t output_channels = filter_shape[3];
+    index_t kernel_h = filter_shape[0];
+    index_t kernel_w = filter_shape[1];
+    // Reference op; it gets the same dilations as the device op below.
+    OpsTestNet net;
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", dilations)
+        .Finalize(net.NewOperatorDef());
+
+    net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
+    net.AddRandomInput<D, float>("Bias", {output_channels});
+
+    // Reference run on the default device; keep a copy as the expected value.
+    net.RunOp();
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // Device run: inputs are converted to half-precision images first.
+    BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", dilations)
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
+  };
+
+  func(1, 1, VALID);
+  func(1, 1, SAME);
+}
+
+TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
+  // Arguments: {height, width}, {kernel_h, kernel_w, in, out}, dilations.
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 16},
+                                                {2, 2});
+}
+
+TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
+  // NOTE(review): the name says D4 but the dilations passed are {2, 2}.
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71}, {15, 15, 16, 16},
+                                                {2, 2});
+}
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index ea86167ab681d8deab7589777bfb3135bf51da75..e9f2fd048598de6edf4921ebee562d46036e73bb 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -322,18 +322,25 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
     Tensor::MappingGuard y_mapper(&y);
     auto a = x.data<EXP_TYPE>();
     auto b = y.data<RES_TYPE>();
-    for (int n = 0; n < x.dim(0); ++n) {
-      for (int h = 0; h < x.dim(1); ++h) {
-        for (int w = 0; w < x.dim(2); ++w) {
-          for (int c = 0; c < x.dim(3); ++c) {
-            EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
-                                         << n << ", " << h << ", "
-                                         << w << ", " << c << "]";
-            a++;
-            b++;
+    if (x.dim_size() == 4) {
+      for (int n = 0; n < x.dim(0); ++n) {
+        for (int h = 0; h < x.dim(1); ++h) {
+          for (int w = 0; w < x.dim(2); ++w) {
+            for (int c = 0; c < x.dim(3); ++c) {
+              EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
+                                           << n << ", " << h << ", "
+                                           << w << ", " << c << "]";
+              a++;
+              b++;
+            }
           }
         }
       }
+    } else {
+      // Any other rank: compare element by element using the flat index.
+      for (int i = 0; i < x.size(); ++i) {
+        EXPECT_NEAR(a[i], b[i], abs_err) << "with index = " << i;
+      }
     }
   }
 
diff --git a/mace/python/tools/tf_converter.py b/mace/python/tools/tf_converter.py
index 1251bf55f61c5b674b6bab538e36f485cad383b8..d258f7bb7be5f39426075976c9987dfb17674b1c 100644
--- a/mace/python/tools/tf_converter.py
+++ b/mace/python/tools/tf_converter.py
@@ -21,7 +21,6 @@ def main(unused_args):
     data = f.read()
     input_graph_def.ParseFromString(data)
 
-  print 'done'
   if FLAGS.runtime == 'dsp':
     output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
       input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
@@ -38,6 +37,7 @@ def main(unused_args):
     with gfile.GFile(FLAGS.output + '_txt', "wb") as f:
       # output_graph_def.ClearField('tensors')
       f.write(str(output_graph_def))
+  print("Model conversion is completed.")
 
 
 def parse_args():
@@ -82,7 +82,7 @@ def parse_args():
   parser.add_argument(
     "--output_type",
     type=str,
-    default="source",
+    default="pb",
     help="output type: source/pb")
   parser.add_argument(
     "--template",
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 0378c018b3bc33564bcc3cfd80669e1082309be0..e224b6112234464df3f4b5411303d9cff0c36579 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -44,6 +44,7 @@ class TFConverter(object):
     self.device = device
     self.tf_graph = {}
     self.resolved_ops = {}
+    self.unused_tensor = set()
 
     for op in tf_ops:
       self.resolved_ops[op.name] = 0
@@ -72,6 +73,23 @@ class TFConverter(object):
     arg.i = self.dt
     return output_name
 
+  def add_image_to_buffer(self, input_name, input_type):
+    """Appends an ImageToBuffer op for input_name; returns its output name."""
+    output_name = input_name[:-2] + "_i2b" + input_name[-2:]
+    i2b_op = self.net_def.op.add()
+    i2b_op.type = 'ImageToBuffer'
+    i2b_op.name = output_name[:-2]
+    i2b_op.input.extend([input_name])
+    i2b_op.output.extend([output_name])
+
+    type_arg = i2b_op.arg.add()
+    type_arg.name = 'buffer_type'
+    type_arg.i = buffer_type_map[input_type]
+    dtype_arg = i2b_op.arg.add()
+    dtype_arg.name = 'T'
+    dtype_arg.i = self.dt
+    return output_name
+
   def add_input_transform(self, name):
     new_input_name = MACE_INPUT_NODE_NAME + ":0"
     op_def = self.net_def.op.add()
@@ -111,22 +129,23 @@ class TFConverter(object):
     op.output_shape.extend(output_shapes)
 
   def convert_tensor(self, op):
-    tensor = self.net_def.tensors.add()
-    tf_tensor = op.outputs[0].eval()
-    tensor.name = op.outputs[0].name
-
-    shape = list(tf_tensor.shape)
-    tensor.dims.extend(shape)
-
-    tf_dt = op.get_attr('dtype')
-    if tf_dt == tf.float32:
-      tensor.data_type = mace_pb2.DT_FLOAT
-      tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
-    elif tf_dt == tf.int32:
-      tensor.data_type = mace_pb2.DT_INT32
-      tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
-    else:
-      raise Exception("Not supported tensor type: " + tf_dt.name)
+    """Adds op's first output to net_def.tensors unless marked unused."""
+    out = op.outputs[0]
+    if out.name not in self.unused_tensor:
+      tensor = self.net_def.tensors.add()
+      value = out.eval()
+      tensor.name = out.name
+      tensor.dims.extend(list(value.shape))
+
+      dtype = op.get_attr('dtype')
+      if dtype == tf.float32:
+        tensor.data_type = mace_pb2.DT_FLOAT
+        tensor.float_data.extend(value.astype(np.float32).flat)
+      elif dtype == tf.int32:
+        tensor.data_type = mace_pb2.DT_INT32
+        tensor.int32_data.extend(value.astype(np.int32).flat)
+      else:
+        raise Exception("Not supported tensor type: " + dtype.name)
     self.resolved_ops[op.name] = 1
 
   def convert_conv2d(self, op):
@@ -253,6 +272,7 @@ class TFConverter(object):
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
     data_format_arg.s = 'NHWC'
+    self.unused_tensor.add(get_input_tensor(op, 1).name)
 
     self.net_def.op.extend([op_def])
     for i in range(0, 7):
@@ -326,6 +346,7 @@ class TFConverter(object):
     axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32)
     self.add_output_shape(op.outputs, op_def)
     self.resolved_ops[op.name] = 1
+    self.unused_tensor.add(get_input_tensor(op, 2).name)
 
   def convert_resize_bilinear(self, op):
     op_def = self.net_def.op.add()
@@ -344,6 +365,7 @@ class TFConverter(object):
     size_arg.i = op.get_attr('align_corners')
     self.add_output_shape(op.outputs, op_def)
     self.resolved_ops[op.name] = 1
+    self.unused_tensor.add(get_input_tensor(op, 1).name)
 
   def convert_bias_add(self, op):
     op_def = mace_pb2.OperatorDef()
@@ -383,6 +405,79 @@ class TFConverter(object):
     size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat)
     self.add_output_shape(op.outputs, op_def)
     self.resolved_ops[op.name] = 1
+    self.unused_tensor.add(get_input_tensor(op, 1).name)
+    self.unused_tensor.add(get_input_tensor(op, 2).name)
+
+  def is_atrous_conv2d(self, op):
+    consumers = self.tf_graph[op.name] if op.type == 'SpaceToBatchND' else []
+    return len(consumers) == 1 and consumers[0].type == 'Conv2D'
+
+  def convert_atrous_conv2d(self, op):  # op: the SpaceToBatchND op (see is_atrous_conv2d)
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    conv_op = self.tf_graph[op.name][0]  # presumably op's sole consumer -- confirm
+    op_def.name = conv_op.name
+    op_def.type = conv_op.type
+    if self.device == 'gpu':
+      op_def.input.extend([op.inputs[0].name])
+      output_name = self.add_buffer_to_image(conv_op.inputs[1].name, "FILTER")
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([op.inputs[0].name])
+      op_def.input.extend([conv_op.inputs[1].name])
+    # The dilations argument is read via get_input_tensor(op, 1).
+    dilation_arg = op_def.arg.add()
+    dilation_arg.name = 'dilations'
+    dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
+    padding_arg = op_def.arg.add()
+    padding_arg.name = 'padding'
+    padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
+    if len(padding_values) > 0 and padding_values[0] > 0:
+      padding_arg.i = padding_mode['SAME']  # first value of input 2 is positive
+    else:
+      padding_arg.i = padding_mode['VALID']
+    self.unused_tensor.add(get_input_tensor(op, 1).name)
+    self.unused_tensor.add(get_input_tensor(op, 2).name)
+    # Strides are always emitted as [1, 1] and the data format as NHWC.
+    strides_arg = op_def.arg.add()
+    strides_arg.name = 'strides'
+    strides_arg.ints.extend([1, 1])
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NHWC'
+    final_op = conv_op
+    self.resolved_ops[op.name] = 1
+    self.resolved_ops[conv_op.name] = 1
+    # A lone BiasAdd after the conv is absorbed: its input 1 becomes an op input.
+    if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' :
+      bias_add_op = self.tf_graph[final_op.name][0]
+      if self.device == 'gpu':
+        output_name = self.add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT")
+        op_def.input.extend([output_name])
+      else:
+        op_def.input.extend([bias_add_op.inputs[1].name])
+      final_op = bias_add_op
+      self.resolved_ops[bias_add_op.name] = 1
+    # A single BatchToSpaceND must come next; anything else raises.
+    if len(self.tf_graph[final_op.name]) == 1 \
+      and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
+      final_op = self.tf_graph[final_op.name][0]
+      self.resolved_ops[final_op.name] = 1
+    else:
+      raise Exception('Convert atrous conv error: no BatchToSpaceND op')
+    # A lone Relu after that is absorbed too and switches the type to FusedConv2D.
+    if len(self.tf_graph[final_op.name]) == 1 \
+        and self.tf_graph[final_op.name][0].type == 'Relu':
+      relu_op = self.tf_graph[final_op.name][0]
+      op_def.type = "FusedConv2D"
+      final_op = relu_op
+      self.resolved_ops[relu_op.name] = 1
+    # The emitted op takes over the outputs of the last TF op absorbed above.
+    op_def.output.extend([output.name for output in final_op.outputs])
+    self.add_output_shape(final_op.outputs, op_def)
+    self.net_def.op.extend([op_def])
 
   def convert_normal_op(self, op):
     op_def = self.net_def.op.add()
@@ -407,7 +502,9 @@ class TFConverter(object):
         self.resolved_ops[op.name] = 1
         pass
       elif op.type == 'Const':
-        self.convert_tensor(op)
+        pass
+      elif self.is_atrous_conv2d(op):
+        self.convert_atrous_conv2d(op)
       elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
         self.convert_conv2d(op)
       elif op.type == 'FusedBatchNorm':
@@ -435,6 +532,15 @@ class TFConverter(object):
       else:
         raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
 
+
+    for op in self.tf_ops:
+      if self.resolved_ops[op.name] == 1:
+        continue
+      elif op.type == 'Const':
+        self.convert_tensor(op)
+      else:
+        raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
+
     if self.device == 'gpu':
       self.add_output_transform(output_node)
 
diff --git a/tools/validate.py b/tools/validate.py
index c1560a16e85b91906106c54438aa0cbf6a86df7b..46ada3d1606faca8db5c0de65fe025102cbc5fb9 100644
--- a/tools/validate.py
+++ b/tools/validate.py
@@ -10,8 +10,8 @@ from tensorflow import gfile
 
 # Validation Flow:
 # 1. Generate input data
-#    python validate_icnet.py --generate_data 1 \
-#          --random_seed 1
+#    python validate_icnet.py --generate_data 1
+#
 # 2. Use mace_run to run icnet on phone.
 # 3. adb pull the result.
 # 4. Compare output data of mace and tf
@@ -20,7 +20,7 @@ from tensorflow import gfile
 #        --mace_out_file icnet.out
 
 def generate_data(shape):
-  np.random.seed(FLAGS.random_seed)
+  np.random.seed()
   data = np.random.random(shape) * -1
   print FLAGS.input_file
   data.astype(np.float32).tofile(FLAGS.input_file)
@@ -122,12 +122,7 @@ def parse_args():
     "--generate_data",
     type='bool',
     default="false",
-    help="Random seed for generate test case.")
-  parser.add_argument(
-    "--random_seed",
-    type=int,
-    default="0",
-    help="Random seed for generate test case.")
+    help="Generate data or not.")
 
   return parser.parse_known_args()
 
diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh
index 1359a356bc84b89b6c711d2ab1e2108e4ddb99d3..35973abf42d8813713c14395640e21957b8d5a1f 100644
--- a/tools/validate_gcn.sh
+++ b/tools/validate_gcn.sh
@@ -79,7 +79,8 @@ build_and_run()
 }
 
 echo "Step 1: Generate input data"
-python tools/validate.py --generate_data true --random_seed 1 \
+rm -rf "${MODEL_DIR:?}/${INPUT_FILE_NAME:?}"  # :? aborts if either variable is empty
+python tools/validate.py --generate_data true \
  --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
  --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"