diff --git a/go/master/client.go b/go/master/client.go
index 62801b9b7fe85fe27147b12160f48d988623d547..f04cf50ce3cf765a79cbe555d3edb68f3dbb911e 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error {
 // WithEtcd sets the client to use etcd for master discovery.
 func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
 	return func(c *Client) error {
-		cli, err := clientv3.New(clientv3.Config{
-			Endpoints:   endpoints,
-			DialTimeout: timeout,
-		})
-		if err != nil {
+		var cli *clientv3.Client
+		f := func() error {
+			var err error
+			cli, err = clientv3.New(clientv3.Config{
+				Endpoints:   endpoints,
+				DialTimeout: timeout,
+			})
 			return err
 		}
+		for {
+			err := f()
+			if err != nil {
+				log.Warningln(err)
+			} else {
+				break
+			}
+			time.Sleep(time.Second)
+		}
 
 		ch := make(chan string, 1)
 		a, err := GetKey(cli, DefaultAddrPath, timeout)
@@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) {
 		}
 	}
 	c.ch = make(chan record, c.bufSize)
-	// FIXME: connection is created asyncrosly in monitorMaster go routine,
-	//        ensure the connection is ready for use before calling c.addClient.
-	time.Sleep(time.Second)
 	return c, nil
 }
 
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index db18e4912b63ec18dcfff3ef3aaf0c7947e0af18..3f68c62de6d9b3aaadc9180d86159089dc728ea9 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input,
                               int* convBwdDataAlgo,
                               size_t* bwdDataLimitBytes,
                               int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes);
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation);
 
 /**
  * @brief   destroy filter descriptor.
@@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                              int padding_height,
                                              int padding_width,
                                              int stride_height,
-                                             int stride_width);
+                                             int stride_width,
+                                             int dilation_h = 1,
+                                             int dilation_w = 1);
 
 /**
  * @brief   reset convolution descriptor.
@@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                            int stride_width);
+                                            int stride_width,
+                                            int dilation_h = 1,
+                                            int dilation_w = 1);
 
 /**
  * @brief   destroy convolution descriptor.
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index abd0d6b09901a7cd124c245e359f9d38f52bda26..3afcc6fa85a4a6a03697663719b6ab685897b68b 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                              int padding_height,
                                              int padding_width,
                                              int stride_height,
-                                             int stride_width) {}
+                                             int stride_width,
+                                             int dilation_h,
+                                             int dilation_w) {}
 
 inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             hl_tensor_descriptor image,
@@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                            int stride_width) {}
+                                            int stride_width,
+                                            int dilation_h,
+                                            int dilation_w) {}
 
 inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
 
@@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input,
                               int* convBwdDataAlgo,
                               size_t* bwdDataLimitBytes,
                               int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes) {}
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation) {}
 
 inline void hl_convolution_forward(hl_tensor_descriptor input,
                                    real* input_data,
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 78642a17443b0b4d81defaa46579332ef20c71a1..f38ef692558b908ed65d2c84821bbb7c3b439742 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input,
                        int* convBwdDataAlgo,
                        size_t* bwdDataLimitBytes,
                        int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes) {
+                       size_t* bwdFilterLimitBytes,
+                       bool useDilation) {
 #if CUDNN_VERSION >= 4000
 
   CHECK_NOTNULL(input);
@@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input,
   size_t memoryLimitBytes =
       (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
 
+  // For dilation
+  int algo = 0;
+
   // cudnn convolution forward configuration
   cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
   cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
   cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
   cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward data configuration
+  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward filter configuration
+  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
 
-  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-      t_resource.cudnn_handle,
-      fwd_src_desc,
-      fwd_filter_desc,
-      fwd_conv_desc,
-      fwd_dest_desc,
-      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+  if (useDilation) {
+    convFwdAlgo = &algo;
+    convBwdDataAlgo = &algo;
+    convBwdFilterAlgo = &algo;
+  } else {
+    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+        t_resource.cudnn_handle,
+        fwd_src_desc,
+        fwd_filter_desc,
+        fwd_conv_desc,
+        fwd_dest_desc,
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_data_filter_desc,
+        bwd_data_diff_desc,
+        bwd_data_conv_desc,
+        bwd_data_grad_desc,
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_filter_src_desc,
+        bwd_filter_diff_desc,
+        bwd_filter_conv_desc,
+        bwd_filter_grad_desc,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+  }
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
       t_resource.cudnn_handle,
@@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
       fwdLimitBytes));
 
-  // cudnn convolution backward data configuration
-  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_data_filter_desc,
-      bwd_data_diff_desc,
-      bwd_data_conv_desc,
-      bwd_data_grad_desc,
-      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
       t_resource.cudnn_handle,
       bwd_data_filter_desc,
@@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
       bwdDataLimitBytes));
 
-  // cudnn convolution backward filter configuration
-  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_filter_src_desc,
-      bwd_filter_diff_desc,
-      bwd_filter_conv_desc,
-      bwd_filter_grad_desc,
-      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
       t_resource.cudnn_handle,
       bwd_filter_src_desc,
@@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                       int padding_height,
                                       int padding_width,
                                       int stride_height,
-                                      int stride_width) {
+                                      int stride_width,
+                                      int dilation_h,
+                                      int dilation_w) {
   CHECK_NOTNULL(conv);
 
   cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
@@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode,
                                                        data_type));
 #else
+  if (dilation_h > 1 || dilation_w > 1) {
+    LOG(FATAL)
+        << "Current cuDNN version does't support for dilation convolution. "
+        << "The dilation convolution requires cuDNN >= v6.0.";
+  }
+
   CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
                                                        padding_height,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode));
 #endif
 
@@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                      int padding_height,
                                      int padding_width,
                                      int stride_height,
-                                     int stride_width) {
+                                     int stride_width,
+                                     int dilation_h,
+                                     int dilation_w) {
   CHECK_NOTNULL(conv);
   CHECK_NOTNULL(image);
   CHECK_NOTNULL(filter);
@@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode,
                                                        data_type));
 #else
@@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode));
 #endif
 
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index e161d89c38a290000a2cbdb2905e56901ae4c144..a5328ef8343e1050352fc48530e041fb6ce12a8b 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -32,9 +32,11 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     const ConvConfig& conf = inputConfig.conv_conf();
     padding_.push_back(conf.padding());
     stride_.push_back(conf.stride());
+    dilation_.push_back(conf.dilation());
     filterSize_.push_back(conf.filter_size());
     paddingY_.push_back(conf.padding_y());
     strideY_.push_back(conf.stride_y());
+    dilationY_.push_back(conf.dilation_y());
     filterSizeY_.push_back(conf.filter_size_y());
     filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
     channels_.push_back(conf.channels());
@@ -89,7 +91,11 @@ size_t ConvBaseLayer::calOutputSize() {
   size_t layerSize = 0;
 
   auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
+    size_t filterSizeY;
+    size_t filterSize;
     for (size_t i = 0; i < inputLayers_.size(); i++) {
+      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
+      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
       inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
       inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
       const ConvConfig& conf = config_.inputs(i).conv_conf();
@@ -98,17 +104,17 @@ size_t ConvBaseLayer::calOutputSize() {
           inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
         if (inW[i] == 0) inW[i] = conf.output_x();
         outH.push_back(imageSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(imageSize(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
+        outW.push_back(
+            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
       } else {
         if (inH[i] == 0)
           inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
         if (inW[i] == 0) inW[i] = conf.img_size();
         outH.push_back(outputSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
         outW.push_back(outputSize(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
       }
       CHECK_EQ(outH[i], outH[0]);
       CHECK_EQ(outW[i], outW[0]);
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index e9d15d94f806a5d2e6f11cbbfc29e291dfe8538f..223bce8e296d748c8e17eb105aa67e8a1c1219b6 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -40,6 +40,10 @@ protected:
   IntV stride_;
   /// The y dimension of the stride.
   IntV strideY_;
+  /// The x dimension of the dilation.
+  IntV dilation_;
+  /// The y dimension of the dilation.
+  IntV dilationY_;
   /// The x dimension of a filter kernel.
   IntV filterSize_;
   /// The y dimension of a filter kernel.
diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp
index 5c231986292d2cd26ee30ccc122142fccd5b4949..5469c41c87468001232f7bae0d5b6bf26693b9e0 100644
--- a/paddle/gserver/layers/ConvBaseOperator.cpp
+++ b/paddle/gserver/layers/ConvBaseOperator.cpp
@@ -59,7 +59,8 @@ void ConvBaseOperator::allocConvWorkSpace() {
                     &bwdDataAlgo_,
                     &bwdDataLimitBytes_,
                     &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_);
+                    &bwdFilterLimitBytes_,
+                    /*useDilation*/ false);
 
   size_t maxWorkSpace = 0;
   maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
index eb6b0445c95a9e9a7acd5d693ecdb11a263f41fd..08f36c516cfdadd42e9333c1c5a7a247df1f263e 100644
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -41,6 +41,11 @@ void ConvBaseProjection::getConvParams() {
   strideH_ = conf.stride_y();
   strideW_ = conf.stride();
 
+  dilationH_ = conf.dilation_y();
+  dilationW_ = conf.dilation();
+  CHECK_GT(dilationH_, 0);
+  CHECK_GT(dilationW_, 0);
+
   filterH_ = conf.filter_size_y();
   filterW_ = conf.filter_size();
 
@@ -77,7 +82,9 @@ void ConvBaseProjection::initCudnn() {
                                    paddingH_,
                                    paddingW_,
                                    strideH_,
-                                   strideW_);
+                                   strideW_,
+                                   dilationH_,
+                                   dilationW_);
 
   // initialize all to default algorithms
   fwdAlgo_ = 0;
@@ -131,7 +138,9 @@ void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
                                   paddingH_,
                                   paddingW_,
                                   strideH_,
-                                  strideW_);
+                                  strideW_,
+                                  dilationH_,
+                                  dilationW_);
 }
 
 void ConvBaseProjection::reshape(int batchSize) {
@@ -140,6 +149,10 @@ void ConvBaseProjection::reshape(int batchSize) {
   CHECK_EQ(calInputSize(), in_->value->getWidth());
 
   reshapeTensorDesc(batchSize);
+  bool useDilation = false;
+  if (dilationH_ > 1 || dilationW_ > 1) {
+    useDilation = true;
+  }
   hl_conv_workspace(imageDesc_,
                     outputDesc_,
                     filterDesc_,
@@ -149,7 +162,8 @@ void ConvBaseProjection::reshape(int batchSize) {
                     &bwdDataAlgo_,
                     &bwdDataLimitBytes_,
                     &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_);
+                    &bwdFilterLimitBytes_,
+                    useDilation);
 
   size_t maxWorkSpace = 0;
   maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index e9d9f8f1b2937b3a3b7323c43ef5608ffc5f82ca..ebdb57845bb36ac607b1e4c8e02f9d20b6e82a36 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -63,6 +63,7 @@ protected:
   int configChannels_, configNumFilters_;
   int paddingH_, paddingW_;
   int strideH_, strideW_;
+  int dilationH_, dilationW_;
   int filterH_, filterW_;
   /// One group offset of input data.
   int inputOffset_;
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index 5b7ecc5560c1e7431305b34a331fe1fbc96c6b06..6f0106b713d93494ba9baa5c7afa0a6b1f167262 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -25,12 +25,12 @@ size_t ConvProjection::calOutputSize() {
   if (imageH_ == 0) imageH_ = configImgH_;
   if (imageW_ == 0) imageW_ = configImgW_;
   outputH_ = outputSize(imageH_,
-                        filterH_,
+                        (filterH_ - 1) * dilationH_ + 1,
                         paddingH_,
                         strideH_,
                         /* caffeMode */ true);
   outputW_ = outputSize(imageW_,
-                        filterW_,
+                        (filterW_ - 1) * dilationW_ + 1,
                         paddingW_,
                         strideW_,
                         /* caffeMode */ true);
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index dd2c955e6a4660a1811f205ec5c5861798291912..9946f7666498e27a3149816c67ff4c9a9f3bb02a 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifndef PADDLE_ONLY_CPU
+#include <cudnn.h>
+#endif
 #include <gtest/gtest.h>
 #include <string>
 #include <vector>
@@ -189,10 +192,16 @@ TEST(Projection, scaling) {
 void testProjectionConv(size_t groups, bool isDeconv) {
   const int NUM_FILTERS = 18;
   const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 4;
+  const int FILTER_SIZE_Y = 2;
   const int CHANNELS = 3;
   const int IMAGE_SIZE = 16;
 
+#if CUDNN_VERSION >= 6000
+  const int DILATION = 2;
+#else
+  const int DILATION = 1;
+#endif
+
   ProjectionConfig conf;
   if (isDeconv) {
     conf.set_type("convt");
@@ -209,6 +218,8 @@ void testProjectionConv(size_t groups, bool isDeconv) {
   conv->set_padding_y(1);
   conv->set_stride(2);
   conv->set_stride_y(2);
+  conv->set_dilation(DILATION);
+  conv->set_dilation_y(DILATION);
   conv->set_groups(groups);
   if (isDeconv) {
     conv->set_filter_channels(NUM_FILTERS / conv->groups());
@@ -217,12 +228,12 @@ void testProjectionConv(size_t groups, bool isDeconv) {
   }
   conv->set_img_size(IMAGE_SIZE);
   int output_x = outputSize(conv->img_size(),
-                            conv->filter_size(),
+                            (conv->filter_size() - 1) * DILATION + 1,
                             conv->padding(),
                             conv->stride(),
                             /* caffeMode */ true);
   int output_y = outputSize(conv->img_size(),
-                            conv->filter_size_y(),
+                            (conv->filter_size_y() - 1) * DILATION + 1,
                             conv->padding_y(),
                             conv->stride_y(),
                             /* caffeMode */ true);
@@ -424,27 +435,38 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
+  int dilation = 1;
+  if (type == "cudnn_conv") {
+#if CUDNN_VERSION >= 6000
+    dilation = 2;
+#else
+    dilation = 1;
+#endif
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   ConvConfig* conv = input->mutable_conv_conf();
   conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
+  conv->set_filter_size_y(2);
   conv->set_channels(3);
   conv->set_padding(0);
   conv->set_padding_y(1);
   conv->set_stride(2);
   conv->set_stride_y(2);
+  conv->set_dilation(dilation);
+  conv->set_dilation_y(dilation);
   conv->set_groups(1);
   conv->set_filter_channels(conv->channels() / conv->groups());
   conv->set_img_size(16);
-  conv->set_img_size_y(8);
+  conv->set_img_size_y(16);
   conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
+                                (conv->filter_size() - 1) * dilation + 1,
                                 conv->padding(),
                                 conv->stride(),
                                 /* caffeMode */ true));
   conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
+                                (conv->filter_size_y() - 1) * dilation + 1,
                                 conv->padding_y(),
                                 conv->stride_y(),
                                 /* caffeMode */ true));
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index b56a45b6bd1e4d834a3c11da989b4a0707a24bf6..f466dbc79a2059faa1e3d4ad6ede3f2394580842 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -42,6 +42,7 @@ function(op_library TARGET)
 endfunction()
 
 add_subdirectory(math)
+
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 op_library(gather_op SRCS gather_op.cc gather_op.cu)
 
@@ -67,7 +68,7 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
 
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
     DEPS framework_proto tensor op_registry operator net_op)
-op_library(uniform_random_op
-        SRCS uniform_random_op.cc uniform_random_op.cu)
+op_library(uniform_random_op SRCS uniform_random_op.cc uniform_random_op.cu)
+op_library(lookup_table_op SRCS lookup_table_op.cc lookup_table_op.cu)
 op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
 op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index fd380ca8514b0ac50f39613368a4836bd485668b..969998ce2eae02b8ad057c6259703e51559bf98a 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -26,7 +26,7 @@ class FillZerosLikeKernel : public framework::OpKernel {
     auto* output = context.Output<framework::Tensor>("Dst");
     output->mutable_data<T>(context.GetPlace());
     auto t = framework::EigenVector<T>::Flatten(*output);
-    t.device(context.GetEigenDevice<Place>()) = t.constant(T(0));
+    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
   }
 };
 
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..94d40890a765413e88a35a6ad995ca97ac84dcda
--- /dev/null
+++ b/paddle/operators/lookup_table_op.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lookup_table_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Forward lookup_table operator; only shape inference is defined here.
+class LookupTableOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  // Resizes Out to {Ids.dims()[0], W.dims()[1]}.
+  void InferShape(const framework::InferShapeContext &context) const override {
+    auto w = context.Input<Tensor>("W");
+    auto ids = context.Input<Tensor>("Ids");
+    auto out = context.Output<Tensor>("Out");
+    out->Resize({ids->dims()[0], w->dims()[1]});
+  }
+};
+
+// Declares the inputs, output and description of the lookup_table operator.
+class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LookupTableOpMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("W",
+             "An input represents embedding tensors,"
+             " which is a learnable parameter.");
+    AddInput("Ids",
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W.");
+    AddOutput("Out", "The lookup results, which have the same type with W.");
+    AddComment("This operator is used to perform lookups on the parameter W, "
+               "then concatenated into a dense tensor.");
+  }
+};
+
+// Gradient operator (lookup_table_grad): W's gradient takes the shape of W.
+class LookupTableOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &context) const override {
+    auto w = context.Input<Tensor>("W");
+    auto w_grad = context.Output<Tensor>(framework::GradVarName("W"));
+    w_grad->Resize(w->dims());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
+            lookup_table_grad, ops::LookupTableOpGrad);
+// Only the float instantiations of the CPU kernels are registered here.
+REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>);
+REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>);
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..27eee3436af8107cef2aa3577ea238be49edf1af
--- /dev/null
+++ b/paddle/operators/lookup_table_op.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Copies row ids[r] of table (N x D) into row r of output, for r in [0, K).
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void LookupTable(T* output, const T* table, const int32_t* ids,
+                            const int N, const int K, const int D) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  while (idy < K) {
+    int id = ids[idy];
+    PADDLE_ASSERT(id >= 0 && id < N);
+    // 64-bit offsets: id * D and idy * D can exceed INT_MAX on large tables.
+    T* out = output + static_cast<int64_t>(idy) * D;
+    const T* tab = table + static_cast<int64_t>(id) * D;
+    for (int i = idx; i < D; i += BlockDimX) {
+      out[i] = tab[i];
+    }
+    idy += BlockDimY * GridDimX;
+  }
+}
+
+// Atomically adds row r of output into row ids[r] of table, for r in [0, K).
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids,
+                                const int N, const int K, const int D) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  while (idy < K) {
+    int id = ids[idy];
+    PADDLE_ASSERT(id >= 0 && id < N);
+    // 64-bit offsets: id * D and idy * D can exceed INT_MAX on large tables.
+    const T* out = output + static_cast<int64_t>(idy) * D;
+    T* tab = table + static_cast<int64_t>(id) * D;
+    for (int i = idx; i < D; i += BlockDimX) {
+      paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+    }
+    idy += BlockDimY * GridDimX;
+  }
+}
+
+// GPU forward kernel: gathers the rows of W selected by Ids into Out.
+template <typename T>
+class LookupTableCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto w = context.Input<Tensor>("W");
+    auto idx = context.Input<Tensor>("Ids");
+    auto out = context.Output<Tensor>("Out");
+    // N rows and D columns in W; K ids to look up.
+    size_t N = w->dims()[0];
+    size_t D = w->dims()[1];
+    size_t K = product(idx->dims());
+    auto ids = idx->data<int32_t>();
+    auto table = w->data<T>();
+    auto output = out->mutable_data<T>(context.GetPlace());
+    // Launch shape mirrors the kernel's template arguments <128, 8, 8>.
+    dim3 threads(128, 8), grids(8, 1);
+    LookupTable<T, 128, 8, 8><<<grids, threads>>>(output, table, ids, N, K, D);
+  }
+};
+
+// GPU backward kernel: zeroes dW, then accumulates rows of dOut into dW.
+template <typename T>
+class LookupTableGradCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto ids_t = context.Input<Tensor>("Ids");
+    auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
+    int N = d_table_t->dims()[0];
+    int D = d_table_t->dims()[1];
+    int K = product(ids_t->dims());
+    const int32_t* ids = ids_t->data<int32_t>();
+    const T* d_output = d_output_t->data<T>();
+    T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+    // Zero dW first: the kernel launched below only adds into it.
+    auto t = framework::EigenVector<T>::Flatten(*d_table_t);
+    t.device(context.GetEigenDevice<platform::GPUPlace>()) =
+        t.constant(static_cast<T>(0));
+    // NOTE(review): default-stream launch; confirm it orders after the fill.
+    dim3 threads(128, 8);
+    dim3 grids(8, 1);
+    LookupTableGrad<T, 128, 8, 8><<<grids, threads>>>(d_table, d_output, ids, N,
+                                                      K, D);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // GPU kernels below: float only
+REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lookup_table_grad,
+                       ops::LookupTableGradCUDAKernel<float>);
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4da8079b91624c3510cae89fd599a7035a4c7477
--- /dev/null
+++ b/paddle/operators/lookup_table_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// CPU forward: Out[i] = W[Ids[i]]; the id count K is computed once up front.
+template <typename T>
+class LookupTableKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto table_t = context.Input<Tensor>("W");      // float tensor
+    auto ids_t = context.Input<Tensor>("Ids");      // int tensor
+    auto output_t = context.Output<Tensor>("Out");  // float tensor
+    size_t N = table_t->dims()[0];
+    size_t D = table_t->dims()[1];
+    auto ids = ids_t->data<int32_t>();
+    auto table = table_t->data<T>();
+    auto output = output_t->mutable_data<T>(context.GetPlace());
+    for (size_t i = 0, K = product(ids_t->dims()); i < K; ++i) {
+      PADDLE_ENFORCE_GE(ids[i], 0);  // check the sign first: N is unsigned
+      PADDLE_ENFORCE_LT(ids[i], N);
+      memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+    }
+  }
+};
+
+// CPU backward kernel: zeroes dW, then adds row i of dOut into row Ids[i].
+template <typename T>
+class LookupTableGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto ids_t = context.Input<Tensor>("Ids");
+    auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
+    size_t N = d_table_t->dims()[0];
+    size_t D = d_table_t->dims()[1];
+    size_t K = product(ids_t->dims());  // loop-invariant, computed once
+    auto ids = ids_t->data<int32_t>();
+    const T* d_output = d_output_t->data<T>();
+    T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+    // Start from all zeros; the loop below only accumulates into dW.
+    auto t = framework::EigenVector<T>::Flatten(*d_table_t);
+    t.device(context.GetEigenDevice<platform::CPUPlace>()) =
+        t.constant(static_cast<T>(0));
+    for (size_t i = 0; i < K; ++i) {
+      PADDLE_ENFORCE_GE(ids[i], 0);  // check the sign first: N is unsigned
+      PADDLE_ENFORCE_LT(ids[i], N);
+      for (size_t j = 0; j < D; ++j) {
+        d_table[ids[i] * D + j] += d_output[i * D + j];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index cbc61ad3e117fc79a674ca21831d3fec59d1ec5b..4a57f64c890ce99d6060faec6a4a01b107403344 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -18,3 +18,6 @@
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add_grad,
+    ops::RowwiseAddGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/platform/cuda_helper.h b/paddle/platform/cuda_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..6feec0d7f8bd5d32d9e5eedee962fcbeff655f1c
--- /dev/null
+++ b/paddle/platform/cuda_helper.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+// CUDA_ATOMIC_WRAPPER(op, T) expands to the signature of CudaAtomic<op>.
+#define CUDA_ATOMIC_WRAPPER(op, T) \
+  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
+// USE_CUDA_ATOMIC(op, T) defines that wrapper by calling atomic<op>.
+#define USE_CUDA_ATOMIC(op, T) \
+  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
+
+// For atomicAdd.
+USE_CUDA_ATOMIC(Add, float);
+// double: forward to atomicAdd when __CUDA_ARCH__ >= 600, else emulate by CAS.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+USE_CUDA_ATOMIC(Add, double);
+#else
+CUDA_ATOMIC_WRAPPER(Add, double) {
+  unsigned long long int* address_as_ull =
+      reinterpret_cast<unsigned long long int*>(address);
+  unsigned long long int old = *address_as_ull, assumed;
+  // Retry until the value seen by atomicCAS equals the one the sum was based on.
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+    // Note: uses integer comparison to avoid hang in case of NaN
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+#endif
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index 40db811767a9c273f073f4715e6ddfbf05887730..abb9c248eee9c59e8e6b9fa9d1878fec5dd67569 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -15,6 +15,7 @@ cc_library(paddle_pybind SHARED
     uniform_random_op
     gaussian_random_op
     fill_zeros_like_op
+    lookup_table_op
     scale_op
     minus_op)
 endif(WITH_PYTHON)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 27b98e77db80505f7498deb75164e184b900262b..8fa8be2cef5fff04ed61ac726e5d8111e30c8a09 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -42,6 +42,7 @@ USE_OP(fill_zeros_like);
 USE_OP_ITSELF(recurrent_op);
 USE_OP(gaussian_random);
 USE_OP(uniform_random);
+USE_OP(lookup_table);
 USE_OP(scale);
 USE_OP_ITSELF(identity);
 USE_OP(minus);
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 4f3d5bf3f6cb96c97285f40e3a3d100c2af47ad5..1ea1e052596524f5baa0a55f601c4fa928acd8af 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -82,6 +82,9 @@ message ConvConfig {
 
   // if not set, use img_size
   optional uint32 img_size_y = 14;
+
+  optional uint32 dilation = 15 [ default = 1 ];
+  optional uint32 dilation_y = 16 [ default = 1 ];
 }
 
 message PoolConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 56afaa59bb08b33b7c2bbbff86e77cd29af260ae..efc76764662b3832dbacc6c8a3c2bca4ccbe4cd8 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -870,12 +870,16 @@ class Conv(Cfg):
                  caffe_mode=True,
                  filter_size_y=None,
                  padding_y=None,
-                 stride_y=None):
+                 stride_y=None,
+                 dilation=None,
+                 dilation_y=None):
         self.add_keys(locals())
         if filter_size_y is None:
             self.filter_size_y = filter_size
         if padding_y is None:
             self.padding_y = padding
+        if dilation_y is None:
+            self.dilation_y = dilation
         if stride_y is None:
             self.stride_y = stride
         if output_x is not None:
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 252b66feba8c7933fe8d789bbbc365cd51c51476..f323b017c06bb3d0f2c69de0faccb04258ccd2ad 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2342,6 +2342,7 @@ def img_conv_layer(input,
                    groups=1,
                    stride=1,
                    padding=0,
+                   dilation=1,
                    bias_attr=None,
                    param_attr=None,
                    shared_biases=True,
@@ -2349,6 +2350,7 @@ def img_conv_layer(input,
                    filter_size_y=None,
                    stride_y=None,
                    padding_y=None,
+                   dilation_y=None,
                    trans=False,
                    layer_type=None):
     """
@@ -2413,6 +2415,11 @@ def img_conv_layer(input,
     :type padding: int|tuple|list
     :param padding_y: The y dimension of the padding.
     :type padding_y: int
+    :param dilation: The x dimension of the dilation. Or input a tuple for two
+                    image dimension
+    :type dilation: int|tuple|list
+    :param dilation_y: The y dimension of the dilation.
+    :type dilation_y: int
     :param bias_attr: Convolution bias attribute. None means default bias.
                       False means no bias.
     :type bias_attr: ParameterAttribute|False
@@ -2460,6 +2467,13 @@ def img_conv_layer(input,
         else:
             padding_y = padding
 
+    if dilation_y is None:
+        if isinstance(dilation, collections.Sequence):
+            assert len(dilation) == 2
+            dilation, dilation_y = dilation
+        else:
+            dilation_y = dilation
+
     if param_attr.attr.get('initial_smart'):
         # special initial for conv layers.
         init_w = (2.0 / (filter_size**2 * num_channels))**0.5
@@ -2469,6 +2483,8 @@ def img_conv_layer(input,
         param_attr.attr["initial_smart"] = False
 
     if layer_type:
+        if dilation > 1 or dilation_y > 1:
+            assert layer_type in ["cudnn_conv", "cudnn_convt"]
         if trans:
             assert layer_type in ["exconvt", "cudnn_convt"]
         else:
@@ -2484,11 +2500,13 @@ def img_conv_layer(input,
             conv=Conv(
                 filter_size=filter_size,
                 padding=padding,
+                dilation=dilation,
                 stride=stride,
                 channels=num_channels,
                 groups=groups,
                 filter_size_y=filter_size_y,
                 padding_y=padding_y,
+                dilation_y=dilation_y,
                 stride_y=stride_y),
             **param_attr.attr),
         active_type=act.name,
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
index 9fda16a5407a1fe0af8c5986023a8368e5b87222..01d31ef3fad827bfd103ee00f4ddd1bde14e0f82 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
@@ -12,6 +12,7 @@ img_conv = img_conv_layer(
     num_filters=64,
     filter_size=(32, 32),
     padding=(1, 1),
+    dilation=(1, 1),
     stride=(1, 1),
     act=LinearActivation())
 img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 3f4110e4a9de796140af9703559937338d27f251..fb4686889a644753afdeb748b444e757ed016eda 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -28,4 +28,6 @@ py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
 py_test(test_recurrent_op SRCS test_recurrent_op.py)
 py_test(test_sgd_op SRCS test_sgd_op.py)
 py_test(test_gradient_checker SRCS test_gradient_checker.py)
+py_test(test_lookup_table SRCS test_lookup_table.py)
 py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
+py_test(mnist SRCS mnist.py)
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index d7809e52fbd720c74bc6598cb03623800a4fbdf7..8eb9f3f07321d388e0035825b71534b7d9076cf6 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -23,6 +23,10 @@ def grad_var_name(var_name):
     return var_name + "@GRAD"
 
 
+def empty_var_name():
+    return "@EMPTY@"  # placeholder name that compare_grad filters out
+
+
 def get_numeric_gradient(op,
                          input_values,
                          output_name,
@@ -182,7 +186,7 @@ class GradientChecker(unittest.TestCase):
         ]
         return outs
 
-    def compare_grad(self, forward_op, input_value):
+    def compare_grad(self, forward_op, input_value, no_grad_set=None):
         """ Compare the input gradients between CPU and GPU for the given forward
         operator.
 
@@ -190,15 +194,20 @@ class GradientChecker(unittest.TestCase):
         :type forward_op: Operator
         :param input_value: input values.
         :type input_value: dict{string:numpy.array}
+        :param no_grad_set: the set of variables names without gradients.
+        :type no_grad_set: a set of string
         :raises: AssertionError, there is different gradient value.
         """
-        backward_op = core.Operator.backward(forward_op, set())
+        if no_grad_set is None:
+            no_grad_set = set()
+        backward_op = core.Operator.backward(forward_op, no_grad_set)
         # return if not compile with GPU or not implementing GPU kernel
         if not (core.is_compile_gpu() and backward_op.support_gpu()):
             return
 
         outputs = backward_op.outputs()
         out_names = [item for k in outputs for item in outputs[k]]
+        out_names = filter(lambda x: x != empty_var_name(), out_names)
         cpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
                                         out_names, core.CPUPlace())
         gpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
diff --git a/python/paddle/v2/framework/tests/mnist.py b/python/paddle/v2/framework/tests/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a0b109850e92c66e69f74c5cd0853a09b5551a1
--- /dev/null
+++ b/python/paddle/v2/framework/tests/mnist.py
@@ -0,0 +1,249 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+import numpy
+import paddle.v2 as paddle
+
+BATCH_SIZE = 100
+
+scope = core.Scope()
+place = core.CPUPlace()
+# if you want to test GPU training, you can use gpu place
+# place = core.GPUPlace(0)
+dev_ctx = core.DeviceContext.create(place)
+# backward_net stays None until it is derived from the finished forward_net
+init_net = core.Net.create()
+forward_net = core.Net.create()
+backward_net = None
+optimize_net = core.Net.create()
+
+
+def atomic_id():
+    id = 0
+    while True:
+        yield id
+        id += 1
+
+
+uniq_id = atomic_id().next
+
+
+def data_layer(name, dims):
+    var = scope.new_var(name)
+    tensor = var.get_tensor()
+    tensor.set_dims(dims)  # 1 is batch size holder.
+    return name
+
+
+def feed_data(name, data):
+    assert isinstance(data, numpy.ndarray)
+    tensor = scope.find_var(name).get_tensor()
+    tensor.set_dims(data.shape)
+    if data.dtype == numpy.dtype('int32'):
+        tensor.alloc_int(place)
+    elif data.dtype == numpy.dtype('float32'):
+        tensor.alloc_float(place)
+    else:
+        raise ValueError("data type not supported")
+    tensor.set(data, place)
+
+
+def grad_var_name(var_name):
+    return var_name + "@GRAD"
+
+
+def sgd_optimizer(net, param_name, learning_rate=0.005):
+    grad_name = grad_var_name(param_name)
+    optimize_op = Operator(
+        "sgd",
+        param=param_name,
+        grad=grad_name,
+        param_out=param_name,
+        learning_rate=learning_rate)
+    net.append_op(optimize_op)
+
+
+# should use operator and add these to the init_network
+def init_param(net, param_name, dims):
+    scope.new_var(param_name)
+    op = Operator(
+        "uniform_random", Out=param_name, dims=dims, min=-0.5, max=0.5, seed=10)
+    op.infer_shape(scope)
+    net.append_op(op)
+
+
+# fc_layer
+def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
+    """
+    Add a fc layer to net
+    :param net: net that receives the mul, rowwise_add and activation ops.
+    :param input: input variable name.
+    :type input: str
+    :param size: fully connected layer size.
+    :param act: activation name
+    :param param: name of the weight variable; defaults to name + ".w".
+    :param bias: if true, a bias variable named name + ".b" is added.
+    :param name: the name of fc layer. If not set, model will generate a
+    readable name
+    :return: output variable name.
+    """
+    if name is None:
+        name = 'fc_%d' % uniq_id()
+    if not isinstance(name, str):
+        raise ValueError("name should be string")
+
+    input_dims = scope.find_var(input).get_tensor().get_dims()
+    # weight of dims [input_dims[1], size]; init and sgd ops go to global nets
+    w_name = param or name + ".w"
+    init_param(net=init_net, param_name=w_name, dims=[input_dims[1], size])
+    sgd_optimizer(net=optimize_net, param_name=w_name, learning_rate=0.01)
+    # the "mul" op writes input * weight into name + ".mul.out"
+    pre_activation = name + ".mul.out"
+    scope.new_var(pre_activation)
+    mul_op = Operator("mul", X=input, Y=w_name, Out=pre_activation)
+    net.append_op(mul_op)
+
+    # create bias variable if needed
+    if bias:
+        bias_name = name + ".b"
+        init_param(net=init_net, param_name=bias_name, dims=[size])
+        sgd_optimizer(
+            net=optimize_net, param_name=bias_name, learning_rate=0.001)
+        bias_out = name + ".rowwise_add.out"
+        scope.new_var(bias_out)
+        rowwise_append_op = Operator(
+            "rowwise_add", X=pre_activation, b=bias_name, Out=bias_out)
+        net.append_op(rowwise_append_op)
+        pre_activation = bias_out
+    # the activation op writes the layer output into the variable `name`
+    activation_op = Operator(act, X=pre_activation, Y=name)
+    net.append_op(activation_op)
+    scope.new_var(name)
+    net.infer_shape(scope)
+    return name
+
+
+def cross_entropy_layer(net, input, label):
+    cost_name = 'cross_entropy_%d' % uniq_id()
+    cross_entropy_op = Operator(
+        "onehot_cross_entropy", X=input, label=label, Y=cost_name)
+    net.append_op(cross_entropy_op)
+    scope.new_var(cost_name)
+    net.infer_shape(scope)
+    return cost_name
+
+
+def create_backward_net(forward_net):
+    net = core.Operator.backward(forward_net, set())
+    for input in net.inputs()["all"]:
+        var = scope.new_var(input)
+        var.get_tensor()
+    for output in net.outputs()["all"]:
+        var = scope.new_var(output)
+        var.get_tensor()
+    return net
+
+
+def debug_print_op(op):  # print op type and the dims of each input/output var
+    print("===============" + op.type() + "==============")
+    print("***inputs:***")
+    for input in op.inputs()["all"]:
+        print input, scope.find_var(input).get_tensor().get_dims()
+    print("\n***outputs:***")
+    for output in op.outputs()["all"]:
+        print output, scope.find_var(output).get_tensor().get_dims()
+    print("")
+    print("")
+
+
+def set_cost(cost):
+    cost_shape = numpy.array(scope.find_var(cost).get_tensor()).shape
+    cost_grad = \
+        scope.find_var(grad_var_name(cost)).get_tensor()
+    cost_grad.set_dims(cost_shape)
+    cost_grad.alloc_float(place)
+    cost_grad.set(numpy.ones(cost_shape).astype("float32"), place)
+
+
+def get_cost_mean(cost):
+    cost_data = numpy.array(scope.find_var(cost).get_tensor())
+    return cost_data.sum() / len(cost_data)
+
+
+def error_rate(predict, label):
+    predict_var = numpy.array(scope.find_var(predict).get_tensor()).argmax(
+        axis=1)
+    label = numpy.array(scope.find_var(label).get_tensor())
+    error_num = numpy.sum(predict_var != label)
+    return error_num / float(len(label))
+
+
+images = data_layer(name='pixel', dims=[BATCH_SIZE, 784])
+labels = data_layer(name='label', dims=[BATCH_SIZE])
+fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid")
+fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid")
+predict = fc_layer(net=forward_net, input=fc2, size=100, act="softmax")
+cost = cross_entropy_layer(net=forward_net, input=predict, label=labels)
+# NOTE(review): predict has size=100; MNIST presumably has 10 classes - confirm
+init_net.complete_add_op(True)
+forward_net.complete_add_op(True)
+backward_net = create_backward_net(forward_net)
+optimize_net.complete_add_op(True)
+
+print(init_net)
+print(forward_net)
+print(backward_net)
+print(optimize_net)
+
+debug_print_op(forward_net)
+debug_print_op(backward_net)
+debug_print_op(optimize_net)
+# shuffled MNIST training reader, batched into BATCH_SIZE samples
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+
+def test(cost_name):
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    cost = []
+    error = []
+    for data in test_reader():
+        image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
+        label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
+        feed_data(images, image_data)
+        feed_data(labels, label_data)
+
+        forward_net.infer_shape(scope)
+        forward_net.run(scope, dev_ctx)
+        cost.append(get_cost_mean(cost_name))
+        error.append(error_rate(predict, "label"))
+    print("cost=" + str(sum(cost) / float(len(cost))) + " error_rate=" + str(
+        sum(error) / float(len(error))))
+
+
+PASS_NUM = 1
+# run the parameter initialisers once, before any training pass
+init_net.run(scope, dev_ctx)
+for pass_id in range(PASS_NUM):
+    batch_id = 0
+
+    for data in train_reader():
+        image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
+        label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
+        feed_data(images, image_data)
+        feed_data(labels, label_data)
+        # forward, seed the cost gradient with ones, backward, then optimize
+        forward_net.infer_shape(scope)
+        forward_net.run(scope, dev_ctx)
+        set_cost(cost)
+        backward_net.infer_shape(scope)
+        backward_net.run(scope, dev_ctx)
+
+        optimize_net.run(scope, dev_ctx)
+        if batch_id % 100 == 0:
+            print("pass[" + str(pass_id) + "] batch_id[" + str(batch_id) + "]")
+            test(cost)
+
+        batch_id = batch_id + 1
diff --git a/python/paddle/v2/framework/tests/test_lookup_table.py b/python/paddle/v2/framework/tests/test_lookup_table.py
new file mode 100644
index 0000000000000000000000000000000000000000..19eb464baa555fb67a994f3cfb4d3ed628367c73
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_lookup_table.py
@@ -0,0 +1,31 @@
+import unittest
+import numpy as np
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+
+
+class TestSigmoidOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = 'lookup_table'
+        table = np.random.random((17, 31)).astype('float32')
+        ids = np.random.randint(0, 17, 4).astype('int32')
+        self.inputs = {'W': table, 'Ids': ids}
+        self.outputs = {'Out': table[ids]}
+
+
+class TestSigmoidGradOp(GradientChecker):
+    def test_grad(self):
+        op = create_op('lookup_table')
+        table = np.random.random((17, 31)).astype('float32')
+        ids = np.random.randint(0, 17, 4).astype('int32')
+        inputs = {'W': table, 'Ids': ids}
+        # comapre gradients 
+        self.compare_grad(op, inputs, set(['Ids']))
+        # check gradients 
+        self.check_grad(op, inputs, set('W'), 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()