Merge branch 'develop' into fix-crf-weight-and-coeff-bug

1242e794 · Peng Li · d60d34ef · 257819d3 · 1242e794 · d60d34ef
18 changed file
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -15,25 +15,11 @@ set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
 # HTML output directory
 set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")

-
-set(PADDLE_DOXYGEN_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/doxygen_xml")
-
 configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
    "${BINARY_BUILD_DIR}/conf.py"
    @ONLY)

-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in"
-    "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile"
-    @ONLY
-  )
-
-add_custom_target(paddle_doxygen_docs ALL
-    ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-)
-
 sphinx_add_target(paddle_docs
                  html
                  ${BINARY_BUILD_DIR}
@@ -41,6 +27,5 @@ sphinx_add_target(paddle_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR})

-add_dependencies(paddle_docs 
-  gen_proto_py
-  paddle_doxygen_docs)
+add_dependencies(paddle_docs
+  gen_proto_py)
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
--- a/doc/conf.py.in
+++ b/doc/conf.py.in
@@ -25,20 +25,6 @@ sys.path.insert(0, '@PROJ_ROOT@/python')

 templates_path = ["@PROJ_ROOT@/doc/templates"]

-# -- Doxygen Settings
-breathe_projects = {
-   'paddle': '@PADDLE_DOXYGEN_OUTPUT@/xml'
-}
-breathe_default_project = 'paddle'
-
-breathe_domain_by_extension = {
-    'h': 'cpp',  # mapping XXX.h XXX.cpp together
-}
-
-breathe_default_members = {
-    'protected-members','undoc-members'
-}
-
 # -- General configuration ------------------------------------------------

 # General information about the project.
@@ -62,7 +48,6 @@ extensions = [
    'sphinx.ext.autosummary',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
-    'breathe'
 ]



--- a/doc_cn/build/docker/build_docker_image.rst
+++ b/doc_cn/build/docker/build_docker_image.rst
-构建PaddlePaddle Docker Image
-===========================
-
-PaddlePaddle的Docker Image构建源码放置在 :code:`${源码根目录}/paddle/scripts/docker/`目录下。
-该Image基于ubuntu 14.04。该目录下有两个文件，Dockerfile和build.sh。其中:
-
-*  Dockerfile是docker image的主要描述文件。描述了Docker image的构建步骤、各种参数和维护
-   人员等等。
-*  build.sh是docker image的主要构建步骤。
-
-该image的构建在docker 1.12版本测试通过, 低于docker 1.12版本的情况下并没有测试。主要由于旧版本
-的docker可能缺乏 :code:`--build-arg` 参数，从而不能在运行编译命令的时候接受参数。
-
-同时，该构建脚本充分考虑了网络不稳定的情况，对于cuda的Toolkit有断点续传和传输速度过小重启下载的
-简单优化。
-
-使用脚本构建PaddlePaddle Docker Image
-------------------------------------------
-
-该脚本的使用方法是，进入该源码目录，执行 :code:`docker build .` 命令。可以使用
- :code:`--build-arg` 传入的配置参数包括:
-
-*  LOWEST\_DL\_SPEED\: 多线程下载过程中，最低线程的下载速度(默认单位是Bytes，可以传入10K, 
-   10M，或者10G这样的单位)。如果小于这个下载速度，那么这个下载线程将会关闭。所有的下载线程关闭时，
-   下载进程会重启。
-*  WITH\_GPU\: ON or OFF。是否开启GPU功能。注意，编译PaddlePaddle的GPU版本并不需要一定在具有GPU
-   的机器上进行。但是，运行PaddlePaddle的GPU版本一定要在具有CUDA的机器上运行。
-
-简单的使用样例为\:
-
-..  code-block:: bash
-
-    cd ${源码根目录}/paddle/scripts/docker/
-    docker build --build-arg LOWEST_DL_SPEED=50K\
-                 --build-arg WITH_GPU=ON \
-                 --tag  paddle_gpu:latest .
-
-即可在本地编译出PaddlePaddle的镜像。
--- a/doc_cn/howto/build_docker_image.rst
+++ b/doc_cn/howto/build_docker_image.rst
+构建PaddlePaddle的Docker Image
+==============================
+PaddlePaddle的Docker Image构建源码放置在 ``${源码根目录}/paddle/scripts/docker/`` 目录下。该目录有三类文件：
+
+- Dockerfile：Docker Image的描述文件，包括构建步骤、各种参数和维护人员等。
+  
+  - 一共维护了12个Dockerfile，Dockerfile.m4是它们的模板。
+  - PaddlePaddle中所有的Image都基于ubuntu 14.04。
+
+- build.sh：Docker Image的构建脚本，使用方式见下一小节。
+- generate.sh：通过Dockerfile.m4模板生成不同的Dockerfile。
+
+使用脚本构建Docker Image
+------------------------
+
+进入源码目录，执行 ``docker build`` 命令，即可在本地编译出PaddlePaddle的镜像。简单的使用样例为
+
+..  code-block:: bash
+
+    cd ${源码根目录}/paddle/scripts/docker/
+    docker build --build-arg LOWEST_DL_SPEED=50K\
+                 --build-arg WITH_GPU=ON \
+                 --tag  paddle_gpu:latest .
+
+其中，``--build-arg`` 传入的配置参数包括:
+
+- LOWEST\_DL\_SPEED\: 在多线程下载过程中，设置下载线程的最低速度。
+
+  - 默认单位是Bytes，但可以传入10K、10M、或10G等这样的单位。
+  - 如果小于这个速度，那么这个线程将会关闭。当所有的线程都关闭了，那么下载进程将会重启。
+
+- WITH\_GPU\: ON or OFF，是否开启GPU功能。注意，
+
+  - **编译** PaddlePaddle的GPU版本 **不一定** 要在具有GPU的机器上进行。
+  - **运行** PaddlePaddle的GPU版本 **一定** 要在具有GPU的机器上运行。
+
+注意：所有Image的构建在Docker 1.12版本测试通过, 低于1.12的版本并没有测试。原因是旧版本可能缺乏 ``--build-arg`` 参数，从而不能在运行编译命令的时候接受参数。
--- a/doc_cn/index.rst
+++ b/doc_cn/index.rst
@@ -16,6 +16,7 @@ PaddlePaddle文档
 --------
 * `新写Layer <../doc/dev/new_layer/index.html>`_
 * `如何贡献文档 <howto/how_to_write_docs/index.html>`_
+* `如何构建Docker Image <howto/build_docker_image.html>`_

 算法教程
 --------

--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -59,7 +59,8 @@ void ConvProjection::getConvParams() {

 void ConvProjection::initCudnn() {
  hl_create_filter_descriptor(
-      &filterDesc_, channels_, numFilters_, filterH_, filterW_);
+      &filterDesc_, channels_ / groups_, numFilters_ / groups_,
+      filterH_, filterW_);
  hl_create_tensor_descriptor(&inputDesc_);
  hl_create_tensor_descriptor(&outputDesc_);
  hl_create_convolution_descriptor(&convDesc_,
@@ -86,7 +87,7 @@ void ConvProjection::initCudnn() {
 void ConvProjection::reshapeTensorDesc(int batchSize) {
  hl_tensor_reshape(inputDesc_,
                    batchSize,
-                    channels_,
+                    channels_ / groups_,
                    imageH_,
                    imageW_,
                    channels_ * imageH_ * imageW_,
@@ -115,7 +116,7 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {

  hl_tensor_reshape(outputDesc_,
                    batchSize,
-                    numFilters_,
+                    numFilters_ / groups_,
                    outputH_,
                    outputW_,
                    nStride,

--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -145,7 +145,7 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
  real *expInData = expandInput_->getData();
  for (int g = 0; g < groups_[inIdx]; ++g) {
    MatrixPtr A =
-        Matrix::create(wgtData, subK, subM, true, useGpu_);  // mark transpose
+        Matrix::create(wgtData, subM, subK, false, useGpu_);  // subM x subK, not transposed
    MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
    MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
    C->mul(A, B, 1, 1);
@@ -182,7 +182,7 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
      // create temporary matrix
      MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
      MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_);
+      MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
      C->mul(A, B);  // mul

      // clear the temporary matrix
@@ -247,10 +247,10 @@ void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,

    // expand-mul one-group by one
    for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_);
-      C->mul(A, B, 1, 1);
+      MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
+      MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
+      MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
+      C->mul(B, A, 1, 1);

      A->clear();
      B->clear();

--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -43,7 +43,14 @@ add_unittest_without_exec(test_ConvTrans

 add_test(NAME test_ConvTrans
    COMMAND test_ConvTrans)
+################# test_ConvUnify #######################
+add_unittest_without_exec(test_ConvUnify
+    test_ConvUnify.cpp
+    LayerGradUtil.cpp
+    TestUtil.cpp)

+add_test(NAME test_ConvUnify
+    COMMAND test_ConvUnify)
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
    test_Evaluator.cpp

--- a/paddle/gserver/tests/img_conv_a.conf
+++ b/paddle/gserver/tests/img_conv_a.conf
@@ -34,6 +34,7 @@ conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
                      num_channels=8,
                      num_filters=16, stride=1,
                      bias_attr=True,
-                      act=LinearActivation())
+                      act=LinearActivation(),
+                      groups=2)

 outputs(concat, conv)
--- a/paddle/gserver/tests/img_conv_b.conf
+++ b/paddle/gserver/tests/img_conv_b.conf
@@ -24,7 +24,7 @@ proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
 concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())

 proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8, num_filters=16, stride=1)
+                       num_channels=8, num_filters=16, stride=1, groups=2)

 with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
    conv += proj

--- a/paddle/gserver/tests/img_conv_c.conf
+++ b/paddle/gserver/tests/img_conv_c.conf
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+data = data_layer(name ="input", size=8*16*16)  # 8 channels (num_channels below); presumably 16x16 images -- verify
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                        num_channels=8,
+                        num_filters=16, stride=1,
+                        bias_attr=False,
+                        act=ReluActivation(),
+                        layer_type="exconv")  # explicitly request the "exconv" layer type
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation(),
+                       layer_type="exconv")
+
+concat = concat_layer(input=[conv1, conv2])
+
+conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                      num_channels=8,
+                      num_filters=16, stride=1,
+                      bias_attr=True,
+                      act=LinearActivation(),
+                      groups=2,  # grouped convolution with 2 groups
+                      layer_type="exconv")
+
+outputs(concat, conv)  # both the concatenated and the grouped conv outputs are network outputs
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <vector>
+#include <string>
+#include "paddle/gserver/layers/DataLayer.h"
+#include "ModelConfig.pb.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
+#include "paddle/gserver/layers/ExpandConvTransLayer.h"
+#include "paddle/math/MathUtils.h"
+
+#include "TestUtil.h"
+#include "LayerGradUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+P_DECLARE_bool(use_gpu);
+P_DECLARE_int32(gpu_id);
+P_DECLARE_double(checkgrad_eps);
+P_DECLARE_bool(thread_local_rand_use_global_seed);
+P_DECLARE_bool(prev_batch_state);
+
+// Run one forward pass of a conv layer ("cudnn_conv" if useGpu, else "exconv")
+// with the given input and weights (bias zeroed) and return the layer output.
+MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
+                    size_t padding, size_t filter_size, size_t channel,
+                    size_t numfilters, size_t groups, MatrixPtr& inputData,
+                    real* param, bool useGpu) {
+  TestConfig config;
+  config.biasSize = numfilters;
+  if (useGpu) {
+    config.layerConfig.set_type("cudnn_conv");
+  } else {
+    config.layerConfig.set_type("exconv");
+  }
+  config.layerConfig.set_num_filters(numfilters);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  size_t weightSize = channel* filter_size * filter_size *
+      config.layerConfig.num_filters() / groups;  // weight count is divided by groups
+  config.inputDefs.push_back({INPUT_DATA, "layer_0",
+                              imgSize * imgSize * channel,
+                              weightSize});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(filter_size);
+  conv->set_filter_size_y(filter_size);  // square filter: same size in both dimensions
+  conv->set_channels(channel);
+  conv->set_padding(padding);
+  conv->set_padding_y(padding);
+  conv->set_stride(stride);
+  conv->set_stride_y(stride);
+  conv->set_groups(groups);
+  conv->set_filter_channels(channel/groups);  // each group sees channel/groups input channels
+  conv->set_img_size(imgSize);
+  conv->set_output_x(output_x);
+
+  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+                              config.layerConfig.num_filters());
+  config.layerConfig.set_name("conv");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(config, &dataLayers, &datas, &layerMap, "conv",
+                1, false, useGpu);
+  dataLayers[0]->getOutputValue()->zeroMem();
+  dataLayers[0]->getOutputValue()->copyFrom(*inputData);  // feed the caller's input to the data layer
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap, &parameters, &convLayer);
+  convLayer->getBiasParameter()->zeroMem();  // bias is left at zero
+  convLayer->getParameters()[0]->zeroMem();
+  convLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->copyFrom(param,
+      weightSize);  // overwrite the weights with the caller's values
+  convLayer->forward(PASS_GC);
+
+  return convLayer->getOutputValue();
+}
+
+// Checks that the CPU ("exconv") and GPU ("cudnn_conv") convolution layers
+// produce the same output for identical input and weights, with groups == 1
+// and groups == 2. The buffers are declared as `real` rather than `float`
+// because they are handed to APIs taking `real*` (see doOneConvTest), so the
+// test stays valid if `real` is not `float`.
+TEST(Layer, convParaUnified) {
+  #ifndef PADDLE_ONLY_CPU
+    MatrixPtr input, resultCpu, resultGpu;
+
+    // Case 1: 1 channel, 4x4 image, two 3x3 filters, no groups.
+    input = Matrix::create(1, 4 * 4, false, false);
+    real inputData[] = {1, 2, 3, 4,
+                        5, 6, 7, 8,
+                        9, 10, 11, 12,
+                        13, 14, 15, 16};
+    real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9,
+                    9, 8, 7, 6, 5, 4, 3, 2, 1};
+
+    input->setData(inputData);
+
+    resultCpu = doOneConvTest(/* imgSize */ 4,
+                   /* output_x */ 2,
+                   /* stride */ 1,
+                   /* padding */ 0,
+                   /* filter_size */ 3,
+                   /*channel*/ 1,
+                   /*numfilters*/ 2,
+                   /*groups*/ 1,
+                   input, param, false);
+
+    resultGpu = doOneConvTest(/* imgSize */ 4,
+                       /* output_x */ 2,
+                       /* stride */ 1,
+                       /* padding */ 0,
+                       /* filter_size */ 3,
+                       /*channel*/ 1,
+                       /*numfilters*/ 2,
+                       /*groups*/ 1,
+                       input, param, true);
+    checkMatrixEqual(resultCpu, resultGpu);
+
+    // Case 2: 2 channels, 3x3 image, two 2x2 filters, no groups.
+    input = Matrix::create(1, 3 * 3 * 2, false, false);
+    real inputData2[] = {1, 2, 3,
+                         4, 5, 6,
+                         7, 8, 9,
+
+                         10, 11, 12,
+                         13, 14, 15,
+                         16, 17, 18};
+    real param2[] = {1, 2, 3, 4, 5, 6, 7, 8,
+                     8, 7, 6, 5, 4, 3, 2, 1};
+
+    input->setData(inputData2);
+
+    resultCpu = doOneConvTest(/* imgSize */ 3,
+                   /* output_x */ 2,
+                   /* stride */ 1,
+                   /* padding */ 0,
+                   /* filter_size */ 2,
+                   /*channel*/ 2,
+                   /*numfilters*/ 2,
+                   /*groups*/ 1,
+                   input, param2, false);
+
+    resultGpu = doOneConvTest(/* imgSize */ 3,
+                       /* output_x */ 2,
+                       /* stride */ 1,
+                       /* padding */ 0,
+                       /* filter_size */ 2,
+                       /*channel*/ 2,
+                       /*numfilters*/ 2,
+                       /*groups*/ 1,
+                       input, param2, true);
+    checkMatrixEqual(resultCpu, resultGpu);
+
+
+    // Case 3: same input as case 2 but with groups == 2, so the weight
+    // buffer holds half as many values (8 instead of 16).
+    real param3[] = {1, 2, 3, 4,
+                     4, 3, 2, 1};
+
+    resultCpu = doOneConvTest(/* imgSize */ 3,
+                   /* output_x */ 2,
+                   /* stride */ 1,
+                   /* padding */ 0,
+                   /* filter_size */ 2,
+                   /*channel*/ 2,
+                   /*numfilters*/ 2,
+                   /*groups*/ 2,
+                   input, param3, false);
+
+    resultGpu = doOneConvTest(/* imgSize */ 3,
+                       /* output_x */ 2,
+                       /* stride */ 1,
+                       /* padding */ 0,
+                       /* filter_size */ 2,
+                       /*channel*/ 2,
+                       /*numfilters*/ 2,
+                       /*groups*/ 2,
+                       input, param3, true);
+    checkMatrixEqual(resultCpu, resultGpu);
+  #endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);  // gtest parses and strips its own flags first
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;  // NOTE(review): presumably makes thread-local RNGs reproducible -- confirm
+  srand(1);  // fixed seed for rand()
+  return RUN_ALL_TESTS();
+}
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -166,9 +166,8 @@ TEST(Projection, scaling) {
  }
 }

-#ifndef PADDLE_ONLY_CPU
-TEST(Projection, conv) {
-  const int NUM_FILTERS = 16;
+void testProjectionConv(size_t groups) {
+  const int NUM_FILTERS = 18;
  const int FILTER_SIZE = 2;
  const int FILTER_SIZE_Y = 3;
  const int CHANNELS = 3;
@@ -186,7 +185,7 @@ TEST(Projection, conv) {
  conv->set_padding_y(1);
  conv->set_stride(2);
  conv->set_stride_y(2);
-  conv->set_groups(1);
+  conv->set_groups(groups);
  conv->set_filter_channels(conv->channels() / conv->groups());
  conv->set_img_size(IMAGE_SIZE);
  int output_x = outputSize(conv->img_size(),
@@ -206,13 +205,20 @@ TEST(Projection, conv) {
  testProjectionGrad(
      conf,
      INPUT_DATA,
-      /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y,
+      /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y
+                          / groups,
      /* batchSize */ 100,
      true,
      false,
      NUM_FILTERS,
      true);
 }
+
+#ifndef PADDLE_ONLY_CPU
+TEST(Projection, conv) {
+  testProjectionConv(1);
+  testProjectionConv(3);
+}
 #endif

 TEST(Layer, BilinearInterpLayer) {

--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -255,6 +255,16 @@ TEST(Compare, img_conv) {
  compareNetwork(config_file_a, config_file_b);
  FLAGS_use_gpu = useGpu;
 }
+
+// Test cudnn_conv and exconv give the same result
+TEST(Compare, img_conv2) {
+  std::string config_file_a = "./gserver/tests/img_conv_a.conf";
+  std::string config_file_b = "./gserver/tests/img_conv_c.conf";
+  bool useGpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = true;
+  compareNetwork(config_file_a, config_file_b);
+  FLAGS_use_gpu = useGpu;
+}
 #endif

 P_DEFINE_string(config_file_a, "", "config of one network to compare");

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -21,7 +21,7 @@ if [ ${WITH_GPU} == 'ON' ]; then
 fi

 cd ~
-git clone https://github.com/baidu/Paddle.git paddle
+git clone https://github.com/PaddlePaddle/Paddle.git paddle
 cd paddle
 git checkout ${GIT_CHECKOUT}
 mkdir build

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -698,7 +698,8 @@ class ConvProjection(Projection):
        ci = self.proj_conf.conv_conf.channels
        fh = self.proj_conf.conv_conf.filter_size
        fw = self.proj_conf.conv_conf.filter_size_y
-        return co * ci * fh * fw
+        gr = self.proj_conf.conv_conf.groups
+        return co * ci * fh * fw / gr

    def calc_bias_size(self):
        return self.proj_conf.num_filters

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -129,6 +129,9 @@ class LayerType(object):
    HSIGMOID = 'hsigmoid'
    CONV_LAYER = "conv"
    CONVTRANS_LAYER = "convt"
+    EXCONV_LAYER = "exconv"
+    EXCONVTRANS_LAYER = "exconvt"
+    CUDNNCONV_LAYER = "cudnn_conv"
    POOL_LAYER = "pool"
    BATCH_NORM_LAYER = 'batch_norm'
    NORM_LAYER = 'norm'
@@ -1762,7 +1765,8 @@ def img_conv_layer(input,
                   filter_size_y=None,
                   stride_y=None,
                   padding_y=None,
-                   trans=False):
+                   trans=False,
+                   layer_type=None):
    """
    Convolution layer for image. Paddle only support square input currently and
    thus input image's width equals height.
@@ -1829,6 +1833,10 @@ def img_conv_layer(input,
    :type layer_attr: ExtraLayerAttribute
    :param trans: true if it is a convTransLayer, false if it is a convLayer
    :type trans: bool
+    :param layer_type: specify the layer_type, default is None. If trans=True,
+                       layer_type has to be "exconvt", otherwise layer_type 
+                       has to be either "exconv" or "cudnn_conv"
+    :type layer_type: String
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -1864,8 +1872,15 @@ def img_conv_layer(input,
        param_attr.attr["initial_std"] = init_w
        param_attr.attr["initial_strategy"] = 0
        param_attr.attr["initial_smart"] = False
-
-    lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
+    
+    if layer_type:
+        if trans:
+            assert layer_type in ["exconvt"]
+        else:
+            assert layer_type in ["exconv", "cudnn_conv"]
+        lt = layer_type
+    else:
+        lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER

    l = Layer(
        name=name,