Merge branch 'develop' of github.com:PaddlePaddle/Paddle into enforce_add_more_helpers

992c04db · Superjom · 5201b911 · a28478fc · 992c04db · 992c04db
30 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
    endif()
    # cpplint code style
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+    foreach(source_file ${cc_library_SRCS})
+      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+      endif()
+    endforeach()
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
  else(cc_library_SRCS)
    if (cc_library_DEPS)
@@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
      endif()
+      # cpplint code style: lint each source together with its same-named header
+      foreach(source_file ${nv_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
    else(nv_library_SRCS)
      if (nv_library_DEPS)
        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})

--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -25,18 +25,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-namespace {
-typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
-                       Dim<8>, Dim<9>>
-    DDimVar;
-}
 /**
 * \brief A dynamically sized dimension.
 *
 * The number of dimensions must be between [1, 9].
 */
 struct DDim {
+  typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
+                         Dim<8>, Dim<9>>
+      DDimVar;
  DDimVar var;
  DDim() : var(Dim<1>()) {}

--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #pragma once
 #include "paddle/framework/operator.h"

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -314,7 +314,7 @@ class OpRegistry {
  static std::unordered_map<std::string, OpProto>& protos() {
    static std::unordered_map<std::string, OpProto> protos_;
    return protos_;
-  };
+  }
  static std::unordered_map<std::string, std::string>& grad_ops() {
    static std::unordered_map<std::string, std::string> grad_ops_;
@@ -336,7 +336,7 @@ class OpRegistry {
  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
    return op_checkers_;
-  };
+  }
  static void GenerateTempVariableName(OperatorBase* op) {
    static std::atomic<size_t> gUniqId(0UL);
@@ -353,7 +353,7 @@ class OpRegistry {
 template <typename OpType, typename ProtoMakerType>
 class OpRegisterHelper {
 public:
-  OpRegisterHelper(const char* op_type) {
+  explicit OpRegisterHelper(const char* op_type) {
    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
  }
 };

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -284,7 +284,7 @@ class OperatorWithKernel : public OperatorBase {
    platform::Place place_;
    OpKernelKey() = default;
-    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+    explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
      place_ = dev_ctx.GetPlace();
    }

--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -105,7 +105,16 @@ PYBIND11_PLUGIN(core) {
      .def("set", PyCUDATensorSetFromArray<float>)
      .def("set", PyCUDATensorSetFromArray<int>)
 #endif
-      .def("shape", [](Tensor &self) { return vectorize(self.dims()); });
+      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
+      .def("set_float_element",
+           [](Tensor &self, size_t offset, float f) {
+             // TODO(yuyang18): Only support CPU now.
+             self.data<float>()[offset] = f;
+           })
+      .def("get_float_element", [](Tensor &self, size_t offset) -> float {
+        // TODO(yuyang18): Only support CPU now.
+        return self.data<float>()[offset];
+      });
  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -967,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() {
  size_t numSequences = getGenBatchSize();
  resizeBootFrame(numSequences);
-  // We create only two sub-network in generation for alternate use.
+  // We create only two sub-networks in generation: one stores the states of
-  // Thus, we can reduce total memory of output_ in layer forward.
+  // all layers at the previous time step and the other stores the states at
+  // the current time step.
  resizeOrCreateFrames(2);
  // outFrameLines_.size() > 1UL
@@ -1001,10 +1002,9 @@ void RecurrentGradientMachine::generateSequence() {
  // init outArg
  size_t resultNum = generator_.config.num_results_per_sample();
-  IVector::resizeOrCreate(
+  size_t maxGenWordCount =
-      generator_.outArg.ids,
+      generator_.config.max_num_frames() * numSequences * resultNum;
-      generator_.config.max_num_frames() * numSequences * resultNum,
+  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
-      false);
  if (resultNum > 1) {
    CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
    Matrix::resizeOrCreate(generator_.outArg.in,
@@ -1012,6 +1012,11 @@ void RecurrentGradientMachine::generateSequence() {
                           /* width */ resultNum,
                           false,
                           /* useGpu */ false);
+    Matrix::resizeOrCreate(generator_.outArg.value,
+                           /* height */ maxGenWordCount,
+                           /* width */ 1,
+                           false,
+                           /* useGpu */ false);
  }
  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                numSequences + 1,
@@ -1313,13 +1318,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
  starts[0] = 0;
  if (numResults > 1) {
    real* probs = generator_.outArg.in->getData();
+    real* idsProb = generator_.outArg.value->getData();
+    size_t curPos = 0;
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
        Path& path = finalPaths_[i][j];
-        generator_.ids.push_back(path.ids.size());  // sequence size
+        size_t genLen = path.ids.size();
+        generator_.ids.push_back(genLen);  // sequence size
        generator_.ids.insert(
            generator_.ids.end(), path.ids.begin(), path.ids.end());
        generator_.ids.push_back(-1);  // end of sequence
+        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
+        curPos += genLen;
+        idsProb[curPos++] = -1.0;
        probs[i * numResults + j] = path.logProb;
        if (!j && dataArgsSize_) {

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -189,6 +189,11 @@ public:
     */
    std::vector<int> ids;
+    /**
+     * @brief idsProb, log probability of each generated word.
+     */
+    std::vector<real> idsProb;
    /**
     * @brief logProb, current probability of path.
     */
@@ -228,11 +233,13 @@ public:
     */
    Path(Path& old, int newId, real logProb, int machineId, int topIndex)
        : ids(old.ids),
+          idsProb(old.idsProb),
          logProb(old.logProb + logProb),
          machineId(machineId),
          topIndex(topIndex),
          seqId(old.seqId) {
      ids.push_back(newId);
+      idsProb.push_back(logProb);
      if (!old.probHistory.empty()) {
        this->probHistory = old.probHistory;
        // probHistory store current prob, not sum
@@ -411,8 +418,9 @@ protected:
  struct Generator {
    GeneratorConfig config;
-    std::vector<int> ids;  // store generated sequences
+    std::vector<int> ids;       // store generated sequences
-    Argument outArg;       // final output argument
+    std::vector<real> idsProb;  // log probability of each generated word
+    Argument outArg;            // final output argument
  };
  bool generating_;
  Generator generator_;

--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -39,7 +39,7 @@ class BuddyAllocator {
 public:
  void* Alloc(size_t unaligned_size);
-  void Free(void*);
+  void Free(void* ptr);
  size_t Used();
 public:

--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
@@ -33,17 +33,17 @@ namespace detail {
 */
 class MetadataCache {
 public:
-  MetadataCache(bool uses_gpu);
+  explicit MetadataCache(bool uses_gpu);
 public:
  /*! \brief Load the associated metadata for the specified memory block. */
-  Metadata load(const MemoryBlock*);
+  Metadata load(const MemoryBlock* memory_block);
  /*! \brief Store the associated metadata for the specified memory block. */
-  void store(MemoryBlock*, const Metadata&);
+  void store(MemoryBlock* memory_block, const Metadata& meta_data);
  /*! \brief Indicate that the specified metadata will no longer be used. */
-  void invalidate(MemoryBlock*);
+  void invalidate(MemoryBlock* memory_block);
 public:
  MetadataCache(const MetadataCache&) = delete;

--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -68,7 +68,7 @@ class PODDeleter {
  static_assert(std::is_pod<T>::value, "T must be POD");
 public:
-  PODDeleter(Place place) : place_(place) {}
+  explicit PODDeleter(Place place) : place_(place) {}
  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
 private:

--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"

--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
 REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
                       ops::OnehotCrossEntropyOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_zeros_like_op.h"
 REGISTER_OP_GPU_KERNEL(
    fill_zeros_like,
    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/mean_op.h"
 REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<ops::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -15,4 +15,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {
-using namespace paddle::framework;
+using namespace paddle::framework;  // NOLINT
 namespace rnn {
@@ -94,7 +94,7 @@ void InitArgument(const ArgumentName& name, Argument* arg);
 };  // namespace rnn
 // The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
-// TODO:
+// TODO(Yan Chunwei):
 // 1. No-padding computing for sequences with indefinite length in one batch.
 // 2. Hierarchical RNN for sequence with sub-sequence.
 // 3. Internal Memory.
@@ -172,12 +172,10 @@ public:
  /**
   * InferShape must be called before Run.
   */
-  virtual void InferShape(const Scope& scope) const override {
+  void InferShape(const Scope& scope) const override { alg_.InferShape(scope); }
-    alg_.InferShape(scope);
-  }
-  virtual void Run(const Scope& scope,
+  void Run(const Scope& scope,
-                   const platform::DeviceContext& dev_ctx) const override {
+           const platform::DeviceContext& dev_ctx) const override {
    alg_.Run(scope, dev_ctx);
  }
@@ -194,12 +192,10 @@ public:
  /**
   * InferShape must be called before Run.
   */
-  virtual void InferShape(const Scope& scope) const override {
+  void InferShape(const Scope& scope) const override { alg_.InferShape(scope); }
-    alg_.InferShape(scope);
-  }
-  virtual void Run(const Scope& scope,
+  void Run(const Scope& scope,
-                   const platform::DeviceContext& dev_ctx) const override {
+           const platform::DeviceContext& dev_ctx) const override {
    alg_.Run(scope, dev_ctx);
  }

--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"

--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
 REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"

--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
 REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(softmax_grad, ops::SoftmaxGradKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(softmax_grad,
+                       ops::SoftmaxGradKernel<ops::GPUPlace, float>);
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -40,7 +40,7 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {
 public:
  CPUDeviceContext();
-  CPUDeviceContext(CPUPlace);
+  explicit CPUDeviceContext(CPUPlace);
  virtual ~CPUDeviceContext() {}
  Eigen::DefaultDevice* eigen_device() const;
@@ -55,7 +55,7 @@ class CPUDeviceContext : public DeviceContext {
 class CUDADeviceContext : public DeviceContext {
 public:
-  explicit CUDADeviceContext(GPUPlace);
+  CUDADeviceContext(GPUPlace);  // NOLINT
  virtual ~CUDADeviceContext();
  /*! \brief  Wait for all operations completion in the stream. */
@@ -69,10 +69,10 @@ class CUDADeviceContext : public DeviceContext {
  // clang-format off
  /*! \brief  Return cublas handle in the device context. */
-  cublasHandle_t    cublas_handle   ();
+  cublasHandle_t    cublas_handle();
  /*! \brief  Return cudnn  handle in the device context. */
-  cudnnHandle_t     cudnn_handle    ();
+  cudnnHandle_t     cudnn_handle();
  /*! \brief  Return curand handle in the device context. */
  curandGenerator_t curand_generator();

--- a/paddle/platform/dynload/cublas.cc
+++ b/paddle/platform/dynload/cublas.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <paddle/platform/dynload/cublas.h>
 namespace paddle {

--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <paddle/platform/dynload/cudnn.h>
 namespace paddle {
@@ -25,4 +39,4 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
\ No newline at end of file
--- a/paddle/platform/dynload/curand.cc
+++ b/paddle/platform/dynload/curand.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <paddle/platform/dynload/curand.h>
 namespace paddle {
@@ -10,6 +24,7 @@ void *curand_dso_handle;
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
-}
-}
+}  // namespace dynload
-}
+}  // namespace platform
\ No newline at end of file
+}  // namespace paddle
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -32,7 +32,7 @@ struct CPUPlace {
 struct GPUPlace {
  GPUPlace() : GPUPlace(0) {}
-  GPUPlace(int d) : device(d) {}
+  GPUPlace(int d) : device(d) {}  // NOLINT
  // needed for variant equality comparison
  inline bool operator==(const GPUPlace &o) const { return device == o.device; }

--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -39,8 +39,8 @@ public:
  // size_ is 0.
  Piece();
  Piece(const char* d, size_t n);
-  Piece(const char* d);
+  Piece(const char* d);         // NOLINT
-  Piece(const std::string& s);
+  Piece(const std::string& s);  // NOLINT
  const char* data() const { return data_; }
  size_t len() const { return size_; }

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,4 +13,5 @@ add_python_test(test_framework
    test_sigmoid_op.py
    test_softmax_op.py
    test_rowwise_add_op.py
-    test_network.py)
+    test_network.py
+    gradient_checker.py)
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
+import paddle.v2.framework.core as core
+from paddle.v2.framework.create_op_creation_methods import op_creations
+import numpy
+import unittest
+__all__ = ['get_numeric_gradient']
+def get_numeric_gradient(op,
+                         input_values,
+                         output_name,
+                         input_to_check,
+                         delta=1e-2,
+                         local_scope=None):
+    """
+    Get Numeric Gradient for an operator's input.
+    :param op: C++ operator instance; it may also be a network.
+    :param input_values: The input variables. Should be a dictionary whose
+    keys are variable names and whose values are numpy arrays.
+    :param output_name: The name of the final output variable.
+    :param input_to_check: The input variable whose gradient is computed.
+    :param delta: The perturbation value for the numeric gradient method. The
+    smaller delta is, the more accurate the result will be. But if delta is
+    too small, numerical stability problems can occur.
+    :param local_scope: The local scope used for get_numeric_gradient.
+    :return: The gradient array in numpy format.
+    """
+    if local_scope is None:
+        local_scope = core.Scope()
+    # Create all input variables in local_scope
+    for var_name in input_values:
+        var = local_scope.new_var(var_name)
+        tensor = var.get_tensor()
+        tensor.set_dims(input_values[var_name].shape)
+        tensor.alloc_float(core.CPUPlace())
+        tensor.set(input_values[var_name], core.CPUPlace())
+    # Create all output variables in local_scope
+    for output in op.outputs():
+        if local_scope.find_var(output) is None:
+            local_scope.new_var(output).get_tensor()
+    op.infer_shape(local_scope)
+    # allocate output memory
+    for output in op.outputs():
+        local_scope.find_var(output).get_tensor().alloc_float(core.CPUPlace())
+    # TODO(yuyang18): Only CPU is supported now.
+    cpu_ctx = core.DeviceContext.create(core.CPUPlace())
+    def get_output():
+        op.run(local_scope, cpu_ctx)
+        return numpy.array(local_scope.find_var(output_name).get_tensor()).sum()
+    def product(dim):
+        return reduce(lambda a, b: a * b, dim, 1)
+    tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
+    tensor_size = product(tensor_to_check.get_dims())
+    gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32')
+    for i in xrange(tensor_size):
+        origin = tensor_to_check.get_float_element(i)
+        x_pos = origin + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+        x_neg = origin - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+        tensor_to_check.set_float_element(i, origin)  # restore old value
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+if __name__ == '__main__':
+    class GetNumericGradientTest(unittest.TestCase):
+        def test_add_op(self):
+            add_op = op_creations.add_two(X="X", Y="Y", Out="Z")
+            x = numpy.random.random((10, 1)).astype("float32")
+            y = numpy.random.random((10, 1)).astype("float32")
+            arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
+            self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2)
+    unittest.main()