merge baidu/develop

331b304a · qijun · 52b52ba8 · d2e94346 · 331b304a · 331b304a
26 changed file
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -50,5 +50,6 @@ cc_library(paddle_pybind SHARED
    cross_entropy_op
    recurrent_op
    uniform_random_op
+    gaussian_random_op
    fill_zeros_like_op)
 endif(WITH_PYTHON)
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -133,8 +133,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
    for (std::string& grad_input : grad_op->inputs_) {
      if (no_grad_names.count(grad_input)) {
-        std::string prefix =
-            grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
+        // +1 for \0
+        std::string prefix = grad_input.substr(
+            0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
        grad_input = prefix + kZeroVarSuffix;

        // If part of input gradient of that operator is not calculated, fill
@@ -167,7 +168,7 @@ std::shared_ptr<OperatorBase> Backward(
  std::unordered_set<std::string> no_grad_names;
  no_grad_names.reserve(no_grad_vars.size());

-  no_grad_names.insert(kEmptyVarName + kGradVarSuffix);
+  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);

  for (auto& name : no_grad_vars) {
    no_grad_names.insert(name + kGradVarSuffix);

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -171,10 +171,10 @@ TEST(Backward, simple_op_grad) {
  ASSERT_EQ(4UL, gop->inputs_.size());
  ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]);
  ASSERT_EQ("rowwise_add_grad", gop->type_);
-  ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]);
-  ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]);
+  ASSERT_EQ(f::GradVarName("X"), gop->outputs_[0]);
+  ASSERT_EQ(f::GradVarName("b"), gop->outputs_[1]);

-  ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix));
+  ASSERT_EQ(f::GradVarName("X"), gop->Output(f::GradVarName("X")));
 }

 TEST(Backward, simple_op_not_need_grad) {
@@ -182,7 +182,7 @@ TEST(Backward, simple_op_not_need_grad) {
  ASSERT_NE(fwd, nullptr);
  auto gop = f::Backward(*fwd, {"X"});
  ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
-                      "X" + f::kGradVarSuffix),
+                      f::GradVarName("X")),
            gop->outputs_.end());

  auto no_input_gop = f::Backward(*fwd, {"X", "b"});
@@ -250,18 +250,18 @@ TEST(Backward, net_input_of_network_not_need_grad) {
  all_output.erase(f::kEmptyVarName);

  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
-    ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end());
+    ASSERT_NE(all_output.find(f::GradVarName(out)), all_output.end());
  }

  // Not Generated X
-  ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end());
+  ASSERT_EQ(all_output.find(f::GradVarName("X")), all_output.end());

  ASSERT_EQ(2UL, bwd_net->ops_.size());
  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
  ASSERT_EQ(f::kEmptyVarName,
-            first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix));
+            first_fc_grad->ops_[2]->Output(f::GradVarName("A")));
 }

 TEST(Backward, net_shared_weight) {
@@ -313,15 +313,15 @@ TEST(Backward, op_part_of_output_are_not_need) {
  ASSERT_EQ(1UL, fill_zero.inputs_.size());
  ASSERT_EQ("Z", fill_zero.inputs_[0]);
  ASSERT_EQ(1UL, fill_zero.outputs_.size());
-  ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]);
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.outputs_[0]);

  auto &d_many_out = *net->ops_[1];
  ASSERT_EQ("many_output_op_grad", d_many_out.type_);
  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size());  // I/O/OG
-  ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix));
-  ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix));
-  ASSERT_EQ("X" + f::kGradVarSuffix,
-            d_many_out.Output("x" + f::kGradVarSuffix));
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
+            d_many_out.Input(f::GradVarName("z")));
+  ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
+  ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
 }

 TEST(Backward, op_part_of_input_are_not_need) {
@@ -331,10 +331,9 @@ TEST(Backward, op_part_of_input_are_not_need) {
  ASSERT_EQ(grad_mul.type_, "mul_grad");
  ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
  ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
-  ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName);
-  ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix);
-  ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix),
-            "out" + f::kGradVarSuffix);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("A")), f::kEmptyVarName);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("B")), f::GradVarName("b"));
+  ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
  ASSERT_EQ(grad_mul.Input("A"), "a");
  ASSERT_EQ(grad_mul.Input("B"), "b");
  ASSERT_EQ(grad_mul.Input("Out"), "out");

--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -83,21 +83,19 @@ TEST(GradOpBuilder, MutiInOut) {
  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix),
-            "out1" + f::kGradVarSuffix);
-  EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
+            f::GradVarName("out1"));
+  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
            std::vector<std::string>(
-                {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix}));
+                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));

  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
-  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
-            "in1" + f::kGradVarSuffix);
-  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
-            std::vector<std::string>({"in2_1" + f::kGradVarSuffix,
-                                      "in2_2" + f::kGradVarSuffix,
-                                      "in2_3" + f::kGradVarSuffix}));
-  EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix),
-            "in3" + f::kGradVarSuffix);
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
+            std::vector<std::string>({f::GradVarName("in2_1"),
+                                      f::GradVarName("in2_2"),
+                                      f::GradVarName("in2_3")}));
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
 }

 TEST(GradOpBuilder, IOIgnoredInGradient) {
@@ -119,19 +117,18 @@ TEST(GradOpBuilder, IOIgnoredInGradient) {
  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
            std::vector<std::string>({"out1_1", "out1_2"}));
  EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName);
-  EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
            std::vector<std::string>(
-                {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix}));
-  EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix),
-            "out2" + f::kGradVarSuffix);
+                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
+  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
+            f::GradVarName("out2"));

  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
-  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
-            "in1" + f::kGradVarSuffix);
-  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
            std::vector<std::string>(
-                {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix}));
-  EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix),
+                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
            std::vector<std::string>(
-                {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix}));
+                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
 }
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -33,19 +33,19 @@ namespace paddle {
 namespace framework {

 /// If a variable is a empty variable, that name will be used.
-const std::string kEmptyVarName = "@EMPTY@";
+constexpr char kEmptyVarName[] = "@EMPTY@";

 /// If a variable is a temporary variable, that name will be set in Python,
 /// but it will be convert to a unique name in scope after OpCreator.
-const std::string kTempVarName = "@TEMP@";
+constexpr char kTempVarName[] = "@TEMP@";

 /// If a variable's name has a certain suffix, it means that the
 /// variable is the gradient of another varibale.
 /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
-const std::string kGradVarSuffix = "@GRAD";
+constexpr char kGradVarSuffix[] = "@GRAD";

 /// Variables with this suffix are supposed to be filled up with zeros.
-const std::string kZeroVarSuffix = "@ZERO";
+constexpr char kZeroVarSuffix[] = "@ZERO";

 inline std::string GradVarName(const std::string& var_name) {
  return var_name + kGradVarSuffix;

--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/string/to_string.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
@@ -39,7 +40,9 @@ USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP(fill_zeros_like);
 USE_OP_WITHOUT_KERNEL(recurrent_op);
+USE_OP(gaussian_random);
 USE_OP(uniform_random);
+
 namespace paddle {
 namespace framework {

@@ -205,9 +208,13 @@ All parameter, weight, gradient are variables in Paddle.
                  });
  // clang-format on

-  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+  py::class_<platform::GPUPlace>(m, "GPUPlace")
+      .def(py::init<int>())
+      .def("__str__", string::to_string<const platform::GPUPlace &>);

-  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
+      .def(py::init<>())
+      .def("__str__", string::to_string<const platform::CPUPlace &>);

  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
      m, "Operator");

--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -96,6 +96,11 @@ TEST(Layer, kmaxSeqScoreLayer) {
  MatrixPtr inValue =
      Matrix::create(subSeqStartPosition.back(), 1, false, false);

+  std::vector<bool> mode = {false};
+#ifndef PADDLE_ONLY_CPU
+  mode.push_back(true);
+#endif
+
  for (auto hasSubseq : {false, true}) {
    vector<vector<int>> groundTruth;
    inValue->randomizeUniform();
@@ -104,7 +109,7 @@ TEST(Layer, kmaxSeqScoreLayer) {
                         hasSubseq ? subSeqStartPosition : seqStartPosition,
                         beamSize);

-    for (auto useGpu : {false, true}) {
+    for (auto useGpu : mode) {
      TestConfig config;
      config.layerConfig.set_type("kmax_seq_score");
      config.layerConfig.set_beam_size(beamSize);

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -42,26 +42,25 @@ function(op_library TARGET)
 endfunction()

 add_subdirectory(math)
+cc_test(gather_test SRCS gather_test.cc DEPS tensor)

 cc_library(net_op SRCS net_op.cc DEPS op_registry)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)

 op_library(add_op SRCS add_op.cc add_op.cu)
-cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)

 op_library(mean_op SRCS mean_op.cc mean_op.cu)
-cc_test(mean_op_test SRCS mean_op_test.cc DEPS mean_op)

 op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)

 op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
+op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu)
 op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
 op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)

 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
-cc_test(sgd_op_test SRCS sgd_op_test.cc DEPS sgd_op)

 op_library(fc_op
    SRCS fc_op.cc

--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#define private public
-#include "paddle/framework/op_registry.h"
-
-USE_OP(add_two);
-
-TEST(AddOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("add_two");
-  ASSERT_NE(it, protos.end());
-  auto& op_creators = paddle::framework::OpRegistry::op_creators();
-  auto it1 = op_creators.find("add_two_grad");
-  ASSERT_NE(it1, op_creators.end());
-}
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory.h>
+#include <cstring>
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+// Implementation of CPU copy
+template <typename T>
+void CPUGather(const T* params, const int* indices, const int slice_size,
+               const int index_size, T* output) {
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (size_t i = 0; i < index_size; ++i) {
+    int index_ = indices[i];
+    memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
+  }
+}
+
+// Implementation of GPU copy:
+template <typename T>
+void GPUGather(const T* src, const int* index, const int slice_size,
+               const int index_size, T* output);
+
+/**
+ * Return a new tensor from source tensor, gathered according to index
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
+            const paddle::framework::Tensor* index,
+            paddle::framework::Tensor* output) {
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index->dims().size() == 1);
+  int index_size = index->dims()[0];
+
+  auto src_dims = src->dims();
+  paddle::framework::DDim output_dims(src_dims);
+  output_dims[0] = index_size;
+
+  // slice size
+  int slice_size = 1;
+  for (size_t i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  // Gathering
+  if (platform::is_cpu_place(place)) {
+    CPUGather<T>(src->data<T>(), index->data<int>(), slice_size, index_size,
+                 output->data<T>());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/mean_op_test.cc
+++ b/paddle/operators/mean_op_test.cc
@@ -12,14 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/operators/gather.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
 #include <gtest/gtest.h>
+#include <iostream>
+#include <string>
+
+TEST(Gather, GatherData) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators;
+
+  Tensor* src = new Tensor();
+  Tensor* index = new Tensor();
+  Tensor* output = new Tensor();
+
+  int* p_src = nullptr;
+  int* p_index = nullptr;
+  p_src = src->mutable_data<int>(make_ddim({3, 4}), CPUPlace());
+  p_index = index->mutable_data<int>(make_ddim({2}), CPUPlace());
+
+  for (size_t i = 0; i < 12; ++i) p_src[i] = i;
+  p_index[0] = 1;
+  p_index[1] = 0;

-#include <paddle/framework/op_registry.h>
+  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());

-USE_OP(mean);
+  Gather<int>(CPUPlace(), src, index, output);

-TEST(MeanOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("mean");
-  ASSERT_NE(it, protos.end());
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
+  for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
 }
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <random>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class GaussianRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.op_.GetAttr<float>("mean");
+    float std = context.op_.GetAttr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    // TODO(dzh): attribute does not support unsigned int.
+    // And we need a global random seed configuration.
+    int seed = context.op_.GetAttr<int>("seed");
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    std::mt19937 g(seed);
+    std::normal_distribution<T> distribution(mean, std);
+    ssize_t size = framework::product(tensor->dims());
+    for (int i = 0; i < size; ++i) {
+      data[i] = distribution(g);
+    }
+  }
+};
+
+class GaussianRandomOp : public framework::OperatorWithKernel {
+ protected:
+  void InferShape(const framework::InferShapeContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    auto dims = GetAttr<std::vector<int>>("dims");
+    PADDLE_ENFORCE(dims.size() > 0UL,
+                   "dims can be one int or array. dims must be set.");
+    tensor->Resize(framework::make_ddim(dims));
+  }
+};
+
+class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GaussianRandomOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "output matrix of random op");
+    AddComment(R"DOC(
+GaussianRandom operator.
+Use to initialize tensor with gaussian random generator.
+)DOC");
+
+    AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
+    AddAttr<float>("mean", "mean value of random.").SetDefault(.0f);
+    AddAttr<float>("std", "minimum value of random value.").SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "Random seed of generator."
+                 "0 means use system wide seed")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <memory>
+#include <random>
+#include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/gpu_info.h"
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class GaussianRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.op_.GetAttr<float>("mean");
+    float std = context.op_.GetAttr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    int seed = context.op_.GetAttr<int>("seed");
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    curandGenerator_t g;
+    PADDLE_ENFORCE(platform::dynload::curandCreateGenerator(
+        &g, CURAND_RNG_PSEUDO_DEFAULT));
+    PADDLE_ENFORCE(
+        platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed));
+    curandGenerateNormal(g, data, framework::product(tensor->dims()), mean,
+                         std);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
\ No newline at end of file
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -73,7 +73,7 @@ void matmul<platform::CPUPlace, float>(const framework::Tensor& in1, bool in1_T,
  int K = in1_dim[1];

  CBLAS_TRANSPOSE in1_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE in2_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE in2_Trans = (in2_T == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::CPUPlace, float>(in1_Trans, in2_Trans, M, N, K, alpha,
                                  in1.data<float>(), in2.data<float>(), beta,
@@ -106,7 +106,7 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& in1,
  int N = out_dim[1];
  int K = in1_dim[1];
  CBLAS_TRANSPOSE in1_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE in2_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE in2_Trans = (in2_T == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::CPUPlace, double>(in1_Trans, in2_Trans, M, N, K, alpha,
                                   in1.data<double>(), in2.data<double>(), beta,

--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -85,7 +85,7 @@ void matmul<platform::GPUPlace, float>(const framework::Tensor& in1, bool in1_T,
  int K = in1_dim[1];

  CBLAS_TRANSPOSE in1_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE in2_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE in2_Trans = (in2_T == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::GPUPlace, float>(in1_Trans, in2_Trans, M, N, K, alpha,
                                  in1.data<float>(), in2.data<float>(), beta,
@@ -118,7 +118,7 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& in1,
  int N = out_dim[1];
  int K = in1_dim[1];
  CBLAS_TRANSPOSE in1_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE in2_Trans = (in1_T == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE in2_Trans = (in2_T == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::GPUPlace, double>(in1_Trans, in2_Trans, M, N, K, alpha,
                                   in1.data<double>(), in2.data<double>(), beta,

--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -41,7 +41,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 class MeanGradOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<Tensor>("X" + framework::kGradVarSuffix)
+    ctx.Output<Tensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };

--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -48,10 +48,10 @@ template <typename Place, typename T>
 class MeanGradKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto OG = context.Input<Tensor>("Out" + framework::kGradVarSuffix);
+    auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
    PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
                   "Mean Gradient should be scalar");
-    auto IG = context.Output<Tensor>("X" + framework::kGradVarSuffix);
+    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
    IG->mutable_data<T>(context.GetPlace());

    T ig_size = (T)framework::product(IG->dims());

--- a/paddle/operators/sgd_op_test.cc
+++ b/paddle/operators/sgd_op_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/framework/op_registry.h>
-USE_OP(sgd);
-TEST(SGDOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("sgd");
-  ASSERT_NE(it, protos.end());
-}
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -15,11 +15,12 @@ limitations under the License. */
 #pragma once

 #include <execinfo.h>
-#include <paddle/string/printf.h>
 #include <iomanip>
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include "paddle/string/printf.h"
+#include "paddle/string/to_string.h"

 #ifndef PADDLE_ONLY_CPU

@@ -194,8 +195,8 @@ inline void throw_on_error(T e) {
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)        \
  PADDLE_ENFORCE(__VAL0 __CMP __VAL1,                                         \
                 "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \
-                 #__VAL0, #__VAL1, std::to_string(__VAL0),                    \
-                 std::to_string(__VAL1),                                      \
+                 #__VAL0, #__VAL1, paddle::string::to_string(__VAL0),         \
+                 paddle::string::to_string(__VAL1),                           \
                 paddle::string::Sprintf("" __VA_ARGS__));

 }  // namespace platform

--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <array>
+#include <iostream>
 #include <memory>

 #include "gtest/gtest.h"
@@ -83,7 +85,7 @@ TEST(ENFORCE_NE, FAIL) {
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
-                          "enforce 1.0 != 1UL failed, 1.000000 == 1"))
+                          "enforce 1.0 != 1UL failed, 1 == 1"))
        << error.what() << " does not have expected prefix";
  }
  EXPECT_TRUE(caught_exception);
@@ -176,3 +178,39 @@ TEST(ENFORCE_NOT_NULL, FAIL) {
  }
  EXPECT_TRUE(caught_exception);
 }
+
+struct Dims {
+  size_t dims_[4];
+
+  bool operator==(const Dims& o) const {
+    for (size_t i = 0; i < 4; ++i) {
+      if (dims_[i] != o.dims_[i]) return false;
+    }
+    return true;
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const Dims& d) {
+  for (size_t i = 0; i < 4; ++i) {
+    if (i == 0) {
+      os << "[";
+    }
+    os << d.dims_[i];
+    if (i == 4 - 1) {
+      os << "]";
+    } else {
+      os << ", ";
+    }
+  }
+  return os;
+}
+
+TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
+  Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}};
+  PADDLE_ENFORCE_EQ(a, b);
+}
+
+TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
+  Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
+  ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
+}
\ No newline at end of file
--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -2,3 +2,4 @@ cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)

 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
+cc_test(to_string_test SRCS to_string_test.cc)
--- a/paddle/string/to_string.h
+++ b/paddle/string/to_string.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <sstream>
+#include <string>
+
+namespace paddle {
+namespace string {
+template <typename T>
+inline std::string to_string(T v) {
+  std::ostringstream sout;
+  sout << v;
+  return sout.str();
+}
+
+// Faster std::string/const char* type
+template <>
+inline std::string to_string(std::string v) {
+  return v;
+}
+
+template <>
+inline std::string to_string(const char* v) {
+  return std::string(v);
+}
+
+}  // namespace string
+}  // namespace paddle
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/string/to_string.h"
+#include <gtest/gtest.h>
+
+constexpr char kOutputString[] = "User Defined Output";
+class UserDefinedClass {
+public:
+};
+
+std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
+  s << kOutputString;
+  return s;
+}
+
+TEST(to_string, normal) {
+  using namespace paddle::string;
+  ASSERT_EQ("10", to_string(10));
+  ASSERT_EQ("abc", to_string("abc"));
+  ASSERT_EQ("1.2", to_string(1.2));
+}
+
+TEST(to_string, user_defined) {
+  using namespace paddle::string;
+  UserDefinedClass instance;
+  ASSERT_EQ(kOutputString, to_string(instance));
+}
\ No newline at end of file
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -21,5 +21,8 @@ py_test(gradient_checker SRCS gradient_checker.py)
 py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)

 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
+
 py_test(test_operator SRCS test_operator.py)
+
+py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
 py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -92,15 +92,27 @@ def get_numeric_gradient(op,


 class GradientChecker(unittest.TestCase):
-    def __is_close(self, numeric_grads, scope, max_relative_error):
+    def assert_is_close(self, numeric_grads, scope, max_relative_error,
+                        msg_prefix):
        for name in numeric_grads:
-            op_grad = numpy.array(
-                scope.find_var(grad_var_name(name)).get_tensor())
-            is_close = numpy.allclose(
-                numeric_grads[name], op_grad, rtol=max_relative_error, atol=100)
-            if not is_close:
-                return False
-        return True
+            b = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+            a = numeric_grads[name]
+
+            abs_a = numpy.abs(a)
+            # if abs_a is nearly zero, then use abs error for a, not relative
+            # error.
+            abs_a[abs_a < 1e-3] = 1
+
+            diff_mat = numpy.abs(a - b) / abs_a
+            max_diff = numpy.max(diff_mat)
+
+            def err_msg():
+                offset = numpy.argmax(diff_mat > max_relative_error)
+                return "%s Variable %s max gradient diff %f over limit %f, the first " \
+                       "error element is %d" % (
+                       msg_prefix, name, max_diff, max_relative_error, offset)
+
+            self.assertLessEqual(max_diff, max_relative_error, err_msg())

    def check_grad(self,
                   forward_op,
@@ -145,7 +157,8 @@ class GradientChecker(unittest.TestCase):
        # get numeric gradient
        for check_name in inputs_to_check:
            numeric_grad[check_name] = \
-                get_numeric_gradient(forward_op, input_vars, output_name, check_name)
+                get_numeric_gradient(forward_op, input_vars, output_name,
+                                     check_name)

        # get operator gradient according to different device
        for place in places:
@@ -187,15 +200,8 @@ class GradientChecker(unittest.TestCase):
            backward_op.infer_shape(scope)
            backward_op.run(scope, ctx)

-            if isinstance(place, core.CPUPlace):
-                msg = "CPU kernel gradient is not close to numeric gradient"
-            else:
-                if isinstance(place, core.GPUPlace):
-                    msg = "GPU kernel gradient is not close to numeric gradient"
-                else:
-                    raise ValueError("unknown place " + type(place))
-            self.assertTrue(
-                self.__is_close(numeric_grad, scope, max_relative_error), msg)
+            self.assert_is_close(numeric_grad, scope, max_relative_error,
+                                 "Gradient Check On %s" % str(place))


 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+import unittest
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+import numpy
+
+
+class GaussianRandomTest(unittest.TestCase):
+    def test_cpu(self):
+        self.gaussian_random_test(place=core.CPUPlace())
+
+    def test_gpu(self):
+        if core.is_compile_gpu():
+            self.gaussian_random_test(place=core.GPUPlace(0))
+
+    def gaussian_random_test(self, place):
+        scope = core.Scope()
+        scope.new_var("Out").get_tensor()
+
+        op = Operator(
+            "gaussian_random",
+            Out="Out",
+            dims=[1000, 784],
+            mean=.0,
+            std=1.,
+            seed=10)
+
+        op.infer_shape(scope)
+        context = core.DeviceContext.create(place)
+        op.run(scope, context)
+        tensor = numpy.array(scope.find_var("Out").get_tensor())
+        self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
+        self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()