BaiXuePrincess / Paddle (fork of PaddlePaddle / Paddle)
Commit 3285b00d
Authored Sep 06, 2017 by Liu Yiqun

    Merge branch 'develop' into core_add_fc_op

Parents: 1348c20e, 7b679631

Showing 33 changed files with 596 additions and 288 deletions (+596 -288)
doc/howto/dev/new_op_cn.md                                 +76 -81
paddle/framework/ddim.cc                                   +32 -32
paddle/framework/ddim.h                                    +10 -10
paddle/framework/ddim_test.cc                               +2  -2
paddle/framework/dim.h                                     +35 -32
paddle/framework/dim_test.cu                                +3  -3
paddle/framework/eigen.h                                    +1  -1
paddle/framework/grad_op_builder_test.cc                    +2  -2
paddle/framework/tensor_impl.h                              +2  -2
paddle/operators/CMakeLists.txt                            +32 -29
paddle/operators/add_op.cc                                  +2  -3
paddle/operators/add_op.cu                                  +1  -4
paddle/operators/cos_sim_op.cc                            +107  -0
paddle/operators/cos_sim_op.cu                              +5  -3
paddle/operators/cos_sim_op.h                             +107  -0
paddle/operators/gaussian_random_op.cc                      +8  -3
paddle/operators/identity_op.cc                            +55  -0
paddle/operators/rnn/recurrent_op_utils.cc                  +1  -1
paddle/operators/scale_op.cc                                +1 -31
paddle/operators/scatter_op.cu                              +0 -20
paddle/operators/softmax_op.cc                             +22  -4
paddle/operators/uniform_random_op.cc                       +8  -3
paddle/pybind/pybind.cc                                     +3  -2
paddle/pybind/tensor_py.h                                   +2  -2
python/paddle/v2/framework/tests/CMakeLists.txt             +1  -0
python/paddle/v2/framework/tests/gradient_checker.py        +7  -7
python/paddle/v2/framework/tests/op_test_util.py            +4  -4
python/paddle/v2/framework/tests/test_add_two_op.py         +1  -1
python/paddle/v2/framework/tests/test_cos_sim_op.py        +60  -0
python/paddle/v2/framework/tests/test_gradient_checker.py   +1  -1
python/paddle/v2/framework/tests/test_net.py                +2  -2
python/paddle/v2/framework/tests/test_operator.py           +2  -2
python/paddle/v2/framework/tests/test_recurrent_op.py       +1  -1
doc/howto/dev/new_op_cn.md
@@ -23,15 +23,18 @@
 - `framework::OperatorWithKernel`: inherits from OperatorBase; an Op with a compute function is said to have a Kernel.
 - `class OpProtoAndCheckerMaker`: describes the Op's inputs, outputs, attributes and comment; mainly used to generate the Python API.
 
-Depending on whether a kernel is present, Ops are divided into two kinds: Ops with a Kernel and Ops without one; the definition of the former inherits from `OperatorBase`, the latter from `OperatorWithKernel`. This tutorial mainly covers how to write an Op with a Kernel. In short, an Op needs the following parts:
+Depending on whether a kernel is present, Ops can be divided into two kinds: Ops with a Kernel and Ops without one; the definition of the former inherits from `OperatorBase`, the latter from `OperatorWithKernel`. This tutorial mainly covers how to write an Op with a Kernel. In short, an Op needs the following parts:
 
 Content | Where it is defined
 -------------- | :----------------------
 OpProtoMake definition | `.cc` file; a Backward Op needs no OpProtoMake
 Op definition | `.cc` file
-Kernel implementation | a Kernel shared by CPU and GPU goes in a `.h` file; otherwise the CPU part may go in `.cc` and the GPU part in `.cu`.
+Kernel implementation | a Kernel implementation shared by CPU and GPU goes in a `.h` file; otherwise the CPU implementation goes in a `.cc` file and the GPU implementation in a `.cu` file.
-Op registration | the Op is registered in the `.cc` file; the CPU Kernel is registered in `.cc`, the GPU one in `.cu`
+Op registration | the Op registration goes in the `.cc` file; the CPU Kernel registration goes in the `.cc` file, the GPU one in the `.cu` file
 
+New ops are added under [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators); file names end with `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any).
 
 Below we take matrix multiplication, i.e. [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example of how to write an Operator with a Kernel.
@@ -44,7 +47,7 @@
 The formula for matrix multiplication is $Out = X * Y$; the computation has two inputs and one output. First define a `ProtoMaker` to describe the Op's inputs, outputs and comment:
 
-```
+```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -60,19 +63,19 @@
 };
 ```
 
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43) inherits from `framework::OpProtoAndCheckerMaker`; its constructor takes 2:
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43) inherits from `framework::OpProtoAndCheckerMaker`; its constructor takes 2 parameters:
 
 - `framework::OpProto`: the former stores the Op's inputs, outputs and attributes, used to generate the Python API.
 - `framework::OpAttrChecker`: the latter checks the validity of attributes.
 
 In the constructor, `AddInput` adds an input, `AddOutput` adds an output, and `AddComment` adds the Op's comment; these calls write the corresponding content into `OpProto`.
 
-`MulOp` adds two inputs `X` and `Y` and one output `Out`, each with an explanation; name them as conventionally as possible.
+`MulOp` adds two inputs `X` and `Y` and one output `Out`, each with an explanation; please follow the naming convention.
 
 Another example, [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37):
 
-```
+```cpp
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -88,7 +91,7 @@
 };
 ```
 
-In this example, two differences:
+This example has two differences:
 
 - `AddInput("X","...").NotInGradient()`: the input `X` does not take part in the gradient Op computation of `ScaleOp`.
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`: adds a `scale` coefficient as an attribute, with default value 1.0.
@@ -97,7 +100,7 @@
 ### 2. Defining the Operator class
 
-```c++
+```cpp
 class MulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -122,13 +125,13 @@
 [`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) inherits from `OperatorWithKernel`. Its `public` member:
 
-```c++
+```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
 ```
 
 says to use the base class `OperatorWithKernel`'s constructor, which could also be written as:
 
-```c++
+```cpp
 MulOp(const std::string &type, const framework::VariableNameMap &inputs,
       const framework::VariableNameMap &outputs,
       const framework::AttributeMap &attrs)
@@ -144,7 +147,7 @@
 ### 3. Defining the OpKernel class
 
-```C++
+```cpp
 template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
  public:
@@ -178,7 +181,7 @@
 Register the forward and backward Op classes and the CPU Kernel in the `.cc` file.
 
-```c++
+```cpp
 namespace ops = paddle::operators;
 REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
 REGISTER_OP_CPU_KERNEL(mul,
                        ops::MulKernel<paddle::platform::CPUPlace, float>);
@@ -192,7 +195,7 @@
 Register the GPU Kernel in the `.cu` file. Note: if the GPU Kernel is implemented with Eigen's unsupported modules, add the macro `#define EIGEN_USE_GPU` at the very top of the `.cu` file.
 
-```c++
+```cpp
 // if use Eigen unsupported module before include head files
 #define EIGEN_USE_GPU
@@ -204,17 +207,18 @@
 ### 5. Building
 
-Add the build rule in [paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt).
+- A simple OP with **no special dependencies** needs no change to CMakeLists.txt; [paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) automatically adds any new `*_op.cc` file under `paddle/operators` to the build.
+- A more complex operator with **extra dependencies** still needs a change to [paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt). For example, `mul_op` depends on `math_function`, so `CMakeLists.txt` needs:
 
 ```
 op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
 ```
 
-To build, run:
+- To build, run:
 
 ```
 make mul_op
 ```
 
 ## Binding to Python
@@ -243,27 +247,17 @@
 - Generating the library
 
-  Add the class to `DEPS` in [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt) so that the Op is linked into the generated lib:
-
-  ```
-  if(WITH_PYTHON)
-    cc_library(paddle_pybind SHARED
-      SRCS pybind.cc
-      DEPS pybind python backward
-      mul_op
-      minus_op)
-  endif(WITH_PYTHON)
-  ```
+  No change to [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt) is needed; new `*_op.cc` files under `paddle/operators` are automatically linked into the generated lib.
 
 ## Writing unit tests
 
 Unit tests cover comparing the forward Op across devices (CPU, GPU), comparing the backward Op across devices (CPU, GPU), and checking the backward Op's gradient. Below we walk through [the unit test of `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
 
-### Forward Operator tests
+### Forward Operator unit tests
 
 The forward Op test inherits from `unittest.TestCase` and sets the metaclass `__metaclass__ = OpTestMeta`; the actual test flow is carried out in `OpTestMeta`. The `setUp` function defines the inputs, outputs, attributes, and the reference outputs computed in Python.
 
-```
+```python
 import unittest
 import numpy as np
 from gradient_checker import GradientChecker, create_op
@@ -287,11 +281,11 @@
 - `self.outputs`: defines the outputs and obtains the reference results computed in Python.
 
-### Backward Operator tests
+### Backward Operator unit tests
 
 The backward Op test inherits from `GradientChecker`, which in turn inherits from `unittest.TestCase`, so backward test functions must start with `test_`.
 
-```
+```cpp
 class TestMulGradOp(GradientChecker):
     def setUp(self):
         self.op = create_op("mul")
@@ -337,21 +331,22 @@
 - `test_ignore_x` and `test_ignore_y` test the case where only one input's gradient needs to be computed.
 
-### Building and running
+### Building and running unit tests
 
-Once the test is done, add it in [`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt) to build it:
+Once the test is done, add the following in [`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt) to put the test into the project:
 
 ```
 py_test(test_mul_op SRCS test_mul_op.py)
 ```
 
-Turn on `WITH_TESTING` at build time, i.e. `cmake paddle_dir -DWITH_TESTING=ON`; after a successful build, run the test with:
+Note that **unlike building a single Op, running the unit tests requires building the whole project**, with `WITH_TESTING` turned on, i.e. `cmake paddle_dir -DWITH_TESTING=ON`. After a successful build, run the test with:
 
-```
+```bash
 make test ARGS="-R test_mul_op -V"
 ```
 
 or:
 
-```
+```bash
 ctest -R test_mul_op
 ```
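A condensed sketch of the recipe above, for reference: the hypothetical `my_op` below is illustrative only and not part of this commit; it mirrors the constructor, `InferShape`, and registration signatures used elsewhere in this diff (e.g. in `cos_sim_op.cc` below), which changed in later Paddle versions.

// Hypothetical my_op.cc -- the tutorial's four pieces in one file, modeled on
// mul_op.cc / cos_sim_op.cc as they appear in this commit. Not repository code.
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

// 1. OpProtoAndCheckerMaker: declares inputs/outputs for the Python API.
class MyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  MyOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of my_op.");
    AddOutput("Out", "The output of my_op.");
    AddComment("My operator: Out = f(X).");
  }
};

// 2. Operator class: shape inference; here Out simply keeps X's shape.
class MyOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    ctx.Output<framework::Tensor>("Out")->Resize(
        ctx.Input<framework::Tensor>("X")->dims());
  }
};

// Backward op: the gradient of X has X's shape.
class MyOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<framework::Tensor>("X")->dims());
  }
};

// 3. OpKernel: the computation, shared by CPU/GPU via the Place parameter.
template <typename Place, typename T>
class MyKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    // ... compute Output("Out") from Input("X") here ...
  }
};

}  // namespace operators
}  // namespace paddle

// 4. Registration: forward + backward op, then the CPU kernel.
namespace ops = paddle::operators;
REGISTER_OP(my_op, ops::MyOp, ops::MyOpMaker, my_op_grad, ops::MyOpGrad);
REGISTER_OP_CPU_KERNEL(my_op,
                       ops::MyKernel<paddle::platform::CPUPlace, float>);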
paddle/framework/ddim.cc
@@ -21,16 +21,16 @@ namespace framework {
 /// @cond HIDDEN
 
 template <int i>
-Dim<i> make_dim(const int* d) {
+Dim<i> make_dim(const int64_t* d) {
   return Dim<i>(*d, make_dim<i - 1>(d + 1));
 }
 
 template <>
-Dim<1> make_dim<1>(const int* d) {
+Dim<1> make_dim<1>(const int64_t* d) {
   return Dim<1>(*d);
 }
 
-void make_ddim(DDim& ddim, const int* dims, int n) {
+void make_ddim(DDim& ddim, const int64_t* dims, int n) {
   switch (n) {
     case 1:
       ddim = make_dim<1>(dims);
@@ -67,13 +67,13 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
 /// @endcond
 
-DDim make_ddim(std::initializer_list<int> dims) {
+DDim make_ddim(std::initializer_list<int64_t> dims) {
   DDim result(make_dim(0));
   make_ddim(result, dims.begin(), dims.size());
   return result;
 }
 
-DDim make_ddim(const std::vector<int>& dims) {
+DDim make_ddim(const std::vector<int64_t>& dims) {
   DDim result(make_dim(0));
   make_ddim(result, &dims[0], dims.size());
   return result;
@@ -81,12 +81,12 @@ DDim make_ddim(const std::vector<int>& dims) {
 /// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
-class DynamicMutableIndexer : public boost::static_visitor<int&> {
+class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
  public:
   explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
 
   template <int D>
-  int& operator()(Dim<D>& dim) const {
+  int64_t& operator()(Dim<D>& dim) const {
     return dim[idx_];
   }
@@ -94,12 +94,12 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
   int idx_;
 };
 
-class DynamicConstIndexer : public boost::static_visitor<int> {
+class DynamicConstIndexer : public boost::static_visitor<int64_t> {
  public:
   explicit DynamicConstIndexer(int idx) : idx_(idx) {}
 
   template <int D>
-  int operator()(const Dim<D>& dim) const {
+  int64_t operator()(const Dim<D>& dim) const {
     return dim[idx_];
   }
@@ -109,22 +109,22 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
 /// @endcond
 
-int& DDim::operator[](int idx) {
+int64_t& DDim::operator[](int idx) {
   return boost::apply_visitor(DynamicMutableIndexer(idx), var);
 }
 
-int DDim::operator[](int idx) const {
+int64_t DDim::operator[](int idx) const {
   return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
 
-ssize_t DDim::size() const { return arity(*this); }
+int64_t DDim::size() const { return arity(*this); }
 
 bool DDim::operator==(DDim d) const {
   if (var.which() != d.getVar().which()) {
     return false;
   } else {
-    std::vector<int> v1 = vectorize(*this);
-    std::vector<int> v2 = vectorize(d);
+    std::vector<int64_t> v1 = vectorize(*this);
+    std::vector<int64_t> v2 = vectorize(d);
 
     for (unsigned int i = 0; i < v1.size(); i++) {
       if (v1[i] != v2[i]) {
@@ -139,10 +139,10 @@ bool DDim::operator==(DDim d) const {
 bool DDim::operator!=(DDim d) const { return !(*this == d); }
 
 DDim DDim::operator+(DDim d) const {
-  std::vector<int> v1 = vectorize(*this);
-  std::vector<int> v2 = vectorize(d);
+  std::vector<int64_t> v1 = vectorize(*this);
+  std::vector<int64_t> v2 = vectorize(d);
 
-  std::vector<int> v3;
+  std::vector<int64_t> v3;
 
   assert(v1.size() == v2.size());
@@ -154,10 +154,10 @@ DDim DDim::operator+(DDim d) const {
 }
 
 DDim DDim::operator*(DDim d) const {
-  std::vector<int> v1 = vectorize(*this);
-  std::vector<int> v2 = vectorize(d);
+  std::vector<int64_t> v1 = vectorize(*this);
+  std::vector<int64_t> v2 = vectorize(d);
 
-  std::vector<int> v3;
+  std::vector<int64_t> v3;
 
   assert(v1.size() == v2.size());
@@ -168,15 +168,15 @@ DDim DDim::operator*(DDim d) const {
   return make_ddim(v3);
 }
 
-int get(const DDim& ddim, int idx) { return ddim[idx]; }
+int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
 
 void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
 
 /// @cond HIDDEN
 struct VectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int>& vector;
+  std::vector<int64_t>& vector;
 
-  explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+  explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
 
   template <typename T>
   void operator()(const T& t) {
@@ -188,31 +188,31 @@ struct VectorizeVisitor : public boost::static_visitor<> {
 };
 /// @endcond
 
-std::vector<int> vectorize(const DDim& ddim) {
-  std::vector<int> result;
+std::vector<int64_t> vectorize(const DDim& ddim) {
+  std::vector<int64_t> result;
   VectorizeVisitor visitor(result);
   boost::apply_visitor(visitor, ddim);
   return result;
 }
 
-struct ProductVisitor : public boost::static_visitor<ssize_t> {
+struct ProductVisitor : public boost::static_visitor<int64_t> {
   template <int D>
-  ssize_t operator()(const Dim<D>& dim) {
+  int64_t operator()(const Dim<D>& dim) {
     return product(dim);
   }
 };
 
-ssize_t product(const DDim& ddim) {
+int64_t product(const DDim& ddim) {
   ProductVisitor visitor;
   return boost::apply_visitor(visitor, ddim);
 }
 
 struct SliceVectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int>& vector;
+  std::vector<int64_t>& vector;
   int begin;
   int end;
 
-  SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
+  SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
       : vector(v), begin(b), end(e) {
     PADDLE_ENFORCE(begin < end,
                    "Begin index must be less than end index in ddim slice.");
@@ -240,7 +240,7 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
 };
 
 DDim slice_ddim(const DDim& dim, int begin, int end) {
-  std::vector<int> vec;
+  std::vector<int64_t> vec;
   vec.reserve(end - begin);
   SliceVectorizeVisitor visitor(vec, begin, end);
   boost::apply_visitor(visitor, dim);
@@ -280,7 +280,7 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
   return os;
 }
 
-DDim::DDim(std::initializer_list<int> init_list) {
+DDim::DDim(std::initializer_list<int64_t> init_list) {
   *this = make_ddim(init_list);
 }
 }  // namespace framework
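The commit does not say why the dimension type widens from `int`/`ssize_t` to `int64_t`, but a likely motivation is arithmetic on element counts: the product of moderately large dimensions already overflows a 32-bit int. A standalone illustration (my example, not repository code):

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // A 100000 x 100000 tensor has 1e10 elements: well beyond INT32_MAX
  // (~2.1e9), but far below INT64_MAX.
  std::vector<int64_t> dims = {100000, 100000};
  int64_t wide = std::accumulate(dims.begin(), dims.end(), int64_t{1},
                                 std::multiplies<int64_t>());
  int32_t narrow = static_cast<int32_t>(wide);  // what a 32-bit product loses

  std::cout << "int64_t product: " << wide << "\n";    // 10000000000
  std::cout << "int32_t product: " << narrow << "\n";  // wrapped value
  return 0;
}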
paddle/framework/ddim.h
@@ -40,7 +40,7 @@ struct DDim {
   template <int D>
   explicit DDim(const Dim<D>& in) : var(in) {}
 
-  /*implicit*/ DDim(std::initializer_list<int> init_list);
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
 
   template <int D>
   DDim& operator=(const Dim<D>& in) {
@@ -48,8 +48,8 @@ struct DDim {
     return *this;
   }
 
-  int& operator[](int idx);
-  int operator[](int idx) const;
+  int64_t& operator[](int idx);
+  int64_t operator[](int idx) const;
 
   template <typename Visitor>
   typename Visitor::result_type apply_visitor(Visitor& visitor) {
@@ -71,15 +71,15 @@ struct DDim {
   DDim operator*(DDim d) const;
 
-  ssize_t size() const;
+  int64_t size() const;
 };
 
 /**
- * \brief Make a DDim from std::vector<int>
+ * \brief Make a DDim from std::vector<int64_t>
  *
  * \param dims An vector of ints. Must be sized between [1, 9]
  */
-DDim make_ddim(const std::vector<int>& dims);
+DDim make_ddim(const std::vector<int64_t>& dims);
 
 /**
  * \brief Make a DDim from an initializer list
@@ -87,14 +87,14 @@ DDim make_ddim(const std::vector<int>& dims);
  * \param dims An initializer list of ints. Must be sized between [1, 9]
  *
  */
-DDim make_ddim(std::initializer_list<int> dims);
+DDim make_ddim(std::initializer_list<int64_t> dims);
 
-int get(const DDim& dim, int idx);
+int64_t get(const DDim& dim, int idx);
 void set(DDim& dim, int idx, int val);
 
-std::vector<int> vectorize(const DDim& ddim);
+std::vector<int64_t> vectorize(const DDim& ddim);
 
-ssize_t product(const DDim& ddim);
+int64_t product(const DDim& ddim);
 
 /**
  * \brief Slice a ddim
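For orientation, here is how the DDim API reads after this change; a hypothetical driver, assuming only the declarations visible in this header diff:

#include <cstdint>
#include <iostream>
#include <vector>

#include "paddle/framework/ddim.h"

int main() {
  namespace fw = paddle::framework;

  // Dimension values are int64_t end to end after this commit.
  fw::DDim d = fw::make_ddim({2, 3, 4});

  int64_t n = fw::product(d);                 // 24, now int64_t (was ssize_t)
  std::vector<int64_t> v = fw::vectorize(d);  // {2, 3, 4} (was vector<int>)

  fw::set(d, 0, 8);                           // d is now {8, 3, 4}
  std::cout << n << " " << fw::get(d, 0) << "\n";  // prints "24 8"
  return 0;
}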
paddle/framework/ddim_test.cc
@@ -12,7 +12,7 @@ TEST(DDim, Equality) {
   EXPECT_EQ(ddim[2], 5);
 
   // construct a DDim from a vector
-  std::vector<int> vec({9, 1, 5});
+  std::vector<int64_t> vec({9, 1, 5});
   paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
   EXPECT_EQ(ddim[0], 9);
   EXPECT_EQ(ddim[1], 1);
@@ -25,7 +25,7 @@ TEST(DDim, Equality) {
   EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
 
   // vectorize a DDim
-  std::vector<int> res_vec = paddle::framework::vectorize(vddim);
+  std::vector<int64_t> res_vec = paddle::framework::vectorize(vddim);
   EXPECT_EQ(res_vec[0], 9);
   EXPECT_EQ(res_vec[1], 1);
   EXPECT_EQ(res_vec[2], 5);
paddle/framework/dim.h
@@ -17,13 +17,13 @@ struct Dim {
   static constexpr int dimensions = i;
 
   template <typename... Args>
-  HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) {
+  HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
     static_assert(sizeof...(_tail) == i - 1,
                   "Dim initialized with the wrong number of parameters");
   }
 
   HOSTDEVICE
-  Dim(int _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
+  Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
 
   HOSTDEVICE
   Dim() : head(0), tail() {}
@@ -31,12 +31,12 @@ struct Dim {
   /** Construct a Dim from a linear index and size. Uses Fortran order
    * indexing. */
   HOSTDEVICE
-  Dim(int idx, const Dim<i>& size)
+  Dim(int64_t idx, const Dim<i>& size)
       : head(idx % size.head), tail(idx / size.head, size.tail) {}
 
   /** Construct a Dim with each dimension set to the given index */
   HOSTDEVICE
-  Dim(int idx) : head(idx), tail(idx) {}
+  Dim(int64_t idx) : head(idx), tail(idx) {}
 
   HOSTDEVICE
   bool operator==(const Dim<i>& o) const {
@@ -47,13 +47,13 @@ struct Dim {
   bool operator!=(const Dim<i>& o) const { return !(*this == o); }
 
   HOSTDEVICE
-  int& operator[](int idx);
+  int64_t& operator[](int idx);
   HOSTDEVICE
-  int operator[](int idx) const;
+  int64_t operator[](int idx) const;
 
   HOST std::string to_string() const;
 
-  int head;
+  int64_t head;
   Dim<i - 1> tail;
 };
@@ -63,7 +63,7 @@ struct Dim<1> {
   static constexpr int dimensions = 1;
 
   HOSTDEVICE
-  Dim(int _head) : head(_head) {}
+  Dim(int64_t _head) : head(_head) {}
 
   HOSTDEVICE
   Dim() : head(0) {}
@@ -86,11 +86,11 @@ struct Dim<1> {
   bool operator!=(const Dim<1>& o) const { return !(*this == o); }
 
   HOSTDEVICE
-  int& operator[](int idx);
+  int64_t& operator[](int idx);
   HOSTDEVICE
-  int operator[](int idx) const;
+  int64_t operator[](int idx) const;
 
-  int head;
+  int64_t head;
 };
 
 namespace {
@@ -100,12 +100,12 @@ template <int i>
 struct DimGetter {
   // Return a copy if Dim is const
   template <typename D>
-  HOSTDEVICE static int impl(const D& d) {
+  HOSTDEVICE static int64_t impl(const D& d) {
     return DimGetter<i - 1>::impl(d.tail);
   }
   // Return a reference if Dim is mutable
   template <typename D>
-  HOSTDEVICE static int& impl(D& d) {
+  HOSTDEVICE static int64_t& impl(D& d) {
     return DimGetter<i - 1>::impl(d.tail);
   }
 };
@@ -115,18 +115,18 @@ template <>
 struct DimGetter<0> {
   // Return a copy if Dim is const
   template <typename D>
-  HOSTDEVICE static int impl(const D& d) {
+  HOSTDEVICE static int64_t impl(const D& d) {
    return d.head;
  }
   // Return a reference if Dim is mutable
   template <typename D>
-  HOSTDEVICE static int& impl(D& d) {
+  HOSTDEVICE static int64_t& impl(D& d) {
     return d.head;
   }
 };
 
 template <int D>
-HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
+HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx < 0) {
     throw std::invalid_argument("Tried to access a negative dimension");
@@ -141,7 +141,7 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx != 0) {
     throw std::invalid_argument("Invalid index");
@@ -153,7 +153,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
 }
 
 template <int D>
-HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
+HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx < 0) {
     throw std::invalid_argument("Tried to access a negative dimension");
@@ -168,7 +168,7 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx != 0) {
     throw std::invalid_argument("Invalid index");
@@ -182,73 +182,76 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
 }  // namespace
 
 // Static access to constant Dim
 template <int i, int l>
-HOSTDEVICE int get(const Dim<l>& d) {
+HOSTDEVICE int64_t get(const Dim<l>& d) {
   return DimGetter<i>::impl(d);
 }
 
 // Static access to mutable Dim
 template <int i, int l>
-HOSTDEVICE int& get(Dim<l>& d) {
+HOSTDEVICE int64_t& get(Dim<l>& d) {
   return DimGetter<i>::impl(d);
 }
 
 // Dynamic access to constant Dim
 template <int l>
-HOSTDEVICE int Dim<l>::operator[](int i) const {
+HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
   return indexer(*this, i);
 }
 
 // Dynamic access to mutable Dim
 template <int l>
-HOSTDEVICE int& Dim<l>::operator[](int i) {
+HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
   return indexer(*this, i);
 }
 
 // Dynamic access to constant Dim
-inline HOSTDEVICE int Dim<1>::operator[](int i) const {
+inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
   return indexer(*this, i);
 }
 
 // Dynamic access to mutable Dim
-inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); }
+inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) { return indexer(*this, i); }
 
 // Dynamic access to constant Dim
 // without std::enable_if will try to instantiate this on get<0>(d)
 template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim<l>& d,
-                                                           int i) {
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
+                                                               int i) {
   return d[i];
 }
 
 // Dynamic access to mutable Dim
 template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim<l>& d, int i) {
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d, int i) {
   return d[i];
 }
 
 // Dot product of two dims
 template <int i>
-HOSTDEVICE int linearize(const Dim<i>& a, const Dim<i>& b) {
+HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
   return a.head * b.head + linearize(a.tail, b.tail);
 }
 
 // Base case dot product of two Dims
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int linearize(const Dim<1>& a, const Dim<1>& b) {
+HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
   return a.head * b.head;
 }
 
 // Product of a Dim
 template <int i>
-HOSTDEVICE int product(const Dim<i>& a, int prod = 1) {
+HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
   return prod * a.head * product(a.tail);
 }
 
 // Base case product of a Dim
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int product(const Dim<1>& a, int prod) {
+HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
   return prod * a.head;
 }
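The reason this one type change touches so many lines is that Dim<i> is a head/tail template recursion: every accessor exists once for the recursive case and once for the Dim<1> base case. A stripped-down, dependency-free analogue of the structure (my sketch, without HOSTDEVICE or the visitor machinery):

#include <cstdint>
#include <iostream>

// A compile-time-sized dimension vector built by recursion: Dim<i> stores one
// int64_t head plus a Dim<i-1> tail, with Dim<1> as the base case -- the same
// shape as paddle::framework::Dim after this commit.
template <int i>
struct Dim {
  int64_t head;
  Dim<i - 1> tail;
  Dim(int64_t h, Dim<i - 1> t) : head(h), tail(t) {}
};

template <>
struct Dim<1> {
  int64_t head;
  explicit Dim(int64_t h) : head(h) {}
};

// product() recurses down the tail, mirroring the template in dim.h.
inline int64_t product(const Dim<1>& d) { return d.head; }
template <int i>
int64_t product(const Dim<i>& d) {
  return d.head * product(d.tail);
}

int main() {
  Dim<3> d(2, Dim<2>(3, Dim<1>(4)));
  std::cout << product(d) << "\n";  // prints 24
}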
paddle/framework/dim_test.cu
@@ -8,7 +8,7 @@ __global__ void test(paddle::framework::Dim<2>* o) {
   o[0] = paddle::framework::make_dim(5, 6);
 }
 
-__global__ void dyn_idx_gpu(int* o) {
+__global__ void dyn_idx_gpu(int64_t* o) {
   auto d = paddle::framework::make_dim(5, 6);
   o[0] = d[1];
 }
@@ -47,9 +47,9 @@ TEST(Dim, Equality) {
   EXPECT_EQ(b[1], 11);
 
   // dynamic access on GPU
-  thrust::device_vector<int> r(1);
+  thrust::device_vector<int64_t> r(1);
   dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
-  int res = r[0];
+  int64_t res = r[0];
   EXPECT_EQ(res, 6);
 
   // ex_prefix_mul
paddle/framework/eigen.h
@@ -28,7 +28,7 @@ struct EigenDim {
   static Type From(const DDim& dims) {
     PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)");
     Type ret;
-    for (int d = 0; d < arity(dims); d++) {
+    for (int64_t d = 0; d < arity(dims); d++) {
       ret[d] = dims[d];
     }
     return ret;
paddle/framework/grad_op_builder_test.cc
@@ -3,7 +3,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 
-USE_OP(add_two);
+USE_OP(add);
 
 namespace paddle {
 namespace framework {
@@ -41,7 +41,7 @@ namespace f = paddle::framework;
 TEST(GradOpBuilder, AddTwo) {
   std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
+      "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
   std::shared_ptr<f::OperatorBase> grad_add_op =
       f::OpRegistry::CreateGradOp(*add_op);
   EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
paddle/framework/tensor_impl.h
@@ -58,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place) {
                  "Tensor's numel must be larger than zero to call "
                  "Tensor::mutable_data. Call Tensor::set_dim first.");
   /* some versions of boost::variant don't have operator!= */
-  size_t size = product(dims_) * sizeof(T);
+  int64_t size = product(dims_) * sizeof(T);
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
@@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
   PADDLE_ENFORCE_LT(begin_idx, end_idx,
                     "Begin index must be less than end index.");
   PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1.");
-  int base = product(dims_) / dims_[0];
+  size_t base = product(dims_) / dims_[0];
   Tensor dst;
   dst.holder_ = holder_;
   DDim dst_dims = dims_;
paddle/operators/CMakeLists.txt
@@ -14,6 +14,15 @@ function(op_library TARGET)
   cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
                         "${multiValueArgs}" ${ARGN})
 
+  list(LENGTH op_library_SRCS op_library_SRCS_len)
+  if (${op_library_SRCS_len} EQUAL 0)
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+      list(APPEND cc_srcs ${TARGET}.cc)
+    endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+      list(APPEND cu_srcs ${TARGET}.cu)
+    endif()
+  else()
   foreach(src ${op_library_SRCS})
     if (${src} MATCHES ".*\\.cu$")
       list(APPEND cu_srcs ${src})
@@ -23,18 +32,13 @@ function(op_library TARGET)
       message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
     endif()
   endforeach()
+  endif()
 
   list(LENGTH cc_srcs cc_srcs_len)
   if (${cc_srcs_len} EQUAL 0)
     message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
   endif()
 
-  list(LENGTH cu_srcs cu_srcs_len)
-  list(LENGTH op_library_DEPS dep_len)
-  if (${cu_srcs_len} EQUAL 0 AND ${dep_len} EQUAL 0)
-    message(WARNING "The op library ${TARGET} not support GPU!")
-  endif()
-
   if (WITH_GPU)
     nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
@@ -46,25 +50,24 @@ endfunction()
 add_subdirectory(math)
 
-list(REMOVE_ITEM GENERAL_OPS
+set(DEPS_OPS
+    identity_op
     fc_op
+    net_op
     minus_op
     mul_op
     recurrent_op
     scale_op)
 
-op_library(fc_op SRCS fc_op.cc
-    DEPS mul_op rowwise_add_op scale_op softmax_op sigmoid_op)
-op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
-op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
+op_library(identity_op DEPS scale_op)
+op_library(fc_op SRCS DEPS mul_op rowwise_add_op identity_op softmax_op sigmoid_op)
+op_library(minus_op DEPS scale_op)
+op_library(net_op SRCS net_op.cc)
+op_library(mul_op DEPS math_function)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
            DEPS framework_proto tensor operator net_op)
-op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
+op_library(scale_op DEPS net_op)
 
+list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
-    op_library(${src} SRCS ${src}.cc ${src}.cu)
+    op_library(${src})
 endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
paddle/operators/add_op.cc
@@ -57,7 +57,6 @@ class AddOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, add_two_grad, ops::AddOpGrad);
-REGISTER_OP_CPU_KERNEL(add_two,
+REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad);
+REGISTER_OP_CPU_KERNEL(add,
                        ops::AddKernel<paddle::platform::CPUPlace, float>);
paddle/operators/add_op.cu
@@ -12,10 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(add_two,
+REGISTER_OP_GPU_KERNEL(add,
                        ops::AddKernel<paddle::platform::GPUPlace, float>);
paddle/operators/cos_sim_op.cc (new file, mode 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#include "paddle/operators/cos_sim_op.h"

namespace paddle {
namespace operators {

using framework::Tensor;

class CosSimOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                      ctx.Input<Tensor>("Y")->dims(),
                      "Dimensions of Input(X) and Input(Y) must be the same.");

    auto dims = ctx.Input<Tensor>("X")->dims();
    ctx.Output<Tensor>("Out")->Resize({dims[0], 1});
    ctx.Output<Tensor>("XNorm")->Resize({dims[0], 1});
    ctx.Output<Tensor>("YNorm")->Resize({dims[0], 1});
  }
};

class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CosSimOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of cos_sim op.");
    AddInput("Y", "The second input of cos_sim op.");
    AddOutput("Out", "The output of cos_sim op.");
    AddOutput("XNorm", "Row norm of the first input.").AsIntermediate();
    AddOutput("YNorm", "Row norm of the second input.").AsIntermediate();
    AddComment(R"DOC(
Cosine Similarity Operator.

The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y))
)DOC");
  }
};

class CosSimOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("XNorm"),
                            "Input(XNorm) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("YNorm"),
                            "Input(YNorm) must not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                            "Input(Out@GRAD) must not be null.");

    auto x_dims = ctx.Input<Tensor>("X")->dims();
    auto y_dims = ctx.Input<Tensor>("Y")->dims();
    auto xnorm_dims = ctx.Input<Tensor>("XNorm")->dims();
    auto ynorm_dims = ctx.Input<Tensor>("YNorm")->dims();
    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
    PADDLE_ENFORCE_EQ(x_dims, y_dims,
                      "Dimensions of Input(X) and Input(Y) must be the same.");
    PADDLE_ENFORCE_EQ(xnorm_dims[0], x_dims[0],
                      "1st dimension of XNorm must equal that of Input(X).");
    PADDLE_ENFORCE_EQ(xnorm_dims[1], 1, "2st dimension of XNorm must be one.");
    PADDLE_ENFORCE_EQ(ynorm_dims[0], y_dims[0],
                      "1st dimension of YNorm must equal that of Input(Y).");
    PADDLE_ENFORCE_EQ(ynorm_dims[1], 1, "2st dimension of YNorm must be one.");
    PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
                      "1st dimension of Out@GRAD must equal that of Input(X)");
    PADDLE_ENFORCE_EQ(out_dims[1], 1, "1st dimension of Out@GRAD must be one.");

    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
    if (x_grad) x_grad->Resize(x_dims);
    if (y_grad) y_grad->Resize(y_dims);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
            ops::CosSimOpGrad);
REGISTER_OP_CPU_KERNEL(cos_sim,
                       ops::CosSimKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
    cos_sim_grad, ops::CosSimGradKernel<paddle::platform::CPUPlace, float>);
paddle/operators/gather_op.cu → paddle/operators/cos_sim_op.cu
@@ -13,8 +13,10 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/gather_op.h"
+#include "paddle/operators/cos_sim_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(gather,
-                       ops::GatherOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(cos_sim,
+                       ops::CosSimKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    cos_sim_grad, ops::CosSimGradKernel<paddle::platform::GPUPlace, float>);
paddle/operators/cos_sim_op.h
0 → 100644
浏览文件 @
3285b00d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

template <typename Place, typename T>
class CosSimKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input_x = context.Input<Tensor>("X");
    auto* input_y = context.Input<Tensor>("Y");
    auto* output_z = context.Output<Tensor>("Out");
    auto* output_x_norm = context.Output<Tensor>("XNorm");
    auto* output_y_norm = context.Output<Tensor>("YNorm");
    output_z->mutable_data<T>(context.GetPlace());
    output_x_norm->mutable_data<T>(context.GetPlace());
    output_y_norm->mutable_data<T>(context.GetPlace());

    auto dims = input_x->dims();
    int size = static_cast<int>(framework::product(dims));
    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
    auto x = EigenMatrix<T>::From(*input_x, new_dims);
    auto y = EigenMatrix<T>::From(*input_y, new_dims);
    auto z = EigenVector<T>::Flatten(*output_z);
    auto x_norm = EigenVector<T>::Flatten(*output_x_norm);
    auto y_norm = EigenVector<T>::Flatten(*output_y_norm);

    auto place = context.GetEigenDevice<Place>();
    auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
    x_norm.device(place) = x.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
    y_norm.device(place) = y.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
    z.device(place) = xy / x_norm / y_norm;
  }
};

template <typename Place, typename T>
class CosSimGradKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input_x = context.Input<Tensor>("X");
    auto* input_y = context.Input<Tensor>("Y");
    auto* input_z = context.Input<Tensor>("Out");
    auto* input_x_norm = context.Input<Tensor>("XNorm");
    auto* input_y_norm = context.Input<Tensor>("YNorm");
    auto* output_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
    auto* output_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
    auto* input_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));

    auto dims = input_x->dims();
    int size = static_cast<int>(framework::product(dims));
    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
    auto x = EigenMatrix<T>::From(*input_x, new_dims);
    auto y = EigenMatrix<T>::From(*input_y, new_dims);
    auto z = EigenMatrix<T>::From(*input_z);
    auto x_norm = EigenMatrix<T>::From(*input_x_norm);
    auto y_norm = EigenMatrix<T>::From(*input_y_norm);
    auto dz = EigenMatrix<T>::From(*input_grad_z);

    Eigen::DSizes<int, 2> bcast(1, new_dims[1]);
    auto z_bcast = z.broadcast(bcast);
    auto dz_bcast = dz.broadcast(bcast);
    auto place = context.GetEigenDevice<Place>();
    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast);
    auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast);
    auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast);
    if (output_grad_x) {
      output_grad_x->mutable_data<T>(context.GetPlace());
      auto dx = EigenMatrix<T>::From(*output_grad_x, new_dims);
      dx.device(place) =
          dz_bcast * (y / norm_prod_bcast - z_bcast * x / x_snorm_bcast);
    }
    if (output_grad_y) {
      output_grad_y->mutable_data<T>(context.GetPlace());
      auto dy = EigenMatrix<T>::From(*output_grad_y, new_dims);
      dy.device(place) =
          dz_bcast * (x / norm_prod_bcast - z_bcast * y / y_snorm_bcast);
    }
  }
};

}  // namespace operators
}  // namespace paddle
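For reference, the two kernels above implement, per row x, y (with cosine similarity z and upstream gradient dz):

$$z = \frac{x \cdot y}{\lVert x \rVert \, \lVert y \rVert}, \qquad dx = dz \left( \frac{y}{\lVert x \rVert \, \lVert y \rVert} - z \, \frac{x}{\lVert x \rVert^{2}} \right), \qquad dy = dz \left( \frac{x}{\lVert x \rVert \, \lVert y \rVert} - z \, \frac{y}{\lVert y \rVert^{2}} \right)$$

XNorm and YNorm hold ‖x‖ and ‖y‖ from the forward pass so the backward pass can reuse them instead of recomputing the norms.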
paddle/operators/gaussian_random_op.cc
@@ -31,8 +31,8 @@ class CPUGaussianRandomKernel : public framework::OpKernel {
     }
     engine.seed(seed);
     std::normal_distribution<T> dist(mean, std);
-    ssize_t size = framework::product(tensor->dims());
-    for (ssize_t i = 0; i < size; ++i) {
+    int64_t size = framework::product(tensor->dims());
+    for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
   }
@@ -46,9 +46,14 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   void InferShape(const framework::InferShapeContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
     auto dims = GetAttr<std::vector<int>>("dims");
+    std::vector<int64_t> temp;
+    temp.reserve(dims.size());
+    for (auto dim : dims) {
+      temp.push_back(static_cast<int64_t>(dim));
+    }
     PADDLE_ENFORCE(dims.size() > 0UL,
                    "dims can be one int or array. dims must be set.");
-    tensor->Resize(framework::make_ddim(dims));
+    tensor->Resize(framework::make_ddim(temp));
   }
 };
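The "dims" attribute stays a vector of plain ints on the Python side and is widened to int64_t here; a hypothetical sketch of how these attributes would be set from Python (import path assumed, mirroring the test utilities later in this diff):

import numpy as np
from paddle.v2.framework.op import Operator  # import path assumed

# Hypothetical usage: "dims" is the attribute converted above;
# "mean" and "std" feed the normal_distribution in the CPU kernel.
op = Operator("gaussian_random", Out="Out", dims=[32, 64], mean=0.0, std=1.0)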
paddle/operators/identity_op.cc (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/net_op.h"
#include "paddle/operators/scale_op.h"

namespace paddle {
namespace operators {

// identity is an alias of the scale op. It is also an example of how to
// create an alias operator.
template <typename AttrType>
class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  IdentityOpMaker(framework::OpProto *proto,
                  framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "input tensor of identity op");
    AddOutput("Out", "output tensor of identity op");
    AddComment("identity operator. Just an alias of the scale op with scale = 1.0");
  }
};

template <typename AttrType>
class IdentityOp : public NetOp {
 public:
  IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
             const framework::VariableNameMap &outputs,
             const framework::AttributeMap &attrs)
      : NetOp(type, inputs, outputs, attrs) {
    AppendOp(framework::OpRegistry::CreateOp(
        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
        {{"scale", static_cast<AttrType>(1)}}));
    CompleteAddOp(false);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
                             ops::IdentityOpMaker<float>);
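A hypothetical Python-side use of the alias, consistent with the Input/Output names declared in IdentityOpMaker (import path assumed):

from paddle.v2.framework.op import Operator  # import path assumed

# Expands to a NetOp containing a single scale op with scale = 1.0,
# so "Out" receives an unmodified copy of "X".
identity_op = Operator("identity", X="X", Out="Out")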
paddle/operators/rnn/recurrent_op_utils.cc
@@ -61,7 +61,7 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
     PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                    outlinks[i].internal);
     f::DDim step_dims = step_scope_var->template GetMutable<Tensor>()->dims();
-    std::vector<int> dims_vec = vectorize(step_dims);
+    std::vector<int64_t> dims_vec = vectorize(step_dims);
     dims_vec.insert(dims_vec.begin(), seq_len);
     output->Resize(f::make_ddim(dims_vec));
   } else {
paddle/operators/scale_op.cc
@@ -48,7 +48,7 @@ The equation is: Out = scale*X
   }
 };

-// Identity Op's gradient is identity op, too.
+// Scale Op's gradient is scale op, too.
 // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
 template <typename AttrType>
 class ScaleGradOp : public NetOp {
@@ -65,34 +65,6 @@ class ScaleGradOp : public NetOp {
   }
 };

-// identity is a alias of scale op. This is also a example for creating a alias
-// operator.
-template <typename AttrType>
-class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  IdentityOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "input tensor of identity op");
-    AddOutput("Out", "output tensor of identity op");
-    AddComment("identity operator. Just a alias of scale op which scale = 1.0");
-  }
-};
-
-template <typename AttrType>
-class IdentityOp : public NetOp {
- public:
-  IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(framework::OpRegistry::CreateOp(
-        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
-        {{"scale", static_cast<AttrType>(1)}}));
-    CompleteAddOp(false);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
@@ -102,5 +74,3 @@ REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
             ops::ScaleGradOp<float>);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
-                             ops::IdentityOpMaker<float>);
paddle/operators/scatter_op.cu (deleted)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define EIGEN_USE_GPU
#include "paddle/operators/scatter_op.h"

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(scatter,
                       ops::ScatterOpKernel<paddle::platform::GPUPlace, float>);
paddle/operators/softmax_op.cc
@@ -24,7 +24,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
-                   "The input of softmax op must be matrix");
+                   "The input of softmax op must be a matrix.");
    ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
  }
};
@@ -34,9 +34,27 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  SoftmaxOpMaker(framework::OpProto *proto,
                 framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "input of softmax");
-    AddOutput("Y", "output of softmax");
-    AddComment("Softmax Op");
+    AddInput("X",
+             "The input tensor of softmax. "
+             "2-D with shape [batch_size, input_feature_dimensions].");
+    AddOutput("Y", "The normalized values with the same shape as X.");
+    AddComment(R"DOC(
+The input of the softmax operator is a 2-D tensor with shape N x K (N is the
+batch_size, K is the dimension of the input feature). The output tensor has
+the same shape as the input tensor.
+
+For each row of the input tensor, the softmax operator squashes the
+K-dimensional vector of arbitrary real values into a K-dimensional vector of
+real values in the range [0, 1] that add up to 1. Specifically, it computes
+the exponential of each dimension and the sum of exponential values over the
+whole K-dimensional vector; the ratio of each dimension's exponential to that
+sum is the output of the softmax operator.
+
+For each row `i` and each column `j` in X, we have:
+
+    Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j]))
+)DOC");
  }
};
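As a quick illustration of the formula in the DOC string, here is a minimal NumPy sketch (function name illustrative; it omits the max-subtraction trick that production implementations use for numerical stability):

import numpy as np

def softmax_rows(x):
    # x: 2-D array of shape [batch_size, input_feature_dimensions]
    e = np.exp(x)                            # exponential of every entry
    return e / e.sum(axis=1, keepdims=True)  # each row now sums to 1

y = softmax_rows(np.random.random((3, 5)).astype("float32"))
assert np.allclose(y.sum(axis=1), 1.0)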
paddle/operators/uniform_random_op.cc
@@ -35,8 +35,8 @@ class CPUUniformRandomKernel : public framework::OpKernel {
     std::uniform_real_distribution<T> dist(
         static_cast<T>(context.GetAttr<float>("min")),
         static_cast<T>(context.GetAttr<float>("max")));
-    ssize_t size = framework::product(tensor->dims());
-    for (ssize_t i = 0; i < size; ++i) {
+    int64_t size = framework::product(tensor->dims());
+    for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
   }
@@ -52,7 +52,12 @@ class UniformRandomOp : public framework::OperatorWithKernel {
                    "uniform_random's min must less then max");
     auto* tensor = ctx.Output<framework::Tensor>("Out");
     auto dims = GetAttr<std::vector<int>>("dims");
-    tensor->Resize(framework::make_ddim(dims));
+    std::vector<int64_t> temp;
+    temp.reserve(dims.size());
+    for (auto dim : dims) {
+      temp.push_back(static_cast<int64_t>(dim));
+    }
+    tensor->Resize(framework::make_ddim(temp));
   }
 };
paddle/pybind/pybind.cc
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace py = pybind11;

-USE_OP(add_two);
+USE_OP(add);
 USE_OP(onehot_cross_entropy);
 USE_OP(sgd);
 USE_OP(mul);
@@ -47,6 +47,7 @@ USE_OP(scale);
 USE_NO_KERNEL_OP(identity);
 USE_NO_KERNEL_OP(fc);
 USE_OP(minus);
+USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
@@ -77,7 +78,7 @@ PYBIND11_PLUGIN(core) {
       .def("get_dims",
            [](const Tensor &self) { return vectorize(self.dims()); })
       .def("set_dims",
-           [](Tensor &self, const std::vector<int> &dim) {
+           [](Tensor &self, const std::vector<int64_t> &dim) {
             self.Resize(make_ddim(dim));
           })
       .def("alloc_float",
paddle/pybind/tensor_py.h
@@ -85,7 +85,7 @@ void PyCPUTensorSetFromArray(
     framework::Tensor &self,
     py::array_t<T, py::array::c_style | py::array::forcecast> array,
     paddle::platform::CPUPlace &place) {
-  std::vector<int> dims;
+  std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
     dims.push_back((int)array.shape()[i]);
@@ -102,7 +102,7 @@ void PyCUDATensorSetFromArray(
     framework::Tensor &self,
     py::array_t<T, py::array::c_style | py::array::forcecast> array,
     paddle::platform::GPUPlace &place) {
-  std::vector<int> dims;
+  std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
     dims.push_back((int)array.shape()[i]);
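These setters back the Python-side tensor API; a rough sketch of the path they serve, with names assumed from the test utilities used elsewhere in this diff:

import numpy as np
import paddle.v2.framework.core as core

scope = core.Scope()
tensor = scope.new_var("X").get_tensor()
tensor.set_dims([32, 64])  # arrives as std::vector<int64_t> after this change
# set() dispatches to PyCPUTensorSetFromArray for a CPUPlace target
tensor.set(np.random.random((32, 64)).astype("float32"), core.CPUPlace())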
python/paddle/v2/framework/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ py_test(test_scope SRCS test_scope.py)
 py_test(test_tensor SRCS test_tensor.py)
 py_test(test_mul_op SRCS test_mul_op.py)
+py_test(test_cos_sim_op SRCS test_cos_sim_op.py)
 py_test(test_mean_op SRCS test_mean_op.py)
python/paddle/v2/framework/tests/gradient_checker.py

python/paddle/v2/framework/tests/op_test_util.py
python/paddle/v2/framework/tests/test_add_two_op.py
@@ -11,7 +11,7 @@ class TestAddOp(unittest.TestCase):
     __metaclass__ = OpTestMeta

     def setUp(self):
-        self.type = "add_two"
+        self.type = "add"
         self.inputs = {
             'X': numpy.random.random((102, 105)).astype("float32"),
             'Y': numpy.random.random((102, 105)).astype("float32")
python/paddle/v2/framework/tests/test_cos_sim_op.py (new file)
import unittest
import numpy as np
from gradient_checker import GradientChecker, create_op
from op_test_util import OpTestMeta


class TestCosSimOp(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
        self.type = "cos_sim"
        self.inputs = {
            'X': np.random.random((32, 64)).astype("float32"),
            'Y': np.random.random((32, 64)).astype("float32")
        }
        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \
            expect_x_norm / expect_y_norm
        self.outputs = {
            'XNorm': np.expand_dims(expect_x_norm, 1),
            'YNorm': np.expand_dims(expect_y_norm, 1),
            'Out': np.expand_dims(expect_out, 1)
        }


class TestCosSimGradOp(GradientChecker):
    def setUp(self):
        self.op = create_op("cos_sim")
        self.inputs = {
            'X': np.random.random((10, 5)).astype("float32"),
            'Y': np.random.random((10, 5)).astype("float32")
        }

    def test_cpu_gpu_compare(self):
        self.compare_grad(self.op, self.inputs)

    def test_normal(self):
        self.check_grad(
            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.05)

    def test_ignore_x(self):
        self.check_grad(
            self.op,
            self.inputs, ["Y"],
            "Out",
            max_relative_error=0.05,
            no_grad_set={"X"})

    def test_ignore_y(self):
        self.check_grad(
            self.op,
            self.inputs, ["X"],
            "Out",
            max_relative_error=0.05,
            no_grad_set={"Y"})


if __name__ == '__main__':
    unittest.main()
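TestCosSimGradOp delegates to gradient_checker, which compares the operator's analytic gradients against numeric ones obtained by perturbing each input element, a finite-difference approximation of roughly the form

$$\frac{\partial \text{Out}}{\partial X_i} \approx \frac{f(X + \epsilon e_i) - f(X - \epsilon e_i)}{2 \epsilon}$$

(the exact stencil lives in gradient_checker.py's get_numeric_gradient). max_relative_error = 0.05 bounds the allowed mismatch, and no_grad_set skips inputs whose gradients are deliberately not computed.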
python/paddle/v2/framework/tests/test_gradient_checker.py
@@ -7,7 +7,7 @@ from gradient_checker import get_numeric_gradient
 class GetNumericGradientTest(unittest.TestCase):
     def test_add_op(self):
-        add_op = Operator('add_two', X="X", Y="Y", Out="Z")
+        add_op = Operator('add', X="X", Y="Y", Out="Z")
         x = numpy.random.random((10, 1)).astype("float32")
         y = numpy.random.random((10, 1)).astype("float32")
python/paddle/v2/framework/tests/test_net.py
@@ -15,7 +15,7 @@ def fc(X, W, Y):
 class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
-        op1 = Operator("add_two", X="X", Y="Y", Out="Out")
+        op1 = Operator("add", X="X", Y="Y", Out="Out")
         net.append_op(op1)
         net2 = core.Net.create()
@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase):
         expected = '''
 Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-Op(add_two), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
+Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
 Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
 Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
 Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
python/paddle/v2/framework/tests/test_operator.py
@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase):
 class TestOpCreations(unittest.TestCase):
     def test_all(self):
-        add_op = op.Operator("add_two", X="a", Y="b", Out="z")
+        add_op = op.Operator("add", X="a", Y="b", Out="z")
         self.assertIsNotNone(add_op)
         # Invoke C++ DebugString()
-        self.assertEqual('Op(add_two), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
+        self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
                          str(add_op))
python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -146,7 +146,7 @@ class TestRecurrentOp(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add_two", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@alias")
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]: