Resolve conflicts

6216b595 · Yi Wang · 2f024ba5 · 737f2bf3 · 6216b595 · 6216b595
35 changed files
--- a/doc/getstarted/build_and_install/docker_install.rst
+++ b/doc/getstarted/build_and_install/docker_install.rst
@@ -19,8 +19,8 @@ automatically runs the following commands:
 .. code-block:: bash
-   docker build -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
+   docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
-   docker build -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+   docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
 To run the CPU-only image as an interactive container:
@@ -81,3 +81,25 @@ source code:
   cd Paddle
   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+Documentation
+-------------
+Paddle Docker images include an HTML version of the C++ source code,
+generated using the `woboq code browser
+<https://github.com/woboq/woboq_codebrowser>`_.  This makes it easy
+for users to browse and understand the C++ source code.
+As long as we give the Paddle Docker container a name, we can run an
+additional nginx Docker container to serve the volume from the Paddle
+container:
+.. code-block:: bash
+   docker run -d --name paddle-cpu-doc paddle:cpu
+   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+Then we can point our web browser to the HTML version of the source code
+at http://localhost:8088/paddle/
--- a/paddle/api/test/run_tests.sh
+++ b/paddle/api/test/run_tests.sh
@@ -20,11 +20,7 @@ popd > /dev/null
 cd $SCRIPTPATH
-if [ ! -f ../../dist/*.whl ] ; then  # Swig not compiled.
+rm -rf .test_env
-  exit 0
-fi
-rm .test_env -rf
 virtualenv .test_env
 source .test_env/bin/activate

--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef HL_MATRIX_TYPE_CUH_
 #define HL_MATRIX_TYPE_CUH_
 #include "hl_base.h"
 #ifdef __CUDA_ARCH__
-// typedef void*  vecType;
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
@@ -37,4 +35,10 @@ typedef __m128d vecType;
 #endif
 #endif
-#endif /* HL_MATRIX_TYPE_CUH_ */
+#ifdef __CUDA_ARCH__  // INLINE expands to `__device__ inline` when __CUDA_ARCH__ is defined, and to plain `inline` otherwise.
+#define INLINE   __device__ inline
+#else
+#define INLINE   inline
+#endif
+#endif  // HL_MATRIX_TYPE_CUH_
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_TENSOR_OPS_H_
+#define HL_TENSOR_OPS_H_
+#include <cmath>
+#include "hl_matrix_type.cuh"
+namespace hppl {
+namespace unary {
+template <class T>
+class add_scale {  // Unary functor: returns a + p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE add_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a + p; }
+};
+template <class T>
+class sub_scale {  // Unary functor: returns a - p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE sub_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a - p; }
+};
+template <class T>
+class mul_scale {  // Unary functor: returns a * p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE mul_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a * p; }
+};
+template <class T>
+class div_scale {  // Unary functor: returns a / p, where p is the value given at construction (no zero check on p).
+private:
+  const T p;
+public:
+  INLINE div_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a / p; }
+};
+template <class T>
+class neg {  // Unary functor: returns -a.
+public:
+  INLINE T operator()(const T a) const { return -a; }
+};
+template <class T>
+class exp_op {  // Unary functor: returns std::exp(a).
+public:
+  INLINE T operator()(const T a) const { return std::exp(a); }
+};
+template <class T>
+class log_op {  // Unary functor: returns std::log(a).
+public:
+  INLINE T operator()(const T a) const { return std::log(a); }
+};
+template <class T>
+class sqrt_op {  // Unary functor: returns std::sqrt(a).
+public:
+  INLINE T operator()(const T a) const { return std::sqrt(a); }
+};
+template <class T>
+class square {  // Unary functor: returns a * a.
+public:
+  INLINE T operator()(const T a) const { return a * a; }
+};
+template <class T>
+class reciprocal {  // Unary functor: returns T(1) / a (no zero check on a).
+public:
+  INLINE T operator()(const T a) const { return T(1) / a; }
+};
+template <class T>
+class abs {  // Unary functor: returns a when a > 0, otherwise -a.
+public:
+  INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
+};
+template <class T>
+class sign {  // Unary functor: returns 1 when a > 0, -1 when a < 0, and 0 otherwise (converted to T).
+public:
+  INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
+};
+template <class T>
+class min {  // Unary functor: returns p when a > p, otherwise a; p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE min(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a > p ? p : a; }
+};
+template <class T>
+class max {  // Unary functor: returns p when a < p, otherwise a; p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE max(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a < p ? p : a; }
+};
+template <class T>
+class pow_op {  // Unary functor: returns std::pow(a, p), where the exponent p is given at construction.
+private:
+  const T p;
+public:
+  INLINE pow_op(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return std::pow(a, p); }
+};
+template <class T>
+class constant {  // Functor that always returns the value p given at construction.
+private:
+  const T p;
+public:
+  INLINE constant(const T s) : p(s) {}
+  INLINE T operator()(int i) const { return p; }  // the flat index i is ignored
+  INLINE T operator()(int i, int j) const { return p; }  // the indices i and j are ignored
+};
+template <class T>
+class cmp_eq {  // Unary predicate: returns a == p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE cmp_eq(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a == p; }
+};
+template <class T>
+class cmp_ne {  // Unary predicate: returns a != p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE cmp_ne(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a != p; }
+};
+template <class T>
+class cmp_le {  // Unary predicate: returns a <= p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE cmp_le(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a <= p; }
+};
+template <class T>
+class cmp_lt {  // Unary predicate: returns a < p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE cmp_lt(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a < p; }
+};
+template <class T>
+class cmp_ge {  // Unary predicate: returns a >= p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE cmp_ge(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a >= p; }
+};
+template <class T>
+class cmp_gt {  // Unary predicate: returns a > p, where p is the value given at construction.
+private:
+  const T p;
+public:
+  INLINE cmp_gt(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a > p; }
+};
+template <class T>
+class and_op {  // Unary predicate: returns the logical AND (a && p) of a and the value p given at construction.
+private:
+  const T p;
+public:
+  INLINE and_op(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a && p; }
+};
+template <class T>
+class or_op {  // Unary predicate: returns the logical OR (a || p) of a and the value p given at construction.
+private:
+  const T p;
+public:
+  INLINE or_op(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a || p; }
+};
+}  // namespace unary
+namespace binary {
+template <class T>
+class add {  // Binary functor: returns a + b.
+public:
+  INLINE T operator()(const T a, const T b) const { return a + b; }
+};
+template <class T>
+class add_scale {  // Binary functor: returns p1 * a + p2 * b, with the weights p1 and p2 given at construction.
+private:
+  const T p1;
+  const T p2;
+public:
+  INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
+  INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
+};
+template <class T>
+class sub {  // Binary functor: returns a - b.
+public:
+  INLINE T operator()(const T a, const T b) const { return a - b; }
+};
+template <class T>
+class mul {  // Binary functor: returns a * b.
+public:
+  INLINE T operator()(const T a, const T b) const { return a * b; }
+};
+template <class T>
+class div {  // Binary functor: returns a / b (no zero check on b).
+public:
+  INLINE T operator()(const T a, const T b) const { return a / b; }
+};
+template <class T>
+class cmp_eq {  // Binary predicate: returns a == b.
+public:
+  INLINE bool operator()(const T a, const T b) const { return a == b; }
+};
+template <class T>
+class cmp_ne {  // Binary predicate: returns a != b.
+public:
+  INLINE bool operator()(const T a, const T b) const { return a != b; }
+};
+template <class T>
+class cmp_le {  // Binary predicate: returns a <= b.
+public:
+  INLINE bool operator()(const T a, const T b) const { return a <= b; }
+};
+template <class T>
+class cmp_lt {  // Binary predicate: returns a < b.
+public:
+  INLINE bool operator()(const T a, const T b) const { return a < b; }
+};
+template <class T>
+class cmp_ge {  // Binary predicate: returns a >= b.
+public:
+  INLINE bool operator()(const T a, const T b) const { return a >= b; }
+};
+template <class T>
+class cmp_gt {  // Binary predicate: returns a > b.
+public:
+  INLINE bool operator()(const T a, const T b) const { return a > b; }
+};
+template <class T>
+class and_op {  // Binary predicate: returns the logical AND (a && b).
+public:
+  INLINE bool operator()(const T a, const T b) const { return a && b; }
+};
+template <class T>
+class or_op {  // Binary predicate: returns the logical OR (a || b).
+public:
+  INLINE bool operator()(const T a, const T b) const { return a || b; }
+};
+template <class T>
+class min {  // Binary functor: returns b when a > b, otherwise a.
+public:
+  INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
+};
+template <class T>
+class max {  // Binary functor: returns b when a < b, otherwise a.
+public:
+  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
+};
+}  // namespace binary
+}  // namespace hppl
+#endif  // HL_TENSOR_OPS_H_
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -289,7 +289,7 @@ void forward(Argument& act) {
                         useGpu(act.deviceId));
  act.in->copyFrom(*act.value);
-  act.value->abs(*act.value);
+  act.value->abs2(*act.value);
 }
 void backward(Argument& act) { act.grad->absDerivative(*act.in); }
@@ -311,7 +311,7 @@ void forward(Argument& act) {
                         useGpu(act.deviceId));
  act.in->copyFrom(*act.value);
-  act.value->square(*act.value);
+  act.value->square2(*act.value);
 }
 void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
@@ -324,7 +324,7 @@ END_DEFINE_ACTIVATION(square)
 * \f]
 */
 BEGIN_DEFINE_ACTIVATION(exponential)
-void forward(Argument& act) { act.value->exp(*act.value); }
+void forward(Argument& act) { act.value->exp2(*act.value); }
 void backward(Argument& act) { act.grad->expDerivative(*act.value); }
 END_DEFINE_ACTIVATION(exponential)
@@ -345,7 +345,7 @@ void forward(Argument& act) {
                         useGpu(act.deviceId));
  act.in->copyFrom(*act.value);
-  act.value->log(*act.value);
+  act.value->log2(*act.value);
 }
 void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }

--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -40,7 +40,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
  savedMean_->mulScalar(1.0 / numSamples);  // E[x]
  tmpMat_->assign(*mat);
-  tmpMat_->square();
+  tmpMat_->square2();
  savedInvVar_->zeroMem();
  savedInvVar_->accumulateColSum(*tmpMat_);
  savedInvVar_->mulScalar(1.0 / numSamples);   // E[x^2]
@@ -54,7 +54,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
  calMovingMeanAndVar();
  savedInvVar_->subScalar(-EPS);
-  savedInvVar_->sqrt(*savedInvVar_);
+  savedInvVar_->sqrt2(*savedInvVar_);
 }
 void BatchNormalizationLayer::calMovingMeanAndVar() {
@@ -85,7 +85,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
  savedInvVar_->downClip(real(0.0));
  savedInvVar_->subScalar(-EPS);
-  savedInvVar_->sqrt(*savedInvVar_);
+  savedInvVar_->sqrt2(*savedInvVar_);
 }
 void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {

--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -115,12 +115,12 @@ void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
                                                    Matrix& target) {
  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
  output.rowSum(*sftMaxSum_);
-  sftMaxSum_->log();
+  sftMaxSum_->log2();
  target.oneHotCrossEntropy(output, *label.ids);
  target.add(*sftMaxSum_);
-  sftMaxSum_->square();
+  sftMaxSum_->square2();
  target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
 }
@@ -131,12 +131,12 @@ void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
  output.rowSum(*sftMaxSum_);
  Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
-  sftMaxSum_->reciprocal(*sumInv_);
+  sftMaxSum_->reciprocal2(*sumInv_);
  outputG.oneHotCrossEntropyBp(output, *label.ids);
  outputG.addColumnVector(*sumInv_);
-  sftMaxSum_->log();
+  sftMaxSum_->log2();
  sumInv_->dotMul(*sumInv_, *sftMaxSum_);
  sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -316,12 +316,12 @@ void Layer::showOutputStats() {
    auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
    min = tmpMat->getMin();
    max = tmpMat->getMax();
-    tmpMat->square();
+    tmpMat->square2();
    LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
  } else {
    min = outSquare->getMin();
    max = outSquare->getMax();
-    outSquare->square();
+    outSquare->square2();
  }
  real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
  std = std > 0 ? std : 0;

--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -60,7 +60,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
  expX_->assign(*matX);
  // subtract max to avoid overflow or underflow
  expX_->mul(maxX_, ones_, (real)-1, (real)1);
-  expX_->exp();
+  expX_->exp2();
  real* a = a_->getData();
  real* b = b_->getData();
@@ -69,7 +69,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
  real* expX = expX_->getData();
  real* maxX = maxX_->getData();
-  expW_->exp(*w_);
+  expW_->exp2(*w_);
  real* expW = expW_->getData();
  for (int i = 0; i < numClasses_; ++i) {

--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -99,7 +99,7 @@ void PowerLayer::backward(const UpdateCallback& callback) {
    Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
    if (inG0) {
-      tmpMtx->log(*inV1);
+      tmpMtx->log2(*inV1);
      tmpMtx->dotMul(*tmpMtx, *outV);
      // inG0 += outG .* (log(inV1) * outV)

--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -355,11 +355,11 @@ void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }
 DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
 template<>
-void BaseMatrixT<real>::exp() { applyUnary(unary::Exp<real>()); }
+void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }
 DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
 template<>
-void BaseMatrixT<real>::log() {
+void BaseMatrixT<real>::log2() {
  if (useGpu_) {
    applyUnary(unary::Log<real>());
  } else {
@@ -369,23 +369,23 @@ void BaseMatrixT<real>::log() {
 DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
 template<>
-void BaseMatrixT<real>::sqrt() { applyUnary(unary::Sqrt<real>()); }
+void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }
 DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
 template<class T>
-void BaseMatrixT<T>::square() { applyUnary(unary::Square<T>()); }
+void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }
 DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
 template<class T>
-void BaseMatrixT<T>::reciprocal() { applyUnary(unary::Reciprocal<T>()); }
+void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }
 DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
 template<class T>
-void BaseMatrixT<T>::abs() { applyUnary(unary::Abs<T>()); }
+void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }
 DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
 template<class T>
-void BaseMatrixT<T>::sign() { applyUnary(unary::Sign<T>()); }
+void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }
 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
 template<class T>
@@ -405,7 +405,7 @@ void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }
 DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
 template<>
-void BaseMatrixT<real>::pow(real p) {
+void BaseMatrixT<real>::pow2(real p) {
  if (useGpu_) {
    applyUnary(unary::Pow<real>(p));
  } else {
@@ -534,7 +534,7 @@ void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
 template<>
-void BaseMatrixT<real>::pow(BaseMatrixT& b, real p) {
+void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
  if (useGpu_) {
    applyBinary(binary::Pow<real>(p), b);
  } else {
@@ -615,7 +615,7 @@ void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
 DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
 template<class T>
-void BaseMatrixT<T>::square(BaseMatrixT& b) {
+void BaseMatrixT<T>::square2(BaseMatrixT& b) {
  applyBinary(binary::Square<T>(), b);
 }
@@ -657,7 +657,7 @@ void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
 DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
 template<class T>
-void BaseMatrixT<T>::reciprocal(BaseMatrixT& b) {
+void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
  applyBinary(binary::Reciprocal<T>(), b);
 }
@@ -669,7 +669,7 @@ void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
 DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
 template<class T>
-void BaseMatrixT<T>::abs(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
+void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
 DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
 template<class T>
@@ -729,17 +729,19 @@ void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
 DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
 template<class T>
-void BaseMatrixT<T>::sign(BaseMatrixT& b) { applyBinary(binary::Sign<T>(), b); }
+void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
+  applyBinary(binary::Sign<T>(), b);
+}
 DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
 template<>
-void BaseMatrixT<real>::exp(BaseMatrixT& b) {
+void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
  applyBinary(binary::Exp<real>(), b);
 }
 DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
 template<>
-void BaseMatrixT<real>::log(BaseMatrixT& b) {
+void BaseMatrixT<real>::log2(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Log<real>(), b);
  } else {
@@ -749,7 +751,7 @@ void BaseMatrixT<real>::log(BaseMatrixT& b) {
 DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
 template<>
-void BaseMatrixT<real>::sqrt(BaseMatrixT& b) {
+void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
  applyBinary(binary::Sqrt<real>(), b);
 }
@@ -1065,7 +1067,7 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
 DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
 template<class T>
-void BaseMatrixT<T>::max(BaseMatrixT& b, BaseMatrixT& c) {  // NOLINT
+void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Max<T>(), b, c);
 }
@@ -1168,7 +1170,7 @@ void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2,
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
                                  a = 1 / (p1 * b + p2));
 template<class T>
-void BaseMatrixT<T>::reciprocal(BaseMatrixT& b, T p1, T p2) {
+void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
 }

--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cstddef>
 #include <stdint.h>
 #include "paddle/utils/TypeDefs.h"
+#include "TensorExpression.h"
 namespace paddle {
@@ -70,7 +71,7 @@ public:
 };
 template <class T>
-class BaseMatrixT {
+class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
 public:
  size_t height_, width_;
  size_t stride_;
@@ -427,14 +428,14 @@ public:
   *
   */
  void neg();
-  void exp();
+  void exp2();
-  void pow(T p);
+  void pow2(T p);
-  void log();
+  void log2();
-  void sqrt();
+  void sqrt2();
-  void square();
+  void square2();
-  void reciprocal();
+  void reciprocal2();
-  void abs();
+  void abs2();
-  void sign();
+  void sign2();
  void zero();
  /**
@@ -603,7 +604,7 @@ public:
   * b = this * this
   * @endcode
   */
-  void square(BaseMatrixT& b);
+  void square2(BaseMatrixT& b);
  void squareDerivative(BaseMatrixT& b);
  /**
@@ -627,7 +628,7 @@ public:
   * b = 1.0f / this
   * @endcode
   */
-  void reciprocal(BaseMatrixT& b);
+  void reciprocal2(BaseMatrixT& b);
  void reciprocalDerivative(BaseMatrixT& b);
  /**
@@ -635,7 +636,7 @@ public:
   * b = this > 0.0f ? this : -this
   * @endcode
   */
-  void abs(BaseMatrixT& b);
+  void abs2(BaseMatrixT& b);
  void absDerivative(BaseMatrixT& b);
  /**
@@ -653,12 +654,12 @@ public:
   */
  void expDerivative(BaseMatrixT& b);
-  void sign(BaseMatrixT& b);
+  void sign2(BaseMatrixT& b);
-  void exp(BaseMatrixT& b);
+  void exp2(BaseMatrixT& b);
-  void pow(BaseMatrixT& b, T p);
+  void pow2(BaseMatrixT& b, T p);
-  void log(BaseMatrixT& b);
+  void log2(BaseMatrixT& b);
-  void sqrt(BaseMatrixT& b);
+  void sqrt2(BaseMatrixT& b);
  void addScalar(BaseMatrixT& b, T p);
  void subScalar(BaseMatrixT& b, T p);
  void mulScalar(BaseMatrixT& b, T p);
@@ -828,7 +829,7 @@ public:
   * this = b>c ? b : c
   * @endcode
   */
-  void max(BaseMatrixT& b, BaseMatrixT& c);  //  NOLINT
+  void max2(BaseMatrixT& b, BaseMatrixT& c);
  /**
   * @code
@@ -927,7 +928,7 @@ public:
   * this = 1 / (p1 * b + p2)
   * @endcode
   */
-  void reciprocal(BaseMatrixT& b, T p1, T p2);
+  void reciprocal2(BaseMatrixT& b, T p1, T p2);
  /**
   * @code
@@ -1050,6 +1051,32 @@ public:
  void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
  virtual bool isSparse() const { return false; }
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {  // Assigns from an expression object; returns void, so assignments cannot be chained.
+    if (useGpu_) {
+      TensorGpuApply<T>(*this, expr);  // useGpu_ is set: pass (*this, expr) to the GPU apply routine
+    } else {
+      TensorCpuApply<T>(*this, expr);  // otherwise pass (*this, expr) to the CPU apply routine
+    }
+  }
+  template <typename ExpressionType>
+  void operator+=(const ExpressionType& expr) {  // Assigns the result of (*this) + expr back to *this; returns void.
+    (*this) = (*this) + expr;
+  }
+  template <typename ExpressionType>
+  void operator-=(const ExpressionType& expr) {  // Assigns the result of (*this) - expr back to *this; returns void.
+    (*this) = (*this) - expr;
+  }
+  template <typename ExpressionType>
+  void operator*=(const ExpressionType& expr) {  // Assigns the result of (*this) * expr back to *this; returns void.
+    (*this) = (*this) * expr;
+  }
+  template <typename ExpressionType>
+  void operator/=(const ExpressionType& expr) {  // Assigns the result of (*this) / expr back to *this; returns void.
+    (*this) = (*this) / expr;
+  }
 };
 typedef BaseMatrixT<real> BaseMatrix;

--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -16,10 +16,12 @@ file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
 set(MATH_SOURCES
    "${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
+    "${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
    ${MATH_SOURCES})
 if(NOT WITH_GPU)
    # then compile BaseMatrix.cu as c++ file
    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
+    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
    add_library(paddle_math STATIC
        ${MATH_SOURCES})
 else()

--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -136,7 +136,7 @@ public:
    return sum;
  }
-  virtual void square() {
+  virtual void square2() {
    CHECK(isContiguous());
    if (valueType_ == NO_VALUE) {
      return;

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -1122,6 +1122,7 @@ public:
  virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
    LOG(FATAL) << "Not implemented";
  }
  virtual void bilinearForward(const Matrix& in,
                               const size_t inImgH,
                               const size_t inImgW,
@@ -1142,6 +1143,15 @@ public:
                                const real ratioW) {
    LOG(FATAL) << "Not implemented";
  }
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {  // Assigns from an expression object, evaluated with element type `real`; returns void.
+    if (useGpu_) {
+      TensorGpuApply<real>(*this, expr);  // useGpu_ is set: pass (*this, expr) to the GPU apply routine
+    } else {
+      TensorCpuApply<real>(*this, expr);  // otherwise pass (*this, expr) to the CPU apply routine
+    }
+  }
 };
 inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
@@ -1518,6 +1528,11 @@ public:
  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {  // Assigns from an expression object; always uses TensorGpuApply (no useGpu_ branch here). NOTE(review): enclosing class is outside this hunk, presumably the GPU matrix class; confirm.
+    TensorGpuApply<real>(*this, expr);
+  }
 };
 class CpuMatrix : public Matrix {
@@ -1917,6 +1932,11 @@ public:
                        const size_t numChannels,
                        const real ratioH,
                        const real ratioW);
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {  // Assigns from an expression object; always uses TensorCpuApply (no useGpu_ branch here). NOTE(review): enclosing class is outside this hunk, presumably the CPU matrix class; confirm.
+    TensorCpuApply<real>(*this, expr);
+  }
 };
 class SharedCpuMatrix : public CpuMatrix {
@@ -1957,6 +1977,7 @@ public:
  void add(real p1, real p2);
 private:
+  using Matrix::mul;
  void initShared(int blockNum);
  void initBlock(int blockNum);

--- a/paddle/math/TensorApply.h
+++ b/paddle/math/TensorApply.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+namespace paddle {
+/**
+ * \brief The tensor evaluator classes.
+ *
+ * Primary template: a writable view over the storage of a Derived object.
+ * The constructor copies the data pointer, stride, height, width and GPU
+ * flag out of the source object; it does not copy any element data.
+ *
+ * apply(i, j) reads the element at offset i * stride_ + j, apply(index)
+ * reads the element at the given flat offset, and the applyRef() overloads
+ * return a mutable reference to the same locations.
+ */
+template <typename Derived, class T>
+class TensorApply {
+public:
+  explicit INLINE TensorApply(const Derived& p)
+      : data_(p.data_),
+        stride_(p.stride_),
+        height_(p.height_),
+        width_(p.width_),
+        useGpu_(p.useGpu_) {}
+  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
+  INLINE T apply(int index) const { return data_[index]; }
+  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
+  INLINE T& applyRef(int index) { return data_[index]; }
+  INLINE size_t getWidth() const { return width_; }
+  INLINE size_t getHeight() const { return height_; }
+  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }  // true when the stride equals the width, or there is a single row
+  INLINE bool useGpu() const { return useGpu_; }
+  T* data_;
+  size_t stride_;
+  size_t height_;
+  size_t width_;
+  bool useGpu_;
+};
+/**
+ * \brief The tensor evaluator classes.
+ * evaluator for rvalues
+ *
+ * Specialization for a const Derived operand. It copies the same fields as
+ * the primary template but stores the data pointer as `const T*` and
+ * provides no applyRef(), so elements can only be read through it.
+ */
+template <typename Derived, class T>
+class TensorApply<const Derived, T> {
+public:
+  explicit INLINE TensorApply(const Derived& p)
+      : data_(p.data_),
+        stride_(p.stride_),
+        height_(p.height_),
+        width_(p.width_),
+        useGpu_(p.useGpu_) {}
+  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
+  INLINE T apply(int index) const { return data_[index]; }
+  INLINE size_t getWidth() const { return width_; }
+  INLINE size_t getHeight() const { return height_; }
+  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }  // true when the stride equals the width, or there is a single row
+  INLINE bool useGpu() const { return useGpu_; }
+  const T* data_;
+  size_t stride_;
+  size_t height_;
+  size_t width_;
+  bool useGpu_;
+};
+template <typename Derived, class T>
+class TensorApply<const TensorExpression<Derived, T>, T> {  // Evaluator for a TensorExpression reference: wraps the evaluator of expr.derived() and forwards every query to it.
+public:
+  explicit TensorApply(const TensorExpression<Derived, T>& expr)  // unlike the other evaluators in this file, this constructor is not marked INLINE
+      : expr_(expr.derived()) {}
+  INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
+  INLINE T apply(int index) const { return expr_.apply(index); }
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return expr_.isContiguous(); }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+  TensorApply<const Derived, T> expr_;
+};
+/**
+ * \brief The unary expression evaluator classes.
+ *
+ * Holds a copy of the functor OP and the evaluator of the argument
+ * expression. apply() evaluates the argument at the requested position and
+ * passes the value through op_. Width, height, contiguity and the GPU flag
+ * are those reported by the argument's evaluator.
+ */
+template <class OP, typename ArgType, class T>
+class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
+public:
+  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
+      : op_(expr.op_), expr_(expr.expr_) {}
+  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
+  INLINE T apply(int index) const { return op_(expr_.apply(index)); }
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return expr_.isContiguous(); }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+  const OP op_;
+  TensorApply<ArgType, T> expr_;
+};
+/**
+ * \brief The binary expression evaluator classes.
+ *
+ * Holds a copy of the functor OP and one evaluator per operand. apply()
+ * evaluates both operands at the same position and combines the two values
+ * with op_.
+ *
+ * When __CUDA_ARCH__ is not defined, the constructor CHECKs that both
+ * operands report the same width, height and useGpu() value.
+ */
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
+public:
+  explicit INLINE TensorApply(
+      const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
+      : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+#endif
+  }
+  INLINE T apply(int i, int j) const {
+    return op_(lhs_.apply(i, j), rhs_.apply(i, j));
+  }
+  INLINE T apply(int index) const {
+    return op_(lhs_.apply(index), rhs_.apply(index));
+  }
+  INLINE size_t getWidth() const { return lhs_.getWidth(); }
+  INLINE size_t getHeight() const { return rhs_.getHeight(); }  // read from rhs_, while the width is read from lhs_; the constructor checks require the operands to agree
+  INLINE bool isContiguous() const {
+    return lhs_.isContiguous() && rhs_.isContiguous();
+  }
+  INLINE bool useGpu() const { return lhs_.useGpu(); }
+  const OP op_;
+  TensorApply<LhsType, T> lhs_;
+  TensorApply<RhsType, T> rhs_;
+};
+/**
+ * \brief The ternary expression evaluator classes.
+ *
+ * Holds one evaluator per operand. apply() evaluates expr1_ at the requested
+ * position; if that value converts to true it returns the value of expr2_
+ * at the same position, otherwise the value of expr3_.
+ *
+ * When __CUDA_ARCH__ is not defined, the constructor CHECKs that expr2_ and
+ * expr3_ report the same width, height and useGpu() value as expr1_.
+ */
+template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
+class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
+public:
+  explicit INLINE TensorApply(
+      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
+      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
+    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
+    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
+    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
+    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
+    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
+#endif
+  }
+  INLINE T apply(int i, int j) const {
+    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
+  }
+  INLINE T apply(int index) const {
+    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
+  }
+  INLINE size_t getWidth() const { return expr1_.getWidth(); }
+  INLINE size_t getHeight() const { return expr1_.getHeight(); }
+  INLINE bool isContiguous() const {
+    return expr1_.isContiguous() && expr2_.isContiguous() &&
+           expr3_.isContiguous();
+  }
+  INLINE bool useGpu() const { return expr1_.useGpu(); }
+  TensorApply<ArgType1, T> expr1_;
+  TensorApply<ArgType2, T> expr2_;
+  TensorApply<ArgType3, T> expr3_;
+};
+/**
+ * \brief The const expression evaluator classes.
+ *
+ * Holds a copy of the functor OP and the evaluator of an argument
+ * expression. apply() never reads the argument's elements: it returns
+ * op_(i, j) or op_(index). The argument is only used to report width,
+ * height and the GPU flag, and isContiguous() always returns true.
+ */
+template <class OP, typename ArgType, class T>
+class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
+public:
+  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
+      : op_(expr.op_), expr_(expr.expr_) {}
+  INLINE T apply(int i, int j) const { return op_(i, j); }
+  INLINE T apply(int index) const { return op_(index); }
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return true; }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+  const OP op_;
+  TensorApply<ArgType, T> expr_;
+};
+}  // namespace paddle
--- a/paddle/math/TensorAssign.h
+++ b/paddle/math/TensorAssign.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/utils/Logging.h"
+namespace paddle {
+/**
+ * \brief Deferred element-wise assignment (created by lazyAssign and
+ * executed by AssignEvaluate).
+ *
+ * Construction only wraps both sides in their TensorApply evaluators and,
+ * in host code, checks that they agree on width, height and device.
+ * Nothing is written until one of the apply() overloads is called.
+ */
+template <typename LhsType, typename RhsType, class T>
+class TensorAssignOp {
+public:
+  explicit TensorAssignOp(const LhsType& target, const RhsType& source)
+      : lhs_(target), rhs_(source) {
+// The consistency checks are compiled out of device code.
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+#endif
+  }
+  /// Assign the element at row i, column j.
+  INLINE void apply(const int i, const int j) {
+    lhs_.applyRef(i, j) = rhs_.apply(i, j);
+  }
+  /// Assign the element at the flat position index.
+  INLINE void apply(const int index) {
+    lhs_.applyRef(index) = rhs_.apply(index);
+  }
+  INLINE size_t getWidth() const {
+    return lhs_.getWidth();
+  }
+  // Read from the right-hand side; the constructor checks (on the host)
+  // that both sides report the same height.
+  INLINE size_t getHeight() const {
+    return rhs_.getHeight();
+  }
+  /// True only when both sides are contiguous.
+  INLINE bool isContiguous() const {
+    return lhs_.isContiguous() && rhs_.isContiguous();
+  }
+  INLINE bool useGpu() const {
+    return lhs_.useGpu();
+  }
+private:
+  TensorApply<LhsType, T> lhs_;        // destination evaluator
+  TensorApply<const RhsType, T> rhs_;  // source evaluator
+};
+/**
+ * \brief Run one or more assignment expressions on the CPU.
+ *
+ * For every element, assign is applied first and then each expression in
+ * args, in argument order (braced-init-list expansion is left-to-right).
+ *
+ * \param height        number of rows to visit
+ * \param width         number of columns to visit
+ * \param isContiguous  if true, elements are visited through the flat
+ *                      apply(index) overload; otherwise through apply(i, j)
+ * \param assign        first assignment expression
+ * \param args          further assignment expressions (may be empty)
+ */
+template <typename Assign, typename... AssignOp>
+void AssignCpuEvaluate(int height,
+                       int width,
+                       bool isContiguous,
+                       Assign&& assign,
+                       AssignOp&&... args) {
+  if (isContiguous) {
+    const int size = height * width;
+    for (int index = 0; index < size; index++) {
+      assign.apply(index);
+      // The leading 0 keeps the array non-empty when args is an empty pack;
+      // a zero-length array is not valid standard C++.
+      __attribute__((unused)) int dummy[] = {0, ((args).apply(index), 0)...};
+    }
+  } else {
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        assign.apply(i, j);
+        // Leading 0: see the contiguous branch.
+        __attribute__((unused)) int dummy[] = {0, ((args).apply(i, j), 0)...};
+      }
+    }
+  }
+}
+#ifdef __NVCC__
+/**
+ * \brief Kernel that applies the assignment expressions through their flat
+ * apply(idx) overload.
+ *
+ * Each thread computes idx from its block and thread index and handles
+ * that single position if it is below border; assign runs first, then
+ * each expression in args in order.
+ */
+template <typename Assign, typename... AssignOp>
+__global__ void AssignGpuEvaluate1(const int border,
+                                   Assign assign,
+                                   AssignOp... args) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    assign.apply(idx);
+    // The leading 0 keeps the array non-empty when args is an empty pack.
+    __attribute__((unused)) int dummy[] = {0, ((args).apply(idx), 0)...};
+  }
+}
+/**
+ * \brief Kernel that applies the assignment expressions through their
+ * apply(i, j) overload.
+ *
+ * Each thread starts at its own (row, column) and advances by
+ * gridDim * blockDim in each dimension until height / width is reached;
+ * at every position assign runs first, then each expression in args.
+ */
+template <typename Assign, typename... AssignOp>
+__global__ void AssignGpuEvaluate2(const int height,
+                                   const int width,
+                                   Assign assign,
+                                   AssignOp... args) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
+      assign.apply(i, j);
+      // The leading 0 keeps the array non-empty when args is an empty pack.
+      __attribute__((unused)) int dummy[] = {0, ((args).apply(i, j), 0)...};
+    }
+  }
+}
+#endif
+/**
+ * \brief Evaluate one or more TensorAssignOp objects.
+ *
+ * All expressions must agree on device, height and width (checked with
+ * CHECK_EQ). The flat code path is used only if every expression is
+ * contiguous. Empty tensors (height or width of 0) return after the
+ * checks without evaluating anything.
+ *
+ * \note At least one assignment expression is required
+ * \note If the expressions live on the GPU but this header is not being
+ *       compiled by nvcc, the GPU branch is empty and nothing is evaluated.
+ */
+template <typename Assign, typename... AssignOp>
+void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
+  const bool useGpu_ = assign.useGpu();
+  bool isContiguous_ = assign.isContiguous();
+  const size_t height = assign.getHeight();
+  const size_t width = assign.getWidth();
+  // Slot 0 of each array repeats the first expression's own value, so the
+  // arrays are never zero-length when only a single expression is passed.
+  const int packSize = sizeof...(args) + 1;
+  const bool packUseGpu[] = {useGpu_, (args).useGpu()...};
+  const bool packIsContiguous[] = {isContiguous_, (args).isContiguous()...};
+  const size_t packHeight[] = {height, (args).getHeight()...};
+  const size_t packWidth[] = {width, (args).getWidth()...};
+  for (int i = 0; i < packSize; i++) {
+    CHECK_EQ(useGpu_, packUseGpu[i]);
+    CHECK_EQ(height, packHeight[i]);
+    CHECK_EQ(width, packWidth[i]);
+    isContiguous_ = isContiguous_ && packIsContiguous[i];
+  }
+  // Nothing to assign. Returning here also keeps the GPU branch from
+  // computing 32 / 0 (blockSizeY) or launching a kernel with 0 threads.
+  if (height == 0 || width == 0) {
+    return;
+  }
+  if (useGpu_) {
+#ifdef __NVCC__
+    if (isContiguous_) {
+      // One thread per element, at most 1024 threads per block.
+      int size = height * width;
+      int blockSize = size <= 1024 ? size : 1024;
+      int gridSize = (size + 1024 - 1) / 1024;
+      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+          size, assign, args...);
+    } else {
+      // 2-D launch capped at 32 x 32 blocks; the kernel loops over the
+      // remaining rows/columns itself.
+      int blockSizeY = std::min(32, (int)height);
+      int blockSizeX = (32 / blockSizeY) * 32;
+      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
+      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
+      dim3 threads(blockSizeX, blockSizeY);
+      dim3 grid(gridSizeX, gridSizeY);
+      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
+          height, width, assign, args...);
+    }
+    CHECK_SYNC("AssignEvaluate failed");
+#endif
+  } else {
+    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
+  }
+}
+}  // namespace paddle
--- a/paddle/math/TensorEvaluate.h
+++ b/paddle/math/TensorEvaluate.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/utils/Logging.h"
+#include "hl_base.h"
+namespace paddle {
+/**
+ * \brief Evaluate "lhs = rhs" element by element on the CPU.
+ *
+ * Both sides are wrapped in TensorApply evaluators and must agree on
+ * width, height and device. When both are contiguous the elements are
+ * visited by flat index, otherwise by (row, column).
+ */
+template <class T, typename LeftType, typename RightType>
+inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
+  TensorApply<LeftType, T> lhs_(lhs);
+  TensorApply<const RightType, T> rhs_(rhs);
+  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+  const int rows = lhs_.getHeight();
+  const int cols = lhs_.getWidth();
+  if (!lhs_.isContiguous() || !rhs_.isContiguous()) {
+    // General case: address every element by (row, column).
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        lhs_.applyRef(r, c) = rhs_.apply(r, c);
+      }
+    }
+    return;
+  }
+  // Both sides contiguous: a single flat sweep is enough.
+  const int total = rows * cols;
+  for (int k = 0; k < total; ++k) {
+    lhs_.applyRef(k) = rhs_.apply(k);
+  }
+}
+#ifdef __NVCC__
+// Flat kernel: each thread assigns the single element at its global index,
+// provided that index is below border.
+template <typename LeftType, typename RightType>
+__global__ void TensorElementWiseOp(LeftType lhs,
+                                    RightType rhs,
+                                    const int border) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= border) {
+    return;
+  }
+  lhs.applyRef(idx) = rhs.apply(idx);
+}
+// 2-D kernel: each thread starts at its own (row, column) and steps by
+// gridDim * blockDim in each dimension until the bounds of lhs are reached.
+template <typename LeftType, typename RightType>
+__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int i = rowIdx;
+  while (i < lhs.getHeight()) {
+    int j = colIdx;
+    while (j < lhs.getWidth()) {
+      lhs.applyRef(i, j) = rhs.apply(i, j);
+      j += gridDim.x * blockDim.x;
+    }
+    i += gridDim.y * blockDim.y;
+  }
+}
+/**
+ * \brief The tensor gpu evaluate api.
+ *
+ * Wraps both sides in TensorApply evaluators, checks that they agree on
+ * width, height and device, and launches TensorElementWiseOp on
+ * STREAM_DEFAULT: the flat kernel when both sides are contiguous, the 2-D
+ * kernel otherwise. Empty tensors (height or width of 0) return after the
+ * checks without launching anything.
+ */
+template <class T, typename LeftType, typename RightType>
+inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
+  TensorApply<LeftType, T> lhs_(lhs);
+  TensorApply<const RightType, T> rhs_(rhs);
+  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+  int dimM = lhs_.getHeight();
+  int dimN = lhs_.getWidth();
+  // Nothing to assign. Returning here also avoids 32 / 0 when computing
+  // blockSizeX and a kernel launch with 0 threads.
+  if (dimM == 0 || dimN == 0) {
+    return;
+  }
+  if (lhs_.isContiguous() && rhs_.isContiguous()) {
+    // One thread per element, at most 1024 threads per block.
+    int size = dimM * dimN;
+    int blockSize = size <= 1024 ? size : 1024;
+    int gridSize = (size + 1024 - 1) / 1024;
+    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+        lhs_, rhs_, size);
+  } else {
+    // 2-D launch capped at 32 x 32 blocks; the kernel loops over the rest.
+    int blockSizeY = std::min(32, dimM);
+    int blockSizeX = (32 / blockSizeY) * 32;
+    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
+    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
+    dim3 threads(blockSizeX, blockSizeY);
+    dim3 grid(gridSizeX, gridSizeY);
+    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
+  }
+  CHECK_SYNC("TensorGpuApply failed");
+}
+#else
+/**
+ * \brief Empty stand-in for TensorGpuApply when this header is not
+ * compiled by nvcc.
+ *
+ * The signature mirrors the nvcc version (rhs by const reference) so that
+ * a call passing a temporary expression compiles in both configurations.
+ */
+template <class T, typename LeftType, typename RightType>
+inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {}
+#endif
+}  // namespace paddle
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cstddef>
+#include <stdint.h>
+#include "paddle/utils/TypeDefs.h"
+#include "paddle/utils/Logging.h"
+#include "hl_tensor_ops.h"
+namespace paddle {
+template <class OP, typename ExprType, class T>
+class TensorConstant;
+template <class OP, typename ExprType, class T>
+class TensorUnaryOp;
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorBinaryOp;
+template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
+class TensorTernaryOp;
+template <typename LhsType, typename RhsType, class T>
+class TensorAssignOp;
+/**
+ * \brief Tensor base class.
+ *
+ * This is the base class of all Tensor and Expression class.
+ *
+ * Derived passes its own type as the first template argument; derived()
+ * casts this back to it. Every member below only builds and returns an
+ * expression object holding a functor and its operand(s); no element is
+ * computed in this class.
+ */
+template <typename Derived, class T>
+class TensorExpression {
+public:
+  /**
+   * Element wise unary expression.
+   *
+   * Generic builder: pairs the functor op with this expression.
+   */
+  template <typename UnaryOp>
+  const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
+      const UnaryOp& op) const {
+    return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
+  }
+  // Arithmetic with a scalar p on the right-hand side.
+  const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
+      T p) const {
+    return unaryExpression(hppl::unary::add_scale<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
+      T p) const {
+    return unaryExpression(hppl::unary::sub_scale<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
+      T p) const {
+    return unaryExpression(hppl::unary::mul_scale<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
+      T p) const {
+    return unaryExpression(hppl::unary::div_scale<T>(p));
+  }
+  // Unary minus and element-wise math functions.
+  const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
+    return unaryExpression(hppl::unary::neg<T>());
+  }
+  const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
+    return unaryExpression(hppl::unary::exp_op<T>());
+  }
+  const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
+    return unaryExpression(hppl::unary::log_op<T>());
+  }
+  const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
+    return unaryExpression(hppl::unary::sqrt_op<T>());
+  }
+  const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
+    return unaryExpression(hppl::unary::square<T>());
+  }
+  const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
+      const {
+    return unaryExpression(hppl::unary::reciprocal<T>());
+  }
+  const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
+    return unaryExpression(hppl::unary::abs<T>());
+  }
+  const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
+    return unaryExpression(hppl::unary::sign<T>());
+  }
+  const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
+    return unaryExpression(hppl::unary::pow_op<T>(p));
+  }
+  // min / max against a scalar p.
+  const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
+    return unaryExpression(hppl::unary::min<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
+    return unaryExpression(hppl::unary::max<T>(p));
+  }
+  // Comparisons against a scalar p. These return expression objects, not
+  // bool.
+  const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_eq<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_ne<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_le<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_lt<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_ge<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_gt<T>(p));
+  }
+  // Logical and / or with a scalar p. As overloaded operators they do not
+  // short-circuit.
+  const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
+      T p) const {
+    return unaryExpression(hppl::unary::and_op<T>(p));
+  }
+  const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
+      T p) const {
+    return unaryExpression(hppl::unary::or_op<T>(p));
+  }
+  /**
+   * Element wise binary expression.
+   *
+   * Generic builder: pairs the functor op with this expression (left
+   * operand) and expr (right operand).
+   */
+  template <typename BinaryOp, typename ExpressionType>
+  const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
+  binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
+    return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
+        op, derived(), expr);
+  }
+  // Comparisons against another expression.
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_eq<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator==(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_ne<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator!=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_le<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator<=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_le<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_lt<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator<(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_ge<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator>=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_gt<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator>(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
+  }
+  // Logical and / or with another expression (no short-circuit).
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::and_op<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator&&(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::and_op<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::or_op<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator||(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::or_op<T>(), expr);
+  }
+  // Arithmetic with another expression.
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::add<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator+(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::add<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::sub<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator-(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::sub<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::mul<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator*(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::mul<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::div<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator/(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::div<T>(), expr);
+  }
+  // min / max against another expression.
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::min<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  min(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::min<T>(), expr);
+  }
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::max<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  max(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::max<T>(), expr);
+  }
+  /**
+   * Element wise ternary expression.
+   *
+   * ternary conditional operator(?: operator).
+   * The conditional expression returns one of two values depending on
+   * the result of derived expression.
+   * If derived expression evaluates to true, then expression1 is evaluated.
+   * If derived expression evaluates to false, then expression2 is evaluated.
+   */
+  template <typename ExprType1, typename ExprType2>
+  const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
+  condition(const ExprType1& expr1, const ExprType2& expr2) const {
+    return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
+        derived(), expr1, expr2);
+  }
+  // Overloads taking a scalar for either or both branches; each scalar is
+  // wrapped with constant() and forwarded to the general overload above.
+  template <typename ExprType>
+  const TensorTernaryOp<
+      const Derived,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      const ExprType,
+      T>
+  condition(T p, const ExprType& expr) const {
+    return condition(constant(p), expr);
+  }
+  template <typename ExprType>
+  const TensorTernaryOp<
+      const Derived,
+      const ExprType,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      T>
+  condition(const ExprType& expr, T p) const {
+    return condition(expr, constant(p));
+  }
+  const TensorTernaryOp<
+      const Derived,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      T>
+  condition(T p1, T p2) const {
+    return condition(constant(p1), constant(p2));
+  }
+  /**
+   * Return a TensorConstant holding the constant value p. The constant
+   * also stores this expression (derived()) alongside the value.
+   */
+  const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
+      T p) const {
+    return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
+        hppl::unary::constant<T>(p), derived());
+  }
+  /**
+   * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
+   * TensorAssignOp objects.
+   *
+   * This tensor is the assignment target and expr the source. Although the
+   * method is const, evaluating the returned object writes into this
+   * tensor's elements (TensorAssignOp::apply assigns through applyRef).
+   */
+  template <typename ExpressionType>
+  TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
+      const ExpressionType& expr) const {
+    return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
+  }
+protected:
+  // Downcast to the concrete expression type named by the Derived parameter.
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+/**
+ * \brief Unary Operator Expression
+ *
+ * Holds an element-wise functor together with a by-value copy of the
+ * operand expression it is applied to.
+ */
+template <class OP, typename ExprType, class T>
+class TensorUnaryOp
+    : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
+public:
+  explicit TensorUnaryOp(const OP functor, const ExprType& operand)
+      : op_(functor), expr_(operand) {}
+  const OP op_;          // element-wise functor
+  const ExprType expr_;  // operand expression, stored by value
+};
+/**
+ * \brief Binary Operator Expression
+ *
+ * Holds an element-wise functor together with by-value copies of its left
+ * and right operand expressions.
+ */
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorBinaryOp
+    : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
+public:
+  explicit TensorBinaryOp(const OP functor,
+                          const LhsType& left,
+                          const RhsType& right)
+      : op_(functor), lhs_(left), rhs_(right) {}
+  const OP op_;        // element-wise functor
+  const LhsType lhs_;  // left operand, stored by value
+  const RhsType rhs_;  // right operand, stored by value
+};
+/**
+ * \brief Ternary Operator Expression
+ *
+ * Holds by-value copies of three expressions. TensorExpression::condition
+ * builds it with the condition first, followed by the two branch
+ * expressions.
+ */
+template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
+class TensorTernaryOp : public TensorExpression<
+                            TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
+                            T> {
+public:
+  explicit TensorTernaryOp(const ExprType1& first,
+                           const ExprType2& second,
+                           const ExprType3& third)
+      : expr1_(first), expr2_(second), expr3_(third) {}
+  const ExprType1 expr1_;  // first expression
+  const ExprType2 expr2_;  // second expression
+  const ExprType3 expr3_;  // third expression
+};
+/**
+ * \brief Constant Expression
+ *
+ * Holds a functor together with a by-value copy of an expression.
+ */
+template <class OP, typename ExprType, class T>
+class TensorConstant
+    : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
+public:
+  explicit TensorConstant(const OP functor, const ExprType& reference)
+      : op_(functor), expr_(reference) {}
+  const OP op_;          // functor
+  const ExprType expr_;  // associated expression, stored by value
+};
+/**
+ * \brief operator+ overload for a scalar on the left-hand side (p + expr)
+ * \return the same unary operator expression as expr + p
+ */
+template <typename Derived, class T>
+const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
+    T p, const TensorExpression<Derived, T>& expr) {
+  // Forward to the member overload with the operands swapped.
+  return expr.operator+(p);
+}
+/**
+ * \brief operator* overload for a scalar on the left-hand side (p * expr)
+ * \return the same unary operator expression as expr * p
+ */
+template <typename Derived, class T>
+const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
+    T p, const TensorExpression<Derived, T>& expr) {
+  // Forward to the member overload with the operands swapped.
+  return expr.operator*(p);
+}
+}  // namespace paddle
+#include "TensorApply.h"
+#include "TensorEvaluate.h"
--- a/paddle/math/TrainingAlgorithmOp.cu
+++ b/paddle/math/TrainingAlgorithmOp.cu
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/utils/Logging.h"
+#include "BaseMatrix.h"
+#include "TrainingAlgorithmOp.h"
+#if __cplusplus > 199711L
+#include "TensorAssign.h"
+namespace paddle {
+// Sparse-momentum update written as three lazy assignments evaluated
+// together by AssignEvaluate, in this order:
+//   momU  = momU - (alpha * gamma * learningRate) * grad
+//   momV  = momV + (tau * alpha * gamma * learningRate) * grad
+//   value = (tau / beta + 1 / alpha) * momU + (1 / beta) * momV
+void sparseMomentumApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& momU,
+                         BaseMatrix& momV,
+                         real alpha,
+                         real beta,
+                         real gamma,
+                         real tau,
+                         real learningRate) {
+  const real uScale = alpha * gamma * learningRate;
+  const real vScale = tau * alpha * gamma * learningRate;
+  const real uWeight = tau / beta + (real)1 / alpha;
+  const real vWeight = (real)1 / beta;
+  auto updateU = momU.lazyAssign(momU - uScale * grad);
+  auto updateV = momV.lazyAssign(momV + vScale * grad);
+  auto updateValue = value.lazyAssign(uWeight * momU + vWeight * momV);
+  AssignEvaluate(updateU, updateV, updateValue);
+}
+// AdaDelta update written as five lazy assignments evaluated together by
+// AssignEvaluate, in this order:
+//   accum        = rou * accum + (1 - rou) * grad^2
+//   lr           = sqrt((accum_update + epsilon) / (accum + epsilon))
+//   accum_update = rou * accum_update + (1 - rou) * (grad * lr)^2
+//   mom          = mom * momentum - learningRate * lr * (grad + value * decayRate)
+//   value        = value + mom
+void adadeltaApply(BaseMatrix& value,
+                   BaseMatrix& grad,
+                   BaseMatrix& mom,
+                   BaseMatrix& accum,
+                   BaseMatrix& accum_update,
+                   BaseMatrix& lr,
+                   real rou,
+                   real epsilon,
+                   real learningRate,
+                   real momentum,
+                   real decayRate) {
+  const real newWeight = (real)1 - rou;
+  auto updateAccum =
+    accum.lazyAssign(rou * accum + newWeight * grad.square());
+  auto updateLr = lr.lazyAssign(
+    ((accum_update + epsilon) / (accum + epsilon)).sqrt());
+  auto updateAccumUpdate = accum_update.lazyAssign(
+    rou * accum_update + newWeight * (grad * lr).square());
+  auto updateMom = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto updateValue = value.lazyAssign(value + mom);
+  AssignEvaluate(
+    updateAccum, updateLr, updateAccumUpdate, updateMom, updateValue);
+}
+// AdaGrad update written as four lazy assignments evaluated together by
+// AssignEvaluate, in this order:
+//   accum = accum + grad^2
+//   lr    = 1 / sqrt(accum_buffer + accum + epsilon)
+//   mom   = mom * momentum - learningRate * lr * (grad + value * decayRate)
+//   value = value + mom
+void adagradApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& accum_buffer,
+                  BaseMatrix& accum,
+                  BaseMatrix& lr,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate) {
+  auto updateAccum = accum.lazyAssign(accum + grad.square());
+  auto updateLr = lr.lazyAssign(
+    (accum_buffer + accum + epsilon).sqrt().reciprocal());
+  auto updateMom = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto updateValue = value.lazyAssign(value + mom);
+  AssignEvaluate(updateAccum, updateLr, updateMom, updateValue);
+}
+// RMSProp update written as five lazy assignments evaluated together by
+// AssignEvaluate, in this order:
+//   g     = accumulatedRou * g + w * grad^2   (w = 1 if firstTime,
+//                                              otherwise 1 - rou)
+//   f     = accumulatedRou * f + (1 - rou) * grad
+//   lr    = 1 / sqrt(g - f^2 + epsilon)
+//   mom   = mom * momentum - learningRate * lr * (grad + value * decayRate)
+//   value = value + mom
+void rmspropApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& g,
+                  BaseMatrix& f,
+                  BaseMatrix& lr,
+                  real accumulatedRou,
+                  real rou,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate,
+                  bool firstTime) {
+  const real newWeight = (real)1 - rou;
+  auto updateF = f.lazyAssign(accumulatedRou * f + newWeight * grad);
+  auto updateLr =
+    lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
+  auto updateMom = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto updateValue = value.lazyAssign(value + mom);
+  // The two variants of the g update have different expression types, so
+  // each branch makes its own AssignEvaluate call.
+  if (!firstTime) {
+    auto updateG = g.lazyAssign(
+      accumulatedRou * g + newWeight * grad.square());
+    AssignEvaluate(updateG, updateF, updateLr, updateMom, updateValue);
+  } else {
+    auto updateG = g.lazyAssign(accumulatedRou * g + grad.square());
+    AssignEvaluate(updateG, updateF, updateLr, updateMom, updateValue);
+  }
+}
+// Decayed AdaGrad update written as four lazy assignments evaluated
+// together by AssignEvaluate, in this order:
+//   accum = accumulatedRou * accum + w * grad^2   (w = 1 if firstTime,
+//                                                  otherwise 1 - rou)
+//   lr    = 1 / sqrt(accum + epsilon)
+//   mom   = mom * momentum - learningRate * lr * (grad + value * decayRate)
+//   value = value + mom
+void decayedAdagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& mom,
+                         BaseMatrix& accum,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime) {
+  auto updateLr = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
+  auto updateMom = mom.lazyAssign(
+    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto updateValue = value.lazyAssign(value + mom);
+  // The two variants of the accum update have different expression types,
+  // so each branch makes its own AssignEvaluate call.
+  if (!firstTime) {
+    auto updateAccum = accum.lazyAssign(
+      accumulatedRou * accum + ((real)1 - rou) * grad.square());
+    AssignEvaluate(updateAccum, updateLr, updateMom, updateValue);
+  } else {
+    auto updateAccum =
+      accum.lazyAssign(accumulatedRou * accum + grad.square());
+    AssignEvaluate(updateAccum, updateLr, updateMom, updateValue);
+  }
+}
+// Adam update written as three lazy assignments evaluated together by
+// AssignEvaluate, in this order:
+//   mom   = beta1 * mom + (1 - beta1) * grad
+//   v     = beta2 * v + (1 - beta2) * grad^2
+//   value = value - (mom * alpha) / (sqrt(v) + epsilon)
+// with alpha = learningRate * sqrt(1 - beta2_power) / (1 - beta1_power).
+void adamApply(BaseMatrix& value,
+               BaseMatrix& grad,
+               BaseMatrix& mom,  // first moment
+               BaseMatrix& v,    // second moment
+               real beta1,
+               real beta2,
+               real beta1_power,
+               real beta2_power,
+               real epsilon,
+               real learningRate) {
+  real alpha = learningRate *
+      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  const real gradWeight1 = (real)1 - beta1;
+  const real gradWeight2 = (real)1 - beta2;
+  auto updateMom = mom.lazyAssign(beta1 * mom + gradWeight1 * grad);
+  auto updateV = v.lazyAssign(beta2 * v + gradWeight2 * grad.square());
+  auto updateValue = value.lazyAssign(
+    value - (mom * alpha) / (v.sqrt() + epsilon));
+  AssignEvaluate(updateMom, updateV, updateValue);
+}
+// AdaMax update written as three lazy assignments evaluated together by
+// AssignEvaluate, in this order:
+//   mom   = beta1 * mom + (1 - beta1) * grad
+//   u     = (beta2 * u > |grad|) ? beta2 * u : |grad|
+//   value = value - (alpha / (1 - beta1^step)) * (mom / u)
+void adamaxApply(BaseMatrix& value,
+                 BaseMatrix& grad,
+                 BaseMatrix& mom,  // first moment
+                 BaseMatrix& u,    // weighted infinity norm
+                 real beta1,
+                 real beta2,
+                 int64_t step,
+                 real alpha) {
+  const real gradWeight = (real)1 - beta1;
+  const real stepScale = alpha / ((real)1 - (real)std::pow(beta1, step));
+  auto updateMom = mom.lazyAssign(beta1 * mom + gradWeight * grad);
+  auto updateU = u.lazyAssign(
+    (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
+  auto updateValue = value.lazyAssign(value - stepScale * (mom / u));
+  AssignEvaluate(updateMom, updateU, updateValue);
+}
+}  // namespace paddle
+#else
+namespace paddle {
+// Sparse-momentum update for builds without C++11: three separate matrix
+// statements instead of one fused evaluation.
+void sparseMomentumApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& momU,
+                         BaseMatrix& momV,
+                         real alpha,
+                         real beta,
+                         real gamma,
+                         real tau,
+                         real learningRate) {
+  /**
+   * \alpha_t = \alpha_{t-1} / k
+   * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
+   * u_t = u_{t-1} - \alpha_t \gamma_t g_t
+   * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
+   * \tau_t = \tau_{t-1} + \beta_t / \alpha_t
+   */
+  const real uScale = alpha * gamma * learningRate;
+  const real vScale = tau * alpha * gamma * learningRate;
+  const real uWeight = tau / beta + (real)1 / alpha;
+  const real vWeight = (real)1 / beta;
+  momU -= uScale * grad;
+  momV += vScale * grad;
+  value = uWeight * momU + vWeight * momV;
+}
+// AdaDelta update for builds without C++11: one matrix statement per step.
+void adadeltaApply(BaseMatrix& value,
+                   BaseMatrix& grad,
+                   BaseMatrix& mom,
+                   BaseMatrix& accum,
+                   BaseMatrix& accum_update,
+                   BaseMatrix& lr,
+                   real rou,
+                   real epsilon,
+                   real learningRate,
+                   real momentum,
+                   real decayRate) {
+  const real newWeight = (real)1 - rou;
+  // Running average of the squared gradient:
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  accum = rou * accum + newWeight * grad.square();
+  // Per-element learning rate:
+  // sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
+  lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
+  // Running average of the squared update:
+  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  accum_update = rou * accum_update + newWeight * (grad * lr).square();
+  // Momentum step with weight decay folded into the gradient.
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+// AdaGrad update for builds without C++11: one matrix statement per step.
+void adagradApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& accum_buffer,
+                  BaseMatrix& accum,
+                  BaseMatrix& lr,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate) {
+  // Accumulate the squared gradient.
+  accum += grad.square();
+  // Per-element learning rate: 1 / sqrt(accum_buffer + accum + epsilon).
+  lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
+  // Momentum step with weight decay folded into the gradient.
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+void rmspropApply(BaseMatrix& value,  // parameters, updated in place
+                  BaseMatrix& grad,   // gradient (only read)
+                  BaseMatrix& mom,    // momentum, updated in place
+                  BaseMatrix& g,      // running E(g^2), updated in place
+                  BaseMatrix& f,      // running E(g), updated in place
+                  BaseMatrix& lr,     // per-element rate, overwritten
+                  real accumulatedRou,
+                  real rou,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate,
+                  bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  if (firstTime) {
+    g = accumulatedRou * g + grad.square();
+  } else {
+    g = accumulatedRou * g + ((real)1 - rou) * grad.square();
+  }
+  // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
+  f = accumulatedRou * f + ((real)1 - rou) * grad;
+  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
+  // Basically, if the sign of the gradient changes more often,
+  // the learning rate will be decreased.
+  lr = (g - f.square() + epsilon).sqrt().reciprocal();
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;  // mom already holds the signed step, so it is added
+}
+void decayedAdagradApply(BaseMatrix& value,  // parameters, updated in place
+                         BaseMatrix& grad,   // gradient (only read)
+                         BaseMatrix& mom,    // momentum, updated in place
+                         BaseMatrix& accum,  // running E(g^2), updated
+                         BaseMatrix& lr,     // per-element rate, overwritten
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  if (firstTime) {
+    accum = accumulatedRou * accum + grad.square();
+  } else {
+    accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
+  }
+  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
+  // Basically, the bigger the magnitude of the gradient is,
+  // the smaller the learning rate will be.
+  lr = (accum + epsilon).sqrt().reciprocal();
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;  // mom already holds the signed step, so it is added
+}
+void adamApply(BaseMatrix& value,  // parameters, updated in place
+               BaseMatrix& grad,   // gradient g_t (only read)
+               BaseMatrix& mom,    // first moment, updated in place
+               BaseMatrix& v,      // second moment, updated in place
+               real beta1,
+               real beta2,
+               real beta1_power,
+               real beta2_power,
+               real epsilon,
+               real learningRate) {
+  real alpha = learningRate *  // effective step size for this call
+      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  mom = beta1 * mom + ((real)1 - beta1) * grad;
+  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_t^2
+  v = beta2 * v + ((real)1 - beta2) * grad.square();
+  value -=  (mom * alpha) / (v.sqrt() + epsilon);
+}
+void adamaxApply(BaseMatrix& value,  // parameters, updated in place
+                 BaseMatrix& grad,   // gradient g_t (only read)
+                 BaseMatrix& mom,    // first moment, updated in place
+                 BaseMatrix& u,      // weighted infinity norm, updated in place
+                 real beta1,
+                 real beta2,
+                 int64_t step,
+                 real alpha) {
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  mom = beta1 * mom + ((real)1 - beta1) * grad;
+  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
+  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
+}
+}  // namespace paddle
+#endif
--- a/paddle/math/TrainingAlgorithmOp.h
+++ b/paddle/math/TrainingAlgorithmOp.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/utils/Logging.h"
+#include "BaseMatrix.h"
+namespace paddle {
+/**
+ * \brief Sparse Momentum optimizer.
+ *
+ * Applies one step using the scalars alpha, beta, gamma, tau and
+ * learningRate as passed in. In the plain (non-lazy) implementation momU
+ * and momV are updated in place from grad, and value is then overwritten
+ * with a linear combination of momU and momV.
+ */
+extern void sparseMomentumApply(BaseMatrix& value,
+                                BaseMatrix& grad,
+                                BaseMatrix& momU,
+                                BaseMatrix& momV,
+                                real alpha,
+                                real beta,
+                                real gamma,
+                                real tau,
+                                real learningRate);
+/**
+ * \brief AdaDelta optimizer.
+ *
+ * Parameter names follow the definition: the third argument is the momentum
+ * buffer, followed by the two running averages.
+ *
+ * \param value         parameters, updated in place
+ * \param grad          gradient
+ * \param mom           momentum buffer, updated in place
+ * \param accum         running E(g^2), updated in place
+ * \param accum_update  running E(dx^2), updated in place
+ * \param lr            per-element learning rate, overwritten
+ * \param rou           decay factor of both running averages
+ * \param epsilon       added to both averages before the division
+ * \param learningRate  global learning rate
+ * \param momentum      momentum coefficient
+ * \param decayRate     coefficient of value added to the gradient
+ */
+extern void adadeltaApply(BaseMatrix& value,
+                          BaseMatrix& grad,
+                          BaseMatrix& mom,
+                          BaseMatrix& accum,
+                          BaseMatrix& accum_update,
+                          BaseMatrix& lr,
+                          real rou,
+                          real epsilon,
+                          real learningRate,
+                          real momentum,
+                          real decayRate);
+/**
+ * \brief AdaGrad optimizer.
+ *
+ * Parameter names follow the definition: the third argument is the momentum
+ * buffer, followed by the two squared-gradient sums.
+ *
+ * \param value         parameters, updated in place
+ * \param grad          gradient
+ * \param mom           momentum buffer, updated in place
+ * \param accum_buffer  squared-gradient sum that is only read
+ * \param accum         squared-gradient sum that grad^2 is added to
+ * \param lr            per-element learning rate, overwritten with
+ *                      1 / sqrt(accum_buffer + accum + epsilon)
+ * \param epsilon       added under the square root
+ * \param learningRate  global learning rate
+ * \param momentum      momentum coefficient
+ * \param decayRate     coefficient of value added to the gradient
+ */
+extern void adagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& mom,
+                         BaseMatrix& accum_buffer,
+                         BaseMatrix& accum,
+                         BaseMatrix& lr,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate);
+/**
+ * \brief RMSProp optimizer.
+ *
+ * Parameter names follow the definition: the third argument is the momentum
+ * buffer, followed by the two running averages g and f.
+ *
+ * \param value           parameters, updated in place
+ * \param grad            gradient
+ * \param mom             momentum buffer, updated in place
+ * \param g               running average of grad^2, updated in place
+ * \param f               running average of grad, updated in place
+ * \param lr              per-element learning rate, overwritten with
+ *                        1 / sqrt(g - f^2 + epsilon)
+ * \param accumulatedRou  factor applied to the previous g and f
+ * \param rou             (1 - rou) weights the new gradient terms
+ * \param epsilon         added under the square root
+ * \param learningRate    global learning rate
+ * \param momentum        momentum coefficient
+ * \param decayRate       coefficient of value added to the gradient
+ * \param firstTime       when true, grad^2 enters g with weight 1
+ */
+extern void rmspropApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& mom,
+                         BaseMatrix& g,
+                         BaseMatrix& f,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime);
+/**
+ * \brief Decayed AdaGrad optimizer.
+ *
+ * accum is a running average of grad^2 (previous value scaled by
+ * accumulatedRou, new term weighted by 1 - rou, or by 1 when firstTime is
+ * true). lr is overwritten with 1 / sqrt(accum + epsilon); mom and value
+ * are updated in place.
+ */
+extern void decayedAdagradApply(BaseMatrix& value,
+                                BaseMatrix& grad,
+                                BaseMatrix& mom,
+                                BaseMatrix& accum,
+                                BaseMatrix& lr,
+                                real accumulatedRou,
+                                real rou,
+                                real epsilon,
+                                real learningRate,
+                                real momentum,
+                                real decayRate,
+                                bool firstTime);
+/**
+ * \brief Adam optimizer.
+ *
+ * mom (first moment) and v (second moment) are updated in place from grad,
+ * then value is decreased by alpha * mom / (sqrt(v) + epsilon), where
+ * alpha = learningRate * sqrt(1 - beta2_power) / (1 - beta1_power).
+ * beta1_power and beta2_power are supplied by the caller (presumably
+ * beta1^t and beta2^t for the current step t).
+ */
+extern void adamApply(BaseMatrix& value,
+                      BaseMatrix& grad,
+                      BaseMatrix& mom,
+                      BaseMatrix& v,
+                      real beta1,
+                      real beta2,
+                      real beta1_power,
+                      real beta2_power,
+                      real epsilon,
+                      real learningRate);
+/**
+ * \brief AdaMax optimizer.
+ *
+ * mom and u are updated in place from grad, then value is decreased by
+ * (alpha / (1 - beta1^step)) * mom / u.
+ */
+extern void adamaxApply(BaseMatrix& value,
+                        BaseMatrix& grad,
+                        BaseMatrix& mom,  // first moment
+                        BaseMatrix& u,    // weighted infinity norm
+                        real beta1,
+                        real beta2,
+                        int64_t step,
+                        real alpha);
+}  // namespace paddle
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -265,6 +265,15 @@ public:
  /// print the "idx" element of the Vector
  virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
+  template <typename ExpressionType>  // assign from a tensor expression
+  void operator=(const ExpressionType& expr) {
+    if (!BaseVector<T>::useGpu_) {  // CPU-resident data
+      TensorCpuApply<T>(*this, expr);
+    } else {  // GPU-resident data
+      TensorGpuApply<T>(*this, expr);
+    }
+  }
 protected:
  friend class GpuVectorT<T>;
  friend class CpuVectorT<T>;
@@ -322,6 +331,11 @@ public:
  virtual void print(std::ostream& os, size_t num) const;
  virtual void printOneElement(std::ostream& os, size_t idx) const;
+  template <typename ExpressionType>  // assign from a tensor expression
+  void operator=(const ExpressionType& expr) {
+    TensorGpuApply<T>(*this, expr);  // no runtime check: always the GPU path
+  }
 protected:
  virtual void copyTo(CpuVectorT<T>* dest) const;
  virtual void copyTo(GpuVectorT<T>* dest) const;
@@ -385,6 +399,11 @@ public:
  virtual T get(size_t pos);
  virtual void print(std::ostream& os, size_t num) const;
  virtual void printOneElement(std::ostream& os, size_t idx) const;
+  template <typename ExpressionType>  // assign from a tensor expression
+  void operator=(const ExpressionType& expr) {
+    TensorCpuApply<T>(*this, expr);  // no runtime check: always the CPU path
+  }
 };
 template <class T>

--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -2,6 +2,7 @@
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
+add_simple_unittest(test_TrainingAlgorithm)
 add_simple_unittest(test_SparseMatrix)
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
@@ -13,6 +14,21 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+if(WITH_GPU)  # .cu tests are built by nvcc, and only with C++11 support
+    if(COMPILER_SUPPORT_CXX11)
+        CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
+        link_paddle_test(test_Tensor)
+        CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
+        link_paddle_test(test_lazyAssign)
+    endif()
+else()  # CPU-only build: compile the same .cu sources as C++
+    compile_cu_as_cpp(test_Tensor.cu)
+    add_unittest(test_Tensor test_Tensor.cu)
+    compile_cu_as_cpp(test_lazyAssign.cu)
+    add_unittest(test_lazyAssign test_lazyAssign.cu)
+endif(WITH_GPU)
 add_simple_unittest(test_FPException)
 add_simple_unittest(test_GpuProfiler)
 add_simple_unittest(test_BaseMatrix)

--- a/paddle/math/tests/OriginalOptimizerApi.h
+++ b/paddle/math/tests/OriginalOptimizerApi.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/utils/GlobalConstants.h"
+#include "paddle/math/Vector.h"
+using namespace paddle;  // NOLINT
+/**
+ * Reference sparse-momentum step written with the original Vector API.
+ *
+ * Reads vecs[PARAMETER_GRADIENT] and modifies vecs[PARAMETER_MOMENTUM_UT],
+ * vecs[PARAMETER_MOMENTUM_VT] and vecs[PARAMETER_VALUE]. It is the baseline
+ * that sparseMomentumApply is expected to reproduce.
+ *
+ * Declared inline because it is defined in a header: a second translation
+ * unit including this file would otherwise produce a duplicate definition.
+ */
+inline void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
+                                             real alpha,
+                                             real beta,
+                                             real gamma,
+                                             real tau,
+                                             real learningRate) {
+  // u_t -= alpha * gamma * learningRate * g  (assuming add(v, s) adds s * v)
+  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
+                                   -alpha * gamma * learningRate);
+  // v_t += tau * alpha * gamma * learningRate * g
+  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
+                                   tau * alpha * gamma * learningRate);
+  // value is formed from u_t and v_t with the two coefficients below.
+  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
+                             tau / beta + 1.0 / alpha,
+                             *vecs[PARAMETER_MOMENTUM_VT],
+                             1.0 / beta);
+}
+/**
+ * Reference AdaGrad step written with the original Vector API; the baseline
+ * for adagradApply.
+ *
+ * Modifies vecs[PARAMETER_GRADIENT_SQURESUM1], vecs[PARAMETER_LEARNING_RATE]
+ * and, through sgdUpdate, vecs[PARAMETER_MOMENTUM] and vecs[PARAMETER_VALUE].
+ * vecs[PARAMETER_GRADIENT_SQURESUM] is only read.
+ *
+ * Declared inline because it is defined in a header: a second translation
+ * unit including this file would otherwise produce a duplicate definition.
+ */
+inline void AdagradParameterOptimizer(const VectorPtr vecs[],
+                                      real epsilon,
+                                      real learningRate,
+                                      real momentum,
+                                      real decayRate) {
+  // Accumulate the squared gradient into SQURESUM1.
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
+                                                1.0f);
+  // learning rate = invSqrt(SQURESUM + SQURESUM1 + epsilon)
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
+                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+/**
+ * Reference AdaDelta step written with the original Vector API; the baseline
+ * for adadeltaApply.
+ *
+ * Modifies vecs[PARAMETER_GRADIENT_SQURESUM] (E(g^2)),
+ * vecs[PARAMETER_GRADIENT_SQURESUM1] (E(dx^2)), vecs[PARAMETER_LEARNING_RATE]
+ * and, through sgdUpdate, vecs[PARAMETER_MOMENTUM] and vecs[PARAMETER_VALUE].
+ *
+ * Declared inline because it is defined in a header: a second translation
+ * unit including this file would otherwise produce a duplicate definition.
+ */
+inline void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
+                                       real rou,
+                                       real epsilon,
+                                       real learningRate,
+                                       real momentum,
+                                       real decayRate) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
+  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
+  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
+                                        epsilon,
+                                        epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
+  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
+      *vecs[PARAMETER_GRADIENT],
+      *vecs[PARAMETER_LEARNING_RATE],
+      rou,
+      1.0f - rou);
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+/**
+ * Reference RMSProp step written with the original Vector API; the baseline
+ * for rmspropApply.
+ *
+ * Modifies vecs[PARAMETER_GRADIENT_SQURESUM] (E(g^2)),
+ * vecs[PARAMETER_GRADIENT_SQURESUM1] (E(g)), vecs[PARAMETER_LEARNING_RATE]
+ * and, through sgdUpdate, vecs[PARAMETER_MOMENTUM] and vecs[PARAMETER_VALUE].
+ *
+ * Declared inline because it is defined in a header: a second translation
+ * unit including this file would otherwise produce a duplicate definition.
+ */
+inline void RMSPropParameterOptimizer(const VectorPtr vecs[],
+                                      real accumulatedRou,
+                                      real rou,
+                                      real epsilon,
+                                      real learningRate,
+                                      real momentum,
+                                      real decayRate,
+                                      bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
+  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
+  // Basically, if the sign of the gradient changes more often,
+  // the learning rate will be decreased.
+  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                           -1.0f);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+/**
+ * Reference decayed-AdaGrad step written with the original Vector API; the
+ * baseline for decayedAdagradApply.
+ *
+ * Modifies vecs[PARAMETER_GRADIENT_SQURESUM], vecs[PARAMETER_LEARNING_RATE]
+ * and, through sgdUpdate, vecs[PARAMETER_MOMENTUM] and vecs[PARAMETER_VALUE].
+ *
+ * Declared inline because it is defined in a header: a second translation
+ * unit including this file would otherwise produce a duplicate definition.
+ */
+inline void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
+                                             real accumulatedRou,
+                                             real rou,
+                                             real epsilon,
+                                             real learningRate,
+                                             real momentum,
+                                             real decayRate,
+                                             bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
+  // Basically, the bigger the magnitude of the gradient is,
+  // the smaller the learning rate will be.
+  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+/**
+ * Reference Adam step written with the original Vector API; the baseline
+ * for adamApply.
+ *
+ * Modifies vecs[PARAMETER_MOMENTUM], vecs[PARAMETER_SECOND_MOMENTUM] and
+ * vecs[PARAMETER_VALUE]. NOTE: vecs[PARAMETER_GRADIENT] is used as scratch
+ * space (squared, then overwritten), so the gradient is destroyed.
+ *
+ * Declared inline because it is defined in a header: a second translation
+ * unit including this file would otherwise produce a duplicate definition.
+ */
+inline void AdamParameterOptimizer(const VectorPtr vecs[],
+                                   real beta1,
+                                   real beta2,
+                                   real beta1_power,
+                                   real beta2_power,
+                                   real epsilon,
+                                   real learningRate) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  m->add(*g, beta1, 1 - beta1);
+  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_t^2
+  g->square2();  // g now holds g_t^2
+  v->add(*g, beta2, 1 - beta2);
+  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
+  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
+  g->sqrt2(*v);  // g is reused as the tmp buffer from here on
+  g->dotDiv(*m, *g, 0., epsilon);
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  theta->add(*theta, 1.0, *g, -alpha);
+}
+/**
+ * Reference AdaMax step written with the original Vector API; the baseline
+ * for adamaxApply.
+ *
+ * Modifies vecs[PARAMETER_MOMENTUM], vecs[PARAMETER_WEIGHTED_INFINITY_NORM]
+ * and vecs[PARAMETER_VALUE]. NOTE: vecs[PARAMETER_GRADIENT] is used as
+ * scratch space (abs2, then overwritten), so the gradient is destroyed.
+ *
+ * Declared inline because it is defined in a header: a second translation
+ * unit including this file would otherwise produce a duplicate definition.
+ */
+inline void AdamaxParameterOptimizer(
+    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  m->add(*g, beta1, 1 - beta1);
+  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  u->mulScalar(beta2);
+  g->abs2();  // g now holds abs(g_t)
+  u->max2(*u, *g);
+  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  g->dotDiv(*m, *u);  // g is reused to hold m_t / u_t
+  real learningRate = alpha / (1 - std::pow(beta1, step));
+  theta->add(*theta, 1.0, *g, -learningRate);
+}
--- a/paddle/math/tests/PerfUtils.h
+++ b/paddle/math/tests/PerfUtils.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+// Performance Check
+//
+// EXPRESSION_PERFORMANCE(expression) evaluates `expression` as a statement.
+//  - With PADDLE_DISABLE_TIMER defined, the expression runs exactly once.
+//  - Otherwise it runs once untimed and then 20 more times under
+//    REGISTER_TIMER; the collected stat is logged and globalStat is reset.
+//    The timer name is the stringified expression, cut to 27 characters
+//    plus ".." when it does not fit into the 30-byte buffer.
+// Both variants expand to one do/while(0) statement, so the macro needs a
+// trailing semicolon and is safe inside an unbraced if/else. The timed
+// variant declares a local named `expr`, so `expression` must not refer to
+// an outer variable of that name.
+#ifdef PADDLE_DISABLE_TIMER
+#define EXPRESSION_PERFORMANCE(expression) \
+  do {                                     \
+    expression;                            \
+  } while (0)
+#else
+#include <cstring>  // strncpy
+#include <iomanip>  // std::setiosflags, std::setfill
+#include "paddle/utils/Stat.h"
+using namespace paddle;  // NOLINT
+#define EXPRESSION_PERFORMANCE(expression)                             \
+  do {                                                                 \
+    char expr[30];                                                     \
+    strncpy(expr, #expression, 30);                                    \
+    if (expr[29] != '\0') {                                            \
+      expr[27] = '.';                                                  \
+      expr[28] = '.';                                                  \
+      expr[29] = '\0';                                                 \
+    }                                                                  \
+    expression;                                                        \
+    for (int i = 0; i < 20; i++) {                                     \
+      REGISTER_TIMER(expr);                                            \
+      expression;                                                      \
+    }                                                                  \
+    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
+              << *globalStat.getStat(expr);                            \
+    globalStat.reset();                                                \
+  } while (0)
+#endif
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -37,13 +37,13 @@ TEST(BaseMatrix, void) {
      };
      compare(&BaseMatrix::neg);
-      compare(&BaseMatrix::exp);
+      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log);
+      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt);
+      compare(&BaseMatrix::sqrt2);
-      compare(&BaseMatrix::square);
+      compare(&BaseMatrix::square2);
-      compare(&BaseMatrix::reciprocal);
+      compare(&BaseMatrix::reciprocal2);
-      compare(&BaseMatrix::abs);
+      compare(&BaseMatrix::abs2);
-      compare(&BaseMatrix::sign);
+      compare(&BaseMatrix::sign2);
      compare(&BaseMatrix::zero);
      compare(&BaseMatrix::one);
    }
@@ -59,7 +59,7 @@ TEST(BaseMatrix, real) {
        test.cmpWithoutArg<0>(f, height, width);
      };
-      compare(&BaseMatrix::pow);
+      compare(&BaseMatrix::pow2);
      compare(&BaseMatrix::subScalar);
      compare(&BaseMatrix::mulScalar);
      compare(&BaseMatrix::divScalar);
@@ -88,21 +88,21 @@ TEST(BaseMatrix, BaseMatrix) {
      compare(&BaseMatrix::softreluDerivative);
      compare(&BaseMatrix::brelu);
      compare(&BaseMatrix::breluDerivative);
-      compare(&BaseMatrix::square);
+      compare(&BaseMatrix::square2);
      compare(&BaseMatrix::squareDerivative);
      compare(&BaseMatrix::tanh);
      compare(&BaseMatrix::tanhDerivative);
-      compare(&BaseMatrix::reciprocal);
+      compare(&BaseMatrix::reciprocal2);
      compare(&BaseMatrix::reciprocalDerivative);
-      compare(&BaseMatrix::abs);
+      compare(&BaseMatrix::abs2);
      compare(&BaseMatrix::absDerivative);
      compare(&BaseMatrix::sigmoid);
      compare(&BaseMatrix::sigmoidDerivative);
      compare(&BaseMatrix::expDerivative);
-      compare(&BaseMatrix::sign);
+      compare(&BaseMatrix::sign2);
-      compare(&BaseMatrix::exp);
+      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log);
+      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt);
+      compare(&BaseMatrix::sqrt2);
      compare(&BaseMatrix::dotMul);
      compare(&BaseMatrix::dotMulSquare);
      compare(&BaseMatrix::dotSquareMul);
@@ -143,7 +143,7 @@ TEST(BaseMatrix, BaseMatrix_real) {
      compare(&BaseMatrix::addBias);
      compare(&BaseMatrix::add);
      compare(&BaseMatrix::sub);
-      compare(&BaseMatrix::pow);
+      compare(&BaseMatrix::pow2);
      compare(&BaseMatrix::addScalar);
      compare(&BaseMatrix::subScalar);
      compare(&BaseMatrix::mulScalar);
@@ -176,7 +176,7 @@ TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
      compare(&BaseMatrix::logisticRegressionLoss);
      compare(&BaseMatrix::logisticRegressionLossBp);
      compare(&BaseMatrix::biggerThan);
-      compare(&BaseMatrix::max);
+      compare(&BaseMatrix::max2);
      compare(&BaseMatrix::dotMulSquare);
      compare(&BaseMatrix::dotSquareSquare);
    }

--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/utils/Util.h"
+#include "paddle/math/TrainingAlgorithmOp.h"
+#include "OriginalOptimizerApi.h"
+#include "TensorCheck.h"
+#include "PerfUtils.h"
+using namespace paddle;  // NOLINT
+#ifndef PADDLE_TYPE_DOUBLE  // presumably `real` is float here: looser bound
+P_DEFINE_double(max_diff, 1e-5, "max diff allowed");
+#else  // PADDLE_TYPE_DOUBLE is defined: tighter bound
+P_DEFINE_double(max_diff, 1e-13, "max diff allowed");
+#endif  // FLAGS_max_diff is the tolerance read by VectorCheckErr
+/**
+ * Scope guard for FLAGS_max_diff: the constructor installs a new tolerance
+ * and the destructor puts the previous one back.
+ *
+ * Copying is disabled; a copy would write the saved value back a second
+ * time when it is destroyed, possibly clobbering a newer setting.
+ */
+class SetMaxDiff {
+public:
+  explicit SetMaxDiff(double max_diff) : max_diff_(FLAGS_max_diff) {
+    FLAGS_max_diff = max_diff;
+  }
+  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
+  SetMaxDiff(const SetMaxDiff&) = delete;
+  SetMaxDiff& operator=(const SetMaxDiff&) = delete;
+private:
+  double max_diff_;  // tolerance that was active before this guard
+};
+// Makes `cpuVec` refer to CPU-readable data for `vector`: a freshly created
+// non-GPU copy when vector->useGpu() is true, otherwise `vector` itself (no
+// copy). Arguments are parenthesised so that expressions such as `*p` or
+// `a ? b : c` expand safely.
+#define COPY_VECTOR_TO_CPU(cpuVec, vector)                   \
+  do {                                                       \
+    if ((vector)->useGpu()) {                                \
+      (cpuVec) = Vector::create((vector)->getSize(), false); \
+      (cpuVec)->copyFrom(*(vector));                         \
+    } else {                                                 \
+      (cpuVec) = (vector);                                   \
+    }                                                        \
+  } while (0)
+/**
+ * Counts the elements at which two equally sized vectors disagree.
+ *
+ * An element counts as an error only when BOTH hold:
+ *   |a - b| > FLAGS_max_diff   and   |a - b| / |a| > FLAGS_max_diff / 10.
+ * The data pointers are dereferenced directly, so both vectors must be
+ * readable from the host. Comparisons involving NaN are false, so NaN
+ * elements are never counted.
+ *
+ * fabs is used throughout (rather than fabsf) so that the relative error is
+ * not narrowed to float when `real` is double.
+ *
+ * \return number of mismatching elements (0 means the vectors agree)
+ */
+int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+  const real* data1 = vector1.getData();
+  const real* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (fabs(a - b) > FLAGS_max_diff) {
+      if ((fabs(a - b) / fabs(a)) > (FLAGS_max_diff / 10.0f)) {
+        count++;
+      }
+    }
+  }
+  return count;
+}
+/**
+ * Pointer overload: obtains a host-readable version of each vector via
+ * COPY_VECTOR_TO_CPU, then delegates to the element-wise comparison.
+ *
+ * \return number of mismatching elements
+ */
+int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
+  VectorPtr lhsOnCpu;
+  COPY_VECTOR_TO_CPU(lhsOnCpu, vector1);
+  VectorPtr rhsOnCpu;
+  COPY_VECTOR_TO_CPU(rhsOnCpu, vector2);
+  return VectorCheckErr(*lhsOnCpu, *rhsOnCpu);
+}
+#ifdef PADDLE_DISABLE_TIMER  // each expression ran once: results comparable
+#define CHECK_VECTORPTR(vector1, vector2) \
+  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
+#else  // timed build: EXPRESSION_PERFORMANCE re-runs each update many times
+#define CHECK_VECTORPTR(vector1, vector2)  // expands to nothing, no check
+#endif
+// Signature shared by the per-optimizer test bodies.
+using testMatrixFunc = std::function<void(size_t size, bool useGpu)>;
+/**
+ * Runs matrixFunc for every vector length in the table below, on the CPU
+ * and, unless PADDLE_ONLY_CPU is defined, also on the GPU. Each combination
+ * is logged before it is run.
+ */
+void testCase(testMatrixFunc matrixFunc) {
+#ifndef PADDLE_ONLY_CPU
+  const bool devices[] = {false, true};
+#else
+  const bool devices[] = {false};
+#endif
+  const int sizes[] = {1,      32,     64,     128,    512,
+                       1024,   4096,   32768,  65536,  131072,
+                       262144, 524288, 1048576, 2097152};
+  for (bool useGpu : devices) {
+    for (int size : sizes) {
+      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
+      matrixFunc(size, useGpu);
+    }
+  }
+}
+#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
+  vec1[type] = Vector::create(size, useGpu);        \
+  vec2[type] = Vector::create(size, useGpu);        \
+  vec1[type]->rand();                               \
+  vec2[type]->copyFrom(*vec1[type]);
+void testAdagrad(size_t size, bool useGpu) {  // reference vs. adagradApply
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];  // inputs of the reference optimizer
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];  // identical copies for adagradApply
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
+      bufs1, epsilon, learningRate, momentum, decayRate));
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];  // views into the bufs2 copies
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+  EXPRESSION_PERFORMANCE(adagradApply(value,
+                                      grad,
+                                      mom,
+                                      accum_buffer,
+                                      accum,
+                                      lr,
+                                      epsilon,
+                                      learningRate,
+                                      momentum,
+                                      decayRate));
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}  // the checks above only compile to assertions with PADDLE_DISABLE_TIMER
+TEST(Training, Adagrad) { testCase(testAdagrad); }  // all sizes and devices
+/**
+ * Compare two AdaDelta update paths on paired buffers.
+ *
+ * bufs1 is handed to AdaDeltaParameterOptimizer(...), bufs2 (through
+ * BaseMatrix references) to adadeltaApply(...); both calls receive the same
+ * scalar arguments. Afterwards the value, momentum, both squared-sum and the
+ * learning-rate buffers are compared with CHECK_VECTORPTR.
+ *
+ * @param size    forwarded to INIT_VECTOR for every buffer.
+ * @param useGpu  forwarded to INIT_VECTOR for every buffer.
+ */
+void testAdaDelta(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  // NOTE(review): INIT_VECTOR is defined outside this chunk; presumably it
+  // fills bufs1[type] and bufs2[type] with identical contents - confirm.
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+  // Each scalar is rand() / RAND_MAX, i.e. a value in [0, 1].
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  // First path: operates on the bufs1 array as a whole.
+  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
+      bufs1, rou, epsilon, learningRate, momentum, decayRate));
+  // Second path: operates on the individual bufs2 vectors.
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+  EXPRESSION_PERFORMANCE(adadeltaApply(value,
+                                       grad,
+                                       mom,
+                                       accum,
+                                       accum_update,
+                                       lr,
+                                       rou,
+                                       epsilon,
+                                       learningRate,
+                                       momentum,
+                                       decayRate));
+  // The gradient buffer is the only initialized buffer that is not compared.
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+// AdaDelta comparison: hands testAdaDelta to the testCase driver (defined
+// outside this chunk).
+TEST(Training, AdaDelta) { testCase(testAdaDelta); }
+/**
+ * Compare two RMSProp update paths on paired buffers.
+ *
+ * bufs1 is handed to RMSPropParameterOptimizer(...), bufs2 (through
+ * BaseMatrix references) to rmspropApply(...); both calls receive the same
+ * scalar arguments and the same isFirstTime flag. Afterwards the value,
+ * momentum, both squared-sum and the learning-rate buffers are compared with
+ * CHECK_VECTORPTR.
+ *
+ * @tparam isFirstTime  passed as the last argument of both update calls.
+ * @param size          forwarded to INIT_VECTOR for every buffer.
+ * @param useGpu        forwarded to INIT_VECTOR for every buffer.
+ */
+template <bool isFirstTime>
+void testRMSProp(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  // NOTE(review): INIT_VECTOR is defined outside this chunk; presumably it
+  // fills bufs1[type] and bufs2[type] with identical contents - confirm.
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+  /* make sure 'g - f.square()' is greater than 0 */
+  // The squared-sum buffer of bufs1 is shifted by 1.0 and then copied into
+  // bufs2, so both paths read the same shifted data.
+  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
+  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
+      *bufs1[PARAMETER_GRADIENT_SQURESUM]);
+  // Each scalar is rand() / RAND_MAX, i.e. a value in [0, 1].
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  // In this test accumulatedRou is simply the same value as rou.
+  real accumulatedRou = rou;
+  // First path: operates on the bufs1 array as a whole.
+  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
+                                                   accumulatedRou,
+                                                   rou,
+                                                   epsilon,
+                                                   learningRate,
+                                                   momentum,
+                                                   decayRate,
+                                                   isFirstTime));
+  // Second path: operates on the individual bufs2 vectors.
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+  EXPRESSION_PERFORMANCE(rmspropApply(value,
+                                      grad,
+                                      mom,
+                                      sum,
+                                      sum1,
+                                      lr,
+                                      accumulatedRou,
+                                      rou,
+                                      epsilon,
+                                      learningRate,
+                                      momentum,
+                                      decayRate,
+                                      isFirstTime));
+  // The gradient buffer is the only initialized buffer that is not compared.
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+// RMSProp comparison, run once with isFirstTime = true and once with false.
+TEST(Training, RMSProp) {
+  testCase(testRMSProp<true>);
+  testCase(testRMSProp<false>);
+}
+/**
+ * Compare two decayed-Adagrad update paths on paired buffers.
+ *
+ * bufs1 is handed to DecayedAdagradParameterOptimizer(...), bufs2 (through
+ * BaseMatrix references) to decayedAdagradApply(...); both calls receive the
+ * same scalar arguments and the same isFirstTime flag. Afterwards the value,
+ * momentum, squared-sum and learning-rate buffers are compared with
+ * CHECK_VECTORPTR.
+ *
+ * @tparam isFirstTime  passed as the last argument of both update calls; when
+ *                      true the squared-sum buffers are zeroed beforehand.
+ * @param size          forwarded to INIT_VECTOR for every buffer.
+ * @param useGpu        forwarded to INIT_VECTOR for every buffer.
+ */
+template <bool isFirstTime>
+void testDecayedAdagrad(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  // NOTE(review): INIT_VECTOR is defined outside this chunk; presumably it
+  // fills bufs1[type] and bufs2[type] with identical contents - confirm.
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+  // Each scalar is rand() / RAND_MAX, i.e. a value in [0, 1].
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  // In this test accumulatedRou is simply the same value as rou.
+  real accumulatedRou = rou;
+  // First-time case: both squared-sum accumulators start from zero.
+  if (isFirstTime) {
+    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+  }
+  // First path: operates on the bufs1 array as a whole.
+  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
+                                                          accumulatedRou,
+                                                          rou,
+                                                          epsilon,
+                                                          learningRate,
+                                                          momentum,
+                                                          decayRate,
+                                                          isFirstTime));
+  // Second path: operates on the individual bufs2 vectors.
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+  EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
+                                             grad,
+                                             mom,
+                                             sum,
+                                             lr,
+                                             accumulatedRou,
+                                             rou,
+                                             epsilon,
+                                             learningRate,
+                                             momentum,
+                                             decayRate,
+                                             isFirstTime));
+  // The gradient buffer is the only initialized buffer that is not compared.
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+// Decayed-Adagrad comparison, run once with isFirstTime = false and once
+// with true.
+TEST(Training, DecayedAdagrad) {
+  testCase(testDecayedAdagrad<false>);
+  testCase(testDecayedAdagrad<true>);
+}
+/**
+ * Compare two Adam update paths on paired buffers.
+ *
+ * bufs1 is handed to AdamParameterOptimizer(...), bufs2 (through BaseMatrix
+ * references) to adamApply(...); both calls receive the same scalar
+ * arguments. Afterwards the value, momentum and second-momentum buffers are
+ * compared with CHECK_VECTORPTR.
+ *
+ * @param size    forwarded to INIT_VECTOR for every buffer.
+ * @param useGpu  forwarded to INIT_VECTOR for every buffer.
+ */
+void testAdam(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  // NOTE(review): INIT_VECTOR is defined outside this chunk; presumably it
+  // fills bufs1[type] and bufs2[type] with identical contents - confirm.
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
+  // Each scalar is rand() / RAND_MAX, i.e. a value in [0, 1]. The two
+  // *_power values are drawn independently rather than derived from beta1
+  // and beta2.
+  real beta1 = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta1_power = (real)rand() / (real)RAND_MAX;   // NOLINT
+  real beta2_power = (real)rand() / (real)RAND_MAX;   // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  // First path: operates on the bufs1 array as a whole.
+  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
+      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
+  // Second path: operates on the individual bufs2 vectors.
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
+  EXPRESSION_PERFORMANCE(adamApply(value,
+                                   grad,
+                                   mom,
+                                   v,
+                                   beta1,
+                                   beta2,
+                                   beta1_power,
+                                   beta2_power,
+                                   epsilon,
+                                   learningRate));
+  // The gradient buffer is the only initialized buffer that is not compared.
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
+                  bufs2[PARAMETER_SECOND_MOMENTUM]);
+}
+// Adam comparison: hands testAdam to the testCase driver (defined outside
+// this chunk).
+TEST(Training, Adam) { testCase(testAdam); }
+/**
+ * Compare two Adamax update paths on paired buffers.
+ *
+ * bufs1 is handed to AdamaxParameterOptimizer(...), bufs2 (through BaseMatrix
+ * references) to adamaxApply(...); both calls receive the same scalar
+ * arguments. Afterwards the value, momentum and weighted-infinity-norm
+ * buffers are compared with CHECK_VECTORPTR.
+ *
+ * @param size    forwarded to INIT_VECTOR for every buffer.
+ * @param useGpu  forwarded to INIT_VECTOR for every buffer.
+ */
+void testAdamax(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  // NOTE(review): INIT_VECTOR is defined outside this chunk; presumably it
+  // fills bufs1[type] and bufs2[type] with identical contents - confirm.
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
+  // Each scalar is rand() / RAND_MAX, i.e. a value in [0, 1].
+  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
+  // The step count is fixed rather than randomized.
+  int64_t step = 2;
+  // First path: operates on the bufs1 array as a whole.
+  EXPRESSION_PERFORMANCE(
+      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
+  // Second path: operates on the individual bufs2 vectors.
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
+  EXPRESSION_PERFORMANCE(
+      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
+  // The gradient buffer is the only initialized buffer that is not compared.
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
+                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
+}
+// Adamax comparison. Unless PADDLE_TYPE_DOUBLE is defined, a SetMaxDiff
+// object constructed with 1e-4 stays in scope for the whole run.
+// NOTE(review): SetMaxDiff is defined outside this chunk; presumably it
+// adjusts the tolerance used by CHECK_VECTORPTR for single precision -
+// confirm.
+TEST(Training, Adamax) {
+#ifndef PADDLE_TYPE_DOUBLE
+  SetMaxDiff diff(1e-4);
+#endif
+  testCase(testAdamax);
+}
+/**
+ * Compare two sparse-momentum update paths on paired buffers.
+ *
+ * bufs1 is handed to SparseMomentumParameterOptimizer(...), bufs2 (through
+ * BaseMatrix references) to sparseMomentumApply(...); both calls receive the
+ * same scalar arguments. Afterwards the value and the two momentum (UT / VT)
+ * buffers are compared with CHECK_VECTORPTR.
+ *
+ * @param size    forwarded to INIT_VECTOR for every buffer.
+ * @param useGpu  forwarded to INIT_VECTOR for every buffer.
+ */
+void testSparseMomentum(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  // NOTE(review): INIT_VECTOR is defined outside this chunk; presumably it
+  // fills bufs1[type] and bufs2[type] with identical contents - confirm.
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
+  // Each scalar is rand() / RAND_MAX, i.e. a value in [0, 1].
+  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
+  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  // First path: operates on the bufs1 array as a whole.
+  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
+      bufs1, alpha, beta, gamma, tau, learningRate));
+  // Second path: operates on the individual bufs2 vectors.
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
+  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
+  EXPRESSION_PERFORMANCE(sparseMomentumApply(
+      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
+  // The gradient buffer is the only initialized buffer that is not compared.
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
+}
+// Sparse-momentum comparison: hands testSparseMomentum to the testCase
+// driver (defined outside this chunk).
+TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
+/**
+ * Test entry point: initializes googletest and Paddle from the command line,
+ * starts the hl_* runtime on the device selected by FLAGS_gpu_id, then runs
+ * every registered test.
+ *
+ * @return the result of RUN_ALL_TESTS().
+ */
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+
+  const int testStatus = RUN_ALL_TESTS();
+  return testStatus;
+}
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/math/TensorAssign.h"
+#include "TensorCheck.h"
+#include "PerfUtils.h"
+using paddle::BaseMatrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+// Signature of a single test body: it receives the matrix shape to exercise.
+typedef std::function<void(int height, int width)> testMatrixFunc;
+/**
+ * Invoke matrixFunc once for every shape in the sweep below.
+ *
+ * Only single-row shapes are used; the width grows from 1 up to 8388608.
+ *
+ * @param matrixFunc  callback run as matrixFunc(height, width) per shape.
+ */
+void testMatrixCase(testMatrixFunc matrixFunc) {
+  const int rows = 1;
+  for (auto cols : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
+                    262144, 524288, 1048576, 2097152, 4194304, 8388608}) {
+    matrixFunc(rows, cols);
+  }
+}
+/**
+ * Check that lazily-assigned expressions give the same result as ordinary
+ * assignments.
+ *
+ * A1 is computed with two plain assignments (A1 = B + C; A1 = A1 * D). A2,
+ * which starts as a copy of A1, gets the same two expressions through
+ * lazyAssign(), evaluated together by AssignEvaluate. The results are then
+ * compared with TensorCheckErr.
+ *
+ * @tparam Tensor  matrix type under test (instantiated below with CpuMatrix
+ *                 and GpuMatrix).
+ * @param height   row count of every operand.
+ * @param width    column count of every operand.
+ */
+template<typename Tensor>
+void testLazyAssign(int height, int width) {
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor B(height, width);
+  Tensor C(height, width);
+  Tensor D(height, width);
+  A1.randomizeUniform();
+  B.randomizeUniform();
+  C.randomizeUniform();
+  D.randomizeUniform();
+  // Both variants must start from the same destination contents.
+  A2.copyFrom(A1);
+  // Reference: two separate assignments.
+  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
+  // Under test: the same two expressions, deferred and evaluated together.
+  EXPRESSION_PERFORMANCE(
+    auto expr1 = A2.lazyAssign(B + C);
+    auto expr2 = A2.lazyAssign(A2 * D);
+    AssignEvaluate(expr1, expr2););
+  TensorCheckErr(A1, A2);
+}
+// lazyAssign check on CpuMatrix across the testMatrixCase shape sweep.
+TEST(lazyAssign, CPU) {
+  testMatrixCase(testLazyAssign<CpuMatrix>);
+}
+// lazyAssign check on GpuMatrix; compiled out when PADDLE_ONLY_CPU is defined.
+#ifndef PADDLE_ONLY_CPU
+TEST(lazyAssign, GPU) {
+  testMatrixCase(testLazyAssign<GpuMatrix>);
+}
+#endif
+/**
+ * SGD-with-momentum step written as plain tensor expressions.
+ *
+ * Performs, in order:
+ *   C = C * p2 - D * (B + A * p3) * p1;
+ *   A += C;
+ *
+ * @param A   updated in place by adding the new C.
+ * @param B   read only.
+ * @param C   overwritten with the new update term.
+ * @param D   read only; multiplies the (B + A * p3) term element-wise.
+ * @param p1  scalar applied to the D * (B + A * p3) term.
+ * @param p2  scalar applied to the previous C.
+ * @param p3  scalar applied to A inside the bracket.
+ */
+template<typename Tensor>
+void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D,
+     real p1, real p2, real p3) {
+  C = C * p2 - D * (B + A * p3) * p1;
+  A += C;
+}
+/**
+ * SGD-with-momentum step expressed through lazyAssign.
+ *
+ * Builds two deferred assignments,
+ *   C <- C * p2 - D * (B + A * p3) * p1
+ *   A <- A + C
+ * and hands both to AssignEvaluate in that order.
+ */
+void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B,
+    BaseMatrix& C, BaseMatrix& D,
+    real p1, real p2, real p3) {
+  auto updateTerm = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
+  auto applyTerm = A.lazyAssign(A + C);
+  AssignEvaluate(updateTerm, applyTerm);
+}
+/**
+ * Run the same SGD step three ways and check that the results agree.
+ *
+ * The three variants are the BaseMatrix API (A1.sgdUpdate), plain tensor
+ * expressions (sgdUpdateTensor) and lazyAssign (sgdUpdateLazyAssign). Each
+ * variant gets its own copy of A and C, since those are modified, while B and
+ * D are shared. TensorCheckErr compares variants 2 and 3 against variant 1.
+ *
+ * @tparam Tensor  matrix type under test (instantiated below with CpuMatrix
+ *                 and GpuMatrix).
+ * @param height   row count of every operand.
+ * @param width    column count of every operand.
+ */
+template<typename Tensor>
+void testSgdUpdate(int height, int width) {
+  // A1/A2/A3: identical starting copies, one per variant.
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor A3(height, width);
+  A1.randomizeUniform();
+  A2.copyFrom(A1);
+  A3.copyFrom(A1);
+  Tensor B(height, width);
+  B.randomizeUniform();
+  // C1/C2/C3: identical starting copies, one per variant.
+  Tensor C1(height, width);
+  Tensor C2(height, width);
+  Tensor C3(height, width);
+  C1.randomizeUniform();
+  C2.copyFrom(C1);
+  C3.copyFrom(C1);
+  Tensor D(height, width);
+  D.randomizeUniform();
+  real p1 = 0.2;
+  real p2 = 0.3;
+  real p3 = 0.5;
+  /**
+   * Update rule as written in sgdUpdateTensor / sgdUpdateLazyAssign:
+   * c = p2 * c - p1 * d * (b + p3 * a);
+   * a = a + c;
+   */
+  // BaseMatrix API
+  EXPRESSION_PERFORMANCE(
+  A1.sgdUpdate(B, C1, D, p1, p2, p3););
+  // Tensor expression
+  EXPRESSION_PERFORMANCE(
+    sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
+  // lazyAssign
+  EXPRESSION_PERFORMANCE(
+    sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
+  // Both outputs (a and c) of every variant must match the BaseMatrix result.
+  TensorCheckErr(A1, A2);
+  TensorCheckErr(A1, A3);
+  TensorCheckErr(C1, C2);
+  TensorCheckErr(C1, C3);
+}
+// sgdUpdate agreement check on CpuMatrix across the testMatrixCase sweep.
+TEST(sgdUpdate, CPU) {
+  testMatrixCase(testSgdUpdate<CpuMatrix>);
+}
+// sgdUpdate agreement check on GpuMatrix; compiled out when PADDLE_ONLY_CPU
+// is defined.
+#ifndef PADDLE_ONLY_CPU
+TEST(sgdUpdate, GPU) {
+  testMatrixCase(testSgdUpdate<GpuMatrix>);
+}
+#endif
+/**
+ * Test entry point: initializes googletest from the command line, starts the
+ * hl_* runtime on device 0, then runs every registered test.
+ *
+ * @return the result of RUN_ALL_TESTS().
+ */
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+
+  hl_start();
+  hl_init(0);
+
+  const int testStatus = RUN_ALL_TESTS();
+  return testStatus;
+}
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 #include "paddle/utils/Flags.h"
+#include "paddle/math/TrainingAlgorithmOp.h"
 #include "FirstOrderOptimizer.h"
 #include <cmath>
@@ -115,19 +115,28 @@ void SparseMomentumParameterOptimizer::finishBatch() {
 void AdagradParameterOptimizer::update(const VectorPtr vecs[],
                                       const ParameterConfig& config,
                                       size_t sparseId) const {
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-                                                1.0f);
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
+  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon());
+  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+  real epsilon = optConfig_.ada_epsilon();
-                                   *vecs[PARAMETER_MOMENTUM],
+  real learningRate = learningRate_ * config.learning_rate();
-                                   *vecs[PARAMETER_LEARNING_RATE],
+  real momentum = config.momentum();
-                                   learningRate_ * config.learning_rate(),
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
+  adagradApply(value,
+               grad,
+               mom,
+               accum_buffer,
+               accum,
+               lr,
+               epsilon,
+               learningRate,
+               momentum,
+               decayRate);
 }
 ParameterOptimizer::TraverseCallback
@@ -152,37 +161,41 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
                                        const ParameterConfig& config,
                                        size_t sparseId) const {
  CHECK(sparseId == -1LU) << "Sparse update is not supported";
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-      *vecs[PARAMETER_GRADIENT], rou_, 1.0f - rou_);
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
+  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
-                                        epsilon_,
+  real learningRate = learningRate_ * config.learning_rate();
-                                        epsilon_);
+  real momentum = config.momentum();
-  vecs[PARAMETER_LEARNING_RATE]->sqrt();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  adadeltaApply(value,
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
+                grad,
-      *vecs[PARAMETER_GRADIENT],
+                mom,
-      *vecs[PARAMETER_LEARNING_RATE],
+                accum,
-      rou_,
+                accum_update,
-      1.0f - rou_);
+                lr,
+                rou_,
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                epsilon_,
-                                   *vecs[PARAMETER_MOMENTUM],
+                learningRate,
-                                   *vecs[PARAMETER_LEARNING_RATE],
+                momentum,
-                                   learningRate_ * config.learning_rate(),
+                decayRate);
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
 }
 void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
                                       const ParameterConfig& config,
                                       size_t sparseId) const {
-  real accumulatedRou = rou_;
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+  real accumulatedRou = rou_;
  bool firstTime = timer_ == 0;
  if (sparseId != -1LU) {
    CHECK_LT(sparseId, t0Vec_.size());
@@ -191,40 +204,36 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
    t0Vec_[sparseId] = timer_ + 1;
  }
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  real epsilon = optConfig_.ada_epsilon();
-  // For the first time update, make the sum be the current square
+  real learningRate = learningRate_ * config.learning_rate();
-  // so that the initial estimation of E(g_t^2) will not be too small.
+  real momentum = config.momentum();
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-      *vecs[PARAMETER_GRADIENT],
-      accumulatedRou,
+  rmspropApply(value,
-      firstTime ? 1.0f : 1.0f - rou_);
+               grad,
+               mom,
-  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
+               sum,
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
+               sum1,
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou_);
+               lr,
+               accumulatedRou,
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
+               rou_,
-  // Basiclly if the sign of the gradient changes more often,
+               epsilon,
-  // the learning rate will be decreased.
+               learningRate,
-  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+               momentum,
-  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+               decayRate,
-                                           -1.0f);
+               firstTime);
-  vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon());
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate_ * config.learning_rate(),
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
 }
 void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
                                              const ParameterConfig& config,
                                              size_t sparseId) const {
-  real accumulatedRou = rou_;
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+  real accumulatedRou = rou_;
  bool firstTime = timer_ == 0;
  if (sparseId != -1LU) {
    CHECK_LT(sparseId, t0Vec_.size());
@@ -233,77 +242,62 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
    t0Vec_[sparseId] = timer_ + 1;
  }
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  real epsilon = optConfig_.ada_epsilon();
-  // For the first time update, make the sum be the current square
+  real learningRate = learningRate_ * config.learning_rate();
-  // so that the initial estimation of E(g_t^2) will not be too small.
+  real momentum = config.momentum();
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-      *vecs[PARAMETER_GRADIENT],
-      accumulatedRou,
+  decayedAdagradApply(value,
-      firstTime ? 1.0f : 1.0f - rou_);
+                      grad,
+                      mom,
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
+                      sum,
-  // Basiclly if the bigger the magnitude gradient is,
+                      lr,
-  // the smaller the learning rate will be.
+                      accumulatedRou,
-  vecs[PARAMETER_LEARNING_RATE]->assign(optConfig_.ada_epsilon());
+                      rou_,
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+                      epsilon,
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+                      learningRate,
+                      momentum,
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                      decayRate,
-                                   *vecs[PARAMETER_MOMENTUM],
+                      firstTime);
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate_ * config.learning_rate(),
-                                   config.momentum(),
-                                   applyDecay_ ? config.decay_rate() : 0);
 }
 void AdamParameterOptimizer::update(const VectorPtr vecs[],
                                    const ParameterConfig& config,
                                    size_t sparseId) const {
  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  real beta1_power = std::pow(beta1_, step_);
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  real beta2_power = std::pow(beta2_, step_);
-  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
+  real learningRate = config.learning_rate() * learningRate_;
-  Vector* theta = vecs[PARAMETER_VALUE].get();
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  m->add(*g, beta1_, 1 - beta1_);
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  g->square();
+  adamApply(value,
-  v->add(*g, beta2_, 1 - beta2_);
+            grad,
+            mom,
-  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
+            v,
-  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
+            beta1_,
-  g->sqrt(*v);
+            beta2_,
-  g->dotDiv(*m, *g, 0., epsilon_);
+            beta1_power,
-  real alpha = config.learning_rate() * learningRate_;
+            beta2_power,
-  alpha = alpha * std::sqrt(1 - std::pow(beta2_, step_)) /
+            epsilon_,
-          (1 - std::pow(beta1_, step_));
+            learningRate);
-  theta->add(*theta, 1.0, *g, -alpha);
 }
 void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
                                      const ParameterConfig& config,
                                      size_t sparseId) const {
  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  real learningRate = config.learning_rate() * learningRate_;
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1_, 1 - beta1_);
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  u->mulScalar(beta2_);
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  g->abs();
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  u->max(*u, *g);
+  BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
-  g->dotDiv(*m, *u);
-  real learningRate = config.learning_rate() * learningRate_;
-  learningRate /= (1 - std::pow(beta1_, step_));
-  theta->add(*theta, 1.0, *g, -learningRate);
 }
 void OptimizerWithGradientClipping::update(const VectorPtr vecs[],

--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
 FROM ubuntu:14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-RUN apt-get update && \
+RUN apt-get update \
-    apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
+    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
    libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
    python-protobuf python-numpy python-dev swig openssh-server \
    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \
+    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    apt-get clean -y
+    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    && apt-get clean -y
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
    sphinx sphinx_rtd_theme breathe recommonmark
@@ -25,6 +26,7 @@ ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 RUN mkdir /paddle
 COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
+VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
 RUN pip install /usr/local/opt/paddle/share/wheels/*.whl

--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
 FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-RUN apt-get update && \
+RUN apt-get update \
-    apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
+    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
    libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
    python-protobuf python-numpy python-dev swig openssh-server \
    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \
+    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    apt-get clean -y
+    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    && apt-get clean -y
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
    sphinx sphinx_rtd_theme breathe recommonmark
@@ -25,6 +26,7 @@ ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 RUN mkdir /paddle
 COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
+VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
 RUN pip install /usr/local/opt/paddle/share/wheels/*.whl

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -20,8 +20,28 @@ cmake .. \
      -DWITH_AVX=${WITH_AVX} \
      -DWITH_SWIG_PY=ON \
      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=OFF
+      -DWITH_STYLE_CHECK=OFF \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 make -j `nproc`
 make install
+# Install woboq_codebrowser.
+git clone https://github.com/woboq/woboq_codebrowser /woboq
+cd /woboq
+cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+      -DCMAKE_BUILD_TYPE=Release \
+      .
+make
+# Generate the browsable HTML of the Paddle sources.
+#   WOBOQ_OUT: where the generated pages are written.
+#   BUILD_DIR: build directory passed to the generator via -b; presumably the
+#              one holding the exported compile commands - confirm.
+export WOBOQ_OUT=/usr/share/nginx/html/paddle
+export BUILD_DIR=/paddle/build
+mkdir -p "$WOBOQ_OUT"
+# The codebrowser's static data directory is placed next to the output dir.
+cp -rv /woboq/data "$WOBOQ_OUT/../data"
+/woboq/generator/codebrowser_generator \
+    -b "$BUILD_DIR" \
+    -a \
+    -o "$WOBOQ_OUT" \
+    -p paddle:/paddle
+/woboq/indexgenerator/codebrowser_indexgenerator "$WOBOQ_OUT"
 trap : 0
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -17,22 +17,22 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 #include <google/protobuf/text_format.h>
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/GlobalConstants.h"
+#include "TesterConfig.h"
+#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/gserver/layers/ValidationLayer.h"
-#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
-#include "TesterConfig.h"
 namespace paddle {
@@ -66,6 +66,9 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
 }
 void Tester::startTestPeriod() {
+  if (testDataProvider_) {
+    testDataProvider_->reset();
+  }
  testEvaluator_->start();
  testContext_.cost = 0;
  testContext_.numSamples = 0;
@@ -87,27 +90,18 @@ void Tester::testOneDataBatch(const DataBatch& dataBatch,
 void Tester::testOnePeriod() {
  DataBatch dataBatch;
  int64_t batchSize = config_->getOptConfig().batch_size();
-  int batches = std::numeric_limits<int>::max();
  std::vector<Argument> outArgs;
  startTestPeriod();
-  for (int i = 0; i < batches; ++i) {
+  while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
-    int num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
-    if (num == 0) {
-      testDataProvider_->reset();
-      if (intconfig_->prevBatchState) {
-        gradientMachine_->resetState();
-      }
-      break;
-    }
    testOneDataBatch(dataBatch, &outArgs);
  }
  finishTestPeriod();
 }
 void Tester::finishTestPeriod() {
+  if (intconfig_->prevBatchState) {
+    gradientMachine_->resetState();
+  }
  testEvaluator_->finish();
  CHECK_GT(testContext_.numSamples, 0)
      << "There is no samples in your test batch. Possibly "

--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -17,36 +17,38 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 #include <google/protobuf/text_format.h>
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/Excepts.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "RemoteParameterUpdater.h"
-#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
 #include "TesterConfig.h"
 #include "ThreadParameterUpdater.h"
-#include "RemoteParameterUpdater.h"
 #include "TrainerConfigHelper.h"
+#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/gserver/layers/ValidationLayer.h"
 P_DEFINE_string(config, "", "Trainer config file");
-P_DEFINE_int32(test_period, 0,
+P_DEFINE_int32(test_period,
+               0,
               "if equal 0, do test on all test data at the end of "
               "each pass. While if equal non-zero, do test on all test "
               "data every test_period batches");
-P_DEFINE_bool(test_all_data_in_one_period, false,
+P_DEFINE_bool(test_all_data_in_one_period,
-               "This option was deprecated, since we will always do "
+              false,
-               "test on all test set ");
+              "This option was deprecated, since we will always do "
+              "test on all test set ");
 P_DEFINE_bool(local, true, "Train in local mode or not");
@@ -392,10 +394,6 @@ void Trainer::startTrain() {
    dataProvider_->reset();
  }
-  if (this->testDataProvider_) {
-    this->testDataProvider_->reset();
-  }
  trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
 }
@@ -630,16 +628,14 @@ void Trainer::test() { tester_->test(); }
 std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
  TesterConfig* conf = new TesterConfig;
  if (FLAGS_test_period) {
-    LOG(WARNING)
+    LOG(WARNING) << "The meaning of --test_period is changed: "
-      << "The meaning of --test_period is changed: "
+                 << "if equal 0, do test on all test data at the end of "
-      << "if equal 0, do test on all test data at the end of "
+                 << "each pass. While if equal non-zero, do test on all test "
-      << "each pass. While if equal non-zero, do test on all test "
+                 << "data every test_period batches ";
-      << "data every test_period batches ";
  }
  if (FLAGS_test_all_data_in_one_period) {
-    LOG(WARNING)
+    LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
-      << "--test_all_data_in_one_period was deprecated, since "
+                 << "we will always do test on all test set ";
-      << "we will always do test on all test set ";
  }
  conf->testPeriod = FLAGS_test_period;
  conf->prevBatchState = FLAGS_prev_batch_state;