diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 980a97a07c996eca2e8c126a6ad5ab7f340fa1e5..2ca988c406ae2987e26ca37dbc17cc0a2af43743 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,10 +17,14 @@
     -   id: detect-private-key
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
     -   id: end-of-file-fixer
--   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
-    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+-   repo: local
     hooks:
-    -   id: clang-formater
+    -   id: clang-format
+        name: clang-format
+        description: Format files with ClangFormat.
+        entry: clang-format -i
+        language: system
+        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index ef31c252038ce18655913c0f41343fe6dc7dbb86..d00a9bb3a30cfb16623e073414088059481c3e1a 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
         if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
             message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
         endif()
+        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
+        # Use Debug mode instead for now.
+        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
+            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+        endif()
     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
         # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 1054c3d3f15c2dcc55e585d6b82f9a983b7f3dfa..372272a53c12c314fc80eebbce5eae9fcabc55ba 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -104,6 +104,11 @@ cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
     :noindex:
+
+row_l2_norm
+-----------
+..  autoclass:: paddle.v2.layer.row_l2_norm
+    :noindex:
     
 Recurrent Layers
 ================
diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h
index e7ff09dd5c954378afeca299e901277c3ebdb96a..92621f8c18ec0d03160a23c462830d14272c7f64 100644
--- a/paddle/framework/detail/tensor-inl.h
+++ b/paddle/framework/detail/tensor-inl.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
 #include "paddle/memory/memcpy.h"
 
 namespace paddle {
@@ -62,9 +61,11 @@ inline T* Tensor::mutable_data(platform::Place place) {
     if (platform::is_cpu_place(place)) {
       holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
           boost::get<platform::CPUPlace>(place), size));
+    } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
     }
-#ifndef PADDLE_ONLY_CPU
-    else if (platform::is_gpu_place(place)) {
+#else
       holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
           boost::get<platform::GPUPlace>(place), size));
     }
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index cfe9cba308556475ef64b45e7178dfc418761598..cb86e6be2be3624bf54ee28193ca5d4c7bafa0eb 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -20,16 +20,16 @@ namespace paddle {
 namespace framework {
 
 template <>
-Eigen::DefaultDevice* ExecutionContext::GetEigenDevice<
+Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
     platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }
 
 #ifndef PADDLE_ONLY_CPU
 template <>
-Eigen::GpuDevice*
+Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif
 
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 0832a663dd01fe2921366d70599bc867e73af47c..55435103489ace11868eed61c38018d8ba357e65 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -253,7 +253,7 @@ class ExecutionContext : public OperatorContext {
   template <typename PlaceType,
             typename DeviceType =
                 typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
-  DeviceType* GetEigenDevice() const;
+  DeviceType& GetEigenDevice() const;
 
   platform::Place GetPlace() const { return device_context_.GetPlace(); }
 
diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d609be43b73a86d0d0f7b60be993836e2ea6fff
--- /dev/null
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for L2 normalization in each row,
+ * \f[
+ *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+ * \f]
+ * where the size of \f$in\f$ is (batchSize x dataDim),
+ * and the size of \f$out\f$ is (batchSize x dataDim).
+ */
+
+class RowL2NormLayer : public Layer {
+protected:
+  MatrixPtr inSquare_;
+  MatrixPtr l2NormReciprocal_;
+  MatrixPtr dotSum_;
+
+public:
+  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
+
+bool RowL2NormLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void RowL2NormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+  CHECK_EQ(dataDim, inV->getWidth());
+  resetOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
+  inV->square2(*inSquare_);
+  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
+  inSquare_->rowSum(*l2NormReciprocal_);
+  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
+  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
+  outV->rowScale(0, *inV, *l2NormReciprocal_);
+}
+
+void RowL2NormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+  size_t batchSize = inV->getHeight();
+
+  // inG[ij] += outG[ij] / l2NormReciprocal
+  // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
+  // inV[i])
+  if (inG) {
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    inSquare_->rowScale(0, *inV, *dotSum_);
+    inG->sub(*inSquare_);
+    inG->addRowScale(0, *outG, *l2NormReciprocal_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index c726e5a74eed26aec5cf9ea9aea2386335d06119..fe11278f41c0118ee0bdb34f17fbf9602e0fa76b 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1916,6 +1916,19 @@ TEST(Layer, ClipLayer) {
   }
 }
 
+TEST(Layer, RowL2NormLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(size);
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
index 79d8de6cd46e1c72b14b0554c7be7b4eee281f4c..f961b37565f400b5c26844b9e7a3cff5e682340b 100644
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
 
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
index d2b649fcbd1e5cac1c8cfcfd4e522e41135f7d1f..a4ee407cae1ec4974ae5addd8626686d908eaf32 100644
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -28,8 +28,7 @@ public:
 
     output->mutable_data<T>(context.GetPlace());
 
-    EigenVector<T>::Flatten(*output).device(
-        *(context.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*output).device(context.GetEigenDevice<Place>()) =
         framework::EigenVector<T>::Flatten(*input0) +
         framework::EigenVector<T>::Flatten(*input1);
   }
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 19e4b74596a0f59edd04db830ec6f6f481373465..926a0c616b957d8e542c1f3dee227a718fb29f07 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
 
 REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index 5f7d443751d1cdd7de3b67b0de2758ba1d566fb3..20f21105298a20b2083c35e3a5d8b8049b0752ca 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -27,7 +27,7 @@ public:
 
     output->mutable_data<T>(context.GetPlace());
 
-    EigenScalar<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
+    EigenScalar<T>::From(*output).device(context.GetEigenDevice<Place>()) =
         EigenVector<T>::Flatten(*input).mean();
   }
 };
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
index c27fc886ce7238a13c8ef86bce673a2b54949a9d..dc9236701627dc9335b844d2a82e18eb1f7dfd42 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -12,6 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
+#define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 
 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index eef72ab293e13a9d05ce0013be41ec4bb75d6077..1d0617ab8bd44297ee112c58542cf6487118e801 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -29,7 +29,7 @@ public:
     auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
 
-    EigenMatrix<T>::From(*output).device(*(context.GetEigenDevice<Place>())) =
+    EigenMatrix<T>::From(*output).device(context.GetEigenDevice<Place>()) =
         EigenMatrix<T>::From(*context.Input<Tensor>("X"))
             .contract(EigenMatrix<T>::From(*context.Input<Tensor>("Y")),
                       dim_pair);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index 4b33e38ebabe853e179fe70ef7fde0a80b9050e2..82338ceccc06653791b26472e18d804f62735649 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"
 
 REGISTER_OP_GPU_KERNEL(rowwise_add,
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index b86dd5463436bf521f9939b1c421b39f11102769..bd4d1128955fb718d3a84dfd96d8c68d7196e9cc 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -33,7 +33,7 @@ public:
     const int rest_size = input.size() / bias_size;
     Eigen::DSizes<int, 1> one_d(input.size());
     Eigen::DSizes<int, 1> bcast(rest_size);
-    output.reshape(one_d).device(*(context.GetEigenDevice<Place>())) =
+    output.reshape(one_d).device(context.GetEigenDevice<Place>()) =
         input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
   }
 };
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index f8f5b90cab460b4457cfb0a88bfc012bafe0fbc2..d79258cbf13c699cfb2afaee229cf96a3e377b5e 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
 
 REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index af1dfdd756ceb9991bee6b85c3281c05f0fb5a9f..d8ddbac573ffae8bd5d4b53041ed4dd35bb5ae0c 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -29,7 +29,7 @@ public:
 
     param_out->mutable_data<T>(ctx.GetPlace());
 
-    EigenVector<T>::Flatten(*param_out).device(*(ctx.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*param_out).device(ctx.GetEigenDevice<Place>()) =
         EigenVector<T>::Flatten(*param) - lr * EigenVector<T>::Flatten(*grad);
   }
 };
diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu
index f679b20418f04eff4310efe4e121963ce5a235e0..c9d11a2e1f9dcc563765c9e8cc1bae6beff57f18 100644
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"
 
 REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
index 3dd23a9ebc7ac0972d6ee07b9ac051d59e66f62f..f518ddcf3b2381f0b8ceec3eee260f033a847263 100644
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -27,8 +27,7 @@ public:
     auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
 
-    EigenVector<T>::Flatten(*output).device(
-        *(context.GetEigenDevice<Place>())) =
+    EigenVector<T>::Flatten(*output).device(context.GetEigenDevice<Place>()) =
         1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(*input)).exp());
   }
 };
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
index a1f6944a369fe5148ffcfeabf3bf7063dcbc2664..ddf8f6e913ccf450185f377f531bf978f69ed1fc 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@@ -1,3 +1,4 @@
+#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
 
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index a5c19c5fc7c6f5909dbb355aff09bf15405b6957..75c5197697dada58e09f4cda41cea13af56e79a3 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -46,9 +46,9 @@ public:
                                .reshape(batch_by_one)
                                .broadcast(one_by_class));
 
-    softmax.device(*(context.GetEigenDevice<Place>())) = shifted_logits.exp();
+    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
 
-    softmax.device(*(context.GetEigenDevice<Place>())) =
+    softmax.device(context.GetEigenDevice<Place>()) =
         (softmax *
          softmax.sum(along_class)
              .inverse()
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 26c8eb78e614a68ec9728aad727d8fe3e08547ae..60a42c777d1c2ebbc22fdb77b1100cc6fcf7ff35 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -144,12 +144,12 @@ inline void throw_on_error(T e) {
   throw_on_error(e, "");
 }
 
-#define PADDLE_THROW(...)                                      \
-  do {                                                         \
-    throw ::paddle::platform::EnforceNotMet(                   \
-        std::make_exception_ptr(                               \
-            std::runtime_error(string::Sprintf(__VA_ARGS__))), \
-        __FILE__, __LINE__);                                   \
+#define PADDLE_THROW(...)                                              \
+  do {                                                                 \
+    throw ::paddle::platform::EnforceNotMet(                           \
+        std::make_exception_ptr(                                       \
+            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
+        __FILE__, __LINE__);                                           \
   } while (0)
 
 #define PADDLE_ENFORCE(...)                                             \
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 801ef50e577d563f4534f33e49aa7b72ab840d89..d3cde07bd045dfe46be2363c14720c1393221157 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -20,6 +20,8 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
 #include "paddle/pybind/tensor_bind.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
@@ -55,6 +57,14 @@ static size_t UniqueIntegerGenerator() {
   return generator.fetch_add(1);
 }
 
+bool IsCompileGPU() {
+#ifdef PADDLE_ONLY_CPU
+  return false;
+#else
+  return true;
+#endif
+}
+
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of PaddlePaddle");
 
@@ -69,15 +79,27 @@ PYBIND11_PLUGIN(core) {
              self.Resize(pd::make_ddim(dim));
            })
       .def("alloc_float",
-           [](pd::Tensor& self) {
-             self.mutable_data<float>(paddle::platform::CPUPlace());
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<float>(place);
            })
       .def("alloc_int",
-           [](pd::Tensor& self) {
-             self.mutable_data<int>(paddle::platform::CPUPlace());
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<int>(place);
            })
-      .def("set", paddle::pybind::PyTensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyTensorSetFromArray<int>)
+      .def("alloc_int",
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+             self.mutable_data<int>(place);
+           })
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
+#ifndef PADDLE_ONLY_CPU
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
+#endif
       .def("shape",
            [](pd::Tensor& self) { return pd::vectorize(self.dims()); });
 
@@ -136,11 +158,27 @@ All parameter, weight, gradient are variables in Paddle.
        "The module will return special predefined variable name in Paddle")
       .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
       .def("temp", pd::OperatorBase::TMP_VAR_NAME);
-
+  // clang-format off
   py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
-      .def_static("cpu_context", []() -> paddle::platform::DeviceContext* {
-        return new paddle::platform::CPUDeviceContext();
-      });
+      .def_static("create",
+                  [](paddle::platform::CPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+                    return new paddle::platform::CPUDeviceContext();
+                  })
+      .def_static("create",
+                  [](paddle::platform::GPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifdef PADDLE_ONLY_CPU
+                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
+#else
+                    return new paddle::platform::CUDADeviceContext(place);
+#endif
+                  });
+  // clang-format on
+
+  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
 
   py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
       m, "Operator");
@@ -176,5 +214,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("unique_integer", UniqueIntegerGenerator);
 
+  m.def("is_compile_gpu", IsCompileGPU);
+
   return m.ptr();
 }
diff --git a/paddle/pybind/tensor_bind.h b/paddle/pybind/tensor_bind.h
index 995e102bf9d342e1604f5ae704288d6cf68d97a4..def37219ccefd5435f1212c4e4daac5a351d76f4 100644
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
@@ -13,9 +13,11 @@
    limitations under the License. */
 
 #pragma once
-#include <paddle/framework/tensor.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
+#include <string>
+#include "paddle/framework/tensor.h"
+#include "paddle/memory/memcpy.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
 
 namespace py = pybind11;
 
@@ -40,9 +42,6 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
   py::buffer_info operator()(framework::Tensor &tensor) {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
-                   "Only CPU tensor can cast to numpy array");
-
     if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
@@ -56,12 +55,17 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         strides[i - 1] = sizeof(CUR_TYPE) * prod;
         prod *= dims_outside[i - 1];
       }
-
+      framework::Tensor dst_tensor;
+      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
+        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
+      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+        dst_tensor = tensor;
+      }
       return py::buffer_info(
-          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
           sizeof(CUR_TYPE),
           py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(tensor.dims()),
+          (size_t)framework::arity(dst_tensor.dims()),
           dims_outside,
           strides);
     } else {
@@ -77,9 +81,10 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
 }
 
 template <typename T>
-void PyTensorSetFromArray(
+void PyCPUTensorSetFromArray(
     framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
   std::vector<int> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
@@ -87,9 +92,28 @@ void PyTensorSetFromArray(
   }
 
   self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  auto *dst = self.mutable_data<T>(place);
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
+#ifndef PADDLE_ONLY_CPU
+template <typename T>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::GPUPlace &place) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(place);
+  paddle::platform::GpuMemcpySync(
+      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
+}
+#endif
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 3860facb099950a5287d3f6b89c3de38f588f568..69ae0ea2d72c199a8e17c0595693e5e0b2f79ee1 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -148,7 +148,7 @@ cat >> /paddle/build/Dockerfile <<EOF
 ADD *.deb /
 # run paddle version to install python packages first
 RUN apt-get update &&\
-    apt-get install -y python-pip && pip install -U pip && \
+    apt-get install -y wget python-pip && pip install -U pip && \
     dpkg -i /*.deb ; apt-get install -f -y && \
     apt-get clean -y && \
     rm -f /*.deb && \
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 997f5bb47e19655522150ed427f437eca5f48fd7..9ea69fc5e57636c22fb20d5d97de760b9cc3bcde 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2768,6 +2768,16 @@ class SumToOneNormLayer(LayerBase):
         self.set_layer_size(input_layer0.size)
 
 
+@config_layer('row_l2_norm')
+class RowL2NormLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(RowL2NormLayer, self).__init__(
+            name, 'row_l2_norm', 0, inputs=inputs, **xargs)
+        config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)
+
+
 @config_layer('cos_vm')
 class CosSimVecMatLayer(LayerBase):
     def __init__(self, name, size, inputs, cos_scale=1.0, device=None):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 8c8a14dc1d9ac7b3f6131d9393b76e24a91457c3..ea5fdcc50f6abbc67fb61b7fd56c100d9f9811d0 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -76,6 +76,7 @@ __all__ = [
     'trans_layer',
     'rotate_layer',
     'sum_to_one_norm_layer',
+    'row_l2_norm_layer',
     'get_output_layer',
     'LayerType',
     'context_projection',
@@ -161,6 +162,7 @@ class LayerType(object):
     BATCH_NORM_LAYER = 'batch_norm'
     NORM_LAYER = 'norm'
     SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
+    ROW_L2_NORM_LAYER = 'row_l2_norm'
     ADDTO_LAYER = 'addto'
 
     CONCAT_LAYER = 'concat'
@@ -2891,6 +2893,42 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None):
         name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size)
 
 
+@wrap_name_default()
+@layer_support()
+def row_l2_norm_layer(input, name=None, layer_attr=None):
+    """
+    A layer for L2-normalization in each row.
+
+    .. math::
+       out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+
+    where the size of :math:`in` is (batchSize x dataDim) ,
+    and the size of :math:`out` is a (batchSize x dataDim) .
+
+    The example usage is:
+
+    .. code-block:: python
+
+       row_l2_norm_layer = row_l2_norm_layer(input=layer)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param name: Layer name.
+    :type name: basestring
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.ROW_L2_NORM_LAYER,
+        inputs=[input.name],
+        **ExtraAttr.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size)
+
+
 @wrap_name_default("addto")
 @wrap_act_default(act=LinearActivation())
 @wrap_bias_attr_default(has_bias=False)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index d71c41f77e5d7f460e3a012a4ad094436509782e..0ffa58bc1e2088f75e7cd25c7ecdffbe270825a4 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -7,6 +7,6 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
-test_recursive_topology test_gated_unit_layer test_clip_layer)
+test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..c2786ff55c7023d856d739face5e747cc5fee870
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
@@ -0,0 +1,27 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__row_l2_norm_layer_0__"
+  type: "row_l2_norm"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__row_l2_norm_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__row_l2_norm_layer_0__"
+  input_layer_names: "input"
+  output_layer_names: "__row_l2_norm_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac8badb26a40e96e75225e6f61aa536cd28e9098
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+row_l2_norm = row_l2_norm_layer(input=data)
+
+outputs(row_l2_norm)
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 540636a0e8100fbf97231bd548dbc1176b07daca..4619b0edc3dd7e253e01f7fee5e6a8641340d291 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -8,7 +8,6 @@ add_python_test(test_framework
     test_fc_op.py
     test_add_two_op.py
     test_sgd_op.py
-    test_cross_entropy_op.py
     test_mul_op.py
     test_mean_op.py
     test_sigmoid_op.py
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index 99085c367221150c8386a24e8d90d58fd63894c4..98fae1b975ad6243b20e5c19ec6ff68d5536cd74 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -26,40 +26,45 @@ class OpTestMeta(type):
 
             scope = core.Scope()
             kwargs = dict()
+            places = []
+            places.append(core.CPUPlace())
+            if core.is_compile_gpu():
+                places.append(core.GPUPlace(0))
 
-            for in_name in func.all_input_args:
-                if hasattr(self, in_name):
-                    kwargs[in_name] = in_name
-                    var = scope.new_var(in_name).get_tensor()
-                    arr = getattr(self, in_name)
-                    var.set_dims(arr.shape)
-                    var.set(arr)
-                else:
-                    kwargs[in_name] = "@EMPTY@"
+            for place in places:
+                for in_name in func.all_input_args:
+                    if hasattr(self, in_name):
+                        kwargs[in_name] = in_name
+                        var = scope.new_var(in_name).get_tensor()
+                        arr = getattr(self, in_name)
+                        var.set_dims(arr.shape)
+                        var.set(arr, place)
+                    else:
+                        kwargs[in_name] = "@EMPTY@"
 
-            for out_name in func.all_output_args:
-                if hasattr(self, out_name):
-                    kwargs[out_name] = out_name
-                    scope.new_var(out_name).get_tensor()
+                for out_name in func.all_output_args:
+                    if hasattr(self, out_name):
+                        kwargs[out_name] = out_name
+                        scope.new_var(out_name).get_tensor()
 
-            for attr_name in func.all_attr_args:
-                if hasattr(self, attr_name):
-                    kwargs[attr_name] = getattr(self, attr_name)
+                for attr_name in func.all_attr_args:
+                    if hasattr(self, attr_name):
+                        kwargs[attr_name] = getattr(self, attr_name)
 
-            op = func(**kwargs)
+                op = func(**kwargs)
 
-            op.infer_shape(scope)
+                op.infer_shape(scope)
 
-            ctx = core.DeviceContext.cpu_context()
-            op.run(scope, ctx)
+                ctx = core.DeviceContext.create(place)
+                op.run(scope, ctx)
 
-            for out_name in func.all_output_args:
-                actual = numpy.array(scope.find_var(out_name).get_tensor())
-                expect = getattr(self, out_name)
-                # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
-                # has some diff, and could not pass unittest. So I set decimal 3 here.
-                # And I will check this in future.
-                numpy.testing.assert_almost_equal(actual, expect, decimal=3)
+                for out_name in func.all_output_args:
+                    actual = numpy.array(scope.find_var(out_name).get_tensor())
+                    expect = getattr(self, out_name)
+                    # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
+                    # has some diff, and could not pass unittest. So I set decimal 3 here.
+                    # And I will check this in future.
+                    numpy.testing.assert_almost_equal(actual, expect, decimal=3)
 
         obj.test_all = test_all
         return obj
diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py
index a06d7a78ecf838a49e5f2808d3686c6b92faa8ce..73b373490956c741f06be2473e70592dc06fbc25 100644
--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
@@ -8,8 +8,8 @@ class TestAddOp(unittest.TestCase):
 
     def setUp(self):
         self.type = "add_two"
-        self.X = numpy.random.random((342, 345)).astype("float32")
-        self.Y = numpy.random.random((342, 345)).astype("float32")
+        self.X = numpy.random.random((102, 105)).astype("float32")
+        self.Y = numpy.random.random((102, 105)).astype("float32")
         self.Out = self.X + self.Y
 
 
diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py
index 43931aac406cd93beede008066aa1c0c00eba6ea..00dc4399aaf59e6382692c3a4356f89a7e79a0c5 100644
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
@@ -7,17 +7,19 @@ import paddle.v2.framework.create_op_creation_methods as creation
 class TestFc(unittest.TestCase):
     def test_fc(self):
         scope = core.Scope()
+        place = core.CPUPlace()
         x = scope.new_var("X")
+
         x_tensor = x.get_tensor()
         x_tensor.set_dims([1000, 784])
-        x_tensor.alloc_float()
+        x_tensor.alloc_float(place)
 
         w = scope.new_var("W")
         w_tensor = w.get_tensor()
         w_tensor.set_dims([784, 100])
-        w_tensor.alloc_float()
+        w_tensor.alloc_float(place)
 
-        w_tensor.set(numpy.random.random((784, 100)).astype("float32"))
+        w_tensor.set(numpy.random.random((784, 100)).astype("float32"), place)
 
         # Set a real numpy array here.
         # x_tensor.set(numpy.array([]))
@@ -32,7 +34,7 @@ class TestFc(unittest.TestCase):
         op.infer_shape(scope)
         self.assertEqual([1000, 100], tensor.shape())
 
-        ctx = core.DeviceContext.cpu_context()
+        ctx = core.DeviceContext.create(place)
 
         op.run(scope, ctx)
 
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py
index 0a87e66cd03af1bf84be8ffe111e4a8c3a24d6dc..e1ac66d3a4d23d617f7c5a4d97d070b2660954c8 100644
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
@@ -8,8 +8,8 @@ class TestMulOp(unittest.TestCase):
 
     def setUp(self):
         self.type = "mul"
-        self.X = np.random.random((32, 784)).astype("float32")
-        self.Y = np.random.random((784, 100)).astype("float32")
+        self.X = np.random.random((32, 84)).astype("float32")
+        self.Y = np.random.random((84, 100)).astype("float32")
         self.Out = np.dot(self.X, self.Y)
 
 
diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
index ef1514983c03f822f84b85437d1cfe653b6a1a2e..04abc14ee198fe4e2307e009c696a2b40ec271b6 100644
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -8,8 +8,8 @@ class TestRowwiseAddOp(unittest.TestCase):
 
     def setUp(self):
         self.type = "rowwise_add"
-        self.X = np.random.random((32, 784)).astype("float32")
-        self.b = np.random.random(784).astype("float32")
+        self.X = np.random.random((32, 84)).astype("float32")
+        self.b = np.random.random(84).astype("float32")
         self.Out = np.add(self.X, self.b)
 
 
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index 405d73b224fa153e50b4ec408a921f2bdaab46aa..ca03cc11abe2ceb31b33a87797aa752943dd2a7d 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -8,8 +8,8 @@ class TestSGD(unittest.TestCase):
 
     def setUp(self):
         self.type = "sgd"
-        self.param = numpy.random.random((342, 345)).astype("float32")
-        self.grad = numpy.random.random((342, 345)).astype("float32")
+        self.param = numpy.random.random((102, 105)).astype("float32")
+        self.grad = numpy.random.random((102, 105)).astype("float32")
         self.learning_rate = 0.1
         self.param_out = self.param - self.learning_rate * self.grad
 
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py
index 6d59863cea29832f648139e07a134050e22bfa21..1af39818a305215b45219b8c5f0a10630fd64279 100644
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -7,16 +7,17 @@ class TestScope(unittest.TestCase):
     def test_int_tensor(self):
         scope = core.Scope()
         var = scope.new_var("test_tensor")
+        place = core.CPUPlace()
+
         tensor = var.get_tensor()
 
         tensor.set_dims([1000, 784])
-        tensor.alloc_int()
-
+        tensor.alloc_int(place)
         tensor_array = numpy.array(tensor)
         self.assertEqual((1000, 784), tensor_array.shape)
         tensor_array[3, 9] = 1
         tensor_array[19, 11] = 2
-        tensor.set(tensor_array)
+        tensor.set(tensor_array, place)
 
         tensor_array_2 = numpy.array(tensor)
         self.assertEqual(1.0, tensor_array_2[3, 9])
@@ -25,16 +26,18 @@ class TestScope(unittest.TestCase):
     def test_float_tensor(self):
         scope = core.Scope()
         var = scope.new_var("test_tensor")
+        place = core.CPUPlace()
+
         tensor = var.get_tensor()
 
         tensor.set_dims([1000, 784])
-        tensor.alloc_float()
+        tensor.alloc_float(place)
 
         tensor_array = numpy.array(tensor)
         self.assertEqual((1000, 784), tensor_array.shape)
         tensor_array[3, 9] = 1.0
         tensor_array[19, 11] = 2.0
-        tensor.set(tensor_array)
+        tensor.set(tensor_array, place)
 
         tensor_array_2 = numpy.array(tensor)
         self.assertAlmostEqual(1.0, tensor_array_2[3, 9])