PaddleDetection (fork of PaddlePaddle / PaddleDetection)
Commit 5229ccbd: "merge develop"
Authored Feb 16, 2018 by Yang Yang
Parents: 1d9fd1c0, 89ead8d1
Showing 74 changed files, with 1,111 additions and 756 deletions (+1111 / -756).
Changed files:

cmake/external/boost.cmake  +7 / -0
doc/design/parallel_do.md  +162 / -0
paddle/fluid/framework/data_device_transform.cc  +1 / -1
paddle/fluid/framework/data_device_transform_test.cu  +2 / -2
paddle/fluid/framework/executor.cc  +6 / -3
paddle/fluid/framework/lod_tensor.cc  +6 / -6
paddle/fluid/framework/lod_tensor.h  +2 / -2
paddle/fluid/framework/mixed_vector.h  +7 / -5
paddle/fluid/framework/reader.cc  +1 / -1
paddle/fluid/framework/selected_rows.cc  +2 / -2
paddle/fluid/framework/tensor_util.cc  +191 / -7
paddle/fluid/framework/tensor_util.cu (deleted)  +1 / -119
paddle/fluid/framework/tensor_util.cu (re-added as a symlink)  +1 / -119
paddle/fluid/framework/tensor_util.h  +29 / -232
paddle/fluid/framework/tensor_util_test.cc  +34 / -31
paddle/fluid/framework/tensor_util_test.cu  +4 / -4
paddle/fluid/framework/threadpool.h  +1 / -1
paddle/fluid/operators/CMakeLists.txt  +14 / -0
paddle/fluid/operators/array_operator.h  +1 / -1
paddle/fluid/operators/array_to_lod_tensor_op.cc  +2 / -2
paddle/fluid/operators/assign_op.cc  +2 / -2
paddle/fluid/operators/assign_value_op.h  +1 / -1
paddle/fluid/operators/batch_size_like.cc  +64 / -0
paddle/fluid/operators/batch_size_like.h  +36 / -0
paddle/fluid/operators/beam_search_decode_op.h  +2 / -2
paddle/fluid/operators/compare_op.h  +0 / -1
paddle/fluid/operators/detection_output_op.h  +17 / -17
paddle/fluid/operators/expand_op.h  +2 / -1
paddle/fluid/operators/feed_op.cc  +1 / -1
paddle/fluid/operators/fetch_op.cc  +1 / -1
paddle/fluid/operators/fill_constant_batch_size_like_op.cc  +5 / -47
paddle/fluid/operators/fill_op.cc  +1 / -1
paddle/fluid/operators/gaussian_random_batch_size_like_op.cc  +73 / -0
paddle/fluid/operators/gaussian_random_op.cc  +8 / -2
paddle/fluid/operators/gaussian_random_op.cu  +5 / -1
paddle/fluid/operators/layer_norm_op.h  +2 / -2
paddle/fluid/operators/load_combine_op.cc  +1 / -1
paddle/fluid/operators/load_op.cc  +1 / -1
paddle/fluid/operators/lod_reset_op.h  +2 / -2
paddle/fluid/operators/lod_tensor_to_array_op.cc  +3 / -3
paddle/fluid/operators/math/context_project.h  +4 / -2
paddle/fluid/operators/math/im2col_test.cc  +7 / -7
paddle/fluid/operators/math/math_function_test.cu  +18 / -18
paddle/fluid/operators/math/selected_rows_functor_test.cu  +4 / -4
paddle/fluid/operators/math/sequence_padding.cu  +2 / -2
paddle/fluid/operators/math/sequence_padding_test.cc  +2 / -2
paddle/fluid/operators/math/vol2col_test.cc  +4 / -4
paddle/fluid/operators/merge_lod_tensor_op.cc  +4 / -3
paddle/fluid/operators/mine_hard_examples_op.cc  +2 / -1
paddle/fluid/operators/multiplex_op.cu  +2 / -2
paddle/fluid/operators/nccl_op_test.cu.cc  +1 / -1
paddle/fluid/operators/parallel_do_op.cc  +3 / -3
paddle/fluid/operators/print_op.cc  +1 / -1
paddle/fluid/operators/recurrent_op.cc  +4 / -4
paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc  +1 / -1
paddle/fluid/operators/reshape_op.h  +2 / -2
paddle/fluid/operators/sequence_reshape_op.h  +2 / -2
paddle/fluid/operators/sequence_slice_op.h  +8 / -8
paddle/fluid/operators/shrink_rnn_memory_op.cc  +1 / -1
paddle/fluid/operators/split_lod_tensor_op.cc  +5 / -4
paddle/fluid/operators/sum_op.h  +2 / -2
paddle/fluid/operators/tensor_array_read_write_op.cc  +2 / -2
paddle/fluid/operators/uniform_random_batch_size_like_op.cc  +72 / -0
paddle/fluid/operators/uniform_random_op.cc  +7 / -2
paddle/fluid/operators/uniform_random_op.cu  +3 / -0
paddle/fluid/operators/warpctc_op.h  +3 / -2
paddle/fluid/pybind/tensor_py.h  +3 / -3
python/paddle/v2/fluid/layers/nn.py  +38 / -0
python/paddle/v2/fluid/layers/ops.py  +3 / -0
python/paddle/v2/fluid/tests/book/test_understand_sentiment.py  +81 / -16
python/paddle/v2/fluid/tests/unittests/op_test.py  +17 / -1
python/paddle/v2/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py  +46 / -0
python/paddle/v2/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py  +42 / -0
python/paddle/v2/fluid/tests/unittests/test_uniform_random_op.py  +14 / -32
cmake/external/boost.cmake
```diff
@@ -15,6 +15,13 @@
 include(ExternalProject)
 
 set(BOOST_PROJECT  "extern_boost")
+# To release PaddlePaddle as a pip package, we have to follow the
+# manylinux1 standard, which features as old Linux kernels and
+# compilers as possible and recommends CentOS 5. Indeed, the earliest
+# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new
+# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
+# checked that the devtools package of CentOS 6 installs boost 1.41.0.
+# So we use 1.41.0 here.
 set(BOOST_VER      "1.41.0")
 set(BOOST_TAR      "boost_1_41_0")
 set(BOOST_URL      "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
```
doc/design/parallel_do.md (new file, mode 0 → 100644)

# Design Doc: Parallel_Do in PaddlePaddle

In PaddlePaddle, we use the parallel_do primitive to represent multithreaded data-parallel processing.

## Design overview

The definition of a parallel_do op looks like the following:

```c++
AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable();
AddInput(kParameters, "Parameters are duplicated over different devices")
    .AsDuplicable();
AddInput(kPlaces, "Devices used for parallel processing");
AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable();
AddOutput(kParallelScopes,
          "Scopes for all local variables in forward pass. One scope for each device");
AddAttr<framework::BlockDesc*>(kParallelBlock,
                               "List of operators to be executed in parallel");
```

A vanilla implementation of parallel_do can be shown as the following (`|` means single thread and `||||` means multiple threads):

```
In the forward pass
  |    Split input onto different devices
  |    Copy parameters onto different devices
  |||| Compute forward pass in parallel
  |    Merge output from different devices

In the backward pass
  |    Split output@grad onto different devices
  |||| Compute backward pass in parallel
  |    Accumulate param@grad from different devices to the first device
  |    Merge input@grad from different devices
  |    Copy param@grad to the place of parallel_do_op
```

This implementation allows writing mixed-device programs like this:

```python
# get embedding feature on CPU
feature = some_cpu_only_op(data)

gpu_places = get_place(use_gpu=True)
# parallel processing on multiple GPUs
pd = ParallelDo(gpu_places)
with pd.do():
    read_input(feature)
    prediction = my_net(feature)
    write_output(prediction)
prediction = pd()
loss = cross_entropy(prediction, label)
```

And the ProgramDesc looks like the following:

```
# start_program will be run by executor(CPUPlace); all w1, w2 will be allocated on CPU
start_program
{
  vars: w1, w2
  ops: init(w1), init(w2)
}

main_program
{
block0 {
  vars: data, places, w1, w2
  ops: data, get_place, parallel_do(block1),
       parallel_do_grad(block2),
       sgd(w2, w2_grad),
       sgd(w1, w1_grad)
}
block1 {
  parent_block: 0
  vars: data, h1, h2, loss
  ops: fc, fc, softmax
}
block2 {
  parent_block: 1
  vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
  ops: softmax_grad,
       fc_grad
       fc_grad
}
}
```

## Performance Improvement

There are several places where we can make this parallel_do faster.

### forward: split input onto different devices

If the input of the parallel_do is independent of any prior operators, we can avoid this step by prefetching the input onto the different devices in a separate background thread. The Python code looks like this:

```python
pd = ParallelDo(gpu_places)
with pd.do():
    feature = get_data_from_prefetch_queue(gpu_places)
    prediction = my_net(feature)
    write_output(activation)
```

### forward: copy parameters onto different devices

We can avoid this step by making each device keep its own copy of the parameters. This requires:

1. `fluid.default_start_up_program()` to be run on all devices
1. in the backward pass, allreducing param@grad across the devices, which in turn requires
   1. `backward.py` to add `allreduce` operators at parallel_do_grad
   1. the `allreduce` operators to be called in async mode to achieve maximum throughput
1. applying the gradient-related ops (i.e. clipping, normalization, decay, sgd) on the different devices in parallel

By doing so, we also avoid "backward: accumulate param@grad from different devices to the first device". The ProgramDesc then looks like the following:

```
# w1, w2 will be allocated on all GPUs
start_program
{
block0 {
  parallel_do(block1)
}
block1 {
  parent_block: 0
  vars: w1, w2
  ops: init(w1), init(w2)
}
}

main_program
{
block0 {
  vars: data, places, w1, w2
  ops: data, get_place, parallel_do(block1),
       parallel_do_grad(block2),   # append_backward
       parallel_do(block3)         # append_optimization
}
block1 {
  parent_block: 0
  vars: data, h1, h2, loss
  ops: fc, fc, softmax
}
block2 {
  parent_block: 1
  vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
  ops: softmax_grad,
       fc_grad, allreduce(places, scopes, w1_grad),
       fc_grad, allreduce(places, scopes, w2_grad)
}
block3 {
  parent_block: 0
  vars: lr
  ops: sgd(w2, w2_grad),
       sgd(w1, w1_grad)
}
}
```
paddle/fluid/framework/data_device_transform.cc
```diff
@@ -37,7 +37,7 @@ void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
           << " dst_place: " << dst_place;
   auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
   dev_ctx->Wait();
-  Copy(in, dst_place, *dev_ctx, out);
+  TensorCopy(in, dst_place, *dev_ctx, out);
   dev_ctx->Wait();
 }
```
paddle/fluid/framework/data_device_transform_test.cu
```diff
@@ -157,8 +157,8 @@ TEST(Operator, CPUtoGPU) {
   auto dev_ctx = pool.Get(cuda_place);
 
   paddle::framework::Tensor output_tensor;
-  Copy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
-       &output_tensor);
+  TensorCopy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
+             &output_tensor);
 
   dev_ctx->Wait();
   float* output2_ptr = output_tensor.data<float>();
```
paddle/fluid/framework/executor.cc
```diff
@@ -75,8 +75,10 @@ static void CheckTensorNANOrInf(const std::string& name,
       tensor.type().hash_code() != typeid(double).hash_code()) {
     return;
   }
-  PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
-  PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
+  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
+                 "Tensor %s contains Inf", name);
+  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
+                 "Tensor %s contains NAN", name);
 }
 
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
@@ -120,12 +122,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
+    VLOG(3) << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
```
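The second hunk wraps each operator's `Run` in a `platform::RecordEvent`, which relies on RAII: the profiling event starts when the object is constructed and stops when it goes out of scope at the end of the loop iteration. A minimal, framework-free sketch of the same pattern; `ScopedTimer` and the op names are illustrative, not Paddle APIs:

```c++
#include <chrono>
#include <iostream>
#include <string>

// Starts timing on construction, reports on destruction -- the same
// lifetime trick RecordEvent uses around op->Run() in the diff above.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  for (const auto* op_type : {"fc", "softmax"}) {
    ScopedTimer record_event(op_type);  // scoped to one loop iteration
    // ... the operator would run here ...
  }
}
```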
paddle/fluid/framework/lod_tensor.cc
```diff
@@ -46,7 +46,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
   if (!platform::is_cpu_place(t.place())) {
     LoDTensor tt;
-    framework::Copy(t, platform::CPUPlace(), &tt);
+    framework::TensorCopy(t, platform::CPUPlace(), &tt);
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(t.place());
     dev_ctx.Wait();
@@ -255,7 +255,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
     }
   }
   // the 3st field, Tensor
-  SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
+  TensorToStream(os, static_cast<Tensor>(tensor), dev_ctx);
 }
 
 void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
@@ -282,7 +282,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
     }
   }
   // the 3st filed, Tensor
-  DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
+  TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
@@ -308,14 +308,14 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     if (lod().empty()) {
       auto src = Slice(begin, end);
       auto &dst_place = places[i];
-      framework::Copy(src, dst_place, &dst);
+      framework::TensorCopy(src, dst_place, &dst);
     } else {
       auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
 
       auto &offset = lod_and_offset.second;
       auto src = Slice(offset.first, offset.second);
       auto &dst_place = places[i];
-      framework::Copy(src, dst_place, &dst);
+      framework::TensorCopy(src, dst_place, &dst);
 
       LoD my_lod;
       for (auto &l : lod_and_offset.first) {
@@ -369,7 +369,7 @@ void LoDTensor::MergeLoDTensor(
   for (auto *src : lod_tensors) {
     int end = begin + src->dims()[0];
     auto dst = Slice(begin, end);
-    framework::Copy(*src, dst_place, &dst);
+    framework::TensorCopy(*src, dst_place, &dst);
     begin = end;
   }
 }
```
paddle/fluid/framework/lod_tensor.h
```diff
@@ -175,8 +175,8 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   for (size_t ins = 0; ins < num_instances; ins++) {
     for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
       auto slice = tensor.Slice(elem, elem + 1);
-      Copy(source.Slice(ins, ins + 1), platform::CPUPlace(),
-           platform::CPUDeviceContext(), &slice);
+      TensorCopy(source.Slice(ins, ins + 1), platform::CPUPlace(),
+                 platform::CPUDeviceContext(), &slice);
     }
   }
   return tensor;
```
paddle/fluid/framework/mixed_vector.h
```diff
@@ -291,7 +291,7 @@ class Vector {
   void CopyToCPU() const {
     // COPY GPU Data To CPU
-    Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
+    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
     WaitPlace(cuda_vec_.place());
   }
@@ -305,13 +305,14 @@ class Vector {
   void ImmutableCUDA(platform::Place place) const {
     if (IsDirty()) {
       if (IsInCPU()) {
-        Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
+        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+                   &cuda_vec_);
         WaitPlace(place);
         UnsetFlag(kDirty);
         SetFlag(kDataInCUDA);
       } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
         framework::Tensor tmp;
-        Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
         WaitPlace(cuda_vec_.place());
         cuda_vec_.ShareDataWith(tmp);
         // Still dirty
@@ -322,13 +323,14 @@ class Vector {
     } else {
       if (!IsInCUDA()) {
         // Even data is not dirty. However, data is not in CUDA. Copy data.
-        Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
+        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+                   &cuda_vec_);
         WaitPlace(place);
         SetFlag(kDataInCUDA);
       } else if (!(place == cuda_vec_.place())) {
         framework::Tensor tmp;
         WaitPlace(cuda_vec_.place());
-        Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
         WaitPlace(cuda_vec_.place());
         WaitPlace(place);
         cuda_vec_.ShareDataWith(tmp);
```
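`Vector::ImmutableCUDA` above is a host/device write-back cache: flags track which side is stale, and a copy happens only when the requested side is out of date. A stripped-down sketch of that protocol, with a second `std::vector` standing in for device memory; all names here are illustrative, not Paddle APIs:

```c++
#include <vector>

// Two mirrored buffers plus flags; copy only when the GPU view is stale,
// mirroring the IsDirty()/IsInCUDA() checks in ImmutableCUDA() above.
struct MixedVector {
  std::vector<float> cpu;
  std::vector<float> gpu;  // stand-in for the CUDA-side tensor
  bool cpu_dirty = false;  // like kDirty: CPU writes not yet on the GPU
  bool in_gpu = false;     // like kDataInCUDA: a GPU copy exists

  float* MutableCPUData() {
    cpu_dirty = true;  // caller may write; the GPU copy becomes stale
    return cpu.data();
  }

  const std::vector<float>& ImmutableGPU() {
    if (cpu_dirty || !in_gpu) {  // copy only when actually needed
      gpu = cpu;                 // plays the role of TensorCopy(cpu_vec_, ...)
      cpu_dirty = false;
      in_gpu = true;
    }
    return gpu;
  }
};
```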
paddle/fluid/framework/reader.cc
```diff
@@ -105,7 +105,7 @@ void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
         }
       }
       Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
-      Copy(buffer_[i][j], platform::CPUPlace(), &dst);
+      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
       dst_offset += ins_shape[0];
     }
     out_tensor.set_lod(batch_lod);
```
paddle/fluid/framework/selected_rows.cc
```diff
@@ -34,7 +34,7 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
     os.write(reinterpret_cast<const char*>(&height), sizeof(height));
   }
   // the 4st field, Tensor data
-  SerializeToStream(os, selected_rows.value(), dev_ctx);
+  TensorToStream(os, selected_rows.value(), dev_ctx);
 }
 
 void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
@@ -62,7 +62,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
     selected_rows->set_height(height);
   }
   // the 4st field, tensor which contains the data
-  DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx);
+  TensorFromStream(is, selected_rows->mutable_value(), dev_ctx);
 }
 
 }  // namespace framework
```
paddle/fluid/framework/tensor_util.cc
```diff
@@ -16,6 +16,76 @@
 namespace paddle {
 namespace framework {
 
+void TensorCopy(const Tensor& src, const platform::Place& dst_place,
+                const platform::DeviceContext& ctx, Tensor* dst) {
+  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  src.check_memory_size();
+
+  dst->Resize(src.dims());
+  dst->set_layout(src.layout());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+
+  auto size = src.numel() * SizeOfType(src.type());
+
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+void TensorCopy(const Tensor& src, const platform::Place& dst_place,
+                Tensor* dst) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  const platform::DeviceContext* dev_ctx;
+  if (platform::is_gpu_place(src.place())) {
+    dev_ctx = pool.Get(src.place());
+  } else {
+    dev_ctx = pool.Get(dst_place);
+  }
+  TensorCopy(src, dst_place, *dev_ctx, dst);
+}
+
 template <typename Predicate, typename DevCtx>
 struct AnyDTypeVisitor {
   Predicate predicate_;
@@ -69,7 +139,7 @@ struct AnyVisitor : public boost::static_visitor<bool> {
     tmp.mutable_data<bool>(cpu);
     auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
     gpuctx->Wait();
-    Copy(out, cpu, *gpuctx, &tmp);
+    TensorCopy(out, cpu, *gpuctx, &tmp);
     gpuctx->Wait();
     return GetResult(tmp, cpu);
   }
@@ -87,7 +157,7 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
   return platform::VisitPlace(place, visitor);
 }
 
-struct HasNANPredicate {
+struct ContainsNANPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
       -> decltype(std::declval<T>().isnan()) {
@@ -96,12 +166,12 @@ struct HasNANPredicate {
   }
 };
 
-bool HasNAN(const framework::Tensor& tensor) {
-  HasNANPredicate predicate;
+bool TensorContainsNAN(const framework::Tensor& tensor) {
+  ContainsNANPredicate predicate;
   return Any(tensor, predicate);
 }
 
-struct HasInfPredicate {
+struct ContainsInfPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
       -> decltype(std::declval<T>().isinf()) {
@@ -110,10 +180,124 @@ struct HasInfPredicate {
   }
 };
 
-bool HasInf(const framework::Tensor& tensor) {
-  HasInfPredicate predicate;
+bool TensorContainsInf(const framework::Tensor& tensor) {
+  ContainsInfPredicate predicate;
   return Any(tensor, predicate);
 }
 
+void TensorToStream(std::ostream& os, const Tensor& tensor,
+                    const platform::DeviceContext& dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    proto::VarType::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto* pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto* data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::CUDAPlace>(tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char*>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+struct DeserializedDataFunctor {
+  DeserializedDataFunctor(void** buf, Tensor* tensor,
+                          const platform::Place& place)
+      : buf_(buf), tensor_(tensor), place_(place) {}
+
+  template <typename T>
+  void operator()() {
+    *buf_ = tensor_->mutable_data<T>(place_);
+  }
+
+  void** buf_;
+  Tensor* tensor_;
+  platform::Place place_;
+};
+
+void TensorFromStream(std::istream& is, Tensor* tensor,
+                      const platform::DeviceContext& dev_ctx) {
+  uint32_t version;
+  is.read(reinterpret_cast<char*>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  proto::VarType::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char*>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(),
+              std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+    void* buf;
+    auto ctx = platform::CPUDeviceContext();
+    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+      Tensor cpu_tensor;
+      cpu_tensor.Resize(framework::make_ddim(dims));
+      framework::VisitDataType(
+          desc.data_type(),
+          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
+      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+      auto dst_place = dev_ctx.GetPlace();
+      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      framework::VisitDataType(
+          desc.data_type(),
+          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
+      is.read(static_cast<char*>(buf), tensor->memory_size());
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
```
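`TensorToStream`/`TensorFromStream` above define a simple three-field framing: a `uint32_t` version, an `int32_t` length followed by the serialized `TensorDesc` protobuf, then the raw tensor bytes. A self-contained sketch of the same framing, with a plain struct standing in for the protobuf descriptor; all names here are illustrative, not Paddle APIs:

```c++
#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

// Hypothetical stand-in for proto::VarType::TensorDesc.
struct Desc {
  int32_t dtype;
  int64_t dims[2];
};

int main() {
  std::ostringstream os;
  // 1st field: uint32_t version.
  uint32_t version = 0;
  os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  // 2nd field: int32_t descriptor size, then the descriptor bytes.
  Desc desc{/*dtype=*/5, /*dims=*/{2, 3}};
  int32_t size = sizeof(desc);
  os.write(reinterpret_cast<const char*>(&size), sizeof(size));
  os.write(reinterpret_cast<const char*>(&desc), size);
  // 3rd field: the raw tensor data.
  std::vector<float> data(6, 1.5f);
  os.write(reinterpret_cast<const char*>(data.data()),
           data.size() * sizeof(float));

  // Reading reverses the same order, exactly as TensorFromStream does.
  std::istringstream is(os.str());
  uint32_t v;
  is.read(reinterpret_cast<char*>(&v), sizeof(v));
  int32_t n;
  is.read(reinterpret_cast<char*>(&n), sizeof(n));
  Desc d;
  is.read(reinterpret_cast<char*>(&d), n);
  std::vector<float> back(d.dims[0] * d.dims[1]);
  is.read(reinterpret_cast<char*>(back.data()), back.size() * sizeof(float));
  std::cout << "version " << v << ", " << d.dims[0] << "x" << d.dims[1]
            << ", first value " << back[0] << "\n";
}
```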
paddle/fluid/framework/tensor_util.cu (deleted, mode 100644 → 0)
```c++
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/tensor_util.h"

namespace paddle {
namespace framework {

template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
  Predicate predicate_;
  const Tensor& tensor_;
  const DevCtx& ctx_;
  Tensor* out_;

  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
                  Tensor* out)
      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}

  template <typename T>
  void operator()() const {
    auto t = EigenVector<T>::Flatten(tensor_);
    auto o = EigenScalar<bool>::From(*out_);
    // return any of predicate_(t) is true.
    o.device(*ctx_.eigen_device()) = predicate_(t).any();
  }
};

template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
                    const DevCtx& ctx, framework::Tensor* out) {
  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
                                               predicate, tensor, ctx, out));
}

template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> {
  const framework::Tensor& tensor_;
  Predicate predicate_;

  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
      : tensor_(tensor), predicate_(std::move(predicate)) {}

  template <typename Place>
  bool operator()(const Place& place) const {
    framework::Tensor out;
    out.Resize({1});
    out.mutable_data<bool>(place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
    AnyImpl(predicate_, tensor_, *ctx, &out);
    return this->GetResult(out, place);
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::CUDAPlace& gpu) const {
    platform::CPUPlace cpu;
    framework::Tensor tmp;
    tmp.Resize({1});
    tmp.mutable_data<bool>(cpu);
    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
    gpuctx->Wait();
    Copy(out, cpu, *gpuctx, &tmp);
    gpuctx->Wait();
    return GetResult(tmp, cpu);
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::CPUPlace& cpu) const {
    return *out.data<bool>();
  }
};

template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
  AnyVisitor<Predicate> visitor(tensor, predicate);
  auto place = tensor.place();
  return platform::VisitPlace(place, visitor);
}

struct HasNANPredicate {
  template <typename T>
  auto operator()(const T& eigen_vec) const
      -> decltype(std::declval<T>().isnan()) {
    // Cast eigen_vector to vector of bool. true if is inf.
    return eigen_vec.isnan();
  }
};

bool HasNAN(const framework::Tensor& tensor) {
  HasNANPredicate predicate;
  return Any(tensor, predicate);
}

struct HasInfPredicate {
  template <typename T>
  auto operator()(const T& eigen_vec) const
      -> decltype(std::declval<T>().isinf()) {
    // Cast eigen_vector to vector of bool. true if is inf.
    return eigen_vec.isinf();
  }
};

bool HasInf(const framework::Tensor& tensor) {
  HasInfPredicate predicate;
  return Any(tensor, predicate);
}

}  // namespace framework
}  // namespace paddle
```
paddle/fluid/framework/tensor_util.cu (re-added as a symlink, mode 0 → 120000)
```diff
+tensor_util.cc
\ No newline at end of file
```
paddle/fluid/framework/tensor_util.h
```diff
@@ -22,106 +22,38 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-/**
- * @brief   Copy the content of external tensor to a new place.
- *
- * @param[in] src        The external tensor.
- * @param[in] dst_place  The dst place.
- * @param[in] ctx        The device context contains device resources.
- *
- * @note    Copy supports CPU <-> GPU, GPU <-> GPU.
- */
-inline void Copy(const Tensor& src, const platform::Place& dst_place,
-                 const platform::DeviceContext& ctx, Tensor* dst) {
-  VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to "
-          << dst_place;
-  src.check_memory_size();
-
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-  auto src_place = src.place();
-  auto src_ptr = src.data<void>();
-
-  auto dst_ptr = dst->mutable_data(dst_place, src.type());
-
-  auto size = src.numel() * SizeOfType(src.type());
-
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
+void TensorCopy(const Tensor& src, const platform::Place& dst_place,
+                const platform::DeviceContext& ctx, Tensor* dst);
+void TensorCopy(const Tensor& src, const platform::Place& dst_place,
+                Tensor* dst);
 
-/**
- * @brief   Wrapper on
- *     Copy(const Tensor& src, const platform::Place& dst_place,
- *          const platform::DeviceContext& ctx, Tensor* dst);
- *
- * @param[in] src        The external tensor.
- * @param[in] dst_place  The dst place.
- *
- * @note    Copy supports CPU <-> GPU, GPU <-> GPU.
- */
-inline void Copy(const Tensor& src, const platform::Place& dst_place,
-                 Tensor* dst) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(src.place())) {
-    dev_ctx = pool.Get(src.place());
-  } else {
-    dev_ctx = pool.Get(dst_place);
-  }
-  Copy(src, dst_place, *dev_ctx, dst);
-}
+template <typename T>
+void TensorFromVector(const std::vector<T>& src,
+                      const platform::DeviceContext& ctx, Tensor* dst);
+template <typename T>
+void TensorFromVector(const std::vector<T>& src, Tensor* dst);
+
+template <typename T>
+void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                    std::vector<T>* dst);
+template <typename T>
+void TesnorToVector(const Tensor& src, std::vector<T>* dst);
+
+bool TensorContainsNAN(const framework::Tensor& tensor);
+bool TensorContainsInf(const framework::Tensor& tensor);
+
+void TensorToStream(std::ostream& os, const Tensor& tensor,
+                    const platform::DeviceContext& dev_ctx);
+void TensorFromStream(std::istream& is, Tensor* tensor,
+                      const platform::DeviceContext& dev_ctx);
 
 //
 // The implementation of template functions.
 //
 
-/**
- * @brief   Copy the content of an external vector to a tensor.
- *
- * @param[in] src        The external tensor.
- * @param[in] ctx        The device context contains device resources.
- *
- * * @note    CopyFromVector will resize dst to an 1D tensor with the same
- *            size as src.
- */
 template <typename T>
-inline void CopyFromVector(const std::vector<T>& src,
-                           const platform::DeviceContext& ctx, Tensor* dst) {
+void TensorFromVector(const std::vector<T>& src,
+                      const platform::DeviceContext& ctx, Tensor* dst) {
   auto dst_place = ctx.GetPlace();
   auto src_ptr = static_cast<const void*>(src.data());
   platform::CPUPlace src_place;
@@ -143,11 +75,8 @@ inline void CopyFromVector(const std::vector<T>& src,
 #endif
 }
 
-/**
- * @brief CopyFromVector CPU vector -> CPU Tensor
- */
 template <typename T>
-inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
+void TensorFromVector(const std::vector<T>& src, Tensor* dst) {
   platform::CPUPlace dst_place = platform::CPUPlace();
   auto src_ptr = static_cast<const void*>(src.data());
   platform::CPUPlace src_place;
@@ -158,18 +87,9 @@ inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
   memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
 }
 
-/**
- * @brief   Copy the content of a tensor to a vector
- *
- * @param[in] src        The external tensor.
- * @param[in] ctx        The device context contains device resources.
- *
- * * @note    CopyFromVector assumes that the tensor has been resized
- *            before invoking.
- */
 template <typename T>
-inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
-                         std::vector<T>* dst) {
+void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                    std::vector<T>* dst) {
   auto src_ptr = static_cast<const void*>(src.data<T>());
   auto size = src.numel() * sizeof(T);
@@ -191,11 +111,8 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
 #endif
 }
 
-/**
- * @brief CopyToVector CPUTensor <-> CPU Vector
- */
 template <typename T>
-inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
+void TensorToVector(const Tensor& src, std::vector<T>* dst) {
   auto src_ptr = static_cast<const void*>(src.data<T>());
   auto size = src.numel() * sizeof(T);
@@ -209,125 +126,5 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
                src_ptr, size);
 }
 
-// Returns true if a tensor contains NAN, i.e., Not A Number.
-bool HasNAN(const framework::Tensor& tensor);
-
-// Returns true if a tensor contains Inf, i.e., Infinity.
-bool HasInf(const framework::Tensor& tensor);
-
-inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
-                              const platform::DeviceContext& dev_ctx) {
-  // TODO(typhoonzero): serialize to ostream
-  {  // the 1st field, uint32_t version
-    constexpr uint32_t version = 0;
-    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
-  }
-  {  // the 2nd field, tensor description
-     // int32_t  size
-     // void*    protobuf message
-    proto::VarType::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
-    auto dims = framework::vectorize(tensor.dims());
-    auto* pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
-    auto* data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto& gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void*>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char*>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-}
-
-struct DeserializedDataFunctor {
-  DeserializedDataFunctor(void** buf, Tensor* tensor,
-                          const platform::Place& place)
-      : buf_(buf), tensor_(tensor), place_(place) {}
-
-  template <typename T>
-  void operator()() {
-    *buf_ = tensor_->mutable_data<T>(place_);
-  }
-
-  void** buf_;
-  Tensor* tensor_;
-  platform::Place place_;
-};
-
-inline void DeserializeFromStream(std::istream& is, Tensor* tensor,
-                                  const platform::DeviceContext& dev_ctx) {
-  uint32_t version;
-  is.read(reinterpret_cast<char*>(&version), sizeof(version));
-  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::VarType::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char*>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char*>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(),
-              std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-    void* buf;
-    auto ctx = platform::CPUDeviceContext();
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      Tensor cpu_tensor;
-      cpu_tensor.Resize(framework::make_ddim(dims));
-      framework::VisitDataType(
-          desc.data_type(),
-          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
-      auto dst_place = dev_ctx.GetPlace();
-      framework::Copy(cpu_tensor, dst_place, dev_ctx, tensor);
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      framework::VisitDataType(
-          desc.data_type(),
-          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), tensor->memory_size());
-    }
-  }
-}
-
 }  // namespace framework
 }  // namespace paddle
```
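From a caller's point of view, this header change is a rename plus a move of the implementations into tensor_util.cc: `Copy` becomes `TensorCopy`, `CopyFromVector`/`CopyToVector` become `TensorFromVector`/`TensorToVector`, `HasNAN`/`HasInf` become `TensorContainsNAN`/`TensorContainsInf`, and `SerializeToStream`/`DeserializeFromStream` become `TensorToStream`/`TensorFromStream`. A small CPU-only usage sketch of the renamed API, based only on the declarations above and not tested against this exact revision:

```c++
#include <vector>

#include "paddle/fluid/framework/tensor_util.h"

void Demo() {
  using paddle::framework::Tensor;
  namespace fw = paddle::framework;

  // Build a tensor from a host vector (was CopyFromVector).
  std::vector<float> src_vec = {1.f, 2.f, 3.f, 4.f};
  Tensor src;
  fw::TensorFromVector<float>(src_vec, &src);

  // Copy it to another tensor on the CPU (was Copy).
  Tensor dst;
  fw::TensorCopy(src, paddle::platform::CPUPlace(), &dst);

  // Validity checks (were HasNAN / HasInf).
  bool bad = fw::TensorContainsNAN(dst) || fw::TensorContainsInf(dst);
  (void)bad;

  // Back to a host vector (was CopyToVector).
  std::vector<float> out_vec;
  fw::TensorToVector<float>(dst, &out_vec);
}
```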
paddle/fluid/framework/tensor_util_test.cc
```diff
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace framework {
 
-TEST(Copy, Tensor) {
+TEST(TensorCopy, Tensor) {
   Tensor src_tensor;
   Tensor dst_tensor;
   platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
@@ -33,7 +33,7 @@ TEST(Copy, Tensor) {
   src_tensor.set_layout(DataLayout::kAnyLayout);
 
   auto cpu_place = new platform::CPUPlace();
-  Copy(src_tensor, *cpu_place, &dst_tensor);
+  TensorCopy(src_tensor, *cpu_place, &dst_tensor);
 
   const int* dst_ptr = dst_tensor.data<int>();
   ASSERT_NE(src_ptr, dst_ptr);
@@ -44,7 +44,7 @@ TEST(Copy, Tensor) {
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
 
   Tensor slice_tensor = src_tensor.Slice(1, 2);
-  Copy(slice_tensor, *cpu_place, &dst_tensor);
+  TensorCopy(slice_tensor, *cpu_place, &dst_tensor);
   const int* slice_ptr = slice_tensor.data<int>();
   dst_ptr = dst_tensor.data<int>();
   ASSERT_NE(dst_ptr, slice_ptr);
@@ -68,11 +68,11 @@ TEST(Copy, Tensor) {
     // CPU Tensor to GPU Tensor
     auto gpu_place = new platform::CUDAPlace(0);
     platform::CUDADeviceContext gpu_ctx(*gpu_place);
-    Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
 
     // GPU Tensor to CPU Tensor
     auto cpu_place = new platform::CPUPlace();
-    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -85,10 +85,10 @@ TEST(Copy, Tensor) {
     Tensor slice_tensor = src_tensor.Slice(1, 2);
 
     // CPU Slice Tensor to GPU Tensor
-    Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+    TensorCopy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
 
     // GPU Tensor to CPU Tensor
-    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Slice Tensors
     gpu_ctx.Wait();
@@ -104,7 +104,7 @@ TEST(Copy, Tensor) {
 #endif
 }
 
-TEST(CopyFromVector, Tensor) {
+TEST(TensorFromVector, Tensor) {
   using namespace paddle::framework;
   using namespace paddle::platform;
   {
@@ -114,7 +114,7 @@ TEST(CopyFromVector, Tensor) {
     // Copy to CPU Tensor
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    CopyFromVector<int>(src_vec, &cpu_tensor);
+    TensorFromVector<int>(src_vec, &cpu_tensor);
 
     // Compare Tensors
     const int* cpu_ptr = cpu_tensor.data<int>();
@@ -126,7 +126,7 @@ TEST(CopyFromVector, Tensor) {
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
     cpu_tensor.Resize(make_ddim({2, 2}));
-    CopyFromVector<int>(src_vec, &cpu_tensor);
+    TensorFromVector<int>(src_vec, &cpu_tensor);
     cpu_ptr = cpu_tensor.data<int>();
     src_ptr = src_vec.data();
     ASSERT_NE(src_ptr, cpu_ptr);
@@ -148,15 +148,15 @@ TEST(CopyFromVector, Tensor) {
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
     CPUDeviceContext cpu_ctx(*cpu_place);
-    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
 
     // Copy to GPUTensor
     gpu_tensor.Resize(make_ddim({3, 3}));
     auto gpu_place = new paddle::platform::CUDAPlace();
     CUDADeviceContext gpu_ctx(*gpu_place);
-    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
     // Copy from GPU to CPU tensor for comparison
-    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -173,10 +173,10 @@ TEST(CopyFromVector, Tensor) {
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
 
     cpu_tensor.Resize(make_ddim({2, 2}));
-    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
     gpu_tensor.Resize(make_ddim({2, 2}));
-    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
-    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -196,7 +196,7 @@ TEST(CopyFromVector, Tensor) {
 #endif
 }
 
-TEST(CopyToVector, Tensor) {
+TEST(TensorToVector, Tensor) {
   using namespace paddle::framework;
   using namespace paddle::platform;
   {
@@ -208,7 +208,7 @@ TEST(CopyToVector, Tensor) {
     CPUPlace place;
     std::vector<int> dst;
-    CopyToVector<int>(src, &dst);
+    TensorToVector<int>(src, &dst);
 
     for (int i = 0; i < 3 * 3; ++i) {
       EXPECT_EQ(src_ptr[i], dst[i]);
@@ -220,10 +220,10 @@ TEST(CopyToVector, Tensor) {
     Tensor gpu_tensor;
     CUDAPlace place;
     CUDADeviceContext gpu_ctx(place);
-    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
 
     std::vector<int> dst;
-    CopyToVector<int>(gpu_tensor, gpu_ctx, &dst);
+    TensorToVector<int>(gpu_tensor, gpu_ctx, &dst);
 
     for (int i = 0; i < 3 * 3; ++i) {
       EXPECT_EQ(src_vec[i], dst[i]);
@@ -232,7 +232,7 @@ TEST(CopyToVector, Tensor) {
 #endif
 }
 
-TEST(HasNAN, CPU) {
+TEST(TensorContainsNAN, CPU) {
   using namespace paddle::framework;
   using namespace paddle::platform;
   Tensor src;
@@ -240,11 +240,12 @@ TEST(HasNAN, CPU) {
   buf[0] = 0.0;
   buf[1] = NAN;
   buf[2] = 0.0;
-  ASSERT_TRUE(HasNAN(src));
+  ASSERT_TRUE(TensorContainsNAN(src));
+  buf[1] = 0.0;
+  ASSERT_FALSE(TensorContainsNAN(src));
 }
 
-TEST(HasInf, CPU) {
+TEST(TensorContainsInf, CPU) {
   using namespace paddle::framework;
   using namespace paddle::platform;
   Tensor src;
@@ -252,10 +253,12 @@ TEST(HasInf, CPU) {
   buf[0] = 1.0;
   buf[1] = INFINITY;
   buf[2] = 0.0;
-  ASSERT_TRUE(HasInf(src));
+  ASSERT_TRUE(TensorContainsInf(src));
+  buf[1] = 1.0;
+  ASSERT_FALSE(TensorContainsInf(src));
 }
 
-TEST(Tensor, SerializeAndDeserialize) {
+TEST(Tensor, FromAndToStream) {
   framework::Tensor src_tensor;
   int array[6] = {1, 2, 3, 4, 5, 6};
   src_tensor.Resize({2, 3});
@@ -268,10 +271,10 @@ TEST(Tensor, SerializeAndDeserialize) {
     auto place = new platform::CPUPlace();
     platform::CPUDeviceContext cpu_ctx(*place);
     std::ostringstream oss;
-    SerializeToStream(oss, src_tensor, cpu_ctx);
+    TensorToStream(oss, src_tensor, cpu_ctx);
 
     std::istringstream iss(oss.str());
-    DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
+    TensorFromStream(iss, &dst_tensor, cpu_ctx);
     int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
     for (int i = 0; i < 5; ++i) {
       ASSERT_EQ(dst_ptr[i], array[i]);
@@ -288,13 +291,13 @@ TEST(Tensor, SerializeAndDeserialize) {
     auto gpu_place = new platform::CUDAPlace();
     platform::CUDADeviceContext gpu_ctx(*gpu_place);
 
-    Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+    TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
 
     std::ostringstream oss;
-    SerializeToStream(oss, gpu_tensor, gpu_ctx);
+    TensorToStream(oss, gpu_tensor, gpu_ctx);
 
     std::istringstream iss(oss.str());
-    DeserializeFromStream(iss, &dst_tensor, gpu_ctx);
+    TensorFromStream(iss, &dst_tensor, gpu_ctx);
 
     int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
     for (int i = 0; i < 6; ++i) {
```
paddle/fluid/framework/tensor_util_test.cu
```diff
@@ -31,7 +31,7 @@ static __global__ void FillInf(float* buf) {
   buf[2] = 0.5;
 }
 
-TEST(HasNAN, GPU) {
+TEST(TensorContainsNAN, GPU) {
   Tensor tensor;
   platform::CUDAPlace gpu(0);
   auto& pool = platform::DeviceContextPool::Instance();
@@ -39,10 +39,10 @@ TEST(HasNAN, GPU) {
   float* buf = tensor.mutable_data<float>({3}, gpu);
   FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
   cuda_ctx->Wait();
-  ASSERT_TRUE(HasNAN(tensor));
+  ASSERT_TRUE(TensorContainsNAN(tensor));
 }
 
-TEST(HasInf, GPU) {
+TEST(TensorContainsInf, GPU) {
   Tensor tensor;
   platform::CUDAPlace gpu(0);
   auto& pool = platform::DeviceContextPool::Instance();
@@ -50,7 +50,7 @@ TEST(HasInf, GPU) {
   float* buf = tensor.mutable_data<float>({3}, gpu);
   FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
   cuda_ctx->Wait();
-  ASSERT_TRUE(HasInf(tensor));
+  ASSERT_TRUE(TensorContainsInf(tensor));
 }
 
 }  // namespace framework
```
paddle/fluid/framework/threadpool.h
```diff
@@ -64,7 +64,6 @@ class ThreadPool {
     Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
       try {
         fn();
-        return nullptr;
       } catch (platform::EnforceNotMet ex) {
         return std::unique_ptr<platform::EnforceNotMet>(
             new platform::EnforceNotMet(ex));
@@ -73,6 +72,7 @@ class ThreadPool {
             << "Unexpected exception is catched in thread pool. All "
                "throwable exception in Fluid should be an EnforceNotMet.";
       }
+      return nullptr;
     });
     std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
     tasks_.push(std::move(task));
```
paddle/fluid/operators/CMakeLists.txt
```diff
@@ -176,6 +176,20 @@ op_library(pool_op SRCS pool_op.cc DEPS pooling)
   op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col)
 endif()
 
+cc_library(batch_size_like SRCS batch_size_like.cc DEPS op_registry)
+
+op_library(fill_constant_batch_size_like_op
+  SRCS fill_constant_batch_size_like_op.cc fill_constant_batch_size_like_op.cu.cc
+  DEPS batch_size_like)
+
+op_library(uniform_random_batch_size_like_op
+  SRCS uniform_random_batch_size_like_op.cc
+  DEPS batch_size_like uniform_random_op)
+
+op_library(gaussian_random_batch_size_like_op
+  SRCS gaussian_random_batch_size_like_op.cc
+  DEPS batch_size_like gaussian_random_op)
+
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
```
paddle/fluid/operators/array_operator.h
```diff
@@ -42,7 +42,7 @@ class ArrayOp : public framework::OperatorBase {
     if (platform::is_gpu_place(i_tensor.place())) {
       // FIXME: Avoid copy from GPU to CPU
       framework::Tensor t;
-      framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
+      framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
       dev_ctx.Wait();
       offset = static_cast<size_t>(*t.data<int64_t>());
     } else {
```
paddle/fluid/operators/array_to_lod_tensor_op.cc
```diff
@@ -112,8 +112,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
             platform::DeviceContextPool::Instance();
         auto &dev_ctx = *pool.Get(place);
 
-        framework::Copy(x[x_idx].Slice(start_offset, end_offset), place,
-                        dev_ctx, &slice);
+        framework::TensorCopy(x[x_idx].Slice(start_offset, end_offset), place,
+                              dev_ctx, &slice);
         out_offset += len;
       }
     }
```
paddle/fluid/operators/assign_op.cc
```diff
@@ -45,7 +45,7 @@ class AssignFunctor {
     out_rows.set_height(rows.height());
     auto &t = rows.value();
     auto *m = out_rows.mutable_value();
-    framework::Copy(t, t.place(), dev_ctx_, m);
+    framework::TensorCopy(t, t.place(), dev_ctx_, m);
   }
 
   template <typename T>
@@ -57,7 +57,7 @@ class AssignFunctor {
   void copy_tensor(const framework::LoDTensor &lod_tensor,
                    framework::LoDTensor *out) const {
     auto &out_tensor = *out;
-    Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
+    TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
   }
```
paddle/fluid/operators/assign_value_op.h
...
@@ -41,7 +41,7 @@ class AssignValueKernel : public framework::OpKernel<T> {
      break;
    }
    auto values = ctx.Attr<std::vector<T>>(value_name);
-   framework::CopyFromVector(values, ctx.device_context(), out);
+   framework::TensorFromVector(values, ctx.device_context(), out);
    out->Resize(framework::make_ddim(shape));
  }
};
...
paddle/fluid/operators/batch_size_like.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/batch_size_like.h"

namespace paddle {
namespace operators {

void BatchSizeLikeOp::InferShape(framework::InferShapeContext *ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("Input"),
                 "Input(Input) of %s should not be null.", Type());
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "Output(Out) of %s should not be null.", Type());

  auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
  PADDLE_ENFORCE_GT(shape.size(), 0);
  std::vector<int64_t> shape_int64(shape.size(), 0);
  std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                 [](int a) { return static_cast<int64_t>(a); });
  auto output_dim = framework::make_ddim(shape_int64);

  int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
  PADDLE_ENFORCE_GE(input_dim_idx, 0);
  PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);

  int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
  PADDLE_ENFORCE_GE(output_dim_idx, 0);
  PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);

  output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
  ctx->SetOutputDim("Out", output_dim);
}

BatchSizeLikeOpMaker::BatchSizeLikeOpMaker(OpProto *proto,
                                           OpAttrChecker *op_checker)
    : framework::OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("Input",
           "(Tensor) Tensor "
           "whose input_dim_idx'th dimension specifies the batch_size");
  AddOutput("Out",
            "(Tensor) Tensor of specified shape will be filled "
            "with the specified value");
  AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
  AddAttr<int>("input_dim_idx",
               "(int, default 0) The index of input's batch size dimension")
      .SetDefault(0);
  AddAttr<int>("output_dim_idx",
               "(int, default 0) The index of output's batch size dimension")
      .SetDefault(0);
}

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/batch_size_like.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {

class BatchSizeLikeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override;
};

class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker);
};

}  // namespace operators
}  // namespace paddle
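This new base class factors out shape inference shared by the *_batch_size_like ops: copy the `shape` attribute, then overwrite position `output_dim_idx` with the input's dimension at `input_dim_idx`. A worked example (values invented for illustration): with Input of shape [8, 100], shape = {0, 10}, input_dim_idx = 0 and output_dim_idx = 0, the placeholder 0 is replaced by Input's batch dimension, so Out gets shape [8, 10]. A minimal sketch of the same rule in plain C++:

#include <cstdio>
#include <vector>

// Mirror of BatchSizeLikeOp::InferShape: copy `shape`, then overwrite
// shape[output_dim_idx] with input_dims[input_dim_idx].
std::vector<long long> InferBatchSizeLikeDim(
    const std::vector<long long>& input_dims, std::vector<long long> shape,
    int input_dim_idx, int output_dim_idx) {
  shape[output_dim_idx] = input_dims[input_dim_idx];
  return shape;
}

int main() {
  auto out = InferBatchSizeLikeDim({8, 100}, {0, 10}, 0, 0);
  std::printf("Out dims: [%lld, %lld]\n", out[0], out[1]);  // [8, 10]
  return 0;
}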
paddle/fluid/operators/beam_search_decode_op.h
...
@@ -232,12 +232,12 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
  id_tensor->set_lod(lod);
  id_tensor->Resize({static_cast<int64_t>(id_data.size())});
  id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
-  framework::CopyFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
+  framework::TensorFromVector<int64_t>(id_data, cpu_ctx, id_tensor);

  score_tensor->set_lod(lod);
  score_tensor->Resize({static_cast<int64_t>(score_data.size())});
  score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
-  framework::CopyFromVector<T>(score_data, cpu_ctx, score_tensor);
+  framework::TensorFromVector<T>(score_data, cpu_ctx, score_tensor);
}

template <typename T>
...
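Before TensorFromVector (renamed here from CopyFromVector) materializes the tensor, the surrounding function has already flattened a vector of sentences into one flat id buffer plus LoD offsets. A self-contained sketch of that flattening step, with all names invented for illustration:

#include <cstdio>
#include <vector>

// Flatten nested sentences into one data vector plus LoD-style offsets:
// lod[i]..lod[i+1] delimits sentence i inside the flat buffer.
void FlattenWithLoD(const std::vector<std::vector<int64_t>>& sentences,
                    std::vector<int64_t>* data, std::vector<size_t>* lod) {
  lod->push_back(0);
  for (const auto& s : sentences) {
    data->insert(data->end(), s.begin(), s.end());
    lod->push_back(data->size());
  }
}

int main() {
  std::vector<int64_t> data;
  std::vector<size_t> lod;
  FlattenWithLoD({{7, 2}, {5}, {9, 9, 1}}, &data, &lod);
  // data = [7 2 5 9 9 1], lod = [0 2 3 6]
  std::printf("%zu ids, %zu lod offsets\n", data.size(), lod.size());
  return 0;
}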
paddle/fluid/operators/compare_op.h
...
@@ -67,7 +67,6 @@ class CompareOpKernel
    auto* x = context.Input<Tensor>("X");
    auto* y = context.Input<Tensor>("Y");
    auto* z = context.Output<Tensor>("Out");
-   z->mutable_data<T>(context.GetPlace());
    int axis = context.Attr<int>("axis");
    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
                                                          Functor(), z);
...
paddle/fluid/operators/detection_output_op.h
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/fluid/framework/op_registry.h"
...
@@ -98,16 +98,16 @@ class DetectionOutputKernel : public framework::OpKernel<T> {
    T* conf_data = conf_tensor.data<T>();
    if (platform::is_gpu_place(context.GetPlace())) {
      loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
-     framework::Copy(loc_tensor, platform::CPUPlace(),
-                     context.device_context(), &loc_cpu);
+     framework::TensorCopy(loc_tensor, platform::CPUPlace(),
+                           context.device_context(), &loc_cpu);
      loc_data = loc_cpu.data<T>();
      conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
-     framework::Copy(conf_tensor, platform::CPUPlace(),
-                     context.device_context(), &conf_cpu);
+     framework::TensorCopy(conf_tensor, platform::CPUPlace(),
+                           context.device_context(), &conf_cpu);
      conf_data = conf_cpu.data<T>();
      priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
-     framework::Copy(*in_priorbox, platform::CPUPlace(),
-                     context.device_context(), &priorbox_cpu);
+     framework::TensorCopy(*in_priorbox, platform::CPUPlace(),
+                           context.device_context(), &priorbox_cpu);
      priorbox_data = priorbox_cpu.data<T>();
    }
    // get decode bboxes
...
@@ -158,8 +158,8 @@ class DetectionOutputKernel : public framework::OpKernel<T> {
                          batch_size, all_indices, all_decoded_bboxes,
                          out_data);
    if (platform::is_gpu_place(context.GetPlace())) {
-     framework::Copy(out_cpu, platform::CUDAPlace(),
-                     context.device_context(), out);
+     framework::TensorCopy(out_cpu, platform::CUDAPlace(),
+                           context.device_context(), out);
    }
  }
};
...
paddle/fluid/operators/expand_op.h
...
@@ -126,7 +126,8 @@ class ExpandGradKernel : public framework::OpKernel<T> {
      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
      out0->mutable_data<T>(context.GetPlace());
-     framework::Copy(*in0, context.GetPlace(), context.device_context(), out0);
+     framework::TensorCopy(*in0, context.GetPlace(),
+                           context.device_context(), out0);
    } else {
      switch (dims) {
        REP_EXPAND_GRAD_TEMPLATE(72)
...
paddle/fluid/operators/feed_op.cc
...
@@ -57,7 +57,7 @@ class FeedOp : public framework::OperatorBase {
    if (platform::is_same_place(feed_item.place(), place)) {
      out_item->ShareDataWith(feed_item);
    } else {
-     framework::Copy(feed_item, place, dev_ctx, out_item);
+     framework::TensorCopy(feed_item, place, dev_ctx, out_item);
    }
    out_item->set_lod(feed_item.lod());
  }
...
paddle/fluid/operators/fetch_op.cc
...
@@ -56,7 +56,7 @@ class FetchOp : public framework::OperatorBase {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(src_item.place());

-   Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
+   TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
    dev_ctx.Wait();
    dst_item.set_lod(src_item.lod());
...
paddle/fluid/operators/fill_constant_batch_size_like_op.cc
...
@@ -13,42 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
+#include "paddle/fluid/operators/batch_size_like.h"

namespace paddle {
namespace operators {

-class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(
-        ctx->HasInput("Input"),
-        "Input(Input) of FillConstantBatchSizeLikeOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("Out"),
-        "Output(Out) of FillConstantBatchSizeLikeOp should not be null.");
-
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE_GT(shape.size(), 0);
-    std::vector<int64_t> shape_int64(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    auto output_dim = framework::make_ddim(shape_int64);
-
-    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
-    PADDLE_ENFORCE_GE(input_dim_idx, 0);
-    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
-
-    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
-    PADDLE_ENFORCE_GE(output_dim_idx, 0);
-    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
-
-    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
-    ctx->SetOutputDim("Out", output_dim);
-  }
+class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
+ protected:
+  using BatchSizeLikeOp::BatchSizeLikeOp;
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
...
@@ -57,28 +29,14 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
  }
};

-class FillConstantBatchSizeLikeOpMaker
-    : public framework::OpProtoAndCheckerMaker {
+class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
 public:
  FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+      : BatchSizeLikeOpMaker(proto, op_checker) {
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
        .SetDefault(framework::proto::DataType::FP32);
-    AddInput("Input",
-             "(Tensor) Tensor "
-             "whose dim_idx th dimension is used to specify the batch_size");
-    AddOutput("Out",
-              "(Tensor) Tensor of specified shape will be filled "
-              "with the specified value");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) The shape of the output");
-    AddAttr<int>("input_dim_idx",
-                 "(int, default 0) The index of input's batch size dimension")
-        .SetDefault(0);
-    AddAttr<int>("output_dim_idx",
-                 "(int, default 0) The index of output's batch size dimension")
-        .SetDefault(0);
    AddAttr<float>("value", "(float, default 0) The value to be filled")
        .SetDefault(0.0f);
    AddComment(R"DOC(
...
paddle/fluid/operators/fill_op.cc
...
@@ -74,7 +74,7 @@ class FillOp : public framework::OperatorBase {
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto &dev_ctx = *pool.Get(place);
-     framework::Copy(tensor, place, dev_ctx, &out);
+     framework::TensorCopy(tensor, place, dev_ctx, &out);
    }
  }
};
...
paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/batch_size_like.h"

namespace paddle {
namespace operators {

class GaussianRandomBatchSizeLikeOp : public BatchSizeLikeOp {
 protected:
  using BatchSizeLikeOp::BatchSizeLikeOp;

  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.GetPlace());
  }
};

class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
 public:
  GaussianRandomBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : BatchSizeLikeOpMaker(proto, op_checker) {
    AddAttr<float>("mean",
                   "(float, default 0.0) "
                   "mean of random tensor.")
        .SetDefault(.0f);
    AddAttr<float>("std",
                   "(float, default 1.0) "
                   "std of random tensor.")
        .SetDefault(1.0f);
    AddAttr<int>("seed",
                 "(int, default 0) "
                 "Random seed of generator."
                 "0 means use system wide seed."
                 "Note that if seed is not 0, this operator will always "
                 "generate the same random numbers every time.")
        .SetDefault(0);
    AddAttr<int>("dtype",
                 "(int, default 5(FP32)) "
                 "Output data type.")
        .SetDefault(framework::proto::DataType::FP32);
    AddComment(R"DOC(
GaussianRandom Operator.

Used to initialize tensors with gaussian random generator.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_WITHOUT_GRADIENT(
    gaussian_random_batch_size_like,
    paddle::operators::GaussianRandomBatchSizeLikeOp,
    paddle::operators::GaussianRandomBatchSizeLikeOpMaker);
// Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu
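The seed semantics documented in the maker (seed == 0 draws from a seed supplied elsewhere; any fixed non-zero seed reproduces the same numbers on every run) can be illustrated with the standard library alone. A hedged sketch of the contract, not the Paddle kernel itself (gaussian_fill is an invented name):

#include <iostream>
#include <random>
#include <vector>

// Fill `out` with N(mean, std) samples; seed == 0 falls back to a
// nondeterministic seed, mirroring the operator's attribute contract.
void gaussian_fill(std::vector<float>* out, float mean, float std_dev,
                   unsigned int seed) {
  std::mt19937 engine(seed != 0 ? seed : std::random_device{}());
  std::normal_distribution<float> dist(mean, std_dev);
  for (auto& v : *out) v = dist(engine);
}

int main() {
  std::vector<float> a(3), b(3);
  gaussian_fill(&a, 0.f, 1.f, 42);
  gaussian_fill(&b, 0.f, 1.f, 42);
  std::cout << (a == b ? "same sequence for fixed seed" : "differs") << "\n";
  return 0;
}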
paddle/fluid/operators/gaussian_random_op.cc
...
@@ -88,7 +88,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("seed",
                 "(int, default 0) "
                 "Random seed of generator."
-                "0 means use system wide seed.")
+                "0 means use system wide seed."
+                "Note that if seed is not 0, this operator will always "
+                "generate the same random numbers every time.")
        .SetDefault(0);
    AddAttr<int>("dtype",
                 "(int, default 5(FP32)) "
...
@@ -110,4 +112,8 @@ Used to initialize tensors with gaussian random generator.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
                             ops::GaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>,
+                       ops::CPUGaussianRandomKernel<double>);
+REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like,
+                       ops::CPUGaussianRandomKernel<float>,
+                       ops::CPUGaussianRandomKernel<double>);
paddle/fluid/operators/gaussian_random_op.cu
...
@@ -61,4 +61,8 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
}  // namespace paddle

REGISTER_OP_CUDA_KERNEL(gaussian_random,
-                        paddle::operators::GPUGaussianRandomKernel<float>);
+                        paddle::operators::GPUGaussianRandomKernel<float>,
+                        paddle::operators::GPUGaussianRandomKernel<double>);
+REGISTER_OP_CUDA_KERNEL(gaussian_random_batch_size_like,
+                        paddle::operators::GPUGaussianRandomKernel<float>,
+                        paddle::operators::GPUGaussianRandomKernel<double>);
paddle/fluid/operators/layer_norm_op.h
...
@@ -196,7 +196,7 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
        // dy_dx
        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
            ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
-       framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
+       framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x);

        // dy_dmean_dx
        row_mean(dev_ctx, temp, &temp_vec);
...
@@ -208,7 +208,7 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
            ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
      } else {
        // dy_dx
-       framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
+       framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);

        // dy_dmean_dx
        row_mean(dev_ctx, d_y, &temp_vec);
...
paddle/fluid/operators/load_combine_op.cc
...
@@ -69,7 +69,7 @@ class LoadCombineOp : public framework::OperatorBase {
        out_var->Clear();
        tensor = out_var->GetMutable<framework::LoDTensor>();
        tensor->set_lod(cpu_tensor.lod());
-       Copy(cpu_tensor, place, dev_ctx, tensor);
+       TensorCopy(cpu_tensor, place, dev_ctx, tensor);
      }
    }
  }
...
paddle/fluid/operators/load_op.cc
...
@@ -55,7 +55,7 @@ class LoadOp : public framework::OperatorBase {
      out_var->Clear();
      tensor = out_var->GetMutable<framework::LoDTensor>();
      tensor->set_lod(cpu_tensor.lod());
-     Copy(cpu_tensor, place, dev_ctx, tensor);
+     TensorCopy(cpu_tensor, place, dev_ctx, tensor);
    }
  }
};
...
paddle/fluid/operators/lod_reset_op.h
...
@@ -33,8 +33,8 @@ class LoDResetKernel : public framework::OpKernel<T> {
      auto* lod = lod_t->data<int>();
      if (platform::is_gpu_place(ctx.GetPlace())) {
        framework::Tensor lod_cpu;
-       framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(),
-                       &lod_cpu);
+       framework::TensorCopy(*lod_t, platform::CPUPlace(),
+                             ctx.device_context(), &lod_cpu);
        lod = lod_cpu.data<int>();
      }
      level0 = std::vector<int>(lod, lod + lod_t->numel());
...
paddle/fluid/operators/lod_tensor_to_array_op.cc
...
@@ -94,9 +94,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
          platform::DeviceContextPool::Instance();
      auto &dev_ctx = *pool.Get(place);
-     framework::Copy(x.Slice(static_cast<int>(each_range.begin),
-                             static_cast<int>(each_range.end)),
-                     x.place(), dev_ctx, &slice);
+     framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
+                                   static_cast<int>(each_range.end)),
+                           x.place(), dev_ctx, &slice);
      offset += len;
    }
  }
...
paddle/fluid/operators/math/context_project.h
...
@@ -149,7 +149,8 @@ class ContextProjectFunctor {
              Tensor out_t_sub = out_t.Slice(k * context_length,
                                             k * context_length + padding_size);
              Tensor w_sub = padding_data.Slice(k, k + padding_size);
-             framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
+             framework::TensorCopy(w_sub, context.GetPlace(), context,
+                                   &out_t_sub);
            }
          }
          if (down_pad > 0) {  // add down pad
...
@@ -179,7 +180,8 @@ class ContextProjectFunctor {
                  (down_pad_begin_row + t) * context_length);
              Tensor w_sub = padding_data.Slice(
                  up_pad + padding_idx, up_pad + padding_idx + padding_size);
-             framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
+             framework::TensorCopy(w_sub, context.GetPlace(), context,
+                                   &out_t_sub);
            }
          }
        out_t.Resize({sequence_height, context_length * sequence_width});
...
paddle/fluid/operators/math/im2col_test.cc
...
@@ -62,7 +62,7 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
-   Copy(input_tmp, *place, *context, &input);
+   TensorCopy(input_tmp, *place, *context, &input);
  }
  output_cfo.mutable_data<float>(
      {1, filter_size, filter_size, output_height, output_width}, *place);
...
@@ -87,7 +87,7 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    out_cfo_ptr = output_cfo.data<float>();
  } else {
-   Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
+   TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
    out_cfo_ptr = output_tmp.data<float>();
  }
  for (int i = 0; i < 6; ++i) {
...
@@ -98,7 +98,7 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    out_ocf_ptr = output_ocf.data<float>();
  } else {
-   Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
+   TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
    out_ocf_ptr = output_tmp.data<float>();
  }
...
@@ -119,7 +119,7 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
-   Copy(input_tmp, *place, *context, &input);
+   TensorCopy(input_tmp, *place, *context, &input);
  }
  col2im(*context, output_cfo, dilation, stride, padding, &input);
...
@@ -128,7 +128,7 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    in_ptr = input.data<float>();
  } else {
-   Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+   TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
    in_ptr = input_tmp.data<float>();
  }
  for (int i = 0; i < 6; ++i) {
...
@@ -140,7 +140,7 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
-   Copy(input_tmp, *place, *context, &input);
+   TensorCopy(input_tmp, *place, *context, &input);
  }
  col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
...
@@ -148,7 +148,7 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    in_ptr = input.data<float>();
  } else {
-   Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+   TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
    in_ptr = input_tmp.data<float>();
  }
  for (int i = 0; i < 6; ++i) {
...
paddle/fluid/operators/math/math_function_test.cu
...
@@ -29,15 +29,15 @@ TEST(math_function, notrans_mul_trans) {
  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);

-  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);

  out_gpu.mutable_data<float>({2, 2}, *gpu_place);

  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);

-  paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
+  paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);

  float* out_ptr = out.data<float>();
  context.Wait();
...
@@ -63,15 +63,15 @@ TEST(math_function, trans_mul_notrans) {
  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);

-  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);

  out_gpu.mutable_data<float>({3, 3}, *gpu_place);

  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);

-  paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
+  paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);

  float* out_ptr = out.data<float>();
  context.Wait();
...
@@ -112,9 +112,9 @@ TEST(math_function, gemm_notrans_cublas) {
  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);

-  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
-  paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
+  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
  float* a = input1_gpu.data<float>();
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(*gpu_place);
...
@@ -122,7 +122,7 @@ TEST(math_function, gemm_notrans_cublas) {
  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);

-  paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
+  paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);

  // numpy code:
  // a = np.arange(6).reshape(2, 3)
...
@@ -167,9 +167,9 @@ TEST(math_function, gemm_trans_cublas) {
  auto* gpu_place = new paddle::platform::CUDAPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);

-  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
-  paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
+  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
  float* a = input1_gpu.data<float>();
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(*gpu_place);
...
@@ -177,7 +177,7 @@ TEST(math_function, gemm_trans_cublas) {
  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);

-  paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
+  paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
  context.Wait();

  EXPECT_EQ(input3_ptr[0], 0);
...
@@ -218,15 +218,15 @@ void GemvTest(int m, int n, bool trans) {
  }
  paddle::platform::CUDADeviceContext context(*gpu_place);

-  paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a);
-  paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b);
+  paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a);
+  paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b);

  paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
      context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
      g_data_b, 0., g_data_c);

-  paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context,
-                          &vec_c);
+  paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(),
+                                context, &vec_c);

  if (!trans) {
    for (int i = 0; i < m; ++i) {
...
paddle/fluid/operators/math/selected_rows_functor_test.cu
...
@@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
  EXPECT_EQ(out_rows[6], 9);

  Tensor out_cpu;
-  Copy(*out_value, cpu_place, ctx, &out_cpu);
+  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
  ctx.Wait();

  auto* out_cpu_data = out_cpu.data<float>();
...
@@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
  add_tensor_functor(ctx, *output, *tensor1, tensor2.get());

  Tensor tensor2_cpu;
-  Copy(*tensor2, cpu_place, ctx, &tensor2_cpu);
+  TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
  ctx.Wait();

  auto* tensor2_cpu_data = tensor2_cpu.data<float>();
...
@@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) {
  EXPECT_EQ(out_rows[6], 9);

  Tensor out_cpu;
-  Copy(*out_value, cpu_place, ctx, &out_cpu);
+  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
  ctx.Wait();

  auto* out_cpu_data = out_cpu.data<float>();
...
@@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) {
  add_to_tensor_functor(ctx, *output, tensor1.get());

  Tensor tensor1_cpu;
-  Copy(*tensor1, cpu_place, ctx, &tensor1_cpu);
+  TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
  ctx.Wait();

  auto* tensor1_cpu_data = tensor1_cpu.data<float>();
...
paddle/fluid/operators/math/sequence_padding.cu
...
@@ -97,7 +97,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                      "width of sequence in LoDTensor seq.");

    if (!norm_by_times && num_sequences == 1UL) {
-     Copy(seq, context.GetPlace(), context, &padding);
+     TensorCopy(seq, context.GetPlace(), context, &padding);
      padding.Resize(padding_dims);
      return;
    }
...
@@ -172,7 +172,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                      "width of sequence in LoDTensor seq.");

    if (!norm_by_times && num_sequences == 1UL) {
-     Copy(padding, context.GetPlace(), context, &seq);
+     TensorCopy(padding, context.GetPlace(), context, &seq);
      seq.Resize(seq_dims);
      return;
    }
...
paddle/fluid/operators/math/sequence_padding_test.cc
...
@@ -40,7 +40,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  if (paddle::platform::is_cpu_place(*place)) {
    seq = cpu_seq;
  } else {
-   Copy(cpu_seq, *place, *context, &seq);
+   TensorCopy(cpu_seq, *place, *context, &seq);
    seq.set_lod(lod);
  }
...
@@ -63,7 +63,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  if (paddle::platform::is_cpu_place(*place)) {
    cpu_seq_back = seq_back;
  } else {
-   Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
+   TensorCopy(seq_back, paddle::platform::CPUPlace(), *context,
+              &cpu_seq_back);
    cpu_seq_back.set_lod(lod);
  }
...
paddle/fluid/operators/math/vol2col_test.cc
...
@@ -71,7 +71,7 @@ void testVol2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
-   Copy(input_tmp, *place, *context, &input);
+   paddle::framework::TensorCopy(input_tmp, *place, *context, &input);
  }
  output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                              output_depth, output_height, output_width},
...
@@ -85,7 +85,7 @@ void testVol2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    out_cfo_ptr = output.data<float>();
  } else {
-   Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
+   TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
    out_cfo_ptr = output_tmp.data<float>();
  }
...
@@ -99,7 +99,7 @@ void testVol2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
-   Copy(input_tmp, *place, *context, &input);
+   TensorCopy(input_tmp, *place, *context, &input);
  }
  paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
...
@@ -109,7 +109,7 @@ void testVol2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    in_ptr = input.data<float>();
  } else {
-   Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+   TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
    in_ptr = input_tmp.data<float>();
  }
...
paddle/fluid/operators/merge_lod_tensor_op.cc
...
@@ -51,7 +51,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
      cpu_mask->ShareDataWith(mask);
    } else if (platform::is_gpu_place(mask.place())) {
#ifdef PADDLE_WITH_CUDA
-     framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+     framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
+                           cpu_mask.get());
#else
      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
...
@@ -106,8 +107,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
        continue;
      }
      auto slice = out->Slice(out_offset, out_offset + len);
-     framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx,
-                     &slice);
+     framework::TensorCopy(input->Slice(start_offset, end_offset), place,
+                           dev_ctx, &slice);
      out_offset += len;
      (*in_idx) += 1;
    }
...
paddle/fluid/operators/mine_hard_examples_op.cc
...
@@ -67,7 +67,8 @@ class MineHardExamplesKernel : public framework::OpKernel<T> {
    auto out_match_indices =
        ctx.Output<framework::Tensor>("UpdatedMatchIndices");
-   framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
+   framework::TensorCopy(*in_matched_indices, ctx.GetPlace(),
+                         out_match_indices);

    int batch_size = in_matched_indices->dims()[0];
    int prior_num = in_matched_indices->dims()[1];
...
paddle/fluid/operators/multiplex_op.cu
...
@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
    auto cols = ins[0]->numel() / rows;
    // copy index to cpu
    Tensor index_t_cpu;
-   Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+   TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
    auto* index = index_t_cpu.data<int32_t>();
    auto stream = ctx.cuda_device_context().stream();
    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
...
@@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
    auto cols = ins[0]->numel() / rows;
    // copy index to cpu
    Tensor index_t_cpu;
-   Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+   TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
    auto* index = index_t_cpu.data<int32_t>();

    auto stream = ctx.cuda_device_context().stream();
...
paddle/fluid/operators/nccl_op_test.cu.cc
...
@@ -98,7 +98,7 @@ class NCCLTester : public ::testing::Test {
    send_tensor->mutable_data<T>(kDims, place);

    std::vector<T> send_vector(f::product(kDims), gpu_id);
-   paddle::framework::CopyFromVector<T>(send_vector, *ctx, send_tensor);
+   paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
    ctx->Wait();
    VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
  }
...
paddle/fluid/operators/parallel_do_op.cc
...
@@ -79,7 +79,7 @@ inline void CopyOrShare(const framework::Variable &src,
      dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
    } else {
-     Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+     TensorCopy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
    }
  } else if (src.IsType<SelectedRows>()) {
    auto &src_sr = src.Get<SelectedRows>();
...
@@ -89,7 +89,7 @@ inline void CopyOrShare(const framework::Variable &src,
      dst_sr->mutable_value()->ShareDataWith(src_sr.value());
      dst_sr->set_rows(src_sr.rows());
    } else {
-     Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+     TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value());
    }
  } else {
    PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
...
@@ -147,7 +147,7 @@ class ParallelDoOp : public framework::OperatorBase {
        auto &place = places[i];
        auto *sub_scope = sub_scopes[i];
        auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
-       framework::Copy(src, place, dst);
+       framework::TensorCopy(src, place, dst);
      }
    }
    WaitOnPlaces(places);
...
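CopyOrShare embodies a simple rule: when source and destination live in the same place, alias the buffer (ShareDataWith); otherwise do a real cross-device TensorCopy. A minimal sketch of that decision, with shared_ptr standing in for tensor storage (every name here is illustrative, not Paddle API):

#include <iostream>
#include <memory>
#include <vector>

struct FakeTensor {
  int place;  // 0 = CPU, 1 = GPU0, ... (illustrative encoding)
  std::shared_ptr<std::vector<float>> data;
};

// Share when places match (cheap alias), deep-copy otherwise.
void CopyOrShare(const FakeTensor& src, int dst_place, FakeTensor* dst) {
  dst->place = dst_place;
  if (src.place == dst_place) {
    dst->data = src.data;  // ShareDataWith analogue
  } else {
    // TensorCopy analogue: a fresh buffer with the same contents.
    dst->data = std::make_shared<std::vector<float>>(*src.data);
  }
}

int main() {
  FakeTensor src{0, std::make_shared<std::vector<float>>(3, 1.f)};
  FakeTensor same_place, other_place;
  CopyOrShare(src, 0, &same_place);
  CopyOrShare(src, 1, &other_place);
  std::cout << std::boolalpha << (same_place.data == src.data) << " "
            << (other_place.data == src.data) << "\n";  // true false
  return 0;
}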
paddle/fluid/operators/print_op.cc
...
@@ -179,7 +179,7 @@ class TensorPrintOp : public framework::OperatorBase {
    } else {
      // copy data to cpu to print
      platform::CPUPlace place;
-     framework::Copy(in_tensor, place, &printed_tensor);
+     framework::TensorCopy(in_tensor, place, &printed_tensor);
    }

    Formater formater;
...
paddle/fluid/operators/recurrent_op.cc
...
@@ -291,7 +291,7 @@ class RecurrentOp : public RecurrentBase {
          auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
          // Explicit copy output since the local RNN scope can be destroyed
          // early.
-         framework::Copy(src_tensor, place, dev_ctx, &dst_out);
+         framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
        });

    scopes.Next();
...
@@ -378,7 +378,7 @@ class RecurrentGradOp : public RecurrentBase {
        auto *cur_grad_var = cur_scope.Var(cur_grad);
        auto cur_grad_tensor =
            cur_grad_var->GetMutable<framework::LoDTensor>();
-       framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor);
+       framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
      }
    }
...
@@ -452,7 +452,7 @@ class RecurrentGradOp : public RecurrentBase {
          }
          auto dst = outside->Slice(seq_offset, seq_offset + 1);
-         framework::Copy(inside, place, dev_ctx, &dst);
+         framework::TensorCopy(inside, place, dev_ctx, &dst);
        });
    VLOG(5) << "Link outside gradient finished ";
...
@@ -465,7 +465,7 @@ class RecurrentGradOp : public RecurrentBase {
          framework::LoDTensor *outside) {
          outside->Resize(inside.dims());
          outside->mutable_data(place, inside.type());
-         framework::Copy(inside, place, dev_ctx, outside);
+         framework::TensorCopy(inside, place, dev_ctx, outside);
        });
    VLOG(5) << "Link initialize state gradient finished ";
  }
...
paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
...
@@ -170,7 +170,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);
-   framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
+   framework::TensorCopy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
    out_offset += len;
    return out_offset;
  }
...
paddle/fluid/operators/reshape_op.h
...
@@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto out_dims = out->dims();
    out->mutable_data<T>(ctx.GetPlace());
-   framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out);
+   framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
    out->Resize(out_dims);
  }
};
...
@@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
    d_x->mutable_data<T>(ctx.GetPlace());
    auto in_dims = d_x->dims();
-   framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+   framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
    d_x->Resize(in_dims);
  }
};
...
paddle/fluid/operators/sequence_reshape_op.h
...
@@ -61,7 +61,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> {
      }
    }

-   framework::Copy(*in, context.GetPlace(), out);
+   framework::TensorCopy(*in, context.GetPlace(), out);
    out->Resize({static_cast<int64_t>(out->lod()[0].back()), out_width});
  }
};
...
@@ -77,7 +77,7 @@ class SequenceReshapeGradKernel : public framework::OpKernel<T> {
        context.Output<LoDTensor>(framework::GradVarName("X"));
    xg_tensor_ptr->mutable_data<T>(context.GetPlace());

-   framework::Copy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr);
+   framework::TensorCopy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr);
    xg_tensor_ptr->Resize(x_tensor_ptr->dims());
  }
};
...
paddle/fluid/operators/sequence_slice_op.h
...
@@ -66,13 +66,13 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
    if (platform::is_gpu_place(ctx.GetPlace())) {
      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-     framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
-                     &offset_cpu);
+     framework::TensorCopy(*offset, platform::CPUPlace(),
+                           ctx.device_context(), &offset_cpu);
      offset_data = offset_cpu.data<int64_t>();

      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-     framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
-                     &length_cpu);
+     framework::TensorCopy(*length, platform::CPUPlace(),
+                           ctx.device_context(), &length_cpu);
      length_data = length_cpu.data<int64_t>();
    }
...
@@ -127,13 +127,13 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
    if (platform::is_gpu_place(ctx.GetPlace())) {
      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-     framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
-                     &offset_cpu);
+     framework::TensorCopy(*offset, platform::CPUPlace(),
+                           ctx.device_context(), &offset_cpu);
      offset_data = offset_cpu.data<int64_t>();

      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-     framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
-                     &length_cpu);
+     framework::TensorCopy(*length, platform::CPUPlace(),
+                           ctx.device_context(), &length_cpu);
      length_data = length_cpu.data<int64_t>();
    }
...
paddle/fluid/operators/shrink_rnn_memory_op.cc
...
@@ -133,7 +133,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
      auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
      auto height = dout_tensor.dims()[0];
      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
-     framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
+     framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
      if (dx_tensor.dims()[0] > height) {
        auto rest_tensor = dx_tensor.Slice(
            static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
...
paddle/fluid/operators/split_lod_tensor_op.cc
...
@@ -55,7 +55,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
      cpu_mask->ShareDataWith(mask);
    } else if (platform::is_gpu_place(mask.place())) {
#ifdef PADDLE_WITH_CUDA
-     framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+     framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
+                           cpu_mask.get());
#else
      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
...
@@ -113,9 +114,9 @@ class SplitLoDTensorOp : public framework::OperatorBase {
      // out[offset: offset+len] = x[each_range.begin: each_range.end]
      auto slice = out->Slice(static_cast<int>(offset),
                              static_cast<int>(offset + len));
-     framework::Copy(x.Slice(static_cast<int>(each_range.begin),
-                             static_cast<int>(each_range.end)),
-                     x.place(), dev_ctx, &slice);
+     framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
+                                   static_cast<int>(each_range.end)),
+                           x.place(), dev_ctx, &slice);
      offset += len;
    }
  }
...
paddle/fluid/operators/sum_op.h
...
@@ -137,8 +137,8 @@ class SumKernel : public framework::OpKernel<T> {
          out_array.resize(i + 1);
        }
        if (out_array[i].numel() == 0) {
-         framework::Copy(in_array[i], in_array[i].place(),
-                         context.device_context(), &out_array[i]);
+         framework::TensorCopy(in_array[i], in_array[i].place(),
+                               context.device_context(), &out_array[i]);
          out_array[i].set_lod(in_array[i].lod());
        } else {
          PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
...
paddle/fluid/operators/tensor_array_read_write_op.cc
...
@@ -45,7 +45,7 @@ class WriteToArrayOp : public ArrayOp {
          platform::DeviceContextPool::Instance();
      auto &dev_ctx = *pool.Get(place);
-     Copy(x_tensor, place, dev_ctx, out_tensor);
+     TensorCopy(x_tensor, place, dev_ctx, out_tensor);
      out_tensor->set_lod(x_tensor.lod());
    } else {
      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
...
@@ -138,7 +138,7 @@ class ReadFromArrayOp : public ArrayOp {
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto &dev_ctx = *pool.Get(place);
-     framework::Copy(x_array[offset], place, dev_ctx, out_tensor);
+     framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor);
      out_tensor->set_lod(x_array[offset].lod());
    } else {
      VLOG(10) << "offset " << offset << " >= " << x_array.size();
...
paddle/fluid/operators/uniform_random_batch_size_like_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/batch_size_like.h"

namespace paddle {
namespace operators {

class UniformRandomBatchSizeLikeOp : public BatchSizeLikeOp {
 protected:
  using BatchSizeLikeOp::BatchSizeLikeOp;

  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.GetPlace());
  }
};

class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
 public:
  UniformRandomBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : BatchSizeLikeOpMaker(proto, op_checker) {
    AddComment(R"DOC(
Uniform random operator

This operator initializes a tensor with the same batch_size as the Input tensor
with random values sampled from a uniform distribution.
)DOC");
    AddAttr<float>("min",
                   "(float, default -1.0) "
                   "Minimum value of uniform random")
        .SetDefault(-1.0f);
    AddAttr<float>("max",
                   "(float, default 1.0) "
                   "Maximum value of uniform random")
        .SetDefault(1.0f);
    AddAttr<int>("seed",
                 "(int, default 0) "
                 "Random seed used for generating samples. "
                 "0 means use a seed generated by the system."
                 "Note that if seed is not 0, this operator will always "
                 "generate the same random numbers every time.")
        .SetDefault(0);
    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
        .SetDefault(framework::proto::DataType::FP32);
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_WITHOUT_GRADIENT(
    uniform_random_batch_size_like,
    paddle::operators::UniformRandomBatchSizeLikeOp,
    paddle::operators::UniformRandomBatchSizeLikeOpMaker);
// Kernels are registered in uniform_random_op.cc and uniform_random_op.cu
paddle/fluid/operators/uniform_random_op.cc
...
@@ -79,7 +79,7 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
Uniform random operator.

This operator initializes a tensor with random values sampled from a
uniform distribution.

)DOC");
...
@@ -96,7 +96,9 @@ uniform distribution.
    AddAttr<int>("seed",
                 "(int, default 0) "
                 "Random seed used for generating samples. "
-                "0 means use a seed generated by the system.")
+                "0 means use a seed generated by the system."
+                "Note that if seed is not 0, this operator will always "
+                "generate the same random numbers every time.")
        .SetDefault(0);
    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
        .SetDefault(framework::proto::DataType::FP32);
...
@@ -110,3 +112,6 @@ REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp,
REGISTER_OP_CPU_KERNEL(uniform_random,
                       paddle::operators::CPUUniformRandomKernel<float>,
                       paddle::operators::CPUUniformRandomKernel<double>);
+REGISTER_OP_CPU_KERNEL(uniform_random_batch_size_like,
+                       paddle::operators::CPUUniformRandomKernel<float>,
+                       paddle::operators::CPUUniformRandomKernel<double>);
paddle/fluid/operators/uniform_random_op.cu
...
@@ -66,3 +66,6 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
REGISTER_OP_CUDA_KERNEL(uniform_random,
                        paddle::operators::GPUUniformRandomKernel<float>,
                        paddle::operators::GPUUniformRandomKernel<double>);
+REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
+                        paddle::operators::GPUUniformRandomKernel<float>,
+                        paddle::operators::GPUUniformRandomKernel<double>);
paddle/fluid/operators/warpctc_op.h
...
@@ -185,7 +185,8 @@ class WarpCTCKernel : public framework::OpKernel<T> {
    // warpctc accesses labels in CPU memory
    Tensor warpctc_label;
-   Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label);
+   TensorCopy(*label, platform::CPUPlace(), ctx.device_context(),
+              &warpctc_label);
    const int* warpctc_label_data = warpctc_label.data<int>();
    // warpctc stores loss in CPU memory
    Tensor warpctc_loss;
...
@@ -200,7 +201,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
                  sequence_width, num_sequences, blank, warpctc_loss_data);
    // Copy the loss back
-   Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
+   TensorCopy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
  }
};
...
paddle/fluid/pybind/tensor_py.h
...
@@ -101,7 +101,7 @@ T TensorGetElement(framework::Tensor &self, size_t offset) {
    return self.data<T>()[offset];
  } else {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
-   framework::Copy(self, platform::CPUPlace(), dst.get());
+   framework::TensorCopy(self, platform::CPUPlace(), dst.get());
    return dst->data<T>()[offset];
  }
}
...
@@ -111,9 +111,9 @@ template <typename T>
void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
  if (platform::is_gpu_place(self.place())) {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
-   framework::Copy(self, platform::CPUPlace(), dst.get());
+   framework::TensorCopy(self, platform::CPUPlace(), dst.get());
    dst->data<T>()[offset] = elem;
-   framework::Copy(*dst.get(), self.place(), &self);
+   framework::TensorCopy(*dst.get(), self.place(), &self);
  } else if (platform::is_cpu_place(self.place())) {
    self.data<T>()[offset] = elem;
...
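TensorSetElement above writes a single element of a GPU tensor by round-tripping the whole tensor: copy to host, patch one value, copy back. That is correct but O(n) per element, which is acceptable only because it backs Python-side debugging accessors. A hedged CUDA-runtime sketch of the same round trip (set_device_element is a name invented here, not library API):

#include <cstdio>
#include <cuda_runtime.h>
#include <vector>

// Copy the device buffer to host, patch one element, copy it back --
// the pattern TensorSetElement uses for tensors on CUDAPlace.
void set_device_element(float* dev_buf, size_t n, size_t offset, float elem) {
  std::vector<float> host(n);
  cudaMemcpy(host.data(), dev_buf, n * sizeof(float), cudaMemcpyDeviceToHost);
  host[offset] = elem;
  cudaMemcpy(dev_buf, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
}

int main() {
  float* dev = nullptr;
  std::vector<float> init(4, 0.f);
  cudaMalloc(&dev, 4 * sizeof(float));
  cudaMemcpy(dev, init.data(), 4 * sizeof(float), cudaMemcpyHostToDevice);
  set_device_element(dev, 4, 2, 7.f);
  cudaMemcpy(init.data(), dev, 4 * sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("%.1f\n", init[2]);  // 7.0
  cudaFree(dev);
  return 0;
}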
python/paddle/v2/fluid/layers/nn.py
...
@@ -68,6 +68,7 @@ __all__ = [
    'layer_norm',
    'softmax_with_cross_entropy',
    'smooth_l1',
+   'one_hot',
]
...
@@ -3212,3 +3213,40 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
                 'Out': loss},
        attrs={'sigma': sigma})
    return loss


def one_hot(input, depth):
    """
    One Hot Operator. This operator creates the one-hot representations for input
    index values. The following example will help to explain the function of this
    operator.

    Args:
        input(Tensor/LodTensor): A Tensor/LodTensor of indices, last dimension must be 1.
        depth(scalar): an integer defining the depth of the one hot dimension.

    Returns:
        The one-hot tensor or LodTensor, same as input.

    Examples:
        X is a LoDTensor:
          X.lod = [[0, 1, 4]]
          X.shape = [4, 1]
          X.data = [[1], [1], [3], [0]]
        set depth = 4
        Out is a LoDTensor:
          Out.lod = [[0, 1, 4]]
          Out.shape = [4, 4]
          Out.data = [[0., 1., 0., 0.],
                      [0., 1., 0., 0.],
                      [0., 0., 0., 1.],
                      [1., 0., 0., 0.]]
    """
    helper = LayerHelper("one_hot", **locals())
    one_hot_out = helper.create_tmp_variable(dtype='float32')
    helper.append_op(
        type="one_hot",
        inputs={'X': input},
        attrs={'depth': depth},
        outputs={'Out': one_hot_out})
    return one_hot_out
python/paddle/v2/fluid/layers/ops.py
...
@@ -66,6 +66,9 @@ __all__ = [
    'logical_xor',
    'logical_not',
    'uniform_random',
+   'uniform_random_batch_size_like',
+   'gaussian_random',
+   'gaussian_random_batch_size_like',
    'cumsum',
] + __activations__
...
python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
...
...
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from __future__ import print_function

import unittest
import paddle.v2.fluid as fluid
...
...
@@ -23,7 +24,8 @@ import sys
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
                    hid_dim=32):
-   emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+   emb = fluid.layers.embedding(
+       input=data, size=[input_dim, emb_dim], is_sparse=True)
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
...
...
@@ -41,8 +43,6 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
                                 act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
-   adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-   adam_optimizer.minimize(avg_cost)
    accuracy = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, accuracy, prediction
...
...
@@ -56,7 +56,8 @@ def stacked_lstm_net(data,
                     stacked_num=3):
    assert stacked_num % 2 == 1
-   emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+   emb = fluid.layers.embedding(
+       input=data, size=[input_dim, emb_dim], is_sparse=True)
    # add bias attr

    # TODO(qijun) linear act
...
...
@@ -79,8 +80,6 @@ def stacked_lstm_net(data,
                                 act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
-   adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-   adam_optimizer.minimize(avg_cost)
    accuracy = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, accuracy, prediction
...
...
@@ -93,7 +92,7 @@ def create_random_lodtensor(lod, place, low, high):
    return res


-def train(word_dict, net_method, use_cuda, save_dirname=None):
+def train(word_dict, net_method, use_cuda, parallel=False, save_dirname=None):
    BATCH_SIZE = 128
    PASS_NUM = 5
    dict_dim = len(word_dict)
...
...
@@ -102,8 +101,30 @@ def train(word_dict, net_method, use_cuda, save_dirname=None):
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-   cost, acc_out, prediction = net_method(
-       data, label, input_dim=dict_dim, class_dim=class_dim)
+   if not parallel:
+       cost, acc_out, prediction = net_method(
+           data, label, input_dim=dict_dim, class_dim=class_dim)
+   else:
+       places = fluid.layers.get_places()
+       pd = fluid.layers.ParallelDo(places)
+       with pd.do():
+           cost, acc, _ = net_method(
+               pd.read_input(data),
+               pd.read_input(label),
+               input_dim=dict_dim,
+               class_dim=class_dim)
+           pd.write_output(cost)
+           pd.write_output(acc)
+       cost, acc = pd()
+       cost = fluid.layers.mean(x=cost)
+       acc_out = fluid.layers.mean(x=acc)
+       prediction = None
+       assert save_dirname is None
+
+   adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
+   adagrad.minimize(cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
...
...
@@ -164,14 +185,16 @@ def infer(use_cuda, save_dirname=None):
    print("Inference results: ", np_data)


-def main(word_dict, net_method, use_cuda):
+def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return

-   # Directory for saving the trained model
-   save_dirname = "understand_sentiment.inference.model"
-
-   train(word_dict, net_method, use_cuda, save_dirname)
+   train(word_dict, net_method, use_cuda, parallel=parallel,
+         save_dirname=save_dirname)
    infer(use_cuda, save_dirname)
...
...
@@ -191,20 +214,62 @@ class TestUnderstandSentiment(unittest.TestCase):
    def test_conv_cpu(self):
        with self.new_program_scope():
-           main(self.word_dict, net_method=convolution_net, use_cuda=False)
+           main(
+               self.word_dict,
+               net_method=convolution_net,
+               use_cuda=False,
+               save_dirname="understand_sentiment.inference.model")

    def test_conv_cpu_parallel(self):
        with self.new_program_scope():
            main(
                self.word_dict,
                net_method=convolution_net,
                use_cuda=False,
                parallel=True)

    @unittest.skip(reason="make CI faster")
    def test_stacked_lstm_cpu(self):
        with self.new_program_scope():
            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False)

    def test_stacked_lstm_cpu_parallel(self):
        with self.new_program_scope():
            main(
                self.word_dict,
                net_method=stacked_lstm_net,
                use_cuda=False,
                parallel=True)

    def test_conv_gpu(self):
        with self.new_program_scope():
-           main(self.word_dict, net_method=convolution_net, use_cuda=True)
+           main(
+               self.word_dict,
+               net_method=convolution_net,
+               use_cuda=True,
+               save_dirname="understand_sentiment.inference.model")

    def test_conv_gpu_parallel(self):
        with self.new_program_scope():
            main(
                self.word_dict,
                net_method=convolution_net,
                use_cuda=True,
                parallel=True)

    @unittest.skip(reason="make CI faster")
    def test_stacked_lstm_gpu(self):
        with self.new_program_scope():
            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=True)

    def test_stacked_lstm_gpu_parallel(self):
        with self.new_program_scope():
            main(
                self.word_dict,
                net_method=stacked_lstm_net,
                use_cuda=True,
                parallel=True)


if __name__ == '__main__':
    unittest.main()
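The parallel branch added to train() is the canonical ParallelDo pattern: split each input across the available places, replicate the forward block on each place, gather the per-place outputs, and reduce with mean. A self-contained sketch of just that pattern, using a toy one-layer regression network rather than the test's sentiment nets:

import paddle.v2.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')

places = fluid.layers.get_places()      # one place per available device
pd = fluid.layers.ParallelDo(places)
with pd.do():                           # this block runs once per place
    y_hat = fluid.layers.fc(input=pd.read_input(x), size=1)
    loss = fluid.layers.square_error_cost(input=y_hat,
                                          label=pd.read_input(y))
    pd.write_output(loss)               # gathered across places on exit
loss = pd()
avg_loss = fluid.layers.mean(x=loss)    # reduce the gathered batch dim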
python/paddle/v2/fluid/tests/unittests/op_test.py
...
...
@@ -248,7 +248,11 @@ class OpTest(unittest.TestCase):
        return feed_map

-   def check_output_with_place(self, place, atol):
+   def calc_output(self, place):
+       outs, _ = self._calc_output(place)
+       return outs
+
+   def _calc_output(self, place):
        op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
        program = Program()
...
...
@@ -281,7 +285,10 @@ class OpTest(unittest.TestCase):
                       feed=feed_map,
                       fetch_list=fetch_list,
                       return_numpy=False)
        return outs, fetch_list
+
+   def check_output_with_place(self, place, atol):
+       outs, fetch_list = self._calc_output(place)
        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
            if out_name not in self.outputs:
                continue
...
...
@@ -340,6 +347,15 @@ class OpTest(unittest.TestCase):
        for place in places:
            self.check_output_with_place(place, atol)

+   def check_output_customized(self, checker):
+       places = [core.CPUPlace()]
+       if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
+           places.append(core.CUDAPlace(0))
+       for place in places:
+           outs = self.calc_output(place)
+           outs = [np.array(out) for out in outs]
+           checker(outs)
+
    def __assert_is_close(self, numeric_grads, analytic_grads, names,
                          max_relative_error, msg_prefix):
...
...
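check_output_customized decouples running an op from checking it, which is what non-deterministic ops need: elementwise comparison cannot work for random outputs. A hedged sketch of a test built on the new hook; the class name is hypothetical, with the op name and shapes borrowed from the tests below:

import numpy as np
from op_test import OpTest

class TestSomeRandomOp(OpTest):
    def setUp(self):
        self.op_type = "uniform_random"
        self.inputs = {}
        self.attrs = {"shape": [1000, 784], "min": -5.0, "max": 10.0}
        self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}

    def test_check_output(self):
        # Runs the op on every supported place and hands the raw numpy
        # outputs to the checker instead of comparing them elementwise.
        self.check_output_customized(self.verify_output)

    def verify_output(self, outs):
        self.assertEqual(outs[0].shape, (1000, 784))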
python/paddle/v2/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
0 → 100644

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
from op_test import OpTest


class TestGaussianRandomBatchSizeLike(OpTest):
    def setUp(self):
        self.op_type = "gaussian_random_batch_size_like"
        self.inputs = {'Input': np.zeros((500, 2000), dtype="float32")}
        self.attrs = {'mean': 1., 'std': 2., 'shape': [-1, 2000]}
        self.outputs = {'Out': np.zeros((500, 2000), dtype='float32')}

    def test_check_output(self):
        self.check_output_customized(self.verify_output)

    def verify_output(self, outs):
        self.assertEqual(outs[0].shape, (500, 2000))
        hist, _ = np.histogram(outs[0], range=(-3, 5))
        hist = hist.astype("float32")
        hist /= float(outs[0].size)
        data = np.random.normal(size=(500, 2000), loc=1, scale=2)
        hist2, _ = np.histogram(data, range=(-3, 5))
        hist2 = hist2.astype("float32")
        hist2 /= float(outs[0].size)
        self.assertTrue(
            np.allclose(hist, hist2, rtol=0, atol=0.01),
            "hist: " + str(hist) + " hist2: " + str(hist2))


if __name__ == "__main__":
    unittest.main()
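The check above compares the normalized histogram of the op output against a fresh numpy reference sample instead of exact values. The strategy is easy to sanity-check in plain numpy: two independent N(1, 2) samples of this size agree bin by bin to well within atol=0.01:

import numpy as np

a = np.random.normal(loc=1., scale=2., size=(500, 2000))
b = np.random.normal(loc=1., scale=2., size=(500, 2000))
hist_a, _ = np.histogram(a, range=(-3, 5))  # 10 equal-width bins by default
hist_b, _ = np.histogram(b, range=(-3, 5))
assert np.allclose(hist_a / float(a.size), hist_b / float(b.size),
                   rtol=0, atol=0.01)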
python/paddle/v2/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
0 → 100644

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
from op_test import OpTest


class TestUniformRandomBatchSizeLike(OpTest):
    def setUp(self):
        self.op_type = "uniform_random_batch_size_like"
        self.inputs = {'Input': np.zeros((500, 2000), dtype="float32")}
        self.attrs = {'min': 1., 'max': 2., 'shape': [-1, 2000]}
        self.outputs = {'Out': np.zeros((500, 2000), dtype='float32')}

    def test_check_output(self):
        self.check_output_customized(self.verify_output)

    def verify_output(self, outs):
        self.assertEqual(outs[0].shape, (500, 2000))
        hist, _ = np.histogram(outs[0], range=(1, 2))
        hist = hist.astype("float32")
        hist /= float(outs[0].size)
        prob = 0.1 * np.ones((10))
        self.assertTrue(
            np.allclose(hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))


if __name__ == "__main__":
    unittest.main()
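Why prob = 0.1 * np.ones((10)): np.histogram defaults to 10 equal-width bins, and a uniform sample over (1, 2) puts about a tenth of its mass in each one. The same check in plain numpy:

import numpy as np

u = np.random.uniform(1., 2., size=(500, 2000))
hist, _ = np.histogram(u, range=(1, 2))
hist = hist.astype("float32") / float(u.size)
assert np.allclose(hist, 0.1 * np.ones(10), rtol=0, atol=0.01)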
python/paddle/v2/fluid/tests/unittests/test_uniform_random_op.py
...
...
@@ -13,14 +13,11 @@
# limitations under the License.

import unittest
-import numpy
+import numpy as np
+from op_test import OpTest
-from paddle.v2.fluid.op import Operator
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid as fluid


-class TestUniformRandomOp(unittest.TestCase):
+class TestUniformRandomOp(OpTest):
    def setUp(self):
        self.op_type = "uniform_random"
        self.inputs = {}
...
...
@@ -30,35 +27,20 @@ class TestUniformRandomOp(unittest.TestCase):
            "max": 10.0,
            "seed": 10
        }
-       self.outputs = ["Out"]
+       self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")}
-
-   def test_cpu(self):
-       self.uniform_random_test(place=core.CPUPlace())
-
-   def test_gpu(self):
-       if core.is_compiled_with_cuda():
-           self.uniform_random_test(place=core.CUDAPlace(0))
-
-   def uniform_random_test(self, place):
-       program = fluid.Program()
-       block = program.global_block()
-       vout = block.create_var(name="Out")
-       op = block.append_op(
-           type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
-
-       op.desc.infer_var_type(block.desc)
-       op.desc.infer_shape(block.desc)
-
-       fetch_list = []
-       for var_name in self.outputs:
-           fetch_list.append(block.var(var_name))
-
-       exe = fluid.Executor(place)
-       outs = exe.run(program, fetch_list=fetch_list)
+
+   def test_check_output(self):
+       self.check_output_customized(self.verify_output)
+
+   def verify_output(self, outs):
+       tensor = outs[0]
+       self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
+       hist, _ = np.histogram(outs[0], range=(-5, 10))
+       hist = hist.astype("float32")
+       hist /= float(outs[0].size)
+       prob = 0.1 * np.ones((10))
+       self.assertTrue(
+           np.allclose(hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))


if __name__ == "__main__":
...
...
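The retained mean assertion is a cheap distribution check: a uniform sample over (min, max) has mean (min + max) / 2, and the histogram range (-5, 10) together with max = 10.0 is consistent with min = -5.0 (the min attribute is elided from this hunk), giving 2.5. In plain numpy, with the shape used by the test:

import numpy as np

sample = np.random.uniform(-5., 10., size=(1000, 784))
assert abs(sample.mean() - 2.5) < 0.1  # delta=0.1 is generous at this size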