diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 95f0acace93c27f34b900ec8a912d381139f762e..90c7171419888612a48d929ed85039b16384a573 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -96,4 +96,4 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
-cc_test(tensor_copy_test SRCS tensor_copy_test.cc DEPS tensor paddle_memory)
+cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
diff --git a/paddle/operators/detail/tensor_copy.h b/paddle/operators/detail/strided_memcpy.h
similarity index 86%
rename from paddle/operators/detail/tensor_copy.h
rename to paddle/operators/detail/strided_memcpy.h
index 44fe4956481fc12fd0f3c2a49c1177c7b8d4aa78..b165224b37fb091c094a823179256c3dd40a37c9 100644
--- a/paddle/operators/detail/tensor_copy.h
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -22,10 +22,10 @@ namespace operators {
 namespace detail {
 
 template <typename T, int Rank>
-struct TensorCopyFunctor;
+struct StridedMemcpyFunctor;
 
 template <typename T>
-struct TensorCopyFunctor<T, 1> {
+struct StridedMemcpyFunctor<T, 1> {
   void operator()(const platform::DeviceContext& dev_ctx, const T* src,
                   framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
                   framework::Dim<1> dst_stride, T* dst) const {
@@ -48,12 +48,12 @@ struct TensorCopyFunctor<T, 1> {
 };
 
 template <typename T, int Rank>
-struct TensorCopyFunctor {
+struct StridedMemcpyFunctor {
   void operator()(const platform::DeviceContext& dev_ctx, const T* src,
                   framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
                   framework::Dim<Rank> dst_stride, T* dst) const {
     for (int64_t i = 0; i < dst_dim.head; ++i) {
-      TensorCopyFunctor<T, Rank - 1> func;
+      StridedMemcpyFunctor<T, Rank - 1> func;
       func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
       src += src_stride.head;
       dst += dst_stride.head;
@@ -62,10 +62,10 @@ struct TensorCopyFunctor {
 };
 
 template <typename T>
-struct TensorCopyDimVisitor : public boost::static_visitor<void> {
-  TensorCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
-                       const framework::DDim& src_stride,
-                       const framework::DDim& dst_stride, T* dst)
+struct StridedCopyDimVisitor : public boost::static_visitor<void> {
+  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
+                        const framework::DDim& src_stride,
+                        const framework::DDim& dst_stride, T* dst)
       : dev_ctx_(dev_ctx),
         src_(src),
         src_stride_(src_stride),
@@ -77,7 +77,7 @@ struct TensorCopyDimVisitor : public boost::static_visitor<void> {
     Dim src_stride = boost::get<Dim>(src_stride_);
     Dim dst_stride = boost::get<Dim>(dst_stride_);
     constexpr int dim = Dim::dimensions;
-    TensorCopyFunctor<T, dim> functor;
+    StridedMemcpyFunctor<T, dim> functor;
     functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
   }
 
diff --git a/paddle/operators/tensor_copy.h b/paddle/operators/strided_memcpy.h
similarity index 65%
rename from paddle/operators/tensor_copy.h
rename to paddle/operators/strided_memcpy.h
index 9210b4638b3a122fcaf7206a190213177f8b4f9f..c9dd80518424017d9834a2bf7aee14caa56c9d79 100644
--- a/paddle/operators/tensor_copy.h
+++ b/paddle/operators/strided_memcpy.h
@@ -13,15 +13,17 @@
    limitations under the License. */
 
 #pragma once
-#include "paddle/operators/detail/tensor_copy.h"
+#include "paddle/operators/detail/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {
 
-// Copy a tensor from src to dst.
-// The src and dst should be both on dev_ctx.GetPlace()
+// Strided memory copy from src to dst.
 //
-// the stride of an array (also referred to as increment, pitch or step size) is
+// Both src and dst must reside on dev_ctx.GetPlace(); otherwise, there will
+// be a segmentation fault.
+//
+// The stride of an array (also referred to as increment, pitch or step size) is
 // the number of locations in memory between beginnings of successive array
 // elements
 //
@@ -31,12 +33,12 @@ namespace operators {
 // NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke
 // `dev_ctx.Wait()`.
 template <typename T>
-inline void TensorCopy(const platform::DeviceContext& dev_ctx, const T* src,
-                       const framework::DDim& src_stride,
-                       const framework::DDim& dst_dim,
-                       const framework::DDim& dst_stride, T* dst) {
+inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
+                          const framework::DDim& src_stride,
+                          const framework::DDim& dst_dim,
+                          const framework::DDim& dst_stride, T* dst) {
   using namespace detail;
-  TensorCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
   boost::apply_visitor(func, dst_dim);
 }
 }  // namespace operators
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05882a88738cfc9cc23480efe0afe504008377ca
--- /dev/null
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -0,0 +1,160 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/strided_memcpy.h"
+#include "gtest/gtest.h"
+#include "paddle/memory/memory.h"
+
+namespace paddle {
+namespace operators {
+
+TEST(StridedMemcpy, CPUCrop) {  // crops a 2x2 window out of a 3x5 host array
+  // clang-format off
+  int src[] = {  // 3 rows of 5 ints; the window of interest starts at src[1]
+      0, 1, 2, 0, 0,
+      0, 3, 4, 0, 0,
+      0, 0, 0, 0, 0,
+  };
+  // clang-format on
+
+  framework::DDim src_stride({5, 1});  // one source row spans 5 ints
+
+  int dst[4];  // receives the window packed as 2x2
+  framework::DDim dst_dim({2, 2});  // extent of the region to copy
+  framework::DDim dst_stride({2, 1});  // one dst row spans 2 ints
+
+  platform::CPUDeviceContext ctx;
+  StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
+
+  ASSERT_EQ(1, dst[0]);
+  ASSERT_EQ(2, dst[1]);
+  ASSERT_EQ(3, dst[2]);
+  ASSERT_EQ(4, dst[3]);
+}
+
+TEST(StridedMemcpy, CPUConcat) {  // tiles one 2x2 source twice, side by side
+  // clang-format off
+  int src[] = {
+      1, 2,
+      3, 4
+  };
+  // clang-format on
+
+  int dst[8];  // used as 2 rows of 4: one copy lands at dst, one at dst + 2
+
+  framework::DDim src_stride({2, 1});  // source is a packed 2x2
+  framework::DDim dst_dim({2, 2});  // each call copies a 2x2 region
+  framework::DDim dst_stride({4, 1});  // one dst row spans 4 ints
+  platform::CPUDeviceContext ctx;
+
+  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
+  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
+
+  // clang-format off
+  int expect_dst[] = {
+      1, 2, 1, 2,
+      3, 4, 3, 4
+  };
+  // clang-format on
+  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
+    ASSERT_EQ(expect_dst[i], dst[i]);
+  }
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(StridedMemcpy, GPUCrop) {  // crops a 2x2 window out of a 3x5 GPU buffer
+  // clang-format off
+  int src[] = {  // 3 rows of 5 ints; the window of interest starts at index 1
+      0, 1, 2, 0, 0,
+      0, 3, 4, 0, 0,
+      0, 0, 0, 0, 0,
+  };
+  // clang-format on
+
+  platform::GPUPlace gpu0(0);
+  platform::CPUPlace cpu;
+
+  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));  // host -> device
+
+  framework::DDim src_stride({5, 1});  // one source row spans 5 ints
+
+  int dst[4];  // host-side buffer the result is read back into
+  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
+
+  framework::DDim dst_dim({2, 2});  // extent of the region to copy
+  framework::DDim dst_stride({2, 1});  // one dst row spans 2 ints
+
+  platform::CUDADeviceContext ctx(gpu0);
+  StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
+                     gpu_dst);
+
+  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  ctx.Wait();  // GPU copies are async; block before reading dst
+
+  ASSERT_EQ(1, dst[0]);
+  ASSERT_EQ(2, dst[1]);
+  ASSERT_EQ(3, dst[2]);
+  ASSERT_EQ(4, dst[3]);
+
+  memory::Free(gpu0, gpu_dst);  // NOTE(review): skipped if an ASSERT_* fails
+  memory::Free(gpu0, gpu_src);
+}
+
+TEST(StridedMemcpy, GPUConcat) {  // tiles one 2x2 source twice, on the GPU
+  // clang-format off
+  int src[] = {
+      1, 2,
+      3, 4
+  };
+  // clang-format on
+
+  platform::GPUPlace gpu0(0);
+  platform::CPUPlace cpu;
+
+  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));  // host -> device
+
+  int dst[8];  // host-side readback buffer, compared as 2 rows of 4
+  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
+
+  framework::DDim src_stride({2, 1});  // source is a packed 2x2
+  framework::DDim dst_dim({2, 2});  // each call copies a 2x2 region
+  framework::DDim dst_stride({4, 1});  // one dst row spans 4 ints
+  platform::CUDADeviceContext ctx(gpu0);
+
+  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
+                     gpu_dst + 2);
+
+  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  ctx.Wait();  // GPU copies are async; block before reading dst
+
+  // clang-format off
+  int expect_dst[] = {
+      1, 2, 1, 2,
+      3, 4, 3, 4
+  };
+  // clang-format on
+  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
+    ASSERT_EQ(expect_dst[i], dst[i]);
+  }
+
+  memory::Free(gpu0, gpu_dst);  // NOTE(review): skipped if an ASSERT_* fails
+  memory::Free(gpu0, gpu_src);
+}
+
+#endif
+}  // namespace operators
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/operators/tensor_copy_test.cc b/paddle/operators/tensor_copy_test.cc
deleted file mode 100644
index df177096d31393ef91be988d3f29102a084a614b..0000000000000000000000000000000000000000
--- a/paddle/operators/tensor_copy_test.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/tensor_copy.h"
-#include "gtest/gtest.h"
-#include "paddle/memory/memory.h"
-
-namespace paddle {
-namespace operators {
-TEST(TensorCopy, CPU_COPY) {
-  int src[] = {
-      0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0,
-  };
-
-  framework::DDim src_stride({5, 1});
-
-  int dst[4];
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
-
-  platform::CPUDeviceContext ctx;
-  TensorCopy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
-
-  ASSERT_EQ(1, dst[0]);
-  ASSERT_EQ(2, dst[1]);
-  ASSERT_EQ(3, dst[2]);
-  ASSERT_EQ(4, dst[3]);
-}
-
-#ifndef PADDLE_ONLY_CPU
-TEST(TensorCopy, GPU_COPY) {
-  int src[] = {
-      0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0,
-  };
-
-  platform::GPUPlace gpu0(0);
-  platform::CPUPlace cpu;
-
-  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
-
-  framework::DDim src_stride({5, 1});
-
-  int dst[4];
-  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
-
-  framework::DDim dst_dim({2, 2});
-  framework::DDim dst_stride({2, 1});
-
-  platform::CUDADeviceContext ctx(gpu0);
-  TensorCopy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
-
-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst));
-
-  ASSERT_EQ(1, dst[0]);
-  ASSERT_EQ(2, dst[1]);
-  ASSERT_EQ(3, dst[2]);
-  ASSERT_EQ(4, dst[3]);
-
-  memory::Free(gpu0, gpu_dst);
-  memory::Free(gpu0, gpu_src);
-}
-
-#endif
-}  // namespace operators
-}  // namespace paddle
\ No newline at end of file