Rename to strided_memcpy and prettify unit tests

Add unittests for Crop and Concat

Rename to strided_memcpy and prettify unit tests
Add unittests for Crop and Concat
3fb0b6e6 · Yu Yang · zchen0211 · bda67d9d · 3fb0b6e6 · 3fb0b6e6
4 changed files
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -96,4 +96,4 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
-cc_test(tensor_copy_test SRCS tensor_copy_test.cc DEPS tensor paddle_memory)
+cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
--- a/paddle/operators/detail/tensor_copy.h
+++ b/paddle/operators/detail/tensor_copy.h
@@ -22,10 +22,10 @@ namespace operators {
 namespace detail {

 template <typename T, int Rank>
-struct TensorCopyFunctor;
+struct StridedMemcpyFunctor;

 template <typename T>
-struct TensorCopyFunctor<T, 1> {
+struct StridedMemcpyFunctor<T, 1> {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
                  framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
                  framework::Dim<1> dst_stride, T* dst) const {
@@ -48,12 +48,12 @@ struct TensorCopyFunctor<T, 1> {
 };

 template <typename T, int Rank>
-struct TensorCopyFunctor {
+struct StridedMemcpyFunctor {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
                  framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
                  framework::Dim<Rank> dst_stride, T* dst) const {
    for (int64_t i = 0; i < dst_dim.head; ++i) {
-      TensorCopyFunctor<T, Rank - 1> func;
+      StridedMemcpyFunctor<T, Rank - 1> func;
      func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
      src += src_stride.head;
      dst += dst_stride.head;
@@ -62,10 +62,10 @@ struct TensorCopyFunctor {
 };

 template <typename T>
-struct TensorCopyDimVisitor : public boost::static_visitor<void> {
-  TensorCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
-                       const framework::DDim& src_stride,
-                       const framework::DDim& dst_stride, T* dst)
+struct StridedCopyDimVisitor : public boost::static_visitor<void> {
+  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
+                        const framework::DDim& src_stride,
+                        const framework::DDim& dst_stride, T* dst)
      : dev_ctx_(dev_ctx),
        src_(src),
        src_stride_(src_stride),
@@ -77,7 +77,7 @@ struct TensorCopyDimVisitor : public boost::static_visitor<void> {
    Dim src_stride = boost::get<Dim>(src_stride_);
    Dim dst_stride = boost::get<Dim>(dst_stride_);
    constexpr int dim = Dim::dimensions;
-    TensorCopyFunctor<T, dim> functor;
+    StridedMemcpyFunctor<T, dim> functor;
    functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
  }


--- a/paddle/operators/tensor_copy.h
+++ b/paddle/operators/tensor_copy.h
@@ -13,15 +13,17 @@
   limitations under the License. */

 #pragma once
-#include "paddle/operators/detail/tensor_copy.h"
+#include "paddle/operators/detail/strided_memcpy.h"

 namespace paddle {
 namespace operators {

-// Copy a tensor from src to dst.
-// The src and dst should be both on dev_ctx.GetPlace()
+// Strided memory copy from src to dst.
 //
-// the stride of an array (also referred to as increment, pitch or step size) is
+// Both src and dst must be on dev_ctx.GetPlace(); otherwise, there will be a
+// segmentation fault.
+//
+// The stride of an array (also referred to as increment, pitch or step size) is
 // the number of locations in memory between beginnings of successive array
 // elements
 //
@@ -31,12 +33,12 @@ namespace operators {
 // NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke
 // `dev_ctx.Wait()`.
 template <typename T>
-inline void TensorCopy(const platform::DeviceContext& dev_ctx, const T* src,
-                       const framework::DDim& src_stride,
-                       const framework::DDim& dst_dim,
-                       const framework::DDim& dst_stride, T* dst) {
+inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
+                          const framework::DDim& src_stride,
+                          const framework::DDim& dst_dim,
+                          const framework::DDim& dst_stride, T* dst) {
  using namespace detail;
-  TensorCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
  boost::apply_visitor(func, dst_dim);
 }
 }  // namespace operators

--- a/paddle/operators/tensor_copy_test.cc
+++ b/paddle/operators/tensor_copy_test.cc
@@ -12,16 +12,21 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/tensor_copy.h"
+#include "paddle/operators/strided_memcpy.h"
 #include "gtest/gtest.h"
 #include "paddle/memory/memory.h"

 namespace paddle {
 namespace operators {
-TEST(TensorCopy, CPU_COPY) {
+
+TEST(StridedMemcpy, CPUCrop) {
+  // clang-format off
  int src[] = {
-      0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0,
+      0, 1, 2, 0, 0,
+      0, 3, 4, 0, 0,
+      0, 0, 0, 0, 0,
  };
+  // clang-format on

  framework::DDim src_stride({5, 1});

@@ -30,7 +35,7 @@ TEST(TensorCopy, CPU_COPY) {
  framework::DDim dst_stride({2, 1});

  platform::CPUDeviceContext ctx;
-  TensorCopy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
+  StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);

  ASSERT_EQ(1, dst[0]);
  ASSERT_EQ(2, dst[1]);
@@ -38,11 +43,44 @@ TEST(TensorCopy, CPU_COPY) {
  ASSERT_EQ(4, dst[3]);
 }

+TEST(StridedMemcpy, CPUConcat) {
+  // clang-format off
+  int src[] = {
+      1, 2,
+      3, 4
+  };
+  // clang-format on
+  // Concatenate two copies of the 2x2 src side by side into a 2x4 dst.
+  int dst[8];  // laid out as 2 rows x 4 columns
+
+  framework::DDim src_stride({2, 1});  // src rows are 2 elements apart
+  framework::DDim dst_dim({2, 2});     // each call writes a 2x2 window
+  framework::DDim dst_stride({4, 1});  // dst rows are 4 elements apart
+  platform::CPUDeviceContext ctx;
+  // First call fills columns 0-1 of each dst row; the second, columns 2-3.
+  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
+  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
+
+  // clang-format off
+  int expect_dst[] = {
+      1, 2, 1, 2,
+      3, 4, 3, 4
+  };
+  // clang-format on
+  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
+    ASSERT_EQ(expect_dst[i], dst[i]);
+  }
+}
+
 #ifndef PADDLE_ONLY_CPU
-TEST(TensorCopy, GPU_COPY) {
+TEST(StridedMemcpy, GPUCrop) {
+  // clang-format off
  int src[] = {
-      0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0,
+      0, 1, 2, 0, 0,
+      0, 3, 4, 0, 0,
+      0, 0, 0, 0, 0,
  };
+  // clang-format on

  platform::GPUPlace gpu0(0);
  platform::CPUPlace cpu;
@@ -59,9 +97,11 @@ TEST(TensorCopy, GPU_COPY) {
  framework::DDim dst_stride({2, 1});

  platform::CUDADeviceContext ctx(gpu0);
-  TensorCopy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
+  StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
+                     gpu_dst);

-  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst));
+  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  ctx.Wait();

  ASSERT_EQ(1, dst[0]);
  ASSERT_EQ(2, dst[1]);
@@ -72,6 +112,49 @@ TEST(TensorCopy, GPU_COPY) {
  memory::Free(gpu0, gpu_src);
 }

+TEST(StridedMemcpy, GPUConcat) {
+  // clang-format off
+  int src[] = {
+      1, 2,
+      3, 4
+  };
+  // clang-format on
+  // Concatenate two copies of the 2x2 src side by side into a 2x4 dst on GPU.
+  platform::GPUPlace gpu0(0);
+  platform::CPUPlace cpu;
+  // Stage the host src array in memory allocated on gpu0.
+  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  // Host result buffer (2 rows x 4 columns) and its same-sized gpu0 buffer.
+  int dst[8];
+  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
+
+  framework::DDim src_stride({2, 1});  // src rows are 2 elements apart
+  framework::DDim dst_dim({2, 2});     // each call writes a 2x2 window
+  framework::DDim dst_stride({4, 1});  // dst rows are 4 elements apart
+  platform::CUDADeviceContext ctx(gpu0);
+  // First call fills columns 0-1 of each dst row; the second, columns 2-3.
+  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
+                     gpu_dst + 2);
+  // Copy the result back to the host on ctx's stream, then wait on ctx.
+  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  ctx.Wait();
+
+  // clang-format off
+  int expect_dst[] = {
+      1, 2, 1, 2,
+      3, 4, 3, 4
+  };
+  // clang-format on
+  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
+    ASSERT_EQ(expect_dst[i], dst[i]);
+  }
+
+  memory::Free(gpu0, gpu_dst);
+  memory::Free(gpu0, gpu_src);
+}
+
 #endif
 }  // namespace operators
 }  // namespace paddle
\ No newline at end of file