提交 07915c95 编写于 作者: Y Yu Yang

Renamed to strided_memcpy and prettify unittests

Add unittests for Crop and Concat
上级 3a4897ab
......@@ -96,4 +96,4 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(tensor_copy_test SRCS tensor_copy_test.cc DEPS tensor paddle_memory)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
......@@ -22,10 +22,10 @@ namespace operators {
namespace detail {
template <typename T, int Rank>
struct TensorCopyFunctor;
struct StridedMemcpyFunctor;
template <typename T>
struct TensorCopyFunctor<T, 1> {
struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
framework::Dim<1> dst_stride, T* dst) const {
......@@ -48,12 +48,12 @@ struct TensorCopyFunctor<T, 1> {
};
template <typename T, int Rank>
struct TensorCopyFunctor {
struct StridedMemcpyFunctor {
void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
framework::Dim<Rank> dst_stride, T* dst) const {
for (int64_t i = 0; i < dst_dim.head; ++i) {
TensorCopyFunctor<T, Rank - 1> func;
StridedMemcpyFunctor<T, Rank - 1> func;
func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
src += src_stride.head;
dst += dst_stride.head;
......@@ -62,8 +62,8 @@ struct TensorCopyFunctor {
};
template <typename T>
struct TensorCopyDimVisitor : public boost::static_visitor<void> {
TensorCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
struct StridedCopyDimVisitor : public boost::static_visitor<void> {
StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride,
const framework::DDim& dst_stride, T* dst)
: dev_ctx_(dev_ctx),
......@@ -77,7 +77,7 @@ struct TensorCopyDimVisitor : public boost::static_visitor<void> {
Dim src_stride = boost::get<Dim>(src_stride_);
Dim dst_stride = boost::get<Dim>(dst_stride_);
constexpr int dim = Dim::dimensions;
TensorCopyFunctor<T, dim> functor;
StridedMemcpyFunctor<T, dim> functor;
functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
}
......
......@@ -13,15 +13,17 @@
limitations under the License. */
#pragma once
#include "paddle/operators/detail/tensor_copy.h"
#include "paddle/operators/detail/strided_memcpy.h"
namespace paddle {
namespace operators {
// Copy a tensor from src to dst.
// The src and dst should be both on dev_ctx.GetPlace()
// Strided memory copy from src to dst.
//
// the stride of an array (also referred to as increment, pitch or step size) is
// The src and dst should be both on dev_ctx.GetPlace(), otherwise, there will
// be a segment fault.
//
// The stride of an array (also referred to as increment, pitch or step size) is
// the number of locations in memory between beginnings of successive array
// elements
//
......@@ -31,12 +33,12 @@ namespace operators {
// NOTE: When use GPU, the memcpy is async. To sync memcpy, please invoke
// `dev_ctx.Wait()`.
template <typename T>
inline void TensorCopy(const platform::DeviceContext& dev_ctx, const T* src,
inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride,
const framework::DDim& dst_dim,
const framework::DDim& dst_stride, T* dst) {
using namespace detail;
TensorCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim);
}
} // namespace operators
......
......@@ -12,16 +12,21 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/tensor_copy.h"
#include "paddle/operators/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/memory/memory.h"
namespace paddle {
namespace operators {
TEST(TensorCopy, CPU_COPY) {
TEST(StridedMemcpy, CPUCrop) {
// clang-format off
int src[] = {
0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 0, 0,
0, 3, 4, 0, 0,
0, 0, 0, 0, 0,
};
// clang-format on
framework::DDim src_stride({5, 1});
......@@ -30,7 +35,7 @@ TEST(TensorCopy, CPU_COPY) {
framework::DDim dst_stride({2, 1});
platform::CPUDeviceContext ctx;
TensorCopy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
ASSERT_EQ(1, dst[0]);
ASSERT_EQ(2, dst[1]);
......@@ -38,11 +43,44 @@ TEST(TensorCopy, CPU_COPY) {
ASSERT_EQ(4, dst[3]);
}
TEST(StridedMemcpy, CPUConcat) {
// clang-format off
int src[] = {
1, 2,
3, 4
};
// clang-format on
int dst[8];
framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1});
platform::CPUDeviceContext ctx;
StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
// clang-format off
int expect_dst[] = {
1, 2, 1, 2,
3, 4, 3, 4
};
// clang-format on
for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
ASSERT_EQ(expect_dst[i], dst[i]);
}
}
#ifndef PADDLE_ONLY_CPU
TEST(TensorCopy, GPU_COPY) {
TEST(StridedMemcpy, GPUCrop) {
// clang-format off
int src[] = {
0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 0, 0,
0, 3, 4, 0, 0,
0, 0, 0, 0, 0,
};
// clang-format on
platform::GPUPlace gpu0(0);
platform::CPUPlace cpu;
......@@ -59,9 +97,11 @@ TEST(TensorCopy, GPU_COPY) {
framework::DDim dst_stride({2, 1});
platform::CUDADeviceContext ctx(gpu0);
TensorCopy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
gpu_dst);
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst));
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
ctx.Wait();
ASSERT_EQ(1, dst[0]);
ASSERT_EQ(2, dst[1]);
......@@ -72,6 +112,49 @@ TEST(TensorCopy, GPU_COPY) {
memory::Free(gpu0, gpu_src);
}
TEST(StridedMemcpy, GPUConcat) {
// clang-format off
int src[] = {
1, 2,
3, 4
};
// clang-format on
platform::GPUPlace gpu0(0);
platform::CPUPlace cpu;
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
int dst[8];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1});
platform::CUDADeviceContext ctx(gpu0);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
gpu_dst + 2);
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
ctx.Wait();
// clang-format off
int expect_dst[] = {
1, 2, 1, 2,
3, 4, 3, 4
};
// clang-format on
for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
ASSERT_EQ(expect_dst[i], dst[i]);
}
memory::Free(gpu0, gpu_dst);
memory::Free(gpu0, gpu_src);
}
#endif
} // namespace operators
} // namespace paddle
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册