提交 07915c95 编写于 作者: Y Yu Yang

Renamed to strided_memcpy and prettify unittests

Add unittests for Crop and Concat
上级 3a4897ab
...@@ -96,4 +96,4 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") ...@@ -96,4 +96,4 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(tensor_copy_test SRCS tensor_copy_test.cc DEPS tensor paddle_memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
...@@ -22,10 +22,10 @@ namespace operators { ...@@ -22,10 +22,10 @@ namespace operators {
namespace detail { namespace detail {
template <typename T, int Rank> template <typename T, int Rank>
struct TensorCopyFunctor; struct StridedMemcpyFunctor;
template <typename T> template <typename T>
struct TensorCopyFunctor<T, 1> { struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<1> src_stride, framework::Dim<1> dst_dim, framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
framework::Dim<1> dst_stride, T* dst) const { framework::Dim<1> dst_stride, T* dst) const {
...@@ -48,12 +48,12 @@ struct TensorCopyFunctor<T, 1> { ...@@ -48,12 +48,12 @@ struct TensorCopyFunctor<T, 1> {
}; };
template <typename T, int Rank> template <typename T, int Rank>
struct TensorCopyFunctor { struct StridedMemcpyFunctor {
void operator()(const platform::DeviceContext& dev_ctx, const T* src, void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim, framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
framework::Dim<Rank> dst_stride, T* dst) const { framework::Dim<Rank> dst_stride, T* dst) const {
for (int64_t i = 0; i < dst_dim.head; ++i) { for (int64_t i = 0; i < dst_dim.head; ++i) {
TensorCopyFunctor<T, Rank - 1> func; StridedMemcpyFunctor<T, Rank - 1> func;
func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
src += src_stride.head; src += src_stride.head;
dst += dst_stride.head; dst += dst_stride.head;
...@@ -62,8 +62,8 @@ struct TensorCopyFunctor { ...@@ -62,8 +62,8 @@ struct TensorCopyFunctor {
}; };
template <typename T> template <typename T>
struct TensorCopyDimVisitor : public boost::static_visitor<void> { struct StridedCopyDimVisitor : public boost::static_visitor<void> {
TensorCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride, const framework::DDim& src_stride,
const framework::DDim& dst_stride, T* dst) const framework::DDim& dst_stride, T* dst)
: dev_ctx_(dev_ctx), : dev_ctx_(dev_ctx),
...@@ -77,7 +77,7 @@ struct TensorCopyDimVisitor : public boost::static_visitor<void> { ...@@ -77,7 +77,7 @@ struct TensorCopyDimVisitor : public boost::static_visitor<void> {
Dim src_stride = boost::get<Dim>(src_stride_); Dim src_stride = boost::get<Dim>(src_stride_);
Dim dst_stride = boost::get<Dim>(dst_stride_); Dim dst_stride = boost::get<Dim>(dst_stride_);
constexpr int dim = Dim::dimensions; constexpr int dim = Dim::dimensions;
TensorCopyFunctor<T, dim> functor; StridedMemcpyFunctor<T, dim> functor;
functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
} }
......
...@@ -13,15 +13,17 @@ ...@@ -13,15 +13,17 @@
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/operators/detail/tensor_copy.h" #include "paddle/operators/detail/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Copy a tensor from src to dst. // Strided memory copy from src to dst.
// The src and dst should be both on dev_ctx.GetPlace()
// //
// the stride of an array (also referred to as increment, pitch or step size) is // The src and dst should be both on dev_ctx.GetPlace(), otherwise, there will
// be a segmentation fault.
//
// The stride of an array (also referred to as increment, pitch or step size) is
// the number of locations in memory between beginnings of successive array // the number of locations in memory between beginnings of successive array
// elements // elements
// //
...@@ -31,12 +33,12 @@ namespace operators { ...@@ -31,12 +33,12 @@ namespace operators {
// NOTE: When using GPU, the memcpy is async. To sync memcpy, please invoke // NOTE: When using GPU, the memcpy is async. To sync memcpy, please invoke
// `dev_ctx.Wait()`. // `dev_ctx.Wait()`.
template <typename T> template <typename T>
inline void TensorCopy(const platform::DeviceContext& dev_ctx, const T* src, inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
const framework::DDim& src_stride, const framework::DDim& src_stride,
const framework::DDim& dst_dim, const framework::DDim& dst_dim,
const framework::DDim& dst_stride, T* dst) { const framework::DDim& dst_stride, T* dst) {
using namespace detail; using namespace detail;
TensorCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst); StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim); boost::apply_visitor(func, dst_dim);
} }
} // namespace operators } // namespace operators
......
...@@ -12,16 +12,21 @@ ...@@ -12,16 +12,21 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/tensor_copy.h" #include "paddle/operators/strided_memcpy.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/memory/memory.h" #include "paddle/memory/memory.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
TEST(TensorCopy, CPU_COPY) {
TEST(StridedMemcpy, CPUCrop) {
// clang-format off
int src[] = { int src[] = {
0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0,
0, 3, 4, 0, 0,
0, 0, 0, 0, 0,
}; };
// clang-format on
framework::DDim src_stride({5, 1}); framework::DDim src_stride({5, 1});
...@@ -30,7 +35,7 @@ TEST(TensorCopy, CPU_COPY) { ...@@ -30,7 +35,7 @@ TEST(TensorCopy, CPU_COPY) {
framework::DDim dst_stride({2, 1}); framework::DDim dst_stride({2, 1});
platform::CPUDeviceContext ctx; platform::CPUDeviceContext ctx;
TensorCopy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst); StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
ASSERT_EQ(1, dst[0]); ASSERT_EQ(1, dst[0]);
ASSERT_EQ(2, dst[1]); ASSERT_EQ(2, dst[1]);
...@@ -38,11 +43,44 @@ TEST(TensorCopy, CPU_COPY) { ...@@ -38,11 +43,44 @@ TEST(TensorCopy, CPU_COPY) {
ASSERT_EQ(4, dst[3]); ASSERT_EQ(4, dst[3]);
} }
// Concatenation on the CPU: a 2x2 source is copied twice into an 8-element
// destination laid out as 2 rows of 4, first at column offset 0 and then at
// column offset 2, so each destination row holds the source row twice.
TEST(StridedMemcpy, CPUConcat) {
  // Copy geometry: source rows are 2 elements apart, destination rows are
  // 4 elements apart, and each call copies a 2x2 region.
  framework::DDim src_stride({2, 1});
  framework::DDim dst_dim({2, 2});
  framework::DDim dst_stride({4, 1});

  // clang-format off
  int src[] = {
    1, 2,
    3, 4
  };
  // clang-format on

  // Every element is written by the two copies below, so no initializer.
  int dst[8];

  platform::CPUDeviceContext ctx;

  // Left half of each destination row.
  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
  // Right half of each destination row (shifted by two columns).
  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);

  // clang-format off
  int expect_dst[] = {
    1, 2, 1, 2,
    3, 4, 3, 4
  };
  // clang-format on

  const size_t expect_count = sizeof(expect_dst) / sizeof(int);
  for (size_t i = 0; i < expect_count; ++i) {
    ASSERT_EQ(expect_dst[i], dst[i]);
  }
}
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
TEST(TensorCopy, GPU_COPY) { TEST(StridedMemcpy, GPUCrop) {
// clang-format off
int src[] = { int src[] = {
0, 1, 2, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0,
0, 3, 4, 0, 0,
0, 0, 0, 0, 0,
}; };
// clang-format on
platform::GPUPlace gpu0(0); platform::GPUPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
...@@ -59,9 +97,11 @@ TEST(TensorCopy, GPU_COPY) { ...@@ -59,9 +97,11 @@ TEST(TensorCopy, GPU_COPY) {
framework::DDim dst_stride({2, 1}); framework::DDim dst_stride({2, 1});
platform::CUDADeviceContext ctx(gpu0); platform::CUDADeviceContext ctx(gpu0);
TensorCopy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
gpu_dst);
memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst)); memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
ctx.Wait();
ASSERT_EQ(1, dst[0]); ASSERT_EQ(1, dst[0]);
ASSERT_EQ(2, dst[1]); ASSERT_EQ(2, dst[1]);
...@@ -72,6 +112,49 @@ TEST(TensorCopy, GPU_COPY) { ...@@ -72,6 +112,49 @@ TEST(TensorCopy, GPU_COPY) {
memory::Free(gpu0, gpu_src); memory::Free(gpu0, gpu_src);
} }
// Concatenation on the GPU: a 2x2 source is uploaded to the device, copied
// twice into an 8-element device buffer laid out as 2 rows of 4 (at column
// offsets 0 and 2), then downloaded and compared against the expected
// row-wise duplication.
TEST(StridedMemcpy, GPUConcat) {
  // clang-format off
  int src[] = {
    1, 2,
    3, 4
  };
  // clang-format on

  platform::GPUPlace gpu0(0);
  platform::CPUPlace cpu;

  // Upload the source. NOTE(review): this uses the stream-less Copy overload,
  // presumably synchronous -- confirm against paddle/memory.
  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));

  // Host-side landing buffer; fully overwritten by the download below.
  int dst[8];
  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));

  // Source rows are 2 elements apart, destination rows are 4 elements apart,
  // and each call copies a 2x2 region.
  framework::DDim src_stride({2, 1});
  framework::DDim dst_dim({2, 2});
  framework::DDim dst_stride({4, 1});

  platform::CUDADeviceContext ctx(gpu0);

  // Left half, then right half, of each destination row.
  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
                     gpu_dst + 2);

  // Download on the context's stream, then wait for the queued work to
  // finish before touching `dst` on the host.
  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
  ctx.Wait();

  // Release the device buffers BEFORE asserting: ASSERT_EQ returns from the
  // test body on the first mismatch, which would otherwise skip these calls
  // and leak both allocations. The result already lives in host `dst`.
  memory::Free(gpu0, gpu_dst);
  memory::Free(gpu0, gpu_src);

  // clang-format off
  int expect_dst[] = {
    1, 2, 1, 2,
    3, 4, 3, 4
  };
  // clang-format on

  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
    ASSERT_EQ(expect_dst[i], dst[i]);
  }
}
#endif #endif
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册