Commit 24cb1ce0 authored by Yu Yang, committed by GitHub

Merge pull request #4177 from reyoung/feature/transform_ctx

Feature/transform ctx

Pass a DeviceContext into platform::Transform instead of a bare Place, so that the CUDA path can run thrust::transform on the context's stream.
paddle/platform/CMakeLists.txt:

@@ -24,4 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
paddle/platform/transform.h:

@@ -14,6 +14,7 @@
 #pragma once
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/hostdevice.h"
 #include "paddle/platform/place.h"
@@ -21,6 +22,7 @@
 #include <algorithm>
 #include <type_traits>
 #ifdef __NVCC__
+#include <thrust/execution_policy.h>
 #include <thrust/transform.h>
 #include "paddle/platform/details/device_ptr_cast.h"
 #endif
@@ -28,34 +30,39 @@
 namespace paddle {
 namespace platform {

 // Transform on host or device. It provides the same API in std library.
-template <typename Place, typename InputIter, typename OutputIter,
-          typename UnaryOperation>
-void Transform(Place place, InputIter first, InputIter last, OutputIter result,
-               UnaryOperation op) {
+template <typename InputIter, typename OutputIter, typename UnaryOperation>
+void Transform(const DeviceContext& context, InputIter first, InputIter last,
+               OutputIter result, UnaryOperation op) {
+  auto place = context.GetPlace();
   if (is_cpu_place(place)) {
     std::transform(first, last, result, op);
   } else {
 #ifdef __NVCC__
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
     using namespace details;
-    thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
-                      op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first),
+                      DevPtrCast(last), DevPtrCast(result), op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif
   }
 }

-template <typename Place, typename InputIter1, typename InputIter2,
-          typename OutputIter, typename BinaryOperation>
-void Transform(Place place, InputIter1 first1, InputIter1 last1,
-               InputIter2 first2, OutputIter result, BinaryOperation op) {
+template <typename InputIter1, typename InputIter2, typename OutputIter,
+          typename BinaryOperation>
+void Transform(const DeviceContext& context, InputIter1 first1,
+               InputIter1 last1, InputIter2 first2, OutputIter result,
+               BinaryOperation op) {
+  auto place = context.GetPlace();
   if (is_cpu_place(place)) {
     std::transform(first1, last1, first2, result, op);
   } else {
 #ifdef __NVCC__
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
     using namespace details;
-    thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
-                      DevPtrCast(result), op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first1),
+                      DevPtrCast(last1), DevPtrCast(first2), DevPtrCast(result),
+                      op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif
 ...
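The core of the change: Transform now takes a const DeviceContext& instead of a bare Place, recovers the place with context.GetPlace(), and on the CUDA path enqueues thrust::transform on the context's own stream via thrust::cuda::par.on(ctx.stream()) rather than thrust's default stream. Below is a minimal standalone sketch of the same dispatch pattern; PlaceKind, DeviceContext, and CPUDeviceContext here are simplified stand-ins for illustration, not the real Paddle classes:

// transform_sketch.cc -- illustrative only; compiles with any C++11 compiler.
#include <algorithm>
#include <cassert>

enum class PlaceKind { kCPU, kGPU };

struct DeviceContext {
  virtual PlaceKind GetPlace() const = 0;
  virtual ~DeviceContext() {}
};

struct CPUDeviceContext : public DeviceContext {
  PlaceKind GetPlace() const override { return PlaceKind::kCPU; }
};

// The caller passes a context; the backend is picked from the context's
// place, so call sites no longer hard-code a Place template argument.
template <typename InIt, typename OutIt, typename UnaryOp>
void Transform(const DeviceContext& ctx, InIt first, InIt last, OutIt result,
               UnaryOp op) {
  if (ctx.GetPlace() == PlaceKind::kCPU) {
    std::transform(first, last, result, op);  // host path
  }
  // The real header instead casts ctx to CUDADeviceContext here and calls
  // thrust::transform(thrust::cuda::par.on(ctx.stream()), ...) under __NVCC__.
}

int main() {
  CPUDeviceContext ctx;
  int buf[4] = {1, 2, 3, 4};
  Transform(ctx, buf, buf + 4, buf, [](int x) { return x * 10; });
  assert(buf[0] == 10 && buf[3] == 40);
  return 0;
}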
paddle/platform/transform_test.cu:

@@ -36,8 +36,9 @@ class Multiply {
 TEST(Transform, CPUUnary) {
   using namespace paddle::platform;
+  CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform(CPUPlace(), buf, buf + 4, buf, Scale<float>(10));
+  Transform(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
   }
@@ -47,10 +48,12 @@ TEST(Transform, GPUUnary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
@@ -62,7 +65,7 @@ TEST(Transform, CPUBinary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform(CPUPlace(), buf, buf + 4, buf, buf, Multiply<int>());
+  Transform(CPUDeviceContext(), buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
   }
@@ -73,9 +76,11 @@ TEST(Transform, GPUBinary) {
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
   GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
 ...