提交 d865b047 编写于 作者: Q qingqing01 提交者: GitHub

Merge pull request #4201 from qingqing01/fix_prelu

Refine platform::Transform function and fix prelu_op testing.
...@@ -54,7 +54,8 @@ class PReluKernel : public framework::OpKernel { ...@@ -54,7 +54,8 @@ class PReluKernel : public framework::OpKernel {
int numel = x->numel(); int numel = x->numel();
Transform(context.device_context(), x_ptr, x_ptr + numel, o_ptr, Transform<Place> trans;
trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr,
PReluFunctor<T>(alpha_ptr)); PReluFunctor<T>(alpha_ptr));
} }
}; };
...@@ -91,8 +92,9 @@ class PReluGradKernel : public framework::OpKernel { ...@@ -91,8 +92,9 @@ class PReluGradKernel : public framework::OpKernel {
const T* out_ptr = out->data<T>(); const T* out_ptr = out->data<T>();
int numel = dx->numel(); int numel = dx->numel();
Transform(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, Transform<Place> trans;
dx_ptr, PReluGradFunctor<T>(alpha_ptr)); trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr,
PReluGradFunctor<T>(alpha_ptr));
// TODO (Zhuoyuan): add dalpha upgrade when GPU kernels ready // TODO (Zhuoyuan): add dalpha upgrade when GPU kernels ready
} }
......
...@@ -29,45 +29,71 @@ ...@@ -29,45 +29,71 @@
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// Transform on host or device. It provides the same API in std library. // Transform on host or device. It provides the same API in std library.
template <typename InputIter, typename OutputIter, typename UnaryOperation> template <typename Place>
void Transform(const DeviceContext& context, InputIter first, InputIter last, struct Transform {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op);
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op);
};
template <>
struct Transform<platform::CPUPlace> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op) { OutputIter result, UnaryOperation op) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (is_cpu_place(place)) { PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
std::transform(first, last, result, op); std::transform(first, last, result, op);
} else {
#ifdef __NVCC__
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
using namespace details;
thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first),
DevPtrCast(last), DevPtrCast(result), op);
#else
PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
#endif
} }
}
template <typename InputIter1, typename InputIter2, typename OutputIter, template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation> typename BinaryOperation>
void Transform(const DeviceContext& context, InputIter1 first1, void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result, InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) { BinaryOperation op) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (is_cpu_place(place)) { PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
std::transform(first1, last1, first2, result, op); std::transform(first1, last1, first2, result, op);
} else { }
};
#ifdef __NVCC__ #ifdef __NVCC__
template <>
struct Transform<platform::GPUPlace> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context); auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
using namespace details; thrust::transform(thrust::cuda::par.on(ctx.stream()),
thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first1), details::DevPtrCast(first), details::DevPtrCast(last),
DevPtrCast(last1), DevPtrCast(first2), DevPtrCast(result), details::DevPtrCast(result), op);
}
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
thrust::transform(thrust::cuda::par.on(ctx.stream()),
details::DevPtrCast(first1), details::DevPtrCast(last1),
details::DevPtrCast(first2), details::DevPtrCast(result),
op); op);
#else
PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
#endif
} }
}; };
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/memory/memcpy.h" #include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h" #include "paddle/memory/memory.h"
#include "paddle/platform/hostdevice.h"
#include "paddle/platform/transform.h" #include "paddle/platform/transform.h"
template <typename T> template <typename T>
...@@ -38,7 +39,8 @@ TEST(Transform, CPUUnary) { ...@@ -38,7 +39,8 @@ TEST(Transform, CPUUnary) {
using namespace paddle::platform; using namespace paddle::platform;
CPUDeviceContext ctx; CPUDeviceContext ctx;
float buf[4] = {0.1, 0.2, 0.3, 0.4}; float buf[4] = {0.1, 0.2, 0.3, 0.4};
Transform(ctx, buf, buf + 4, buf, Scale<float>(10)); Transform<paddle::platform::CPUPlace> trans;
trans(ctx, buf, buf + 4, buf, Scale<float>(10));
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5); ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
} }
...@@ -52,7 +54,8 @@ TEST(Transform, GPUUnary) { ...@@ -52,7 +54,8 @@ TEST(Transform, GPUUnary) {
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4)); float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10)); Transform<paddle::platform::GPUPlace> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
Free(gpu0, gpu_buf); Free(gpu0, gpu_buf);
...@@ -65,7 +68,9 @@ TEST(Transform, CPUBinary) { ...@@ -65,7 +68,9 @@ TEST(Transform, CPUBinary) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::memory; using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
Transform(CPUDeviceContext(), buf, buf + 4, buf, buf, Multiply<int>()); Transform<paddle::platform::CPUPlace> trans;
CPUDeviceContext ctx;
trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]); ASSERT_EQ((i + 1) * (i + 1), buf[i]);
} }
...@@ -79,7 +84,8 @@ TEST(Transform, GPUBinary) { ...@@ -79,7 +84,8 @@ TEST(Transform, GPUBinary) {
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf))); int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>()); Transform<paddle::platform::GPUPlace> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
Free(gpu0, gpu_buf); Free(gpu0, gpu_buf);
......
...@@ -17,10 +17,10 @@ class PReluTest(OpTest): ...@@ -17,10 +17,10 @@ class PReluTest(OpTest):
assert out_np is not self.inputs['X'] assert out_np is not self.inputs['X']
self.outputs = {'Out': out_np} self.outputs = {'Out': out_np}
def not_test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def not_test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册