提交 d865b047 编写于 作者: Q qingqing01 提交者: GitHub

Merge pull request #4201 from qingqing01/fix_prelu

Refine platform::Transform function and fix prelu_op testing.
......@@ -54,8 +54,9 @@ class PReluKernel : public framework::OpKernel {
int numel = x->numel();
Transform(context.device_context(), x_ptr, x_ptr + numel, o_ptr,
PReluFunctor<T>(alpha_ptr));
Transform<Place> trans;
trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr,
PReluFunctor<T>(alpha_ptr));
}
};
......@@ -91,8 +92,9 @@ class PReluGradKernel : public framework::OpKernel {
const T* out_ptr = out->data<T>();
int numel = dx->numel();
Transform(context.device_context(), out_ptr, out_ptr + numel, dout_ptr,
dx_ptr, PReluGradFunctor<T>(alpha_ptr));
Transform<Place> trans;
trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr,
PReluGradFunctor<T>(alpha_ptr));
// TODO (Zhuoyuan): add dalpha upgrade when GPU kernels ready
}
......
......@@ -29,45 +29,71 @@
namespace paddle {
namespace platform {
// Transform on host or device. It provides the same API in std library.
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void Transform(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op) {
auto place = context.GetPlace();
if (is_cpu_place(place)) {
template <typename Place>
struct Transform {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op);
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op);
};
template <>
struct Transform<platform::CPUPlace> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
std::transform(first, last, result, op);
} else {
#ifdef __NVCC__
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
using namespace details;
thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first),
DevPtrCast(last), DevPtrCast(result), op);
#else
PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
#endif
}
}
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void Transform(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
if (is_cpu_place(place)) {
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
std::transform(first1, last1, first2, result, op);
} else {
}
};
#ifdef __NVCC__
template <>
struct Transform<platform::GPUPlace> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
using namespace details;
thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first1),
DevPtrCast(last1), DevPtrCast(first2), DevPtrCast(result),
thrust::transform(thrust::cuda::par.on(ctx.stream()),
details::DevPtrCast(first), details::DevPtrCast(last),
details::DevPtrCast(result), op);
}
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
thrust::transform(thrust::cuda::par.on(ctx.stream()),
details::DevPtrCast(first1), details::DevPtrCast(last1),
details::DevPtrCast(first2), details::DevPtrCast(result),
op);
#else
PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
#endif
}
};
#endif
} // namespace platform
} // namespace paddle
......@@ -15,6 +15,7 @@
#include <gtest/gtest.h>
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/hostdevice.h"
#include "paddle/platform/transform.h"
template <typename T>
......@@ -38,7 +39,8 @@ TEST(Transform, CPUUnary) {
using namespace paddle::platform;
CPUDeviceContext ctx;
float buf[4] = {0.1, 0.2, 0.3, 0.4};
Transform(ctx, buf, buf + 4, buf, Scale<float>(10));
Transform<paddle::platform::CPUPlace> trans;
trans(ctx, buf, buf + 4, buf, Scale<float>(10));
for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
}
......@@ -52,7 +54,8 @@ TEST(Transform, GPUUnary) {
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
Transform<paddle::platform::GPUPlace> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
Free(gpu0, gpu_buf);
......@@ -65,7 +68,9 @@ TEST(Transform, CPUBinary) {
using namespace paddle::platform;
using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4};
Transform(CPUDeviceContext(), buf, buf + 4, buf, buf, Multiply<int>());
Transform<paddle::platform::CPUPlace> trans;
CPUDeviceContext ctx;
trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]);
}
......@@ -79,11 +84,12 @@ TEST(Transform, GPUBinary) {
CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
Transform<paddle::platform::GPUPlace> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]);
}
}
\ No newline at end of file
}
......@@ -17,10 +17,10 @@ class PReluTest(OpTest):
assert out_np is not self.inputs['X']
self.outputs = {'Out': out_np}
def not_test_check_output(self):
def test_check_output(self):
self.check_output()
def not_test_check_grad(self):
def test_check_grad(self):
self.check_grad(['X'], 'Out')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册