Commit 1d756746 authored by: Q qiaolongfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-listen-and-serv-op

# An image for building paddle binaries
# Use the CUDA devel base image for both CPU and GPU environments
# When you modify it, please be aware of the cudnn-runtime version
# and libcudnn.so.x in paddle/scripts/docker/build.sh
FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
......
......@@ -146,6 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return;
}
need_update_ = true;
ops_.erase(ops_.begin() + s, ops_.begin() + e);
}
......
......@@ -103,9 +103,7 @@ static void BuildVar(const std::string& param_name,
}
TEST(Operator, CPUtoGPU) {
using namespace paddle::framework;
using namespace paddle::platform;
InitDevices(true);
paddle::framework::InitDevices(true);
paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place;
......@@ -118,8 +116,9 @@ TEST(Operator, CPUtoGPU) {
auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
// prepare input
auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
auto* in_t = scope.Var("IN1")->GetMutable<paddle::framework::LoDTensor>();
auto* src_ptr =
in_t->mutable_data<float>({2, 3}, paddle::platform::CPUPlace());
for (int i = 0; i < 2 * 3; ++i) {
src_ptr[i] = static_cast<float>(i);
}
......@@ -128,7 +127,7 @@ TEST(Operator, CPUtoGPU) {
auto* output = scope.Var("OUT1");
cpu_op->Run(scope, cpu_place);
auto* output_ptr = output->Get<LoDTensor>().data<float>();
auto* output_ptr = output->Get<paddle::framework::LoDTensor>().data<float>();
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
}
......@@ -153,12 +152,14 @@ TEST(Operator, CPUtoGPU) {
VLOG(3) << "after gpu_op run";
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
DeviceContextPool& pool = DeviceContextPool::Instance();
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(cuda_place);
paddle::framework::Tensor output_tensor;
TensorCopy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
paddle::framework::TensorCopy(output2->Get<paddle::framework::LoDTensor>(),
paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
dev_ctx->Wait();
float* output2_ptr = output_tensor.data<float>();
......
......@@ -18,27 +18,28 @@
#include "paddle/fluid/platform/device_context.h"
TEST(DataTransform, DataLayoutFunction) {
using namespace paddle::framework;
using namespace paddle::platform;
auto place = CPUPlace();
Tensor in = Tensor();
Tensor out = Tensor();
in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
in.set_layout(DataLayout::kNHWC);
auto kernel_nhwc = OpKernelType(proto::VarType::FP32, place,
DataLayout::kNHWC, LibraryType::kPlain);
auto kernel_ncwh = OpKernelType(proto::VarType::FP32, place,
DataLayout::kNCHW, LibraryType::kPlain);
TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
auto place = paddle::platform::CPUPlace();
paddle::framework::Tensor in = paddle::framework::Tensor();
paddle::framework::Tensor out = paddle::framework::Tensor();
in.mutable_data<double>(paddle::framework::make_ddim({2, 3, 1, 2}), place);
in.set_layout(paddle::framework::DataLayout::kNHWC);
auto kernel_nhwc = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP32, place,
paddle::framework::DataLayout::kNHWC,
paddle::framework::LibraryType::kPlain);
auto kernel_ncwh = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP32, place,
paddle::framework::DataLayout::kNCHW,
paddle::framework::LibraryType::kPlain);
paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW);
EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1}));
TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
EXPECT_TRUE(in.layout() == DataLayout::kNHWC);
EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2}));
EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
}
......@@ -17,43 +17,58 @@ limitations under the License. */
#include "gtest/gtest.h"
TEST(DataTypeTransform, CPUTransform) {
using namespace paddle::framework;
using namespace paddle::platform;
auto place = CPUPlace();
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto place = paddle::platform::CPUPlace();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_fp32 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP32, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_fp64 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP64, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_int32 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::INT32, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_int64 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::INT64, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_bool = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::BOOL, place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor out;
paddle::framework::Tensor in;
paddle::framework::Tensor out;
float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
float* ptr =
in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i / 3;
}
TransDataType(kernel_fp32, kernel_fp64, in, &out);
paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
}
TransDataType(kernel_fp32, kernel_int32, in, &out);
paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
......@@ -62,10 +77,11 @@ TEST(DataTypeTransform, CPUTransform) {
// data type transform from/to float16
{
Tensor in;
Tensor out;
paddle::framework::Tensor in;
paddle::framework::Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
paddle::framework::make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
......@@ -73,94 +89,104 @@ TEST(DataTypeTransform, CPUTransform) {
}
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, &out);
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out);
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out);
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
float* in_data_float =
in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
double* in_data_double =
in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
int* in_data_int =
in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
int64_t* in_data_int64 =
in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
bool* in_data_bool =
in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<float16>();
paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_bool[i]).x);
}
}
}
......@@ -18,42 +18,58 @@ limitations under the License. */
#include "gtest/gtest.h"
TEST(DataTypeTransform, GPUTransform) {
using namespace paddle::framework;
using namespace paddle::platform;
auto cpu_place = CPUPlace();
auto gpu_place = CUDAPlace(0);
CUDADeviceContext context(gpu_place);
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_fp32 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP32, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_fp64 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP64, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_int32 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::INT32, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_int64 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::INT64, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
auto kernel_bool = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::BOOL, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
paddle::framework::LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
paddle::framework::Tensor in;
paddle::framework::Tensor in_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
float* in_ptr =
in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(in_ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
......@@ -61,8 +77,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
}
TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp32, kernel_int32, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
......@@ -73,22 +90,27 @@ TEST(DataTypeTransform, GPUTransform) {
// data type transform from/to float16
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
float16 arr[6] = {float16(0), float16(1), float16(2),
float16(3), float16(4), float16(5)};
paddle::framework::Tensor in;
paddle::framework::Tensor in_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
paddle::framework::make_ddim({2, 3}), cpu_place);
paddle::platform::float16 arr[6] = {
paddle::platform::float16(0), paddle::platform::float16(1),
paddle::platform::float16(2), paddle::platform::float16(3),
paddle::platform::float16(4), paddle::platform::float16(5)};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
float* out_data_float = out.data<float>();
......@@ -96,8 +118,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
......@@ -105,8 +128,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_int32, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
......@@ -114,8 +138,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_int64, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int64_t* out_data_int64 = out.data<int64_t>();
......@@ -123,8 +148,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp16, kernel_bool, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
bool* out_data_bool = out.data<bool>();
......@@ -133,90 +159,103 @@ TEST(DataTypeTransform, GPUTransform) {
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
float* in_data_float =
in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double =
in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
double* in_data_double = in.mutable_data<double>(
paddle::framework::make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
int* in_data_int =
in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_int32, kernel_fp16, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 =
in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
int64_t* in_data_int64 = in.mutable_data<int64_t>(
paddle::framework::make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_int64, kernel_fp16, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
bool* in_data_bool =
in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
paddle::framework::TransDataType(kernel_bool, kernel_fp16, in_gpu,
&out_gpu);
paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
ptr = out.data<paddle::platform::float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
EXPECT_EQ(ptr[i].x,
static_cast<paddle::platform::float16>(in_data_bool[i]).x);
}
}
}
......@@ -8,27 +8,28 @@ cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope plac
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
if(WITH_GPU)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
else()
set(multi_devices_graph_builder_deps)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
endif()
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base variable_visitor scope ddim memory)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope variable_visitor ddim memory)
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
......
......@@ -44,9 +44,15 @@ void BroadcastOpHandle::RunImpl() {
// &in_place;
WaitInputVarGenerated(*in_var_handle);
auto *in_var = local_scopes_.at(in_var_handle->scope_idx_)
->FindVar(in_var_handle->name_);
std::vector<const Scope *> var_scopes;
for (auto *s : local_scopes_) {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
auto *in_var =
var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
PADDLE_ENFORCE_NOT_NULL(in_var);
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
for (auto *out : out_var_handles) {
......@@ -55,17 +61,16 @@ void BroadcastOpHandle::RunImpl() {
}
auto &out_p = out->place_;
auto *out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
"Places must be all on CPU or all on CUDA.");
VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
VariableVisitor::GetMutableTensor(out_var)
.Resize(in_tensor.dims())
.mutable_data(out_p, in_tensor.type());
VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
in_tensor.type());
auto dev_ctx = dev_ctxes_[out_p];
auto dev_ctx = dev_ctxes_.at(out_p);
RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
paddle::framework::TensorCopy(
in_tensor, out_p, *(dev_ctx),
......
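The variable lookup above goes through a per-place execution scope rather than the top-level local scope. A minimal setup sketch of that pattern, mirroring the test fixtures later in this commit (the g_scope, places, and "out" names are illustrative assumptions):
// Each top-level local scope publishes its per-run execution scope under
// details::kLocalExecScopeName; op handles then resolve variables there.
Scope g_scope;
std::vector<Scope *> local_scopes;
for (size_t i = 0; i < places.size(); ++i) {  // `places` is assumed to list the run places
  local_scopes.push_back(&g_scope.NewScope());
  Scope &exec_scope = local_scopes.back()->NewScope();
  *local_scopes.back()
       ->Var(details::kLocalExecScopeName)
       ->GetMutable<Scope *>() = &exec_scope;
  exec_scope.Var("out");  // variables the handles read/write live here
}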
......@@ -30,6 +30,7 @@ const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
......@@ -72,11 +73,17 @@ struct TestBroadcastOpHandle {
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("out");
param_scopes_.emplace_back(&local_scope);
}
local_scopes_[input_scope_idx]->Var("input");
param_scopes_[input_scope_idx]->Var("input");
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
auto* in_var_handle =
new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
......@@ -105,7 +112,8 @@ struct TestBroadcastOpHandle {
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -117,6 +125,7 @@ struct TestBroadcastOpHandle {
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
in_lod_tensor->Resize(kDims);
op_handle_->Run(false);
......@@ -124,7 +133,8 @@ struct TestBroadcastOpHandle {
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
......@@ -139,7 +149,8 @@ struct TestBroadcastOpHandle {
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -162,7 +173,8 @@ struct TestBroadcastOpHandle {
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = local_scopes_[j]->Var("out");
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
......
......@@ -14,7 +14,7 @@
#pragma once
#include <memory>
#include <thread>
#include <thread> // NOLINT
namespace paddle {
namespace framework {
......@@ -23,7 +23,7 @@ namespace details {
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
public:
ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
......
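The constructor is now explicit, so the flag can no longer be created through an implicit bool conversion. A two-line illustration (hypothetical call sites, not part of the commit):
ThreadUnsafeOwnershipFlags owns_resource(true);      // still compiles
// ThreadUnsafeOwnershipFlags owns_resource = true;  // rejected: constructor is explicit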
......@@ -41,14 +41,19 @@ void GatherOpHandle::RunImpl() {
out_var_handle = out_var_handles.front();
}
std::vector<const Scope *> var_scopes;
for (auto *s : local_scopes_) {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
auto in_0_handle = in_var_handles[0];
auto pre_in_var =
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
auto pre_place = in_0_handle->place_;
var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
auto pre_place = in_0_handle->place_;
PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(),
"The place of input and output should be the same.");
......@@ -67,7 +72,7 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
auto *in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>();
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
......@@ -86,7 +91,7 @@ void GatherOpHandle::RunImpl() {
// write the output
auto &out_place = out_var_handle->place_;
auto out_scope_idx = out_var_handle->scope_idx_;
auto out_var = local_scopes_[out_scope_idx]->FindVar(out_var_handle->name_);
auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_);
auto out = out_var->GetMutable<framework::SelectedRows>();
out->set_height(pre_in.height());
......
......@@ -29,6 +29,7 @@ const f::DDim kDims = {20, 20};
struct TestGatherOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
......@@ -71,9 +72,14 @@ struct TestGatherOpHandle {
void InitGatherOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("input");
param_scopes_.emplace_back(&local_scope);
}
local_scopes_[input_scope_idx]->Var("input");
param_scopes_[input_scope_idx]->Var("out");
op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
// add input
......@@ -115,7 +121,8 @@ struct TestGatherOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -128,10 +135,11 @@ struct TestGatherOpHandle {
value->Resize(kDims);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
out_selected_rows->mutable_value()->ShareDataWith(
......@@ -155,7 +163,8 @@ struct TestGatherOpHandle {
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(kDims); ++j) {
for (int64_t j = 0;
j < f::product(kDims) * static_cast<int64_t>(gpu_list_.size()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}
......
......@@ -43,21 +43,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
int dtype = -1;
size_t numel = 0;
std::vector<LoDTensor> lod_tensors;
std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
lod_tensors.emplace_back(&lod_tensor);
}
if (platform::is_gpu_place(lod_tensors[0].place())) {
if (platform::is_gpu_place(lod_tensors[0]->place())) {
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = lod_tensors[i];
auto &lod_tensor = *lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
......@@ -93,7 +93,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope =
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h"
......
......@@ -24,23 +24,23 @@ namespace framework {
namespace details {
struct ReduceLoDTensor {
const std::vector<LoDTensor> &src_tensors_;
const std::vector<const LoDTensor *> &src_tensors_;
LoDTensor &dst_tensor_;
ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
ReduceLoDTensor(const std::vector<const LoDTensor *> &src, LoDTensor *dst)
: src_tensors_(src), dst_tensor_(*dst) {}
template <typename T>
void operator()() const {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = src_tensors_[0];
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
for (size_t i = 1; i < src_tensors_.size(); ++i) {
auto &t = src_tensors_[i];
auto &t = *src_tensors_[i];
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
......
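ReduceLoDTensor now holds `const LoDTensor *` rather than tensor copies, so callers collect pointers and dispatch on the element type. A minimal usage sketch matching the call sites in this diff (`collected_tensors` stands in for tensors already looked up from the scopes):
// Sum same-shaped CPU LoDTensors into `dst` (illustrative).
std::vector<const LoDTensor *> gathered;
for (const LoDTensor &t : collected_tensors) {  // assumed container of inputs
  gathered.emplace_back(&t);
}
LoDTensor dst;
ReduceLoDTensor func(gathered, &dst);
VisitDataType(ToDataType(gathered[0]->type()), func);  // runs operator()<T>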
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace paddle {
namespace framework {
......@@ -21,85 +23,84 @@ namespace details {
void ReduceOpHandle::RunImpl() {
// the input and output may have dummy var.
std::vector<VarHandle *> in_var_handles = GetValidVarHandles(inputs_);
std::vector<VarHandle *> out_var_handles = GetValidVarHandles(outputs_);
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
// Wait input done, this Wait is asynchronous operation
WaitEvents(in_var_handles);
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
out_var_handle = out_var_handles.front();
}
// check in the same place
auto in_0_handle = in_var_handles[0];
auto pre_place = in_0_handle->place_;
std::vector<const Scope *> var_scopes;
for (auto *s : local_scopes_) {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
// Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated(in_var_handles);
auto pre_place = in_0_handle->place_;
std::vector<platform::Place> in_places;
auto pre_in_tensor = VariableVisitor::GetMutableTensor(pre_in_var);
for (auto *in_handle : in_var_handles) {
auto in_p = in_handle->place_;
PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
"Places must be all on CPU or all on CUDA.");
in_places.emplace_back(in_p);
}
auto out_var = local_scopes_[out_var_handles[0]->scope_idx_]->FindVar(
out_var_handles[0]->name_);
auto in_var =
var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
PADDLE_ENFORCE_NOT_NULL(in_var);
auto pre_in_var =
local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
if (pre_in_var->IsType<framework::SelectedRows>()) {
auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
std::vector<const SelectedRows *> in_selected_rows;
auto in_tensor = VariableVisitor::GetMutableTensor(in_var);
PADDLE_ENFORCE_EQ(in_tensor.type(), pre_in_tensor.type(),
"The type of input is not consistent.");
}
for (auto *in_handle : in_var_handles) {
auto in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::SelectedRows>();
auto out_var =
var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
"The type of input is not consistent.");
if (pre_in_var->IsType<framework::SelectedRows>()) {
std::vector<const SelectedRows *> in_selected_rows =
GetInputValues<SelectedRows>(in_var_handles, var_scopes);
in_selected_rows.emplace_back(&in_sr);
}
auto trg = out_var->GetMutable<framework::SelectedRows>();
GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_,
out_var_handles[0]->place_, trg);
out_var_handle->place_,
out_var->GetMutable<framework::SelectedRows>());
} else {
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
std::vector<LoDTensor> lod_tensors;
// can be refined
for (auto *in_handle : in_var_handles) {
auto in_var =
local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
auto &in_sr = in_var->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(in_sr.type(), pre_in.type(),
"The type of input is not consistent.");
lod_tensors.emplace_back(in_sr);
}
auto trg = out_var->GetMutable<framework::LoDTensor>();
trg->Resize(pre_in.dims());
trg->mutable_data(out_var_handles[0]->place_, pre_in.type());
std::vector<const LoDTensor *> lod_tensors =
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
if (paddle::platform::is_cpu_place(pre_place)) {
ReduceLoDTensor func(lod_tensors, trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
} else if (paddle::platform::is_gpu_place(pre_place)) {
#ifdef PADDLE_WITH_CUDA
auto out_p = out_var_handles[0]->place_;
int root = boost::get<platform::CUDAPlace>(out_p).device;
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
VariableVisitor::GetMutableTensor(out_var).mutable_data(
out_var_handle->place_, pre_in.type());
auto out_p = out_var_handle->place_;
int root = boost::get<platform::CUDAPlace>(out_p).device;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i];
auto &lod_tensor = lod_tensors[i];
auto &lod_tensor = *lod_tensors[i];
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
......@@ -109,14 +110,16 @@ void ReduceOpHandle::RunImpl() {
void *buffer = const_cast<void *>(lod_tensor.data<void>());
void *recvbuffer = nullptr;
if (root == dev_id) {
recvbuffer = trg->mutable_data(out_var_handles[0]->place_);
recvbuffer =
out_var->GetMutable<framework::LoDTensor>()->mutable_data(
out_var_handle->place_);
}
int type = platform::ToNCCLDataType(lod_tensor.type());
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclReduce(
buffer, recvbuffer, static_cast<size_t>(lod_tensor.numel()),
platform::ToNCCLDataType(lod_tensor.type()), ncclSum, root, comm,
stream));
static_cast<ncclDataType_t>(type), ncclSum, root, comm, stream));
});
}
......@@ -135,26 +138,31 @@ void ReduceOpHandle::RunImpl() {
}
}
void ReduceOpHandle::WaitEvents(
const std::vector<VarHandle *> &in_var_handles) {
if (in_var_handles[0]->generated_op_) {
for (auto *in : in_var_handles) {
in_var_handles[0]->generated_op_->Wait(dev_ctxes_[in->place_]);
}
template <typename T>
std::vector<const T *> ReduceOpHandle::GetInputValues(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<const Scope *> &var_scopes) const {
std::vector<const T *> in_selected_rows;
for (auto *in_handle : in_var_handles) {
auto &in_sr = var_scopes.at(in_handle->scope_idx_)
->FindVar(in_handle->name_)
->Get<T>();
in_selected_rows.emplace_back(&in_sr);
}
return in_selected_rows;
}
std::vector<VarHandle *> ReduceOpHandle::GetValidVarHandles(
const std::vector<VarHandleBase *> &inputs) {
std::vector<VarHandle *> in_var_handles;
for (auto *in : inputs) {
auto *in_handle = dynamic_cast<VarHandle *>(in);
if (in_handle) {
in_var_handles.push_back(in_handle);
void ReduceOpHandle::WaitInputVarGenerated(
const std::vector<VarHandle *> &in_var_handles) {
for (auto *in : in_var_handles) {
if (in->generated_op_) {
for (auto pair : dev_ctxes_) {
in->generated_op_->Wait(pair.second);
}
}
}
return in_var_handles;
}
std::string ReduceOpHandle::Name() const { return "reduce"; }
} // namespace details
} // namespace framework
......
......@@ -59,10 +59,13 @@ struct ReduceOpHandle : public OpHandleBase {
protected:
void RunImpl() override;
std::vector<VarHandle *> GetValidVarHandles(
const std::vector<VarHandleBase *> &inputs);
void WaitEvents(const std::vector<VarHandle *> &in_var_handles);
void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
template <typename T>
std::vector<const T *> GetInputValues(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<const Scope *> &var_scopes) const;
};
} // namespace details
......
......@@ -14,7 +14,6 @@
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......@@ -30,6 +29,7 @@ struct TestReduceOpHandle {
bool use_gpu_;
Scope g_scope_;
std::vector<Scope *> local_scopes_;
std::vector<Scope *> param_scopes_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
......@@ -83,12 +83,18 @@ struct TestReduceOpHandle {
}
}
void InitReduceOp(size_t input_scope_idx) {
void InitReduceOp(size_t out_scope_idx) {
// init scope
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
local_scopes_[j]->Var("out");
Scope &local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope *>() = &local_scope;
local_scope.Var("input");
param_scopes_.emplace_back(&local_scope);
}
local_scopes_[input_scope_idx]->Var("input");
param_scopes_[out_scope_idx]->Var("out");
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
......@@ -106,6 +112,7 @@ struct TestReduceOpHandle {
#endif
}
// init op handle
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
if (!use_gpu_) {
......@@ -126,7 +133,7 @@ struct TestReduceOpHandle {
// add output
auto *out_var_handle =
new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
......@@ -148,7 +155,8 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -161,10 +169,11 @@ struct TestReduceOpHandle {
value->Resize(kDims);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
out_selected_rows->mutable_value()->ShareDataWith(
......@@ -202,7 +211,8 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = local_scopes_[input_scope_idx]->Var("input");
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod);
......@@ -211,10 +221,11 @@ struct TestReduceOpHandle {
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
}
auto out_var = local_scopes_[output_scope_idx]->Var("out");
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = local_scopes_[output_scope_idx]->Var("input");
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
auto in_lodtensor = in_var->Get<f::LoDTensor>();
out_lodtensor->ShareDataWith(in_lodtensor);
......@@ -239,34 +250,34 @@ struct TestReduceOpHandle {
TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceSelectedRows(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceSelectedRows(out_scope_idx);
}
TEST(ReduceTester, TestCPUReduceTestLodTensor) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(false);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceLodTensors(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceLodTensors(out_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceSelectedRows(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceSelectedRows(out_scope_idx);
}
TEST(ReduceTester, TestGPUReduceTestLodTensor) {
TestReduceOpHandle test_op;
size_t input_scope_idx = 0;
size_t out_scope_idx = 0;
test_op.InitCtxOnGpu(true);
test_op.InitReduceOp(input_scope_idx);
test_op.TestReduceLodTensors(input_scope_idx);
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceLodTensors(out_scope_idx);
}
#endif
......
......@@ -202,8 +202,9 @@ class CosineOpComplete : public paddle::framework::CosineOp {
};
TEST(OperatorRegistrar, Test) {
using namespace paddle::framework;
OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
paddle::framework::OperatorRegistrar<
CosineOpComplete, paddle::framework::CosineOpProtoAndCheckerMaker>
reg("cos");
}
namespace paddle {
......
......@@ -226,10 +226,8 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
// test with multi inputs
TEST(OpKernel, multi_inputs) {
using namespace paddle::framework;
paddle::framework::InitDevices(true);
proto::OpDesc op_desc;
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel");
BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
......@@ -243,12 +241,12 @@ TEST(OpKernel, multi_inputs) {
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;
scope.Var("x0")->GetMutable<LoDTensor>();
scope.Var("x1")->GetMutable<LoDTensor>();
scope.Var("x2")->GetMutable<LoDTensor>();
scope.Var("k0")->GetMutable<LoDTensor>();
scope.Var("y0")->GetMutable<LoDTensor>();
scope.Var("y1")->GetMutable<LoDTensor>();
scope.Var("x0")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("x1")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("x2")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("k0")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("y0")->GetMutable<paddle::framework::LoDTensor>();
scope.Var("y1")->GetMutable<paddle::framework::LoDTensor>();
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cpu_place);
......
......@@ -27,10 +27,14 @@ BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
return blocks_.back().get();
}
proto::ProgramDesc *ProgramDesc::Proto() {
void ProgramDesc::Flush() {
for (auto &block : blocks_) {
block->Flush();
}
}
proto::ProgramDesc *ProgramDesc::Proto() {
Flush();
return &desc_;
}
......
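Because Proto() now calls Flush(), edits made through the desc API are reflected when the program is serialized. A small usage sketch (MutableBlock and the op indices are illustrative assumptions):
paddle::framework::ProgramDesc program;
auto *block = program.MutableBlock(0);  // assumed accessor for block 0
// ... append several ops, then drop the first one ...
block->RemoveOp(0, 1);                  // sets need_update_ internally
std::string bytes;
program.Proto()->SerializeToString(&bytes);  // Flush() runs first, so the
                                             // serialized desc omits that op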
......@@ -51,6 +51,8 @@ class ProgramDesc {
size_t Size() const { return blocks_.size(); }
void Flush();
proto::ProgramDesc *Proto();
// The output variable of feed_op is referenced as feed_target.
......
......@@ -15,14 +15,14 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <atomic>
#include "threadpool.h"
#include "paddle/fluid/framework/threadpool.h"
namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) {
fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
}
}
......@@ -46,7 +46,7 @@ TEST(ThreadPool, ConcurrentRun) {
int n = 50;
// sum = (n * (n + 1)) / 2
for (int i = 1; i <= n; ++i) {
std::thread t(do_sum, pool, std::ref(sum), i);
std::thread t(do_sum, pool, &sum, i);
threads.push_back(std::move(t));
}
for (auto& t : threads) {
......
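Passing the accumulator by pointer avoids capturing a dangling reference in the lambdas handed to framework::Async and keeps the decay-copied thread arguments well defined. A minimal call sketch (the singleton accessor name is an assumption):
std::atomic<int> sum(0);
framework::ThreadPool* pool = framework::ThreadPool::GetInstance();  // assumed accessor
std::thread t(do_sum, pool, &sum, 10);  // each scheduled task does sum->fetch_add(1)
t.join();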
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include <algorithm>
#include <fstream>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
......@@ -27,14 +28,14 @@ namespace inference {
// linking the inference shared library.
void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
void ReadBinaryFile(const std::string& filename, std::string& contents) {
void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
fin.seekg(0, std::ios::end);
contents.clear();
contents.resize(fin.tellg());
contents->clear();
contents->resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&contents[0], contents.size());
fin.read(&(contents->at(0)), contents->size());
fin.close();
}
......@@ -47,7 +48,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return false;
}
void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename) {
......@@ -92,18 +93,18 @@ void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
op->CheckAttrs();
}
executor.Run(*load_program, &scope, 0, true, true);
executor->Run(*load_program, scope, 0, true, true);
delete load_program;
}
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
std::string program_desc_str;
VLOG(3) << "loading model from " << model_filename;
ReadBinaryFile(model_filename, program_desc_str);
ReadBinaryFile(model_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
......@@ -113,11 +114,11 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
}
std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor& executor, framework::Scope& scope,
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str);
ReadBinaryFile(model_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
......
......@@ -27,17 +27,17 @@ namespace inference {
void Init(bool init_p2p);
void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope,
const std::string& dirname);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope,
const std::string& prog_filename,
const std::string& param_filename);
......
......@@ -133,12 +133,12 @@ void TestInference(const std::string& dirname,
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = paddle::inference::Load(
executor, *scope, dirname + "/" + prog_filename,
&executor, scope, dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files located in the specified
// `dirname`.
inference_program = paddle::inference::Load(executor, *scope, dirname);
inference_program = paddle::inference::Load(&executor, scope, dirname);
}
}
// Disable the profiler and print the timing information
......
......@@ -163,7 +163,12 @@ function(op_library TARGET)
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation uses macros to register its kernels, so set USE_OP manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
......
......@@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/float16.h"
......
......@@ -10,6 +10,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
......@@ -25,6 +28,16 @@ limitations under the License. */
namespace paddle {
namespace operators {
/* An ugly global variable, used from the Python layer side.
   Please refer to layer_helper.py for the details.
 */
static std::unordered_set<std::string> InplaceOpSet = {
"sigmoid", "exp", "relu", "tanh", "sqrt", "ceil",
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
};
static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
template <typename DeviceContext, typename Functor>
class ActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
......@@ -60,7 +73,6 @@ class ActivationGradKernel
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
auto* Out = context.Input<framework::Tensor>("Out");
auto* dOut =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
......@@ -68,7 +80,6 @@ class ActivationGradKernel
dX->mutable_data<T>(context.GetPlace());
auto dout = framework::EigenVector<T>::Flatten(*dOut);
auto x = framework::EigenVector<T>::Flatten(*X);
auto out = framework::EigenVector<T>::Flatten(*Out);
auto dx = framework::EigenVector<T>::Flatten(*dX);
auto* place =
......@@ -78,7 +89,16 @@ class ActivationGradKernel
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(*place, x, out, dout, dx);
bool inplace = functor.Inplace();
if (!inplace) {
auto* X = context.Input<framework::Tensor>("X");
auto x = framework::EigenVector<T>::Flatten(*X);
functor(*place, x, out, dout, dx);
} else {
VLOG(10) << " Inplace activation ";
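// In-place functors never read x (their gradient is written purely in
// terms of out), so any buffer of the right shape can be bound to the x
// argument; dX is reused here, since "X" may already have been
// overwritten by Out in the forward pass.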
auto x = framework::EigenVector<T>::Flatten(*dX);
functor(*place, x, out, dout, dx);
}
}
};
......@@ -89,6 +109,14 @@ struct BaseActivationFunctor {
using AttrPair = std::vector<std::pair<const char*, float*>>;
AttrPair GetAttrs() { return AttrPair(); }
/* NOTE(*): Out can reuse X's memory if X is not needed by its gradient.
   For example, sigmoid's gradient does not involve x, so its output can
   reuse the input memory. But abs's gradient does use x, so abs cannot
   be computed in place.
 */
bool Inplace() const { return false; }
};
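A gradient functor opts into the in-place path by overriding Inplace() and by writing its gradient purely in terms of out and dout, never x. A minimal sketch of the pattern (the op name "my_act" is hypothetical and would also have to appear in InplaceOpSet; the in-tree SigmoidGradFunctor below follows the same shape):
template <typename T>
struct MyActGradFunctor : public BaseActivationFunctor<T> {
  // Assumes "my_act" has been added to InplaceOpSet.
  bool Inplace() const { return IsInplace("my_act"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    // x is deliberately unused: the gradient depends only on out, which is
    // what makes reusing the X buffer safe.
    dx.device(d) = dout * out * (static_cast<T>(1) - out);
  }
};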
// sigmoid(x) = 1 / (1 + exp(-x))
......@@ -102,6 +130,7 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("sigmoid"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......@@ -156,6 +185,7 @@ struct ExpFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct ExpGradFunctor : public BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("exp"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......@@ -174,10 +204,11 @@ struct ReluFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct ReluGradFunctor : public BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("relu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>();
dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
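// (out > 0) selects exactly the same elements as (x > 0), because relu's
// out equals x where x > 0 and is 0 elsewhere; dropping x here is what
// enables the in-place path.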
}
};
......@@ -192,6 +223,7 @@ struct TanhFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct TanhGradFunctor : public BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("tanh"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......@@ -297,6 +329,7 @@ struct SqrtFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct SqrtGradFunctor : public BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("sqrt"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......@@ -316,10 +349,11 @@ struct CeilFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct ZeroGradFunctor : public BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("ceil"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = static_cast<T>(0) / x;
dx.device(d) = static_cast<T>(0) / out;
}
};
......@@ -432,6 +466,7 @@ struct ReciprocalFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
bool Inplace() const { return IsInplace("reciprocal"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......@@ -531,12 +566,14 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
bool Inplace() const { return IsInplace("relu6"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout *
((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
.template cast<T>();
dx.device(d) =
dout *
((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
.template cast<T>();
}
};
......@@ -611,11 +648,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
bool Inplace() const { return IsInplace("soft_relu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto tmp = static_cast<T>(threshold);
auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
auto temp = ((out > -tmp) * (out < tmp)).template cast<T>().eval();
dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
}
};
......@@ -791,7 +829,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
bool Inplace() const { return IsInplace("hard_sigmoid"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......
......@@ -24,12 +24,34 @@ namespace paddle {
namespace operators {
template <typename T>
__global__ void RandomGenerator(const size_t n, const T* src,
const T* cpu_mask_data, T* mask_data, T* dst) {
__global__ void RandomGenerator(const size_t n, const int seed,
const float dropout_prob, const T* src,
T* mask_data, T* dst) {
thrust::minstd_rand rng;
rng.seed(seed);
thrust::uniform_real_distribution<float> dist(0, 1);
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int step_size = 0;
T mask;
T dest;
for (; idx < n; idx += blockDim.x * gridDim.x) {
mask_data[idx] = cpu_mask_data[idx];
dst[idx] = mask_data[idx] * src[idx];
T s = src[idx];
if (step_size == 0) {
rng.discard(idx);
step_size = blockDim.x * gridDim.x;
} else {
rng.discard(step_size);
}
if (dist(rng) < dropout_prob) {
mask = static_cast<T>(0);
} else {
mask = static_cast<T>(1);
}
dest = s * mask;
mask_data[idx] = mask;
dst[idx] = dest;
}
}
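Each thread seeds the same minstd_rand stream, discards idx positions before its first element and step_size positions between grid-stride iterations, so every element ends up consuming a distinct position of one seeded stream and no host-side mask buffer is needed. A sequential host-side reference of the same idea (an illustrative sketch only, not bit-for-bit identical to the device ordering):
#include <random>
// Hypothetical helper, not part of the patch: generates the dropout mask
// and output the way a single-threaded reference implementation would.
void DropoutMaskReference(size_t n, int seed, float dropout_prob,
                          const float* src, float* mask, float* dst) {
  std::minstd_rand rng(seed);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (size_t i = 0; i < n; ++i) {
    // One draw per element: keep the value, scaled by a 0/1 mask.
    float m = dist(rng) < dropout_prob ? 0.0f : 1.0f;
    mask[i] = m;
    dst[i] = src[i] * m;
  }
}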
......@@ -56,27 +78,15 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
std::random_device rnd;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1);
framework::Vector<T> cpu_mask(size);
for (size_t i = 0; i < size; ++i) {
if (dist(engine) < dropout_prob) {
cpu_mask[i] = static_cast<T>(0);
} else {
cpu_mask[i] = static_cast<T>(1);
}
}
int threads = 512;
int grid = (x->numel() + threads - 1) / threads;
RandomGenerator<
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, x_data, cpu_mask.CUDAData(context.GetPlace()), mask_data,
y_data);
size, seed, dropout_prob, x_data, mask_data, y_data);
} else {
auto X = EigenVector<T>::Flatten(*x);
auto Y = EigenVector<T>::Flatten(*y);
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
}
}
......@@ -89,8 +99,6 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
ops::GPUDropoutKernel<plat::CUDADeviceContext, double>,
ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(dropout_grad,
ops::DropoutGradKernel<plat::CUDADeviceContext, double>,
ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
......@@ -24,7 +24,7 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
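// EigenMatrix<T>::Reshape(t, 1) views t as a 2-D matrix whose rows are the
// first tensor dimension and whose columns collapse all remaining
// dimensions.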
template <typename DeviceContext, typename T>
class CPUDropoutKernel : public framework::OpKernel<T> {
......@@ -60,8 +60,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
}
}
} else {
auto X = EigenVector<T>::Flatten(*x);
auto Y = EigenVector<T>::Flatten(*y);
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
Y.device(place) = X * (1.0f - dropout_prob);
......@@ -81,9 +81,9 @@ class DropoutGradKernel : public framework::OpKernel<T> {
auto* mask = context.Input<Tensor>("Mask");
grad_x->mutable_data<T>(context.GetPlace());
auto M = EigenVector<T>::Flatten(*mask);
auto dX = EigenVector<T>::Flatten(*grad_x);
auto dY = EigenVector<T>::Flatten(*grad_y);
auto M = EigenMatrix<T>::Reshape(*mask, 1);
auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <iostream>
#include <string>
#include <thread> // NOLINT
......@@ -33,16 +32,14 @@ namespace m = paddle::operators::math;
USE_OP(dropout);
static paddle::framework::DDim dims = {10, 10};
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto var = scope->Var("X");
auto tensor = var->GetMutable<f::LoDTensor>();
tensor->Resize(dims);
tensor->Resize({10, 10});
std::vector<float> init;
for (int64_t i = 0; i < f::product(dims); ++i) {
for (int64_t i = 0; i < 10 * 10; ++i) {
init.push_back(1.0);
}
......@@ -51,19 +48,18 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
auto place = ctx.GetPlace();
auto out_var = scope->Var("Out");
auto out_tensor = out_var->GetMutable<f::LoDTensor>();
out_tensor->Resize(dims);
out_tensor->Resize({10, 10});
out_tensor->mutable_data<float>(place); // allocate
auto mask_var = scope->Var("Mask");
auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
mask_tensor->Resize(dims);
mask_tensor->Resize({10, 10});
mask_tensor->mutable_data<float>(place); // allocate
// run
f::AttributeMap attrs;
float dropout_prob = 0.5;
attrs.insert({"is_test", false});
attrs.insert({"fix_seed", true});
attrs.insert({"fix_seed", 1});
attrs.insert({"seed", 3});
attrs.insert({"dropout_prob", dropout_prob});
auto dropout_op = f::OpRegistry::CreateOp(
......@@ -73,7 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
std::vector<float> out_vec;
TensorToVector(*out_tensor, ctx, &out_vec);
ctx.Wait();
std::vector<float> std_out = {
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
......@@ -88,22 +83,22 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
}
}
// TODO(wyi): Due to
// https://github.com/PaddlePaddle/Paddle/issues/9507, this test is
// temporarily disabled so that it does not block the merge of
// unrelated PRs.
/*
TEST(Dropout, CPUDense) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
Compare(&scope, ctx);
Compare(scope, ctx);
}
// TODO(wyi, dzhwinter): Due to
// https://github.com/PaddlePaddle/Paddle/issues/9507, this test is
// temporarily disabled so that it does not block the merge of
// unrelated PRs.
/*
TEST(Dropout, GPUDense) {
f::Scope scope;
p::CUDAPlace place;
p::CUDADeviceContext ctx(place);
Compare(&scope, ctx);
Compare(scope, ctx);
}
*/
......@@ -60,7 +60,7 @@ class MKLDNNActivationGradKernel
}
};
namespace {
namespace { // NOLINT
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper) {
......
......@@ -127,6 +127,7 @@ void BindProgramDesc(pybind11::module *m) {
.def("block", &pd::ProgramDesc::MutableBlock,
pybind11::return_value_policy::reference)
.def("num_blocks", &pd::ProgramDesc::Size)
.def("flush", &pd::ProgramDesc::Flush)
.def("get_feed_target_names", &pd::ProgramDesc::GetFeedTargetNames)
.def("get_fetch_target_names", &pd::ProgramDesc::GetFetchTargetNames)
.def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
......
......@@ -33,6 +33,7 @@ limitations under the License. */
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -461,6 +462,9 @@ All parameter, weight, gradient are variables in Paddle.
self.back().set_lod(t.lod());
});
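// Exposed so that Python's LayerHelper can ask whether an activation may
// write its output in place over its input (see layer_helper.py below).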
m.def("IsInplace",
[](std::string op) -> bool { return operators::IsInplace(op); });
m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA
m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
......
......@@ -155,7 +155,7 @@ EOF
function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
BASE_IMAGE="nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
......@@ -164,7 +164,7 @@ function gen_dockerfile() {
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
cat <<EOF
......
......@@ -336,18 +336,20 @@ def save_inference_model(dirname,
if main_program is None:
main_program = default_main_program()
copy_program = main_program
if not os.path.isdir(dirname):
os.makedirs(dirname)
# Clear the is_target information and remove the existing feed and fetch ops
global_block = main_program.global_block()
global_block = copy_program.global_block()
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "feed" or op.type == "fetch":
global_block.remove_op(i)
copy_program.desc.flush()
pruned_program = main_program.prune(targets=target_vars)
pruned_program = copy_program.prune(targets=target_vars)
inference_program = pruned_program.inference_optimize()
fetch_var_names = [v.name for v in target_vars]
......
......@@ -19,6 +19,7 @@ from framework import Variable, Parameter, default_main_program, default_startup
import unique_name
from paddle.fluid.initializer import Constant, Xavier
from param_attr import ParamAttr, WeightNormParamAttr
import core
class LayerHelper(object):
......@@ -398,13 +399,16 @@ class LayerHelper(object):
return input_var
if isinstance(act, basestring):
act = {'type': act}
tmp = self.create_tmp_variable(dtype=input_var.dtype)
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
act_type = act.pop('type')
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
tmp = input_var
# NOTE(dzhwinter): some activations support inplace computation.
if not core.IsInplace(act_type):
tmp = self.create_tmp_variable(dtype=input_var.dtype)
self.append_op(
type=act_type,
inputs={"X": [input_var]},
......
......@@ -21,7 +21,7 @@ from ..executor import global_scope
__all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'shuffle', 'double_buffer'
'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer'
]
......@@ -290,7 +290,7 @@ def open_recordio_file(filename,
lod_levels,
dtypes,
pass_num=1,
for_parallel=False):
for_parallel=True):
"""
Open a RecordIO file
......@@ -364,7 +364,7 @@ def open_files(filenames,
thread_num,
buffer_size=None,
pass_num=1,
for_parallel=False):
for_parallel=True):
"""
Open files
......@@ -476,6 +476,11 @@ def shuffle(reader, buffer_size):
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
def batch(reader, batch_size):
return __create_unshared_decorated_reader__(
'create_batch_reader', reader, {'batch_size': int(batch_size)})
def double_buffer(reader, place=None):
attrs = dict()
if place is not None:
......
......@@ -361,10 +361,7 @@ class TestCeil(OpTest):
def test_check_output(self):
self.check_output()
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
# The same reason as TestFloor
def init_dtype(self):
pass
......@@ -396,10 +393,8 @@ class TestFloor(OpTest):
def test_check_output(self):
self.check_output()
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
# The gradient of floor, ceil and round is undefined.
# We return zero as the gradient, but numpy returns nan.
def init_dtype(self):
pass
......@@ -501,11 +496,6 @@ class TestRound(OpTest):
def test_check_output(self):
self.check_output()
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
......
......@@ -100,6 +100,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
# (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
# transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
if data_format != "NCHW" and data_format != "NHWC":
raise ValueError("Unknown data order.")
if data_format == "NCHW":
x = np.transpose(x, (0, 2, 3, 1))
y_grad = np.transpose(y_grad, (0, 2, 3, 1))
......@@ -304,7 +307,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
# run backward
y_grad = np.random.random_sample(shape).astype(np.float32)
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
var_dict = locals()
var_dict['y@GRAD'] = y_grad
......
......@@ -69,7 +69,6 @@ class TestMultipleReader(unittest.TestCase):
break
batch_count += 1
self.assertLessEqual(img_val.shape[0], self.batch_size)
data_files.reset()
self.assertEqual(batch_count, self.num_batch * 3)
def test_main(self):
......
......@@ -43,9 +43,8 @@ class TestMultipleReader(unittest.TestCase):
filename='./mnist.recordio',
shapes=[(-1, 784), (-1, 1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
data_file = fluid.layers.io.multi_pass(
reader=data_file, pass_num=self.pass_num)
dtypes=['float32', 'int64'],
pass_num=self.pass_num)
img, label = fluid.layers.read_file(data_file)
if fluid.core.is_compiled_with_cuda():
......@@ -65,5 +64,4 @@ class TestMultipleReader(unittest.TestCase):
break
batch_count += 1
self.assertLessEqual(img_val.shape[0], self.batch_size)
data_file.reset()
self.assertEqual(batch_count, self.num_batch * self.pass_num)
......@@ -74,13 +74,13 @@ class TestRecordIO(unittest.TestCase):
avg_loss_np.append(tmp)
batch_id += 1
data_file.reset()
self.assertEqual(batch_id, self.num_batches)
self.assertLess(avg_loss_np[-1], avg_loss_np[0])
def test_shuffle_reader(self):
self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200))
self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(
reader, buffer_size=200))
def test_double_buffer_reader(self):
self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))