未验证 提交 22bbd547 编写于 作者: Y Yiqun Liu 提交者: GitHub

Add the support of fp16 in fusion_group (#22239)

上级 d97475d5
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <sstream> #include <sstream>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
#include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h" #include "paddle/fluid/framework/ir/fusion_group/operation.h"
namespace paddle { namespace paddle {
...@@ -27,13 +28,14 @@ CodeGenerator::CodeGenerator() { ...@@ -27,13 +28,14 @@ CodeGenerator::CodeGenerator() {
// Only support elementwise operations now. // Only support elementwise operations now.
code_templates_.resize(1); code_templates_.resize(1);
CodeTemplate elementwise_t(elementwise_cuda_template); CodeTemplate elementwise_t(cuda_kernel_template_1d);
code_templates_[0] = elementwise_t; code_templates_[0] = elementwise_t;
} }
std::string CodeGenerator::Generate(SubGraph* subgraph) { std::string CodeGenerator::Generate(SubGraph* subgraph) {
std::vector<OperationExpression> expressions = ConvertToExpressions(subgraph); std::vector<OperationExpression> expressions = ConvertToExpressions(subgraph);
return Generate(subgraph->GetFuncName(), expressions); return Generate(subgraph->GetFuncName(), subgraph->GetDataType(),
expressions);
} }
static bool HasInput(Node* n, std::string name) { static bool HasInput(Node* n, std::string name) {
...@@ -100,9 +102,9 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions( ...@@ -100,9 +102,9 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
// In order to get the right result of expression, we need to calculate and // In order to get the right result of expression, we need to calculate and
// store the expression as suffix Expressions using vector. // store the expression as suffix Expressions using vector.
std::string CodeGenerator::Generate( std::string CodeGenerator::Generate(
std::string func_name, std::vector<OperationExpression> expressions) { std::string func_name, std::string dtype,
const std::vector<OperationExpression>& expressions) {
// TODO(liuyiqun): Check whether all expressions are elementwise operations. // TODO(liuyiqun): Check whether all expressions are elementwise operations.
std::string dtype = "float";
std::set<int> input_ids = DistilInputIds(expressions); std::set<int> input_ids = DistilInputIds(expressions);
std::set<int> output_ids = DistilOutputIds(expressions); std::set<int> output_ids = DistilOutputIds(expressions);
...@@ -111,6 +113,15 @@ std::string CodeGenerator::Generate( ...@@ -111,6 +113,15 @@ std::string CodeGenerator::Generate(
template_var.Add("parameters", EmitParameters(input_ids, output_ids, dtype)); template_var.Add("parameters", EmitParameters(input_ids, output_ids, dtype));
template_var.Add("compute_body", template_var.Add("compute_body",
EmitComputeBody(expressions, input_ids, output_ids, dtype)); EmitComputeBody(expressions, input_ids, output_ids, dtype));
std::string predefined_cuda_functions;
if (dtype == "float") {
predefined_cuda_functions = predefined_cuda_functions_fp32;
} else if (dtype == "double") {
predefined_cuda_functions = predefined_cuda_functions_fp64;
} else if (dtype == "float16") {
predefined_cuda_functions = predefined_cuda_functions_fp16;
}
return predefined_cuda_functions + code_templates_[0].Format(template_var); return predefined_cuda_functions + code_templates_[0].Format(template_var);
} }
...@@ -173,9 +184,10 @@ std::string CodeGenerator::EmitComputeBody( ...@@ -173,9 +184,10 @@ std::string CodeGenerator::EmitComputeBody(
std::string dtype) { std::string dtype) {
std::ostringstream compute; std::ostringstream compute;
std::unordered_set<int> used; std::unordered_set<int> used;
std::string compute_dtype = (dtype == "float16") ? "float" : dtype;
for (size_t i = 0; i < expressions.size(); i++) { for (size_t i = 0; i < expressions.size(); i++) {
VLOG(3) << DebugString(expressions[i]); VLOG(3) << DebugString(expressions[i]);
compute << expressions[i].GetExpression(dtype, &used); compute << expressions[i].GetExpression(compute_dtype, &used);
} }
// Load input to temporal variables. // Load input to temporal variables.
...@@ -183,14 +195,23 @@ std::string CodeGenerator::EmitComputeBody( ...@@ -183,14 +195,23 @@ std::string CodeGenerator::EmitComputeBody(
for (auto id : input_ids) { for (auto id : input_ids) {
if (output_ids.find(id) == output_ids.end() && if (output_ids.find(id) == output_ids.end() &&
used.find(id) != used.end()) { used.find(id) != used.end()) {
load << dtype << " " << TmpName(id) << " = " << ArgName(id) << "[idx];"; if (dtype == "float16") {
load << "float " << TmpName(id) << " = __half2float(" << ArgName(id)
<< "[idx]);";
} else {
load << dtype << " " << TmpName(id) << " = " << ArgName(id) << "[idx];";
}
} }
} }
// Store temporal variables to memory. // Store temporal variables to memory.
std::ostringstream store; std::ostringstream store;
for (auto id : output_ids) { for (auto id : output_ids) {
store << ArgName(id) << "[idx] = " << TmpName(id) << ";"; if (dtype == "float16") {
store << ArgName(id) << "[idx] = __float2half(" << TmpName(id) << ");";
} else {
store << ArgName(id) << "[idx] = " << TmpName(id) << ";";
}
} }
return load.str() + compute.str() + store.str(); return load.str() + compute.str() + store.str();
......
...@@ -30,8 +30,8 @@ class CodeGenerator { ...@@ -30,8 +30,8 @@ class CodeGenerator {
public: public:
CodeGenerator(); CodeGenerator();
std::string Generate(std::string func_name, std::string Generate(std::string func_name, std::string dtype,
std::vector<OperationExpression> expressions); const std::vector<OperationExpression>& expressions);
std::string Generate(SubGraph* subgraph); std::string Generate(SubGraph* subgraph);
......
...@@ -149,31 +149,6 @@ class CodeTemplate { ...@@ -149,31 +149,6 @@ class CodeTemplate {
std::string template_str_; std::string template_str_;
}; };
static const char predefined_cuda_functions[] = R"(
__device__ float real_exp(float x) { return ::expf(x); }
__device__ double real_exp(double x) { return ::exp(x); }
__device__ float real_log(float x) { return ::logf(x); }
__device__ double real_log(double x) { return ::log(x); }
__device__ float real_min(float x, float y) { return ::fminf(x, y); }
__device__ double real_min(double x, double y) { return ::fmin(x, y); }
__device__ float real_max(float x, float y) { return ::fmaxf(x, y); }
__device__ double real_max(double x, double y) { return ::fmax(x, y); }
)";
static const char elementwise_cuda_template[] = R"(
extern "C" __global__ void $func_name($parameters) {
for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += gridDim.x * blockDim.x) {
$compute_body
}
}
)";
static std::string DebugString(const OperationExpression& expr) { static std::string DebugString(const OperationExpression& expr) {
std::stringstream ret; std::stringstream ret;
ret << "Op(" << expr.GetOpType() << "), inputs:{"; ret << "Op(" << expr.GetOpType() << "), inputs:{";
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -88,7 +89,8 @@ inline float elementwise_mul_grad_dy(float x, float y, float out, float dout) { ...@@ -88,7 +89,8 @@ inline float elementwise_mul_grad_dy(float x, float y, float out, float dout) {
void CheckOutput(const std::vector<OperationExpression>& expressions, void CheckOutput(const std::vector<OperationExpression>& expressions,
const std::vector<LoDTensor> cpu_tensors, const std::vector<LoDTensor> cpu_tensors,
const std::vector<int> input_ids_of_subgraph, const std::vector<int> input_ids_of_subgraph,
const std::vector<int> output_ids_of_subgraph, int i) { const std::vector<int> output_ids_of_subgraph, int i,
float eps) {
std::vector<float> var(cpu_tensors.size()); std::vector<float> var(cpu_tensors.size());
for (auto id : input_ids_of_subgraph) { for (auto id : input_ids_of_subgraph) {
if (id >= 0) { if (id >= 0) {
...@@ -138,7 +140,12 @@ void CheckOutput(const std::vector<OperationExpression>& expressions, ...@@ -138,7 +140,12 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
for (auto id : output_ids_of_subgraph) { for (auto id : output_ids_of_subgraph) {
float actual = cpu_tensors[id].data<float>()[i]; float actual = cpu_tensors[id].data<float>()[i];
float expect = var[id]; float expect = var[id];
EXPECT_LT(fabs(actual - expect), 1.E-05); if (fabs(actual - expect) > eps) {
LOG(INFO) << "Precision check failed from i = " << id
<< ", expect: " << expect << ", actual: " << actual;
EXPECT_LT(fabs(actual - expect), eps);
break;
}
} }
} }
...@@ -162,33 +169,49 @@ void SetupRandomCPUTensor(LoDTensor* tensor) { ...@@ -162,33 +169,49 @@ void SetupRandomCPUTensor(LoDTensor* tensor) {
namespace fusion_group = paddle::framework::ir::fusion_group; namespace fusion_group = paddle::framework::ir::fusion_group;
template <typename T>
void TestMainImpl(std::string func_name, std::string code_str, void TestMainImpl(std::string func_name, std::string code_str,
std::vector<paddle::framework::LoDTensor> cpu_tensors, int n, std::vector<paddle::framework::LoDTensor> cpu_tensors, int n,
std::vector<int> input_ids, std::vector<int> output_ids) { std::vector<int> input_ids, std::vector<int> output_ids) {
bool is_float16 = std::type_index(typeid(T)) ==
std::type_index(typeid(paddle::platform::float16));
paddle::framework::InitDevices(false, {0}); paddle::framework::InitDevices(false, {0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0); paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode device_code(place, func_name, code_str); paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
device_code.Compile(); device_code.Compile(is_float16);
std::vector<paddle::framework::LoDTensor> gpu_tensors(cpu_tensors.size()); std::vector<paddle::framework::LoDTensor> gpu_tensors(cpu_tensors.size());
std::vector<paddle::framework::LoDTensor> tmp_cpu_tensors(cpu_tensors.size());
std::vector<float*> gpu_ptrs(gpu_tensors.size()); std::vector<T*> gpu_ptrs(gpu_tensors.size());
std::vector<void*> args; std::vector<void*> args;
args.push_back(&n); args.push_back(&n);
for (auto id : input_ids) { for (auto id : input_ids) {
if (id >= 0) { if (id >= 0) {
gpu_ptrs[id] = gpu_ptrs[id] =
gpu_tensors[id].mutable_data<float>(cpu_tensors[id].dims(), place); gpu_tensors[id].mutable_data<T>(cpu_tensors[id].dims(), place);
fusion_group::SetupRandomCPUTensor<float>(&cpu_tensors[id]); fusion_group::SetupRandomCPUTensor<float>(&cpu_tensors[id]);
TensorCopySync(cpu_tensors[id], place, &gpu_tensors[id]); if (is_float16) {
paddle::platform::float16* tmp_cpu_ptr =
tmp_cpu_tensors[id].mutable_data<paddle::platform::float16>(
cpu_tensors[id].dims(), paddle::platform::CPUPlace());
const float* cpu_ptr = cpu_tensors[id].data<float>();
for (int64_t i = 0; i < cpu_tensors[id].numel(); ++i) {
tmp_cpu_ptr[i] = paddle::platform::float16(cpu_ptr[i]);
}
TensorCopySync(tmp_cpu_tensors[id], place, &gpu_tensors[id]);
} else {
TensorCopySync(cpu_tensors[id], place, &gpu_tensors[id]);
}
args.push_back(&gpu_ptrs[id]); args.push_back(&gpu_ptrs[id]);
} }
} }
for (auto id : output_ids) { for (auto id : output_ids) {
gpu_ptrs[id] = gpu_ptrs[id] =
gpu_tensors[id].mutable_data<float>(cpu_tensors[id].dims(), place); gpu_tensors[id].mutable_data<T>(cpu_tensors[id].dims(), place);
args.push_back(&gpu_ptrs[id]); args.push_back(&gpu_ptrs[id]);
} }
...@@ -200,38 +223,93 @@ void TestMainImpl(std::string func_name, std::string code_str, ...@@ -200,38 +223,93 @@ void TestMainImpl(std::string func_name, std::string code_str,
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
dev_ctx->Wait(); dev_ctx->Wait();
// Copy the results back to CPU.
for (auto id : output_ids) { for (auto id : output_ids) {
TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(), if (is_float16) {
&cpu_tensors[id]); paddle::platform::float16* tmp_cpu_ptr =
tmp_cpu_tensors[id].mutable_data<paddle::platform::float16>(
cpu_tensors[id].dims(), paddle::platform::CPUPlace());
TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
&tmp_cpu_tensors[id]);
float* cpu_ptr = cpu_tensors[id].mutable_data<float>(
cpu_tensors[id].dims(), paddle::platform::CPUPlace());
for (int64_t i = 0; i < cpu_tensors[id].numel(); ++i) {
cpu_ptr[i] = static_cast<float>(tmp_cpu_ptr[i]);
}
} else {
TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
&cpu_tensors[id]);
}
}
}
void TestElementwiseMain(
std::string func_name, std::string code_str,
std::vector<fusion_group::OperationExpression> expressions,
std::vector<int> input_ids, std::vector<int> output_ids,
std::string dtype) {
std::unordered_set<int> ids;
for (auto id : input_ids) {
ids.insert(id);
}
for (auto id : output_ids) {
ids.insert(id);
}
// Prepare CPU tensors which always hold float.
std::vector<paddle::framework::LoDTensor> cpu_tensors(ids.size());
auto dims = paddle::framework::make_ddim(
{static_cast<int64_t>(256), static_cast<int64_t>(1024)});
for (size_t i = 0; i < cpu_tensors.size(); ++i) {
cpu_tensors[i].mutable_data<float>(dims, paddle::platform::CPUPlace());
}
int n = cpu_tensors[0].numel();
if (dtype == "float16") {
TestMainImpl<paddle::platform::float16>(func_name, code_str, cpu_tensors, n,
input_ids, output_ids);
} else {
TestMainImpl<float>(func_name, code_str, cpu_tensors, n, input_ids,
output_ids);
}
// Check the results
float eps = (dtype == "float16") ? 1E-2 : 1E-5;
for (int i = 0; i < n; i++) {
fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
i, eps);
} }
} }
void TestMain(std::string func_name, void TestMain(std::string func_name,
std::vector<fusion_group::OperationExpression> expressions, std::vector<fusion_group::OperationExpression> expressions,
std::vector<paddle::framework::LoDTensor> cpu_tensors, int n, std::vector<int> input_ids, std::vector<int> output_ids,
std::vector<int> input_ids, std::vector<int> output_ids) { std::string dtype) {
fusion_group::OperationMap::Init(); fusion_group::OperationMap::Init();
fusion_group::CodeGenerator code_generator; fusion_group::CodeGenerator code_generator;
std::string code_str = code_generator.Generate(func_name, expressions); std::string code_str = code_generator.Generate(func_name, dtype, expressions);
VLOG(3) << code_str; VLOG(3) << code_str;
TestMainImpl(func_name, code_str, cpu_tensors, n, input_ids, output_ids); LOG(INFO) << "dtype: " << dtype;
TestElementwiseMain(func_name, code_str, expressions, input_ids, output_ids,
dtype);
} }
std::vector<fusion_group::OperationExpression> TestMain( void TestMain(fusion_group::SubGraph* subgraph, std::vector<int> input_ids,
fusion_group::SubGraph* subgraph, std::vector<int> output_ids) {
std::vector<paddle::framework::LoDTensor> cpu_tensors, int n,
std::vector<int> input_ids, std::vector<int> output_ids) {
fusion_group::OperationMap::Init(); fusion_group::OperationMap::Init();
fusion_group::CodeGenerator code_generator; fusion_group::CodeGenerator code_generator;
std::string code_str = code_generator.Generate(subgraph); std::string code_str = code_generator.Generate(subgraph);
VLOG(3) << code_str; VLOG(3) << code_str;
TestMainImpl(subgraph->GetFuncName(), code_str, cpu_tensors, n, input_ids,
output_ids);
// Need to check the accuracy according to expressions. // Need to check the accuracy according to expressions.
return code_generator.ConvertToExpressions(subgraph); std::vector<fusion_group::OperationExpression> expressions =
code_generator.ConvertToExpressions(subgraph);
LOG(INFO) << "dtype: " << subgraph->GetDataType();
TestElementwiseMain(subgraph->GetFuncName(), code_str, expressions, input_ids,
output_ids, subgraph->GetDataType());
} }
TEST(code_generator, elementwise) { TEST(code_generator, elementwise) {
...@@ -248,30 +326,16 @@ TEST(code_generator, elementwise) { ...@@ -248,30 +326,16 @@ TEST(code_generator, elementwise) {
std::vector<fusion_group::OperationExpression> expressions = { std::vector<fusion_group::OperationExpression> expressions = {
exp1, exp2, exp3, exp4, exp5}; exp1, exp2, exp3, exp4, exp5};
// Prepare CPU tensors for (std::string dtype : {"float", "float16"}) {
std::vector<paddle::framework::LoDTensor> cpu_tensors(9); // Expressions:
auto dims = paddle::framework::make_ddim( // Op(elementwise_mul), inputs:{0,1}, outputs:{2}
{static_cast<int64_t>(256), static_cast<int64_t>(1024)}); // Op(elementwise_add), inputs:{2,3}, outputs:{4}
for (size_t i = 0; i < cpu_tensors.size(); ++i) { // Op(elementwise_sub), inputs:{4,5}, outputs:{6}
cpu_tensors[i].mutable_data<float>(dims, paddle::platform::CPUPlace()); // Op(relu), inputs:{6}, outputs:{7}
} // Op(sigmoid), inputs:{7}, outputs:{8}
std::vector<int> input_ids = {0, 1, 3, 5};
// Expressions: std::vector<int> output_ids = {2, 4, 6, 7, 8};
// Op(elementwise_mul), inputs:{0,1}, outputs:{2} TestMain("elementwise_kernel_0", expressions, input_ids, output_ids, dtype);
// Op(elementwise_add), inputs:{2,3}, outputs:{4}
// Op(elementwise_sub), inputs:{4,5}, outputs:{6}
// Op(relu), inputs:{6}, outputs:{7}
// Op(sigmoid), inputs:{7}, outputs:{8}
int n = cpu_tensors[0].numel();
std::vector<int> input_ids = {0, 1, 3, 5};
std::vector<int> output_ids = {2, 4, 6, 7, 8};
TestMain("elementwise_kernel_0", expressions, cpu_tensors, n, input_ids,
output_ids);
// Check the results
for (int i = 0; i < n; i++) {
fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
i);
} }
} }
...@@ -286,32 +350,19 @@ TEST(code_generator, elementwise_grad) { ...@@ -286,32 +350,19 @@ TEST(code_generator, elementwise_grad) {
{4, 5}); {4, 5});
std::vector<fusion_group::OperationExpression> expressions = {exp1, exp2}; std::vector<fusion_group::OperationExpression> expressions = {exp1, exp2};
// Prepare CPU tensors for (std::string dtype : {"float", "float16"}) {
std::vector<paddle::framework::LoDTensor> cpu_tensors(8); // Expressions:
auto dims = paddle::framework::make_ddim( // Op(relu_grad), inputs:{2,3,7}, outputs:{6}
{static_cast<int64_t>(256), static_cast<int64_t>(1024)}); // Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5}
for (size_t i = 0; i < cpu_tensors.size(); ++i) { std::vector<int> input_ids = {0, 1, 2, 3, 7};
cpu_tensors[i].mutable_data<float>(dims, paddle::platform::CPUPlace()); std::vector<int> output_ids = {4, 5, 6};
} TestMain("elementwise_grad_kernel_0", expressions, input_ids, output_ids,
dtype);
// Expressions:
// Op(relu_grad), inputs:{2,3,7}, outputs:{6}
// Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5}
int n = cpu_tensors[0].numel();
std::vector<int> input_ids = {0, 1, 2, 3, 7};
std::vector<int> output_ids = {4, 5, 6};
TestMain("elementwise_grad_kernel_0", expressions, cpu_tensors, n, input_ids,
output_ids);
// Check the results
for (int i = 0; i < n; i++) {
fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
i);
} }
} }
std::unique_ptr<paddle::framework::ir::Graph> BuildGraph( std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(bool backward,
bool backward = false) { std::string dtype) {
// inputs operator output // inputs operator output
// -------------------------------------------------------- // --------------------------------------------------------
// x0 sigmoid -> tmp_0 // x0 sigmoid -> tmp_0
...@@ -353,6 +404,14 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph( ...@@ -353,6 +404,14 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(
std::unique_ptr<paddle::framework::ir::Graph> graph( std::unique_ptr<paddle::framework::ir::Graph> graph(
new paddle::framework::ir::Graph(layers.main_program())); new paddle::framework::ir::Graph(layers.main_program()));
auto proto_dtype = (dtype == "float16")
? paddle::framework::proto::VarType::FP16
: paddle::framework::proto::VarType::FP32;
for (auto* n : graph->Nodes()) {
if (n && n->IsVar() && n->Var()) {
n->Var()->SetDataType(proto_dtype);
}
}
#ifdef __clang__ #ifdef __clang__
return graph; return graph;
#else #else
...@@ -401,66 +460,40 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes( ...@@ -401,66 +460,40 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
} }
TEST(code_generator, subgraph) { TEST(code_generator, subgraph) {
std::unique_ptr<paddle::framework::ir::Graph> graph = BuildGraph(false); for (std::string dtype : {"float", "float16"}) {
fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true, std::unique_ptr<paddle::framework::ir::Graph> graph =
graph->Nodes()); BuildGraph(false, dtype);
fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
// Prepare CPU tensors graph->Nodes());
std::vector<paddle::framework::LoDTensor> cpu_tensors(9);
auto dims = paddle::framework::make_ddim( // Expressions generated by code_generator (they may be different):
{static_cast<int64_t>(256), static_cast<int64_t>(1024)}); // Op(sigmoid), inputs:{0}, outputs:{4}
for (size_t i = 0; i < cpu_tensors.size(); ++i) { // Op(elementwise_mul), inputs:{4,1}, outputs:{7}
cpu_tensors[i].mutable_data<float>(dims, paddle::platform::CPUPlace()); // Op(tanh), inputs:{2}, outputs:{5}
} // Op(elementwise_mul), inputs:{3,5}, outputs:{6}
// Op(elementwise_add), inputs:{7,6}, outputs:{8}
// Expressions generated by code_generator (they may be different): std::vector<int> input_ids = {0, 1, 2, 3};
// Op(sigmoid), inputs:{0}, outputs:{4} std::vector<int> output_ids = {4, 5, 6, 7, 8};
// Op(elementwise_mul), inputs:{4,1}, outputs:{7} TestMain(&subgraph, input_ids, output_ids);
// Op(tanh), inputs:{2}, outputs:{5}
// Op(elementwise_mul), inputs:{3,5}, outputs:{6}
// Op(elementwise_add), inputs:{7,6}, outputs:{8}
int n = cpu_tensors[0].numel();
std::vector<int> input_ids = {0, 1, 2, 3};
std::vector<int> output_ids = {4, 5, 6, 7, 8};
std::vector<fusion_group::OperationExpression> expressions =
TestMain(&subgraph, cpu_tensors, n, input_ids, output_ids);
// Check the results
for (int i = 0; i < n; i++) {
fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
i);
} }
} }
TEST(code_generator, subgraph_grad) { TEST(code_generator, subgraph_grad) {
std::unique_ptr<paddle::framework::ir::Graph> graph = BuildGraph(true); for (std::string dtype : {"float", "float16"}) {
fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true, std::unique_ptr<paddle::framework::ir::Graph> graph =
DistilGradNodes(graph)); BuildGraph(true, dtype);
fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
// Prepare CPU tensors DistilGradNodes(graph));
std::vector<paddle::framework::LoDTensor> cpu_tensors(18);
auto dims = paddle::framework::make_ddim( // Expressions generated by code_generator (they may be different):
{static_cast<int64_t>(256), static_cast<int64_t>(1024)}); // Op(elementwise_add_grad), inputs:{1,2,3,0}, outputs:{11,10}
for (size_t i = 0; i < cpu_tensors.size(); ++i) { // Op(elementwise_mul_grad), inputs:{5,4,2,10}, outputs:{17,13}
cpu_tensors[i].mutable_data<float>(dims, paddle::platform::CPUPlace()); // Op(elementwise_mul_grad), inputs:{7,6,1,11}, outputs:{12,15}
} // Op(sigmoid_grad), inputs:{8,7,12}, outputs:{16}
// Op(tanh_grad), inputs:{9,4,13}, outputs:{14}
// Expressions generated by code_generator (they may be different): std::vector<int> input_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
// Op(elementwise_add_grad), inputs:{1,2,3,0}, outputs:{11,10} std::vector<int> output_ids = {10, 11, 12, 13, 14, 15, 16, 17};
// Op(elementwise_mul_grad), inputs:{5,4,2,10}, outputs:{17,13} TestMain(&subgraph, input_ids, output_ids);
// Op(elementwise_mul_grad), inputs:{7,6,1,11}, outputs:{12,15}
// Op(sigmoid_grad), inputs:{8,7,12}, outputs:{16}
// Op(tanh_grad), inputs:{9,4,13}, outputs:{14}
int n = cpu_tensors[0].numel();
std::vector<int> input_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
std::vector<int> output_ids = {10, 11, 12, 13, 14, 15, 16, 17};
std::vector<fusion_group::OperationExpression> expressions =
TestMain(&subgraph, cpu_tensors, n, input_ids, output_ids);
// Check the results
for (int i = 0; i < n; i++) {
fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
i);
} }
} }
#endif #endif
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace framework {
namespace ir {
namespace fusion_group {
static constexpr char predefined_cuda_functions_fp32[] = R"(
__device__ inline float real_exp(float x) { return ::expf(x); }
__device__ inline float real_log(float x) { return ::logf(x); }
)";
static constexpr char predefined_cuda_functions_fp64[] = R"(
__device__ inline double real_exp(double x) { return ::exp(x); }
__device__ inline double real_log(double x) { return ::log(x); }
)";
static constexpr char predefined_cuda_functions_fp16[] = R"(
__device__ inline float real_exp(float x) { return ::expf(x); }
__device__ inline float real_log(float x) { return ::logf(x); }
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
struct __align__(2) __half {
__device__ __half() { }
protected:
unsigned short __x;
};
__device__ __half __float2half(const float f) {
__half val;
asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(__HALF_TO_US(val)
) : "f"(f));
return val;
}
__device__ float __half2float(const __half h) {
float val;
asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
return val;
}
#undef __HALF_TO_US
#undef __HALF_TO_CUS
typedef __half float16;
)";
static constexpr char cuda_kernel_template_1d[] = R"(
extern "C" __global__ void $func_name($parameters) {
for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += gridDim.x * blockDim.x) {
$compute_body
}
}
)";
} // namespace fusion_group
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -32,8 +32,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const { ...@@ -32,8 +32,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
if (Get<bool>("use_gpu")) { if (Get<bool>("use_gpu")) {
fusion_group::OperationMap::Init(); fusion_group::OperationMap::Init();
int num_elementwise_groups = DetectFusionGroup(graph, 0); int num_elementwise_groups = DetectFusionGroup(graph, 0);
VLOG(3) << "Detect " << num_elementwise_groups AddStatis(num_elementwise_groups);
<< " elementwise fusion groups.";
} }
} }
...@@ -49,23 +48,23 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const { ...@@ -49,23 +48,23 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
size_t min_subgraph_size = 2; size_t min_subgraph_size = 2;
bool save_intermediate_out = true; bool save_intermediate_out = true;
for (auto& vec : subgraphs) { for (auto& vec : subgraphs) {
if (vec.size() >= min_subgraph_size) { fusion_group::SubGraph subgraph(
std::string func_name = "fused_elementwise_" + std::to_string(index++); type, "", save_intermediate_out,
fusion_group::SubGraph subgraph( std::unordered_set<Node*>(vec.begin(), vec.end()));
type, func_name, save_intermediate_out, VLOG(3) << "subgraph: {\n" << DebugString(subgraph.SortedNodes()) << "}\n";
std::unordered_set<Node*>(vec.begin(), vec.end()));
VLOG(3) << "subgraph: {\n" if (subgraph.IsValid(min_subgraph_size)) {
<< DebugString(subgraph.SortedNodes()) << "}\n"; subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
if (GenerateCode(&subgraph)) {
GenerateCode(&subgraph); InsertFusionGroupOp(graph, &subgraph);
InsertFusionGroupOp(graph, &subgraph); num_subgraphs++;
num_subgraphs++; }
} }
} }
return num_subgraphs; return num_subgraphs;
} }
void FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const { bool FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const {
fusion_group::CodeGenerator code_generator; fusion_group::CodeGenerator code_generator;
std::string code_str = code_generator.Generate(subgraph); std::string code_str = code_generator.Generate(subgraph);
VLOG(3) << code_str; VLOG(3) << code_str;
...@@ -74,10 +73,12 @@ void FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const { ...@@ -74,10 +73,12 @@ void FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const {
platform::CUDAPlace place = platform::CUDAPlace(0); platform::CUDAPlace place = platform::CUDAPlace(0);
std::unique_ptr<platform::CUDADeviceCode> device_code( std::unique_ptr<platform::CUDADeviceCode> device_code(
new platform::CUDADeviceCode(place, subgraph->GetFuncName(), code_str)); new platform::CUDADeviceCode(place, subgraph->GetFuncName(), code_str));
device_code->Compile(); bool is_compiled = device_code->Compile();
if (is_compiled) {
platform::DeviceCodePool& pool = platform::DeviceCodePool::Init({place}); platform::DeviceCodePool& pool = platform::DeviceCodePool::Init({place});
pool.Set(std::move(device_code)); pool.Set(std::move(device_code));
}
return is_compiled;
} }
static int ExtractOpRole(fusion_group::SubGraph* subgraph) { static int ExtractOpRole(fusion_group::SubGraph* subgraph) {
......
...@@ -29,7 +29,7 @@ class FusionGroupPass : public FusePassBase { ...@@ -29,7 +29,7 @@ class FusionGroupPass : public FusePassBase {
private: private:
int DetectFusionGroup(Graph* graph, int type = 0) const; int DetectFusionGroup(Graph* graph, int type = 0) const;
void GenerateCode(fusion_group::SubGraph* subgraph) const; bool GenerateCode(fusion_group::SubGraph* subgraph) const;
void InsertFusionGroupOp(Graph* graph, void InsertFusionGroupOp(Graph* graph,
fusion_group::SubGraph* subgraph) const; fusion_group::SubGraph* subgraph) const;
......
...@@ -59,6 +59,11 @@ std::unique_ptr<Graph> BuildElementwiseListGraph(bool backward = false) { ...@@ -59,6 +59,11 @@ std::unique_ptr<Graph> BuildElementwiseListGraph(bool backward = false) {
} }
std::unique_ptr<Graph> graph(new Graph(layers.main_program())); std::unique_ptr<Graph> graph(new Graph(layers.main_program()));
for (auto* n : graph->Nodes()) {
if (n && n->IsVar() && n->Var()) {
n->Var()->SetDataType(proto::VarType::FP32);
}
}
#ifdef __clang__ #ifdef __clang__
return graph; return graph;
#else #else
...@@ -116,6 +121,11 @@ std::unique_ptr<Graph> BuildElementwiseTreeGraph(bool backward = false) { ...@@ -116,6 +121,11 @@ std::unique_ptr<Graph> BuildElementwiseTreeGraph(bool backward = false) {
} }
std::unique_ptr<Graph> graph(new Graph(layers.main_program())); std::unique_ptr<Graph> graph(new Graph(layers.main_program()));
for (auto* n : graph->Nodes()) {
if (n && n->IsVar() && n->Var()) {
n->Var()->SetDataType(proto::VarType::FP32);
}
}
#ifdef __clang__ #ifdef __clang__
return graph; return graph;
#else #else
......
...@@ -91,7 +91,7 @@ void OperationMap::InsertUnaryElementwiseOperations() { ...@@ -91,7 +91,7 @@ void OperationMap::InsertUnaryElementwiseOperations() {
// relu: // relu:
// out = f(x) = x > 0 ? x : 0 // out = f(x) = x > 0 ? x : 0
// dx = dout * (out > 0 ? 1 : 0) // dx = dout * (out > 0 ? 1 : 0)
insert_handler("relu", "real_max(${0}, 0)", {"${1} > 0 ? ${2} : 0"}); insert_handler("relu", "${0} > 0 ? ${0} : 0", {"${1} > 0 ? ${2} : 0"});
// sigmoid: // sigmoid:
// out = f(x) = 1.0 / (1.0 + exp(-x)) // out = f(x) = 1.0 / (1.0 + exp(-x))
// dx = dout * out * (1 - out) // dx = dout * out * (1 - out)
...@@ -133,9 +133,24 @@ void OperationMap::InsertBinaryElementwiseOperations() { ...@@ -133,9 +133,24 @@ void OperationMap::InsertBinaryElementwiseOperations() {
// dy = dout * x // dy = dout * x
insert_handler("elementwise_mul", "${0} * ${1}", insert_handler("elementwise_mul", "${0} * ${1}",
{"${3} * ${1}", "${3} * ${0}"}); {"${3} * ${1}", "${3} * ${0}"});
insert_handler("elementwise_div", "${0} / ${1}", {}); // elementwise_div:
insert_handler("elementwise_min", "real_min(${0}, ${1})", {}); // out = x / y
insert_handler("elementwise_max", "real_max(${0}, ${1})", {}); // dx = dout / y
// dy = - dout * out / y
insert_handler("elementwise_div", "${0} / ${1}",
{"${3} / ${1}", "- ${3} * ${2} / ${1}"});
// elementwise_min:
// out = x < y ? x : y
// dx = dout * (x < y)
// dy = dout * (x >= y)
insert_handler("elementwise_min", "${0} < ${1} ? ${0} : ${1}",
{"${3} * (${0} < ${1})", "${3} * (${0} >= ${1})"});
// elementwise_max:
// out = x > y ? x : y
// dx = dout * (x > y)
// dy = dout * (x <= y)
insert_handler("elementwise_max", "${0} > ${1} ? ${0} : ${1}",
{"${3} * (${0} > ${1})", "${3} * (${0} <= ${1})"});
} }
} // namespace fusion_group } // namespace fusion_group
......
...@@ -49,11 +49,23 @@ class SubGraph { ...@@ -49,11 +49,23 @@ class SubGraph {
} }
} }
} }
ExtractDataType();
} }
bool IsEmpty() { return nodes_set_.empty(); } bool IsValid(int min_subgraph_size) {
int num_operations = GetNumOperations();
if (num_operations < min_subgraph_size) {
VLOG(2) << "There are only " << num_operations
<< " operations in the subgraph. Expected at least "
<< min_subgraph_size;
return false;
}
return ExtractDataType();
}
int GetType() const { return type_; } int GetType() const { return type_; }
std::string GetDataType() const { return data_type_; }
void SetFuncName(std::string func_name) { func_name_ = func_name; } void SetFuncName(std::string func_name) { func_name_ = func_name; }
std::string GetFuncName() const { return func_name_; } std::string GetFuncName() const { return func_name_; }
...@@ -150,6 +162,37 @@ class SubGraph { ...@@ -150,6 +162,37 @@ class SubGraph {
} }
private: private:
bool ExtractDataType() {
bool is_first = true;
proto::VarType::Type data_type = proto::VarType::FP32;
for (auto* n : nodes_set_) {
if (n && n->IsVar() && n->Var()) {
if (n->Var()->GetType() != proto::VarType::LOD_TENSOR) {
// All var node in a subgraph should hold a LoDTensor.
return false;
}
if (is_first) {
data_type = n->Var()->GetDataType();
is_first = false;
} else if (n->Var()->GetDataType() != data_type) {
// DataType of VarDesc in a subgraph is not the same.
return false;
}
}
}
if (data_type == proto::VarType::FP32) {
data_type_ = "float";
} else if (data_type == proto::VarType::FP64) {
data_type_ = "double";
} else if (data_type == proto::VarType::FP16) {
data_type_ = "float16";
} else {
VLOG(2) << "Only support fp32, fp64 and fp16 in fusion_group.";
return false;
}
return true;
}
void TopologicalSort() { void TopologicalSort() {
if (!is_sorted_) { if (!is_sorted_) {
std::unordered_map<Node*, std::vector<Node*>> inputs_map; std::unordered_map<Node*, std::vector<Node*>> inputs_map;
...@@ -203,6 +246,7 @@ class SubGraph { ...@@ -203,6 +246,7 @@ class SubGraph {
private: private:
int type_{-1}; int type_{-1};
std::string data_type_;
std::string func_name_; std::string func_name_;
bool save_intermediate_out_{true}; bool save_intermediate_out_{true};
......
...@@ -33,8 +33,9 @@ struct Layers { ...@@ -33,8 +33,9 @@ struct Layers {
const ProgramDesc& main_program() { return program_; } const ProgramDesc& main_program() { return program_; }
VarDesc* data(std::string name, std::vector<int64_t> shape = {}, VarDesc* data(std::string name, std::vector<int64_t> shape = {},
bool is_persistable = false) { bool is_persistable = false,
return lod_tensor(name, shape, is_persistable); proto::VarType::Type data_type = proto::VarType::FP32) {
return lod_tensor(name, shape, is_persistable, data_type);
} }
VarDesc* conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias, VarDesc* conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias,
...@@ -379,9 +380,11 @@ struct Layers { ...@@ -379,9 +380,11 @@ struct Layers {
private: private:
VarDesc* lod_tensor(std::string name, std::vector<int64_t> shape = {}, VarDesc* lod_tensor(std::string name, std::vector<int64_t> shape = {},
bool is_persistable = false) { bool is_persistable = false,
proto::VarType::Type data_type = proto::VarType::FP32) {
auto* var = program_.MutableBlock(0)->Var(name); auto* var = program_.MutableBlock(0)->Var(name);
var->SetType(proto::VarType::LOD_TENSOR); var->SetType(proto::VarType::LOD_TENSOR);
var->SetDataType(data_type);
var->SetShape(shape); var->SetShape(shape);
var->SetPersistable(is_persistable); var->SetPersistable(is_persistable);
return var; return var;
......
...@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_group_op.h" #include "paddle/fluid/operators/fused/fusion_group_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
fusion_group, fusion_group, ops::FusionGroupKernel<plat::CUDADeviceContext, float>,
ops::FusionGroupKernel<paddle::platform::CUDADeviceContext, double>, ops::FusionGroupKernel<plat::CUDADeviceContext, double>,
ops::FusionGroupKernel<paddle::platform::CUDADeviceContext, float>); ops::FusionGroupKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/device_code.h"
#include <sys/stat.h>
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <utility> #include <utility>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
DECLARE_string(cuda_dir);
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -79,6 +82,46 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) { ...@@ -79,6 +82,46 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
static std::string FindCUDAIncludePath() {
auto EndWith = [](std::string str, std::string substr) -> bool {
size_t pos = str.rfind(substr);
return pos != std::string::npos && pos == (str.length() - substr.length());
};
struct stat st;
std::string cuda_include_path;
if (!FLAGS_cuda_dir.empty()) {
cuda_include_path = FLAGS_cuda_dir;
if (EndWith(cuda_include_path, "/")) {
cuda_include_path.erase(cuda_include_path.end() - 1);
}
for (std::string suffix : {"/lib", "/lib64"}) {
if (EndWith(FLAGS_cuda_dir, suffix)) {
cuda_include_path.erase(cuda_include_path.end() - suffix.length());
break;
}
}
if (!EndWith(cuda_include_path, "include")) {
cuda_include_path += "/include";
}
// Whether the cuda_include_path exists on the file system.
if (stat(cuda_include_path.c_str(), &st) == 0) {
return cuda_include_path;
}
}
cuda_include_path = "/usr/local/cuda/include";
if (stat(cuda_include_path.c_str(), &st) == 0) {
return cuda_include_path;
}
LOG(WARNING) << "Cannot find CUDA include path."
<< "Please check whether CUDA is installed in the default "
"installation path, or specify it by export "
"FLAGS_cuda_dir=xxx.";
return "";
}
CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name, CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
const std::string& kernel) { const std::string& kernel) {
if (!is_gpu_place(place)) { if (!is_gpu_place(place)) {
...@@ -91,7 +134,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name, ...@@ -91,7 +134,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
kernel_ = kernel; kernel_ = kernel;
} }
bool CUDADeviceCode::Compile() { bool CUDADeviceCode::Compile(bool include_path) {
is_compiled_ = false; is_compiled_ = false;
if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) { if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
LOG(WARNING) LOG(WARNING)
...@@ -116,8 +159,14 @@ bool CUDADeviceCode::Compile() { ...@@ -116,8 +159,14 @@ bool CUDADeviceCode::Compile() {
int compute_capability = dev_ctx->GetComputeCapability(); int compute_capability = dev_ctx->GetComputeCapability();
std::string compute_flag = std::string compute_flag =
"--gpu-architecture=compute_" + std::to_string(compute_capability); "--gpu-architecture=compute_" + std::to_string(compute_capability);
const std::vector<const char*> options = {"--std=c++11", std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
compute_flag.c_str()}; if (include_path) {
std::string cuda_include_path = FindCUDAIncludePath();
if (!cuda_include_path.empty()) {
std::string include_option = "--include-path=" + cuda_include_path;
options.push_back(include_option.c_str());
}
}
nvrtcResult compile_result = nvrtcResult compile_result =
dynload::nvrtcCompileProgram(program, // program dynload::nvrtcCompileProgram(program, // program
options.size(), // numOptions options.size(), // numOptions
......
...@@ -31,7 +31,7 @@ namespace platform { ...@@ -31,7 +31,7 @@ namespace platform {
class DeviceCode { class DeviceCode {
public: public:
virtual ~DeviceCode() {} virtual ~DeviceCode() {}
virtual bool Compile() = 0; virtual bool Compile(bool include_path = false) = 0;
virtual void Launch(const size_t n, std::vector<void*>* args) const = 0; virtual void Launch(const size_t n, std::vector<void*>* args) const = 0;
Place GetPlace() const { return place_; } Place GetPlace() const { return place_; }
...@@ -48,7 +48,7 @@ class CUDADeviceCode : public DeviceCode { ...@@ -48,7 +48,7 @@ class CUDADeviceCode : public DeviceCode {
public: public:
explicit CUDADeviceCode(const Place& place, const std::string& name, explicit CUDADeviceCode(const Place& place, const std::string& name,
const std::string& kernel); const std::string& kernel);
bool Compile() override; bool Compile(bool include_path = false) override;
void Launch(const size_t n, std::vector<void*>* args) const override; void Launch(const size_t n, std::vector<void*>* args) const override;
void SetNumThreads(int num_threads) { num_threads_ = num_threads; } void SetNumThreads(int num_threads) { num_threads_ = num_threads; }
......
...@@ -2,7 +2,7 @@ file(GLOB TEST_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") ...@@ -2,7 +2,7 @@ file(GLOB TEST_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_IR_PASSES "${TEST_IR_PASSES}") string(REPLACE ".py" "" TEST_IR_PASSES "${TEST_IR_PASSES}")
if(NOT WITH_GPU OR WIN32 OR APPLE) if(NOT WITH_GPU OR WIN32 OR APPLE)
LIST(REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group) LIST(REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group_pass)
endif() endif()
foreach(target ${TEST_IR_PASSES}) foreach(target ${TEST_IR_PASSES})
......
...@@ -17,92 +17,125 @@ import unittest ...@@ -17,92 +17,125 @@ import unittest
import numpy as np import numpy as np
from pass_test import PassTest from pass_test import PassTest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core import paddle.fluid.core as core
class FusionGroupPassTest(PassTest): class FusionGroupPassTest(PassTest):
def setUp(self): def build_program(self, dtype):
with fluid.program_guard(self.main_program, self.startup_program): with fluid.program_guard(self.main_program, self.startup_program):
data1 = fluid.data(name="data1", shape=[32, 128], dtype="float32") self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 2)
data2 = fluid.data(name="data2", shape=[32, 128], dtype="float32") self.feed_vars.append(
data3 = fluid.data(name="data3", shape=[32, 128], dtype="float32") fluid.data(
tmp_1 = fluid.layers.elementwise_add(data1, data2) name="data2", shape=[128, 128], dtype=dtype))
tmp_2 = fluid.layers.elementwise_mul(data3, tmp_1)
# subgraph with only 1 op node
self.feeds = { tmp_0 = self.feed_vars[0] * self.feed_vars[1]
"data1": np.random.random((32, 128)).astype("float32"), tmp_1 = layers.mul(tmp_0, self.feed_vars[2])
"data2": np.random.random((32, 128)).astype("float32"), # subgraph with 2 op nodes
"data3": np.random.random((32, 128)).astype("float32") tmp_2 = layers.relu(tmp_0 + tmp_1)
}
self.fetch_list = [tmp_1, tmp_2] self.fetch_list = [tmp_2]
self.num_fused_ops = 1
def setUp(self):
self.build_program("float32")
self.feeds = self._feed_random_data(self.feed_vars)
self.pass_names = "fusion_group_pass" self.pass_names = "fusion_group_pass"
self.fused_op_type = "fusion_group" self.fused_op_type = "fusion_group"
self.num_fused_ops = 1
def _prepare_feed_vars(self, shape, dtype, num_data):
feed_vars = []
for i in range(num_data):
var = fluid.data(name=("data" + str(i)), shape=shape, dtype=dtype)
feed_vars.append(var)
return feed_vars
def _feed_random_data(self, feed_vars):
feeds = {}
for var in feed_vars:
if var.type != fluid.core.VarDesc.VarType.LOD_TENSOR:
raise TypeError("Feed data of non LoDTensor is not supported.")
shape = var.shape
if var.dtype == fluid.core.VarDesc.VarType.FP32:
dtype = "float32"
elif var.dtype == fluid.core.VarDesc.VarType.FP64:
dtype = "float64"
elif var.dtype == fluid.core.VarDesc.VarType.FP16:
dtype = "float16"
else:
raise ValueError("Unsupported dtype %s" % var.dtype)
feeds[var.name] = np.random.random(shape).astype(dtype)
return feeds
def test_check_output(self): def test_check_output(self):
use_gpu_set = []
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
use_gpu_set.append(True) self.pass_attrs = {"fusion_group_pass": {"use_gpu": True}}
for use_gpu in use_gpu_set: self.check_output_with_place(fluid.CUDAPlace(0))
self.pass_attrs = {"fusion_group_pass": {"use_gpu": use_gpu}}
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
self.check_output_with_place(place, startup_on_cpu=False)
class FusionGroupPassTest1(FusionGroupPassTest): class FusionGroupPassTest1(FusionGroupPassTest):
def setUp(self): def build_program(self, dtype):
with fluid.program_guard(self.main_program, self.startup_program): with fluid.program_guard(self.main_program, self.startup_program):
data = [] self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 5)
for i in range(5):
data.append( tmp_0 = layers.assign(self.feed_vars[0])
fluid.data( # subgraph with 9 op nodes
name=("data" + str(i)), tmp_1 = tmp_0 * layers.sigmoid(self.feed_vars[1]) + layers.sigmoid(
shape=[32, 128], self.feed_vars[2]) * layers.tanh(self.feed_vars[3])
dtype="float32")) tmp_2 = layers.tanh(tmp_1) + layers.sigmoid(self.feed_vars[4])
tmp_1 = (
fluid.layers.assign(data[0]) * fluid.layers.sigmoid(data[1])
) + (fluid.layers.sigmoid(data[2]) * fluid.layers.tanh(data[3]))
tmp_2 = fluid.layers.tanh(tmp_1) + fluid.layers.sigmoid(data[4])
self.feeds = {}
for i in range(5):
self.feeds["data" + str(i)] = np.random.random(
(32, 128)).astype("float32")
self.fetch_list = [tmp_1, tmp_2] self.fetch_list = [tmp_1, tmp_2]
self.pass_names = "fusion_group_pass"
self.fused_op_type = "fusion_group"
self.num_fused_ops = 1 self.num_fused_ops = 1
class FusionGroupPassTest2(FusionGroupPassTest): class FusionGroupPassTest2(FusionGroupPassTest):
def setUp(self): def build_program(self, dtype):
with fluid.program_guard(self.main_program, self.startup_program): with fluid.program_guard(self.main_program, self.startup_program):
data = [] self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 3)
for i in range(3): self.feed_vars.append(
data.append(
fluid.data(
name=("data" + str(i)),
shape=[32, 128],
dtype="float32"))
data.append(
fluid.data( fluid.data(
name="data3", shape=[128, 32], dtype="float32")) name="data3", shape=[128, 32], dtype=dtype))
tmp_1 = fluid.layers.relu((data[0] - data[1]) * data[2])
tmp_2 = fluid.layers.sigmoid(data[3]) # subgraph with 3 op nodes
tmp_3 = fluid.layers.relu(tmp_2) tmp_1 = layers.relu(
tmp_4 = fluid.layers.mul(tmp_1, tmp_3) (self.feed_vars[0] - self.feed_vars[1]) * self.feed_vars[2])
# subgraph with 2 op nodes
self.feeds = {} tmp_2 = layers.relu(layers.sigmoid(self.feed_vars[3]))
for i in range(3): tmp_3 = layers.mul(tmp_1, tmp_2)
self.feeds["data" + str(i)] = np.random.random(
(32, 128)).astype("float32") self.fetch_list = [tmp_1, tmp_2, tmp_3]
self.feeds["data3"] = np.random.random((128, 32)).astype("float32") self.num_fused_ops = 2
self.fetch_list = [tmp_1, tmp_2, tmp_3, tmp_4]
class FusionGroupPassTestFP64(FusionGroupPassTest):
def setUp(self):
self.build_program("float64")
self.feeds = self._feed_random_data(self.feed_vars)
self.pass_names = "fusion_group_pass" self.pass_names = "fusion_group_pass"
self.fused_op_type = "fusion_group" self.fused_op_type = "fusion_group"
self.num_fused_ops = 2
class FusionGroupPassTestFP16(FusionGroupPassTest):
def build_program(self, dtype):
with fluid.program_guard(self.main_program, self.startup_program):
self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 2)
self.feed_vars.append(
fluid.data(
name="data2", shape=[128, 128], dtype=dtype))
# subgraph with only 1 op node
tmp_0 = self.feed_vars[0] * self.feed_vars[1]
tmp_1 = layers.mul(tmp_0, self.feed_vars[2])
tmp_2 = layers.cast(tmp_0, dtype="float16")
tmp_3 = layers.cast(tmp_1, dtype="float16")
# subgraph with 2 op nodes
tmp_4 = layers.relu(tmp_2 + tmp_3)
tmp_5 = layers.cast(tmp_4, dtype=dtype)
self.fetch_list = [tmp_5]
self.num_fused_ops = 1
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册