未验证 提交 f4634d76 编写于 作者: Y Yiqun Liu 提交者: GitHub

Optimize the CUDA implementation of sequence_expand op by reduce the times of...

Optimize the CUDA implementation of sequence_expand op by reduce the times of copying lod data from CPU to GPU. (#15493)

* Optimize the CUDA implementation of sequence_expand op by reduce the times of copying lod data from CPU to GPU.
test=develop

* Refine the op benchmark to support setting lod in config.
test=develop
上级 60546b78
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester.h" #include "paddle/fluid/operators/benchmark/op_tester.h"
#include <fstream>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
...@@ -28,6 +29,7 @@ namespace operators { ...@@ -28,6 +29,7 @@ namespace operators {
namespace benchmark { namespace benchmark {
DEFINE_string(op_config_list, "", "Path of op config file."); DEFINE_string(op_config_list, "", "Path of op config file.");
DEFINE_int32(specified_config_id, -1, "Test the specified op config.");
void OpTester::Init(const std::string &filename) { void OpTester::Init(const std::string &filename) {
Init(OpTesterConfig(filename)); Init(OpTesterConfig(filename));
...@@ -147,7 +149,7 @@ void OpTester::CreateInputVarDesc() { ...@@ -147,7 +149,7 @@ void OpTester::CreateInputVarDesc() {
var->SetShape(input->dims); var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name}); op_desc_.SetInput(name, {var_name});
inputs_.push_back(var_name); input_lods_[var_name] = input->lod;
} }
} }
...@@ -162,7 +164,6 @@ void OpTester::CreateOutputVarDesc() { ...@@ -162,7 +164,6 @@ void OpTester::CreateOutputVarDesc() {
var->SetDataType(framework::proto::VarType::FP32); var->SetDataType(framework::proto::VarType::FP32);
op_desc_.SetOutput(name, {var_name}); op_desc_.SetOutput(name, {var_name});
outputs_.push_back(var_name);
} }
} }
...@@ -218,16 +219,26 @@ void OpTester::CreateVariables(framework::Scope *scope) { ...@@ -218,16 +219,26 @@ void OpTester::CreateVariables(framework::Scope *scope) {
} }
} }
for (auto &item : input_lods_) {
// Allocate memory for input tensor // Allocate memory for input tensor
for (auto &name : inputs_) { auto &var_name = item.first;
VLOG(3) << "Allocate memory for tensor " << name; VLOG(3) << "Allocate memory for tensor " << var_name;
auto &var_desc = vars_[name];
auto &var_desc = vars_[var_name];
std::vector<int64_t> shape = var_desc->GetShape(); std::vector<int64_t> shape = var_desc->GetShape();
auto *var = scope->Var(name); auto *var = scope->Var(var_name);
auto *tensor = var->GetMutable<framework::LoDTensor>(); auto *tensor = var->GetMutable<framework::LoDTensor>();
SetupTensor<float>(tensor, shape, static_cast<float>(0.0), SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0)); static_cast<float>(1.0));
VLOG(3) << "Set lod for tensor " << var_name;
std::vector<std::vector<size_t>> &lod_vec = item.second;
framework::LoD lod;
for (size_t i = 0; i < lod_vec.size(); ++i) {
lod.push_back(lod_vec[i]);
}
tensor->set_lod(lod);
} }
} }
...@@ -282,10 +293,32 @@ std::string OpTester::DebugString() { ...@@ -282,10 +293,32 @@ std::string OpTester::DebugString() {
} }
TEST(op_tester, base) { TEST(op_tester, base) {
OpTester tester;
if (!FLAGS_op_config_list.empty()) { if (!FLAGS_op_config_list.empty()) {
tester.Init(FLAGS_op_config_list); std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
FLAGS_op_config_list.c_str());
std::vector<OpTesterConfig> op_configs;
while (!fin.eof()) {
OpTesterConfig config;
bool result = config.Init(fin);
if (result) {
op_configs.push_back(config);
}
}
if (FLAGS_specified_config_id >= 0 &&
FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
OpTester tester;
tester.Init(op_configs[FLAGS_specified_config_id]);
tester.Run();
} else { } else {
for (size_t i = 0; i < op_configs.size(); ++i) {
OpTester tester;
tester.Init(op_configs[i]);
tester.Run();
}
}
} else {
OpTester tester;
OpTesterConfig config; OpTesterConfig config;
config.op_type = "elementwise_add"; config.op_type = "elementwise_add";
config.inputs.resize(2); config.inputs.resize(2);
...@@ -294,8 +327,8 @@ TEST(op_tester, base) { ...@@ -294,8 +327,8 @@ TEST(op_tester, base) {
config.inputs[1].name = "Y"; config.inputs[1].name = "Y";
config.inputs[1].dims = {64, 1}; config.inputs[1].dims = {64, 1};
tester.Init(config); tester.Init(config);
}
tester.Run(); tester.Run();
}
} }
} // namespace benchmark } // namespace benchmark
......
...@@ -57,8 +57,7 @@ class OpTester { ...@@ -57,8 +57,7 @@ class OpTester {
std::string type_; std::string type_;
framework::OpDesc op_desc_; framework::OpDesc op_desc_;
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_; std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
std::vector<std::string> inputs_; std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
std::vector<std::string> outputs_;
std::unique_ptr<framework::OperatorBase> op_; std::unique_ptr<framework::OperatorBase> op_;
platform::Place place_; platform::Place place_;
std::unique_ptr<framework::Scope> scope_; std::unique_ptr<framework::Scope> scope_;
......
...@@ -33,21 +33,64 @@ static bool EndWith(const std::string& str, const std::string& substr) { ...@@ -33,21 +33,64 @@ static bool EndWith(const std::string& str, const std::string& substr) {
return str.rfind(substr) == (str.length() - substr.length()); return str.rfind(substr) == (str.length() - substr.length());
} }
static void EraseEndSep(std::string* str) { static void EraseEndSep(std::string* str,
std::string substr = kSepBetweenItems; std::string substr = kSepBetweenItems) {
if (EndWith(*str, substr)) { if (EndWith(*str, substr)) {
str->erase(str->length() - substr.length(), str->length()); str->erase(str->length() - substr.length(), str->length());
} }
} }
static std::vector<int64_t> ParseDims(std::string dims_str) { void OpInputConfig::ParseDims(std::istream& is) {
std::vector<int64_t> dims; std::string dims_str;
is >> dims_str;
dims.clear();
std::string token; std::string token;
std::istringstream token_stream(dims_str); std::istringstream token_stream(dims_str);
while (std::getline(token_stream, token, 'x')) { while (std::getline(token_stream, token, 'x')) {
dims.push_back(std::stoi(token)); dims.push_back(std::stoi(token));
} }
return dims; }
void OpInputConfig::ParseLoD(std::istream& is) {
std::string lod_str;
std::string start_sep =
std::string(kStartSeparator) + std::string(kStartSeparator);
std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
std::string sep;
is >> sep;
if (StartWith(sep, start_sep)) {
lod_str += sep;
while (!EndWith(sep, end_sep)) {
is >> sep;
lod_str += sep;
}
}
EraseEndSep(&lod_str);
PADDLE_ENFORCE_GE(lod_str.length(), 4U);
VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
// Parse the lod_str
lod.clear();
for (size_t i = 1; i < lod_str.length() - 1;) {
if (lod_str[i] == '{') {
std::vector<size_t> level;
while (lod_str[i] != '}') {
++i;
std::string number;
while (lod_str[i] >= '0' && lod_str[i] <= '9') {
number += lod_str[i];
++i;
}
level.push_back(atoi(number.c_str()));
}
lod.push_back(level);
} else if (lod_str[i] == '}') {
++i;
}
}
} }
OpInputConfig::OpInputConfig(std::istream& is) { OpInputConfig::OpInputConfig(std::istream& is) {
...@@ -60,9 +103,9 @@ OpInputConfig::OpInputConfig(std::istream& is) { ...@@ -60,9 +103,9 @@ OpInputConfig::OpInputConfig(std::istream& is) {
is >> name; is >> name;
EraseEndSep(&name); EraseEndSep(&name);
} else if (sep == "dims" || sep == "dims:") { } else if (sep == "dims" || sep == "dims:") {
std::string dims_str; ParseDims(is);
is >> dims_str; } else if (sep == "lod" || sep == "lod:") {
dims = ParseDims(dims_str); ParseLoD(is);
} }
} }
} }
...@@ -76,7 +119,7 @@ OpTesterConfig::OpTesterConfig(const std::string& filename) { ...@@ -76,7 +119,7 @@ OpTesterConfig::OpTesterConfig(const std::string& filename) {
Init(fin); Init(fin);
} }
void OpTesterConfig::Init(std::istream& is) { bool OpTesterConfig::Init(std::istream& is) {
std::string sep; std::string sep;
is >> sep; is >> sep;
if (sep == kStartSeparator) { if (sep == kStartSeparator) {
...@@ -95,9 +138,40 @@ void OpTesterConfig::Init(std::istream& is) { ...@@ -95,9 +138,40 @@ void OpTesterConfig::Init(std::istream& is) {
} else if (sep == "input" || sep == "input:") { } else if (sep == "input" || sep == "input:") {
OpInputConfig input_config(is); OpInputConfig input_config(is);
inputs.push_back(input_config); inputs.push_back(input_config);
} else if (sep == "attrs" || sep == "attrs:") {
ParseAttrs(is);
} else {
if (sep != kEndSeparator) {
return false;
}
}
}
} else {
return false;
}
return true;
}
bool OpTesterConfig::ParseAttrs(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (true) {
std::string key;
is >> key;
if (key == kEndSeparator) {
break;
} }
std::string value;
is >> value;
EraseEndSep(&key, ":");
EraseEndSep(&value);
attrs[key] = value;
} }
} }
return true;
} }
const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) { const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <istream> #include <istream>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
...@@ -26,19 +27,27 @@ struct OpInputConfig { ...@@ -26,19 +27,27 @@ struct OpInputConfig {
OpInputConfig() {} OpInputConfig() {}
explicit OpInputConfig(std::istream& is); explicit OpInputConfig(std::istream& is);
void ParseDims(std::istream& is);
void ParseLoD(std::istream& is);
std::string name; std::string name;
std::vector<int64_t> dims; std::vector<int64_t> dims;
std::vector<std::vector<size_t>> lod;
}; };
struct OpTesterConfig { struct OpTesterConfig {
OpTesterConfig() {} OpTesterConfig() {}
explicit OpTesterConfig(const std::string& filename); explicit OpTesterConfig(const std::string& filename);
void Init(std::istream& is);
bool Init(std::istream& is);
bool ParseAttrs(std::istream& is);
const OpInputConfig* GetInput(const std::string& name); const OpInputConfig* GetInput(const std::string& name);
std::string op_type; std::string op_type;
std::vector<OpInputConfig> inputs; std::vector<OpInputConfig> inputs;
std::unordered_map<std::string, std::string> attrs;
int device_id{-1}; // CPU: -1 int device_id{-1}; // CPU: -1
int repeat{1}; int repeat{1};
int profile{0}; int profile{0};
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod, ...@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
} }
} }
template <typename T>
static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
const LoDTensor& x, LoDTensor* out,
const framework::Vector<size_t>& x_lod,
const framework::Vector<size_t>& ref_lod,
bool do_copy) {
auto out_data = out->data<T>();
auto x_data = x.data<T>();
auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
int x_item_length = x.numel() / x.dims()[0];
int out_offset = 0;
int num_copys = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
if (do_copy) {
int out_start = out_offset;
if (out->lod().size() == 1) {
out_start = out->lod()[0][out_offset];
}
for (int j = 0; j < repeat_num; j++) {
for (int k = 0; k < x_seq_len; k++) {
memory::Copy(
gpu_place,
out_data + (out_start + j * x_seq_len + k) * x_item_length,
gpu_place, x_data + (x_start + k) * x_item_length,
sizeof(T) * x_item_length, context.stream());
}
}
} else {
num_copys += repeat_num * x_seq_len;
}
}
out_offset += repeat_num;
}
return num_copys;
}
template <typename T> template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> { struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()( void operator()(
...@@ -95,11 +139,30 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> { ...@@ -95,11 +139,30 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
const framework::Vector<size_t>& x_lod, /*expand source lod*/ const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/ const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) { LoDTensor* out) {
int num_copys =
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (num_copys < 5) {
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
} else {
int x_item_length = x.numel() / x.dims()[0]; int x_item_length = x.numel() / x.dims()[0];
framework::Vector<size_t> out_offset(x_lod.size()); size_t x_lod_size = x_lod.size();
framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset); GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16)); for (size_t i = 0; i < x_lod_size; ++i) {
out_offset[x_lod_size + i] = x_lod[i];
}
for (size_t i = 0; i < ref_lod.size(); ++i) {
out_offset[2 * x_lod_size + i] = ref_lod[i];
}
const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
const size_t* x_lod_data = out_offset_data + x_lod_size;
const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
int thread_x =
std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16; int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y; int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size()); int block_x = static_cast<int>(ref_lod.size());
...@@ -107,10 +170,9 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> { ...@@ -107,10 +170,9 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
dim3 grid_size(block_x, 1); dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>( sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod.CUDAData(context.GetPlace()), x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
ref_lod.CUDAData(context.GetPlace()), x_item_length, out->mutable_data<T>(context.GetPlace()));
out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length, }
out->mutable_data<T>(context.GetPlace()));
} }
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册