未验证 提交 f4634d76 编写于 作者: Y Yiqun Liu 提交者: GitHub

Optimize the CUDA implementation of sequence_expand op by reduce the times of...

Optimize the CUDA implementation of sequence_expand op by reduce the times of copying lod data from CPU to GPU. (#15493)

* Optimize the CUDA implementation of sequence_expand op by reduce the times of copying lod data from CPU to GPU.
test=develop

* Refine the op benchmark to support setting lod in config.
test=develop
上级 60546b78
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester.h"
#include <fstream>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
......@@ -28,6 +29,7 @@ namespace operators {
namespace benchmark {
DEFINE_string(op_config_list, "", "Path of op config file.");
DEFINE_int32(specified_config_id, -1, "Test the specified op config.");
void OpTester::Init(const std::string &filename) {
Init(OpTesterConfig(filename));
......@@ -147,7 +149,7 @@ void OpTester::CreateInputVarDesc() {
var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name});
inputs_.push_back(var_name);
input_lods_[var_name] = input->lod;
}
}
......@@ -162,7 +164,6 @@ void OpTester::CreateOutputVarDesc() {
var->SetDataType(framework::proto::VarType::FP32);
op_desc_.SetOutput(name, {var_name});
outputs_.push_back(var_name);
}
}
......@@ -218,16 +219,26 @@ void OpTester::CreateVariables(framework::Scope *scope) {
}
}
// Allocate memory for input tensor
for (auto &name : inputs_) {
VLOG(3) << "Allocate memory for tensor " << name;
auto &var_desc = vars_[name];
for (auto &item : input_lods_) {
// Allocate memory for input tensor
auto &var_name = item.first;
VLOG(3) << "Allocate memory for tensor " << var_name;
auto &var_desc = vars_[var_name];
std::vector<int64_t> shape = var_desc->GetShape();
auto *var = scope->Var(name);
auto *var = scope->Var(var_name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0));
VLOG(3) << "Set lod for tensor " << var_name;
std::vector<std::vector<size_t>> &lod_vec = item.second;
framework::LoD lod;
for (size_t i = 0; i < lod_vec.size(); ++i) {
lod.push_back(lod_vec[i]);
}
tensor->set_lod(lod);
}
}
......@@ -282,10 +293,32 @@ std::string OpTester::DebugString() {
}
TEST(op_tester, base) {
OpTester tester;
if (!FLAGS_op_config_list.empty()) {
tester.Init(FLAGS_op_config_list);
std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
FLAGS_op_config_list.c_str());
std::vector<OpTesterConfig> op_configs;
while (!fin.eof()) {
OpTesterConfig config;
bool result = config.Init(fin);
if (result) {
op_configs.push_back(config);
}
}
if (FLAGS_specified_config_id >= 0 &&
FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
OpTester tester;
tester.Init(op_configs[FLAGS_specified_config_id]);
tester.Run();
} else {
for (size_t i = 0; i < op_configs.size(); ++i) {
OpTester tester;
tester.Init(op_configs[i]);
tester.Run();
}
}
} else {
OpTester tester;
OpTesterConfig config;
config.op_type = "elementwise_add";
config.inputs.resize(2);
......@@ -294,8 +327,8 @@ TEST(op_tester, base) {
config.inputs[1].name = "Y";
config.inputs[1].dims = {64, 1};
tester.Init(config);
tester.Run();
}
tester.Run();
}
} // namespace benchmark
......
......@@ -57,8 +57,7 @@ class OpTester {
std::string type_;
framework::OpDesc op_desc_;
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
std::vector<std::string> inputs_;
std::vector<std::string> outputs_;
std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
std::unique_ptr<framework::OperatorBase> op_;
platform::Place place_;
std::unique_ptr<framework::Scope> scope_;
......
......@@ -33,21 +33,64 @@ static bool EndWith(const std::string& str, const std::string& substr) {
return str.rfind(substr) == (str.length() - substr.length());
}
static void EraseEndSep(std::string* str) {
std::string substr = kSepBetweenItems;
static void EraseEndSep(std::string* str,
std::string substr = kSepBetweenItems) {
if (EndWith(*str, substr)) {
str->erase(str->length() - substr.length(), str->length());
}
}
static std::vector<int64_t> ParseDims(std::string dims_str) {
std::vector<int64_t> dims;
void OpInputConfig::ParseDims(std::istream& is) {
std::string dims_str;
is >> dims_str;
dims.clear();
std::string token;
std::istringstream token_stream(dims_str);
while (std::getline(token_stream, token, 'x')) {
dims.push_back(std::stoi(token));
}
return dims;
}
void OpInputConfig::ParseLoD(std::istream& is) {
std::string lod_str;
std::string start_sep =
std::string(kStartSeparator) + std::string(kStartSeparator);
std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
std::string sep;
is >> sep;
if (StartWith(sep, start_sep)) {
lod_str += sep;
while (!EndWith(sep, end_sep)) {
is >> sep;
lod_str += sep;
}
}
EraseEndSep(&lod_str);
PADDLE_ENFORCE_GE(lod_str.length(), 4U);
VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
// Parse the lod_str
lod.clear();
for (size_t i = 1; i < lod_str.length() - 1;) {
if (lod_str[i] == '{') {
std::vector<size_t> level;
while (lod_str[i] != '}') {
++i;
std::string number;
while (lod_str[i] >= '0' && lod_str[i] <= '9') {
number += lod_str[i];
++i;
}
level.push_back(atoi(number.c_str()));
}
lod.push_back(level);
} else if (lod_str[i] == '}') {
++i;
}
}
}
OpInputConfig::OpInputConfig(std::istream& is) {
......@@ -60,9 +103,9 @@ OpInputConfig::OpInputConfig(std::istream& is) {
is >> name;
EraseEndSep(&name);
} else if (sep == "dims" || sep == "dims:") {
std::string dims_str;
is >> dims_str;
dims = ParseDims(dims_str);
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
......@@ -76,7 +119,7 @@ OpTesterConfig::OpTesterConfig(const std::string& filename) {
Init(fin);
}
void OpTesterConfig::Init(std::istream& is) {
bool OpTesterConfig::Init(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
......@@ -95,9 +138,40 @@ void OpTesterConfig::Init(std::istream& is) {
} else if (sep == "input" || sep == "input:") {
OpInputConfig input_config(is);
inputs.push_back(input_config);
} else if (sep == "attrs" || sep == "attrs:") {
ParseAttrs(is);
} else {
if (sep != kEndSeparator) {
return false;
}
}
}
} else {
return false;
}
return true;
}
bool OpTesterConfig::ParseAttrs(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (true) {
std::string key;
is >> key;
if (key == kEndSeparator) {
break;
}
std::string value;
is >> value;
EraseEndSep(&key, ":");
EraseEndSep(&value);
attrs[key] = value;
}
}
return true;
}
const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>
namespace paddle {
......@@ -26,19 +27,27 @@ struct OpInputConfig {
OpInputConfig() {}
explicit OpInputConfig(std::istream& is);
void ParseDims(std::istream& is);
void ParseLoD(std::istream& is);
std::string name;
std::vector<int64_t> dims;
std::vector<std::vector<size_t>> lod;
};
struct OpTesterConfig {
OpTesterConfig() {}
explicit OpTesterConfig(const std::string& filename);
void Init(std::istream& is);
bool Init(std::istream& is);
bool ParseAttrs(std::istream& is);
const OpInputConfig* GetInput(const std::string& name);
std::string op_type;
std::vector<OpInputConfig> inputs;
std::unordered_map<std::string, std::string> attrs;
int device_id{-1}; // CPU: -1
int repeat{1};
int profile{0};
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
......@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
}
}
template <typename T>
static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
const LoDTensor& x, LoDTensor* out,
const framework::Vector<size_t>& x_lod,
const framework::Vector<size_t>& ref_lod,
bool do_copy) {
auto out_data = out->data<T>();
auto x_data = x.data<T>();
auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
int x_item_length = x.numel() / x.dims()[0];
int out_offset = 0;
int num_copys = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1];
int x_start = x_lod[i - 1];
int x_end = x_lod[i];
int x_seq_len = x_end - x_start;
if (repeat_num > 0) {
if (do_copy) {
int out_start = out_offset;
if (out->lod().size() == 1) {
out_start = out->lod()[0][out_offset];
}
for (int j = 0; j < repeat_num; j++) {
for (int k = 0; k < x_seq_len; k++) {
memory::Copy(
gpu_place,
out_data + (out_start + j * x_seq_len + k) * x_item_length,
gpu_place, x_data + (x_start + k) * x_item_length,
sizeof(T) * x_item_length, context.stream());
}
}
} else {
num_copys += repeat_num * x_seq_len;
}
}
out_offset += repeat_num;
}
return num_copys;
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()(
......@@ -95,22 +139,40 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* out) {
int x_item_length = x.numel() / x.dims()[0];
framework::Vector<size_t> out_offset(x_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
int num_copys =
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (num_copys < 5) {
ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
} else {
int x_item_length = x.numel() / x.dims()[0];
size_t x_lod_size = x_lod.size();
framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
GetOutputOffset(x_lod, ref_lod, &out_offset);
for (size_t i = 0; i < x_lod_size; ++i) {
out_offset[x_lod_size + i] = x_lod[i];
}
for (size_t i = 0; i < ref_lod.size(); ++i) {
out_offset[2 * x_lod_size + i] = ref_lod[i];
}
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod.CUDAData(context.GetPlace()),
ref_lod.CUDAData(context.GetPlace()),
out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
out->mutable_data<T>(context.GetPlace()));
const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
const size_t* x_lod_data = out_offset_data + x_lod_size;
const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
int thread_x =
std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
int thread_y = 16;
int thread_z = 1024 / thread_x / thread_y;
int block_x = static_cast<int>(ref_lod.size());
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);
sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
x_item_length, out->mutable_data<T>(context.GetPlace()));
}
}
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册