From f4634d76d719810da4b4d1bfe9549ab814dfc58a Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Tue, 26 Feb 2019 11:59:10 +0800
Subject: [PATCH] Optimize the CUDA implementation of sequence_expand op by
 reducing the number of times lod data is copied from CPU to GPU. (#15493)

* Optimize the CUDA implementation of sequence_expand op by reducing the
  number of times lod data is copied from CPU to GPU.

test=develop

* Refine the op benchmark to support setting lod in config.

test=develop
---
 paddle/fluid/operators/benchmark/op_tester.cc | 53 +++++++++--
 paddle/fluid/operators/benchmark/op_tester.h  |  3 +-
 .../operators/benchmark/op_tester_config.cc   | 92 +++++++++++++++++--
 .../operators/benchmark/op_tester_config.h    | 11 ++-
 .../sequence_ops/sequence_expand_op.cu        | 92 ++++++++++++++++---
 5 files changed, 214 insertions(+), 37 deletions(-)

diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
index e179de56cd..064903c299 100644
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/benchmark/op_tester.h"
+#include <fstream>
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_info.h"
@@ -28,6 +29,7 @@ namespace operators {
 namespace benchmark {
 
 DEFINE_string(op_config_list, "", "Path of op config file.");
+DEFINE_int32(specified_config_id, -1, "Test the specified op config.");
 
 void OpTester::Init(const std::string &filename) {
   Init(OpTesterConfig(filename));
@@ -147,7 +149,7 @@ void OpTester::CreateInputVarDesc() {
     var->SetShape(input->dims);
 
     op_desc_.SetInput(name, {var_name});
-    inputs_.push_back(var_name);
+    input_lods_[var_name] = input->lod;
   }
 }
 
@@ -162,7 +164,6 @@ void OpTester::CreateOutputVarDesc() {
     var->SetDataType(framework::proto::VarType::FP32);
 
     op_desc_.SetOutput(name, {var_name});
-    outputs_.push_back(var_name);
   }
 }
 
@@ -218,16 +219,26 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     }
   }
 
-  // Allocate memory for input tensor
-  for (auto &name : inputs_) {
-    VLOG(3) << "Allocate memory for tensor " << name;
-    auto &var_desc = vars_[name];
+  for (auto &item : input_lods_) {
+    // Allocate memory for input tensor
+    auto &var_name = item.first;
+    VLOG(3) << "Allocate memory for tensor " << var_name;
+
+    auto &var_desc = vars_[var_name];
     std::vector<int64_t> shape = var_desc->GetShape();
-    auto *var = scope->Var(name);
+    auto *var = scope->Var(var_name);
     auto *tensor = var->GetMutable<framework::LoDTensor>();
     SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
                        static_cast<float>(1.0));
+
+    VLOG(3) << "Set lod for tensor " << var_name;
+    std::vector<std::vector<size_t>> &lod_vec = item.second;
+    framework::LoD lod;
+    for (size_t i = 0; i < lod_vec.size(); ++i) {
+      lod.push_back(lod_vec[i]);
+    }
+    tensor->set_lod(lod);
   }
 }
 
@@ -282,10 +293,32 @@ std::string OpTester::DebugString() {
 }
 
 TEST(op_tester, base) {
-  OpTester tester;
   if (!FLAGS_op_config_list.empty()) {
-    tester.Init(FLAGS_op_config_list);
+    std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
+                   FLAGS_op_config_list.c_str());
+    std::vector<OpTesterConfig> op_configs;
+    while (!fin.eof()) {
+      OpTesterConfig config;
+      bool result = config.Init(fin);
+      if (result) {
+        op_configs.push_back(config);
+      }
+    }
+    if (FLAGS_specified_config_id >= 0 &&
+        FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
+      OpTester tester;
+      tester.Init(op_configs[FLAGS_specified_config_id]);
+      tester.Run();
+    } else {
+      for (size_t i = 0; i < op_configs.size(); ++i) {
+        OpTester tester;
+        tester.Init(op_configs[i]);
+        tester.Run();
+      }
+    }
   } else {
+    OpTester tester;
     OpTesterConfig config;
     config.op_type = "elementwise_add";
     config.inputs.resize(2);
@@ -294,8 +327,8 @@ TEST(op_tester, base) {
     config.inputs[1].name = "Y";
     config.inputs[1].dims = {64, 1};
     tester.Init(config);
+    tester.Run();
   }
-  tester.Run();
 }
 
 }  // namespace benchmark
diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h
index 1723d46c47..8f150b23ad 100644
--- a/paddle/fluid/operators/benchmark/op_tester.h
+++ b/paddle/fluid/operators/benchmark/op_tester.h
@@ -57,8 +57,7 @@ class OpTester {
   std::string type_;
   framework::OpDesc op_desc_;
   std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
-  std::vector<std::string> inputs_;
-  std::vector<std::string> outputs_;
+  std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   std::unique_ptr<framework::Scope> scope_;
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc
index 3db8de7f76..8336804ec0 100644
--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -33,21 +33,64 @@ static bool EndWith(const std::string& str, const std::string& substr) {
   return str.rfind(substr) == (str.length() - substr.length());
 }
 
-static void EraseEndSep(std::string* str) {
-  std::string substr = kSepBetweenItems;
+static void EraseEndSep(std::string* str,
+                        std::string substr = kSepBetweenItems) {
   if (EndWith(*str, substr)) {
     str->erase(str->length() - substr.length(), str->length());
   }
 }
 
-static std::vector<int64_t> ParseDims(std::string dims_str) {
-  std::vector<int64_t> dims;
+void OpInputConfig::ParseDims(std::istream& is) {
+  std::string dims_str;
+  is >> dims_str;
+
+  dims.clear();
   std::string token;
   std::istringstream token_stream(dims_str);
   while (std::getline(token_stream, token, 'x')) {
     dims.push_back(std::stoi(token));
   }
-  return dims;
+}
+
+void OpInputConfig::ParseLoD(std::istream& is) {
+  std::string lod_str;
+  std::string start_sep =
+      std::string(kStartSeparator) + std::string(kStartSeparator);
+  std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
+
+  std::string sep;
+  is >> sep;
+  if (StartWith(sep, start_sep)) {
+    lod_str += sep;
+    while (!EndWith(sep, end_sep)) {
+      is >> sep;
+      lod_str += sep;
+    }
+  }
+  EraseEndSep(&lod_str);
+  PADDLE_ENFORCE_GE(lod_str.length(), 4U);
+  VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
+
+  // Parse the lod_str
+  lod.clear();
+  for (size_t i = 1; i < lod_str.length() - 1;) {
+    if (lod_str[i] == '{') {
+      std::vector<size_t> level;
+      while (lod_str[i] != '}') {
+        ++i;
+
+        std::string number;
+        while (lod_str[i] >= '0' && lod_str[i] <= '9') {
+          number += lod_str[i];
+          ++i;
+        }
+        level.push_back(atoi(number.c_str()));
+      }
+      lod.push_back(level);
+    } else if (lod_str[i] == '}') {
+      ++i;
+    }
+  }
 }
 
 OpInputConfig::OpInputConfig(std::istream& is) {
@@ -60,9 +103,9 @@ OpInputConfig::OpInputConfig(std::istream& is) {
       is >> name;
       EraseEndSep(&name);
     } else if (sep == "dims" || sep == "dims:") {
-      std::string dims_str;
-      is >> dims_str;
-      dims = ParseDims(dims_str);
+      ParseDims(is);
+    } else if (sep == "lod" || sep == "lod:") {
+      ParseLoD(is);
     }
   }
 }
@@ -76,7 +119,7 @@ OpTesterConfig::OpTesterConfig(const std::string& filename) {
   Init(fin);
 }
 
-void OpTesterConfig::Init(std::istream& is) {
+bool OpTesterConfig::Init(std::istream& is) {
   std::string sep;
   is >> sep;
   if (sep == kStartSeparator) {
@@ -95,9 +138,40 @@ void OpTesterConfig::Init(std::istream& is) {
       } else if (sep == "input" || sep == "input:") {
         OpInputConfig input_config(is);
         inputs.push_back(input_config);
+      } else if (sep == "attrs" || sep == "attrs:") {
+        ParseAttrs(is);
+      } else {
+        if (sep != kEndSeparator) {
+          return false;
+        }
       }
     }
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool OpTesterConfig::ParseAttrs(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (true) {
+      std::string key;
+      is >> key;
+      if (key == kEndSeparator) {
+        break;
+      }
+
+      std::string value;
+      is >> value;
+      EraseEndSep(&key, ":");
+      EraseEndSep(&value);
+
+      attrs[key] = value;
+    }
   }
+  return true;
 }
 
 const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h
index f7b62cb8ad..c2ff6dafc0 100644
--- a/paddle/fluid/operators/benchmark/op_tester_config.h
+++ b/paddle/fluid/operators/benchmark/op_tester_config.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <istream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace paddle {
@@ -26,19 +27,27 @@ struct OpInputConfig {
   OpInputConfig() {}
   explicit OpInputConfig(std::istream& is);
 
+  void ParseDims(std::istream& is);
+  void ParseLoD(std::istream& is);
+
   std::string name;
   std::vector<int64_t> dims;
+  std::vector<std::vector<size_t>> lod;
 };
 
 struct OpTesterConfig {
   OpTesterConfig() {}
   explicit OpTesterConfig(const std::string& filename);
-  void Init(std::istream& is);
+
+  bool Init(std::istream& is);
+
+  bool ParseAttrs(std::istream& is);
 
   const OpInputConfig* GetInput(const std::string& name);
 
   std::string op_type;
   std::vector<OpInputConfig> inputs;
+  std::unordered_map<std::string, std::string> attrs;
   int device_id{-1};  // CPU: -1
   int repeat{1};
   int profile{0};
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
index afc08c7b3f..888d1a12e6 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
   }
 }
 
+template <typename T>
+static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
+                              const LoDTensor& x, LoDTensor* out,
+                              const framework::Vector<size_t>& x_lod,
+                              const framework::Vector<size_t>& ref_lod,
+                              bool do_copy) {
+  auto out_data = out->data<T>();
+  auto x_data = x.data<T>();
+
+  auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
+
+  int x_item_length = x.numel() / x.dims()[0];
+  int out_offset = 0;
+  int num_copys = 0;
+  for (size_t i = 1; i < ref_lod.size(); ++i) {
+    int repeat_num = ref_lod[i] - ref_lod[i - 1];
+    int x_start = x_lod[i - 1];
+    int x_end = x_lod[i];
+    int x_seq_len = x_end - x_start;
+    if (repeat_num > 0) {
+      if (do_copy) {
+        int out_start = out_offset;
+        if (out->lod().size() == 1) {
+          out_start = out->lod()[0][out_offset];
+        }
+        for (int j = 0; j < repeat_num; j++) {
+          for (int k = 0; k < x_seq_len; k++) {
+            memory::Copy(
+                gpu_place,
+                out_data + (out_start + j * x_seq_len + k) * x_item_length,
+                gpu_place, x_data + (x_start + k) * x_item_length,
+                sizeof(T) * x_item_length, context.stream());
+          }
+        }
+      } else {
+        num_copys += repeat_num * x_seq_len;
+      }
+    }
+    out_offset += repeat_num;
+  }
+  return num_copys;
+}
+
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
   void operator()(
@@ -95,22 +139,40 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
-    int x_item_length = x.numel() / x.dims()[0];
-    framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, &out_offset);
-
-    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
-    int thread_y = 16;
-    int thread_z = 1024 / thread_x / thread_y;
-    int block_x = static_cast<int>(ref_lod.size());
-    dim3 block_size(thread_x, thread_y, thread_z);
-    dim3 grid_size(block_x, 1);
+    int num_copys =
+        ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
+    // Sometimes direct copies will be faster; this may need deeper analysis.
+    if (num_copys < 5) {
+      ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
+    } else {
+      int x_item_length = x.numel() / x.dims()[0];
+      size_t x_lod_size = x_lod.size();
+      framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
+      GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+      for (size_t i = 0; i < x_lod_size; ++i) {
+        out_offset[x_lod_size + i] = x_lod[i];
+      }
+      for (size_t i = 0; i < ref_lod.size(); ++i) {
+        out_offset[2 * x_lod_size + i] = ref_lod[i];
+      }
 
-    sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), x_lod.CUDAData(context.GetPlace()),
-        ref_lod.CUDAData(context.GetPlace()),
-        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
-        out->mutable_data<T>(context.GetPlace()));
+      const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
+      const size_t* x_lod_data = out_offset_data + x_lod_size;
+      const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
+
+      int thread_x =
+          std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+      int thread_y = 16;
+      int thread_z = 1024 / thread_x / thread_y;
+      int block_x = static_cast<int>(ref_lod.size());
+      dim3 block_size(thread_x, thread_y, thread_z);
+      dim3 grid_size(block_x, 1);
+
+      sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+          x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
+          x_item_length, out->mutable_data<T>(context.GetPlace()));
+    }
   }
 };
-- 
GitLab
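
A usage sketch, not part of the patch: with the parsing added above, one entry in the file passed via the op_config_list flag might look roughly like the block below. The input keys (name, dims, lod) and the attrs block follow the parser code in this change; the op_type key, the brace-delimited layout, and the concrete shapes, lod values, and ref_level attribute are assumptions for illustration, not values taken from the repository.

    {
      op_type: sequence_expand
      input {
        name: X
        dims: 4x16
        lod: {{0, 1, 4}}
      }
      input {
        name: Y
        dims: 8x16
        lod: {{0, 3, 8}}
      }
      attrs {
        ref_level: 0
      }
    }

With several such entries in one file, the specified_config_id flag added above selects a single entry to benchmark; otherwise the updated TEST(op_tester, base) runs every entry in turn.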