From f4634d76d719810da4b4d1bfe9549ab814dfc58a Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Tue, 26 Feb 2019 11:59:10 +0800
Subject: [PATCH] Optimize the CUDA implementation of sequence_expand op by
 reducing the number of times lod data is copied from CPU to GPU. (#15493)

* Optimize the CUDA implementation of sequence_expand op by reducing the
  number of times lod data is copied from CPU to GPU.

test=develop

* Refine the op benchmark to support setting lod in config.

test=develop
---
 paddle/fluid/operators/benchmark/op_tester.cc | 53 +++++++++--
 paddle/fluid/operators/benchmark/op_tester.h  |  3 +-
 .../operators/benchmark/op_tester_config.cc   | 92 +++++++++++++++++--
 .../operators/benchmark/op_tester_config.h    | 11 ++-
 .../sequence_ops/sequence_expand_op.cu        | 92 ++++++++++++++++---
 5 files changed, 214 insertions(+), 37 deletions(-)

diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
index e179de56cd..064903c299 100644
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/benchmark/op_tester.h"
+#include <fstream>
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_info.h"
@@ -28,6 +29,7 @@ namespace operators {
 namespace benchmark {
 
 DEFINE_string(op_config_list, "", "Path of op config file.");
+DEFINE_int32(specified_config_id, -1, "Test the specified op config.");
 
 void OpTester::Init(const std::string &filename) {
   Init(OpTesterConfig(filename));
@@ -147,7 +149,7 @@ void OpTester::CreateInputVarDesc() {
     var->SetShape(input->dims);
 
     op_desc_.SetInput(name, {var_name});
-    inputs_.push_back(var_name);
+    input_lods_[var_name] = input->lod;
   }
 }
 
@@ -162,7 +164,6 @@ void OpTester::CreateOutputVarDesc() {
     var->SetDataType(framework::proto::VarType::FP32);
 
     op_desc_.SetOutput(name, {var_name});
-    outputs_.push_back(var_name);
   }
 }
 
@@ -218,16 +219,26 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     }
   }
 
-  // Allocate memory for input tensor
-  for (auto &name : inputs_) {
-    VLOG(3) << "Allocate memory for tensor " << name;
-    auto &var_desc = vars_[name];
+  for (auto &item : input_lods_) {
+    // Allocate memory for input tensor
+    auto &var_name = item.first;
+    VLOG(3) << "Allocate memory for tensor " << var_name;
+
+    auto &var_desc = vars_[var_name];
     std::vector<int64_t> shape = var_desc->GetShape();
-    auto *var = scope->Var(name);
+    auto *var = scope->Var(var_name);
     auto *tensor = var->GetMutable<framework::LoDTensor>();
     SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
                        static_cast<float>(1.0));
+
+    VLOG(3) << "Set lod for tensor " << var_name;
+    std::vector<std::vector<size_t>> &lod_vec = item.second;
+    framework::LoD lod;
+    for (size_t i = 0; i < lod_vec.size(); ++i) {
+      lod.push_back(lod_vec[i]);
+    }
+    tensor->set_lod(lod);
   }
 }
 
@@ -282,10 +293,32 @@ std::string OpTester::DebugString() {
 }
 
 TEST(op_tester, base) {
-  OpTester tester;
   if (!FLAGS_op_config_list.empty()) {
-    tester.Init(FLAGS_op_config_list);
+    std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
+                   FLAGS_op_config_list.c_str());
+    std::vector<OpTesterConfig> op_configs;
+    while (!fin.eof()) {
+      OpTesterConfig config;
+      bool result = config.Init(fin);
+      if (result) {
+        op_configs.push_back(config);
+      }
+    }
+    if (FLAGS_specified_config_id >= 0 &&
+        FLAGS_specified_config_id < static_cast<int>(op_configs.size())) {
+      OpTester tester;
+      tester.Init(op_configs[FLAGS_specified_config_id]);
+      tester.Run();
+    } else {
+      for (size_t i = 0; i < op_configs.size(); ++i) {
+        OpTester tester;
+        tester.Init(op_configs[i]);
+        tester.Run();
+      }
+    }
   } else {
+    OpTester tester;
     OpTesterConfig config;
     config.op_type = "elementwise_add";
     config.inputs.resize(2);
@@ -294,8 +327,8 @@ TEST(op_tester, base) {
     config.inputs[1].name = "Y";
     config.inputs[1].dims = {64, 1};
     tester.Init(config);
+    tester.Run();
   }
-  tester.Run();
 }
 
 }  // namespace benchmark
diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h
index 1723d46c47..8f150b23ad 100644
--- a/paddle/fluid/operators/benchmark/op_tester.h
+++ b/paddle/fluid/operators/benchmark/op_tester.h
@@ -57,8 +57,7 @@ class OpTester {
   std::string type_;
   framework::OpDesc op_desc_;
   std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
-  std::vector<std::string> inputs_;
-  std::vector<std::string> outputs_;
+  std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   std::unique_ptr<framework::Scope> scope_;
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc
index 3db8de7f76..8336804ec0 100644
--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -33,21 +33,64 @@ static bool EndWith(const std::string& str, const std::string& substr) {
   return str.rfind(substr) == (str.length() - substr.length());
 }
 
-static void EraseEndSep(std::string* str) {
-  std::string substr = kSepBetweenItems;
+static void EraseEndSep(std::string* str,
+                        std::string substr = kSepBetweenItems) {
   if (EndWith(*str, substr)) {
     str->erase(str->length() - substr.length(), str->length());
   }
 }
 
-static std::vector<int64_t> ParseDims(std::string dims_str) {
-  std::vector<int64_t> dims;
+void OpInputConfig::ParseDims(std::istream& is) {
+  std::string dims_str;
+  is >> dims_str;
+
+  dims.clear();
   std::string token;
   std::istringstream token_stream(dims_str);
   while (std::getline(token_stream, token, 'x')) {
     dims.push_back(std::stoi(token));
   }
-  return dims;
+}
+
+void OpInputConfig::ParseLoD(std::istream& is) {
+  std::string lod_str;
+  std::string start_sep =
+      std::string(kStartSeparator) + std::string(kStartSeparator);
+  std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator);
+
+  std::string sep;
+  is >> sep;
+  if (StartWith(sep, start_sep)) {
+    lod_str += sep;
+    while (!EndWith(sep, end_sep)) {
+      is >> sep;
+      lod_str += sep;
+    }
+  }
+  EraseEndSep(&lod_str);
+  PADDLE_ENFORCE_GE(lod_str.length(), 4U);
+  VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
+
+  // Parse the lod_str
+  lod.clear();
+  for (size_t i = 1; i < lod_str.length() - 1;) {
+    if (lod_str[i] == '{') {
+      std::vector<size_t> level;
+      while (lod_str[i] != '}') {
+        ++i;
+
+        std::string number;
+        while (lod_str[i] >= '0' && lod_str[i] <= '9') {
+          number += lod_str[i];
+          ++i;
+        }
+        level.push_back(atoi(number.c_str()));
+      }
+      lod.push_back(level);
+    } else if (lod_str[i] == '}') {
+      ++i;
+    }
+  }
 }
 
 OpInputConfig::OpInputConfig(std::istream& is) {
@@ -60,9 +103,9 @@ OpInputConfig::OpInputConfig(std::istream& is) {
       is >> name;
       EraseEndSep(&name);
     } else if (sep == "dims" || sep == "dims:") {
-      std::string dims_str;
-      is >> dims_str;
-      dims = ParseDims(dims_str);
+      ParseDims(is);
+    } else if (sep == "lod" || sep == "lod:") {
+      ParseLoD(is);
     }
   }
 }
@@ -76,7 +119,7 @@ OpTesterConfig::OpTesterConfig(const std::string& filename) {
   Init(fin);
 }
 
-void OpTesterConfig::Init(std::istream& is) {
+bool OpTesterConfig::Init(std::istream& is) {
   std::string sep;
   is >> sep;
   if (sep == kStartSeparator) {
@@ -95,9 +138,40 @@ void OpTesterConfig::Init(std::istream& is) {
       } else if (sep == "input" || sep == "input:") {
         OpInputConfig input_config(is);
         inputs.push_back(input_config);
+      } else if (sep == "attrs" || sep == "attrs:") {
+        ParseAttrs(is);
+      } else {
+        if (sep != kEndSeparator) {
+          return false;
+        }
       }
     }
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool OpTesterConfig::ParseAttrs(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (true) {
+      std::string key;
+      is >> key;
+      if (key == kEndSeparator) {
+        break;
+      }
+
+      std::string value;
+      is >> value;
+      EraseEndSep(&key, ":");
+      EraseEndSep(&value);
+
+      attrs[key] = value;
+    }
   }
+  return true;
 }
 
 const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h
index f7b62cb8ad..c2ff6dafc0 100644
--- a/paddle/fluid/operators/benchmark/op_tester_config.h
+++ b/paddle/fluid/operators/benchmark/op_tester_config.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <istream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace paddle {
@@ -26,19 +27,27 @@ struct OpInputConfig {
   OpInputConfig() {}
   explicit OpInputConfig(std::istream& is);
 
+  void ParseDims(std::istream& is);
+  void ParseLoD(std::istream& is);
+
   std::string name;
   std::vector<int64_t> dims;
+  std::vector<std::vector<size_t>> lod;
 };
 
 struct OpTesterConfig {
   OpTesterConfig() {}
   explicit OpTesterConfig(const std::string& filename);
-  void Init(std::istream& is);
+
+  bool Init(std::istream& is);
+
+  bool ParseAttrs(std::istream& is);
 
   const OpInputConfig* GetInput(const std::string& name);
 
   std::string op_type;
   std::vector<OpInputConfig> inputs;
+  std::unordered_map<std::string, std::string> attrs;
   int device_id{-1};  // CPU: -1
   int repeat{1};
   int profile{0};
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
index afc08c7b3f..888d1a12e6 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -88,6 +89,49 @@ void GetOutputOffset(const framework::Vector<size_t>& x_lod,
   }
 }
 
+template <typename T>
+static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context,
+                              const LoDTensor& x, LoDTensor* out,
+                              const framework::Vector<size_t>& x_lod,
+                              const framework::Vector<size_t>& ref_lod,
+                              bool do_copy) {
+  auto out_data = out->data<T>();
+  auto x_data = x.data<T>();
+
+  auto& gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
+
+  int x_item_length = x.numel() / x.dims()[0];
+  int out_offset = 0;
+  int num_copys = 0;
+  for (size_t i = 1; i < ref_lod.size(); ++i) {
+    int repeat_num = ref_lod[i] - ref_lod[i - 1];
+    int x_start = x_lod[i - 1];
+    int x_end = x_lod[i];
+    int x_seq_len = x_end - x_start;
+    if (repeat_num > 0) {
+      if (do_copy) {
+        int out_start = out_offset;
+        if (out->lod().size() == 1) {
+          out_start = out->lod()[0][out_offset];
+        }
+        for (int j = 0; j < repeat_num; j++) {
+          for (int k = 0; k < x_seq_len; k++) {
+            memory::Copy(
+                gpu_place,
+                out_data + (out_start + j * x_seq_len + k) * x_item_length,
+                gpu_place, x_data + (x_start + k) * x_item_length,
+                sizeof(T) * x_item_length, context.stream());
+          }
+        }
+      } else {
+        num_copys += repeat_num * x_seq_len;
+      }
+    }
+    out_offset += repeat_num;
+  }
+  return num_copys;
+}
+
 template <typename T>
 struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
   void operator()(
@@ -95,22 +139,40 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
-    int x_item_length = x.numel() / x.dims()[0];
-    framework::Vector<size_t> out_offset(x_lod.size());
-    GetOutputOffset(x_lod, ref_lod, &out_offset);
-
-    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
-    int thread_y = 16;
-    int thread_z = 1024 / thread_x / thread_y;
-    int block_x = static_cast<int>(ref_lod.size());
-    dim3 block_size(thread_x, thread_y, thread_z);
-    dim3 grid_size(block_x, 1);
+    int num_copys =
+        ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
+    // Sometimes direct copies will be faster; this may need deeper analysis.
+    if (num_copys < 5) {
+      ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
+    } else {
+      int x_item_length = x.numel() / x.dims()[0];
+      size_t x_lod_size = x_lod.size();
+      framework::Vector<size_t> out_offset(x_lod_size * 2 + ref_lod.size());
+      GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+      for (size_t i = 0; i < x_lod_size; ++i) {
+        out_offset[x_lod_size + i] = x_lod[i];
+      }
+      for (size_t i = 0; i < ref_lod.size(); ++i) {
+        out_offset[2 * x_lod_size + i] = ref_lod[i];
+      }
 
-    sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
-        x.data<T>(), x_lod.CUDAData(context.GetPlace()),
-        ref_lod.CUDAData(context.GetPlace()),
-        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
-        out->mutable_data<T>(context.GetPlace()));
+      const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace());
+      const size_t* x_lod_data = out_offset_data + x_lod_size;
+      const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;
+
+      int thread_x =
+          std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+      int thread_y = 16;
+      int thread_z = 1024 / thread_x / thread_y;
+      int block_x = static_cast<int>(ref_lod.size());
+      dim3 block_size(thread_x, thread_y, thread_z);
+      dim3 grid_size(block_x, 1);
+
+      sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+          x.data<T>(), x_lod_data, ref_lod_data, out_offset_data, x_lod_size,
+          x_item_length, out->mutable_data<T>(context.GetPlace()));
+    }
   }
 };
-- 
GitLab
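
A usage sketch, not part of the patch: with the parsing added above, one entry in the file passed via the op_config_list flag might look roughly like the block below. The input keys (name, dims, lod) and the attrs block follow the parser code in this change; the op_type key, the brace-delimited layout, and the concrete shapes, lod values, and ref_level attribute are assumptions for illustration, not values taken from the repository.

    {
      op_type: sequence_expand
      input {
        name: X
        dims: 4x16
        lod: {{0, 1, 4}}
      }
      input {
        name: Y
        dims: 8x16
        lod: {{0, 3, 8}}
      }
      attrs {
        ref_level: 0
      }
    }

With several such entries in one file, the specified_config_id flag added above selects a single entry to benchmark; otherwise the updated TEST(op_tester, base) runs every entry in turn.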