add BCast and Gather

8eaec5dd · chengduoZH · 47a4ec06 · 8eaec5dd · 8eaec5dd · 8eaec5dd
5 changed file
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -2,8 +2,12 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
-nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+if(WITH_GPU)
+    nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
        dynload_cuda)
+    nv_library(broad_cast_op_handle SRCS broad_cast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+endif()
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
@@ -11,6 +15,8 @@ cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 if(WITH_GPU)
    set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
+    nv_test(broad_cast_op_test SRCS broad_cast_op_handle_test.cc DEPS var_handle op_handle_base scope lod_tensor ddim memory
+            device_context broad_cast_op_handle)
 else()
    set(multi_devices_graph_builder_deps)
 endif()

--- a/paddle/fluid/framework/details/broad_cast_op_handle.cc
+++ b/paddle/fluid/framework/details/broad_cast_op_handle.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/broad_cast_op_handle.h"
+namespace paddle {
+namespace framework {
+namespace details {
+Tensor *GetTensorFromVar(Variable *in_var) {
+  if (in_var->IsType<LoDTensor>()) {
+    return in_var->GetMutable<LoDTensor>();
+  } else if (in_var->IsType<SelectedRows>()) {
+    return in_var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+  }
+  return nullptr;
+}
+BCastOpHandle::BCastOpHandle(const std::vector<Scope *> &local_scopes,
+                             const std::vector<platform::Place> &places,
+                             const platform::ContextMap &ctxs)
+    : local_scopes_(local_scopes), places_(places), ctxs_(ctxs) {
+  for (auto &p : places_) {
+    this->dev_ctxes_[p] = ctxs_.DevCtx(p);
+  }
+}
+void BCastOpHandle::RunImpl() {
+  PADDLE_ENFORCE_EQ(this->inputs_.size(), 1);
+  PADDLE_ENFORCE_EQ(this->outputs_.size(), places_.size());
+  // Wait input done, this Wait is asynchronous operation
+  auto in_var_handle = static_cast<VarHandle *>(this->inputs_[0]);
+  auto &in_place = in_var_handle->place_;
+  if (inputs_[0]->generated_op_)
+    inputs_[0]->generated_op_->Wait(dev_ctxes_[in_place]);
+  auto iter = std::find(places_.begin(), places_.end(), in_place);
+  if (iter == places_.end()) {
+    PADDLE_THROW("The input of BCast is not in the places_.");
+  }
+  int offset = iter - places_.begin();
+  auto in_var = local_scopes_[offset]->FindVar(in_var_handle->name_);
+  Tensor *in_tensor = GetTensorFromVar(in_var);
+  for (auto *out : outputs_) {
+    auto out_handle = static_cast<VarHandle *>(out);
+    auto &out_p = out_handle->place_;
+    auto iter = std::find(places_.begin(), places_.end(), out_p);
+    if (iter == places_.end()) {
+      PADDLE_THROW("The output of BCast is not in the places_.");
+    }
+    int offset = iter - places_.begin();
+    auto *s = local_scopes_[offset];
+    auto out_var = s->FindVar(out_handle->name_);
+    PADDLE_ENFORCE_EQ(out_var->Type(), in_var->Type(), "");
+    if (in_var->IsType<framework::SelectedRows>()) {
+      auto in_sr = in_var->GetMutable<framework::SelectedRows>();
+      auto out = out_var->GetMutable<framework::SelectedRows>();
+      if (in_sr == out) continue;
+      out->set_height(in_sr->height());
+      out->set_rows(in_sr->rows());
+      out->mutable_value()->Resize(in_sr->value().dims());
+      out->mutable_value()->mutable_data(out_p, in_sr->value().type());
+    } else if (in_var->IsType<framework::LoDTensor>()) {
+      auto in_lod = in_var->GetMutable<framework::LoDTensor>();
+      auto out = out_var->GetMutable<framework::LoDTensor>();
+      if (in_lod == out) continue;
+      out->set_lod(in_lod->lod());
+      out->Resize(in_lod->dims());
+      out->mutable_data(out_p, in_lod->type());
+    } else {
+      PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+    }
+    Tensor *out_tensor = GetTensorFromVar(out_var);
+    paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
+                                  out_tensor);
+  }
+}
+std::string BCastOpHandle::Name() const { return "broadcast"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/broad_cast_op_handle.h
+++ b/paddle/fluid/framework/details/broad_cast_op_handle.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace framework {
+namespace details {
+/*
+ * BroadCast the input to all scope.
+ *
+ */
+struct BCastOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+  const platform::ContextMap &ctxs_;
+  BCastOpHandle(const std::vector<Scope *> &local_scopes,
+                const std::vector<platform::Place> &places,
+                const platform::ContextMap &ctxs);
+  std::string Name() const override;
+  bool IsMultiDeviceTransfer() override { return false; };
+ protected:
+  void RunImpl() override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/broad_cast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broad_cast_op_handle_test.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/broad_cast_op_handle.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+// test data amount
+const f::DDim kDims = {20, 20};
+class BroadCastTester : public ::testing::Test {
+ public:
+  void SetUp() override {
+    int count = p::GetCUDADeviceCount();
+    if (count <= 1) {
+      LOG(WARNING) << "Cannot test multi-gpu BroadCast, because the CUDA "
+                      "device count is "
+                   << count;
+      exit(0);
+    }
+    for (int i = 0; i < count; ++i) {
+      gpu_list_.emplace_back(p::CUDAPlace(i));
+    }
+    ctxs_ = new p::ContextMap(gpu_list_);
+  }
+  template <class T>
+  void BroadCastInitOp(int gpu_id = 0) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scope_.push_back(&g_scope_.NewScope());
+      auto* out_var = local_scope_[j]->Var("out");
+      out_var->GetMutable<T>();
+    }
+    auto* in_var = local_scope_[gpu_id]->Var("input");
+    in_var->GetMutable<T>();
+    bc_op_handle_ =
+        new f::details::BCastOpHandle(local_scope_, gpu_list_, *ctxs_);
+    f::details::VarHandle* in_var_handle = new f::details::VarHandle();
+    in_var_handle->place_ = gpu_list_[gpu_id];
+    in_var_handle->name_ = "input";
+    in_var_handle->version_ = 1;
+    in_var_handle->generated_op_ = nullptr;
+    bc_op_handle_->AddInput(in_var_handle);
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      f::details::VarHandle* out_var_handle = new f::details::VarHandle();
+      out_var_handle->place_ = gpu_list_[j];
+      out_var_handle->name_ = "out";
+      out_var_handle->version_ = 2;
+      out_var_handle->generated_op_ = bc_op_handle_;
+      bc_op_handle_->AddOutput(out_var_handle);
+    }
+  }
+  void BroadCastDestroy() {
+    delete ctxs_;
+    for (auto in : bc_op_handle_->inputs_) {
+      delete in;
+    }
+    for (auto out : bc_op_handle_->outputs_) {
+      delete out;
+    }
+    delete bc_op_handle_;
+  }
+ public:
+  f::Scope g_scope_;
+  p::ContextMap* ctxs_;
+  std::vector<f::Scope*> local_scope_;
+  std::vector<p::Place> gpu_list_;
+  f::details::BCastOpHandle* bc_op_handle_;
+};
+TEST_F(BroadCastTester, BroadCastTestLodTensor) {
+  int gpu_id = 0;
+  BroadCastInitOp<f::LoDTensor>(gpu_id);
+  auto in_var = local_scope_[gpu_id]->Var("input");
+  auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+  in_lod_tensor->mutable_data<float>(kDims, gpu_list_[gpu_id]);
+  std::vector<float> send_vector(f::product(kDims), gpu_id + 12);
+  for (size_t k = 0; k < send_vector.size(); ++k) {
+    send_vector[k] = k;
+  }
+  f::LoD lod{{0, 10, 20}};
+  paddle::framework::TensorFromVector<float>(
+      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), in_lod_tensor);
+  in_lod_tensor->set_lod(lod);
+  bc_op_handle_->Run(false);
+  ctxs_->WaitAll();
+  p::CPUPlace cpu_place;
+  for (size_t j = 0; j < gpu_list_.size(); ++j) {
+    auto out_var = local_scope_[j]->Var("out");
+    auto out_tensor = out_var->Get<f::LoDTensor>();
+    PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
+    f::Tensor result_tensor;
+    f::TensorCopy(out_tensor, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
+    float* ct = result_tensor.mutable_data<float>(cpu_place);
+    for (int64_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+    }
+  }
+  BroadCastDestroy();
+}
+TEST_F(BroadCastTester, BroadCastTestSelectedRows) {
+  int gpu_id = 0;
+  BroadCastInitOp<f::SelectedRows>(gpu_id);
+  auto in_var = local_scope_[gpu_id]->Var("input");
+  auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+  auto value = in_selected_rows->mutable_value();
+  value->mutable_data<float>(kDims, gpu_list_[gpu_id]);
+  int height = kDims[0] * 2;
+  std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                            2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+  in_selected_rows->set_height(height);
+  in_selected_rows->set_rows(rows);
+  std::vector<float> send_vector(f::product(kDims));
+  for (size_t k = 0; k < send_vector.size(); ++k) {
+    send_vector[k] = k;
+  }
+  paddle::framework::TensorFromVector<float>(
+      send_vector, *(ctxs_->DevCtx(gpu_list_[gpu_id])), value);
+  bc_op_handle_->Run(false);
+  ctxs_->WaitAll();
+  p::CPUPlace cpu_place;
+  for (size_t j = 0; j < gpu_list_.size(); ++j) {
+    auto out_var = local_scope_[j]->Var("out");
+    auto& out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+    }
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_->DevCtx(j)), &result_tensor);
+    float* ct = result_tensor.data<float>();
+    for (int64_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j], 1e-5);
+    }
+  }
+  BroadCastDestroy();
+}
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -2,17 +2,20 @@
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
+ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <memory>
+#include <string>
 #include <unordered_map>
+#include <vector>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
@@ -137,6 +140,45 @@ template <>
 struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
  using TYPE = CUDAPinnedDeviceContext;
 };
+class ContextMap {
+ public:
+  explicit ContextMap(const std::vector<platform::Place>& places) {
+    order_.reserve(places.size());
+    for (auto& p : places) {
+      auto dev = boost::get<CUDAPlace>(p);
+      int dev_id = dev.device;
+      order_.emplace_back(dev_id);
+      contexts_[dev_id].reset(new CUDADeviceContext(dev));
+    }
+    PADDLE_ENFORCE_EQ(
+        order_.size(), contexts_.size(),
+        "Context Map does not support contain two or more same device");
+  }
+  DeviceContext* DevCtx(int dev_id) const { return at(dev_id); }
+  DeviceContext* DevCtx(platform::Place p) const {
+    return DevCtx(boost::get<CUDAPlace>(p).device);
+  }
+  DeviceContext* at(platform::Place p) const {
+    return this->at(boost::get<CUDAPlace>(p).device);
+  }
+  DeviceContext* at(int dev_id) const { return contexts_.at(dev_id).get(); }
+  void WaitAll() {
+    for (auto& p : contexts_) {
+      p.second->Wait();
+    }
+  }
+ private:
+  std::unordered_map<int, std::unique_ptr<DeviceContext>> contexts_;
+  std::vector<int> order_;
+};
 #endif
 #ifdef PADDLE_WITH_MKLDNN