fix reduce_gather in case of enable_mem_sharing == false (#1186)

ccc3b389 · Jinhui Yuan · GitHub · 28a6fc98 · ccc3b389 · ccc3b389
3 changed files
--- a/oneflow/core/actor/reduce_gather_compute_actor.cpp
+++ b/oneflow/core/actor/reduce_gather_compute_actor.cpp
@@ -3,7 +3,8 @@
 namespace oneflow {

 void ReduceGatherCompActor::SetKernelCtxOther(void** other) {
-  other_val_ = InBnId4RegstDescId(cur_processed_regst_desc_id());
+  int64_t in_bn_id = InBnId4RegstDescId(cur_processed_regst_desc_id());  // input_bns() index
+  other_val_ = std::make_pair(in_bn_id, EnableInplace());  // (input index, in-place flag)
  *other = static_cast<void*>(&other_val_);
 }


--- a/oneflow/core/actor/reduce_gather_compute_actor.h
+++ b/oneflow/core/actor/reduce_gather_compute_actor.h
@@ -15,7 +15,7 @@ class ReduceGatherCompActor final : public InputWiseCompActor {
  void VirtualCompActorInit(const TaskProto& proto) override { InputWiseCompActor::Init(proto); }
  void SetKernelCtxOther(void** other) override;

-  int64_t other_val_;
+  std::pair<int64_t, bool> other_val_;
 };

 }  // namespace oneflow

--- a/oneflow/core/kernel/reduce_gather_kernel.cpp
+++ b/oneflow/core/kernel/reduce_gather_kernel.cpp
@@ -5,15 +5,17 @@ namespace oneflow {
 template<DeviceType device_type>
 void ReduceGatherKernel<device_type>::ForwardDataContent(
    const KernelCtx& ctx, std::function<Blob*(const std::string&)> BnInOp2Blob) const {
-  if (device_type == DeviceType::kGPU) { return; }
-  int64_t in_bn_id = *static_cast<int64_t*>(ctx.other);
+  const auto* other_val = static_cast<std::pair<int64_t, bool>*>(ctx.other);
+  int64_t in_bn_id = other_val->first;  // indexes input_bns() and data_offset() below
+  bool is_inplace = other_val->second;  // presumably the actor's EnableInplace() -- confirm
+  if (is_inplace) { return; }  // no copy; presumably "in" already aliases "out" -- confirm

  Blob* out_blob = BnInOp2Blob("out");
  char* dst_cur_dptr = out_blob->mut_dptr<char>();
  dst_cur_dptr += this->kernel_conf().reduce_gather_conf().data_offset().Get(in_bn_id);
  Blob* in_blob = BnInOp2Blob(this->op_attribute().input_bns().Get(in_bn_id));
  size_t in_byte_size = in_blob->ByteSizeOfDataContentField();
-  Memcpy<DeviceType::kCPU>(ctx.device_ctx, dst_cur_dptr, in_blob->dptr<char>(), in_byte_size);
+  Memcpy<device_type>(ctx.device_ctx, dst_cur_dptr, in_blob->dptr<char>(), in_byte_size);
 }

 ADD_DEVICE_TYPE_KERNEL_CREATOR(OperatorConf::kReduceGatherConf, ReduceGatherKernel);