Unverified commit 67163fb4, authored by Guoxia Wang, committed by GitHub

fix the bug of margin cross entropy loss for eager mode (#43161)

Parent 85baa3c0
@@ -15,7 +15,7 @@
 #pragma once
 #include "paddle/fluid/eager/eager_tensor.h"
-#include "paddle/phi/api/all.h"
+#include "paddle/phi/api/include/tensor.h"
 namespace egr {
 namespace egr_utils_api {
...
@@ -26,10 +26,12 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -63,6 +65,20 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place,
   framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device);
   int* num_classes_per_device_ptr = num_classes_per_device.data<int>();
+  auto map = distributed::ProcessGroupMapFromGid::getInstance();
+  if (map->has(rid)) {
+    // Use ProcessGroup
+    distributed::ProcessGroup* pg = map->get(rid);
+    std::vector<phi::DenseTensor> in_tensor;
+    std::vector<phi::DenseTensor> out_tensor;
+    in_tensor.push_back(num_classes_per_device);
+    out_tensor.push_back(num_classes_per_device);
+    distributed::AllreduceOptions opts;
+    opts.reduce_op = distributed::ReduceOp::SUM;
+    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    task->Wait();
+  } else {
   const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
   // use global calculate stream
   const auto calcu_stream =
@@ -76,6 +92,7 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place,
           platform::ToNCCLDataType(
               framework::TransToProtoVarType(num_classes_per_device.dtype())),
           ncclSum, comm->comm(), calcu_stream));
+  }
   auto class_interval_ptr =
       class_interval->mutable_data<int>({nranks + 1}, place);
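Note on the hunk above: when a distributed::ProcessGroup has been registered for the ring id rid (which is how eager mode exposes its communication groups), the SUM allreduce of per-rank class counts now goes through pg->AllReduce and waits on the returned task; otherwise it falls back to raw NCCL on the communicator from NCCLCommContext, as before. A minimal sketch of that dispatch as a standalone helper, using only calls visible in this diff; the helper name AllReduceSumInPlace and its signature are hypothetical and not part of the patch:

// Hypothetical helper (illustration only): in-place SUM allreduce that
// follows the same ProcessGroup-vs-NCCL dispatch as the hunk above.
// Assumes it lives in the same translation unit, so the headers added
// by this commit are already included.
template <typename T>
void AllReduceSumInPlace(phi::DenseTensor* tensor, int rid,
                         const platform::Place& place, gpuStream_t stream) {
  auto map = distributed::ProcessGroupMapFromGid::getInstance();
  if (map->has(rid)) {
    // Eager mode: a ProcessGroup is registered for this ring id.
    distributed::ProcessGroup* pg = map->get(rid);
    std::vector<phi::DenseTensor> in_tensor{*tensor};
    std::vector<phi::DenseTensor> out_tensor{*tensor};
    distributed::AllreduceOptions opts;
    opts.reduce_op = distributed::ReduceOp::SUM;
    pg->AllReduce(in_tensor, out_tensor, opts)->Wait();
  } else {
    // Legacy path: raw NCCL on the communicator owned by NCCLCommContext.
    const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
    T* ptr = tensor->data<T>();
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
        ptr, ptr, tensor->numel(),
        platform::ToNCCLDataType(
            framework::TransToProtoVarType(tensor->dtype())),
        ncclSum, comm->comm(), stream));
  }
}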
@@ -228,8 +245,14 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     platform::NCCLComm* comm;
+    distributed::ProcessGroup* pg = nullptr;
     gpuStream_t stream;
     if (nranks > 1) {
+      auto map = distributed::ProcessGroupMapFromGid::getInstance();
+      if (map->has(rid)) {
+        // Use ProcessGroup
+        pg = map->get(rid);
+      } else {
       comm = platform::NCCLCommContext::Instance().Get(rid, place);
       // use global calculate stream
@@ -237,6 +260,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
               platform::DeviceContextPool::Instance().Get(place))
               ->stream();
       }
+    }
 #endif
     // allocate memory on device.
@@ -306,12 +330,24 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nranks > 1) {
+      if (pg) {
+        std::vector<phi::DenseTensor> in_tensor;
+        std::vector<phi::DenseTensor> out_tensor;
+        in_tensor.push_back(logits_max);
+        out_tensor.push_back(logits_max);
+        distributed::AllreduceOptions opts;
+        opts.reduce_op = distributed::ReduceOp::MAX;
+        auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+        task->Wait();
+      } else {
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
           logits_max_buff, logits_max_buff, logits_max.numel(),
           platform::ToNCCLDataType(
               framework::TransToProtoVarType(logits_max.dtype())),
           ncclMax, comm->comm(), stream));
       }
+    }
 #endif
     // step 3, logit - logit_max
@@ -329,12 +365,24 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nranks > 1) {
+      if (pg) {
+        std::vector<phi::DenseTensor> in_tensor;
+        std::vector<phi::DenseTensor> out_tensor;
+        in_tensor.push_back(sum_exp_logits);
+        out_tensor.push_back(sum_exp_logits);
+        distributed::AllreduceOptions opts;
+        opts.reduce_op = distributed::ReduceOp::SUM;
+        auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+        task->Wait();
+      } else {
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
           sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
           platform::ToNCCLDataType(
               framework::TransToProtoVarType(sum_exp_logits.dtype())),
           ncclSum, comm->comm(), stream));
       }
+    }
 #endif
     // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max)))
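For reference, the MAX and SUM collectives in the two hunks above implement the numerically stable log-softmax named in the step comments, computed over logits that are sharded by class across ranks; the matching SUM allreduce over the per-sample loss appears in the hunk below. Writing x_j for the logits of one sample gathered across all shards and y for its target class (notation introduced here, not taken from the source):

m = \max_j x_j                                   % MAX allreduce of the per-shard maxima
\log p_i = (x_i - m) - \log \sum_j e^{x_j - m}   % SUM allreduce of the per-shard partial sums
\mathrm{loss} = -\log p_y                        % SUM allreduce; only the shard owning class y contributes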
@@ -363,12 +411,24 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nranks > 1) {
+      if (pg) {
+        std::vector<phi::DenseTensor> in_tensor;
+        std::vector<phi::DenseTensor> out_tensor;
+        in_tensor.push_back(*loss);
+        out_tensor.push_back(*loss);
+        distributed::AllreduceOptions opts;
+        opts.reduce_op = distributed::ReduceOp::SUM;
+        auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+        task->Wait();
+      } else {
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
           loss_ptr, loss_ptr, loss->numel(),
           platform::ToNCCLDataType(
               framework::TransToProtoVarType(loss->dtype())),
           ncclSum, comm->comm(), stream));
       }
+    }
 #endif
   }
 };
...
@@ -14,6 +14,7 @@
 from __future__ import print_function
+import os
 import unittest
 import paddle.fluid as fluid
@@ -23,7 +24,10 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
 class TestParallelMarginSoftmaxWithCrossEntropy(TestMultipleGpus):
     def test_parallel_margin_cross_entropy(self):
         self.run_mnist_2gpu('parallel_margin_cross_entropy.py')
+        self.run_mnist_2gpu(
+            'parallel_margin_cross_entropy.py', eager_mode=False)
 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()