Unverified commit 58c8f6b3, authored by xiayanming, committed by GitHub

[hybrid] seed and dropout op support force-cpu (#35820)

* [HIP] fix ops not supporting AMD GPU; the PADDLE_WITH_ROCM flag is invalid

* [HIP] fix ops not supporting AMD GPU; the PADDLE_WITH_ROCM flag is invalid

* [HIP] fix ops not supporting AMD GPU

* [hybrid] seed and dropout op support force-cpu

* [hybrid] seed and dropout op support force-cpu

* [hybrid] seed and dropout op support force-cpu

* [hybrid] seed and dropout op support force-cpu

* [hybrid] seed and dropout op support force-cpu

* [hybrid] fix seed ci failed issue

* add AsExtra for force_cpu of seed op
Parent commit: 3bb4715e
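For orientation, here is a minimal sketch of how the new `force_cpu` attribute is meant to be used: a `seed` op writes its output into CPU memory, and `dropout` consumes that output through its `Seed` input, so the GPU kernel no longer needs a synchronous device-to-host copy to read the seed. This is not code from the patch; it mirrors the static-graph pattern of the new unit test further down, and all variable names are assumed.

```python
# Minimal sketch (assumed variable names; follows the pattern of the new
# TestDropoutOpWithSeedOnCPUPlace test added in this commit).
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import Program, program_guard

paddle.enable_static()
main_program = Program()
with program_guard(main_program):
    block = main_program.global_block()
    x = block.create_var(name='x', shape=[40, 40], dtype='float32')
    seed_out = block.create_var(name='seed_out', shape=[1], dtype='int32')
    drop_out = block.create_var(name='drop_out', shape=[40, 40], dtype='float32')
    drop_mask = block.create_var(name='drop_mask', shape=[40, 40], dtype='uint8')

    block.append_op(
        type='fill_constant',
        outputs={'Out': x},
        attrs={'shape': [40, 40], 'dtype': x.dtype, 'value': 1.0})
    # force_cpu=True keeps the generated seed in host memory
    block.append_op(
        type='seed',
        outputs={'Out': seed_out},
        attrs={'seed': 123, 'force_cpu': True})
    # dropout reads the CPU-resident seed directly, without a sync copy
    block.append_op(
        type='dropout',
        inputs={'X': x, 'Seed': seed_out},
        outputs={'Out': drop_out, 'Mask': drop_mask},
        attrs={'dropout_prob': 0.5})

place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
exe = fluid.Executor(place)
out_np, = exe.run(main_program, fetch_list=[drop_out])
print(out_np.shape)  # (40, 40)
```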
@@ -205,6 +205,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
      TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor);
      seed_data = static_cast<uint64_t>(seed_cpu_tensor.data<int>()[0]);
      increment = offset;
    } else if (seed && platform::is_cpu_place(seed->place())) {
      seed_data = *(seed->data<int>());
      increment = offset;
    } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) {
      auto seed_offset = gen_cuda->IncrementOffset(offset);
      seed_data = seed_offset.first;
......
@@ -42,6 +42,19 @@ class DropoutOp : public framework::OperatorWithKernel {
    return framework::OpKernelType(
        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
  }

  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    if (var_name == "Seed") {
      VLOG(10) << "var_name:" << var_name
               << " does not need to transform in dropout op";
      return expected_kernel_type;
    }
    return framework::OpKernelType(expected_kernel_type.data_type_,
                                   tensor.place(), tensor.layout());
  }
};

class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
......
@@ -39,6 +39,12 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddOutput("Out", "The output of seed op.");
    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
    AddAttr<bool>("force_cpu",
                  "(bool, default false) Force fill output variable to cpu "
                  "memory. Otherwise, fill output variable to the running "
                  "device")
        .SetDefault(false)
        .AsExtra();
    AddComment(R"DOC(
Seed Operator.
)DOC");
@@ -55,3 +61,15 @@ REGISTER_OPERATOR(
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
    seed, ops::CPUSeedKernel<paddle::platform::CPUDeviceContext, int>);

/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(seed)
    .AddCheckpoint(
        R"ROC(
Upgrade seed, add a new attribute [force_cpu])ROC",
        paddle::framework::compatible::OpVersionDesc().NewAttr(
            "force_cpu",
            "If true, force fill output variable to cpu "
            "memory. Otherwise, fill output variable to the running "
            "device",
            false));
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/seed_op.h"

namespace paddle {
@@ -20,10 +21,10 @@ namespace operators {
template <typename Place, typename T>
class GPUSeedKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto *out = context.Output<Tensor>("Out");
    int user_seed = context.Attr<int>("seed");
    auto force_cpu = context.Attr<bool>("force_cpu");
    std::random_device rnd;
    int seed;
    if (user_seed != 0) {
@@ -31,11 +32,24 @@ class GPUSeedKernel : public framework::OpKernel<T> {
    } else {
      seed = rnd();
    }

    bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace();
    if (cpu_place) {
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto &dev_ctx = *pool.Get(context.GetPlace());
      out->mutable_data<T>(platform::CPUPlace());
      math::SetConstant<platform::CPUDeviceContext, T> functor;
      functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
              out, static_cast<T>(seed));
    } else {
      auto *out_data = out->mutable_data<T>(context.GetPlace());
      auto target_gpu_place =
          BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
      auto stream = context.cuda_device_context().stream();
      memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed,
                   sizeof(int), stream);
    }
  }
};
......
@@ -14,6 +14,7 @@
#pragma once

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"

namespace paddle {
namespace operators {
......
@@ -197,13 +197,18 @@ class ProgramStats(object):
            if op.desc.has_attr(op_device_attr_name):
                op_device = op.desc.attr(op_device_attr_name)

            # Setting force_cpu of the seed op to True keeps its output in CPU memory,
            # which avoids the synchronous GPU-to-CPU copy inside dropout and reduces
            # the risk of communication hangs.
            added_op = self.block._insert_op(
                index=op.idx,
                type='seed',
                inputs={},
                outputs={'Out': [added_var]},
                attrs={
                    'seed': seed,
                    'op_device': op_device,
                    'force_cpu': True
                })
            self.ops.insert(op_idx, added_op)
            # modify dropout op desc so that it accepts a seed var as input
            op.desc.set_input("Seed", [var_unique_name])
......
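The comment added in the hunk above is the rationale for passing `force_cpu: True`: keeping the seed in CPU memory removes the synchronous GPU-to-CPU copy inside dropout during recompute. As a rough, hypothetical sanity check (not part of this patch; `main_program` is assumed to be a program already rewritten by the recompute pass), the expected op pattern could be inspected like this:

```python
# Hypothetical check, assuming `main_program` has already been processed by
# the recompute pass that inserts the seed ops shown above.
for op in main_program.global_block().ops:
    if op.type == 'seed':
        # the pass now requests a CPU-resident seed output
        assert op.attr('force_cpu') is True
    if op.type == 'dropout' and op.input('Seed'):
        # dropout was rewritten to take the seed variable as an input
        assert len(op.input('Seed')) == 1
```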
@@ -232,6 +232,75 @@ class TestFP16DropoutOp2(TestFP16DropoutOp):
        self.fix_seed = False


class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
    def test_seed_cpu_place(self):
        paddle.enable_static()
        main_program = Program()
        with program_guard(main_program):
            seed_input_name = "tensor@SeedInput"
            x_var_name = "tensor@X"
            x_out_var = "tensor@XOut"
            mask_var_name = "tensor@Mask"
            seed_input_var = main_program.global_block().create_var(
                name=seed_input_name,
                shape=[1],
                dtype='int32',
                persistable=False,
                stop_gradient=True)
            x_out_var = main_program.global_block().create_var(
                name=x_out_var,
                shape=[40, 40],
                dtype='float32',
                persistable=False,
                stop_gradient=True)
            x_var = main_program.global_block().create_var(
                name=x_var_name,
                shape=[40, 40],
                dtype='float32',
                persistable=False,
                stop_gradient=True)
            mask_var = main_program.global_block().create_var(
                name=mask_var_name,
                shape=[1],
                dtype='int',
                persistable=False,
                stop_gradient=True)

            main_program.global_block().append_op(
                type="fill_constant",
                outputs={"Out": x_var_name},
                attrs={
                    "shape": [40, 40],
                    "dtype": x_var.dtype,
                    "value": 1.0,
                    "place_type": 0
                })
            main_program.global_block().append_op(
                type='seed',
                inputs={},
                outputs={'Out': seed_input_var},
                attrs={'seed': 1,
                       'force_cpu': True})
            main_program.global_block().append_op(
                type='dropout',
                inputs={'X': x_var,
                        'Seed': seed_input_var},
                attrs={'dropout_prob': 0.},
                outputs={'Out': x_out_var,
                         'Mask': mask_var})

            place = fluid.CPUPlace()
            if core.is_compiled_with_cuda():
                place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            x_out, mask_out = exe.run(
                main_program,
                feed={},
                fetch_list=[x_out_var.name, mask_var.name])

            x_in_np = np.ones([40, 40]).astype("float32")
            self.assertTrue(np.allclose(x_out, x_in_np))


class TestDropoutOpError(unittest.TestCase):
    def test_errors(self):
        with program_guard(Program(), Program()):
......
@@ -25,7 +25,7 @@ class TestSeedOpFixSeed(OpTest):
        self.op_type = "seed"
        self.inputs = {}
        self.attrs = {"seed": 123}
        self.outputs = {"Out": np.asarray((123)).astype('int')}

    def test_check_output(self):
        self.check_output()
@@ -36,7 +36,7 @@ class TestSeedOpDiffSeed(OpTest):
        self.op_type = "seed"
        self.inputs = {}
        self.attrs = {"seed": 0}
        self.outputs = {"Out": np.asarray((123)).astype('int')}

    def test_check_output(self):
        self.check_output(no_check_set=["Out"])
......