diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
index 4261a5f2534c85e2d68a8ed44ad0b91a86b66667..7a93d2db0dd1ce67b6bb4c04f1d9a4c4ea424f82 100644
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -205,6 +205,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
       TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor);
       seed_data = static_cast<uint64_t>(seed_cpu_tensor.data<int>()[0]);
       increment = offset;
+    } else if (seed && platform::is_cpu_place(seed->place())) {
+      seed_data = *(seed->data<int>());
+      increment = offset;
     } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) {
       auto seed_offset = gen_cuda->IncrementOffset(offset);
       seed_data = seed_offset.first;
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 9700b9a2f7a1c2adea5e7cf76c30be3631d28372..cbfb795d6a23e1837b2a4095a4480c979e6b6d3b 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -42,6 +42,19 @@ class DropoutOp : public framework::OperatorWithKernel {
     return framework::OpKernelType(
         OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
   }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "Seed") {
+      VLOG(10) << "var_name:" << var_name
+               << " does not need to transform in dropout op";
+      return expected_kernel_type;
+    }
+
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
 };
 
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc
index 2f3e4c9ba88c3952146d91947eb53ef8f3340905..32daa8c3934aeded83f65e3c3a1da7cb60088529 100644
--- a/paddle/fluid/operators/seed_op.cc
+++ b/paddle/fluid/operators/seed_op.cc
@@ -39,6 +39,12 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddOutput("Out", "The output of seed op.");
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
+    AddAttr<bool>("force_cpu",
+                  "(bool, default false) Force fill output variable to cpu "
+                  "memory. Otherwise, fill output variable to the running "
+                  "device")
+        .SetDefault(false)
+        .AsExtra();
     AddComment(R"DOC(
 Seed Operator.
 )DOC");
@@ -55,3 +61,15 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(
     seed, ops::CPUSeedKernel<paddle::platform::CPUDeviceContext, int>);
+
+/* ========================== register checkpoint ===========================*/
+REGISTER_OP_VERSION(seed)
+    .AddCheckpoint(
+        R"ROC(
+             Upgrade seed, add a new attribute [force_cpu])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "force_cpu",
+            "If true, force fill output variable to cpu "
+            "memory. Otherwise, fill output variable to the running "
+            "device",
+            false));
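The seed_op.cc change above exposes the new force_cpu attribute and registers an op-version checkpoint for it. A minimal usage sketch of the attribute from a static-graph program follows; it is not part of the diff, the variable name seed_out is illustrative, and it assumes a Paddle build containing these changes.

import paddle
import paddle.fluid as fluid

paddle.enable_static()

# Build a program whose only op is the seed op, carrying the new force_cpu attr.
main_program = fluid.Program()
with fluid.program_guard(main_program):
    seed_out = main_program.global_block().create_var(
        name="seed_out", shape=[1], dtype='int32', persistable=False)
    main_program.global_block().append_op(
        type='seed',
        inputs={},
        outputs={'Out': seed_out},
        attrs={'seed': 123, 'force_cpu': True})

# With force_cpu=True the kernel fills the output on CPU even when the
# program runs on a CUDA place; fetching works either way.
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() \
    else fluid.CPUPlace()
exe = fluid.Executor(place)
seed_val, = exe.run(main_program, fetch_list=[seed_out.name])
print(seed_val)  # a fixed non-zero 'seed' attribute yields exactly 123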
+#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/seed_op.h" namespace paddle { @@ -20,10 +21,10 @@ namespace operators { template class GPUSeedKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); + void Compute(const framework::ExecutionContext &context) const override { + auto *out = context.Output("Out"); int user_seed = context.Attr("seed"); + auto force_cpu = context.Attr("force_cpu"); std::random_device rnd; int seed; if (user_seed != 0) { @@ -31,11 +32,24 @@ class GPUSeedKernel : public framework::OpKernel { } else { seed = rnd(); } - auto target_gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, - sizeof(int), stream); + + bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(context.GetPlace()); + out->mutable_data(platform::CPUPlace()); + math::SetConstant functor; + functor(reinterpret_cast(dev_ctx), + out, static_cast(seed)); + } else { + auto *out_data = out->mutable_data(context.GetPlace()); + auto target_gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, + sizeof(int), stream); + } } }; diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index f8b513fca4824c3c8e242326f99e6c840520e7a3..671f397d4eaffca21c3562d73de6d82b715ffc91 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -14,6 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 8bf27f6d2fd988b91cbd584a6dc0539a3935563e..7aa3c888f2ad188b98a3de052335aaa9cfd90fc6 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -197,13 +197,18 @@ class ProgramStats(object): if op.desc.has_attr(op_device_attr_name): op_device = op.desc.attr(op_device_attr_name) + # Setting the force_cpu of seed to true will make the output of seed in cpu memory, + # reduce the synchronous copy from GPU to CPU in dropout, and reduce the communication hang added_op = self.block._insert_op( index=op.idx, type='seed', inputs={}, outputs={'Out': [added_var]}, - attrs={'seed': seed, - 'op_device': op_device}) + attrs={ + 'seed': seed, + 'op_device': op_device, + 'force_cpu': True + }) self.ops.insert(op_idx, added_op) # modify dropout op desc so that it accept a seed var as input op.desc.set_input("Seed", [var_unique_name]) diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 89755d0365f2cb64ed2fd561ebcf169a89fc8e20..396d55b3d0a8b5b2887e97b3369956b9b7f96ba6 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -232,6 +232,75 @@ class TestFP16DropoutOp2(TestFP16DropoutOp): self.fix_seed = False +class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase): + def 
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index 89755d0365f2cb64ed2fd561ebcf169a89fc8e20..396d55b3d0a8b5b2887e97b3369956b9b7f96ba6 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -232,6 +232,75 @@ class TestFP16DropoutOp2(TestFP16DropoutOp):
         self.fix_seed = False
 
 
+class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
+    def test_seed_cpu_place(self):
+        paddle.enable_static()
+        main_program = Program()
+        with program_guard(main_program):
+            seed_input_name = "tensor@SeedInput"
+            x_var_name = "tensor@X"
+            x_out_var = "tensor@XOut"
+
+            mask_var_name = "tensor@Mask"
+            seed_input_var = main_program.global_block().create_var(
+                name=seed_input_name,
+                shape=[1],
+                dtype='int32',
+                persistable=False,
+                stop_gradient=True)
+            x_out_var = main_program.global_block().create_var(
+                name=x_out_var,
+                shape=[40, 40],
+                dtype='float32',
+                persistable=False,
+                stop_gradient=True)
+            x_var = main_program.global_block().create_var(
+                name=x_var_name,
+                shape=[40, 40],
+                dtype='float32',
+                persistable=False,
+                stop_gradient=True)
+            mask_var = main_program.global_block().create_var(
+                name=mask_var_name,
+                shape=[1],
+                dtype='int',
+                persistable=False,
+                stop_gradient=True)
+
+            main_program.global_block().append_op(
+                type="fill_constant",
+                outputs={"Out": x_var_name},
+                attrs={
+                    "shape": [40, 40],
+                    "dtype": x_var.dtype,
+                    "value": 1.0,
+                    "place_type": 0
+                })
+            main_program.global_block().append_op(
+                type='seed',
+                inputs={},
+                outputs={'Out': seed_input_var},
+                attrs={'seed': 1,
+                       'force_cpu': True})
+            main_program.global_block().append_op(
+                type='dropout',
+                inputs={'X': x_var,
+                        'Seed': seed_input_var},
+                attrs={'dropout_prob': 0.},
+                outputs={'Out': x_out_var,
+                         'Mask': mask_var})
+            place = fluid.CPUPlace()
+            if core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            x_out, mask_out = exe.run(
+                main_program,
+                feed={},
+                fetch_list=[x_out_var.name, mask_var.name])
+            x_in_np = np.ones([40, 40]).astype("float32")
+            self.assertTrue(np.allclose(x_out, x_in_np))
+
+
 class TestDropoutOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_seed_op.py b/python/paddle/fluid/tests/unittests/test_seed_op.py
index 7d6705f72569b60df0d4dc15f7c00556edaa9d1d..08478d7140d434faca6fdee887e8effcb82d504d 100644
--- a/python/paddle/fluid/tests/unittests/test_seed_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seed_op.py
@@ -25,7 +25,7 @@ class TestSeedOpFixSeed(OpTest):
         self.op_type = "seed"
         self.inputs = {}
         self.attrs = {"seed": 123}
-        self.outputs = {"Out": np.asarray((123)).astype('int32')}
+        self.outputs = {"Out": np.asarray((123)).astype('int')}
 
     def test_check_output(self):
         self.check_output()
@@ -36,7 +36,7 @@ class TestSeedOpDiffSeed(OpTest):
         self.op_type = "seed"
         self.inputs = {}
         self.attrs = {"seed": 0}
-        self.outputs = {"Out": np.asarray((123)).astype('int32')}
+        self.outputs = {"Out": np.asarray((123)).astype('int')}
 
     def test_check_output(self):
         self.check_output(no_check_set=["Out"])
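A possible OpTest-style case exercising the new attribute, in the same spirit as the test_seed_op.py cases above, could look like the sketch below. It is not part of the diff; the class name is made up, and it assumes the script is run from the unittests directory so that op_test is importable.

import unittest
import numpy as np
from op_test import OpTest


class TestSeedOpForceCPU(OpTest):
    def setUp(self):
        self.op_type = "seed"
        self.inputs = {}
        self.attrs = {"seed": 123, "force_cpu": True}
        # A fixed non-zero seed attribute makes the output deterministic.
        self.outputs = {"Out": np.asarray((123)).astype('int')}

    def test_check_output(self):
        self.check_output()


if __name__ == '__main__':
    unittest.main()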