From 28d69d710a00a04423b0a28a0c72ac2690d1f641 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Sun, 28 Apr 2019 02:11:08 -0500
Subject: [PATCH] Refine dropout gpu memory (#17095)

* refine_dropout_mem,test=develop

* # This is a combination of 14 commits.
# The first commit's message is:
remove ut test_dist_word2vec in mac ci, will fix it in private, test=develop (#17066)

# This is the 2nd commit message:

Fleet unify distributed training (#16791)

* implement distributed transpiler with fleet

# This is the 3rd commit message:

ParallelDyGraph with GPU collective mode (#16827)

implement dygraph.parallel.DataParallel to hook reduce op.

# This is the 4th commit message:

Init mixed precision training interface (#16856)

* Init mixed precision training interface

* Add fp16 test script
test=develop

* All initializers support float16
test=develop

* Code cleanup & add more code annotations
test=develop

* Update API spec
test=develop

* Add usage example in doc
test=develop

# This is the 5th commit message:

fix reference_count_pass,test=develop (#17060)

test=develop

# This is the 6th commit message:

Speedup roi_perspective_transform op by caching the information of linear interpolation in forward (#17090)

* Cache the information of linear interpolation in forward and use it in backward.
test=develop

* Fix cuda kernel.
test=develop

# This is the 7th commit message:

remove unnecessary prepare_data (#17080)

test=develop

# This is the 8th commit message:

fix interpolate cu. test=develop (#17101)

# This is the 9th commit message:

test=develop, double backward leaky_relu (#17067)

backward of backward: leaky_relu

# This is the 10th commit message:

fix fuse optimizer ops (#17102)

test=develop

# This is the 11th commit message:

truncated_gaussian_random supported in distributed training, test=develop (#17091)

# This is the 12th commit message:

Detailed coordinate description for yolov3 loss (#17007)

* Detailed coordinate description for yolov3 loss
test=develop

* modified api.spec
test=develop

* modified loss name

* fix api.spec
test=develop

* polish description
test=develop

* modified api.spec
test=develop

# This is the 13th commit message:

fix test_weight_decay (#17109)

test=develop

# This is the 14th commit message:

Path flag (#17105)

* fix python/paddle/fluid/__init__.py detecting problems
---
 paddle/fluid/operators/activation_op.h        |  2 +-
 paddle/fluid/operators/conv_cudnn_op.cu.cc    |  6 +++
 paddle/fluid/operators/dropout_op.cc          |  8 ++++
 paddle/fluid/operators/dropout_op.cu          | 39 ++++++++++++-------
 paddle/fluid/operators/dropout_op.h           | 39 ++++++++++++++-----
 python/paddle/fluid/layers/nn.py              |  2 +-
 .../fluid/tests/unittests/test_dropout_op.py  | 10 ++---
 7 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index d306e20037..5848d9dad5 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -545,7 +545,7 @@ struct ZeroGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0) / out;
+    dx.device(d) = static_cast<T>(0) * out;
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index ffcf8a5800..9a545160a1 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -222,6 +222,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         dev_ctx);
     void* cudnn_workspace_ptr =
         static_cast<void*>(cudnn_workspace.data());
+    VLOG(2) << "Cudnn workspace size fwd: "
+            << static_cast<double>(workspace_size_in_bytes) / (1 << 20)
+            << " MB";
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
@@ -473,6 +476,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             {static_cast(workspace_size_in_bytes)}),
         dev_ctx);
     cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data());
+    VLOG(2) << "Cudnn workspace size bwd: "
+            << static_cast<double>(workspace_size_in_bytes) / (1 << 20)
+            << " MB";
   }
 
   // ------------------- cudnn conv backward data ---------------------
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 65c2ff6415..273015f976 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -117,6 +117,14 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
     ctx->ShareLoD(framework::GradVarName("Out"),
                   /*->*/ framework::GradVarName("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
+  }
 };
 
 class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker {
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 7a6927d3e5..e26eba68f1 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename MaskType>
 __global__ void RandomGenerator(const size_t n, const int seed,
                                 const float dropout_prob, const T* src,
-                                T* mask_data, T* dst,
+                                MaskType* mask_data, T* dst,
                                 bool is_upscale_in_train) {
   thrust::minstd_rand rng;
   rng.seed(seed);
@@ -34,7 +34,7 @@ __global__ void RandomGenerator(const size_t n, const int seed,
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   int step_size = 0;
 
-  T mask;
+  MaskType mask;
   T dest;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
     T s = src[idx];
@@ -45,15 +45,16 @@ __global__ void RandomGenerator(const size_t n, const int seed,
       rng.discard(step_size);
     }
     if (dist(rng) < dropout_prob) {
-      mask = static_cast<T>(0);
+      mask = 0;
+      dest = 0;
     } else {
+      mask = 1;
       if (is_upscale_in_train) {
-        mask = static_cast<T>(1.0f / (1.0f - dropout_prob));
+        dest = s / static_cast<T>(1.0f - dropout_prob);
      } else {
-        mask = static_cast<T>(1);
+        dest = s;
       }
     }
-    dest = s * mask;
     mask_data[idx] = mask;
     dst[idx] = dest;
   }
@@ -71,30 +72,40 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
-    auto dropout_implementation =
+    auto& dropout_implementation =
         context.Attr<std::string>("dropout_implementation");
+    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
+
     auto& place = *context.template device_context<Place>().eigen_device();
     if (!context.Attr<bool>("is_test")) {
+      int64_t x_numel = x->numel();
+      auto stream = context.cuda_device_context().stream();
+
       auto* mask = context.Output<Tensor>("Mask");
-      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      auto* mask_data = mask->mutable_data<uint8_t>(context.GetPlace());
       size_t size = framework::product(mask->dims());
       auto* x_data = x->data<T>();
       auto* y_data = y->mutable_data<T>(context.GetPlace());
 
+      if (dropout_prob == 1.0f) {
+        PADDLE_ENFORCE(cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
+        PADDLE_ENFORCE(cudaMemsetAsync(mask_data, 0,
+                                       x_numel * sizeof(*mask_data), stream));
+        return;
+      }
       std::random_device rnd;
       int seed =
           context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
 
       int threads = 512;
-      int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<
-          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+      int grid = (x_numel + threads - 1) / threads;
+      RandomGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
          size, seed, dropout_prob, x_data, mask_data, y_data,
-          (dropout_implementation == "upscale_in_train"));
+          upscale_in_train);
     } else {
       auto X = EigenMatrix<T>::Reshape(*x, 1);
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
-      if (dropout_implementation == "upscale_in_train") {
+      if (upscale_in_train) {
         Y.device(place) = X;
       } else {
         Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 6c629b7b6d..09c4899c73 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <cstring>
 #include <random>
 #include <string>
 
@@ -37,11 +38,20 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
-    auto dropout_implementation =
+    auto& dropout_implementation =
         context.Attr<std::string>("dropout_implementation");
+    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
-      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      auto* mask_data = mask->mutable_data<uint8_t>(context.GetPlace());
+      size_t size = framework::product(mask->dims());
+
+      // Special case when dropout_prob is 1.0
+      if (dropout_prob == 1.0f) {
+        std::memset(y_data, 0, size * sizeof(*y_data));        // NOLINT
+        std::memset(mask_data, 0, size * sizeof(*mask_data));  // NOLINT
+        return;
+      }
 
       // NOTE: fixed seed should only be used in unittest or for debug.
       // Guarantee to use random seed in training.
@@ -53,17 +63,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       std::uniform_real_distribution<float> dist(0, 1);
 
-      size_t size = framework::product(mask->dims());
       for (size_t i = 0; i < size; ++i) {
         if (dist(engine) < dropout_prob) {
           mask_data[i] = 0;
           y_data[i] = 0;
         } else {
-          if (dropout_implementation == "upscale_in_train") {
-            mask_data[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
+          mask_data[i] = 1;
+          if (upscale_in_train) {
             y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
           } else {
-            mask_data[i] = 1;
             y_data[i] = x_data[i];
           }
         }
 
@@ -73,7 +81,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
-      if (dropout_implementation == "upscale_in_train") {
+      if (upscale_in_train) {
         Y.device(place) = X;
       } else {
         Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
@@ -94,13 +102,26 @@ class DropoutGradKernel : public framework::OpKernel<T> {
     auto* mask = context.Input<Tensor>("Mask");
     grad_x->mutable_data<T>(context.GetPlace());
 
-    auto M = EigenMatrix<T>::Reshape(*mask, 1);
+    auto M = EigenMatrix<uint8_t>::Reshape(*mask, 1);
     auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
     auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
 
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
-    dX.device(place) = dY * M;
+
+    auto& dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
+    if (dropout_implementation == "upscale_in_train") {
+      float dropout_prob = context.Attr<float>("dropout_prob");
+      if (dropout_prob == 1.0f) {
+        dX.device(place) = static_cast<T>(0) * dY;
+      } else {
+        dX.device(place) =
+            dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
+      }
+    } else {
+      dX.device(place) = dY * M.cast<T>();
+    }
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index cd3b7354ed..37997159b4 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1390,7 +1390,7 @@ def dropout(x,
     helper = LayerHelper('dropout', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     mask = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
+        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
 
     if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
         seed = helper.main_program.random_seed
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index be3c5f3b95..59918a7bb2 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -27,7 +27,7 @@ class TestDropoutOp(OpTest):
         self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64)).astype('float32')
+            'Mask': np.ones((32, 64)).astype('uint8')
         }
 
     def test_check_output(self):
@@ -44,7 +44,7 @@ class TestDropoutOp2(TestDropoutOp):
         self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('float32')
+            'Mask': np.zeros((32, 64)).astype('uint8')
         }
 
 
@@ -55,7 +55,7 @@ class TestDropoutOp3(TestDropoutOp):
         self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('float32')
+            'Mask': np.ones((32, 64, 2)).astype('uint8')
        }
 
 
@@ -97,7 +97,7 @@ class TestDropoutOp6(TestDropoutOp):
         }
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('float32')
+            'Mask': np.zeros((32, 64)).astype('uint8')
         }
 
 
@@ -113,7 +113,7 @@ class TestDropoutOp7(TestDropoutOp):
         }
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('float32')
+            'Mask': np.ones((32, 64, 2)).astype('uint8')
         }
-- 
GitLab
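
Reviewer note (not part of the patch): the core change is that the dropout Mask output is now stored as uint8 (one byte per element) instead of the input dtype, the 1/(1 - dropout_prob) rescaling is written into the output directly rather than being baked into the mask, and dropout_prob == 1.0 takes a fast path that simply zero-fills both output and mask. A minimal NumPy sketch of the forward semantics for reference; the function name and signature are illustrative only, not a Paddle API.

import numpy as np

def dropout_forward(x, dropout_prob, upscale_in_train, seed=0):
    # Reference forward pass mirroring the patched CPU/GPU kernels.
    # The mask is uint8 (1 byte/element); before this patch it used the
    # input dtype (e.g. 4 bytes/element for float32).
    x = np.asarray(x)
    if dropout_prob == 1.0:
        # Fast path added by the patch: everything is dropped, so output
        # and mask are just zero-filled (std::memset / cudaMemsetAsync).
        return np.zeros_like(x), np.zeros(x.shape, dtype=np.uint8)
    rng = np.random.RandomState(seed)
    keep = rng.uniform(size=x.shape) >= dropout_prob
    mask = keep.astype(np.uint8)
    if upscale_in_train:
        # Scale kept activations at train time so inference needs no scaling.
        out = np.where(keep, x / (1.0 - dropout_prob), 0.0).astype(x.dtype)
    else:
        out = (x * mask).astype(x.dtype)
    return out, mask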
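
The backward side changes correspondingly: since the stored mask is now just 0/1 in uint8, DropoutGradKernel casts it back to the gradient dtype and, for dropout_implementation == "upscale_in_train", divides by (1 - dropout_prob) in the gradient itself, with a guard for dropout_prob == 1.0. A matching NumPy sketch continuing the one above (same caveat on the illustrative name):

def dropout_backward(dout, mask, dropout_prob, upscale_in_train):
    # Reference gradient mirroring the patched DropoutGradKernel:
    #   upscale_in_train:   dX = dY * mask / (1 - p)   (dX = 0 when p == 1)
    #   downgrade_in_infer: dX = dY * mask
    dout = np.asarray(dout)
    m = mask.astype(dout.dtype)  # uint8 mask cast back to the grad dtype
    if upscale_in_train:
        if dropout_prob == 1.0:
            return np.zeros_like(dout)
        return dout * m / (1.0 - dropout_prob)
    return dout * m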
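
A quick usage check tying the two sketches together (dropout_prob = 0.5 with upscale_in_train), assuming both functions above are defined as written:

x = np.random.rand(32, 64).astype('float32')
out, mask = dropout_forward(x, dropout_prob=0.5, upscale_in_train=True, seed=1)
dx = dropout_backward(np.ones_like(x), mask, dropout_prob=0.5, upscale_in_train=True)

assert mask.dtype == np.uint8                            # 1 byte/element vs 4 for a float32 mask
assert np.allclose(out[mask == 0], 0.0)                  # dropped activations are zeroed
assert np.allclose(out[mask == 1], x[mask == 1] * 2.0)   # kept ones are scaled by 1/(1 - p)
assert np.allclose(dx[mask == 1], 2.0)                   # gradient carries the same scale
assert np.allclose(dx[mask == 0], 0.0)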