Unverified commit 28d69d71, authored by Zeng Jinle, committed by GitHub

Refine dropout gpu memory (#17095)

* refine_dropout_mem,test=develop

* # This is a combination of 14 commits.
# The first commit's message is:
remove ut test_dist_word2vec in mac ci, will fix it in private, test=develop (#17066)

# This is the 2nd commit message:

Fleet unify distributed training (#16791)

* implement distributed transpiler with fleet
# This is the 3rd commit message:

ParallelDyGraph with GPU collective mode (#16827)

implement dygraph.parallel.DataParallel to hook reduce op.

# This is the 4th commit message:

Init mixed precision training interface (#16856)

* Init mixed precision training interface

* Add fp16 test script

test=develop

* All initializers support float16

test=develop

* Code cleanup & add more code annotations

test=develop

* Update API spec

test=develop

* Add usage example in doc

test=develop

# This is the 5th commit message:

fix reference_count_pass,test=develop (#17060)

test=develop
# This is the 6th commit message:

Speedup roi_perspective_transform op by caching the information of linear interpolation in forward (#17090)

* Cache the information of linear interpolation in forward and use it in backward.
test=develop

* Fix cuda kernel.
test=develop

# This is the 7th commit message:

remove unnecessary prepare_data (#17080)

test=develop
# This is the 8th commit message:

fix interpolate cu. test=develop (#17101)

# This is the 9th commit message:

test=develop, double backward leaky_relu (#17067)

backward of backward: leaky_relu
# This is the 10th commit message:

fix fuse optimizer ops (#17102)

test=develop
# This is the 11th commit message:

truncated_gaussian_random supported in distributed training, test=develop (#17091)

# This is the 12th commit message:

 Detailed coordinate description for yolov3 loss (#17007)

* Detailed coordinate description for yolov3 loss

test=develop

* modified api.spec

test=develop

* modified loss name

* fix api.spec

test=develop

* polish description

test=develop

* modified api.spec

test=develop

# This is the 13th commit message:

fix test_weight_decay (#17109)

test=develop
# This is the 14th commit message:

Path flag (#17105)

* fix python/paddle/fluid/__init__.py detecting problems
Parent b9494058
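The core of #17095 is to store the dropout Mask as uint8 (one byte per element) instead of the input dtype, and to move the 1/(1 - dropout_prob) scaling out of the mask and into the forward/backward kernels. A rough NumPy sketch of the intended behaviour (illustrative only, not the actual Paddle implementation; function names are made up):

import numpy as np

def dropout_forward(x, p, upscale_in_train=True, seed=0):
    # The mask is kept as uint8 (1 byte/element) rather than x.dtype,
    # which is where the GPU memory saving comes from.
    rng = np.random.RandomState(seed)
    if p == 1.0:
        return np.zeros_like(x), np.zeros(x.shape, dtype=np.uint8)
    mask = (rng.uniform(size=x.shape) >= p).astype(np.uint8)
    if upscale_in_train:
        y = x * mask / (1.0 - p)  # scaling applied to the output, not the mask
    else:
        y = x * mask
    return y.astype(x.dtype), mask

def dropout_backward(dy, mask, p, upscale_in_train=True):
    # Because the stored mask is an unscaled 0/1 tensor, the 1/(1 - p)
    # factor has to be reapplied in the backward pass.
    if upscale_in_train:
        return np.zeros_like(dy) if p == 1.0 else dy * mask / (1.0 - p)
    return dy * mask

For a float32 input the mask shrinks from 4 bytes to 1 byte per element, so the persistent mask memory drops by roughly 4x (2x for float16 inputs).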
@@ -545,7 +545,7 @@ struct ZeroGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0) / out;
+    dx.device(d) = static_cast<T>(0) * out;
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
......
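The ZeroGradFunctor tweak replaces a division by out with a multiplication, presumably because 0 / out yields NaN whenever out is zero, while 0 * out stays zero for finite values. A tiny NumPy check of the difference (illustrative only):

import numpy as np

out = np.array([0.0, 1.0, 2.0], dtype=np.float32)
with np.errstate(divide='ignore', invalid='ignore'):
    print(np.float32(0) / out)  # [nan  0.  0.] -- 0/0 is NaN
print(np.float32(0) * out)      # [0. 0. 0.]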
@@ -222,6 +222,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
             dev_ctx);
     void* cudnn_workspace_ptr =
         static_cast<void*>(cudnn_workspace.data<int8_t>());
+    VLOG(2) << "Cudnn workspace size fwd: "
+            << static_cast<double>(workspace_size_in_bytes) / (1 << 20)
+            << " MB";
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
@@ -473,6 +476,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             {static_cast<int64_t>(workspace_size_in_bytes)}),
         dev_ctx);
     cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
+    VLOG(2) << "Cudnn workspace size bwd: "
+            << static_cast<double>(workspace_size_in_bytes) / (1 << 20)
+            << " MB";
     }
 
     // ------------------- cudnn conv backward data ---------------------
......
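The two VLOG additions are pure diagnostics: they log the cuDNN workspace size, converting bytes to megabytes by dividing by 1 << 20. For reference (plain Python, not Paddle code):

workspace_size_in_bytes = 64 * 1024 * 1024        # example value
print(workspace_size_in_bytes / (1 << 20), "MB")  # 64.0 MB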
@@ -117,6 +117,14 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
     ctx->ShareLoD(framework::GradVarName("Out"),
                   /*->*/ framework::GradVarName("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
+  }
 };
 
 class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker {
......
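Because Mask is now a uint8 tensor, DropoutOpGrad overrides GetExpectedKernelType so that the kernel dtype is taken from GradVarName("Out"), presumably so the integer mask cannot influence kernel selection. A NumPy analogy of why the gradient math must stay in dOut's dtype (illustrative only):

import numpy as np

d_out = np.random.rand(4, 4).astype(np.float32)        # grad of Out, float32
mask = (np.random.rand(4, 4) >= 0.5).astype(np.uint8)  # stored dropout mask

# The gradient is computed in d_out's dtype; the uint8 mask is only cast.
d_x = d_out * mask.astype(d_out.dtype)
assert d_x.dtype == np.float32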
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename MaskType>
 __global__ void RandomGenerator(const size_t n, const int seed,
                                 const float dropout_prob, const T* src,
-                                T* mask_data, T* dst,
+                                MaskType* mask_data, T* dst,
                                 bool is_upscale_in_train) {
   thrust::minstd_rand rng;
   rng.seed(seed);
@@ -34,7 +34,7 @@ __global__ void RandomGenerator(const size_t n, const int seed,
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   int step_size = 0;
 
-  T mask;
+  MaskType mask;
   T dest;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
     T s = src[idx];
@@ -45,15 +45,16 @@ __global__ void RandomGenerator(const size_t n, const int seed,
       rng.discard(step_size);
     }
     if (dist(rng) < dropout_prob) {
-      mask = static_cast<T>(0);
+      mask = 0;
+      dest = 0;
     } else {
+      mask = 1;
       if (is_upscale_in_train) {
-        mask = static_cast<T>(1.0f / (1.0f - dropout_prob));
+        dest = s / static_cast<T>(1.0f - dropout_prob);
       } else {
-        mask = static_cast<T>(1);
+        dest = s;
       }
     }
-    dest = s * mask;
     mask_data[idx] = mask;
     dst[idx] = dest;
   }
@@ -71,30 +72,40 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
-    auto dropout_implementation =
+    auto& dropout_implementation =
         context.Attr<std::string>("dropout_implementation");
+    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
     auto& place = *context.template device_context<Place>().eigen_device();
     if (!context.Attr<bool>("is_test")) {
+      int64_t x_numel = x->numel();
+      auto stream = context.cuda_device_context().stream();
+
       auto* mask = context.Output<Tensor>("Mask");
-      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      auto* mask_data = mask->mutable_data<uint8_t>(context.GetPlace());
       size_t size = framework::product(mask->dims());
+
       auto* x_data = x->data<T>();
       auto* y_data = y->mutable_data<T>(context.GetPlace());
+      if (dropout_prob == 1.0f) {
+        PADDLE_ENFORCE(cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
+        PADDLE_ENFORCE(cudaMemsetAsync(mask_data, 0,
+                                       x_numel * sizeof(*mask_data), stream));
+        return;
+      }
 
       std::random_device rnd;
       int seed =
           context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
 
       int threads = 512;
-      int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<
-          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
+      int grid = (x_numel + threads - 1) / threads;
+      RandomGenerator<T, uint8_t><<<grid, threads, 0, stream>>>(
           size, seed, dropout_prob, x_data, mask_data, y_data,
-          (dropout_implementation == "upscale_in_train"));
+          upscale_in_train);
     } else {
       auto X = EigenMatrix<T>::Reshape(*x, 1);
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
-      if (dropout_implementation == "upscale_in_train") {
+      if (upscale_in_train) {
         Y.device(place) = X;
       } else {
         Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
......
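In the CUDA kernel the mask is now written as plain 0/1 uint8 values and the 1/(1 - dropout_prob) scaling is applied directly to the output element; dropout_prob == 1.0 is short-circuited with two cudaMemsetAsync calls instead of launching the RNG kernel. A NumPy reference of the per-element logic (illustrative only, not the actual kernel):

import numpy as np

def random_generator_ref(src, dropout_prob, upscale_in_train, rng):
    # Mirrors the branch structure of RandomGenerator, one element at a time.
    mask = np.empty(src.shape, dtype=np.uint8)
    dst = np.empty_like(src)
    for i, s in enumerate(src.flat):
        if rng.uniform() < dropout_prob:
            mask.flat[i] = 0
            dst.flat[i] = 0
        else:
            mask.flat[i] = 1
            dst.flat[i] = s / (1.0 - dropout_prob) if upscale_in_train else s
    return dst, mask

x = np.random.rand(8).astype(np.float32)
y, m = random_generator_ref(x, 0.5, True, np.random.RandomState(0))
# Kept elements are scaled up by 1 / (1 - p); the mask itself stays 0/1.
assert np.allclose(y[m == 1], x[m == 1] / 0.5)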
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <cstring>
 #include <random>
 #include <string>
@@ -37,11 +38,20 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
-    auto dropout_implementation =
+    auto& dropout_implementation =
         context.Attr<std::string>("dropout_implementation");
+    bool upscale_in_train = (dropout_implementation == "upscale_in_train");
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
-      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      auto* mask_data = mask->mutable_data<uint8_t>(context.GetPlace());
+      size_t size = framework::product(mask->dims());
+
+      // Special case when dropout_prob is 1.0
+      if (dropout_prob == 1.0f) {
+        std::memset(y_data, 0, size * sizeof(*y_data));        // NOLINT
+        std::memset(mask_data, 0, size * sizeof(*mask_data));  // NOLINT
+        return;
+      }
 
       // NOTE: fixed seed should only be used in unittest or for debug.
       // Guarantee to use random seed in training.
@@ -53,17 +63,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       std::uniform_real_distribution<float> dist(0, 1);
 
-      size_t size = framework::product(mask->dims());
       for (size_t i = 0; i < size; ++i) {
         if (dist(engine) < dropout_prob) {
           mask_data[i] = 0;
           y_data[i] = 0;
         } else {
-          if (dropout_implementation == "upscale_in_train") {
-            mask_data[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
+          mask_data[i] = 1;
+          if (upscale_in_train) {
             y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
           } else {
-            mask_data[i] = 1;
             y_data[i] = x_data[i];
           }
         }
@@ -73,7 +81,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
-      if (dropout_implementation == "upscale_in_train") {
+      if (upscale_in_train) {
         Y.device(place) = X;
       } else {
         Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
@@ -94,13 +102,26 @@ class DropoutGradKernel : public framework::OpKernel<T> {
     auto* mask = context.Input<Tensor>("Mask");
     grad_x->mutable_data<T>(context.GetPlace());
 
-    auto M = EigenMatrix<T>::Reshape(*mask, 1);
+    auto M = EigenMatrix<uint8_t>::Reshape(*mask, 1);
     auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
     auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
 
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
-    dX.device(place) = dY * M;
+
+    auto& dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
+    if (dropout_implementation == "upscale_in_train") {
+      float dropout_prob = context.Attr<float>("dropout_prob");
+      if (dropout_prob == 1.0f) {
+        dX.device(place) = static_cast<T>(0) * dY;
+      } else {
+        dX.device(place) =
+            dY * M.cast<T>() / static_cast<T>(1.0f - dropout_prob);
+      }
+    } else {
+      dX.device(place) = dY * M.cast<T>();
+    }
   }
 };
......
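On the CPU side the same pattern applies, and DropoutGradKernel now reapplies the 1/(1 - dropout_prob) factor itself, with a special case for dropout_prob == 1.0, where everything was dropped. A minimal NumPy sketch of the new backward rule, assuming the 0/1 uint8 mask produced by the forward pass (illustrative only):

import numpy as np

def dropout_grad_ref(d_out, mask, dropout_prob, dropout_implementation):
    m = mask.astype(d_out.dtype)         # uint8 mask cast back to float
    if dropout_implementation == "upscale_in_train":
        if dropout_prob == 1.0:
            return np.zeros_like(d_out)  # every element was dropped
        return d_out * m / (1.0 - dropout_prob)
    return d_out * m                     # downgrade_in_infer: plain masking

d_out = np.ones((2, 3), dtype=np.float32)
mask = np.array([[1, 0, 1], [0, 1, 1]], dtype=np.uint8)
print(dropout_grad_ref(d_out, mask, 0.5, "upscale_in_train"))
# kept positions become 1 / (1 - 0.5) = 2.0, dropped positions become 0.0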
@@ -1390,7 +1390,7 @@ def dropout(x,
     helper = LayerHelper('dropout', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     mask = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
+        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
 
     if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
         seed = helper.main_program.random_seed
......
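On the Python side only the dtype of the internally created mask variable changes; the fluid.layers.dropout signature is untouched, so user code keeps working. A usage sketch against the fluid API of this branch (treat it as illustrative):

import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
y = fluid.layers.dropout(x, dropout_prob=0.5,
                         dropout_implementation='upscale_in_train')

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
out, = exe.run(feed={'x': np.random.rand(4, 32).astype('float32')},
               fetch_list=[y])
print(out.shape)  # (4, 32); the uint8 mask stays an internal variable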
@@ -27,7 +27,7 @@ class TestDropoutOp(OpTest):
         self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64)).astype('float32')
+            'Mask': np.ones((32, 64)).astype('uint8')
         }
 
     def test_check_output(self):
@@ -44,7 +44,7 @@ class TestDropoutOp2(TestDropoutOp):
         self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('float32')
+            'Mask': np.zeros((32, 64)).astype('uint8')
         }
@@ -55,7 +55,7 @@ class TestDropoutOp3(TestDropoutOp):
         self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('float32')
+            'Mask': np.ones((32, 64, 2)).astype('uint8')
         }
@@ -97,7 +97,7 @@ class TestDropoutOp6(TestDropoutOp):
         }
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
-            'Mask': np.zeros((32, 64)).astype('float32')
+            'Mask': np.zeros((32, 64)).astype('uint8')
         }
@@ -113,7 +113,7 @@ class TestDropoutOp7(TestDropoutOp):
         }
         self.outputs = {
             'Out': self.inputs['X'],
-            'Mask': np.ones((32, 64, 2)).astype('float32')
+            'Mask': np.ones((32, 64, 2)).astype('uint8')
         }
......