modify dropout att; test=develop

a6e6bc45 · phlrain · 049c9c7d · a6e6bc45 · a6e6bc45 · a6e6bc45
5 changed files
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/dropout_op.h"
+#include <string>
 namespace paddle {
 namespace operators {
@@ -57,15 +58,29 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
                  "will be dropped.")
        .SetDefault(false);
    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
-    AddAttr<bool>("dropout_implementation",
+    AddAttr<std::string>(
-                  "When it's True, In the training, after set some value"
+        "dropout_implementation",
-                  "to 0 (probability is dropout_prob),"
+        "[\"downgrade_in_infer\"|\"upscale_in_train\"]"
-                  "all the value will divide (1-dropout_prob)"
+        "There are two kinds of ways to implement dropout"
-                  "By using this way, will do nothing in the inference program"
+        "(the mask below is a tensor have the same shape with input"
-                  "The dropout op can be removed in the inference program."
+        "the value of mask is 0 or 1, the ratio of 0 is dropout_prob)"
-                  "The inference program will be more efficient"
+        "1. downgrade_in_infer(default), downgrade the outcome at inference "
-                  "When it's False, same as original")
+        "time"
-        .SetDefault(false);
+        "   train: out = input * mask"
+        "   inference: out = input * dropout_prob"
+        "2. upscale_in_train, upscale the outcome at training time, do nothing "
+        "in inference"
+        "   train: out = input * mask / ( 1.0 - dropout_prob )"
+        "   inference: out = input"
+        "   dropout op can be removed from the program. the program will be "
+        "efficient")
+        .SetDefault("downgrade_in_infer")
+        .AddCustomChecker([](const std::string& type) {
+          PADDLE_ENFORCE(
+              type == "downgrade_in_infer" || type == "upscale_in_train",
+              "dropout_implementation can only be downgrade_in_infer or "
+              "upscale_in_train");
+        });
    AddComment(R"DOC(
 Dropout Operator.

--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include <string>
 #include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/platform/float16.h"
@@ -27,7 +28,7 @@ template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
                                const float dropout_prob, const T* src,
                                T* mask_data, T* dst,
-                                bool dropout_implementation) {
+                                bool is_upscale_in_train) {
  thrust::minstd_rand rng;
  rng.seed(seed);
  thrust::uniform_real_distribution<float> dist(0, 1);
@@ -48,7 +49,7 @@ __global__ void RandomGenerator(const size_t n, const int seed,
    if (dist(rng) < dropout_prob) {
      mask = static_cast<T>(0);
    } else {
-      if (dropout_implementation) {
+      if (is_upscale_in_train) {
        mask = static_cast<T>(1.0f / (1.0f - dropout_prob));
      } else {
        mask = static_cast<T>(1);
@@ -72,7 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
    y->mutable_data<T>(context.GetPlace());
    float dropout_prob = context.Attr<float>("dropout_prob");
-    auto dropout_implementation = context.Attr<bool>("dropout_implementation");
+    auto dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
    auto& place = *context.template device_context<Place>().eigen_device();
    if (!context.Attr<bool>("is_test")) {
      auto* mask = context.Output<Tensor>("Mask");
@@ -90,11 +92,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
      RandomGenerator<
          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, seed, dropout_prob, x_data, mask_data, y_data,
-          dropout_implementation);
+          (dropout_implementation == "upscale_in_train"));
    } else {
      auto X = EigenMatrix<T>::Reshape(*x, 1);
      auto Y = EigenMatrix<T>::Reshape(*y, 1);
-      if (dropout_implementation) {
+      if (dropout_implementation == "upscale_in_train") {
        Y.device(place) = X;
      } else {
        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);

--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 #include <random>
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -36,7 +37,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
    auto* y_data = y->mutable_data<T>(context.GetPlace());
    float dropout_prob = context.Attr<float>("dropout_prob");
-    auto dropout_implementation = context.Attr<bool>("dropout_implementation");
+    auto dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
    if (!context.Attr<bool>("is_test")) {
      auto* mask = context.Output<Tensor>("Mask");
      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
@@ -57,7 +59,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
          mask_data[i] = 0;
          y_data[i] = 0;
        } else {
-          if (dropout_implementation) {
+          if (dropout_implementation == "upscale_in_train") {
            mask_data[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
            y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
          } else {
@@ -71,7 +73,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
      auto Y = EigenMatrix<T>::Reshape(*y, 1);
      auto& place =
          *context.template device_context<DeviceContext>().eigen_device();
-      if (dropout_implementation) {
+      if (dropout_implementation == "upscale_in_train") {
        Y.device(place) = X;
      } else {
        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -985,7 +985,7 @@ def dropout(x,
            is_test=False,
            seed=None,
            name=None,
-            dropout_implementation=False):
+            dropout_implementation="downgrade_in_infer"):
    """
    Computes dropout.
@@ -1005,13 +1005,20 @@ def dropout(x,
                    units will be dropped. DO NOT use a fixed seed in training.
        name (str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
-        dropout_implementation(bool): A Flag indicating whether divide (1-dropout_prob). 
+        dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train']
-                                      When it's True, all the units will divide (1-dropout_prob)
+                                        1. downgrade_in_infer(default), downgrade the outcome at inference
-                                      after set some units to zero in the train program.
+                                           train: out = input * mask
-                                      And do nothing in the inference program.
+                                           inference: out = input * (1.0 - dropout_prob)
-                                      The dropout op can be removed in the inference program.
+                                           (mask is a tensor with the same shape as input, value is 0 or 1
-                                      The inference program will be more efficient
+                                            ratio of 0 is dropout_prob)
-                                      When it's False, same as original
+                                        2. upscale_in_train, upscale the outcome at training time
+                                           train: out = input * mask / ( 1.0 - dropout_prob )
+                                           inference: out = input
+                                           (mask is a tensor with the same shape as input, value is 0 or 1
+                                            ratio of 0 is dropout_prob)
+                                           dropout op can be removed from the program. 
+                                           the program will be efficient
    Returns:

--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -93,7 +93,7 @@ class TestDropoutOp6(TestDropoutOp):
            'dropout_prob': 1.0,
            'fix_seed': True,
            'is_test': False,
-            'div_prob_in_train': True
+            'dropout_implementation': 'upscale_in_train'
        }
        self.outputs = {
            'Out': np.zeros((32, 64)).astype('float32'),
@@ -109,7 +109,7 @@ class TestDropoutOp7(TestDropoutOp):
            'dropout_prob': 0.0,
            'fix_seed': True,
            'is_test': False,
-            'div_prob_in_train': True
+            'dropout_implementation': 'upscale_in_train'
        }
        self.outputs = {
            'Out': self.inputs['X'],
@@ -125,7 +125,7 @@ class TestDropoutOp8(OpTest):
            'dropout_prob': 0.35,
            'fix_seed': True,
            'is_test': True,
-            'div_prob_in_train': True
+            'dropout_implementation': 'upscale_in_train'
        }
        self.outputs = {'Out': self.inputs['X']}
@@ -140,7 +140,7 @@ class TestDropoutOp9(OpTest):
        self.attrs = {
            'dropout_prob': 0.75,
            'is_test': True,
-            'div_prob_in_train': True
+            'dropout_implementation': 'upscale_in_train'
        }
        self.outputs = {'Out': self.inputs['X']}