From e0b136c0f972813d87e8f03d67e97b7b7c4dfcb3 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Sun, 18 Mar 2018 22:24:43 +0800
Subject: [PATCH] Refine average accumulates op

1. Rename inputs and outputs
2. Add some comments

---
 .../fluid/operators/average_accumulates_op.cc | 138 +++++++++++-------
 .../fluid/operators/average_accumulates_op.cu |  36 +++--
 .../fluid/operators/average_accumulates_op.h  |  92 ++++++------
 3 files changed, 147 insertions(+), 119 deletions(-)

diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index 808693b61..368a1f561 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,9 +21,9 @@ template <>
 void getAccumulators<paddle::platform::CPUDeviceContext>(
     const framework::ExecutionContext& ctx, int64_t& num_updates_,
     int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("num_updates");
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
 
   old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
   num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
@@ -34,9 +34,9 @@ template <>
 void setAccumulators<paddle::platform::CPUDeviceContext>(
     const framework::ExecutionContext& ctx, int64_t num_updates_,
     int64_t num_accumulates_, int64_t old_num_accumulates_) {
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("num_updates");
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
 
   out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
   out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
@@ -49,64 +49,62 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(
-        ctx->HasInput("Param"),
-        "Input (Param) of average_accumulates op should not be null.");
+        ctx->HasInput("param"),
+        "Input (param) of average_accumulates op should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasInput("Grad"),
-        "Input (Grad) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("sum_1"),
+        ctx->HasInput("in_sum_1"),
         "Input (sum_1) of average_accumulates op should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasInput("sum_2"),
+        ctx->HasInput("in_sum_2"),
         "Input (sum_2) of average_accumulates op should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasInput("sum_3"),
+        ctx->HasInput("in_sum_3"),
        "Input (sum_3) of average_accumulates op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("num_accumulates"),
-                   "Input (num_accumulates) of average_accumulates op should "
-                   "not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("old_num_accumulates"),
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_accumulates"),
+        "Input (in_num_accumulates) of average_accumulates op should "
+        "not be null.");
PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"), "Input (old_num_accumulates) of average_accumulates op " "should not be null."); PADDLE_ENFORCE( - ctx->HasInput("num_updates"), + ctx->HasInput("in_num_updates"), "Input (num_updates) of average_accumulates op should not be null."); PADDLE_ENFORCE( - ctx->HasOutput("sum_1"), + ctx->HasOutput("out_sum_1"), "Output (sum_1) of average_accumulates op should not be null."); PADDLE_ENFORCE( - ctx->HasOutput("sum_2"), + ctx->HasOutput("out_sum_2"), "Output (sum_2) of average_accumulates op should not be null."); PADDLE_ENFORCE( - ctx->HasOutput("sum_3"), + ctx->HasOutput("out_sum_3"), "Output (sum_3) of average_accumulates op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("num_accumulates"), + PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"), "Output (num_accumulates) of average_accumulates op should " "not be null."); - PADDLE_ENFORCE(ctx->HasOutput("old_num_accumulates"), + PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"), "Output (old_num_accumulates) of average_accumulates op " "should not be null."); PADDLE_ENFORCE( - ctx->HasOutput("num_updates"), + ctx->HasOutput("out_num_updates"), "Output (num_updates) of average_accumulates op should not be null."); - auto in_dim = ctx->GetInputDim("Param"); + auto in_dim = ctx->GetInputDim("param"); - ctx->SetOutputDim("sum_1", in_dim); - ctx->SetOutputDim("sum_2", in_dim); - ctx->SetOutputDim("sum_3", in_dim); - ctx->SetOutputDim("num_accumulates", {1}); - ctx->SetOutputDim("old_num_accumulates", {1}); - ctx->SetOutputDim("num_updates", {1}); + ctx->SetOutputDim("out_sum_1", in_dim); + ctx->SetOutputDim("out_sum_2", in_dim); + ctx->SetOutputDim("out_sum_3", in_dim); + ctx->SetOutputDim("out_num_accumulates", {1}); + ctx->SetOutputDim("out_old_num_accumulates", {1}); + ctx->SetOutputDim("out_num_updates", {1}); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Param")->type()), + framework::ToDataType(ctx.Input("param")->type()), ctx.GetPlace()); } }; @@ -115,26 +113,60 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { public: AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("sum_1", ""); - AddInput("sum_2", ""); - AddInput("sum_3", ""); - AddInput("num_accumulates", ""); - AddInput("old_num_accumulates", ""); - AddInput("num_updates", ""); - - AddOutput("sum_1", ""); - AddOutput("sum_2", ""); - AddOutput("sum_3", ""); - AddOutput("num_accumulates", ""); - AddOutput("old_num_accumulates", ""); - AddOutput("num_updates", ""); - - AddAttr("", "average_window"); - AddAttr("", "max_average_window"); - AddAttr("", "min_average_window"); + AddInput("param", + "Input(Tensor or LoDTensor): The parameter to be accumulated."); + AddInput("in_sum_1", + "Input(Tensor or LoDTensor): A tensor used to store the parameter " + "sums with the same shape as input(param)."); + AddInput("in_sum_2", + "Input(Tensor or LoDTensor): A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param). 
+             "input(param). It is used to avoid loss of precision due to too "
+             "many sums.");
+    AddInput("in_sum_3",
+             "Input(Tensor or LoDTensor): An auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param).");
+    AddInput("in_num_accumulates",
+             "Input(Tensor): The accumulating times of current window with "
+             "shape [1].");
+    AddInput("in_old_num_accumulates",
+             "Input(Tensor): The accumulating times of previous window with "
+             "shape [1].");
+    AddInput("in_num_updates",
+             "Input(Tensor): The total number of batches used by training "
+             "before this batch with shape [1].");
+
+    AddOutput("out_sum_1",
+              "Output(Tensor or LoDTensor): A tensor used to store the "
+              "parameter sums with the same shape as input(param).");
+    AddOutput("out_sum_2",
+              "Output(Tensor or LoDTensor): An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param). It is used to avoid loss of precision due to too "
+              "many sums.");
+    AddOutput("out_sum_3",
+              "Output(Tensor or LoDTensor): An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param).");
+    AddOutput("out_num_accumulates",
+              "Output(Tensor): The accumulating times of current window with "
+              "shape [1].");
+    AddOutput("out_old_num_accumulates",
+              "Output(Tensor): The accumulating times of previous window with "
+              "shape [1].");
+    AddOutput("out_num_updates",
+              "Output(Tensor): The total number of batches used by training "
+              "before this batch with shape [1].");
+
+    AddAttr<float>("average_window",
+                   "The rate of average window size relative to num_updates.");
+    AddAttr<int64_t>("max_average_window", "Maximum size of average window.");
+    AddAttr<int64_t>("min_average_window", "Minimum size of average window.");
 
     AddComment(R"DOC(
 AverageAccumulates Operator.
+Accumulates the sums of parameter values within a sliding window. The size of
+the sliding window is determined by 'average_window', 'max_average_window'
+and 'min_average_window'.
 )DOC");
   }
 };
@@ -143,10 +175,10 @@ AverageAccumulates Operator.
 } // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(average_accumulate, ops::AverageAccumulatesOp,
+REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
                   ops::AverageAccumulatesOpMaker,
                   paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    average_accumulate,
+    average_accumulates,
     ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
     ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);

diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
index 56f2f02fd..dbaa8ba6c 100644
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ b/paddle/fluid/operators/average_accumulates_op.cu
@@ -21,39 +21,43 @@ template <>
 void getAccumulators<paddle::platform::CUDADeviceContext>(
     const framework::ExecutionContext& ctx, int64_t& num_updates_,
     int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("num_updates");
-
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
+  auto stream = ctx.cuda_device_context().stream();
   memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
                platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
-               sizeof(int64_t));
+               sizeof(int64_t), stream);
   memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
-               in_old_num_accumulates->data<int64_t>(), sizeof(int64_t));
+               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
   memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
-               in_num_updates->data<int64_t>(), sizeof(int64_t));
+               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
 }
 
 template <>
 void setAccumulators<paddle::platform::CUDADeviceContext>(
     const framework::ExecutionContext& ctx, int64_t num_updates_,
     int64_t num_accumulates_, int64_t old_num_accumulates_) {
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("num_updates");
+  auto stream = ctx.cuda_device_context().stream();
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
 
   memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t));
+               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
+               stream);
   memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t));
+               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
+               stream);
   memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
-               platform::CPUPlace(), &num_updates_, sizeof(int64_t));
-}
-}
+               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
 }
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    average_accumulate,
+    average_accumulates,
     ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
     ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);

diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
index 73814dd24..d33fd5519 100644
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -29,88 +29,80 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename DeviceContext>
 void getAccumulators(const framework::ExecutionContext& ctx,
int64_t& num_updates_, int64_t& num_accumulates_, - int64_t& old_num_accumulates_); + int64_t& num_updates, int64_t& num_accumulates, + int64_t& old_num_accumulates); template void setAccumulators(const framework::ExecutionContext& ctx, - int64_t num_updates_, int64_t num_accumulates_, - int64_t old_num_accumulates_); + int64_t num_updates, int64_t num_accumulates, + int64_t old_num_accumulates); template class AverageAccumulatesKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + // It is used to avoid loss of precision static const int64_t kMaxNumAccumulates = 16384; - // accumulators - int64_t num_updates_ = 0; - int64_t num_accumulates_ = 0; - int64_t old_num_accumulates_ = 0; - // attrs - int64_t min_average_window_; - int64_t max_average_window_; - float average_window_; - - auto* param = ctx.Input("Param"); - auto* in_sum_1 = ctx.Input("sum_1"); - auto* in_sum_2 = ctx.Input("sum_2"); - auto* in_sum_3 = ctx.Input("sum_3"); - - auto* out_sum_1 = ctx.Output("sum_1"); - auto* out_sum_2 = ctx.Output("sum_2"); - auto* out_sum_3 = ctx.Output("sum_3"); - - getAccumulators(ctx, num_updates_, num_accumulates_, - old_num_accumulates_); - average_window_ = ctx.Attr("average_window"); - max_average_window_ = - ctx.Attr("max_average_window"); // default bach number - min_average_window_ = - ctx.Attr("min_average_window"); // default 10000L - min_average_window_ = - std::min(min_average_window_, max_average_window_); - + // Get accumulators from input + int64_t num_updates = 0; + int64_t num_accumulates = 0; + int64_t old_num_accumulates = 0; + getAccumulators(ctx, num_updates, num_accumulates, + old_num_accumulates); + + // Get attrs + float average_window = ctx.Attr("average_window"); + int64_t max_average_window = ctx.Attr("max_average_window"); + int64_t min_average_window = ctx.Attr("min_average_window"); + min_average_window = + std::min(min_average_window, max_average_window); + + // Get inputs + auto* param = ctx.Input("param"); + auto* in_sum_1 = ctx.Input("in_sum_1"); + auto* in_sum_2 = ctx.Input("in_sum_2"); + auto* in_sum_3 = ctx.Input("in_sum_3"); auto param_tensor = EigenVector::Flatten(*param); auto in_sum_1_tensor = EigenVector::Flatten(*in_sum_1); auto in_sum_2_tensor = EigenVector::Flatten(*in_sum_2); auto in_sum_3_tensor = EigenVector::Flatten(*in_sum_3); + + // Get outputs + auto* out_sum_1 = ctx.Output("out_sum_1"); + auto* out_sum_2 = ctx.Output("out_sum_2"); + auto* out_sum_3 = ctx.Output("out_sum_3"); auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + // Compute auto& place = *ctx.template device_context().eigen_device(); math::SetConstant constant_functor; - // start batch - ++num_updates_; - ++num_accumulates_; - - // update + ++num_updates; + ++num_accumulates; out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; - out_sum_2_tensor.device(place) = in_sum_2_tensor; out_sum_3_tensor.device(place) = in_sum_3_tensor; - // needSpecialTraversal - if (num_updates_ % kMaxNumAccumulates == 0) { + if (num_updates % kMaxNumAccumulates == 0) { out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; constant_functor(ctx.template device_context(), out_sum_1, 0.0); } - - if (num_accumulates_ >= min_average_window_ && - num_accumulates_ >= std::min(max_average_window_, - num_updates_ * average_window_)) { + if (num_accumulates >= min_average_window && + num_accumulates >= 
+            std::min<int64_t>(max_average_window,
+                              num_updates * average_window)) {
       out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
       constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
                        0.0);
       constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
                        0.0);
-
-      // finishBatch
-      old_num_accumulates_ = num_accumulates_;
-      num_accumulates_ = 0;
+      old_num_accumulates = num_accumulates;
+      num_accumulates = 0;
     }
-    setAccumulators<DeviceContext>(ctx, num_updates_, num_accumulates_,
-                                   old_num_accumulates_);
+
+    // Set accumulators to output
+    setAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
   }
 };
-- 
GitLab
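
For readers tracing the kernel, the same sliding-window scheme can be written
out against plain buffers. The following self-contained C++ sketch mirrors the
Compute() above; every name in it is invented for illustration and none of it
is Paddle API:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Standalone sketch of the sliding-window accumulation implemented by
    // AverageAccumulatesKernel. Illustrative only; not Paddle API.
    struct AverageAccumulator {
      std::vector<float> sum_1, sum_2, sum_3;  // same shape as the parameter
      int64_t num_updates = 0;          // batches seen so far
      int64_t num_accumulates = 0;      // batches accumulated in current window
      int64_t old_num_accumulates = 0;  // size of the last finished window

      static const int64_t kMaxNumAccumulates = 16384;

      explicit AverageAccumulator(std::size_t n)
          : sum_1(n, 0.f), sum_2(n, 0.f), sum_3(n, 0.f) {}

      void Update(const std::vector<float>& param, float average_window,
                  int64_t max_average_window, int64_t min_average_window) {
        min_average_window = std::min(min_average_window, max_average_window);
        ++num_updates;
        ++num_accumulates;
        // sum_1 accumulates the parameter every batch.
        for (std::size_t i = 0; i < param.size(); ++i) sum_1[i] += param[i];
        // Periodically fold sum_1 into sum_2 so no buffer collects more than
        // kMaxNumAccumulates terms, bounding floating-point rounding error.
        if (num_updates % kMaxNumAccumulates == 0) {
          for (std::size_t i = 0; i < param.size(); ++i) {
            sum_2[i] += sum_1[i];
            sum_1[i] = 0.f;
          }
        }
        // When the window is full, move everything into sum_3 and restart.
        if (num_accumulates >= min_average_window &&
            num_accumulates >= std::min<int64_t>(
                                   max_average_window,
                                   num_updates * average_window)) {
          for (std::size_t i = 0; i < param.size(); ++i) {
            sum_3[i] = sum_1[i] + sum_2[i];
            sum_1[i] = 0.f;
            sum_2[i] = 0.f;
          }
          old_num_accumulates = num_accumulates;
          num_accumulates = 0;
        }
      }
    };

A consumer of these buffers (outside this patch) would presumably read the
averaged parameter as (sum_1 + sum_2 + sum_3) / (num_accumulates +
old_num_accumulates).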