Phi average accumulates migration (#44554)

* move average_accumulates op to phi kernel

Phi average accumulates migration (#44554)
* move average_accumulates op to phi kernel
eafd4280 · Wang Bojun · GitHub · 122fff46 · eafd4280 · eafd4280
13 changed files
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -57,6 +57,8 @@ no_amp_list = [
    'adam',
    'adamw_',
    'adamw',
+    'average_accumulates',
+    'average_accumulates_',
    'decayed_adagrad_',
    'decayed_adagrad',
    'dgc_momentum_',

--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -12,99 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/multiary.h"

 namespace paddle {
 namespace operators {

-template <>
-void GetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
-                                      int64_t* num_updates,
-                                      int64_t* num_accumulates,
-                                      int64_t* old_num_accumulates) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-
-  *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
-  *num_accumulates = in_num_accumulates->data<int64_t>()[0];
-  *num_updates = in_num_updates->data<int64_t>()[0];
-}
-
-template <>
-void SetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
-                                      int64_t num_updates,
-                                      int64_t num_accumulates,
-                                      int64_t old_num_accumulates) {
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-
-  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
-  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
-  out_num_updates->data<int64_t>()[0] = num_updates;
-}
-
 class AverageAccumulatesOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("param"), "Input", "param", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_1"), "Input", "in_sum_1", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_2"), "Input", "in_sum_2", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_3"), "Input", "in_sum_3", "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_accumulates"),
-                   "Input",
-                   "in_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_old_num_accumulates"),
-                   "Input",
-                   "in_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_updates"),
-                   "Input",
-                   "in_num_updates",
-                   "AverageAccumulates");
-
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_1"),
-                   "Output",
-                   "out_sum_1",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_2"),
-                   "Output",
-                   "out_sum_2",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_3"),
-                   "Output",
-                   "out_sum_3",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_accumulates"),
-                   "Output",
-                   "out_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_old_num_accumulates"),
-                   "Output",
-                   "out_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_updates"),
-                   "Output",
-                   "out_num_updates",
-                   "AverageAccumulates");
-    auto in_dim = ctx->GetInputDim("param");
-
-    ctx->SetOutputDim("out_sum_1", in_dim);
-    ctx->SetOutputDim("out_sum_2", in_dim);
-    ctx->SetOutputDim("out_sum_3", in_dim);
-    ctx->SetOutputDim("out_num_accumulates", {1});
-    ctx->SetOutputDim("out_old_num_accumulates", {1});
-    ctx->SetOutputDim("out_num_updates", {1});
-  }
-
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -209,12 +129,14 @@ And for a mini-batch in training, accumulators were computed as below steps:
 }  // namespace paddle

 namespace ops = paddle::operators;
+// Declares AverageAccumulatesInferShapeFunctor from
+// phi::AverageAccumulatesInferMeta. The functor is passed to
+// REGISTER_OPERATOR below, taking over from the hand-written InferShape that
+// this change removes from AverageAccumulatesOp.
+DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates,
+                            AverageAccumulatesInferShapeFunctor,
+                            PD_INFER_META(phi::AverageAccumulatesInferMeta));
+
 REGISTER_OPERATOR(
    average_accumulates,
    ops::AverageAccumulatesOp,
    ops::AverageAccumulatesOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(average_accumulates,
-                       ops::AverageAccumulatesKernel<phi::CPUContext, float>,
-                       ops::AverageAccumulatesKernel<phi::CPUContext, double>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    AverageAccumulatesInferShapeFunctor);
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ b/paddle/fluid/operators/average_accumulates_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/average_accumulates_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-template <>
-void GetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t* num_updates_,
-    int64_t* num_accumulates_,
-    int64_t* old_num_accumulates_) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-  auto stream = ctx.cuda_device_context().stream();
-  auto cuda_place = in_old_num_accumulates->place();
-  memory::Copy(platform::CPUPlace(),
-               old_num_accumulates_,
-               cuda_place,
-               in_old_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-  memory::Copy(platform::CPUPlace(),
-               num_accumulates_,
-               cuda_place,
-               in_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-  memory::Copy(platform::CPUPlace(),
-               num_updates_,
-               cuda_place,
-               in_num_updates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-}
-
-template <>
-void SetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t num_updates_,
-    int64_t num_accumulates_,
-    int64_t old_num_accumulates_) {
-  auto stream = ctx.cuda_device_context().stream();
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-  auto cuda_place = out_old_num_accumulates->place();
-
-  memory::Copy(cuda_place,
-               out_old_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &old_num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_updates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_updates_,
-               sizeof(int64_t),
-               stream);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext>
-void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t* num_updates,
-                     int64_t* num_accumulates,
-                     int64_t* old_num_accumulates);
-
-template <typename DeviceContext>
-void SetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t num_updates,
-                     int64_t num_accumulates,
-                     int64_t old_num_accumulates);
-
-template <typename DeviceContext, typename T>
-class AverageAccumulatesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // It is used to avoid loss of precision
-    static const int64_t kMaxNumAccumulates = 16384;
-    // Get accumulators from input
-    int64_t num_updates = 0;
-    int64_t num_accumulates = 0;
-    int64_t old_num_accumulates = 0;
-    GetAccumulators<DeviceContext>(
-        ctx, &num_updates, &num_accumulates, &old_num_accumulates);
-
-    // Get attrs
-    float average_window = ctx.Attr<float>("average_window");
-    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
-    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(
-        min_average_window,
-        max_average_window,
-        platform::errors::InvalidArgument(
-            "The min_average_window > "
-            "max_average_window is not right, min_average_window is %ld, "
-            "max_average_window is %ld.",
-            min_average_window,
-            max_average_window));
-
-    // Get inputs
-    auto* param = ctx.Input<Tensor>("param");
-    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
-    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
-    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
-    auto param_tensor = framework::EigenVector<T>::Flatten(*param);
-    auto in_sum_1_tensor = framework::EigenVector<T>::Flatten(*in_sum_1);
-    auto in_sum_2_tensor = framework::EigenVector<T>::Flatten(*in_sum_2);
-    auto in_sum_3_tensor = framework::EigenVector<T>::Flatten(*in_sum_3);
-
-    // Get outputs
-    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
-    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
-    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
-    auto out_sum_1_tensor = framework::EigenVector<T>::Flatten(*out_sum_1);
-    auto out_sum_2_tensor = framework::EigenVector<T>::Flatten(*out_sum_2);
-    auto out_sum_3_tensor = framework::EigenVector<T>::Flatten(*out_sum_3);
-
-    // Compute
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    phi::funcs::SetConstant<DeviceContext, T> constant_functor;
-    ++num_updates;
-    ++num_accumulates;
-    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
-    out_sum_2_tensor.device(place) = in_sum_2_tensor;
-    out_sum_3_tensor.device(place) = in_sum_3_tensor;
-    if (num_updates % kMaxNumAccumulates == 0) {
-      // Move the sum to a different buffer to avoid loss of precision due to
-      // too many sums.
-      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-    }
-    if (num_accumulates >= min_average_window &&
-        num_accumulates >= std::min<int64_t>(max_average_window,
-                                             num_updates * average_window)) {
-      //  Now the average window is too long, discard the old sum.
-      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_2, 0.0);
-      old_num_accumulates = num_accumulates;
-      num_accumulates = 0;
-    }
-
-    // Set accumulators to output
-    SetAccumulators<DeviceContext>(
-        ctx, num_updates, num_accumulates, old_num_accumulates);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -264,6 +264,17 @@
  kernel :
    func : auc

+# average_accumulates
+# In-place op: every in_* sum/counter input is mapped onto the matching out_*
+# output (see `inplace` below); kernel data type is selected from `param`.
+- api : average_accumulates_
+  args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window)
+  output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates)
+  infer_meta:
+    func : AverageAccumulatesInferMeta
+  kernel :
+    func : average_accumulates {dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense}
+    data_type : param
+  inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates)
+
 # batch_norm
 - api : batch_norm
  args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)

--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -434,6 +434,68 @@ void AucInferMeta(const MetaTensor& input,
  }
 }

+// Infers output metadata for the average_accumulates op.
+//
+// Each out_sum_N takes the dims and dtype of the matching in_sum_N. Each
+// counter output is given dims {1} and the dtype of the matching counter
+// input. `param` and the three window attributes are part of the signature
+// but are not read in this function.
+//
+// Raises errors::NotFound if any output pointer is null.
+void AverageAccumulatesInferMeta(const MetaTensor& param,
+                                 const MetaTensor& in_sum_1,
+                                 const MetaTensor& in_sum_2,
+                                 const MetaTensor& in_sum_3,
+                                 const MetaTensor& in_num_accumulates,
+                                 const MetaTensor& in_old_num_accumulates,
+                                 const MetaTensor& in_num_updates,
+                                 float average_window,
+                                 int64_t max_average_window,
+                                 int64_t min_average_window,
+                                 MetaTensor* out_sum_1,
+                                 MetaTensor* out_sum_2,
+                                 MetaTensor* out_sum_3,
+                                 MetaTensor* out_num_accumulates,
+                                 MetaTensor* out_old_num_accumulates,
+                                 MetaTensor* out_num_updates) {
+  // Every output must be bound before any metadata is written to it.
+  PADDLE_ENFORCE_NE(
+      out_sum_1,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_1) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_2,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_2) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_3,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_3) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(out_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(out_old_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_old_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(
+      out_num_updates,
+      nullptr,
+      errors::NotFound(
+          "Output(out_num_updates) of AverageAccumulates should not be null."));
+
+  // Running sums mirror the dims/dtype of their own inputs.
+  out_sum_1->set_dims(in_sum_1.dims());
+  out_sum_1->set_dtype(in_sum_1.dtype());
+  out_sum_2->set_dims(in_sum_2.dims());
+  out_sum_2->set_dtype(in_sum_2.dtype());
+  out_sum_3->set_dims(in_sum_3.dims());
+  out_sum_3->set_dtype(in_sum_3.dtype());
+  // Counters are single-element tensors that keep the input counter dtype.
+  out_num_accumulates->set_dims({1});
+  out_num_accumulates->set_dtype(in_num_accumulates.dtype());
+  out_old_num_accumulates->set_dims({1});
+  out_old_num_accumulates->set_dtype(in_old_num_accumulates.dtype());
+  out_num_updates->set_dims({1});
+  out_num_updates->set_dtype(in_num_updates.dtype());
+}
+
 void BatchNormInferMeta(const MetaTensor& x,
                        const MetaTensor& scale,
                        const MetaTensor& bias,

--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -134,6 +134,23 @@ void AucInferMeta(const MetaTensor& input,
                  MetaTensor* stat_neg_out,
                  MetaConfig config = MetaConfig());

+// Infers output metadata for the average_accumulates op (defined in
+// multiary.cc): out_sum_N follows in_sum_N, and each counter output is a
+// single-element tensor with the dtype of the matching counter input.
+void AverageAccumulatesInferMeta(const MetaTensor& param,
+                                 const MetaTensor& in_sum_1,
+                                 const MetaTensor& in_sum_2,
+                                 const MetaTensor& in_sum_3,
+                                 const MetaTensor& in_num_accumulates,
+                                 const MetaTensor& in_old_num_accumulates,
+                                 const MetaTensor& in_num_updates,
+                                 float average_window,
+                                 int64_t max_average_window,
+                                 int64_t min_average_window,
+                                 MetaTensor* out_sum_1,
+                                 MetaTensor* out_sum_2,
+                                 MetaTensor* out_sum_3,
+                                 MetaTensor* out_num_accumulates,
+                                 MetaTensor* out_old_num_accumulates,
+                                 MetaTensor* out_num_updates);
+
 void BatchNormInferMeta(const MetaTensor& x,
                        const MetaTensor& scale,
                        const MetaTensor& bias,

--- a/paddle/phi/kernels/average_accumulates_kernel.h
+++ b/paddle/phi/kernels/average_accumulates_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+// Reads the three int64_t counters out of their input tensors into the
+// locations pointed to by num_updates / num_accumulates /
+// old_num_accumulates. Only declared here; the CPU and GPU kernel files
+// provide explicit specialisations for their own context type.
+template <typename Context>
+void GetAccumulators(const Context& dev_ctx,
+                     const DenseTensor& in_num_accumulates,
+                     const DenseTensor& in_old_num_accumulates,
+                     const DenseTensor& in_num_updates,
+                     int64_t* num_updates,
+                     int64_t* num_accumulates,
+                     int64_t* old_num_accumulates);
+
+// Stores the three counter values into the first element of the matching
+// output tensors. Only declared here; the CPU and GPU kernel files provide
+// explicit specialisations for their own context type.
+// Note the ordering: the value parameters are (updates, accumulates, old)
+// while the tensor parameters are (accumulates, old, updates).
+template <typename Context>
+void SetAccumulators(const Context& dev_ctx,
+                     int64_t num_updates,
+                     int64_t num_accumulates,
+                     int64_t old_num_accumulates,
+                     DenseTensor* out_num_accumulates,
+                     DenseTensor* out_old_num_accumulates,
+                     DenseTensor* out_num_updates);
+
+// phi kernel entry point for average_accumulates; the body lives in
+// impl/average_accumulates_kernel_impl.h.
+//
+// Inputs are the parameter, three running-sum tensors and three counter
+// tensors; outputs are the updated sums and counters. The implementation
+// reads the counters through GetAccumulators<Context> and enforces
+// min_average_window <= max_average_window (InvalidArgument otherwise).
+// NOTE(review): presumably the update rule matches the removed fluid
+// AverageAccumulatesKernel -- confirm against the impl header.
+template <typename T, typename Context>
+void AverageAccumulatesKernel(const Context& dev_ctx,
+                              const DenseTensor& param,
+                              const DenseTensor& in_sum_1,
+                              const DenseTensor& in_sum_2,
+                              const DenseTensor& in_sum_3,
+                              const DenseTensor& in_num_accumulates,
+                              const DenseTensor& in_old_num_accumulates,
+                              const DenseTensor& in_num_updates,
+                              float average_window,
+                              int64_t max_average_window,
+                              int64_t min_average_window,
+                              DenseTensor* out_sum_1,
+                              DenseTensor* out_sum_2,
+                              DenseTensor* out_sum_3,
+                              DenseTensor* out_num_accumulates,
+                              DenseTensor* out_old_num_accumulates,
+                              DenseTensor* out_num_updates);
+}  // namespace phi
--- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
+++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/average_accumulates_kernel.h"
+#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+// CPU specialisation: each counter is read straight from element 0 of its
+// tensor's int64_t buffer. dev_ctx is not used.
+template <>
+void GetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
+                                      const DenseTensor& in_num_accumulates,
+                                      const DenseTensor& in_old_num_accumulates,
+                                      const DenseTensor& in_num_updates,
+                                      int64_t* num_updates,
+                                      int64_t* num_accumulates,
+                                      int64_t* old_num_accumulates) {
+  const int64_t* old_accumulates_src = in_old_num_accumulates.data<int64_t>();
+  *old_num_accumulates = *old_accumulates_src;
+
+  const int64_t* accumulates_src = in_num_accumulates.data<int64_t>();
+  *num_accumulates = *accumulates_src;
+
+  const int64_t* updates_src = in_num_updates.data<int64_t>();
+  *num_updates = *updates_src;
+}
+
+// CPU specialisation: each counter value is written into element 0 of the
+// matching output tensor's int64_t buffer. dev_ctx is not used.
+template <>
+void SetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
+                                      int64_t num_updates,
+                                      int64_t num_accumulates,
+                                      int64_t old_num_accumulates,
+                                      DenseTensor* out_num_accumulates,
+                                      DenseTensor* out_old_num_accumulates,
+                                      DenseTensor* out_num_updates) {
+  int64_t* old_accumulates_dst = out_old_num_accumulates->data<int64_t>();
+  *old_accumulates_dst = old_num_accumulates;
+
+  int64_t* accumulates_dst = out_num_accumulates->data<int64_t>();
+  *accumulates_dst = num_accumulates;
+
+  int64_t* updates_dst = out_num_updates->data<int64_t>();
+  *updates_dst = num_updates;
+}
+
+}  // namespace phi
+
+// Registers phi::AverageAccumulatesKernel as the CPU kernel of
+// average_accumulates for float and double, for all layouts.
+PD_REGISTER_KERNEL(average_accumulates,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::AverageAccumulatesKernel,
+                   float,
+                   double) {}
--- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
+++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/average_accumulates_kernel.h"
+#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+// GPU specialisation: copies one int64_t from each counter tensor to the
+// CPUPlace destination pointers, issuing every copy on dev_ctx.stream().
+//
+// The source place of all three copies is taken from
+// in_old_num_accumulates -- assumes the three counter tensors live on the
+// same place; TODO confirm.
+// NOTE(review): the copies are issued with a stream and no explicit
+// synchronisation is visible in this function; confirm the destination values
+// are guaranteed to be ready before the caller reads them on the host.
+template <>
+void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      const DenseTensor& in_num_accumulates,
+                                      const DenseTensor& in_old_num_accumulates,
+                                      const DenseTensor& in_num_updates,
+                                      int64_t* num_updates,
+                                      int64_t* num_accumulates,
+                                      int64_t* old_num_accumulates) {
+  auto stream = dev_ctx.stream();
+  auto cuda_place = in_old_num_accumulates.place();
+  paddle::memory::Copy(phi::CPUPlace(),
+                       old_num_accumulates,
+                       cuda_place,
+                       in_old_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_accumulates,
+                       cuda_place,
+                       in_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_updates,
+                       cuda_place,
+                       in_num_updates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+}
+
+// GPU specialisation: allocates the three counter outputs through dev_ctx and
+// copies one int64_t into each of them from the host-side value parameters,
+// issuing every copy on dev_ctx.stream().
+//
+// All three copies use dev_ctx.GetPlace() as the destination, since every
+// output buffer was just allocated through the same context.
+// NOTE(review): the copy sources are by-value parameters of this function;
+// confirm Copy has finished reading them by the time the function returns.
+template <>
+void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      int64_t num_updates,
+                                      int64_t num_accumulates,
+                                      int64_t old_num_accumulates,
+                                      DenseTensor* out_num_accumulates,
+                                      DenseTensor* out_old_num_accumulates,
+                                      DenseTensor* out_num_updates) {
+  int64_t* out_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_accumulates);
+  int64_t* out_old_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_old_num_accumulates);
+  int64_t* out_num_updates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_updates);
+
+  auto stream = dev_ctx.stream();
+  const auto& dst_place = dev_ctx.GetPlace();
+
+  paddle::memory::Copy(dst_place,
+                       out_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &num_accumulates,
+                       sizeof(int64_t),
+                       stream);
+
+  paddle::memory::Copy(dst_place,
+                       out_old_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &old_num_accumulates,
+                       sizeof(int64_t),
+                       stream);
+
+  paddle::memory::Copy(dst_place,
+                       out_num_updates_ptr,
+                       phi::CPUPlace(),
+                       &num_updates,
+                       sizeof(int64_t),
+                       stream);
+}
+
+}  // namespace phi
+
+// Registers phi::AverageAccumulatesKernel as the GPU kernel of
+// average_accumulates for float and double, for all layouts.
+PD_REGISTER_KERNEL(average_accumulates,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::AverageAccumulatesKernel,
+                   float,
+                   double) {}
--- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
+++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/phi/kernels/average_accumulates_kernel.h"
+
+#include <algorithm>
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+// Accumulates `param` into a set of running-sum buffers used for parameter
+// averaging and maintains the associated step counters.
+//
+// Inputs:
+//   param                   - tensor added into the first running sum.
+//   in_sum_1/2/3            - current running-sum buffers.
+//   in_num_accumulates,
+//   in_old_num_accumulates,
+//   in_num_updates          - counter tensors, read through GetAccumulators.
+// Attributes:
+//   average_window          - float factor multiplied by the update count to
+//                             get the window length.
+//   max_average_window      - upper bound applied to that window length.
+//   min_average_window      - minimum number of accumulations before the
+//                             window may be restarted; must not exceed
+//                             max_average_window.
+// Outputs:
+//   out_sum_1/2/3           - updated running-sum buffers.
+//   out_num_accumulates,
+//   out_old_num_accumulates,
+//   out_num_updates         - updated counters, written via SetAccumulators.
+//
+// Raises InvalidArgument when min_average_window > max_average_window.
+//
+// NOTE(review): the statements below read in_sum_* after out_sum_* has been
+// written. If a caller passes the same tensors as inputs and outputs (the
+// in-place form of the op), those reads observe the updated values, so the
+// statement order matters - confirm the intended aliasing before reordering.
+template <typename T, typename Context>
+void AverageAccumulatesKernel(const Context& dev_ctx,
+                              const DenseTensor& param,
+                              const DenseTensor& in_sum_1,
+                              const DenseTensor& in_sum_2,
+                              const DenseTensor& in_sum_3,
+                              const DenseTensor& in_num_accumulates,
+                              const DenseTensor& in_old_num_accumulates,
+                              const DenseTensor& in_num_updates,
+                              float average_window,
+                              int64_t max_average_window,
+                              int64_t min_average_window,
+                              DenseTensor* out_sum_1,
+                              DenseTensor* out_sum_2,
+                              DenseTensor* out_sum_3,
+                              DenseTensor* out_num_accumulates,
+                              DenseTensor* out_old_num_accumulates,
+                              DenseTensor* out_num_updates) {
+  // It is used to avoid loss of precision: every kMaxNumAccumulates updates
+  // the first sum buffer is folded into the second one and reset.
+  static const int64_t kMaxNumAccumulates = 16384;
+
+  // Each counter gets its own CPUPlace allocation holding a single int64_t.
+  // The kernel works on these copies and writes them back at the end.
+  auto num_updates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_updates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
+
+  auto num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
+
+  auto old_num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* old_num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
+
+  // Presumably fills the three buffers above from the counter tensors; the
+  // implementation is specialised per Context outside this file.
+  // NOTE(review): tensor arguments are ordered (accumulates, old, updates)
+  // but pointer arguments are ordered (updates, accumulates, old).
+  GetAccumulators<Context>(dev_ctx,
+                           in_num_accumulates,
+                           in_old_num_accumulates,
+                           in_num_updates,
+                           num_updates_cpu_ptr,
+                           num_accumulates_cpu_ptr,
+                           old_num_accumulates_cpu_ptr);
+  // Validate the window attributes before touching any output.
+  PADDLE_ENFORCE_LE(
+      min_average_window,
+      max_average_window,
+      errors::InvalidArgument(
+          "The min_average_window > "
+          "max_average_window is not right, min_average_window is %ld, "
+          "max_average_window is %ld.",
+          min_average_window,
+          max_average_window));
+
+  // Flattened Eigen views over the inputs.
+  auto param_tensor = EigenVector<T>::Flatten(param);
+  auto in_sum_1_tensor = EigenVector<T>::Flatten(in_sum_1);
+  auto in_sum_2_tensor = EigenVector<T>::Flatten(in_sum_2);
+  auto in_sum_3_tensor = EigenVector<T>::Flatten(in_sum_3);
+
+  // Allocate the outputs before taking Eigen views of them.
+  dev_ctx.template Alloc<T>(out_sum_1);
+  dev_ctx.template Alloc<T>(out_sum_2);
+  dev_ctx.template Alloc<T>(out_sum_3);
+
+  auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+  auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+  auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+
+  // Compute
+  // Eigen device on which the element-wise expressions below are evaluated.
+  auto& place = *dev_ctx.eigen_device();
+
+  funcs::SetConstant<Context, T> constant_functor;
+  // This call counts as one more update and one more accumulation.
+  ++(*num_updates_cpu_ptr);
+  ++(*num_accumulates_cpu_ptr);
+  // Default step: add param to sum_1, carry sum_2 and sum_3 over unchanged.
+  out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+  out_sum_2_tensor.device(place) = in_sum_2_tensor;
+  out_sum_3_tensor.device(place) = in_sum_3_tensor;
+  if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) {
+    // Move the sum to a different buffer to avoid loss of precision due to
+    // too many sums.
+    out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+  }
+  // The window length is min(max_average_window, num_updates *
+  // average_window). The product is computed in floating point and converted
+  // to int64_t by the explicit std::min<int64_t> instantiation.
+  if ((*num_accumulates_cpu_ptr) >= min_average_window &&
+      (*num_accumulates_cpu_ptr) >=
+          std::min<int64_t>(max_average_window,
+                            (*num_updates_cpu_ptr) * average_window)) {
+    //  Now the average window is too long, discard the old sum.
+    //  sum_3 receives sum_1 + sum_2, both of which are then zeroed, and the
+    //  finished window's count is kept in old_num_accumulates.
+    out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+    constant_functor(dev_ctx, out_sum_2, static_cast<T>(0));
+    (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr);
+    (*num_accumulates_cpu_ptr) = 0;
+  }
+
+  // Set accumulators to output
+  // (presumably writes the three counter values into the out_* tensors; the
+  // implementation is specialised per Context outside this file).
+  SetAccumulators<Context>(dev_ctx,
+                           *num_updates_cpu_ptr,
+                           *num_accumulates_cpu_ptr,
+                           *old_num_accumulates_cpu_ptr,
+                           out_num_accumulates,
+                           out_old_num_accumulates,
+                           out_num_updates);
+}
+
+}  // namespace phi
--- a/paddle/phi/ops/compat/average_accumulates_sig.cc
+++ b/paddle/phi/ops/compat/average_accumulates_sig.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+// Builds the signature that binds the `average_accumulates` operator's
+// argument names to the kernel of the same name: seven input tensors, three
+// window attributes and six output tensors, listed in kernel-parameter order.
+// The mapping context is not consulted.
+KernelSignature AverageAccumulatesOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  KernelSignature signature("average_accumulates",
+                            // Inputs.
+                            {"param",
+                             "in_sum_1",
+                             "in_sum_2",
+                             "in_sum_3",
+                             "in_num_accumulates",
+                             "in_old_num_accumulates",
+                             "in_num_updates"},
+                            // Attributes.
+                            {"average_window",
+                             "max_average_window",
+                             "min_average_window"},
+                            // Outputs.
+                            {"out_sum_1",
+                             "out_sum_2",
+                             "out_sum_3",
+                             "out_num_accumulates",
+                             "out_old_num_accumulates",
+                             "out_num_updates"});
+  return signature;
+}
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(average_accumulates,
+                           phi::AverageAccumulatesOpArgumentMapping);
--- a/python/paddle/incubate/optimizer/modelaverage.py
+++ b/python/paddle/incubate/optimizer/modelaverage.py
@@ -21,6 +21,7 @@ import numpy as np
 from paddle.fluid.dygraph import base as imperative_base
 from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
 from paddle import _C_ops
+from paddle.fluid.framework import in_dygraph_mode

 __all__ = []

@@ -231,7 +232,14 @@ class ModelAverage(Optimizer):
        old_num_accumulates = self._get_accumulator('old_num_accumulates',
                                                    param_and_grad[0])
        num_updates = self._get_accumulator('num_updates', param_and_grad[0])
-        if framework._non_static_mode():
+
+        if in_dygraph_mode():
+            _, _, _, _, _, _ = _C_ops.final_state_average_accumulates_(
+                param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
+                old_num_accumulates, num_updates, self.average_window,
+                self.max_average_window, self.min_average_window)
+            return None
+        elif framework._non_static_mode():
            _, _, _, _, _, _ = _C_ops.average_accumulates(
                param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
                old_num_accumulates, num_updates, sum_1, sum_2, sum_3,