未验证 提交 eafd4280 编写于 作者: W Wang Bojun 提交者: GitHub

Phi average accumulates migration (#44554)

* move average_accumulates op to phi kernel
上级 122fff46
......@@ -57,6 +57,8 @@ no_amp_list = [
'adam',
'adamw_',
'adamw',
'average_accumulates',
'average_accumulates_',
'decayed_adagrad_',
'decayed_adagrad',
'dgc_momentum_',
......
......@@ -12,99 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/average_accumulates_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
// CPU specialization: the scalar (single-element) accumulator tensors are
// host-addressable, so each counter is read directly from element 0.
template <>
void GetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
                                      int64_t* num_updates,
                                      int64_t* num_accumulates,
                                      int64_t* old_num_accumulates) {
  *num_updates = ctx.Input<Tensor>("in_num_updates")->data<int64_t>()[0];
  *num_accumulates =
      ctx.Input<Tensor>("in_num_accumulates")->data<int64_t>()[0];
  *old_num_accumulates =
      ctx.Input<Tensor>("in_old_num_accumulates")->data<int64_t>()[0];
}
// CPU specialization: writes each host-side counter back into element 0 of
// the corresponding scalar output tensor.
template <>
void SetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
                                      int64_t num_updates,
                                      int64_t num_accumulates,
                                      int64_t old_num_accumulates) {
  ctx.Output<Tensor>("out_num_updates")->data<int64_t>()[0] = num_updates;
  ctx.Output<Tensor>("out_num_accumulates")->data<int64_t>()[0] =
      num_accumulates;
  ctx.Output<Tensor>("out_old_num_accumulates")->data<int64_t>()[0] =
      old_num_accumulates;
}
class AverageAccumulatesOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(
ctx->HasInput("param"), "Input", "param", "AverageAccumulates");
OP_INOUT_CHECK(
ctx->HasInput("in_sum_1"), "Input", "in_sum_1", "AverageAccumulates");
OP_INOUT_CHECK(
ctx->HasInput("in_sum_2"), "Input", "in_sum_2", "AverageAccumulates");
OP_INOUT_CHECK(
ctx->HasInput("in_sum_3"), "Input", "in_sum_3", "AverageAccumulates");
OP_INOUT_CHECK(ctx->HasInput("in_num_accumulates"),
"Input",
"in_num_accumulates",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasInput("in_old_num_accumulates"),
"Input",
"in_old_num_accumulates",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasInput("in_num_updates"),
"Input",
"in_num_updates",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasOutput("out_sum_1"),
"Output",
"out_sum_1",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasOutput("out_sum_2"),
"Output",
"out_sum_2",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasOutput("out_sum_3"),
"Output",
"out_sum_3",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasOutput("out_num_accumulates"),
"Output",
"out_num_accumulates",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasOutput("out_old_num_accumulates"),
"Output",
"out_old_num_accumulates",
"AverageAccumulates");
OP_INOUT_CHECK(ctx->HasOutput("out_num_updates"),
"Output",
"out_num_updates",
"AverageAccumulates");
auto in_dim = ctx->GetInputDim("param");
ctx->SetOutputDim("out_sum_1", in_dim);
ctx->SetOutputDim("out_sum_2", in_dim);
ctx->SetOutputDim("out_sum_3", in_dim);
ctx->SetOutputDim("out_num_accumulates", {1});
ctx->SetOutputDim("out_old_num_accumulates", {1});
ctx->SetOutputDim("out_num_updates", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -209,12 +129,14 @@ And for a mini-batch in training, accumulators were computed as below steps:
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates,
AverageAccumulatesInferShapeFunctor,
PD_INFER_META(phi::AverageAccumulatesInferMeta));
REGISTER_OPERATOR(
average_accumulates,
ops::AverageAccumulatesOp,
ops::AverageAccumulatesOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(average_accumulates,
ops::AverageAccumulatesKernel<phi::CPUContext, float>,
ops::AverageAccumulatesKernel<phi::CPUContext, double>);
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
AverageAccumulatesInferShapeFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/average_accumulates_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
namespace paddle {
namespace operators {
// GPU specialization: copies the three scalar counters from device memory
// into the caller-supplied host integers.
//
// Bug fix: memory::Copy with a non-null stream issues an *asynchronous*
// cudaMemcpyAsync, but the caller dereferences the host pointers
// immediately after this function returns. Synchronize before returning so
// the host values are guaranteed to be populated.
template <>
void GetAccumulators<paddle::platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx,
    int64_t* num_updates_,
    int64_t* num_accumulates_,
    int64_t* old_num_accumulates_) {
  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
  auto stream = ctx.cuda_device_context().stream();
  auto cuda_place = in_old_num_accumulates->place();
  memory::Copy(platform::CPUPlace(),
               old_num_accumulates_,
               cuda_place,
               in_old_num_accumulates->data<int64_t>(),
               sizeof(int64_t),
               stream);
  memory::Copy(platform::CPUPlace(),
               num_accumulates_,
               cuda_place,
               in_num_accumulates->data<int64_t>(),
               sizeof(int64_t),
               stream);
  memory::Copy(platform::CPUPlace(),
               num_updates_,
               cuda_place,
               in_num_updates->data<int64_t>(),
               sizeof(int64_t),
               stream);
  // Wait for the asynchronous device-to-host copies to land before the
  // caller reads the host integers.
  ctx.cuda_device_context().Wait();
}
// GPU specialization: copies the three host-side counters into the scalar
// output tensors on the device.
//
// Bug fix: the copy sources (&old_num_accumulates_ etc.) are by-value
// parameters living on this function's stack, while memory::Copy with a
// stream is asynchronous. Without synchronization the copies can read the
// stack memory after this frame has been torn down. Wait before returning.
template <>
void SetAccumulators<paddle::platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx,
    int64_t num_updates_,
    int64_t num_accumulates_,
    int64_t old_num_accumulates_) {
  auto stream = ctx.cuda_device_context().stream();
  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
  auto cuda_place = out_old_num_accumulates->place();
  memory::Copy(cuda_place,
               out_old_num_accumulates->data<int64_t>(),
               platform::CPUPlace(),
               &old_num_accumulates_,
               sizeof(int64_t),
               stream);
  memory::Copy(cuda_place,
               out_num_accumulates->data<int64_t>(),
               platform::CPUPlace(),
               &num_accumulates_,
               sizeof(int64_t),
               stream);
  memory::Copy(cuda_place,
               out_num_updates->data<int64_t>(),
               platform::CPUPlace(),
               &num_updates_,
               sizeof(int64_t),
               stream);
  // Ensure the asynchronous host-to-device copies finished before the
  // stack-resident source integers go out of scope.
  ctx.cuda_device_context().Wait();
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// Reads the three scalar (single-int64) accumulator tensors from the
// execution context into host integers. Specialized per device; the CUDA
// specialization performs device-to-host copies.
template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t* num_updates,
                     int64_t* num_accumulates,
                     int64_t* old_num_accumulates);

// Writes the host-side counters back into the scalar output tensors.
// Specialized per device; the CUDA specialization performs host-to-device
// copies.
template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t num_updates,
                     int64_t num_accumulates,
                     int64_t old_num_accumulates);
// Legacy fluid kernel for model (parameter) averaging: each call folds the
// current parameter into a set of running sums and maintains the
// update/accumulation counters. The code order is significant: the lazy
// Eigen assignments below may be overwritten by the conditional branches.
template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // It is used to avoid loss of precision
    static const int64_t kMaxNumAccumulates = 16384;
    // Get accumulators from input
    int64_t num_updates = 0;
    int64_t num_accumulates = 0;
    int64_t old_num_accumulates = 0;
    GetAccumulators<DeviceContext>(
        ctx, &num_updates, &num_accumulates, &old_num_accumulates);
    // Get attrs
    float average_window = ctx.Attr<float>("average_window");
    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
    PADDLE_ENFORCE_LE(
        min_average_window,
        max_average_window,
        platform::errors::InvalidArgument(
            "The min_average_window > "
            "max_average_window is not right, min_average_window is %ld, "
            "max_average_window is %ld.",
            min_average_window,
            max_average_window));
    // Get inputs
    auto* param = ctx.Input<Tensor>("param");
    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
    auto param_tensor = framework::EigenVector<T>::Flatten(*param);
    auto in_sum_1_tensor = framework::EigenVector<T>::Flatten(*in_sum_1);
    auto in_sum_2_tensor = framework::EigenVector<T>::Flatten(*in_sum_2);
    auto in_sum_3_tensor = framework::EigenVector<T>::Flatten(*in_sum_3);
    // Get outputs
    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
    auto out_sum_1_tensor = framework::EigenVector<T>::Flatten(*out_sum_1);
    auto out_sum_2_tensor = framework::EigenVector<T>::Flatten(*out_sum_2);
    auto out_sum_3_tensor = framework::EigenVector<T>::Flatten(*out_sum_3);
    // Compute
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    phi::funcs::SetConstant<DeviceContext, T> constant_functor;
    ++num_updates;
    ++num_accumulates;
    // sum_1 accumulates the parameter every step; sum_2/sum_3 carry over
    // unless one of the rollover branches below rewrites them.
    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
    out_sum_2_tensor.device(place) = in_sum_2_tensor;
    out_sum_3_tensor.device(place) = in_sum_3_tensor;
    if (num_updates % kMaxNumAccumulates == 0) {
      // Move the sum to a different buffer to avoid loss of precision due to
      // too many sums.
      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
      constant_functor(
          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
    }
    if (num_accumulates >= min_average_window &&
        num_accumulates >= std::min<int64_t>(max_average_window,
                                             num_updates * average_window)) {
      // Now the average window is too long, discard the old sum.
      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
      constant_functor(
          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
      constant_functor(
          ctx.template device_context<DeviceContext>(), out_sum_2, 0.0);
      old_num_accumulates = num_accumulates;
      num_accumulates = 0;
    }
    // Set accumulators to output
    SetAccumulators<DeviceContext>(
        ctx, num_updates, num_accumulates, old_num_accumulates);
  }
};
} // namespace operators
} // namespace paddle
......@@ -264,6 +264,17 @@
kernel :
func : auc
#average_accumulates
- api : average_accumulates_
args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window)
output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates)
infer_meta:
func : AverageAccumulatesInferMeta
kernel :
func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense}
data_type : param
inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates)
# batch_norm
- api : batch_norm
args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
......
......@@ -434,6 +434,68 @@ void AucInferMeta(const MetaTensor& input,
}
}
void AverageAccumulatesInferMeta(const MetaTensor& param,
const MetaTensor& in_sum_1,
const MetaTensor& in_sum_2,
const MetaTensor& in_sum_3,
const MetaTensor& in_num_accumulates,
const MetaTensor& in_old_num_accumulates,
const MetaTensor& in_num_updates,
float average_window,
int64_t max_average_window,
int64_t min_average_window,
MetaTensor* out_sum_1,
MetaTensor* out_sum_2,
MetaTensor* out_sum_3,
MetaTensor* out_num_accumulates,
MetaTensor* out_old_num_accumulates,
MetaTensor* out_num_updates) {
// auto in_dim = param.dims;
PADDLE_ENFORCE_NE(
out_sum_1,
nullptr,
errors::NotFound(
"Output(out_sum_1) of AverageAccumulates should not be null."));
PADDLE_ENFORCE_NE(
out_sum_2,
nullptr,
errors::NotFound(
"Output(out_sum_2) of AverageAccumulates should not be null."));
PADDLE_ENFORCE_NE(
out_sum_3,
nullptr,
errors::NotFound(
"Output(out_sum_3) of AverageAccumulates should not be null."));
PADDLE_ENFORCE_NE(out_num_accumulates,
nullptr,
errors::NotFound("Output(out_num_accumulates) of "
"AverageAccumulates should not be null."));
PADDLE_ENFORCE_NE(out_old_num_accumulates,
nullptr,
errors::NotFound("Output(out_old_num_accumulates) of "
"AverageAccumulates should not be null."));
PADDLE_ENFORCE_NE(
out_num_updates,
nullptr,
errors::NotFound(
"Output(out_num_updates) of AverageAccumulates should not be null."));
out_sum_1->set_dims(in_sum_1.dims());
out_sum_1->set_dtype(in_sum_1.dtype());
out_sum_2->set_dims(in_sum_2.dims());
out_sum_2->set_dtype(in_sum_2.dtype());
out_sum_3->set_dims(in_sum_3.dims());
out_sum_3->set_dtype(in_sum_3.dtype());
out_num_accumulates->set_dims({1});
out_num_accumulates->set_dtype(in_num_accumulates.dtype());
out_old_num_accumulates->set_dims({1});
out_old_num_accumulates->set_dtype(in_old_num_accumulates.dtype());
out_num_updates->set_dims({1});
out_num_updates->set_dtype(in_num_updates.dtype());
}
void BatchNormInferMeta(const MetaTensor& x,
const MetaTensor& scale,
const MetaTensor& bias,
......
......@@ -134,6 +134,23 @@ void AucInferMeta(const MetaTensor& input,
MetaTensor* stat_neg_out,
MetaConfig config = MetaConfig());
// Infers shapes/dtypes for the average_accumulates op: sum outputs mirror
// their inputs; counter outputs are {1}-shaped scalars.
void AverageAccumulatesInferMeta(const MetaTensor& param,
                                 const MetaTensor& in_sum_1,
                                 const MetaTensor& in_sum_2,
                                 const MetaTensor& in_sum_3,
                                 const MetaTensor& in_num_accumulates,
                                 const MetaTensor& in_old_num_accumulates,
                                 const MetaTensor& in_num_updates,
                                 float average_window,
                                 int64_t max_average_window,
                                 int64_t min_average_window,
                                 MetaTensor* out_sum_1,
                                 MetaTensor* out_sum_2,
                                 MetaTensor* out_sum_3,
                                 MetaTensor* out_num_accumulates,
                                 MetaTensor* out_old_num_accumulates,
                                 MetaTensor* out_num_updates);
void BatchNormInferMeta(const MetaTensor& x,
const MetaTensor& scale,
const MetaTensor& bias,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
// Reads the three scalar (single-int64) counter tensors into host
// integers. Specialized per backend; the GPU specialization copies
// device-to-host.
template <typename Context>
void GetAccumulators(const Context& dev_ctx,
                     const DenseTensor& in_num_accumulates,
                     const DenseTensor& in_old_num_accumulates,
                     const DenseTensor& in_num_updates,
                     int64_t* num_updates,
                     int64_t* num_accumulates,
                     int64_t* old_num_accumulates);

// Writes the host-side counters into the scalar output tensors.
// Specialized per backend; the GPU specialization copies host-to-device.
template <typename Context>
void SetAccumulators(const Context& dev_ctx,
                     int64_t num_updates,
                     int64_t num_accumulates,
                     int64_t old_num_accumulates,
                     DenseTensor* out_num_accumulates,
                     DenseTensor* out_old_num_accumulates,
                     DenseTensor* out_num_updates);

// One step of model-average accumulation: folds `param` into the running
// sums and updates the counters (see the op's documentation for the
// windowing semantics).
template <typename T, typename Context>
void AverageAccumulatesKernel(const Context& dev_ctx,
                              const DenseTensor& param,
                              const DenseTensor& in_sum_1,
                              const DenseTensor& in_sum_2,
                              const DenseTensor& in_sum_3,
                              const DenseTensor& in_num_accumulates,
                              const DenseTensor& in_old_num_accumulates,
                              const DenseTensor& in_num_updates,
                              float average_window,
                              int64_t max_average_window,
                              int64_t min_average_window,
                              DenseTensor* out_sum_1,
                              DenseTensor* out_sum_2,
                              DenseTensor* out_sum_3,
                              DenseTensor* out_num_accumulates,
                              DenseTensor* out_old_num_accumulates,
                              DenseTensor* out_num_updates);
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/average_accumulates_kernel.h"
#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
// CPU specialization: the scalar counter tensors are host-addressable, so
// each value is read straight out of element 0.
template <>
void GetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
                                      const DenseTensor& in_num_accumulates,
                                      const DenseTensor& in_old_num_accumulates,
                                      const DenseTensor& in_num_updates,
                                      int64_t* num_updates,
                                      int64_t* num_accumulates,
                                      int64_t* old_num_accumulates) {
  *num_updates = in_num_updates.data<int64_t>()[0];
  *num_accumulates = in_num_accumulates.data<int64_t>()[0];
  *old_num_accumulates = in_old_num_accumulates.data<int64_t>()[0];
}
// CPU specialization: writes the host-side counters into the scalar output
// tensors.
//
// Bug fix: the outputs must be allocated before writing; calling
// data<int64_t>() on a not-yet-allocated DenseTensor is invalid. The GPU
// specialization already allocates explicitly — do the same here for
// correctness and consistency.
template <>
void SetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
                                      int64_t num_updates,
                                      int64_t num_accumulates,
                                      int64_t old_num_accumulates,
                                      DenseTensor* out_num_accumulates,
                                      DenseTensor* out_old_num_accumulates,
                                      DenseTensor* out_num_updates) {
  dev_ctx.Alloc<int64_t>(out_old_num_accumulates)[0] = old_num_accumulates;
  dev_ctx.Alloc<int64_t>(out_num_accumulates)[0] = num_accumulates;
  dev_ctx.Alloc<int64_t>(out_num_updates)[0] = num_updates;
}
} // namespace phi
// Registers the CPU kernel for float and double, mirroring the dtypes of
// the old REGISTER_OP_CPU_KERNEL registration.
PD_REGISTER_KERNEL(average_accumulates,
                   CPU,
                   ALL_LAYOUT,
                   phi::AverageAccumulatesKernel,
                   float,
                   double) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/average_accumulates_kernel.h"
#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
// GPU specialization: copies the three scalar counters from device memory
// into the caller-supplied host integers.
//
// Bug fix: paddle::memory::Copy with a non-null stream is asynchronous
// (cudaMemcpyAsync), but AverageAccumulatesKernel dereferences the host
// pointers immediately after this returns. Wait for the copies to finish.
template <>
void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
                                      const DenseTensor& in_num_accumulates,
                                      const DenseTensor& in_old_num_accumulates,
                                      const DenseTensor& in_num_updates,
                                      int64_t* num_updates,
                                      int64_t* num_accumulates,
                                      int64_t* old_num_accumulates) {
  auto stream = dev_ctx.stream();
  auto cuda_place = in_old_num_accumulates.place();
  paddle::memory::Copy(phi::CPUPlace(),
                       old_num_accumulates,
                       cuda_place,
                       in_old_num_accumulates.data<int64_t>(),
                       sizeof(int64_t),
                       stream);
  paddle::memory::Copy(phi::CPUPlace(),
                       num_accumulates,
                       cuda_place,
                       in_num_accumulates.data<int64_t>(),
                       sizeof(int64_t),
                       stream);
  paddle::memory::Copy(phi::CPUPlace(),
                       num_updates,
                       cuda_place,
                       in_num_updates.data<int64_t>(),
                       sizeof(int64_t),
                       stream);
  // Block until the asynchronous device-to-host copies have landed.
  dev_ctx.Wait();
}
// GPU specialization: allocates the scalar outputs on the device and
// copies the host-side counters into them.
//
// Fixes: (1) the third copy used out_old_num_accumulates->place() while
// the first two used dev_ctx.GetPlace() — all outputs live on the kernel's
// device, so use dev_ctx.GetPlace() uniformly; (2) the copy sources are
// by-value stack parameters and the copies are asynchronous, so wait
// before returning to avoid reading a dead stack frame.
template <>
void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
                                      int64_t num_updates,
                                      int64_t num_accumulates,
                                      int64_t old_num_accumulates,
                                      DenseTensor* out_num_accumulates,
                                      DenseTensor* out_old_num_accumulates,
                                      DenseTensor* out_num_updates) {
  int64_t* out_num_accumulates_ptr =
      dev_ctx.template Alloc<int64_t>(out_num_accumulates);
  int64_t* out_old_num_accumulates_ptr =
      dev_ctx.template Alloc<int64_t>(out_old_num_accumulates);
  int64_t* out_num_updates_ptr =
      dev_ctx.template Alloc<int64_t>(out_num_updates);
  auto stream = dev_ctx.stream();
  paddle::memory::Copy(dev_ctx.GetPlace(),
                       out_num_accumulates_ptr,
                       phi::CPUPlace(),
                       &num_accumulates,
                       sizeof(int64_t),
                       stream);
  paddle::memory::Copy(dev_ctx.GetPlace(),
                       out_old_num_accumulates_ptr,
                       phi::CPUPlace(),
                       &old_num_accumulates,
                       sizeof(int64_t),
                       stream);
  paddle::memory::Copy(dev_ctx.GetPlace(),
                       out_num_updates_ptr,
                       phi::CPUPlace(),
                       &num_updates,
                       sizeof(int64_t),
                       stream);
  // Ensure the copies completed before the stack-resident sources vanish.
  dev_ctx.Wait();
}
} // namespace phi
// Registers the GPU kernel for float and double, mirroring the dtypes of
// the old REGISTER_OP_CUDA_KERNEL registration.
PD_REGISTER_KERNEL(average_accumulates,
                   GPU,
                   ALL_LAYOUT,
                   phi::AverageAccumulatesKernel,
                   float,
                   double) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/kernels/average_accumulates_kernel.h"
#include <algorithm>
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void AverageAccumulatesKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& in_sum_1,
const DenseTensor& in_sum_2,
const DenseTensor& in_sum_3,
const DenseTensor& in_num_accumulates,
const DenseTensor& in_old_num_accumulates,
const DenseTensor& in_num_updates,
float average_window,
int64_t max_average_window,
int64_t min_average_window,
DenseTensor* out_sum_1,
DenseTensor* out_sum_2,
DenseTensor* out_sum_3,
DenseTensor* out_num_accumulates,
DenseTensor* out_old_num_accumulates,
DenseTensor* out_num_updates) {
// It is used to avoid loss of precision
static const int64_t kMaxNumAccumulates = 16384;
// Get accumulators from input
// int64_t num_updates = 0;
// int64_t num_accumulates = 0;
// int64_t old_num_accumulates = 0;
auto num_updates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* num_updates_cpu_ptr =
reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
auto num_accumulates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* num_accumulates_cpu_ptr =
reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
auto old_num_accumulates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* old_num_accumulates_cpu_ptr =
reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
GetAccumulators<Context>(dev_ctx,
in_num_accumulates,
in_old_num_accumulates,
in_num_updates,
num_updates_cpu_ptr,
num_accumulates_cpu_ptr,
old_num_accumulates_cpu_ptr);
// Get attrs
// float average_window = ctx.Attr<float>("average_window");
// int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
// int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
PADDLE_ENFORCE_LE(
min_average_window,
max_average_window,
errors::InvalidArgument(
"The min_average_window > "
"max_average_window is not right, min_average_window is %ld, "
"max_average_window is %ld.",
min_average_window,
max_average_window));
// Get inputs
// auto* param = ctx.Input<Tensor>("param");
// auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
// auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
// auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
auto param_tensor = EigenVector<T>::Flatten(param);
auto in_sum_1_tensor = EigenVector<T>::Flatten(in_sum_1);
auto in_sum_2_tensor = EigenVector<T>::Flatten(in_sum_2);
auto in_sum_3_tensor = EigenVector<T>::Flatten(in_sum_3);
// Get outputs
// auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
// auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
// auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
dev_ctx.template Alloc<T>(out_sum_1);
dev_ctx.template Alloc<T>(out_sum_2);
dev_ctx.template Alloc<T>(out_sum_3);
auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
// Compute
// auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto& place = *dev_ctx.eigen_device();
funcs::SetConstant<Context, T> constant_functor;
++(*num_updates_cpu_ptr);
++(*num_accumulates_cpu_ptr);
out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
out_sum_2_tensor.device(place) = in_sum_2_tensor;
out_sum_3_tensor.device(place) = in_sum_3_tensor;
if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) {
// Move the sum to a different buffer to avoid loss of precision due to
// too many sums.
out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
}
if ((*num_accumulates_cpu_ptr) >= min_average_window &&
(*num_accumulates_cpu_ptr) >=
std::min<int64_t>(max_average_window,
(*num_updates_cpu_ptr) * average_window)) {
// Now the average window is too long, discard the old sum.
out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
constant_functor(dev_ctx, out_sum_2, static_cast<T>(0));
(*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr);
(*num_accumulates_cpu_ptr) = 0;
}
// Set accumulators to output
SetAccumulators<Context>(dev_ctx,
*num_updates_cpu_ptr,
*num_accumulates_cpu_ptr,
*old_num_accumulates_cpu_ptr,
out_num_accumulates,
out_old_num_accumulates,
out_num_updates);
}
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
// Maps the legacy fluid op's named inputs/attributes/outputs onto the phi
// "average_accumulates" kernel signature. The orderings below must match
// the phi kernel's parameter order exactly.
KernelSignature AverageAccumulatesOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("average_accumulates",
                         {"param", "in_sum_1", "in_sum_2", "in_sum_3",
                          "in_num_accumulates", "in_old_num_accumulates",
                          "in_num_updates"},
                         {"average_window", "max_average_window",
                          "min_average_window"},
                         {"out_sum_1", "out_sum_2", "out_sum_3",
                          "out_num_accumulates", "out_old_num_accumulates",
                          "out_num_updates"});
}
} // namespace phi
// Registers the fluid-to-phi argument mapping for average_accumulates.
PD_REGISTER_ARG_MAPPING_FN(average_accumulates,
                           phi::AverageAccumulatesOpArgumentMapping);
......@@ -21,6 +21,7 @@ import numpy as np
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle import _C_ops
from paddle.fluid.framework import in_dygraph_mode
__all__ = []
......@@ -231,7 +232,14 @@ class ModelAverage(Optimizer):
old_num_accumulates = self._get_accumulator('old_num_accumulates',
param_and_grad[0])
num_updates = self._get_accumulator('num_updates', param_and_grad[0])
if framework._non_static_mode():
if in_dygraph_mode():
_, _, _, _, _, _ = _C_ops.final_state_average_accumulates_(
param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
old_num_accumulates, num_updates, self.average_window,
self.max_average_window, self.min_average_window)
return None
elif framework._non_static_mode():
_, _, _, _, _, _ = _C_ops.average_accumulates(
param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
old_num_accumulates, num_updates, sum_1, sum_2, sum_3,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册