Commit e0b136c0 authored by: W wanghaoshuang

Refine average accumulates op

1. Rename inputs and outputs
2. Add some comments
Parent commit: 87fe52c1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -21,9 +21,9 @@ template <>
void getAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("num_updates");
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
......@@ -34,9 +34,9 @@ template <>
void setAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_,
int64_t num_accumulates_, int64_t old_num_accumulates_) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("num_updates");
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
......@@ -49,64 +49,62 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("Param"),
"Input (Param) of average_accumulates op should not be null.");
ctx->HasInput("param"),
"Input (param) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("Grad"),
"Input (Grad) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("sum_1"),
ctx->HasInput("in_sum_1"),
"Input (sum_1) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("sum_2"),
ctx->HasInput("in_sum_2"),
"Input (sum_2) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("sum_3"),
ctx->HasInput("in_sum_3"),
"Input (sum_3) of average_accumulates op should not be null.");
PADDLE_ENFORCE(ctx->HasInput("num_accumulates"),
"Input (num_accumulates) of average_accumulates op should "
PADDLE_ENFORCE(
ctx->HasInput("in_num_accumulates"),
"Input (in_num_accumulates) of average_accumulates op should "
"not be null.");
PADDLE_ENFORCE(ctx->HasInput("old_num_accumulates"),
PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
"Input (old_num_accumulates) of average_accumulates op "
"should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("num_updates"),
ctx->HasInput("in_num_updates"),
"Input (num_updates) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("sum_1"),
ctx->HasOutput("out_sum_1"),
"Output (sum_1) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("sum_2"),
ctx->HasOutput("out_sum_2"),
"Output (sum_2) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("sum_3"),
ctx->HasOutput("out_sum_3"),
"Output (sum_3) of average_accumulates op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("num_accumulates"),
PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
"Output (num_accumulates) of average_accumulates op should "
"not be null.");
PADDLE_ENFORCE(ctx->HasOutput("old_num_accumulates"),
PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
"Output (old_num_accumulates) of average_accumulates op "
"should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("num_updates"),
ctx->HasOutput("out_num_updates"),
"Output (num_updates) of average_accumulates op should not be null.");
auto in_dim = ctx->GetInputDim("Param");
auto in_dim = ctx->GetInputDim("param");
ctx->SetOutputDim("sum_1", in_dim);
ctx->SetOutputDim("sum_2", in_dim);
ctx->SetOutputDim("sum_3", in_dim);
ctx->SetOutputDim("num_accumulates", {1});
ctx->SetOutputDim("old_num_accumulates", {1});
ctx->SetOutputDim("num_updates", {1});
ctx->SetOutputDim("out_sum_1", in_dim);
ctx->SetOutputDim("out_sum_2", in_dim);
ctx->SetOutputDim("out_sum_3", in_dim);
ctx->SetOutputDim("out_num_accumulates", {1});
ctx->SetOutputDim("out_old_num_accumulates", {1});
ctx->SetOutputDim("out_num_updates", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Param")->type()),
framework::ToDataType(ctx.Input<Tensor>("param")->type()),
ctx.GetPlace());
}
};
......@@ -115,26 +113,60 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("sum_1", "");
AddInput("sum_2", "");
AddInput("sum_3", "");
AddInput("num_accumulates", "");
AddInput("old_num_accumulates", "");
AddInput("num_updates", "");
AddOutput("sum_1", "");
AddOutput("sum_2", "");
AddOutput("sum_3", "");
AddOutput("num_accumulates", "");
AddOutput("old_num_accumulates", "");
AddOutput("num_updates", "");
AddAttr<float>("", "average_window");
AddAttr<float>("", "max_average_window");
AddAttr<float>("", "min_average_window");
AddInput("param",
"Input(Tensor or LoDTensor): The parameter to be accumulated.");
AddInput("in_sum_1",
"Input(Tensor or LoDTensor): A tensor used to store the parameter "
"sums with the same shape as input(param).");
AddInput("in_sum_2",
"Input(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddInput("in_sum_3",
"Input(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddInput("in_num_accumulates",
"Input(Tensor): The accumulating times of current window with "
"shape [1].");
AddInput("in_old_num_accumulates",
"Input(Tensor): The accumulating times of previous window with "
"shape [1].");
AddInput("in_num_updates",
"Input(Tensor): The total number of batches used by trainning "
"before this batch with shape [1].");
AddOutput("out_sum_1",
"Output(Tensor or LoDTensor): A tensor used to store the "
"parameter sums with the same shape as input(param).");
AddOutput("out_sum_2",
"Output(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddOutput("out_sum_3",
"Output(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddOutput("out_num_accumulates",
"Output(Tensor): The accumulating times of current window with "
"shape [1].");
AddOutput("out_old_num_accumulates",
"Output(Tensor): The accumulating times of previous window with "
"shape [1].");
AddOutput("out_num_updates",
"Output(Tensor): The total number of batches used by trainning "
"before this batch with shape [1].");
AddAttr<float>("average_window",
"The rate of average window size relative to num_updates.");
AddAttr<int64_t>("max_average_window", "Maximum size of average window.");
AddAttr<int64_t>("min_average_window", "Minimu size of average window.");
AddComment(R"DOC(
AverageAccumulates Operator.
Accumulate the sum of parameter whtin sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'.
)DOC");
}
};
......@@ -143,10 +175,10 @@ AverageAccumulates Operator.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(average_accumulate, ops::AverageAccumulatesOp,
REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
ops::AverageAccumulatesOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
average_accumulate,
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -21,39 +21,43 @@ template <>
void getAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("num_updates");
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
sizeof(int64_t));
sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
in_old_num_accumulates->data<int64_t>(), sizeof(int64_t));
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
in_num_updates->data<int64_t>(), sizeof(int64_t));
in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
}
template <>
void setAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_,
int64_t num_accumulates_, int64_t old_num_accumulates_) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("num_updates");
auto stream = ctx.cuda_device_context().stream();
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t));
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t));
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
platform::CPUPlace(), &num_updates_, sizeof(int64_t));
}
}
platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
average_accumulate,
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -29,88 +29,80 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext>
void getAccumulators(const framework::ExecutionContext& ctx,
int64_t& num_updates_, int64_t& num_accumulates_,
int64_t& old_num_accumulates_);
int64_t& num_updates, int64_t& num_accumulates,
int64_t& old_num_accumulates);
template <typename DeviceContext>
void setAccumulators(const framework::ExecutionContext& ctx,
int64_t num_updates_, int64_t num_accumulates_,
int64_t old_num_accumulates_);
int64_t num_updates, int64_t num_accumulates,
int64_t old_num_accumulates);
template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// It is used to avoid loss of precision
static const int64_t kMaxNumAccumulates = 16384;
// accumulators
int64_t num_updates_ = 0;
int64_t num_accumulates_ = 0;
int64_t old_num_accumulates_ = 0;
// attrs
int64_t min_average_window_;
int64_t max_average_window_;
float average_window_;
auto* param = ctx.Input<Tensor>("Param");
auto* in_sum_1 = ctx.Input<Tensor>("sum_1");
auto* in_sum_2 = ctx.Input<Tensor>("sum_2");
auto* in_sum_3 = ctx.Input<Tensor>("sum_3");
auto* out_sum_1 = ctx.Output<Tensor>("sum_1");
auto* out_sum_2 = ctx.Output<Tensor>("sum_2");
auto* out_sum_3 = ctx.Output<Tensor>("sum_3");
getAccumulators<DeviceContext>(ctx, num_updates_, num_accumulates_,
old_num_accumulates_);
average_window_ = ctx.Attr<float>("average_window");
max_average_window_ =
ctx.Attr<int64_t>("max_average_window"); // default bach number
min_average_window_ =
ctx.Attr<int64_t>("min_average_window"); // default 10000L
min_average_window_ =
std::min<int64_t>(min_average_window_, max_average_window_);
// Get accumulators from input
int64_t num_updates = 0;
int64_t num_accumulates = 0;
int64_t old_num_accumulates = 0;
getAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
old_num_accumulates);
// Get attrs
float average_window = ctx.Attr<float>("average_window");
int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
min_average_window =
std::min<int64_t>(min_average_window, max_average_window);
// Get inputs
auto* param = ctx.Input<Tensor>("param");
auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
auto param_tensor = EigenVector<T>::Flatten(*param);
auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
// Get outputs
auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
// Compute
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::SetConstant<DeviceContext, T> constant_functor;
// start batch
++num_updates_;
++num_accumulates_;
// update
++num_updates;
++num_accumulates;
out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
out_sum_2_tensor.device(place) = in_sum_2_tensor;
out_sum_3_tensor.device(place) = in_sum_3_tensor;
// needSpecialTraversal
if (num_updates_ % kMaxNumAccumulates == 0) {
if (num_updates % kMaxNumAccumulates == 0) {
out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0);
}
if (num_accumulates_ >= min_average_window_ &&
num_accumulates_ >= std::min<int64_t>(max_average_window_,
num_updates_ * average_window_)) {
if (num_accumulates >= min_average_window &&
num_accumulates >= std::min<int64_t>(max_average_window,
num_updates * average_window)) {
out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0);
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
0.0);
// finishBatch
old_num_accumulates_ = num_accumulates_;
num_accumulates_ = 0;
old_num_accumulates = num_accumulates;
num_accumulates = 0;
}
setAccumulators<DeviceContext>(ctx, num_updates_, num_accumulates_,
old_num_accumulates_);
// Set accumulators to output
setAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
old_num_accumulates);
}
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册