提交 e0b136c0 编写于 作者: W wanghaoshuang

Refine average accumulates op

1. Rename inputs and outputs
2. Add some comments
上级 87fe52c1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -21,9 +21,9 @@ template <> ...@@ -21,9 +21,9 @@ template <>
void getAccumulators<paddle::platform::CPUDeviceContext>( void getAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t& num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0]; old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
num_accumulates_ = in_num_accumulates->data<int64_t>()[0]; num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
...@@ -34,9 +34,9 @@ template <> ...@@ -34,9 +34,9 @@ template <>
void setAccumulators<paddle::platform::CPUDeviceContext>( void setAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_, const framework::ExecutionContext& ctx, int64_t num_updates_,
int64_t num_accumulates_, int64_t old_num_accumulates_) { int64_t num_accumulates_, int64_t old_num_accumulates_) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("old_num_accumulates"); auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("num_accumulates"); auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("num_updates"); auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_; out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
out_num_accumulates->data<int64_t>()[0] = num_accumulates_; out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
...@@ -49,64 +49,62 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel { ...@@ -49,64 +49,62 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("Param"), ctx->HasInput("param"),
"Input (Param) of average_accumulates op should not be null."); "Input (param) of average_accumulates op should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("Grad"), ctx->HasInput("in_sum_1"),
"Input (Grad) of average_accumulates op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("sum_1"),
"Input (sum_1) of average_accumulates op should not be null."); "Input (sum_1) of average_accumulates op should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("sum_2"), ctx->HasInput("in_sum_2"),
"Input (sum_2) of average_accumulates op should not be null."); "Input (sum_2) of average_accumulates op should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("sum_3"), ctx->HasInput("in_sum_3"),
"Input (sum_3) of average_accumulates op should not be null."); "Input (sum_3) of average_accumulates op should not be null.");
PADDLE_ENFORCE(ctx->HasInput("num_accumulates"), PADDLE_ENFORCE(
"Input (num_accumulates) of average_accumulates op should " ctx->HasInput("in_num_accumulates"),
"not be null."); "Input (in_num_accumulates) of average_accumulates op should "
PADDLE_ENFORCE(ctx->HasInput("old_num_accumulates"), "not be null.");
PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
"Input (old_num_accumulates) of average_accumulates op " "Input (old_num_accumulates) of average_accumulates op "
"should not be null."); "should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("num_updates"), ctx->HasInput("in_num_updates"),
"Input (num_updates) of average_accumulates op should not be null."); "Input (num_updates) of average_accumulates op should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasOutput("sum_1"), ctx->HasOutput("out_sum_1"),
"Output (sum_1) of average_accumulates op should not be null."); "Output (sum_1) of average_accumulates op should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasOutput("sum_2"), ctx->HasOutput("out_sum_2"),
"Output (sum_2) of average_accumulates op should not be null."); "Output (sum_2) of average_accumulates op should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasOutput("sum_3"), ctx->HasOutput("out_sum_3"),
"Output (sum_3) of average_accumulates op should not be null."); "Output (sum_3) of average_accumulates op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("num_accumulates"), PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
"Output (num_accumulates) of average_accumulates op should " "Output (num_accumulates) of average_accumulates op should "
"not be null."); "not be null.");
PADDLE_ENFORCE(ctx->HasOutput("old_num_accumulates"), PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
"Output (old_num_accumulates) of average_accumulates op " "Output (old_num_accumulates) of average_accumulates op "
"should not be null."); "should not be null.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasOutput("num_updates"), ctx->HasOutput("out_num_updates"),
"Output (num_updates) of average_accumulates op should not be null."); "Output (num_updates) of average_accumulates op should not be null.");
auto in_dim = ctx->GetInputDim("Param"); auto in_dim = ctx->GetInputDim("param");
ctx->SetOutputDim("sum_1", in_dim); ctx->SetOutputDim("out_sum_1", in_dim);
ctx->SetOutputDim("sum_2", in_dim); ctx->SetOutputDim("out_sum_2", in_dim);
ctx->SetOutputDim("sum_3", in_dim); ctx->SetOutputDim("out_sum_3", in_dim);
ctx->SetOutputDim("num_accumulates", {1}); ctx->SetOutputDim("out_num_accumulates", {1});
ctx->SetOutputDim("old_num_accumulates", {1}); ctx->SetOutputDim("out_old_num_accumulates", {1});
ctx->SetOutputDim("num_updates", {1}); ctx->SetOutputDim("out_num_updates", {1});
} }
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Param")->type()), framework::ToDataType(ctx.Input<Tensor>("param")->type()),
ctx.GetPlace()); ctx.GetPlace());
} }
}; };
...@@ -115,26 +113,60 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -115,26 +113,60 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker) AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("sum_1", ""); AddInput("param",
AddInput("sum_2", ""); "Input(Tensor or LoDTensor): The parameter to be accumulated.");
AddInput("sum_3", ""); AddInput("in_sum_1",
AddInput("num_accumulates", ""); "Input(Tensor or LoDTensor): A tensor used to store the parameter "
AddInput("old_num_accumulates", ""); "sums with the same shape as input(param).");
AddInput("num_updates", ""); AddInput("in_sum_2",
"Input(Tensor or LoDTensor): An auxiliary tensor to help "
AddOutput("sum_1", ""); "accumulating sums of parameter values with the same shape as "
AddOutput("sum_2", ""); "input(param). It is used to avoid loss of precision due to too "
AddOutput("sum_3", ""); "many sums.");
AddOutput("num_accumulates", ""); AddInput("in_sum_3",
AddOutput("old_num_accumulates", ""); "Input(Tensor or LoDTensor): An auxiliary tensor to help "
AddOutput("num_updates", ""); "accumulating sums of parameter values with the same shape as "
"input(param).");
AddAttr<float>("", "average_window"); AddInput("in_num_accumulates",
AddAttr<float>("", "max_average_window"); "Input(Tensor): The accumulating times of current window with "
AddAttr<float>("", "min_average_window"); "shape [1].");
AddInput("in_old_num_accumulates",
"Input(Tensor): The accumulating times of previous window with "
"shape [1].");
AddInput("in_num_updates",
"Input(Tensor): The total number of batches used by training "
"before this batch with shape [1].");
AddOutput("out_sum_1",
"Output(Tensor or LoDTensor): A tensor used to store the "
"parameter sums with the same shape as input(param).");
AddOutput("out_sum_2",
"Output(Tensor or LoDTensor): An auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddOutput("out_sum_3",
"Output(Tensor or LoDTensor): An auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddOutput("out_num_accumulates",
"Output(Tensor): The accumulating times of current window with "
"shape [1].");
AddOutput("out_old_num_accumulates",
"Output(Tensor): The accumulating times of previous window with "
"shape [1].");
AddOutput("out_num_updates",
"Output(Tensor): The total number of batches used by training "
"before this batch with shape [1].");
AddAttr<float>("average_window",
"The rate of average window size relative to num_updates.");
AddAttr<int64_t>("max_average_window", "Maximum size of average window.");
AddAttr<int64_t>("min_average_window", "Minimum size of average window.");
AddComment(R"DOC( AddComment(R"DOC(
AverageAccumulates Operator. AverageAccumulates Operator.
Accumulate the sum of the parameter within a sliding window. The size of the sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'.
)DOC"); )DOC");
} }
}; };
...@@ -143,10 +175,10 @@ AverageAccumulates Operator. ...@@ -143,10 +175,10 @@ AverageAccumulates Operator.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(average_accumulate, ops::AverageAccumulatesOp, REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
ops::AverageAccumulatesOpMaker, ops::AverageAccumulatesOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
average_accumulate, average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>, ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>); ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -21,39 +21,43 @@ template <> ...@@ -21,39 +21,43 @@ template <>
void getAccumulators<paddle::platform::CUDADeviceContext>( void getAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t& num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), &old_num_accumulates_, memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(), platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
sizeof(int64_t)); sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
in_old_num_accumulates->data<int64_t>(), sizeof(int64_t)); in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
in_num_updates->data<int64_t>(), sizeof(int64_t)); in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
} }
template <> template <>
void setAccumulators<paddle::platform::CUDADeviceContext>( void setAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_, const framework::ExecutionContext& ctx, int64_t num_updates_,
int64_t num_accumulates_, int64_t old_num_accumulates_) { int64_t num_accumulates_, int64_t old_num_accumulates_) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("old_num_accumulates"); auto stream = ctx.cuda_device_context().stream();
auto* out_num_accumulates = ctx.Output<Tensor>("num_accumulates"); auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("num_updates"); auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(), memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t)); platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(), memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t)); platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
stream);
memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(), memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
platform::CPUPlace(), &num_updates_, sizeof(int64_t)); platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
}
}
} }
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
average_accumulate, average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>, ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>); ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -29,88 +29,80 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>; ...@@ -29,88 +29,80 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext> template <typename DeviceContext>
void getAccumulators(const framework::ExecutionContext& ctx, void getAccumulators(const framework::ExecutionContext& ctx,
int64_t& num_updates_, int64_t& num_accumulates_, int64_t& num_updates, int64_t& num_accumulates,
int64_t& old_num_accumulates_); int64_t& old_num_accumulates);
template <typename DeviceContext> template <typename DeviceContext>
void setAccumulators(const framework::ExecutionContext& ctx, void setAccumulators(const framework::ExecutionContext& ctx,
int64_t num_updates_, int64_t num_accumulates_, int64_t num_updates, int64_t num_accumulates,
int64_t old_num_accumulates_); int64_t old_num_accumulates);
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> { class AverageAccumulatesKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
// It is used to avoid loss of precision
static const int64_t kMaxNumAccumulates = 16384; static const int64_t kMaxNumAccumulates = 16384;
// accumulators // Get accumulators from input
int64_t num_updates_ = 0; int64_t num_updates = 0;
int64_t num_accumulates_ = 0; int64_t num_accumulates = 0;
int64_t old_num_accumulates_ = 0; int64_t old_num_accumulates = 0;
// attrs getAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
int64_t min_average_window_; old_num_accumulates);
int64_t max_average_window_;
float average_window_; // Get attrs
float average_window = ctx.Attr<float>("average_window");
auto* param = ctx.Input<Tensor>("Param"); int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
auto* in_sum_1 = ctx.Input<Tensor>("sum_1"); int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
auto* in_sum_2 = ctx.Input<Tensor>("sum_2"); min_average_window =
auto* in_sum_3 = ctx.Input<Tensor>("sum_3"); std::min<int64_t>(min_average_window, max_average_window);
auto* out_sum_1 = ctx.Output<Tensor>("sum_1"); // Get inputs
auto* out_sum_2 = ctx.Output<Tensor>("sum_2"); auto* param = ctx.Input<Tensor>("param");
auto* out_sum_3 = ctx.Output<Tensor>("sum_3"); auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
getAccumulators<DeviceContext>(ctx, num_updates_, num_accumulates_, auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
old_num_accumulates_);
average_window_ = ctx.Attr<float>("average_window");
max_average_window_ =
ctx.Attr<int64_t>("max_average_window"); // default bach number
min_average_window_ =
ctx.Attr<int64_t>("min_average_window"); // default 10000L
min_average_window_ =
std::min<int64_t>(min_average_window_, max_average_window_);
auto param_tensor = EigenVector<T>::Flatten(*param); auto param_tensor = EigenVector<T>::Flatten(*param);
auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1); auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2); auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3); auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
// Get outputs
auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1); auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2); auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3); auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
// Compute
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::SetConstant<DeviceContext, T> constant_functor; math::SetConstant<DeviceContext, T> constant_functor;
// start batch ++num_updates;
++num_updates_; ++num_accumulates;
++num_accumulates_;
// update
out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
out_sum_2_tensor.device(place) = in_sum_2_tensor; out_sum_2_tensor.device(place) = in_sum_2_tensor;
out_sum_3_tensor.device(place) = in_sum_3_tensor; out_sum_3_tensor.device(place) = in_sum_3_tensor;
// needSpecialTraversal if (num_updates % kMaxNumAccumulates == 0) {
if (num_updates_ % kMaxNumAccumulates == 0) {
out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1, constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0); 0.0);
} }
if (num_accumulates >= min_average_window &&
if (num_accumulates_ >= min_average_window_ && num_accumulates >= std::min<int64_t>(max_average_window,
num_accumulates_ >= std::min<int64_t>(max_average_window_, num_updates * average_window)) {
num_updates_ * average_window_)) {
out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1, constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0); 0.0);
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2, constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
0.0); 0.0);
old_num_accumulates = num_accumulates;
// finishBatch num_accumulates = 0;
old_num_accumulates_ = num_accumulates_;
num_accumulates_ = 0;
} }
setAccumulators<DeviceContext>(ctx, num_updates_, num_accumulates_,
old_num_accumulates_); // Set accumulators to output
setAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
old_num_accumulates);
} }
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册