Commit 14a59d2e authored by Yu Yang

Merge branch 'develop' of github.com:baidu/Paddle into feature/grad_reg_mechanism_cont2

......@@ -33,22 +33,45 @@ The mapping relationship between an operator and its gradient operators is a fun
```cpp
// (OpDesc) --> vector<OpDesc>
using GradOpDescMaker = std::function<std::vector<OpDesc>(const OpDesc&)>;
std::function<std::vector<OpDescBind>(const OpDescBind&)>;
```
The function takes an `OpDesc` of the forward operator and returns one or many gradient operator descriptions.
The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper around the protobuf message `OpDesc` that allows fast in-memory manipulation of `OpDesc`.
The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should be
```cpp
struct OpInfo {
GradOpDescMaker grad_op_maker_;
std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> grad_op_maker_;
...
};
```
The `grad_op_maker_` is `nullptr` if the operator does not have any associated gradient operators.
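For illustration only, the backward-building code could consult this field roughly as follows; the `OpInfoMap::Instance().Get()` lookup and the `fwd_op` variable are assumptions about the surrounding framework, not part of this change:
```cpp
// Sketch (assumed lookup API): fetch the forward op's registered info and,
// if a gradient maker exists, invoke it to obtain the gradient op descriptions.
const OpInfo& info = OpInfoMap::Instance().Get(fwd_op.Type());
PADDLE_ENFORCE(info.grad_op_maker_ != nullptr,
               "Operator %s has no gradient operator", fwd_op.Type());
std::vector<std::unique_ptr<OpDescBind>> grad_ops = info.grad_op_maker_(fwd_op);
```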
We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
```cpp
class GradOpDescMakerBase {
public:
GradOpDescMakerBase(const OpDescBind& fwd_op);
virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
};
```
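For example (illustrative only, not part of the design itself), a maker for a hypothetical `scale` operator could subclass it, using the protected helpers (`OutputGrad`, `InputGrad`, `Attrs`) that appear in the `grad_op_desc_maker.h` changes further below:
```cpp
// Sketch: gradient maker for a hypothetical `scale` operator, whose gradient
// is the same scaling applied to the output gradient.
class ScaleGradOpDescMaker : public GradOpDescMakerBase {
 public:
  using GradOpDescMakerBase::GradOpDescMakerBase;

  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
    std::vector<std::unique_ptr<OpDescBind>> retv;
    auto* grad = new OpDescBind();
    grad->SetType("scale");                        // dX = scale * dOut, so reuse `scale`
    grad->SetInput("X", this->OutputGrad("Out"));  // feed dOut in as X
    grad->SetOutput("Out", this->InputGrad("X"));  // write the result to dX
    grad->SetAttrMap(this->Attrs());
    retv.emplace_back(grad);
    return retv;
  }
};
```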
We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
```cpp
using GradOpMaker = ...;
std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
func = [] (const OpDescBind& fwd_op) {
GradOpMaker maker(fwd_op);
return maker();
};
```
Since `GradOpDescMakerBase` is now a class, we can write many helper functions. The basic helpers retrieve the `Input`, `Output`, `InputGradient` and `OutputGradient` variables of the forward operator.
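A rough sketch of such helpers, mirroring the `grad_op_desc_maker.h` changes below (`GradVarName` maps a variable name to its gradient name):
```cpp
// Sketch: translate forward variable names into the names of their gradients.
static std::vector<std::string> ToGradNames(
    const std::vector<std::string>& var_names) {
  std::vector<std::string> ret_val;
  ret_val.reserve(var_names.size());
  for (const auto& name : var_names) {
    ret_val.push_back(GradVarName(name));  // e.g. "X" -> "X@GRAD"
  }
  return ret_val;
}

std::vector<std::string> InputGrad(const std::string& name) const {
  return ToGradNames(fwd_op_.Input(name));
}

std::vector<std::string> OutputGrad(const std::string& name) const {
  return ToGradNames(fwd_op_.Output(name));
}
```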
We should change the registration macros at the same time. In the current solution there is no difference between forward and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` takes an `OpProtoAndCheckerMaker` and a `GradOpDescMaker`, we simply list them in the same macro; this can be done with a macro that uses `__VA_ARGS__`.
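A minimal sketch of that idea; `OperatorRegistrar` and the exact macro shape are assumptions here, since the real macro is not part of this diff:
```cpp
// Illustrative only: all trailing class names (ProtoMaker, GradOpDescMaker, ...)
// are forwarded to a variadic registrar, so a single macro covers every combination.
#define REGISTER_OPERATOR(op_type, op_class, ...)                         \
  static ::paddle::framework::OperatorRegistrar<op_class, ##__VA_ARGS__> \
      __op_registrar_##op_type##__(#op_type)

// Hypothetical usage:
//   REGISTER_OPERATOR(scale, ScaleOp, ScaleOpMaker, ScaleGradOpDescMaker);
```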
The user interface should be
......
......@@ -168,9 +168,27 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
net->ops_[op_offset]->Rename(name, dup_outputs.back());
}
// collect all the offsets to append a `sum` op for each alias
insert_position.push_back(
{dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
{{"Out", {name}}}, {})});
//
// one variable may be shared by multiple operators;
// insert `sum` operators one by one, then add the result to the output
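// e.g. duplicates {d0, d1, d2} are folded pairwise through intermediate
// `name@SHARED@i` variables, and the final sum is written directly to `name`.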
for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
++output_idx) {
// pair the next duplicate with the running partial sum
// (insert_add_y is rewritten below when output_idx != 0)
auto insert_add_x = dup_outputs[output_idx + 1];
auto insert_add_y = dup_outputs[output_idx];
auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
// first add op inserted
if (output_idx == dup_outputs.size() - 2) {
insert_add_out = name;
}
if (output_idx != 0) {
insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
}
insert_position.push_back(
{dup_op.back(),
OpRegistry::CreateOp(
"sum", {{"X", {insert_add_x}}, {"X", {insert_add_y}}},
{{"Out", {insert_add_out}}}, {})});
}
}
// make sure the inserted `sum` ops follow the BFS order.
......
......@@ -143,15 +143,18 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
}
};
class AddOpMaker : public OpProtoAndCheckerMaker {
class SumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "x").AsDuplicable();
AddOutput("Out", "out");
AddInput("X", "the input tensors of sum operator.")
.AsDuplicable()
.NotInGradient();
AddOutput("Out", "the output tensor of sum operator.").NotInGradient();
AddComment("");
}
};
} // namespace framework
} // namespace paddle
......@@ -165,7 +168,7 @@ REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
REGISTER_OP(sum, f::NOP, f::SumOpMaker, sum_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
f::NOP);
......@@ -294,7 +297,7 @@ TEST(Backward, net_shared_weight) {
ASSERT_TRUE(bwd->IsNetOp());
auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
ASSERT_EQ(3UL, bwd_net->ops_.size());
ASSERT_EQ("add", bwd_net->ops_[2]->Type());
ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
}
TEST(Backward, op_all_input_are_not_need) {
......
......@@ -24,7 +24,7 @@ class GradOpDescMakerBase {
explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {}
virtual ~GradOpDescMakerBase() = default;
virtual std::vector<OpDescBind> operator()() const = 0;
virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
protected:
static std::vector<std::string> ToGradNames(
......@@ -44,12 +44,12 @@ class GradOpDescMakerBase {
return ToGradNames(fwd_op_.Output(name));
}
std::vector<std::string> InputParamNames() const {
return this->fwd_op_.InputParamNames();
std::vector<std::string> InputNames() const {
return this->fwd_op_.InputNames();
}
std::vector<std::string> OutputParamNames() const {
return this->fwd_op_.OutputParamNames();
std::vector<std::string> OutputNames() const {
return this->fwd_op_.OutputNames();
}
std::vector<std::string> Input(const std::string& name) const {
......@@ -80,10 +80,15 @@ class GradOpDescMakerBase {
class SingleGradOpDescMaker : public GradOpDescMakerBase {
public:
using GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<OpDescBind> operator()() const { return {this->Apply()}; }
std::vector<std::unique_ptr<OpDescBind>> operator()() const {
std::vector<std::unique_ptr<OpDescBind>> retv;
retv.emplace_back(this->Apply());
return retv;
}
protected:
virtual OpDescBind Apply() const = 0;
virtual std::unique_ptr<OpDescBind> Apply() const = 0;
};
class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
......@@ -91,23 +96,23 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
using SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
virtual OpDescBind Apply() const {
OpDescBind grad;
grad.SetType(this->GradOpType());
virtual std::unique_ptr<OpDescBind> Apply() const {
auto* grad = new OpDescBind();
grad->SetType(this->GradOpType());
for (auto& input_param : this->InputParamNames()) {
grad.SetInput(input_param, this->Input(input_param));
grad.SetOutput(GradVarName(input_param), this->InputGrad(input_param));
for (auto& input_param : this->InputNames()) {
grad->SetInput(input_param, this->Input(input_param));
grad->SetOutput(GradVarName(input_param), this->InputGrad(input_param));
}
for (auto& output_param : this->OutputParamNames()) {
grad.SetInput(output_param, this->Output(output_param));
grad.SetInput(GradVarName(output_param), this->OutputGrad(output_param));
for (auto& output_param : this->OutputNames()) {
grad->SetInput(output_param, this->Output(output_param));
grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
}
grad.SetAttrMap(this->Attrs());
grad->SetAttrMap(this->Attrs());
return grad;
return std::unique_ptr<OpDescBind>(grad);
}
virtual std::string GradOpType() const {
......
......@@ -31,15 +31,6 @@ const std::vector<std::string> &OpDescBind::Input(
return it->second;
}
std::vector<std::string> OpDescBind::InputNames() const {
std::vector<std::string> retv;
retv.reserve(this->inputs_.size());
for (auto &ipt : this->inputs_) {
retv.push_back(ipt.first);
}
return retv;
}
void OpDescBind::SetInput(const std::string &param_name,
const std::vector<std::string> &args) {
need_update_ = true;
......@@ -54,15 +45,6 @@ const std::vector<std::string> &OpDescBind::Output(
return it->second;
}
std::vector<std::string> OpDescBind::OutputNames() const {
std::vector<std::string> retv;
retv.reserve(this->outputs_.size());
for (auto &ipt : this->outputs_) {
retv.push_back(ipt.first);
}
return retv;
}
void OpDescBind::SetOutput(const std::string &param_name,
const std::vector<std::string> &args) {
need_update_ = true;
......
......@@ -35,15 +35,11 @@ class OpDescBind {
const std::vector<std::string> &Input(const std::string &name) const;
std::vector<std::string> InputNames() const;
void SetInput(const std::string &param_name,
const std::vector<std::string> &args);
const std::vector<std::string> &Output(const std::string &name) const;
std::vector<std::string> OutputNames() const;
void SetOutput(const std::string &param_name,
const std::vector<std::string> &args);
......@@ -71,10 +67,8 @@ class OpDescBind {
// Only be used in C++
void SetAttrMap(const AttributeMap &attr_map);
std::vector<std::string> InputParamNames() const { return MapKeys(inputs_); }
std::vector<std::string> OutputParamNames() const {
return MapKeys(outputs_);
}
std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
void SetInputMap(const VariableNameMap &input) {
this->inputs_ = input;
......
......@@ -31,7 +31,7 @@ namespace framework {
struct OpInfo {
OpCreator creator_;
std::string grad_op_type_;
std::function<std::vector<OpDescBind>(const OpDescBind&)> grad_op_maker_;
GradOpMakerFN grad_op_maker_;
OpProto* proto_{nullptr};
OpAttrChecker* checker_{nullptr};
......
......@@ -20,6 +20,7 @@
namespace paddle {
namespace framework {
class OperatorBase;
class OpDescBind;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
// The order should be as same as framework.proto
......@@ -34,5 +35,8 @@ using OpCreator = std::function<OperatorBase*(
const std::string& /*type*/, const VariableNameMap& /*inputs*/,
const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
using GradOpMakerFN =
std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>;
} // namespace framework
} // namespace paddle
......@@ -126,8 +126,7 @@ void CondOp::PrepareDataForSubnet(
dim[0] = index_tensors[i].dims()[0];
tensor_child->mutable_data<float>(dim, platform::CPUPlace());
Gather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
tensor_child);
CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
}
}
......@@ -188,7 +187,7 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
Variable* var_child = sub_scopes[i]->FindVar(output);
PADDLE_ENFORCE_NOT_NULL(var_child);
auto* tensor_child = &var_child->Get<LoDTensor>();
ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
tensor_parent);
}
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using platform::Place;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
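// Grid-stride loop: each CUDA thread handles indices i, i + blockDim.x * gridDim.x, ...,
// so all n elements are covered regardless of the launch configuration.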
template <typename T>
__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
size_t index_size, size_t slice_size) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
int gather_i = indices[indices_i];
int params_i = gather_i * slice_size + slice_i;
*(output + i) = *(params + params_i);
}
}
/**
* A thin wrapper on gpu tensor
* Return a new tensor from source tensor, gathered according to index
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T>
void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) {
// PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index.dims()[0];
auto src_dims = src.dims();
framework::DDim output_dims(src_dims);
output_dims[0] = index_size;
// slice size
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const T* p_src = src.data<T>();
const int* p_index = index.data<int>();
T* p_output = output->data<T>();
int block = 512;
int n = slice_size * index_size;
int grid = (n + block - 1) / block;
GatherCUDAKernel<T><<<
grid, block, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
p_src, p_index, p_output, index_size, slice_size);
}
} // namespace operators
} // namespace paddle
......@@ -24,49 +24,40 @@ limitations under the License. */
namespace paddle {
namespace operators {
// Implementation of CPU copy
template <typename T>
void CPUGather(const T* src, const int* indices, const int slice_size,
const int index_size, T* output) {
const size_t slice_bytes = slice_size * sizeof(T);
for (int i = 0; i < index_size; ++i) {
int index_ = indices[i];
memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
}
}
// Implementation of GPU copy:
template <typename T>
void GPUGather(const T* src, const int* index, const int slice_size,
const int index_size, T* output);
using framework::Tensor;
/**
* A thin wrapper for gathering on cpu tensor
* Return a new tensor from source tensor, gathered according to index
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T>
void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
const paddle::framework::Tensor* index,
paddle::framework::Tensor* output) {
void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
// check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
int index_size = index->dims()[0];
PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index.dims()[0];
auto src_dims = src->dims();
auto src_dims = src.dims();
framework::DDim output_dims(src_dims);
output_dims[0] = index_size;
const T* p_src = src.data<T>();
const int* p_index = index.data<int>();
T* p_output = output->data<T>();
// slice size
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
// Gathering
if (platform::is_cpu_place(place)) {
CPUGather<T>(src->data<T>(), index->data<int>(), slice_size, index_size,
output->data<T>());
const size_t slice_bytes = slice_size * sizeof(T);
for (int i = 0; i < index_size; ++i) {
int index_ = p_index[i];
memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
}
}
......
......@@ -31,6 +31,8 @@ class GatherOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of GatherOp should not be null.");
auto index_dims = ctx->GetInputDim("Index");
PADDLE_ENFORCE(index_dims.size() == 1);
int batch_size = ctx->GetInputDim("Index")[0];
PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
framework::DDim output_dims(ctx->GetInputDim("X"));
......@@ -79,8 +81,5 @@ Out = X[Index]
namespace ops = paddle::operators;
REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
ops::GatherGradOp);
REGISTER_OP_CPU_KERNEL(gather,
ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
gather_grad,
ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gather.cu.h"
#include "paddle/framework/eigen.h"
#include "paddle/operators/gather_op.h"
#include "scatter.cu.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *x = ctx.Input<Tensor>("X");
auto *index = ctx.Input<Tensor>("Index");
auto *output = ctx.Output<Tensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
GPUGather<T>(ctx.device_context(), *x, *index, output);
}
};
template <typename T>
class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *Index = ctx.Input<Tensor>("Index");
auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *x = ctx.Input<Tensor>("X");
dX->mutable_data<T>(ctx.GetPlace());
auto dxt = framework::EigenVector<T>::Flatten(*dX);
auto place = ctx.GetEigenDevice<platform::GPUPlace>();
dxt.device(place) = dxt.constant(static_cast<T>(0));
GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
......@@ -23,29 +23,40 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
template <typename T>
class GatherOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *X = ctx.Input<Tensor>("X");
auto *Index = ctx.Input<Tensor>("Index");
auto *Y = ctx.Output<Tensor>("Out");
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *x = ctx.Input<Tensor>("X");
auto *index = ctx.Input<Tensor>("Index");
auto *output = ctx.Output<Tensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
Y->mutable_data<T>(ctx.GetPlace());
Gather<T>(ctx.GetPlace(), X, Index, Y);
CPUGather<T>(ctx.device_context(), *x, *index, output);
}
};
template <typename Place, typename T>
template <typename T>
class GatherGradientOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *Index = ctx.Input<Tensor>("Index");
auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
dX->mutable_data<T>(ctx.GetPlace());
ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
auto dxt = framework::EigenVector<T>::Flatten(*dX);
auto place = ctx.GetEigenDevice<platform::CPUPlace>();
dxt.device(place) = dxt.constant(static_cast<T>(0));
ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
}
};
......
......@@ -41,7 +41,9 @@ TEST(Gather, GatherData) {
int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
Gather<int>(CPUPlace(), src, index, output);
auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place);
CPUGather<int>(ctx, *src, *index, output);
for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void ScatterCUDAKernel(const T* params, const int* indices,
T* output, size_t index_size,
size_t slice_size) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
int scatter_i = indices[indices_i];
int out_i = scatter_i * slice_size + slice_i;
*(output + out_i) = *(params + i);
}
}
/**
* A thin wrapper on gpu tensor
* Return a new updated tensor from source tensor, scatter-assigned according to
* index
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T>
void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) {
// PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index.dims()[0];
auto src_dims = src.dims();
framework::DDim output_dims(src_dims);
output_dims[0] = index_size;
// slice size
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const T* p_src = src.data<T>();
const int* p_index = index.data<int>();
T* p_output = output->data<T>();
int block = 512;
int n = slice_size * index_size;
int grid = (n + block - 1) / block;
ScatterCUDAKernel<T><<<
grid, block, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
p_src, p_index, p_output, index_size, slice_size);
}
} // namespace operators
} // namespace paddle
......@@ -24,63 +24,42 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
// Implementation of CPU copy
template <typename T>
void CPUScatterUpdate(const paddle::framework::Tensor* src, const int* index,
const size_t index_size,
paddle::framework::Tensor* output) {
paddle::framework::DDim output_dims = output->dims();
for (size_t i = 0; i < index_size; ++i) {
int index_ = index[i];
paddle::framework::Tensor src_ = *src;
paddle::framework::Tensor output_ = *output;
if (index_size > 1) src_ = src->Slice<T>(i, i + 1);
if (output_dims[0] > 1) output_ = output->Slice<T>(index_, index_ + 1);
auto X = EigenVector<T>::Flatten(src_);
auto Y = EigenVector<T>::Flatten(output_);
Y = X + Y;
}
}
// Implementation of GPU scatter:
template <typename T>
void GPUScatterUpdate(const T* src, const int* index, const int slice_size,
const int index_size, T* output);
/**
* Return an updated tensor from the source tensor, scattered according to index:
* dst[i] += src[index[i]]
* dst[index[i]] = src[i]
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T>
void ScatterUpdate(const platform::Place& place,
const paddle::framework::Tensor* src,
const paddle::framework::Tensor* index,
paddle::framework::Tensor* output) {
void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
// check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1);
int index_size = index->dims()[0];
PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index.dims()[0];
auto src_dims = src->dims();
auto src_dims = src.dims();
auto dst_dims = output->dims();
const T* p_src = src.data<T>();
const int* p_index = index.data<int>();
T* p_output = output->data<T>();
// check src shape and dst shape should match
for (int i = 1; i < src_dims.size(); i++)
PADDLE_ENFORCE(src_dims[i] == dst_dims[i]);
if (platform::is_cpu_place(place)) {
CPUScatterUpdate<T>(src, index->data<int>(), index_size, output);
} else {
// slice size
size_t slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const size_t slice_bytes = slice_size * sizeof(T);
for (int i = 0; i < index_size; ++i) {
int index_ = p_index[i];
memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
}
}
......
......@@ -97,8 +97,5 @@ Out[Index] = Ref[Index] + Updates
namespace ops = paddle::operators;
REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
ops::ScatterGradOp);
REGISTER_OP_CPU_KERNEL(scatter,
ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
scatter_grad,
ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gather.cu.h"
#include "paddle/operators/gather_op.h"
#include "scatter.cu.h"
namespace paddle {
namespace operators {
template <typename T>
class ScatterOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *Ref = ctx.Input<Tensor>("Ref");
auto *Index = ctx.Input<Tensor>("Index");
auto *Updates = ctx.Input<Tensor>("Updates");
auto *Out = ctx.Output<Tensor>("Out");
Out->ShareDataWith<T>(*Ref);
GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
}
};
template <typename T>
class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Index = ctx.Input<Tensor>("Index");
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith<T>(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Index]
GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
......@@ -23,10 +23,12 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
template <typename T>
class ScatterOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *Ref = ctx.Input<Tensor>("Ref");
auto *Index = ctx.Input<Tensor>("Index");
auto *Updates = ctx.Input<Tensor>("Updates");
......@@ -35,14 +37,16 @@ class ScatterOpKernel : public framework::OpKernel<T> {
// In place output: Out = Ref, Out[Index] = Updates
Out->ShareDataWith<T>(*Ref);
// Apply ScatterAssign: Out[Index] = Updates[:]
ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out);
ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
}
};
template <typename Place, typename T>
template <typename T>
class ScatterGradientOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Index = ctx.Input<Tensor>("Index");
......@@ -52,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
dRef->ShareDataWith<T>(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Index]
Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
}
};
......
......@@ -40,7 +40,9 @@ TEST(scatter, ScatterUpdate) {
float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
ScatterUpdate<float>(CPUPlace(), src, index, output);
auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place);
ScatterAssign<float>(ctx, *src, *index, output);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
......
......@@ -27,6 +27,8 @@ class SGDOp : public framework::OperatorWithKernel {
"Input(param) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("grad"),
"Input(grad) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("learning_rate"),
"Input(learning_rate) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("param_out"),
"Output(param_out) of SGDOp should not be null.");
......@@ -42,9 +44,9 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("param", "input parameter");
AddInput("learning_rate", "learning rate of sgd");
AddInput("grad", "input gradient");
AddOutput("param_out", "output parameter");
AddAttr<float>("learning_rate", "learning rate of sgd");
AddComment(R"DOC(
Simplest sgd algorithm.
......
......@@ -31,7 +31,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto param = ctx.Input<Tensor>("param");
auto grad = ctx.Input<Tensor>("grad");
auto param_out = ctx.Output<Tensor>("param_out");
float lr = ctx.Attr<float>("learning_rate");
float lr = *ctx.Input<float>("learning_rate");
param_out->mutable_data<T>(ctx.GetPlace());
......
......@@ -13,4 +13,4 @@ limitations under the License. */
#include "paddle/operators/sum_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
......@@ -143,6 +143,13 @@ All parameter, weight, gradient are variables in Paddle.
.def("set_int",
[](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
.def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
.def("is_float", [](const Variable &var) { return var.IsType<float>(); })
.def("set_float",
[](Variable &var, float val) -> void {
*var.GetMutable<float>() = val;
})
.def("get_float",
[](const Variable &var) -> float { return var.Get<float>(); })
.def("get_tensor",
[](Variable &self) -> LoDTensor * {
return self.GetMutable<LoDTensor>();
......
......@@ -46,12 +46,17 @@ def create_op(scope, op_type, inputs, outputs, attrs):
def set_input(scope, op, inputs, place):
def __set_input__(var_name, var):
tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple):
tensor.set_lod(var[1])
var = var[0]
tensor.set_dims(var.shape)
tensor.set(var, place)
if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple):
tensor.set_lod(var[1])
var = var[0]
tensor.set_dims(var.shape)
tensor.set(var, place)
elif isinstance(var, float):
scope.find_var(var_name).set_float(var)
elif isinstance(var, int):
scope.find_var(var_name).set_int(var)
for in_name, in_dup in Operator.get_op_inputs(op.type()):
if in_name in inputs:
......
......@@ -10,7 +10,7 @@ class TestScatterOp(OpTest):
index_np = np.array([1, 2]).astype("int32")
updates_np = np.random.random((2, 3)).astype("float32")
output_np = np.copy(ref_np)
output_np[index_np] += updates_np
output_np[index_np] = updates_np
self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
self.outputs = {'Out': output_np}
......@@ -18,7 +18,7 @@ class TestScatterOp(OpTest):
self.check_output()
def test_check_grad(self):
self.check_grad(['Updates', 'Ref'], 'Out', in_place=True)
self.check_grad(['Updates'], 'Out', in_place=True)
if __name__ == "__main__":
......
......@@ -10,8 +10,7 @@ class TestSGDOp(OpTest):
g = np.random.random((102, 105)).astype("float32")
lr = 0.1
self.inputs = {'param': w, 'grad': g}
self.attrs = {'learning_rate': lr}
self.inputs = {'param': w, 'grad': g, 'learning_rate': lr}
self.outputs = {'param_out': w - lr * g}
def test_check_output(self):
......