Commit 2cde56c5 authored by wanghaoshuang

Use Transform instead of eigen

Parent 743dfd82
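This commit swaps the Eigen-expression implementation (which needed one instantiation per tensor rank) and the hand-written CUDA gradient kernel for paddle::platform::Transform applied to small HOSTDEVICE functors, so a single ClipKernel/ClipGradKernel serves both CPU and GPU. The sketch below mimics the forward pattern on the host, with std::transform standing in for platform::Transform; the hard-coded clip range and the main driver are illustrative only and not part of the commit.

// Minimal host-only sketch of the functor-plus-transform pattern adopted here.
// std::transform plays the role of platform::Transform on the CPU; the real
// operator reads "min"/"max" from op attributes instead of hard-coding them.
#include <algorithm>
#include <iostream>
#include <vector>

template <typename T>
struct ClipFunctor {
  ClipFunctor(T min, T max) : min_(min), max_(max) {}
  // Clamp a single element into [min_, max_].
  T operator()(const T& x) const {
    if (x < min_) return min_;
    if (x > max_) return max_;
    return x;
  }
  T min_, max_;
};

int main() {
  std::vector<float> x = {-2.f, -0.5f, 0.f, 0.5f, 2.f};
  std::vector<float> out(x.size());
  std::transform(x.begin(), x.end(), out.begin(),
                 ClipFunctor<float>(-1.f, 1.f));
  for (float v : out) std::cout << v << ' ';  // prints: -1 -0.5 0 0.5 1
  std::cout << '\n';
  return 0;
}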
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -80,6 +80,5 @@ class ClipOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
             ops::ClipOpGrad);
-REGISTER_OP_CPU_KERNEL(clip,
-                       ops::ClipKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(clip, ops::ClipKernel<float>);
 REGISTER_OP_CPU_KERNEL(clip_grad, ops::ClipGradKernel<float>);
--- a/paddle/operators/clip_op.cu
+++ b/paddle/operators/clip_op.cu
@@ -14,60 +14,6 @@
 #include "paddle/operators/clip_op.h"

-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-       i += blockDim.x * gridDim.x)
-
-namespace paddle {
-namespace operators {
-
-using framework::LoDTensor;
-
-template <typename T>
-__global__ void ClipGradientKernel(const int N, const T min, const T max,
-                                   const T* Y, const T* dY, T* dX) {
-  CUDA_1D_KERNEL_LOOP(i, N) {
-    if (Y[i] > min && Y[i] < max) {
-      dX[i] = dY[i];
-    } else {
-      dX[i] = 0;
-    }
-  }
-}
-
-template <typename T>
-class ClipGradientOpCUDAKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto max = context.Attr<float>("max");
-    auto min = context.Attr<float>("min");
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    if (d_x != nullptr) {
-      auto* x = context.Input<LoDTensor>("X");
-      auto dims = d_x->dims();
-      int64_t count = d_out->numel();
-      auto d_x_data = d_x->mutable_data<T>(context.GetPlace());
-      auto d_out_data = d_out->data<T>();
-      auto x_data = x->data<T>();
-
-      int N = d_x->dims()[0];
-      int D = d_x->dims()[1];
-      int block = 512;
-      int grid = (N * D + block - 1) / block;
-
-      ClipGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              context.device_context())
-                              .stream()>>>(count, min, max, x_data, d_out_data,
-                                           d_x_data);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(clip,
-                       ops::ClipKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(clip_grad, ops::ClipGradientOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(clip, ops::ClipKernel<float>);
+REGISTER_OP_GPU_KERNEL(clip_grad, ops::ClipGradKernel<float>);
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
@@ -16,57 +16,61 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"

 namespace paddle {
 namespace operators {

-using framework::LoDTensor;
+using framework::Tensor;
+using platform::Transform;

-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T>
+class ClipFunctor {
+ public:
+  explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x) const {
+    if (x < min_)
+      return min_;
+    else if (x > max_)
+      return max_;
+    else
+      return x;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
+template <typename T>
+class ClipGradFunctor {
+ public:
+  explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x, const T& y) const {
+    if (y > min_ && y < max_)
+      return x;
+    else
+      return 0;
+  }
+
+ private:
+  T min_;
+  T max_;
+};

-template <typename Place, typename T, size_t D>
-void ClipFunction(const framework::ExecutionContext& context) {
-  auto max = context.op().Attr<float>("max");
-  auto min = context.op().Attr<float>("min");
-  auto* x = context.Input<LoDTensor>("X");
-  auto* out = context.Output<LoDTensor>("Out");
-  out->mutable_data<T>(context.GetPlace());
-  auto x_tensor = EigenTensor<T, D>::From(*x);
-  auto out_tensor = EigenTensor<T, D>::From(*out);
-  auto place = context.GetEigenDevice<Place>();
-  out_tensor.device(place) = x_tensor.cwiseMin(max).cwiseMax(min);
-}
-
-template <typename Place, typename T>
+template <typename T>
 class ClipKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<LoDTensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        ClipFunction<Place, T, 1>(context);
-        break;
-      case 2:
-        ClipFunction<Place, T, 2>(context);
-        break;
-      case 3:
-        ClipFunction<Place, T, 3>(context);
-        break;
-      case 4:
-        ClipFunction<Place, T, 4>(context);
-        break;
-      case 5:
-        ClipFunction<Place, T, 5>(context);
-        break;
-      case 6:
-        ClipFunction<Place, T, 6>(context);
-        break;
-      default:
-        PADDLE_THROW(
-            "PadOp only support tensors with no more than 6 dimensions.");
-    }
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int numel = x->numel();
+    Transform(context.device_context(), x_data, x_data + numel, out_data,
+              ClipFunctor<T>(min, max));
   }
 };

@@ -74,24 +78,18 @@ template <typename T>
 class ClipGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto max = context.op().Attr<float>("max");
-    auto min = context.op().Attr<float>("min");
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
     if (d_x != nullptr) {
-      auto* x = context.Input<LoDTensor>("X");
-      auto dims = d_x->dims();
-      int64_t count = d_out->numel();
+      auto* x = context.Input<Tensor>("X");
+      int64_t numel = d_out->numel();
       auto d_x_data = d_x->mutable_data<T>(context.GetPlace());
-      auto d_out_data = d_out->data<T>();
-      auto x_data = x->data<T>();
-      for (int i = 0; i < count; ++i) {
-        if (x_data[i] > min && x_data[i] < max) {
-          d_x_data[i] = d_out_data[i];
-        } else {
-          d_x_data[i] = 0;
-        }
-      }
+      const T* d_out_data = d_out->data<T>();
+      const T* x_data = x->data<T>();
+      Transform(context.device_context(), d_out_data, d_out_data + numel,
+                x_data, d_x_data, ClipGradFunctor<T>(min, max));
     }
   }
 };
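The gradient path uses the binary (two-input) form of Transform shown above: it pairs each element of dOut with the matching element of X and keeps the gradient only where X lies strictly inside (min, max). Below is a host-only sketch of that binary pattern, again with std::transform standing in for platform::Transform and with illustrative values that are not part of the commit.

// Host-only sketch of the binary transform used by ClipGradKernel.
// The real kernel reads "min"/"max" from op attributes and runs on the
// device selected by the execution context.
#include <algorithm>
#include <iostream>
#include <vector>

template <typename T>
struct ClipGradFunctor {
  ClipGradFunctor(T min, T max) : min_(min), max_(max) {}
  // x is the upstream gradient dOut[i]; y is the forward input X[i].
  T operator()(const T& x, const T& y) const {
    return (y > min_ && y < max_) ? x : static_cast<T>(0);
  }
  T min_, max_;
};

int main() {
  std::vector<float> d_out = {1.f, 1.f, 1.f, 1.f};
  std::vector<float> x = {-2.f, -0.5f, 0.5f, 2.f};
  std::vector<float> d_x(x.size());
  std::transform(d_out.begin(), d_out.end(), x.begin(), d_x.begin(),
                 ClipGradFunctor<float>(-1.f, 1.f));
  for (float v : d_x) std::cout << v << ' ';  // prints: 0 1 1 0
  std::cout << '\n';
  return 0;
}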