Unverified commit ca725c82, authored by Zhang Ting, committed by GitHub

improve fp16 performance of slice_grad, test=develop (#25523)

Parent 5d3766ff
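The gradient of slice is a zero-padding scatter: each element of the sliced output's gradient `d_out` is written back to its original (offset) position in the input gradient `d_in`, and every position outside the slice window stays zero. This commit deletes the hand-written float16 CUDA kernel for that scatter (first file below) and lets float16 share the generic Eigen-based kernel in slice_op.h, whose zero constants are fixed to be constructed in the element type `T` (second file below). As a reference for what both code paths compute, here is a self-contained 2-D sketch in plain C++; it is illustrative only, not Paddle code:

```cpp
#include <cstdint>
#include <vector>

// Reference slice_grad for a 2-D tensor: zero-fill d_in, then copy each
// d_out element to its offset position. Real kernels fuse this into one pass.
template <typename T>
std::vector<T> SliceGrad2D(const std::vector<T>& d_out, int64_t out_h,
                           int64_t out_w, int64_t in_h, int64_t in_w,
                           int64_t off_h, int64_t off_w) {
  std::vector<T> d_in(in_h * in_w, T(0));  // gradient is zero outside the slice
  for (int64_t i = 0; i < out_h; ++i)
    for (int64_t j = 0; j < out_w; ++j)
      d_in[(i + off_h) * in_w + (j + off_w)] = d_out[i * out_w + j];
  return d_in;
}
```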
@@ -12,145 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <thrust/device_vector.h>
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/slice_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-template <size_t D>
-__global__ void Padding(const paddle::platform::float16* d_out,
-                        const int64_t* out_dims, const int64_t* in_dims,
-                        const int64_t* offsets, int64_t n,
-                        paddle::platform::float16* d_in) {
-  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (out_idx < n) {
-    int64_t out_idx_tmp = out_idx;
-    int64_t coords[D] = {0};
-    for (int i = D - 1; i >= 0; --i) {
-      coords[i] = out_idx_tmp % out_dims[i];
-      out_idx_tmp /= out_dims[i];
-      coords[i] += offsets[i];
-    }
-    int64_t in_idx = 0;
-    for (int i = 0; i < D; ++i) {
-      in_idx = in_idx * in_dims[i] + coords[i];
-    }
-    d_in[in_idx] = d_out[out_idx];
-  }
-}
-template <>
-class SliceGradKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>
-    : public framework::OpKernel<paddle::platform::float16> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
-    d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
-    auto out_dims = d_out->dims();
-    auto in_dims = d_in->dims();
-    int rank = out_dims.size();
-    std::vector<int64_t> offsets(rank, 0);
-    auto axes = ctx.Attr<std::vector<int>>("axes");
-    auto starts_int = ctx.Attr<std::vector<int>>("starts");
-    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
-    auto list_new_starts_tensor =
-        ctx.MultiInput<framework::Tensor>("StartsTensorList");
-    if (list_new_starts_tensor.size() > 0) {
-      starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
-    } else if (ctx.HasInput("StartsTensor")) {
-      auto* starts_tensor = ctx.Input<framework::Tensor>("StartsTensor");
-      starts = GetDataFromTensor<int64_t>(starts_tensor);
-    }
-    for (size_t i = 0; i < starts.size(); ++i) {
-      if (starts[i] < 0) {
-        starts[i] += in_dims[axes[i]];
-      }
-      offsets[axes[i]] = std::max(starts[i], static_cast<int64_t>(0));
-    }
-    math::SetConstant<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>
-        set_zero;
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::CUDADeviceContext>();
-    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
-    int64_t numel = d_out->numel();
-    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
-    dim3 threads(PADDLE_CUDA_NUM_THREADS);
-    auto stream = ctx.cuda_device_context().stream();
-    const std::vector<int64_t> out_shape =
-        framework::vectorize<int64_t>(out_dims);
-    const std::vector<int64_t> in_shape =
-        framework::vectorize<int64_t>(in_dims);
-    framework::Tensor out_dims_tensor;
-    framework::Tensor in_dims_tensor;
-    framework::Tensor offsets_tensor;
-    framework::TensorFromVector(out_shape, ctx.device_context(),
-                                &out_dims_tensor);
-    framework::TensorFromVector(in_shape, ctx.device_context(),
-                                &in_dims_tensor);
-    framework::TensorFromVector(offsets, ctx.device_context(), &offsets_tensor);
-    const int64_t* out_dims_ptr = out_dims_tensor.data<int64_t>();
-    const int64_t* in_dims_ptr = in_dims_tensor.data<int64_t>();
-    const int64_t* offsets_ptr = offsets_tensor.data<int64_t>();
-    switch (rank) {
-      case 1:
-        Padding<1><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 2:
-        Padding<2><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 3:
-        Padding<3><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 4:
-        Padding<4><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 5:
-        Padding<5><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 6:
-        Padding<6><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
...
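The deleted `Padding<D>` kernel above maps each thread's linear index in the smaller output gradient to its location in the larger input gradient: it unravels the index into per-dimension coordinates against `out_dims`, shifts each coordinate by the slice offset, then re-linearizes against `in_dims` in row-major order. A host-side sketch of the same index arithmetic, with illustrative names rather than Paddle code:

```cpp
#include <cstdint>
#include <cstdio>

// Unravel a row-major linear index against out_dims, shift by offsets, and
// re-linearize against in_dims: the per-thread index map of the removed
// Padding<D> kernel.
template <int D>
int64_t MapOutToIn(int64_t out_idx, const int64_t (&out_dims)[D],
                   const int64_t (&in_dims)[D], const int64_t (&offsets)[D]) {
  int64_t coords[D];
  for (int i = D - 1; i >= 0; --i) {
    coords[i] = out_idx % out_dims[i] + offsets[i];
    out_idx /= out_dims[i];
  }
  int64_t in_idx = 0;
  for (int i = 0; i < D; ++i) in_idx = in_idx * in_dims[i] + coords[i];
  return in_idx;
}

int main() {
  // Slice a 4x6 input to a 2x3 output starting at (1, 2).
  const int64_t out_dims[2] = {2, 3}, in_dims[2] = {4, 6}, offsets[2] = {1, 2};
  // Output element (1, 1), linear index 4, maps to input (2, 3), index 15.
  printf("%lld\n", static_cast<long long>(
                       MapOutToIn<2>(4, out_dims, in_dims, offsets)));  // 15
  return 0;
}
```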
@@ -350,7 +350,7 @@ class SliceGradKernel : public framework::OpKernel<T> {
       platform::DeviceContextPool& pool =
           platform::DeviceContextPool::Instance();
       auto& dev_ctx = *pool.Get(context.GetPlace());
-      T value = 0.0;
+      T value = T(0);
       math::SetConstant<DeviceContext, T> functor;
       for (int i = 0; i < d_in_size; ++i) {
         auto dim = input_array->at(i).dims();
@@ -440,7 +440,7 @@ class SliceGradKernel : public framework::OpKernel<T> {
     auto d_out_t =
         framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
             *d_out, out_dims);
-    d_in_t.device(place) = d_out_t.pad(paddings, 0);
+    d_in_t.device(place) = d_out_t.pad(paddings, T(0));
   }
 };
 }  // namespace operators
...
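Both header-side fixes replace an untyped zero (`0.0`, `0`) with `T(0)`. Passing a `double` or `int` literal where a scalar of type `T` is expected relies on an implicit conversion that `platform::float16` may not provide, or that forces a per-element round-trip through wider types; constructing the zero in `T` up front keeps `SetConstant` and Eigen's `pad` in the element type, which is what lets float16 use this generic path. A self-contained illustration of the pattern, using a toy `half16` stand-in rather than Paddle's `float16`:

```cpp
#include <cstdint>
#include <vector>

// Toy half-precision stand-in: only the behavior relevant here, an explicit
// constructor from int and no implicit conversion from double.
struct half16 {
  uint16_t bits = 0;
  half16() = default;
  explicit half16(int v) : bits(v == 0 ? 0 : 0x3C00) {}  // toy: 0 or 1 only
};

template <typename T>
void ZeroFill(std::vector<T>* buf) {
  // T value = 0.0;  // would not compile for T = half16: no implicit
  //                 // conversion from double
  T value = T(0);    // the pattern the diff adopts: build the zero in T
  for (auto& x : *buf) x = value;
}

int main() {
  std::vector<half16> grad(8);
  ZeroFill(&grad);  // works because half16 can be constructed from int 0
  return 0;
}
```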