Unverified commit de003cee authored by Zhang Ting, committed by GitHub

[cherry-pick] improve performance of cast and tril op (#30498)

* add fp16 support for tril_triu op (#30186)

* add VecCastCUDAKernel (#30296)
Co-authored-by: furnace <34057289+windstamp@users.noreply.github.com>
Parent 2c3799d1
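For context, the gain comes from issuing fewer memory instructions: casting N elements with the scalar kernel costs N loads and N stores, while the 4-wide kernel below covers the same data with N/4 128-bit transactions in each direction (a rough model that assumes the kernel is memory-bound, which is typical for a pure cast).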
@@ -19,6 +19,43 @@ limitations under the License. */
namespace paddle {
namespace operators {
// Aligned vector type that enables vectorized load/store on CUDA.
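// e.g. AlignedVector<float, 4> occupies 16 bytes with 16-byte alignment,
// letting the compiler emit a single 128-bit load or store for all four
// elements instead of four scalar transactions.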
template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVector {
T val[Size];
};
template <typename T>
inline int VectorizedSize(const T* pointer) {
uint64_t address = reinterpret_cast<uint64_t>(pointer);
constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value; // NOLINT
if (address % vec4 == 0) {
return 4;
}
return 1;
}
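// cudaMalloc'd buffers are at least 256-byte aligned, so freshly allocated
// outputs take the 4-wide path; pointers offset into a buffer may fail the
// check and fall back to the scalar kernel.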
template <typename InT, typename OutT, int VecSize>
__global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
using LoadT = AlignedVector<InT, VecSize>;
using StoreT = AlignedVector<OutT, VecSize>;
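// Grid-stride loop over chunks of VecSize elements: one vectorized load,
// VecSize scalar conversions, then one vectorized store per iteration.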
for (int64_t i = idx * VecSize; i < N; i += blockDim.x * gridDim.x * VecSize) {
InT in_vec[VecSize];
LoadT* in_value = reinterpret_cast<LoadT*>(&in_vec);
*in_value = *reinterpret_cast<const LoadT*>(&in[i]);
OutT out_vec[VecSize];
#pragma unroll
for (int ii = 0; ii < VecSize; ii++) {
out_vec[ii] = static_cast<OutT>(in_vec[ii]);
}
*(reinterpret_cast<StoreT*>(&out[i])) =
*reinterpret_cast<StoreT*>(&out_vec[0]);
}
}
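// Scalar fallback: casts one element per grid-stride iteration.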
template <typename InT, typename OutT>
__global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
CUDA_KERNEL_LOOP(index, N) { out[index] = static_cast<OutT>(in[index]); }
@@ -40,8 +77,16 @@ struct CastOpFunctor<platform::CUDADeviceContext, InT> {
auto* out = out_->mutable_data<OutT>(ctx_.GetPlace());
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(ctx_, size);
CastCUDAKernel<InT, OutT><<<config.block_per_grid, config.thread_per_block,
0, ctx_.stream()>>>(in, size, out);
int vec_size = VectorizedSize<OutT>(out);
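// Vectorize only for a genuine type conversion whose length is a multiple
// of 4 (so no scalar tail is needed) and whose output pointer passed the
// alignment probe; everything else takes the scalar kernel.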
if (!std::is_same<InT, OutT>::value && vec_size == 4 && size % 4 == 0) {
VecCastCUDAKernel<InT, OutT, 4><<<
config.block_per_grid, config.thread_per_block, 0, ctx_.stream()>>>(
in, size, out);
} else {
CastCUDAKernel<InT, OutT><<<config.block_per_grid,
config.thread_per_block, 0, ctx_.stream()>>>(
in, size, out);
}
}
};
......
@@ -99,6 +99,7 @@ class TrilTriuGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
@@ -107,10 +108,13 @@ REGISTER_OP_CPU_KERNEL(
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL(
tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>);
@@ -15,16 +15,20 @@ limitations under the License. */
#include "paddle/fluid/operators/tril_triu_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
tril_triu,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
......
@@ -16,8 +16,10 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.tensor as tensor
from paddle.fluid.framework import Program, program_guard
class TrilTriuOpDefaultTest(OpTest):
@@ -68,6 +70,8 @@ def case_generator(op_type, Xshape, diagonal, expected):
class FailureCase(unittest.TestCase):
def test_failure(self):
paddle.enable_static()
data = fluid.data(shape=Xshape, dtype='float64', name=cls_name)
with self.assertRaisesRegexp(
eval(expected.split(':')[-1]), errmsg[expected]):
@@ -75,6 +79,8 @@ def case_generator(op_type, Xshape, diagonal, expected):
class SuccessCase(TrilTriuOpDefaultTest):
def initTestCase(self):
paddle.enable_static()
self.real_op_type = op_type
self.diagonal = diagonal
self.X = np.random.random(Xshape).astype("float64")
@@ -120,39 +126,58 @@ class TestTrilTriuOpAPI(unittest.TestCase):
"""
def test_api(self):
data = np.random.random([1, 9, 9, 4]).astype('float32')
x = fluid.data(shape=[1, 9, -1, 4], dtype='float32', name='x')
tril_out, triu_out = tensor.tril(x), tensor.triu(x)
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
tril_out, triu_out = exe.run(
fluid.default_main_program(),
feed={"x": data},
fetch_list=[tril_out, triu_out], )
self.assertTrue(np.allclose(tril_out, np.tril(data)))
self.assertTrue(np.allclose(triu_out, np.triu(data)))
paddle.enable_static()
dtypes = ['float16', 'float32']
for dtype in dtypes:
prog = Program()
startup_prog = Program()
with program_guard(prog, startup_prog):
data = np.random.random([1, 9, 9, 4]).astype(dtype)
x = fluid.data(shape=[1, 9, -1, 4], dtype=dtype, name='x')
tril_out, triu_out = tensor.tril(x), tensor.triu(x)
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
tril_out, triu_out = exe.run(
fluid.default_main_program(),
feed={"x": data},
fetch_list=[tril_out, triu_out], )
self.assertTrue(np.allclose(tril_out, np.tril(data)))
self.assertTrue(np.allclose(triu_out, np.triu(data)))
def test_api_with_dygraph(self):
with fluid.dygraph.guard():
data = np.random.random([1, 9, 9, 4]).astype('float32')
x = fluid.dygraph.to_variable(data)
tril_out, triu_out = tensor.tril(x).numpy(), tensor.triu(x).numpy()
self.assertTrue(np.allclose(tril_out, np.tril(data)))
self.assertTrue(np.allclose(triu_out, np.triu(data)))
paddle.disable_static()
dtypes = ['float16', 'float32']
for dtype in dtypes:
with fluid.dygraph.guard():
data = np.random.random([1, 9, 9, 4]).astype(dtype)
x = fluid.dygraph.to_variable(data)
tril_out, triu_out = tensor.tril(x).numpy(), tensor.triu(
x).numpy()
self.assertTrue(np.allclose(tril_out, np.tril(data)))
self.assertTrue(np.allclose(triu_out, np.triu(data)))
def test_fluid_api(self):
data = np.random.random([1, 9, 9, 4]).astype('float32')
x = fluid.data(shape=[1, 9, -1, 4], dtype='float32', name='x')
triu_out = fluid.layers.triu(x)
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
triu_out = exe.run(fluid.default_main_program(),
feed={"x": data},
fetch_list=[triu_out])
paddle.enable_static()
dtypes = ['float16', 'float32']
for dtype in dtypes:
prog = Program()
startup_prog = Program()
with program_guard(prog, startup_prog):
data = np.random.random([1, 9, 9, 4]).astype(dtype)
x = fluid.data(shape=[1, 9, -1, 4], dtype=dtype, name='x')
triu_out = fluid.layers.triu(x)
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
) else fluid.CPUPlace()
exe = fluid.Executor(place)
triu_out = exe.run(fluid.default_main_program(),
feed={"x": data},
fetch_list=[triu_out])
if __name__ == '__main__':
......
@@ -558,8 +558,8 @@ def _tril_triu_op(helper):
x = helper.kwargs.get('x', None)
assert x is not None, 'x cannot be None in {}'.format(op_type)
check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
op_type)
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], op_type)
if len(x.shape) < 2:
raise ValueError("x shape in {} must be at least 2-D".format(op_type))
diagonal = helper.kwargs.get('diagonal', 0)
......
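With the dtype whitelist extended, float16 now passes validation in _tril_triu_op. A minimal usage sketch of the new path (assuming a Paddle build at this revision; the fp16 kernels matter most on GPU):

import numpy as np
import paddle.fluid as fluid
import paddle.tensor as tensor

with fluid.dygraph.guard():
    data = np.random.random([4, 4]).astype('float16')
    x = fluid.dygraph.to_variable(data)
    # Before this change, check_variable_and_dtype rejected float16 here.
    print(tensor.tril(x).numpy())
    print(tensor.triu(x).numpy())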