Unverified commit 486d6572, authored by W Wilber, committed by GitHub

[Code Format] Update code format. (#3890)

Parent 45457074
......@@ -33,91 +33,91 @@ namespace cuda {
class AssignValueTest : public ::testing::Test {
protected:
AssignValueTest() : dtype(5), shape({1}) {
int num =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
fp32_values.resize(num);
int32_values.resize(num);
int64_values.resize(num);
bool_values.resize(num);
AssignValueTest() : dtype_(5), shape_({1}) {
int num = std::accumulate(
shape_.begin(), shape_.end(), 1, std::multiplies<int>());
fp32_values_.resize(num);
int32_values_.resize(num);
int64_values_.resize(num);
bool_values_.resize(num);
for (int i = 0; i < num; ++i) {
fp32_values[i] = i + 5;
int32_values[i] = i;
int64_values[i] = i;
bool_values[i] = i;
fp32_values_[i] = i + 5;
int32_values_[i] = i;
int64_values_[i] = i;
bool_values_[i] = i;
}
std::vector<int64_t> out_shape(shape.size(), 0);
for (size_t i = 0; i < shape.size(); ++i) out_shape[i] = shape[i];
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
std::vector<int64_t> out_shape(shape_.size(), 0);
for (size_t i = 0; i < shape_.size(); ++i) out_shape[i] = shape_[i];
out_ref_.Resize(lite::DDim(out_shape));
out_gpu_.Resize(out_ref_.dims());
out_cpu_.Resize(out_ref_.dims());
cpu_base(&Out_ref);
RunBaseLine(&out_ref_);
device_init();
InitParamAndContext();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.shape = shape;
param.dtype = dtype;
param.fp32_values = fp32_values;
param.int32_values = int32_values;
param.int64_values = int64_values;
param.bool_values = bool_values;
param.Out = &Out_gpu;
void InitParamAndContext() {
ctx_.reset(new KernelContext);
cudaStreamCreate(&stream_);
auto& context = ctx_->As<CUDAContext>();
context.SetExecStream(stream_);
param_.shape = shape_;
param_.dtype = dtype_;
param_.fp32_values = fp32_values_;
param_.int32_values = int32_values_;
param_.int64_values = int64_values_;
param_.bool_values = bool_values_;
param_.Out = &out_gpu_;
}
void float_data_init() {}
void InitFloatInput() {}
void half_data_init() {}
void InitHalfInput() {}
void cpu_base(lite::Tensor* Out) {
if (dtype == static_cast<int>(lite::core::FluidType::INT32)) {
for (size_t i = 0; i < int32_values.size(); ++i) {
Out->mutable_data<int>()[i] = int32_values[i];
void RunBaseLine(lite::Tensor* out) {
if (dtype_ == static_cast<int>(lite::core::FluidType::INT32)) {
for (size_t i = 0; i < int32_values_.size(); ++i) {
out->mutable_data<int>()[i] = int32_values_[i];
}
} else if (dtype == static_cast<int>(lite::core::FluidType::FP32)) {
for (size_t i = 0; i < fp32_values.size(); ++i) {
Out->mutable_data<float>()[i] = fp32_values[i];
} else if (dtype_ == static_cast<int>(lite::core::FluidType::FP32)) {
for (size_t i = 0; i < fp32_values_.size(); ++i) {
out->mutable_data<float>()[i] = fp32_values_[i];
}
} else if (dtype == static_cast<int>(lite::core::FluidType::INT64)) {
for (size_t i = 0; i < int64_values.size(); ++i) {
Out->mutable_data<int64_t>()[i] = int64_values[i];
} else if (dtype_ == static_cast<int>(lite::core::FluidType::INT64)) {
for (size_t i = 0; i < int64_values_.size(); ++i) {
out->mutable_data<int64_t>()[i] = int64_values_[i];
}
} else if (dtype == static_cast<bool>(lite::core::FluidType::BOOL)) {
for (size_t i = 0; i < bool_values.size(); ++i) {
Out->mutable_data<bool>()[i] = bool_values[i];
} else if (dtype_ == static_cast<bool>(lite::core::FluidType::BOOL)) {
for (size_t i = 0; i < bool_values_.size(); ++i) {
out->mutable_data<bool>()[i] = bool_values_[i];
}
} else {
LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype;
LOG(FATAL) << "Unsupported dtype_ for assign_value_op:" << dtype_;
}
}
int dtype;
std::vector<int> shape;
std::vector<float> fp32_values;
std::vector<int> int32_values;
std::vector<int64_t> int64_values;
std::vector<int> bool_values;
int dtype_;
std::vector<int> shape_;
std::vector<float> fp32_values_;
std::vector<int> int32_values_;
std::vector<int64_t> int64_values_;
std::vector<int> bool_values_;
lite::Tensor Out_ref;
lite::Tensor Out_gpu;
lite::Tensor Out_cpu;
lite::Tensor out_ref_;
lite::Tensor out_gpu_;
lite::Tensor out_cpu_;
operators::AssignValueParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
operators::AssignValueParam param_;
std::unique_ptr<KernelContext> ctx_;
cudaStream_t stream_;
};
TEST_F(AssignValueTest, fp32) {
float_data_init();
InitFloatInput();
AssignValueCompute kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
kernel.SetParam(param_);
kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
......@@ -135,12 +135,12 @@ TEST_F(AssignValueTest, fp32) {
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
out_gpu_.data<float>(),
sizeof(float) * out_gpu_.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
for (int i = 0; i < out_gpu_.numel(); ++i) {
EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
}
}
......
......@@ -33,7 +33,7 @@ struct FcTypeTraits<float> {
};
template <typename T>
__global__ void bias_v4(const int num, const T* bias, T* data, int K) {
__global__ void AddBiasV4(const int num, const T* bias, T* data, int K) {
CUDA_KERNEL_LOOP(index, num) {
int bias_idx = index % K;
const T bias_ptr = bias[bias_idx];
......@@ -48,7 +48,7 @@ __global__ void bias_v4(const int num, const T* bias, T* data, int K) {
}
template <typename T>
__global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
__global__ void AddBiasReluV4(const int num, const T* bias, T* data, int K) {
CUDA_KERNEL_LOOP(index, num) {
int bias_idx = index % K;
const T bias_ptr = bias[bias_idx];
......@@ -63,7 +63,7 @@ __global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
}
template <typename T>
__global__ void general_bias(const int num, const T* bias, T* data) {
__global__ void AddBias(const int num, const T* bias, T* data) {
int offset = blockIdx.x * num;
for (int i = threadIdx.x; i < num; i += blockDim.x) {
......@@ -78,7 +78,7 @@ __global__ void general_bias(const int num, const T* bias, T* data) {
}
template <typename T>
__global__ void general_relu_bias(const int num, const T* bias, T* data) {
__global__ void AddBiasRelu(const int num, const T* bias, T* data) {
int offset = blockIdx.x * num;
for (int i = threadIdx.x; i < num; i += blockDim.x) {
......@@ -140,10 +140,10 @@ void FcCompute<T, PType>::Run() {
const auto* bias_ptr_v4 = reinterpret_cast<const trans_type*>(b_data);
auto* data_ptr_v4 = reinterpret_cast<trans_type*>(out_data);
if (activation_type == "relu") {
bias_relu_v4<trans_type><<<blocks, threads, 0, stream>>>(
AddBiasReluV4<trans_type><<<blocks, threads, 0, stream>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
} else if (activation_type == "") {
bias_v4<trans_type><<<blocks, threads, 0, stream>>>(
AddBiasV4<trans_type><<<blocks, threads, 0, stream>>>(
num, bias_ptr_v4, data_ptr_v4, N / 4);
} else {
LOG(FATAL) << "not supported activation type: " << activation_type;
......@@ -152,9 +152,9 @@ void FcCompute<T, PType>::Run() {
const int threads = 256;
const int blocks = M;
if (activation_type == "relu") {
general_relu_bias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
AddBiasRelu<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
} else if (activation_type == "") {
general_bias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
AddBias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
} else {
LOG(FATAL) << "not supported activation type: " << activation_type;
}
......
......@@ -31,101 +31,101 @@ namespace cuda {
class FcTest : public ::testing::Test {
protected:
FcTest()
: m(128),
k(512),
n(64),
in_num_col_dims(1),
act_type("relu"),
x_shape({m, k}),
w_shape({k, n}),
b_shape({n}),
out_shape({m, n}) {
X_gpu.Resize(lite::DDim(x_shape));
X_ref.Resize(lite::DDim(x_shape));
W_gpu.Resize(lite::DDim(w_shape));
W_ref.Resize(lite::DDim(w_shape));
b_gpu.Resize(lite::DDim(b_shape));
b_ref.Resize(lite::DDim(b_shape));
auto x_ref_data = X_ref.mutable_data<float>();
auto w_ref_data = W_ref.mutable_data<float>();
auto b_ref_data = b_ref.mutable_data<float>();
: m_(128),
k_(512),
n_(64),
in_num_col_dims_(1),
act_type_("relu"),
x_shape_({m_, k_}),
w_shape_({k_, n_}),
b_shape_({n_}),
out_shape_({m_, n_}) {
x_ref_.Resize(lite::DDim(x_shape_));
x_gpu_.Resize(lite::DDim(x_shape_));
w_ref_.Resize(lite::DDim(w_shape_));
w_gpu_.Resize(lite::DDim(w_shape_));
b_ref_.Resize(lite::DDim(b_shape_));
b_gpu_.Resize(lite::DDim(b_shape_));
auto x_ref_data = x_ref_.mutable_data<float>();
auto w_ref_data = w_ref_.mutable_data<float>();
auto b_ref_data = b_ref_.mutable_data<float>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
for (int64_t i = 0; i < x_ref_.numel(); i++) {
x_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
for (int64_t i = 0; i < W_ref.numel(); i++) {
for (int64_t i = 0; i < w_ref_.numel(); i++) {
w_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
for (int64_t i = 0; i < b_ref.numel(); i++) {
for (int64_t i = 0; i < b_ref_.numel(); i++) {
b_ref_data[i] = static_cast<float>(i % 10 * 0.2);
}
Out_ref.Resize(lite::DDim(out_shape));
Out_cpu.Resize(Out_ref.dims());
Out_gpu.Resize(Out_ref.dims());
fc_cpu_base(&X_ref, &W_ref, &b_ref, &Out_ref);
out_ref_.Resize(lite::DDim(out_shape_));
out_cpu_.Resize(out_ref_.dims());
out_gpu_.Resize(out_ref_.dims());
RunBaseLine(&x_ref_, &w_ref_, &b_ref_, &out_ref_);
device_init();
InitParamAndContext();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.input = &X_gpu;
param.w = &W_gpu;
param.bias = &b_gpu;
param.in_num_col_dims = in_num_col_dims;
param.activation_type = act_type;
param.output = &Out_gpu;
void InitParamAndContext() {
ctx_.reset(new KernelContext);
cudaStreamCreate(&stream_);
auto& context = ctx_->As<CUDAContext>();
context.SetExecStream(stream_);
param_.input = &x_gpu_;
param_.w = &w_gpu_;
param_.bias = &b_gpu_;
param_.in_num_col_dims = in_num_col_dims_;
param_.activation_type = act_type_;
param_.output = &out_gpu_;
}
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
W_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(W_ref.data<float>(),
W_gpu.dims());
b_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(b_ref.data<float>(),
b_gpu.dims());
void InitFloatInput() {
x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
x_gpu_.dims());
w_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(w_ref_.data<float>(),
w_gpu_.dims());
b_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(b_ref_.data<float>(),
b_gpu_.dims());
}
void half_data_init() {
X_half.Resize(lite::DDim(x_shape));
auto x_half_data = X_half.mutable_data<half>();
for (int64_t i = 0; i < X_half.numel(); i++) {
x_half_data[i] = half(lite::float16(X_ref.data<float>()[i]));
void InitHalfInput() {
x_half_.Resize(lite::DDim(x_shape_));
auto x_half_data = x_half_.mutable_data<half>();
for (int64_t i = 0; i < x_half_.numel(); i++) {
x_half_data[i] = half(lite::float16(x_ref_.data<float>()[i]));
}
X_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, X_gpu.dims());
W_half.Resize(W_ref.dims());
auto w_half_data = W_half.mutable_data<half>();
for (int64_t i = 0; i < W_half.numel(); i++) {
w_half_data[i] = half(lite::float16(W_ref.data<float>()[i]));
x_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, x_gpu_.dims());
w_half_.Resize(w_ref_.dims());
auto w_half_data = w_half_.mutable_data<half>();
for (int64_t i = 0; i < w_half_.numel(); i++) {
w_half_data[i] = half(lite::float16(w_ref_.data<float>()[i]));
}
W_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(w_half_data, W_gpu.dims());
b_half.Resize(b_ref.dims());
auto b_half_data = b_half.mutable_data<half>();
for (int64_t i = 0; i < b_half.numel(); i++) {
b_half_data[i] = half(lite::float16(b_ref.data<float>()[i]));
w_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(w_half_data, w_gpu_.dims());
b_half_.Resize(b_ref_.dims());
auto b_half_data = b_half_.mutable_data<half>();
for (int64_t i = 0; i < b_half_.numel(); i++) {
b_half_data[i] = half(lite::float16(b_ref_.data<float>()[i]));
}
b_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(b_half_data, b_gpu.dims());
b_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(b_half_data, b_gpu_.dims());
}
void fc_cpu_base(const lite::Tensor* X,
const lite::Tensor* W,
void RunBaseLine(const lite::Tensor* x,
const lite::Tensor* w,
const lite::Tensor* b,
lite::Tensor* Out) {
const float* data_in = X->data<float>();
lite::Tensor* out) {
const float* data_in = x->data<float>();
const float* bias = b->data<float>();
const float* weights = W->data<float>();
float* data_out = Out->mutable_data<float>();
int out_rows = X->dims()[0];
int in_cols = X->numel() / out_rows;
int out_cols = W->numel() / in_cols;
const float* weights = w->data<float>();
float* data_out = out->mutable_data<float>();
int out_rows = x->dims()[0];
int in_cols = x->numel() / out_rows;
int out_cols = w->numel() / in_cols;
int index_out;
for (int i = 0; i < out_rows; i++) {
for (int j = 0; j < out_cols; j++) {
......@@ -135,31 +135,31 @@ class FcTest : public ::testing::Test {
data_out[index_out] +=
data_in[i * in_cols + k] * weights[k * out_cols + j];
}
if (act_type == "relu") {
if (act_type_ == "relu") {
data_out[index_out] *= static_cast<int>(data_out[index_out] > 0);
}
}
}
}
int m, k, n, in_num_col_dims;
std::string act_type;
std::vector<int64_t> x_shape, w_shape, b_shape, out_shape;
lite::Tensor X_ref, W_ref, b_ref, Out_ref;
lite::Tensor X_gpu, W_gpu, b_gpu;
lite::Tensor X_half, W_half, b_half;
lite::Tensor Out_cpu, Out_gpu;
operators::FcParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
int m_, k_, n_, in_num_col_dims_;
std::string act_type_;
std::vector<int64_t> x_shape_, w_shape_, b_shape_, out_shape_;
lite::Tensor x_ref_, w_ref_, b_ref_, out_ref_;
lite::Tensor x_gpu_, w_gpu_, b_gpu_;
lite::Tensor x_half_, w_half_, b_half_;
lite::Tensor out_cpu_, out_gpu_;
operators::FcParam param_;
std::unique_ptr<KernelContext> ctx_;
cudaStream_t stream_;
};
TEST_F(FcTest, TestFP32) {
float_data_init();
InitFloatInput();
FcCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
kernel.SetParam(param_);
kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
......@@ -177,14 +177,14 @@ TEST_F(FcTest, TestFP32) {
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
out_gpu_.data<float>(),
sizeof(float) * out_gpu_.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
float res = Out_cpu.data<float>()[i];
float ref = Out_ref.data<float>()[i];
for (int i = 0; i < out_gpu_.numel(); ++i) {
float res = out_cpu_.data<float>()[i];
float ref = out_ref_.data<float>()[i];
EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5);
}
}
......
......@@ -28,11 +28,6 @@ class SequenceMaskCompute : public KernelLite<TARGET(kCUDA), Ptype> {
void Run() override;
virtual ~SequenceMaskCompute() = default;
// private:
// lite::Tensor seq_offsets_;
// std::vector<int64_t> seq_len_;
// std::vector<size_t> seq_offsets_vec_;
};
} // namespace cuda
......
......@@ -32,73 +32,73 @@ namespace cuda {
class SequenceMaskTest : public ::testing::Test {
protected:
SequenceMaskTest()
: maxlen(4),
out_dtype(5),
x_data({3, 2, 1, 0}),
out_shape({static_cast<int64_t>(x_data.size()), maxlen}) {
X_ref.Resize(lite::DDim({static_cast<int64_t>(x_data.size())}));
X_gpu.Resize(X_ref.dims());
: maxlen_(4),
out_dtype_(5),
x_data_({3, 2, 1, 0}),
out_shape_({static_cast<int64_t>(x_data_.size()), maxlen_}) {
x_ref_.Resize(lite::DDim({static_cast<int64_t>(x_data_.size())}));
x_gpu_.Resize(x_ref_.dims());
auto* x_ref_data = X_ref.mutable_data<int64_t>();
auto* x_ref_data = x_ref_.mutable_data<int64_t>();
// prepare input
for (size_t i = 0; i < x_data.size(); i++) {
x_ref_data[i] = x_data[i];
for (size_t i = 0; i < x_data_.size(); i++) {
x_ref_data[i] = x_data_[i];
}
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
cpu_base(&X_ref, &Out_ref);
out_ref_.Resize(lite::DDim(out_shape_));
out_gpu_.Resize(out_ref_.dims());
out_cpu_.Resize(out_ref_.dims());
RunBaseLine(&x_ref_, &out_ref_);
device_init();
InitParamAndContext();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.X = &X_gpu;
param.Y = &Out_gpu;
param.maxlen = maxlen;
param.out_dtype = out_dtype;
void InitParamAndContext() {
ctx_.reset(new KernelContext);
cudaStreamCreate(&stream_);
auto& context = ctx_->As<CUDAContext>();
context.SetExecStream(stream_);
param_.X = &x_gpu_;
param_.Y = &out_gpu_;
param_.maxlen = maxlen_;
param_.out_dtype = out_dtype_;
}
void float_data_init() {
X_gpu.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(X_ref.data<int64_t>(),
X_gpu.dims());
void InitFloatInput() {
x_gpu_.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(x_ref_.data<int64_t>(),
x_gpu_.dims());
}
void half_data_init() {}
void InitHalfInput() {}
void cpu_base(const lite::Tensor* X, lite::Tensor* Out) {
auto* out_data = Out->mutable_data<float>();
void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) {
auto* out_data = out->mutable_data<float>();
for (size_t i = 0; i < x_data.size(); ++i) {
for (int j = 0; j < maxlen; ++j) {
out_data[i * maxlen + j] = j < x_data[i] ? 1 : 0;
for (size_t i = 0; i < x_data_.size(); ++i) {
for (int j = 0; j < maxlen_; ++j) {
out_data[i * maxlen_ + j] = j < x_data_[i] ? 1 : 0;
}
}
}
int maxlen, out_dtype;
std::vector<int64_t> x_data, out_shape;
int maxlen_, out_dtype_;
std::vector<int64_t> x_data_, out_shape_;
lite::Tensor X_ref, Out_ref;
lite::Tensor X_gpu, Out_gpu;
lite::Tensor Out_cpu;
lite::Tensor x_ref_, out_ref_;
lite::Tensor x_gpu_, out_gpu_;
lite::Tensor out_cpu_;
operators::SequenceMaskParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
operators::SequenceMaskParam param_;
std::unique_ptr<KernelContext> ctx_;
cudaStream_t stream_;
};
TEST_F(SequenceMaskTest, fp32) {
float_data_init();
InitFloatInput();
SequenceMaskCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
kernel.SetParam(param_);
kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
......@@ -116,12 +116,12 @@ TEST_F(SequenceMaskTest, fp32) {
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
out_gpu_.data<float>(),
sizeof(float) * out_gpu_.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
for (int i = 0; i < out_gpu_.numel(); ++i) {
EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
}
}
......
......@@ -23,7 +23,7 @@
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
// #include "lite/utils/float16.h"
#include "lite/utils/float16.h"
namespace paddle {
namespace lite {
......@@ -33,72 +33,73 @@ namespace cuda {
class SequencePadTest : public ::testing::Test {
protected:
SequencePadTest()
: batch(5),
features(2),
padded_length(3),
x_lod({{0, 2, 5}}),
x_shape({batch, features}),
pad_value_shape({features}),
out_shape({static_cast<int64_t>(x_lod[0].size() - 1),
padded_length,
features}) {
X_ref.Resize(lite::DDim(x_shape));
X_ref.set_lod(x_lod);
X_gpu.Resize(X_ref.dims());
PadValue_ref.Resize(lite::DDim(pad_value_shape));
PadValue_gpu.Resize(PadValue_ref.dims());
Length_ref.Resize(lite::DDim({static_cast<int64_t>(x_lod[0].size() - 1)}));
Length_gpu.Resize(Length_ref.dims());
auto x_ref_data = X_ref.mutable_data<float>();
auto pad_value_ref_data = PadValue_ref.mutable_data<float>();
: batch_(5),
features_(2),
padded_length_(3),
x_lod_({{0, 2, 5}}),
x_shape_({batch_, features_}),
pad_value_shape_({features_}),
out_shape_({static_cast<int64_t>(x_lod_[0].size() - 1),
padded_length_,
features_}) {
x_ref_.Resize(lite::DDim(x_shape_));
x_ref_.set_lod(x_lod_);
x_gpu_.Resize(x_ref_.dims());
pad_value_ref_.Resize(lite::DDim(pad_value_shape_));
pad_value_gpu_.Resize(pad_value_ref_.dims());
length_ref_.Resize(
lite::DDim({static_cast<int64_t>(x_lod_[0].size() - 1)}));
length_gpu_.Resize(length_ref_.dims());
auto x_ref_data = x_ref_.mutable_data<float>();
auto pad_value_ref_data = pad_value_ref_.mutable_data<float>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
for (int64_t i = 0; i < x_ref_.numel(); i++) {
x_ref_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < PadValue_ref.numel(); i++) {
for (int64_t i = 0; i < pad_value_ref_.numel(); i++) {
pad_value_ref_data[i] = static_cast<float>(i);
}
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
cpu_base(&X_ref, &PadValue_ref, &Out_ref, &Length_ref);
out_ref_.Resize(lite::DDim(out_shape_));
out_gpu_.Resize(out_ref_.dims());
out_cpu_.Resize(out_ref_.dims());
RunBaseLine(&x_ref_, &pad_value_ref_, &out_ref_, &length_ref_);
device_init();
InitParamAndContext();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.X = &X_gpu;
param.PadValue = &PadValue_gpu;
param.Length = &Length_gpu;
param.Out = &Out_gpu;
param.padded_length = padded_length;
void InitParamAndContext() {
ctx_.reset(new KernelContext);
cudaStreamCreate(&stream_);
auto& context = ctx_->As<CUDAContext>();
context.SetExecStream(stream_);
param_.X = &x_gpu_;
param_.PadValue = &pad_value_gpu_;
param_.Length = &length_gpu_;
param_.Out = &out_gpu_;
param_.padded_length = padded_length_;
}
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
X_gpu.set_lod(X_ref.lod());
PadValue_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(
PadValue_ref.data<float>(), PadValue_gpu.dims());
void InitFloatInput() {
x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
x_gpu_.dims());
x_gpu_.set_lod(x_ref_.lod());
pad_value_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(
pad_value_ref_.data<float>(), pad_value_gpu_.dims());
}
void half_data_init() {}
void InitHalfInput() {}
void cpu_base(const lite::Tensor* X,
const lite::Tensor* PadValue,
lite::Tensor* Out,
lite::Tensor* Length) {
auto* length_data = Length->mutable_data<int64_t>();
auto* out_data = Out->mutable_data<float>();
void RunBaseLine(const lite::Tensor* x,
const lite::Tensor* pad_value,
lite::Tensor* out,
lite::Tensor* length) {
auto* length_data = length->mutable_data<int64_t>();
auto* out_data = out->mutable_data<float>();
length_data[0] = 2;
length_data[1] = 3;
......@@ -112,24 +113,24 @@ class SequencePadTest : public ::testing::Test {
}
}
int batch, features, padded_length;
LoD x_lod;
std::vector<int64_t> x_shape, pad_value_shape, out_shape;
int batch_, features_, padded_length_;
LoD x_lod_;
std::vector<int64_t> x_shape_, pad_value_shape_, out_shape_;
lite::Tensor X_ref, PadValue_ref, Out_ref, Length_ref;
lite::Tensor X_gpu, PadValue_gpu, Out_gpu, Length_gpu;
lite::Tensor Out_cpu, Length_cpu;
lite::Tensor x_ref_, pad_value_ref_, out_ref_, length_ref_;
lite::Tensor x_gpu_, pad_value_gpu_, out_gpu_, length_gpu_;
lite::Tensor out_cpu_, length_cpu_;
operators::SequencePadParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
operators::SequencePadParam param_;
std::unique_ptr<KernelContext> ctx_;
cudaStream_t stream_;
};
TEST_F(SequencePadTest, fp32) {
float_data_init();
InitFloatInput();
SequencePadCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
kernel.SetParam(param_);
kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
......@@ -147,20 +148,20 @@ TEST_F(SequencePadTest, fp32) {
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
out_gpu_.data<float>(),
sizeof(float) * out_gpu_.numel(),
IoDirection::DtoH);
CopySync<TARGET(kCUDA)>(Length_cpu.mutable_data<int64_t>(),
Length_gpu.data<int64_t>(),
sizeof(int64_t) * Length_gpu.numel(),
CopySync<TARGET(kCUDA)>(length_cpu_.mutable_data<int64_t>(),
length_gpu_.data<int64_t>(),
sizeof(int64_t) * length_gpu_.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
for (int i = 0; i < out_gpu_.numel(); ++i) {
EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
}
for (int i = 0; i < Length_gpu.numel(); ++i) {
for (int i = 0; i < length_gpu_.numel(); ++i) {
EXPECT_NEAR(
Length_cpu.data<int64_t>()[i], Length_ref.data<int64_t>()[i], 1e-5);
length_cpu_.data<int64_t>()[i], length_ref_.data<int64_t>()[i], 1e-5);
}
}
......
......@@ -23,7 +23,7 @@
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
// #include "lite/utils/float16.h"
#include "lite/utils/float16.h"
namespace paddle {
namespace lite {
......@@ -33,66 +33,66 @@ namespace cuda {
class SequenceUnpadTest : public ::testing::Test {
protected:
SequenceUnpadTest()
: batch(5),
features(2),
padded_length(3),
out_lod({{0, 2, 5}}),
x_shape({static_cast<int64_t>(out_lod[0].size() - 1),
padded_length,
features}),
out_shape({batch, features}) {
X_ref.Resize(lite::DDim(x_shape));
X_gpu.Resize(X_ref.dims());
Length_ref.Resize(
lite::DDim({static_cast<int64_t>(out_lod[0].size() - 1)}));
Length_gpu.Resize(Length_ref.dims());
auto* x_ref_data = X_ref.mutable_data<float>();
auto* length_ref_data = Length_ref.mutable_data<int64_t>();
: batch_(5),
features_(2),
padded_length_(3),
out_lod_({{0, 2, 5}}),
x_shape_({static_cast<int64_t>(out_lod_[0].size() - 1),
padded_length_,
features_}),
out_shape_({batch_, features_}) {
x_ref_.Resize(lite::DDim(x_shape_));
x_gpu_.Resize(x_ref_.dims());
length_ref_.Resize(
lite::DDim({static_cast<int64_t>(out_lod_[0].size() - 1)}));
length_gpu_.Resize(length_ref_.dims());
auto* x_ref_data = x_ref_.mutable_data<float>();
auto* length_ref_data = length_ref_.mutable_data<int64_t>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
for (int64_t i = 0; i < x_ref_.numel(); i++) {
x_ref_data[i] = static_cast<float>(i);
}
for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
length_ref_data[i] = out_lod[0][i + 1] - out_lod[0][i];
for (size_t i = 0; i < out_lod_[0].size() - 1; ++i) {
length_ref_data[i] = out_lod_[0][i + 1] - out_lod_[0][i];
}
Out_ref.Resize(lite::DDim(out_shape));
Out_ref.set_lod(out_lod);
Out_gpu.Resize(Out_ref.dims());
Out_gpu.set_lod(Out_ref.lod());
Out_cpu.Resize(Out_ref.dims());
Out_cpu.set_lod(Out_ref.lod());
out_ref_.Resize(lite::DDim(out_shape_));
out_ref_.set_lod(out_lod_);
out_gpu_.Resize(out_ref_.dims());
out_gpu_.set_lod(out_ref_.lod());
out_cpu_.Resize(out_ref_.dims());
out_cpu_.set_lod(out_ref_.lod());
cpu_base(&X_ref, &Length_ref, &Out_ref);
RunBaseLine(&x_ref_, &length_ref_, &out_ref_);
device_init();
InitParamAndContext();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.X = &X_gpu;
param.Length = &Length_gpu;
param.Out = &Out_gpu;
void InitParamAndContext() {
ctx_.reset(new KernelContext);
cudaStreamCreate(&stream_);
auto& context = ctx_->As<CUDAContext>();
context.SetExecStream(stream_);
param_.X = &x_gpu_;
param_.Length = &length_gpu_;
param_.Out = &out_gpu_;
}
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
Length_gpu.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(
Length_ref.data<int64_t>(), Length_gpu.dims());
void InitFloatInput() {
x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
x_gpu_.dims());
length_gpu_.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(
length_ref_.data<int64_t>(), length_gpu_.dims());
}
void half_data_init() {}
void InitHalfInput() {}
void cpu_base(const lite::Tensor* X,
const lite::Tensor* Length,
lite::Tensor* Out) {
void RunBaseLine(const lite::Tensor* X,
const lite::Tensor* Length,
lite::Tensor* Out) {
auto* out_data = Out->mutable_data<float>();
for (size_t i = 0; i < 4; ++i) {
......@@ -103,24 +103,24 @@ class SequenceUnpadTest : public ::testing::Test {
}
}
int batch, features, padded_length;
LoD out_lod;
std::vector<int64_t> x_shape, out_shape;
int batch_, features_, padded_length_;
LoD out_lod_;
std::vector<int64_t> x_shape_, out_shape_;
lite::Tensor X_ref, Out_ref, Length_ref;
lite::Tensor X_gpu, Out_gpu, Length_gpu;
lite::Tensor Out_cpu, Length_cpu;
lite::Tensor x_ref_, out_ref_, length_ref_;
lite::Tensor x_gpu_, out_gpu_, length_gpu_;
lite::Tensor out_cpu_, length_cpu_;
operators::SequencePadParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
operators::SequencePadParam param_;
std::unique_ptr<KernelContext> ctx_;
cudaStream_t stream_;
};
TEST_F(SequenceUnpadTest, fp32) {
float_data_init();
InitFloatInput();
SequenceUnpadCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
kernel.SetParam(param_);
kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
......@@ -138,12 +138,12 @@ TEST_F(SequenceUnpadTest, fp32) {
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
out_gpu_.data<float>(),
sizeof(float) * out_gpu_.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
for (int i = 0; i < out_gpu_.numel(); ++i) {
EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
}
}
......
......@@ -43,7 +43,7 @@ void TransposeCompute<T, Ptype>::Run() {
// NCHW -> NHWC
if (axes.size() == 4 && axes[0] == 0 && axes[1] == 2 && axes[2] == 3 &&
axes[3] == 1) {
trans.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream);
trans_.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
return;
......@@ -52,13 +52,13 @@ void TransposeCompute<T, Ptype>::Run() {
// NHWC -> NCHW
if (axes.size() == 4 && axes[0] == 0 && axes[1] == 3 && axes[2] == 1 &&
axes[3] == 2) {
trans.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream);
trans_.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
return;
}
trans.transpose(out, in, dims, axes, &stream);
trans_.transpose(out, in, dims, axes, &stream);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
......
......@@ -30,7 +30,7 @@ class TransposeCompute : public KernelLite<TARGET(kCUDA), Ptype> {
virtual ~TransposeCompute() = default;
private:
lite::cuda::math::Transpose<Dtype> trans;
lite::cuda::math::Transpose<Dtype> trans_;
};
} // namespace cuda
......
......@@ -36,9 +36,9 @@ namespace {
#define OUT(n, c, h, w) \
output_data[w + h * output_w + c * output_h * output_w + \
n * output_c * output_h * output_w]
void nchw2nhwc_ref(lite::Tensor* input,
lite::Tensor* output,
const std::vector<int> axies) {
void Nchw2nhwcBaseLine(lite::Tensor* input,
lite::Tensor* output,
const std::vector<int> axies) {
auto* input_data = input->data<float>();
auto* output_data = output->mutable_data<float>();
......@@ -69,9 +69,9 @@ void nchw2nhwc_ref(lite::Tensor* input,
#define OUT(n, h, w, c) \
output_data[c + w * output_c + h * output_w * output_c + \
n * output_h * output_w * output_c]
void nhwc2nchw_ref(lite::Tensor* input,
lite::Tensor* output,
const std::vector<int> axies) {
void Nhwc2nchwBaseLine(lite::Tensor* input,
lite::Tensor* output,
const std::vector<int> axies) {
auto* input_data = input->data<float>();
auto* output_data = output->mutable_data<float>();
......@@ -94,7 +94,7 @@ void nhwc2nchw_ref(lite::Tensor* input,
}
}
void transpose_ref(const lite::Tensor* input,
void TransBaseLine(const lite::Tensor* input,
lite::Tensor* output,
const std::vector<int> axes) {
auto* input_data = input->data<float>();
......@@ -173,9 +173,9 @@ TEST(transpose_nchw, normal) {
auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
nchw2nhwc_ref(&x_ref, &out_ref, axes);
Nchw2nhwcBaseLine(&x_ref, &out_ref, axes);
auto* out_ref_data = out_ref.mutable_data<float>();
// transpose_ref(&x_ref, &out_ref, axes);
// TransBaseLine(&x_ref, &out_ref, axes);
for (int i = 0; i < out.numel(); i++) {
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
}
......@@ -225,8 +225,8 @@ TEST(transpose_nhwc, normal) {
auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
nhwc2nchw_ref(&x_ref, &out_ref, axes);
// transpose_ref(&x_ref, &out_ref, axes);
Nhwc2nchwBaseLine(&x_ref, &out_ref, axes);
// TransBaseLine(&x_ref, &out_ref, axes);
auto* out_ref_data = out_ref.mutable_data<float>();
for (int i = 0; i < out.numel(); i++) {
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
......@@ -236,77 +236,77 @@ TEST(transpose_nhwc, normal) {
class TransposeTest : public ::testing::Test {
protected:
TransposeTest()
: C(3),
H(128),
W(64),
axes({1, 2, 0}),
x_shape({C, H, W}),
out_shape({H, W, C}) {
X_ref.Resize(lite::DDim(x_shape));
X_gpu.Resize(X_ref.dims());
: C_(3),
H_(128),
W_(64),
axes_({1, 2, 0}),
x_shape_({C_, H_, W_}),
out_shape_({H_, W_, C_}) {
x_ref_.Resize(lite::DDim(x_shape_));
x_gpu_.Resize(x_ref_.dims());
auto x_ref_data = X_ref.mutable_data<float>();
auto x_ref_data = x_ref_.mutable_data<float>();
// prepare input
for (int64_t i = 0; i < X_ref.numel(); i++) {
x_ref_data[i] = static_cast<float>(i);
for (int64_t i = 0; i < x_ref_.numel(); i++) {
x_ref_data[i] = static_cast<float>(i);
}
Out_ref.Resize(lite::DDim(out_shape));
Out_gpu.Resize(Out_ref.dims());
Out_cpu.Resize(Out_ref.dims());
cpu_base(&X_ref, &Out_ref);
out_ref_.Resize(lite::DDim(out_shape_));
out_gpu_.Resize(out_ref_.dims());
out_cpu_.Resize(out_ref_.dims());
RunBaseLine(&x_ref_, &out_ref_);
device_init();
InitParamAndContext();
}
void device_init() {
ctx.reset(new KernelContext);
cudaStreamCreate(&stream);
auto& context = ctx->As<CUDAContext>();
context.SetExecStream(stream);
param.x = &X_gpu;
param.output = &Out_gpu;
param.axis = axes;
void InitParamAndContext() {
ctx_.reset(new KernelContext);
cudaStreamCreate(&stream_);
auto& context = ctx_->As<CUDAContext>();
context.SetExecStream(stream_);
param_.x = &x_gpu_;
param_.output = &out_gpu_;
param_.axis = axes_;
}
void float_data_init() {
X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
X_gpu.dims());
void InitFloatInput() {
x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
x_gpu_.dims());
}
void half_data_init() {
X_half.Resize(lite::DDim(X_ref.dims()));
auto x_half_data = X_half.mutable_data<half>();
for (int64_t i = 0; i < X_half.numel(); i++) {
x_half_data[i] = half(lite::float16(X_ref.data<float>()[i]));
void InitHalfInput() {
x_half_.Resize(lite::DDim(x_ref_.dims()));
auto x_half_data = x_half_.mutable_data<half>();
for (int64_t i = 0; i < x_half_.numel(); i++) {
x_half_data[i] = half(lite::float16(x_ref_.data<float>()[i]));
}
X_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, X_gpu.dims());
x_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, x_gpu_.dims());
}
void cpu_base(const lite::Tensor* X, lite::Tensor* Out) {
transpose_ref(X, Out, axes);
void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) {
TransBaseLine(x, out, axes_);
}
int C, H, W;
std::vector<int> axes;
std::vector<int64_t> x_shape, out_shape;
int C_, H_, W_;
std::vector<int> axes_;
std::vector<int64_t> x_shape_, out_shape_;
lite::Tensor X_ref, Out_ref;
lite::Tensor X_gpu, Out_gpu;
lite::Tensor X_half;
lite::Tensor Out_cpu;
lite::Tensor x_ref_, out_ref_;
lite::Tensor x_gpu_, out_gpu_;
lite::Tensor x_half_;
lite::Tensor out_cpu_;
operators::TransposeParam param;
std::unique_ptr<KernelContext> ctx;
cudaStream_t stream;
operators::TransposeParam param_;
std::unique_ptr<KernelContext> ctx_;
cudaStream_t stream_;
};
TEST_F(TransposeTest, fp32) {
float_data_init();
InitFloatInput();
TransposeCompute<float, PRECISION(kFloat)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
kernel.SetParam(param_);
kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
......@@ -324,20 +324,20 @@ TEST_F(TransposeTest, fp32) {
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
Out_gpu.data<float>(),
sizeof(float) * Out_gpu.numel(),
CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
out_gpu_.data<float>(),
sizeof(float) * out_gpu_.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_gpu.numel(); ++i) {
EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
for (int i = 0; i < out_gpu_.numel(); ++i) {
EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
}
}
TEST_F(TransposeTest, TestFP16) {
half_data_init();
InitHalfInput();
TransposeCompute<half, PRECISION(kFP16)> kernel;
kernel.SetParam(param);
kernel.SetContext(std::move(ctx));
kernel.SetParam(param_);
kernel.SetContext(std::move(ctx_));
for (int i = 0; i < FLAGS_warmup; ++i) {
kernel.Launch();
......@@ -355,16 +355,16 @@ TEST_F(TransposeTest, TestFP16) {
<< ", repeats: " << FLAGS_repeats << ", spend "
<< duration / FLAGS_repeats << " ms in average.";
const half* out_gpu_data = Out_gpu.data<half>();
half* out_cpu_data = Out_cpu.mutable_data<half>();
CopySync<TARGET(kCUDA)>(out_cpu_data,
out_gpu_data,
sizeof(half) * Out_gpu.numel(),
const half* out_gpu_data = out_gpu_.data<half>();
half* out_cpu_data = out_cpu_.mutable_data<half>();
CopySync<TARGET(kCUDA)>(out_cpu_data,
out_gpu_data,
sizeof(half) * out_gpu_.numel(),
IoDirection::DtoH);
for (int i = 0; i < Out_cpu.numel(); ++i) {
float res = static_cast<float>(lite::float16(out_cpu_data[i]));
float ref = Out_ref.data<float>()[i];
for (int i = 0; i < out_cpu_.numel(); ++i) {
float res = static_cast<float>(lite::float16(out_cpu_data[i]));
float ref = out_ref_.data<float>()[i];
EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2);
}
}
......