提交 da043b14 编写于 作者: W Wilber 提交者: GitHub

update cuda kernels to run content-dnn models test=develop (#2554)

update cuda kernels to run content-dnn model
上级 239edc81
......@@ -121,6 +121,7 @@ template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
......
......@@ -115,6 +115,8 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kCUDA, kAny, kNCHW);
INIT_FOR(kCUDA, kAny, kAny);
INIT_FOR(kCUDA, kInt8, kNHWC);
INIT_FOR(kCUDA, kInt64, kNCHW);
INIT_FOR(kCUDA, kInt64, kNHWC);
INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kAny, kNCHW);
......
......@@ -40,6 +40,7 @@ __global__ void ker_attention_padding_mask(T* out_data,
const int attn_seq_len,
const int src_seq_num,
const int src_seq_len,
const T* pad_begin_data,
const T mask,
const int count) {
CUDA_KERNEL_LOOP(tid, count) {
......@@ -49,7 +50,12 @@ __global__ void ker_attention_padding_mask(T* out_data,
int attn_word_id = tmp_tid % attn_seq_len;
int src_seq_id = attn_seq_id % src_seq_num;
int cur_len = src_offset[src_seq_id + 1] - src_offset[src_seq_id];
if (src_word_id >= cur_len) {
int k = static_cast<int>(pad_begin_data[src_seq_id]);
if (k < cur_len &&
tid >= src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + k &&
tid < src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) +
cur_len) {
out_data[tid] = mask;
} else {
out_data[tid] = attn_data[tid];
......@@ -79,6 +85,35 @@ void AttentionPaddingMaskCompute::Run() {
auto attn_data = attn->data<float>();
auto out_data = out->mutable_data<float>(TARGET(kCUDA));
std::vector<float> src_cpu(src->numel(), 0);
TargetWrapperCuda::MemcpyAsync(src_cpu.data(),
src->data<float>(),
sizeof(float) * src->numel(),
IoDirection::DtoH,
stream);
cudaStreamSynchronize(stream);
std::vector<float> pad_begin(src_seq_num, 0);
auto src_len = static_cast<int64_t>(src->lod()[0][1]);
int _pad_id = param.pad_id;
for (int i = 0; i < src_seq_num; ++i) {
const auto* src_data = src_cpu.data() + src_len * i;
int index = src_len - 1;
for (; index >= 0 && _pad_id == static_cast<int>(src_data[index]);
--index) {
}
pad_begin[i] = static_cast<float>(index + 1);
}
param.pad_begin->Resize({static_cast<int64_t>(src_seq_num)});
auto pad_begin_cuda_data =
param.pad_begin->mutable_data<float>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(pad_begin_cuda_data,
pad_begin.data(),
sizeof(float) * src_seq_num,
IoDirection::HtoD,
stream);
std::vector<int> src_offset_cpu(src_offset.size(), 0);
for (int i = 0; i < src_offset.size(); i++) {
src_offset_cpu[i] = src_offset[i];
......@@ -101,11 +136,12 @@ void AttentionPaddingMaskCompute::Run() {
attn_seq_len,
src_seq_num,
src_seq_len,
pad_begin_cuda_data,
param.mask,
count);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
......@@ -113,7 +149,7 @@ void AttentionPaddingMaskCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(attention_padding_mask,
REGISTER_LITE_KERNEL(search_attention_padding_mask,
kCUDA,
kFloat,
kNCHW,
......
......@@ -20,21 +20,22 @@ namespace lite {
namespace kernels {
namespace cuda {
void FeedCompute::Run() {
auto& param = this->Param<param_t>();
template <typename T, PrecisionType Ptype>
void FeedCompute<T, Ptype>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
VLOG(4) << "feed_list.size: " << param.feed_list->size();
const lite::Tensor& feed_item = (*param.feed_list)[param.col];
int num = static_cast<int>(feed_item.numel());
auto input = feed_item.data<float>();
auto input = feed_item.data<T>();
param.out->Resize(feed_item.dims());
auto output = param.out->mutable_data<float>(TARGET(kCUDA));
auto output = param.out->template mutable_data<T>(TARGET(kCUDA));
VLOG(4) << "col: " << param.col << " num:" << num;
TargetW::MemcpyAsync(
output, input, num * sizeof(float), IoDirection::HtoD, stream);
output, input, num * sizeof(T), IoDirection::HtoD, stream);
}
} // namespace cuda
......@@ -42,8 +43,13 @@ void FeedCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
feed, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::FeedCompute, nchw)
typedef paddle::lite::kernels::cuda::FeedCompute<float, PRECISION(kFloat)>
FeedFp32;
typedef paddle::lite::kernels::cuda::FeedCompute<int64_t, PRECISION(kInt64)>
FeedInt64;
REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNCHW, FeedFp32, nchw)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
......@@ -54,8 +60,7 @@ REGISTER_LITE_KERNEL(
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(
feed, kCUDA, kFloat, kNHWC, paddle::lite::kernels::cuda::FeedCompute, nhwc)
REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNHWC, FeedFp32, nhwc)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
......@@ -65,3 +70,25 @@ REGISTER_LITE_KERNEL(
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNCHW, FeedInt64, nchw)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt64),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kInt64),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNHWC, FeedInt64, nhwc)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt64),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kInt64),
DATALAYOUT(kNHWC))})
.Finalize();
......@@ -20,7 +20,8 @@ namespace lite {
namespace kernels {
namespace cuda {
class FeedCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
template <typename T, PrecisionType Ptype>
class FeedCompute : public KernelLite<TARGET(kCUDA), Ptype> {
public:
using param_t = operators::FeedParam;
using TargetW = TargetWrapper<TARGET(kCUDA)>;
......
......@@ -82,8 +82,32 @@ void MatchMatrixTensorCompute::Run() {
gemm_impl_->run(1.0f, 0.0f, l_t_data, r_data, top_data, &context);
}
}
int batch_size = x->lod()[0].size() - 1;
int lod_lv1_size = batch_size * dim_t;
int lod_lv2_size = x->lod()[0].back() * dim_t;
std::vector<size_t> out_lod0(batch_size + 1, 0);
std::vector<size_t> out_lod1(lod_lv1_size + 1, 0);
std::vector<size_t> out_lod2(lod_lv2_size + 1, 0);
for (int i = 0; i < batch_size; i++) {
out_lod0[i + 1] = out_lod0[i] + dim_t;
int len_l = offset_l[i + 1] - offset_l[i];
for (int j = 0; j < dim_t; j++) {
out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l;
int len_r = offset_r[i + 1] - offset_r[i];
for (int k = 0; k < len_l; k++) {
out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] =
out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r;
}
}
}
LoD out_lod;
out_lod.push_back(top_offset);
out_lod.push_back(offset_l);
out_lod.push_back(offset_r);
out->set_lod(out_lod);
}
......
......@@ -32,4 +32,7 @@ REGISTER_LITE_KERNEL(search_aligned_mat_mul,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
......@@ -36,7 +36,6 @@ void anakin_NV_gemv<float>(cublasHandle_t handle,
const float* x,
const float beta,
float* y) {
LOG(INFO) << "1";
cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
CUBLAS_CHECK(
cublasSgemv(handle, cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1));
......@@ -66,17 +65,13 @@ void anakin_NV_gemm<float>(cublasHandle_t handle,
const float* B,
const float beta,
float* C) {
LOG(INFO) << "1";
// Note that cublas follows fortran order.
int lda = (!TransA /* == CblasNoTrans*/) ? K : M;
int ldb = (!TransB /* == CblasNoTrans*/) ? N : K;
LOG(INFO) << "1";
cublasOperation_t cuTransA =
(!TransA /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T;
LOG(INFO) << "1";
cublasOperation_t cuTransB =
(!TransB /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T;
LOG(INFO) << "1";
CUBLAS_CHECK(cublasSgemm(handle,
cuTransB,
cuTransA,
......@@ -91,7 +86,6 @@ void anakin_NV_gemm<float>(cublasHandle_t handle,
&beta,
C,
N));
LOG(INFO) << "1";
}
template <>
......
......@@ -46,7 +46,9 @@ __global__ void ker_search_group_padding(Dtype* out_emb_padding_data,
in_data[(offset[seq_id] + word_id_in_seq) * emb_size + emb_id];
} else {
out_emb_padding_data[tid] = 0.f;
out_padding_data[word_id] = pad_id;
if (emb_id == 0) {
out_padding_data[word_id] = pad_id;
}
}
}
}
......@@ -61,12 +63,7 @@ void SearchGroupPaddingCompute::Run() {
Tensor* out_new = param.out_new;
Tensor* out_padding = param.out_padding;
const float pad_id = static_cast<float>(param.pad_id);
const float* in_data = x->data<float>();
float* out_emb_padding_data =
out_emb_padding->mutable_data<float>(TARGET(kCUDA));
float* out_new_data = out_new->mutable_data<float>(TARGET(kCUDA));
float* out_padding_data = out_padding->mutable_data<float>(TARGET(kCUDA));
const auto& in_seq_offset = x->lod()[0];
int batch = in_seq_offset.size() - 1;
int max_seq = 0;
......@@ -85,16 +82,20 @@ void SearchGroupPaddingCompute::Run() {
out_emb_padding_lod.push_back(new_offset);
out_emb_padding->set_lod(out_emb_padding_lod);
out_emb_padding->Resize({batch * max_seq, x_dims[1]});
float* out_emb_padding_data =
out_emb_padding->mutable_data<float>(TARGET(kCUDA));
LoD out_new_lod;
out_new_lod.push_back(in_seq_offset);
out_new->set_lod(out_new_lod);
out_new->Resize({x_dims[0], 1});
float* out_new_data = out_new->mutable_data<float>(TARGET(kCUDA));
LoD out_padding_lod;
out_padding_lod.push_back(new_offset);
out_padding->set_lod(out_padding_lod);
out_padding->Resize({batch * max_seq, 1});
float* out_padding_data = out_padding->mutable_data<float>(TARGET(kCUDA));
const int count = out_emb_padding->numel();
const auto& out_emb_padding_seq_offset = out_emb_padding->lod()[0];
......@@ -112,6 +113,10 @@ void SearchGroupPaddingCompute::Run() {
TargetWrapperCuda::MemsetSync(
out_new_data, 0, out_new->dims()[0] * out_new->dims()[1] * sizeof(float));
TargetWrapperCuda::MemsetSync(
out_padding_data,
0,
out_padding->dims()[0] * out_padding->dims()[1] * sizeof(float));
ker_search_group_padding<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, cuda_stream>>>(
......
......@@ -50,6 +50,7 @@ void SearchSeqDepaddingCompute::Run() {
auto* out = param.out;
auto* in_data = pad->data<float>();
out->Resize({src->dims()[0], pad->dims()[1]});
auto* out_data = out->mutable_data<float>(TARGET(kCUDA));
const int count = out->numel();
......@@ -59,6 +60,9 @@ void SearchSeqDepaddingCompute::Run() {
int seq_num = pad_seq_offset.size() - 1;
int emb_size = pad->dims()[1];
LoD out_lod;
out_lod.push_back(src_seq_offset);
out->set_lod(out_lod);
std::vector<int> seq_id_map;
for (int i = 0; i < seq_num; i++) {
int cur_len = src_seq_offset[i + 1] - src_seq_offset[i];
......@@ -77,11 +81,12 @@ void SearchSeqDepaddingCompute::Run() {
cuda_stream);
int threads = 512;
ker_sequence_depadding_fwd<<<count, threads, 0, cuda_stream>>>(
int blocks = (count + threads - 1) / threads;
ker_sequence_depadding_fwd<<<blocks, threads, 0, cuda_stream>>>(
out_data, in_data, seq_id_map_data, seq_num, max_len, emb_size, count);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
......
......@@ -120,7 +120,7 @@ void SequenceArithmeticCompute::Run() {
auto x_data = param.X->data<float>();
auto x_lod = param.X->lod()[0];
auto y_data = param.X->data<float>();
auto y_data = param.Y->data<float>();
auto y_lod = param.Y->lod()[0];
auto out_data = param.Out->mutable_data<float>(TARGET(kCUDA));
......@@ -174,7 +174,6 @@ void SequenceArithmeticCompute::Run() {
int seq_num = x_lod.size() - 1;
int count = param.X->numel();
int inner_size = param.X->dims()[1];
switch (param.op_type) {
case 1: // sum
ker_arithmetic_sum<
......
......@@ -24,6 +24,24 @@ namespace cuda {
const int CUDA_NUM_THREADS = 512;
template <typename T>
inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs) {
std::vector<size_t> result;
result.resize(xs[0]->lod()[0].size());
for (size_t i = 1; i < result.size(); ++i) {
size_t sum = 0;
for (size_t j = 0; j < xs.size(); ++j) {
auto& x_lod = xs[j]->lod()[0];
sum += x_lod[i];
}
result[i] = sum;
}
LoD lod;
lod.emplace_back(result);
return lod;
}
template <typename Dtype>
__global__ void ker_sequence_concat(Dtype* out_data,
const uint64_t* in_locate_data,
......@@ -96,6 +114,8 @@ void SequenceConcatCompute::Run() {
IoDirection::HtoD,
stream);
param.Out->set_lod(ConcatLoD<float>(param.X));
int count = param.X[0]->numel();
for (int i = 1; i < param.X.size(); ++i) {
count += param.X[i]->numel();
......
......@@ -254,4 +254,5 @@ REGISTER_LITE_KERNEL(sequence_pool,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
......@@ -42,11 +42,9 @@ __host__ __device__ inline size_t UpperBound(const T* x,
return static_cast<size_t>(first - x);
}
__global__ void SequenceReverseKernelGridIsOne(const float* x,
float* y,
const int64_t* lod,
size_t lod_count,
int64_t row_numel) {
template <typename T>
__global__ void SequenceReverseKernelGridIsOne(
const T* x, T* y, const int64_t* lod, size_t lod_count, int64_t row_numel) {
int64_t idx = static_cast<int64_t>(threadIdx.x);
auto row_idx_x = idx / row_numel;
auto lod_idx = UpperBound(lod, lod_count, row_idx_x);
......@@ -55,8 +53,9 @@ __global__ void SequenceReverseKernelGridIsOne(const float* x,
y[idx_y] = x[idx];
}
__global__ void SequenceReverseKernel(const float* x,
float* y,
template <typename T>
__global__ void SequenceReverseKernel(const T* x,
T* y,
const int64_t* lod,
size_t lod_count,
int64_t row_numel,
......@@ -71,19 +70,20 @@ __global__ void SequenceReverseKernel(const float* x,
}
}
void SequenceReverseCompute::Run() {
auto& param = this->Param<param_t>();
template <typename T, PrecisionType Ptype>
void SequenceReverseCompute<T, Ptype>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
size_t limit = static_cast<size_t>(param.X->numel());
int64_t row_numel = static_cast<int64_t>(limit / param.X->dims()[0]);
const auto* x_data = param.X->data<float>();
auto y_data = param.Out->mutable_data<float>(TARGET(kCUDA));
const auto* x_data = param.X->template data<T>();
auto y_data = param.Out->template mutable_data<T>(TARGET(kCUDA));
CHECK_NE(x_data, y_data)
<< "SequenceReverse Op does not support in-place operation";
const auto lod = param.X->lod()[param.X->lod().size() - 1];
const size_t lod_count = lod.size();
param.Out->set_lod(param.X->lod());
lod_cuda.Resize({static_cast<int64_t>(lod.size())});
int64_t* lod_data = lod_cuda.mutable_data<int64_t>(TARGET(kCUDA));
......@@ -92,11 +92,9 @@ void SequenceReverseCompute::Run() {
sizeof(int64_t) * lod.size(),
IoDirection::HtoD,
stream);
constexpr int num_threads = 1024;
int block_size = limit <= num_threads ? limit : num_threads;
int grid_size = (limit + num_threads - 1) / num_threads;
if (grid_size == 1) {
SequenceReverseKernelGridIsOne<<<1, block_size, 0, stream>>>(
x_data, y_data, lod_data, lod_count, row_numel);
......@@ -104,7 +102,6 @@ void SequenceReverseCompute::Run() {
SequenceReverseKernel<<<grid_size, block_size, 0, stream>>>(
x_data, y_data, lod_data, lod_count, row_numel, limit);
}
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
......@@ -114,12 +111,20 @@ void SequenceReverseCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(sequence_reverse,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::SequenceReverseCompute,
def)
typedef paddle::lite::kernels::cuda::SequenceReverseCompute<float,
PRECISION(kFloat)>
ReverseFp32;
typedef paddle::lite::kernels::cuda::SequenceReverseCompute<int64_t,
PRECISION(kInt64)>
ReverseInt64;
REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kFloat, kNCHW, ReverseFp32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kInt64, kNCHW, ReverseInt64, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
.Finalize();
......@@ -20,8 +20,8 @@ namespace lite {
namespace kernels {
namespace cuda {
class SequenceReverseCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
template <typename T, PrecisionType Ptype>
class SequenceReverseCompute : public KernelLite<TARGET(kCUDA), Ptype> {
public:
using param_t = operators::SequenceReverseParam;
......
......@@ -40,7 +40,7 @@ static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) {
}
TEST(sequence_reverse_cuda, normal) {
SequenceReverseCompute seq_kernel;
SequenceReverseCompute<float, PRECISION(kFloat)> seq_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
......
......@@ -26,8 +26,6 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
const Dtype *input,
const int *gpu_input_offset_l,
const int *gpu_input_offset_r,
const int row_max,
const int col_max,
const int topk_size,
const int *topks,
const int feat_map_num) {
......@@ -35,17 +33,20 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8
int col = gpu_input_offset_r[blockIdx.x + 1] -
gpu_input_offset_r[blockIdx.x]; // 30
int max_k = topks[topk_size - 1];
max_k = max_k < col ? max_k : col;
extern __shared__ Dtype smem[]; // H*W
const Dtype *fm_row_in_data = input +
blockIdx.x * row_max * feat_map_num * col_max +
blockIdx.y * row_max * col_max;
const Dtype *fm_row_in_data = input;
for (int i = 0; i < blockIdx.x; ++i) {
int tmp_row = gpu_input_offset_l[i + 1] - gpu_input_offset_l[i];
int tmp_col = gpu_input_offset_r[i + 1] - gpu_input_offset_r[i];
fm_row_in_data += tmp_row * feat_map_num * tmp_col;
}
fm_row_in_data += blockIdx.y * row * col;
for (int i = threadIdx.x; i < row * col_max; i += blockDim.x) {
for (int i = threadIdx.x; i < row * col; i += blockDim.x) {
smem[i] = fm_row_in_data[i];
}
__syncthreads();
......@@ -56,7 +57,7 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
(gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size +
blockIdx.y * topk_size;
Dtype *smem_start_col = smem + idx * col_max;
Dtype *smem_start_col = smem + idx * col;
int counter = max_k; // topk_size;
Dtype last_max_val = -20000.0;
......@@ -75,7 +76,7 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
if (max_val < -9999.0) { // == -10000.0
max_val = last_max_val;
}
smem_start_col[max_pos] = 10000000.0;
smem_start_col[max_pos] = -10000000.0;
int i = max_k - counter;
for (int c = 0; c < topk_size; c++) {
if (i <= topks[c] - 1) {
......@@ -97,7 +98,6 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
auto &param = this->Param<param_t>();
auto &ctx = this->ctx_->template As<CUDAContext>();
auto cuda_stream = ctx.exec_stream();
int topk_num = param.topks.size();
lite::DDim top_ks_shape(std::vector<int64_t>{topk_num, 1, 1, 1});
_top_ks.Resize(top_ks_shape);
......@@ -107,12 +107,16 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
cudaMemcpyHostToDevice,
cuda_stream);
int width_offset_len = param.X->lod()[0].size();
int width_offset_len = param.COLUMN->lod()[0].size();
lite::DDim width_offset_shape(
std::vector<int64_t>{width_offset_len, 1, 1, 1});
_width_offset.Resize(width_offset_shape);
std::vector<int> width_lod_0(width_offset_len, 0);
for (size_t i = 0; i < param.COLUMN->lod()[0].size(); ++i) {
width_lod_0[i] = static_cast<int>(param.COLUMN->lod()[0][i]);
}
cudaMemcpyAsync(_width_offset.mutable_data<int>(TARGET(kCUDA)),
&(param.X->lod()[0][0]),
&width_lod_0[0],
sizeof(int) * width_offset_len,
cudaMemcpyHostToDevice,
cuda_stream);
......@@ -121,8 +125,12 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
lite::DDim height_offset_shape(
std::vector<int64_t>{height_offset_len, 1, 1, 1});
_height_offset.Resize(height_offset_shape);
std::vector<int> height_lod_0(height_offset_len, 0);
for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) {
height_lod_0[i] = static_cast<int>(param.ROW->lod()[0][i]);
}
cudaMemcpyAsync(_height_offset.mutable_data<int>(TARGET(kCUDA)),
&(param.ROW->lod()[0][0]),
&height_lod_0[0],
sizeof(int) * height_offset_len,
cudaMemcpyHostToDevice,
cuda_stream);
......@@ -136,16 +144,20 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
sizeof(T) * out_tensor->numel(),
cuda_stream);
auto x_dims = x_tensor->dims();
int num = x_dims[0];
int channel = x_dims[1];
int height = x_dims[2];
int width = x_dims[3];
int num = param.ROW->lod()[0].size() - 1;
int channel = param.channel_num;
const int *height_offset = _height_offset.data<int>();
const int *width_offset = _width_offset.data<int>();
int feat_map_size = height * width;
int feat_map_size = 0;
for (size_t i = 0; i < height_lod_0.size() - 1; ++i) {
int height = height_lod_0[i + 1] - height_lod_0[i];
int width = width_lod_0[i + 1] - width_lod_0[i];
if (height * width > feat_map_size) {
feat_map_size = height * width;
}
}
dim3 blocks(num, channel);
dim3 threads(32, 1);
topk_avg_pooling_kernel_by_row_improve<
......@@ -154,11 +166,12 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
in_data,
height_offset,
width_offset,
height,
width,
param.topks.size(),
_top_ks.data<int>(),
param.channel_num);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
......
......@@ -173,9 +173,10 @@ void SoftmaxCompute::Run() {
cudaGetDeviceProperties(&deviceProp, device_id);
size_t sharedmem_size = deviceProp.sharedMemPerBlock;
int max_dimsize = sharedmem_size / sizeof(float) / threads;
auto input_data = param.x->data<float>();
auto output_data = param.output->mutable_data<float>(TARGET(kCUDA));
TargetWrapperCuda::MemsetSync(
output_data, 0, param.output->numel() * sizeof(float));
if (axis_size <= max_dimsize) {
int use_sharemem_size = axis_size * threads * sizeof(float);
sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>(
......@@ -194,7 +195,7 @@ void SoftmaxCompute::Run() {
auto max_data = tmax_data.mutable_data<float>(TARGET(kCUDA));
auto sum_data = tsum_data.mutable_data<float>(TARGET(kCUDA));
//! firstly, get maximum data
float min_data = std::numeric_limits<float>::min();
float min_data = std::numeric_limits<float>::lowest();
softmax_max_kernel<float><<<blocks, threads, 0, stream>>>(total_threads,
input_data,
max_data,
......@@ -217,7 +218,7 @@ void SoftmaxCompute::Run() {
total_threads, output_data, sum_data, inner_num, outer_num, axis_size);
}
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
......@@ -258,4 +259,5 @@ REGISTER_LITE_KERNEL(search_seq_softmax,
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
......@@ -54,8 +54,7 @@ bool SequenceTopkAvgPoolingOpLite::InferShape() const {
vec_out_shape.push_back(channel_num * num_k);
param_.Out->Resize(lite::DDim(vec_out_shape));
auto out_lod = param_.Out->mutable_lod();
*out_lod = param_.X->lod();
param_.Out->set_lod(param_.ROW->lod());
return true;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册