提交 07972467 编写于 作者: Y yangyaming

Enhance sequence_padding functor (CPU and GPU).

上级 7c671466
......@@ -18,128 +18,111 @@ namespace paddle {
namespace operators {
namespace math {
template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor* padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The LoD of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding->dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width].");
const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be the "
"maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be the "
"number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
const T* seq_data = seq.data<T>();
T* padding_data = padding->data<T>();
for (int64_t i = 0; i < max_sequence_length; ++i) {
for (int64_t j = 0; j < num_sequences; ++j) {
int64_t start_pos = abs_offset_lod[level][j];
int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
if (i < sequence_length) {
// i > 0 => sequence_length > 0
T scale =
norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
for (int64_t k = 0; k < sequence_width; ++k) {
padding_data[(i * num_sequences + j) * sequence_width + k] =
seq_data[(start_pos + i) * sequence_width + k] * scale;
// Copies the valid time steps between a packed LoD sequence tensor and a
// dense padded tensor on the CPU.
//
// Template parameters:
//   T               element type of both tensors.
//   padding_layout  BATCH_LENGTH_WIDTH addresses the padded tensor as
//                   [seq_num, max_seq_len, seq_width]; any other value
//                   addresses it as [max_seq_len, seq_num, seq_width].
//
// Parameters:
//   seq_tensor      packed sequences; read when `seq_to_padding` is true,
//                   written otherwise.
//   padding_tensor  dense tensor; written when `seq_to_padding` is true, read
//                   otherwise.
//   abs_offset      start offsets; sequence i occupies rows
//                   [abs_offset[i], abs_offset[i + 1]) of `seq_tensor`.
//   max_seq_len     row stride between sequences in BATCH_LENGTH_WIDTH layout.
//   seq_width       number of elements per time step.
//   seq_to_padding  copy direction (true: sequence -> padded).
//   norm_by_len     when true, every copied element is multiplied by
//                   1 / (length of its sequence).
//
// Only positions j < seq_len are touched; padded positions keep whatever the
// caller stored there beforehand.
template <typename T, PaddingLayout padding_layout>
void CopyDataCPU(framework::LoDTensor* seq_tensor,
                 framework::Tensor* padding_tensor,
                 const framework::Vector<size_t>& abs_offset,
                 const int64_t& max_seq_len, const int64_t& seq_width,
                 bool seq_to_padding, bool norm_by_len) {
  T* seq_data = seq_tensor->data<T>();
  T* padding_data = padding_tensor->data<T>();

  const int64_t seq_num = static_cast<int64_t>(abs_offset.size()) - 1;
  for (int64_t i = 0; i < seq_num; ++i) {
    const int64_t seq_start = static_cast<int64_t>(abs_offset[i]);
    const int64_t seq_len = static_cast<int64_t>(abs_offset[i + 1]) - seq_start;
    const T scale = norm_by_len ? (1.0f / static_cast<T>(seq_len)) : 1.0f;

    for (int64_t j = 0; j < seq_len; ++j) {
      // Row index of time step j of sequence i inside the padded tensor; it
      // does not depend on k, so compute it once per time step.
      const int64_t padding_row = (padding_layout == BATCH_LENGTH_WIDTH)
                                      ? (i * max_seq_len + j)
                                      : (j * seq_num + i);
      T* padding_step = padding_data + padding_row * seq_width;
      T* seq_step = seq_data + (seq_start + j) * seq_width;

      if (seq_to_padding) {
        for (int64_t k = 0; k < seq_width; ++k) {
          padding_step[k] = seq_step[k] * scale;
        }
      } else {
        for (int64_t k = 0; k < seq_width; ++k) {
          seq_step[k] = padding_step[k] * scale;
        }
      }
    }
  }
}
// CPU specialization that expands a packed LoD sequence tensor into a dense,
// padded tensor.
//
// The output is first filled with `padding_value`; CopyDataCPU then copies
// every valid time step over it, scaled by 1 / sequence length when
// `norm_by_times` is true. `padding_layout` selects whether the output is
// validated and addressed as [seq_num, max_seq_len, seq_width]
// (BATCH_LENGTH_WIDTH) or [max_seq_len, seq_num, seq_width]
// (LENGTH_BATCH_WIDTH).
//
// Parameters:
//   context         CPU device context (not used by this specialization).
//   seq_tensor      packed input sequences; must carry a LoD with more than
//                   `lod_level` levels.
//   padding_tensor  pre-allocated 3-D output tensor.
//   padding_value   value stored in positions past the end of a sequence.
//   norm_by_times   scale copied values by 1 / sequence length.
//   lod_level       LoD level that delimits the sequences.
template <typename T, PaddingLayout padding_layout>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T, padding_layout> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& seq_tensor,
                  framework::Tensor* padding_tensor,
                  T padding_value = static_cast<T>(0),
                  bool norm_by_times = false, size_t lod_level = 0) {
    ValidateLoD(seq_tensor, lod_level);

    const auto& lod = seq_tensor.lod();
    // Hold the converted LoD in a named local so the offsets referenced below
    // stay alive for the whole call; a reference bound straight into the
    // call's result would dangle if that result is a temporary.
    const framework::LoD abs_lod = framework::ToAbsOffset(lod);
    const auto& abs_offset = abs_lod[lod_level];

    auto seq_dims = seq_tensor.dims();
    auto padding_dims = padding_tensor->dims();
    const int64_t max_seq_len = MaximumSequenceLength(lod, lod_level);
    const int64_t seq_num = abs_offset.size() - 1;
    const int64_t seq_width = seq_tensor.numel() / seq_dims[0];

    ValidateShape(seq_dims, abs_offset.back(), padding_dims, max_seq_len,
                  seq_num, seq_width, padding_layout);

    // Element-wise fill: memset works on bytes, so it cannot store an
    // arbitrary value of type T.
    T* padding_data = padding_tensor->data<T>();
    const int64_t numel = max_seq_len * seq_num * seq_width;
    for (int64_t idx = 0; idx < numel; ++idx) {
      padding_data[idx] = padding_value;
    }

    // CopyDataCPU takes a mutable pointer for both directions; with
    // seq_to_padding == true it only reads from the sequence tensor.
    CopyDataCPU<T, padding_layout>(
        const_cast<framework::LoDTensor*>(&seq_tensor), padding_tensor,
        abs_offset, max_seq_len, seq_width, true /* seq_to_padding */,
        norm_by_times);
  }
};
// CPU specialization that gathers the valid time steps of a dense, padded
// tensor back into a packed LoD sequence tensor.
//
// `seq_tensor` must already carry the LoD (with more than `lod_level` levels)
// that describes the sequence boundaries. Its data is zeroed and then
// overwritten by CopyDataCPU, scaled by 1 / sequence length when
// `norm_by_times` is true. `padding_layout` selects whether `padding_tensor`
// is validated and addressed as [seq_num, max_seq_len, seq_width]
// (BATCH_LENGTH_WIDTH) or [max_seq_len, seq_num, seq_width]
// (LENGTH_BATCH_WIDTH).
template <typename T, PaddingLayout padding_layout>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T, padding_layout> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  framework::LoDTensor* seq_tensor,
                  const framework::Tensor& padding_tensor,
                  bool norm_by_times = false, size_t lod_level = 0) {
    ValidateLoD(*seq_tensor, lod_level);

    const auto& lod = seq_tensor->lod();
    // Hold the converted LoD in a named local so the offsets referenced below
    // stay alive for the whole call; a reference bound straight into the
    // call's result would dangle if that result is a temporary.
    const framework::LoD abs_lod = framework::ToAbsOffset(lod);
    const auto& abs_offset = abs_lod[lod_level];

    auto& seq_dims = seq_tensor->dims();
    auto& padding_dims = padding_tensor.dims();
    const int64_t max_seq_len = MaximumSequenceLength(lod, lod_level);
    const int64_t seq_num = abs_offset.size() - 1;
    const int64_t seq_width = seq_tensor->numel() / seq_dims[0];

    ValidateShape(seq_dims, abs_offset.back(), padding_dims, max_seq_len,
                  seq_num, seq_width, padding_layout);

    // memset takes a byte value; the literal 0 clears every byte of the
    // buffer.
    T* seq_data = seq_tensor->data<T>();
    memset(seq_data, 0, seq_tensor->numel() * sizeof(T));

    // CopyDataCPU takes a mutable pointer for both directions; with
    // seq_to_padding == false it only reads from the padded tensor.
    CopyDataCPU<T, padding_layout>(
        seq_tensor, const_cast<framework::Tensor*>(&padding_tensor), abs_offset,
        max_seq_len, seq_width, false /* seq_to_padding */, norm_by_times);
  }
};
// Explicit instantiations of the CPU functors for float in the
// LENGTH_BATCH_WIDTH layout. The functor templates take three arguments
// (device context, element type, layout), so every instantiation must name
// the layout.
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float,
                                       LENGTH_BATCH_WIDTH>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float,
                                         LENGTH_BATCH_WIDTH>;
} // namespace math
} // namespace operators
......
......@@ -19,87 +19,76 @@ namespace paddle {
namespace operators {
namespace math {
// CUDA kernel that moves data between a packed sequence buffer and a dense
// padded buffer.
//
// Template parameters:
//   T        element type.
//   Padding  true: sequence -> padded (positions past the end of a sequence
//            receive `padding_value`); false: padded -> sequence.
//
// Work mapping (as indexed below):
//   blockIdx.y                                selects the sequence,
//   blockIdx.x * blockDim.y + threadIdx.y     selects the time step,
//   threadIdx.x, stepping by blockDim.x       walks the seq_width elements.
//
// Parameters:
//   padding_data    dense buffer; element layout chosen by `padding_layout`.
//   seq_data        packed buffer with seq_width elements per time step.
//   abs_offset      start offsets; sequence s occupies time steps
//                   [abs_offset[s], abs_offset[s + 1]) of `seq_data`.
//   seq_num         number of sequences.
//   max_seq_len     number of time steps per sequence in the padded buffer.
//   seq_width       number of elements per time step.
//   padding_layout  LENGTH_BATCH_WIDTH or BATCH_LENGTH_WIDTH.
//   norm_by_times   scale copied values by 1 / sequence length.
//   padding_value   fill value for padded positions (Padding == true only).
//
// All scalar arguments are taken by value: a reference parameter would carry
// the address of a host variable into device code.
template <typename T, bool Padding>
__global__ void SequencePaddingKernel(
    T* padding_data, T* seq_data, const size_t* abs_offset,
    const size_t seq_num, const size_t max_seq_len, const size_t seq_width,
    const PaddingLayout padding_layout, bool norm_by_times = false,
    const T padding_value = 0) {
  const size_t padding_idx = blockIdx.y;
  const size_t seq_start = abs_offset[padding_idx];
  const size_t seq_len = abs_offset[padding_idx + 1] - seq_start;

  const size_t seq_idx = blockIdx.x * blockDim.y + threadIdx.y;

  const size_t seq_offset = (seq_start + seq_idx) * seq_width;

  size_t padding_offset = 0;
  if (padding_layout == LENGTH_BATCH_WIDTH) {
    padding_offset = (seq_idx * seq_num + padding_idx) * seq_width;
  } else {
    padding_offset = (padding_idx * max_seq_len + seq_idx) * seq_width;
  }

  if (seq_idx < seq_len) {
    const T scale = norm_by_times ? (1.0f / static_cast<T>(seq_len)) : 1.0f;
    if (Padding) {
      /* sequence -> padding */
      for (size_t i = threadIdx.x; i < seq_width; i += blockDim.x) {
        padding_data[padding_offset + i] = scale * seq_data[seq_offset + i];
      }
    } else {
      /* padding -> sequence */
      for (size_t i = threadIdx.x; i < seq_width; i += blockDim.x) {
        seq_data[seq_offset + i] = scale * padding_data[padding_offset + i];
      }
    }
  } else if (seq_idx < max_seq_len) {
    if (Padding) {
      /* sequence -> padding: fill the tail of a short sequence */
      for (size_t i = threadIdx.x; i < seq_width; i += blockDim.x) {
        padding_data[padding_offset + i] = padding_value;
      }
    }
  }
}
template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
template <typename T, PaddingLayout padding_layout>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T, padding_layout> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor* padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The lod of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding->dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width].");
int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be the "
"maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be the "
"number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(seq, context.GetPlace(), context, padding);
padding->Resize(padding_dims);
const framework::LoDTensor& seq_tensor,
framework::Tensor* padding_tensor,
T padding_value = static_cast<T>(0),
bool norm_by_times = false, size_t lod_level = 0) {
ValidateLoD(seq_tensor, lod_level);
auto& lod = seq_tensor.lod();
auto& abs_offset = framework::ToAbsOffset(lod)[lod_level];
auto seq_dims = seq_tensor.dims();
auto padding_dims = padding_tensor->dims();
int64_t max_seq_len = MaximumSequenceLength(lod, lod_level);
const int64_t seq_num = abs_offset.size() - 1;
const int64_t seq_width = seq_tensor.numel() / seq_dims[0];
ValidateShape(seq_dims, abs_offset.back(), padding_dims, max_seq_len,
seq_num, seq_width, padding_layout);
if (!norm_by_times && seq_num == 1UL) {
TensorCopy(seq_tensor, context.GetPlace(), context, padding_tensor);
padding_tensor->Resize(padding_dims);
return;
}
......@@ -109,72 +98,46 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
std::min(((((seq_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences;
size_t grid_dim_x = (max_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq.data<T>();
T* padding_data = padding->data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
const T* seq_data = seq_tensor.data<T>();
T* padding_data = padding_tensor->data<T>();
SequencePaddingKernel<T, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
}
abs_offset.CUDAData(context.GetPlace()), seq_num, max_seq_len,
seq_width, padding_layout, norm_by_times, padding_value);
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
template <typename T, PaddingLayout padding_layout>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T,
padding_layout> {
public:
void operator()(const platform::CUDADeviceContext& context,
framework::LoDTensor* seq, const framework::Tensor& padding,
bool norm_by_times) {
auto lod = seq->lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The lod of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq->dims();
PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequnece_length, num_sequences, sequence_width].");
int64_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq.");
const int64_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq->numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(padding, context.GetPlace(), context, seq);
seq->Resize(seq_dims);
framework::LoDTensor* seq_tensor,
const framework::Tensor& padding_tensor,
bool norm_by_times = false, size_t lod_level = 0) {
ValidateLoD(*seq_tensor, lod_level);
auto& lod = seq_tensor->lod();
auto& abs_offset = framework::ToAbsOffset(lod)[lod_level];
auto seq_dims = seq_tensor->dims();
auto padding_dims = padding_tensor.dims();
int64_t max_seq_len = MaximumSequenceLength(lod, lod_level);
int64_t seq_num = abs_offset.size() - 1;
int64_t seq_width = seq_tensor->numel() / seq_dims[0];
if (!norm_by_times && seq_num == 1UL) {
TensorCopy(padding_tensor, context.GetPlace(), context, seq_tensor);
seq_tensor->Resize(seq_dims);
return;
}
......@@ -184,32 +147,28 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
std::min(((((seq_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences;
size_t grid_dim_x = (max_seq_len + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = seq_num;
dim3 grid(grid_dim_x, grid_dim_y);
const T* padding_data = padding.data<T>();
T* seq_data = seq->data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const T* padding_data = padding_tensor.data<T>();
T* seq_data = seq_tensor->data<T>();
SequencePaddingKernel<T, 1><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
max_sequence_length, num_sequences);
}
abs_offset.CUDAData(context.GetPlace()), seq_num, max_seq_len,
seq_width, padding_layout, norm_by_times);
}
};
// Explicit instantiations of the CUDA functors for float in the
// LENGTH_BATCH_WIDTH layout. The functor templates take three arguments
// (device context, element type, layout), so every instantiation must name
// the layout.
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float,
                                       LENGTH_BATCH_WIDTH>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float,
                                         LENGTH_BATCH_WIDTH>;
} // namespace math
} // namespace operators
......
......@@ -22,17 +22,50 @@ namespace paddle {
namespace operators {
namespace math {
enum PaddingLayout { BATCH_LENGTH_WIDTH, LENGTH_BATCH_WIDTH };
// Returns the length of the longest sequence described by LoD level `level`.
//
// The level is converted with framework::ToAbsOffset first; the length of
// sequence i is the difference between consecutive offsets i and i + 1.
// Returns 0 when the level holds fewer than two offsets (no sequence).
//
// `level` is not range-checked here; callers validate it beforehand.
inline static size_t MaximumSequenceLength(const framework::LoD& lod,
                                           const size_t level) {
  // Named local keeps the converted offsets alive while they are read and
  // avoids copying the selected level.
  const framework::LoD abs_lod = framework::ToAbsOffset(lod);
  const auto& abs_offset = abs_lod[level];

  // Iterate over offset pairs (i - 1, i) so an empty level never produces a
  // wrapped-around "size() - 1" loop bound.
  const size_t offset_count = lod[level].size();
  size_t max_seq_len = 0;
  for (size_t i = 1; i < offset_count; ++i) {
    max_seq_len = std::max(max_seq_len, abs_offset[i] - abs_offset[i - 1]);
  }
  return max_seq_len;
}
// Checks that `lod_level` names an existing level of `seq_tensor`'s LoD,
// i.e. that it is smaller than the number of levels; the check is enforced
// through PADDLE_ENFORCE.
inline static void ValidateLoD(const framework::LoDTensor& seq_tensor,
                               const size_t& lod_level) {
  const size_t lod_depth = seq_tensor.lod().size();
  const bool level_in_range = lod_depth > lod_level;
  PADDLE_ENFORCE(level_in_range,
                 "Invalid `lod_level` which should be at least 0 and less "
                 "than maximum lod level of `seq_tensor`.");
}
// Checks that a packed sequence tensor and a padded tensor have compatible
// shapes.
//
// Enforced conditions:
//   * the first dimension of `seq_tensor_dims` equals `abs_offset_back_value`
//     (the last absolute offset, i.e. the total number of time steps);
//   * `padding_tensor_dims` has rank 3;
//   * `padding_tensor_dims` equals [seq_num, max_seq_len, seq_width] for
//     BATCH_LENGTH_WIDTH, or [max_seq_len, seq_num, seq_width] for
//     LENGTH_BATCH_WIDTH.
// Any other layout value raises via PADDLE_THROW.
inline static void ValidateShape(const framework::DDim& seq_tensor_dims,
                                 const size_t& abs_offset_back_value,
                                 const framework::DDim& padding_tensor_dims,
                                 const int64_t& max_seq_len,
                                 const int64_t& seq_num,
                                 const int64_t& seq_width,
                                 const PaddingLayout& padding_layout) {
  PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]),
                    abs_offset_back_value,
                    "The 1st dimension of `seq_tensor` should be equal to "
                    "sum of lengths of all sequences.");

  PADDLE_ENFORCE_EQ(padding_tensor_dims.size(), 3UL,
                    "`padding_tensor` should be a 3-D tensor.");

  if (padding_layout == BATCH_LENGTH_WIDTH) {
    PADDLE_ENFORCE_EQ(padding_tensor_dims,
                      framework::make_ddim({seq_num, max_seq_len, seq_width}));
  } else if (padding_layout == LENGTH_BATCH_WIDTH) {
    PADDLE_ENFORCE_EQ(padding_tensor_dims,
                      framework::make_ddim({max_seq_len, seq_num, seq_width}));
  } else {
    PADDLE_THROW("Unsupported padding layout.");
  }
}
/*
......@@ -61,18 +94,23 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod,
*
* \note transposition is also done in this functor.
*/
template <typename DeviceContext, typename T>
template <typename DeviceContext, typename T, PaddingLayout padding_layout>
class PaddingLoDTensorFunctor {
public:
void operator()(const DeviceContext& context, const framework::LoDTensor& seq,
framework::Tensor* padding, bool norm_by_times);
void operator()(const DeviceContext& context,
const framework::LoDTensor& seq_tensor,
framework::Tensor* padding_tensor,
T padding_value = static_cast<T>(0),
bool norm_by_times = false, size_t lod_level = 0);
};
template <typename DeviceContext, typename T>
template <typename DeviceContext, typename T, PaddingLayout padding_layout>
class UnpaddingLoDTensorFunctor {
public:
void operator()(const DeviceContext& context, framework::LoDTensor* seq,
const framework::Tensor& padding, bool norm_by_times);
void operator()(const DeviceContext& context,
framework::LoDTensor* seq_tensor,
const framework::Tensor& padding_tensor,
bool norm_by_times = false, size_t lod_level = 0);
};
} // namespace math
......
......@@ -32,7 +32,11 @@ class SequencePadOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(x_dims.size(), 2,
"Only support 2-D tensor, rank of Input(X) should be 2.");
auto out_dims = x_dims;
int lod_level = ctx->Attrs().Get<int>("lod_level");
int64_t max_len = -1;
int64_t seq_num = -1;
int x_lod_size = -1;
if (ctx->IsRuntime()) {
framework::Variable* x_var =
......@@ -40,27 +44,31 @@ class SequencePadOp : public framework::OperatorWithKernel {
auto& x_lod = x_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE_GE(x_lod.size(), 1,
"Input(X) should be sequences containing lod.");
x_lod_size = x_lod.size();
auto x_abs_offset = framework::ToAbsOffset(x_lod)[lod_level];
PADDLE_ENFORCE_EQ(x_dims[0], static_cast<int64_t>(x_abs_offset.back()),
"The first dimension of `X` should be equal to sum "
"of all sequences' length.");
auto last_level_lod = x_lod[x_lod.size() - 1];
size_t max_len = 0;
seq_num = x_abs_offset.size() - 1;
for (size_t i = 1; i < last_level_lod.size(); ++i) {
auto seq_len = last_level_lod[i] - last_level_lod[i - 1];
for (size_t i = 1; i <= seq_num; ++i) {
int64_t seq_len = x_abs_offset[i] - x_abs_offset[i - 1];
max_len = max_len < seq_len ? seq_len : max_len;
}
out_dims[0] = max_len * (last_level_lod.size() - 1);
} else {
framework::VarDesc* x_desc =
boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1,
"Input(X) should be sequences containing lod.");
out_dims[0] = -1;
x_lod_size = x_desc->GetLoDLevel();
}
ctx->SetOutputDim("Out", out_dims);
PADDLE_ENFORCE(lod_level >= 0 && lod_level < x_lod_size,
"Invalid `lod_level` which should be at least 0 and less "
"than maximum lod level of `X`");
ctx->SetOutputDim("Out", {seq_num, max_len, x_dims[1]});
}
protected:
......@@ -84,9 +92,11 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) Output variable which would be a common tensor "
"without lod. Each sequence would be padded to the maximum "
"length.");
AddAttr<float>("lod_level",
"(int, default 0) Specify which level lod to referred to.");
AddAttr<float>("pad_value",
"(float, default 0.0) Value to be padded "
"to the end of each sequence.");
"(float, default 0.0) Specify which value to be padded to "
"the end of each sequence.");
AddComment(R"DOC(
)DOC");
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
namespace paddle {
namespace operators {
......@@ -23,39 +24,68 @@ namespace operators {
using LoDTensor = framework::LoDTensor;
using LoD = framework::LoD;
// @TODO clean code
// Bundles the arguments of a copy between a LoD tensor and a padded tensor.
//
// NOTE: operator() is currently a no-op; its intended body is commented out
// below, so constructing and invoking this functor has no effect on either
// tensor.
//
// `ref_lod_` and `ctx_` are stored as references: the objects passed to the
// constructor must outlive the functor.
template <typename DeviceContext, typename T>
struct CopyFunctor {
// Tensor holding the packed (LoD) data.
LoDTensor* lod_tensor_;
// Tensor holding the padded data.
LoDTensor* pad_tensor_;
// Reference LoD supplied by the caller (not owned).
const LoD& ref_lod_;
// Device context supplied by the caller (not owned).
const DeviceContext& ctx_;
// Copy direction flag: true is meant to select LoD -> padded.
bool is_lod_to_pad_;
// Stores the arguments as-is; note that the parameter order
// (lod_tensor, ref_lod, pad_tensor, ...) differs from the member order.
CopyFunctor(LoDTensor* lod_tensor, const LoD& ref_lod, LoDTensor* pad_tensor,
const DeviceContext& ctx, bool is_lod_to_pad)
: lod_tensor_(lod_tensor),
pad_tensor_(pad_tensor),
ref_lod_(ref_lod),
ctx_(ctx),
is_lod_to_pad_(is_lod_to_pad) {}
// Does nothing at present: the slice-by-slice copy below is disabled.
// NOTE(review): the disabled code refers to `ctx` rather than the member
// `ctx_`, so it would need adjusting before being re-enabled.
void operator()() const {
/*
auto seq_num = ref_lod_.size() - 1;
auto max_len = pad_tensor_->dims()[0] / seq_num;
PADDLE_ENFORCE_EQ(max_len * seq_num, pad_tensor_->dims()[0],
"First dimension of padded tensor should be equal to "
"maximum sequence length mulplied by sequence number.");
for (size_t i = 1; i < ref_lod_.size(); ++i) {
auto seq_start = ref_lod_[i - 1];
auto seq_end = ref_lod_[i];
auto pad_start = (i - 1) * max_len;
auto pad_end = pad_start + (seq_end - seq_start);
auto sub_lod_tensor = lod_tensor_->Slice(seq_start, seq_end);
auto sub_pad_tensor = pad_tensor_->Slice(pad_start, pad_end);
if (is_lod_to_pad_) {
framework::TensorCopy(sub_lod_tensor, ctx.GetPlace(), &sub_pad_tensor);
} else {
framework::TensorCopy(sub_pad_tensor, ctx.GetPlace(), &sub_lod_tensor);
}
}
*/
}
};
// Forward kernel of the sequence_pad operator.
//
// NOTE: as written, Compute only fetches input "X"; everything else is
// inside a block comment, so the kernel produces no output. The disabled
// text mixes two drafts (a call to math::PaddingLoDTensorFunctor and a manual
// slice-and-copy loop) and is not compilable as-is.
template <typename DeviceContext, typename T>
class SequencePadOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Input sequences; currently unused because the body below is disabled.
auto* x_ptr = ctx.Input<LoDTensor>("X");
/*
auto* x = ctx.Input<LoDTensor>("X");
auto* out_ptr = ctx.Output<LoDTensor>("Out");
out_ptr->mutable_data<T>(ctx.GetPlace());
// Resize();
T pad_value = static_cast<T>(ctx.Attr<float>("pad_value"));
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *x, *, false);
math::SetConstant<DeviceContext, T> set_func;
set_func(ctx.template device_context<DeviceContext>(), out_ptr, pad_value);
auto& x_lod = x_ptr->lod();
auto& x_last_level_lod = x_lod[x_lod.size() - 1];
auto seq_num = x_last_level_lod.size() - 1;
auto max_len = out_ptr->dims()[0] / seq_num;
PADDLE_ENFORCE_EQ(max_len * seq_num, out_ptr->dims()[0],
"First dimension of `Out` should be equal to "
"maximum length mulplied by sequence number.");
for (size_t i = 1; i < x_last_level_lod.size(); ++i) {
auto x_start = x_last_level_lod[i - 1];
auto x_end = x_last_level_lod[i];
auto out_start = (i - 1) * max_len;
auto out_end = out_start + (x_end - x_start);
auto x_sub_tensor = x_ptr->Slice(x_start, x_end);
auto out_sub_tensor = out_ptr->Slice(out_start, out_end);
framework::TensorCopy(x_sub_tensor, ctx.GetPlace(), &out_sub_tensor);
}
*/
}
};
......@@ -63,33 +93,26 @@ template <typename DeviceContext, typename T>
// Backward kernel of the sequence_pad operator.
//
// NOTE: the whole body of Compute is inside a block comment, so the kernel
// currently does nothing. The disabled text mixes two drafts (a manual
// slice-and-copy loop and a CopyFunctor call) and is not compilable as-is.
class SequencePadGradOpKernel : public framework::OpKernel<T> {
public:
// No-op at present; see the note on the class.
void Compute(const framework::ExecutionContext& ctx) const override {
/*
auto* x_ptr = ctx.Input<LoDTensor>("X");
auto* g_out_ptr = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
auto* g_x_ptr = ctx.Output<LoDTensor>(framework::GradVarName("X"));
math::SetConstant<DeviceContext, T> set_func;
set_func(ctx.template device_context<DeviceContext>(), g_x_ptr,
set_func(ctx.template device_context<DeviceContext>(),
g_x_ptr,
static_cast<T>(0));
auto& x_lod = x_ptr->lod();
auto& x_last_level_lod = x_lod[x_lod.size() - 1];
auto seq_num = x_last_level_lod.size() - 1;
int64_t max_len = g_out_ptr->dims()[0] / seq_num;
PADDLE_ENFORCE_EQ(max_len * seq_num, g_out_ptr->dims()[0],
"First dimension of `Out` should be equal to "
"maximum length mulplied by sequence number.");
for (size_t i = 1; i < x_last_level_lod.size(); ++i) {
auto x_start = x_last_level_lod[i - 1];
auto x_end = x_last_level_lod[i];
auto out_start = (i - 1) * max_len;
auto out_end = out_start + (x_end - x_start);
auto g_out_sub = g_out_ptr->Slice(out_start, out_end);
auto g_x_sub = g_x_ptr->Slice(x_start, x_end);
framework::TensorCopy(g_x_sub, ctx.GetPlace(), &g_out_sub);
}
CopyFunctor copy_func<DeviceContext, T>(g_out_ptr,
x_last_level_lod,
g_x_ptr,
ctx,
false);
copy_func();
*/
}
};
......
......@@ -161,7 +161,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
math::PaddingLoDTensorFunctor<DeviceContext, T, math::LENGTH_BATCH_WIDTH>()(
ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
false);
const T* warpctc_logits_data = warpctc_logits.data<T>();
......@@ -216,7 +216,8 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
math::UnpaddingLoDTensorFunctor<DeviceContext, T,
math::LENGTH_BATCH_WIDTH>()(
ctx.template device_context<DeviceContext>(), logits_grad,
*warpctc_grad, norm_by_times);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册