diff --git a/paddle/operators/ctc_greedy_decode_op.cc b/paddle/operators/ctc_decode_op.cc similarity index 67% rename from paddle/operators/ctc_greedy_decode_op.cc rename to paddle/operators/ctc_decode_op.cc index 3c9b705f7f61c334236e3b44c8e8967cbdbc3d46..b290b11d1d1e80ec403429211ab6ea2bde9fc934 100644 --- a/paddle/operators/ctc_greedy_decode_op.cc +++ b/paddle/operators/ctc_decode_op.cc @@ -29,14 +29,8 @@ class CTCGreedyDecodeOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("Input"); - int sequence_width = - static_cast(framework::product(input_dims) / input_dims[0]); - int blank = ctx->Attrs().Get("blank"); - PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width), - "The value of Attr(blank) should be in interval [0, %d).", - sequence_width); // TODO(wanghaoshuang): it is tricky to set the wrong dimension here. - ctx->SetOutputDim("Output", {input_dims[0], 1}); + ctx->SetOutputDim("Output", input_dims); } protected: @@ -53,25 +47,37 @@ class CTCGreedyDecodeOpMaker : public framework::OpProtoAndCheckerMaker { CTCGreedyDecodeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", - "(LodTensor, default: LoDTensor), the unscaled " - "probabilities of variable-length sequences, which is a 2-D " - "Tensor with LoD information. It's shape is " - "[Lp, num_classes + 1], where Lp is the sum of all input " - "sequences' length and num_classes is the true number of classes " - "(not including the blank label)."); - AddOutput("Output", "(Tensor, default: Tensor), the decode result "); + "(LodTensor, default: LoDTensor), Its shape is " + "[Lp, 1], where Lp is the sum of all input sequences' length."); + AddOutput("Output", "(Tensor, default: Tensor), The decode result."); AddAttr("blank", "(int, default: 0), the blank label setted in Connectionist " - "Temporal Classification (CTC) op, and it is in the " - "half-opened interval [0, num_classes + 1).") + "Temporal Classification (CTC) op.") .SetDefault(0); AddAttr("merge_repeated", "(bool, default: true), whether to " "merge repeated elements between two blanks. ") .SetDefault(true); AddComment(R"DOC( -CTCGreedyDecoder is an implementation of the simple best path decoding -algorithm, selecting at each timestep the most likely class at each timestep. +CTCDecoder is used to merge repeated elements between two blanks +and then delete all blanks in sequence. + +Given: + Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, + 6, 0, 0, 7, 7, 7, 0] + Input.dims = {18, 1} + Input.LoD = [[0, 11, 18]] + +And: + blank = 0 + merge_repeated = True + +Then: + Output.data = [1, 2, 4, 4, 5, 6, + 6, 7] + Output.dims = {8, 1} + Output.LoD = [[0, 6, 8]] + )DOC"); } }; @@ -85,4 +91,4 @@ REGISTER_OPERATOR(ctc_greedy_decode, ops::CTCGreedyDecodeOp, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( ctc_greedy_decode, - ops::CTCGreedyDecodeKernel); + ops::CTCGreedyDecodeKernel); diff --git a/paddle/operators/ctc_greedy_decode_op.cu b/paddle/operators/ctc_decode_op.cu similarity index 60% rename from paddle/operators/ctc_greedy_decode_op.cu rename to paddle/operators/ctc_decode_op.cu index 43a78745aca2f625c2ee64ae30197240fa1701fb..e9cdad7c26b17999c08cae4fe46122cf534fb891 100644 --- a/paddle/operators/ctc_greedy_decode_op.cu +++ b/paddle/operators/ctc_decode_op.cu @@ -16,62 +16,20 @@ limitations under the License. */ #include #include #include "paddle/operators/ctc_greedy_decode_op.h" -#include "paddle/platform/cuda_helper.h" -#include "paddle/platform/gpu_info.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -__device__ static float atomicMaxF(float* address, float val) { - int* address_as_i = (int*)address; - int old = *address_as_i, assumed; - do { - assumed = old; - old = ::atomicCAS(address_as_i, assumed, - __float_as_int(::fmaxf(val, __int_as_float(assumed)))); - } while (assumed != old); - return __int_as_float(old); -} - -template -__global__ void ArgmaxCudaKernel(const size_t seq_width, const T* logits, - int* output) { - T local_max_value = 0; - int local_max_index = 0; - __shared__ T max_value; - if (threadIdx.x == 0) { - max_value = 0; - } - __syncthreads(); - - for (int i = threadIdx.x; i < seq_width; i += BlockSize) { - T value = logits[blockIdx.x * seq_width + i]; - if (value > local_max_value) { - local_max_value = value; - local_max_index = i; - } - } - - atomicMaxF(&max_value, local_max_value); - - __syncthreads(); - - if (local_max_value == max_value) { - output[blockIdx.x] = local_max_index; - } -} template -__global__ void MergeAndDelCudaKernel(const int64_t num_token, int* tokens, +__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, const size_t num_seq, size_t* lod0, const int blank, const int merge_repeated, - size_t* out_lod0, int* output) { + size_t* out_lod0, T* output) { int ouput_idx = 0; out_lod0[0] = 0; for (int i = 0; i < num_seq; ++i) { - int pre_token = -1; + T pre_token = -1; for (int j = lod0[i]; j < lod0[i + 1]; ++j) { if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { output[ouput_idx] = tokens[j]; @@ -89,44 +47,39 @@ class CTCGreedyDecodeOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); + const size_t level = 0; auto* input = ctx.Input("Input"); auto* output = ctx.Output("Output"); + auto input_lod = framework::ToAbsOffset(input->lod()); + const T* tokens = input->data(); const int64_t num_tokens = input->dims()[0]; - const size_t seq_width = input->numel() / num_tokens; - const T* logits = input->data(); - Tensor tmp; - int* tokens = tmp.mutable_data({num_tokens, 1}, ctx.GetPlace()); - // get argmax - // platform::GpuMemsetAsync(args, 0, sizeof(float), stream); - - auto stream = ctx.cuda_device_context().stream(); - ArgmaxCudaKernel<<< - num_tokens, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(seq_width, logits, - tokens); - - const size_t level = 0; - auto input_lod = framework::ToAbsOffset(input->lod()); const size_t num_seq = input_lod[level].size() - 1; + const int blank = ctx.Attr("blank"); const int merge_repeated = static_cast(ctx.Attr("merge_repeated")); + // prepare a lod to record lod information while merging elements thrust::device_vector dev_out_lod0(input_lod[level].size()); size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data()); - int* output_data = - output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + // merge elements and delete blank + T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + + auto stream = ctx.cuda_device_context().stream(); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( num_tokens, tokens, num_seq, input_lod[level].data(), blank, merge_repeated, dev_out_lod0_ptr, output_data); + // set output lod thrust::host_vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); framework::LoD out_lod; out_lod.push_back(host_out_lod0); output->set_lod(out_lod); + // resize output dims output->Resize({static_cast(host_out_lod0.back()), 1}); } }; @@ -135,4 +88,4 @@ class CTCGreedyDecodeOpCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL(ctc_greedy_decode, - paddle::operators::CTCGreedyDecodeOpCUDAKernel); + paddle::operators::CTCGreedyDecodeOpCUDAKernel); diff --git a/paddle/operators/ctc_greedy_decode_op.h b/paddle/operators/ctc_decode_op.h similarity index 75% rename from paddle/operators/ctc_greedy_decode_op.h rename to paddle/operators/ctc_decode_op.h index f12ea6c541b699f6d3cb4183d556318f28153662..30bb53e157f19e1c98d9945deee01a5cb2595952 100644 --- a/paddle/operators/ctc_greedy_decode_op.h +++ b/paddle/operators/ctc_decode_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/framework/op_registry.h" -#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace operators { @@ -30,8 +29,9 @@ class CTCGreedyDecodeKernel : public framework::OpKernel { auto* input = ctx.Input("Input"); auto* output = ctx.Output("Output"); const size_t level = 0; - auto input_lod = framework::ToAbsOffset(input->lod()); + + // check input dims and lod auto input_dims = input->dims(); PADDLE_ENFORCE_EQ(input_dims[0], static_cast(input_lod[level].back()), @@ -39,38 +39,36 @@ class CTCGreedyDecodeKernel : public framework::OpKernel { "the sum of all sequences' lengths."); const size_t num_sequences = input_lod[level].size() - 1; - const size_t sequence_width = input->numel() / input_dims[0]; size_t blank = static_cast(ctx.Attr("blank")); bool merge_repeated = ctx.Attr("merge_repeated"); + + // merge repeated tokens and delete blank std::vector> pathes(num_sequences); std::vector output_lod0(1, 0); - const T* input_data = input->data(); - Eigen::Map< - Eigen::Matrix> - input_mat(const_cast(input_data), input->numel() / sequence_width, - sequence_width); - - size_t max_class_idx; - size_t prev_class_idx = -1; for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; for (size_t i = input_lod[level][seq_idx]; i < input_lod[level][seq_idx + 1]; ++i) { - input_mat.row(i).maxCoeff(&max_class_idx); - if (max_class_idx != blank && - !(merge_repeated && max_class_idx == prev_class_idx)) { - pathes[seq_idx].push_back(max_class_idx); + if (input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + pathes[seq_idx].push_back(input_data[i]); } - prev_class_idx = max_class_idx; + prev_token = input_data[i]; } output_lod0.push_back(output_lod0.back() + pathes[seq_idx].size()); } + + // set output lod framework::LoD output_lod; output_lod.push_back(output_lod0); output->set_lod(output_lod); - int64_t num_step = static_cast(output_lod0.back()); - int* output_data = output->mutable_data({num_step, 1}, ctx.GetPlace()); + // resize output dims + T* output_data = output->mutable_data( + {static_cast(output_lod0.back()), 1}, ctx.GetPlace()); + + // copy result to output for (int i = 0; i < num_sequences; ++i) { memcpy(output_data + output_lod0[i], pathes[i].data(), sizeof(int) * pathes[i].size()); diff --git a/python/paddle/v2/fluid/tests/test_ctc_decode.py b/python/paddle/v2/fluid/tests/test_ctc_decode.py new file mode 100644 index 0000000000000000000000000000000000000000..3b7486cfb98098facf05106eb2ff668959bb35d5 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_ctc_decode.py @@ -0,0 +1,62 @@ +import sys +import unittest +import numpy as np +from op_test import OpTest +from test_softmax_op import stable_softmax + + +def CTCDecode(input, lod, blank, merge_repeated): + lod0 = lod[0] + result = [] + for i in range(len(lod0) - 1): + prev_token = -1 + for j in range(lod0[i], lod0[i + 1]): + token = input[j][0] + if (token != blank) and not (merge_repeated and + token == prev_token): + result.append(token) + prev_token = token + result = np.array(result).reshape([len(result), 1]).astype("int32") + return result + + +class TestCTCDecodeOp(OpTest): + def config(self): + self.op_type = "ctc_greedy_decode" + self.input_lod = [[0, 11, 18]] + self.blank = 0 + self.merge_repeated = False + self.input = np.array( + [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape( + [18, 1]).astype("int32") + + def setUp(self): + self.config() + output = CTCDecode(self.input, self.input_lod, self.blank, + self.merge_repeated) + + self.inputs = {"Input": (self.input, self.input_lod), } + self.outputs = {"Output": output} + self.attrs = { + "blank": self.blank, + "merge_repeated": self.merge_repeated + } + + def test_check_output(self): + self.check_output() + pass + + +class TestCTCDecodeOpCase1(TestCTCDecodeOp): + def config(self): + self.op_type = "ctc_greedy_decode" + self.input_lod = [[0, 11, 18]] + self.blank = 0 + self.merge_repeated = True + self.input = np.array( + [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape( + [18, 1]).astype("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_ctc_greedy_decode.py b/python/paddle/v2/fluid/tests/test_ctc_greedy_decode.py deleted file mode 100644 index 23fceb6dcdbc9068984635fe903535a0ed0a3ad9..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/test_ctc_greedy_decode.py +++ /dev/null @@ -1,56 +0,0 @@ -import sys -import unittest -import numpy as np -from op_test import OpTest -from test_softmax_op import stable_softmax - - -def CTCGreedyDecode(softmax, blank, merge_repeated): - prev_token = -1 - result = [] - for token in np.argmax(softmax, axis=1): - if (token != blank) and not (merge_repeated and token == prev_token): - result.append(token) - return np.array(result).reshape([len(result), 1]) - - -class TestCTCGreedyDecodeOp(OpTest): - def config(self): - self.op_type = "ctc_greedy_decode" - self.batch_size = 4 - self.num_classes = 8 - self.input_lod = [[0, 4, 5, 8, 11]] - self.blank = 7 - self.merge_repeated = True - - def setUp(self): - self.config() - input = np.random.uniform( - 0.1, 1.0, - [self.input_lod[0][-1], self.num_classes]).astype("float32") - softmax = np.apply_along_axis(stable_softmax, 1, input) - output = CTCGreedyDecode(softmax, self.blank, self.merge_repeated) - - self.inputs = {"Input": (softmax, self.input_lod), } - self.outputs = {"Output": output} - self.attrs = { - "blank": self.blank, - "merge_repeated": self.merge_repeated - } - - def test_check_output(self): - self.check_output() - - -class TestCTCGreedyDecodeOpCase1(TestCTCGreedyDecodeOp): - def config(self): - self.op_type = "ctc_greedy_decode" - self.batch_size = 4 - self.num_classes = 1025 - self.input_lod = [[0, 4, 5, 8, 11]] - self.blank = 0 - self.merge_repeated = True - - -if __name__ == "__main__": - unittest.main()