diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
index d1fd1d0dcac5f982c30cc77ef71df97c7820358d..48092d89cd0ccb927b11598282100e56e20080ba 100644
--- a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
+++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
@@ -32,23 +32,6 @@ WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab,
       max_bytes_per_token_(max_bytes_per_token),
       unknown_token_(unknown_token) {}
 
-void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                                     std::vector<std::string> *out_padded_tokens, int *out_cols) const {
-  int rows = tokens.size();
-  int max_cols = 0;
-  for (int i = 0; i < rows; i++) {
-    max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
-  }
-  out_padded_tokens->resize(rows * max_cols, padded_str);
-  for (int i = 0; i < rows; i++) {
-    int index = i * max_cols;
-    for (int j = 0; j < tokens[i].size(); j++) {
-      (*out_padded_tokens)[index++] = tokens[i][j];
-    }
-  }
-  *out_cols = max_cols;
-}
-
 Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
                                         bool *out_found, int *out_end) const {
   CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
@@ -117,20 +100,16 @@ Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::
   if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
   }
-  std::vector<std::vector<std::string>> out_tokens(input->Size());
-  int i = 0;
+  std::vector<std::string> out_tokens;
   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
-    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
+    std::vector<std::string> temp_tokens;
+    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
+    out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
   }
-  std::vector<std::string> padded_tokens;
-  int cols = 0;
-  PadTokens(out_tokens, "", &padded_tokens, &cols);
-  std::vector<dsize_t> shapes;
-  if (input->Rank() == 1) {
-    shapes.push_back(out_tokens.size());
+  if (out_tokens.empty()) {
+    out_tokens.emplace_back("");
   }
-  shapes.push_back(cols);
-  *output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
+  *output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
   return Status::OK();
 }
 
diff --git a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h
index d74f28df47aac33906a649fb1fbd90ae693526d9..c9a75025c69b64f7bd5050b01f425f6230bb2b53 100644
--- a/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h
+++ b/mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h
@@ -48,8 +48,6 @@ class WordpieceTokenizerOp : public TensorOp {
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 
  protected:
-  void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                 std::vector<std::string> *out_padded_tokens, int *out_cols) const;
   Status AddSubword(const std::string &input_token, const int start, const int end,
                     std::vector<std::string> *out_token) const;
   Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py
index ad4c12ad982ab6f616b9d856de40a2bb23a309d0..2f45ea13ffeaca746a03e1e71cb12104150d2091 100644
--- a/mindspore/dataset/text/transforms.py
+++ b/mindspore/dataset/text/transforms.py
@@ -188,7 +188,7 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
 
 class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     """
-    Tokenize scalar token or 1-D tokens to subword tokens.
+    Tokenize scalar token or 1-D tokens to 1-D subword tokens.
 
     Args:
         vocab(Vocab): a Vocab object.
diff --git a/tests/ut/python/dataset/test_bert_tokenizer.py b/tests/ut/python/dataset/test_bert_tokenizer.py
index 8974d022e6e32f82143c2de59c7868654b4919ab..ad7a663e9335cbb8f5aa4f99ba2162c24bd29143 100644
--- a/tests/ut/python/dataset/test_bert_tokenizer.py
+++ b/tests/ut/python/dataset/test_bert_tokenizer.py
@@ -35,38 +35,24 @@ test_paras = [
     dict(
         first=1,
         last=4,
-        expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
-                    [['疑'], ['是'], ['地'], ['上'], ['霜']],
-                    [['举'], ['头'], ['望'], ['明'], ['月']],
-                    [['低'], ['头'], ['思'], ['故'], ['乡']]],
+        expect_str=[['床', '前', '明', '月', '光'],
+                    ['疑', '是', '地', '上', '霜'],
+                    ['举', '头', '望', '明', '月'],
+                    ['低', '头', '思', '故', '乡']],
         vocab_list=vocab_bert
     ),
     # test english text
     dict(
         first=5,
         last=5,
-        expect_str=[[['i', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
         lower_case=True,
         vocab_list=vocab_bert
    ),
    dict(
        first=5,
        last=5,
-        expect_str=[[['I', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
        lower_case=False,
        vocab_list=vocab_bert
    ),
@@ -75,8 +61,8 @@ test_paras = [
         first=6,
         last=7,
         expect_str=[
-            [['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
-            [['繁'], ['體'], ['字']]],
+            ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
+            ['繁', '體', '字']],
         normalization_form=nlp.utils.NormalizeForm.NFKC,
         vocab_list=vocab_bert
     ),
@@ -85,11 +71,11 @@ test_paras = [
         first=8,
         last=12,
         expect_str=[
-            [['[UNK]'], ['[CLS]']],
-            [['[UNK]'], ['[SEP]']],
-            [['[UNK]'], ['[UNK]']],
-            [['[UNK]'], ['[PAD]']],
-            [['[UNK]'], ['[MASK]']],
+            ['[UNK]', '[CLS]'],
+            ['[UNK]', '[SEP]'],
+            ['[UNK]', '[UNK]'],
+            ['[UNK]', '[PAD]'],
+            ['[UNK]', '[MASK]'],
         ],
         lower_case=False,
         vocab_list=vocab_bert,
@@ -99,7 +85,7 @@ test_paras = [
     dict(
         first=13,
         last=13,
-        expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
+        expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
         preserve_unused_token=True,
         vocab_list=vocab_bert
     ),
@@ -107,9 +93,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['[UNK]'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['[UNK]', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -118,9 +102,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['unused', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -130,9 +112,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['['], ['CLS'], [']']],
-        ],
+        expect_str=[['unused', ' ', '[', 'CLS', ']']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=False,
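
For reference, a minimal usage sketch of the new output contract: WordpieceTokenizer now returns a 1-D array of subwords for scalar or 1-D token input, instead of the old padded 2-D layout. The sketch assumes the Python dataset API used elsewhere in this code base (Vocab.from_list, GeneratorDataset, map with input_columns/operations, to_str); the vocabulary, column name "text", and sample tokens are illustrative only and not taken from the patch.

    import numpy as np
    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    # Illustrative vocabulary; any list of subword pieces works the same way.
    vocab = text.Vocab.from_list(["i", "am", "mak", "##ing", "mistake", "##s", "[UNK]"])
    tokenizer = text.WordpieceTokenizer(vocab=vocab, unknown_token="[UNK]")

    def gen():
        # One sample: a 1-D array of pre-split tokens.
        yield (np.array(["i", "am", "making", "mistakes"]),)

    data = ds.GeneratorDataset(gen, ["text"])
    data = data.map(input_columns=["text"], operations=tokenizer)
    for row in data.create_dict_iterator():
        # With this patch the column holds a flat 1-D string array,
        # e.g. ['i' 'am' 'mak' '##ing' 'mistake' '##s'], matching the updated tests.
        print(text.to_str(row["text"]))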