Commit 5276db8f authored by mindspore-ci-bot, committed by Gitee

!2390 change output of WordpieceTokenizer and BertTokenizer to 1-D string tensors

Merge pull request !2390 from qianlong21st/wordpiece_tokenizer_1D
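
The gist of the change: WordpieceTokenizer (and BertTokenizer, which builds on it) used to emit a padded 2-D string tensor, one row of subwords per input word; it now concatenates all subwords into a single 1-D string tensor. Below is a minimal pure-Python sketch of the before/after contract; the tokenize_word helper is a hypothetical stand-in for the op's GetTokens in the C++ diff that follows.

    # Hypothetical stand-in for WordpieceTokenizerOp::GetTokens below.
    def tokenize_word(word):
        table = {"making": ["mak", "##ing"], "hours": ["hour", "##s"], "i": ["i"]}
        return table.get(word, ["[UNK]"])

    words = ["i", "making", "hours"]

    # Old behavior: one row per word, padded to the widest row -> shape (3, 2).
    rows = [tokenize_word(w) for w in words]
    max_cols = max(len(r) for r in rows)
    padded = [r + ["<pad>"] * (max_cols - len(r)) for r in rows]
    assert padded == [["i", "<pad>"], ["mak", "##ing"], ["hour", "##s"]]

    # New behavior: all subwords flattened into one 1-D tensor -> shape (5,).
    flat = [token for w in words for token in tokenize_word(w)]
    assert flat == ["i", "mak", "##ing", "hour", "##s"]
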
......
@@ -32,23 +32,6 @@ WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab,
       max_bytes_per_token_(max_bytes_per_token),
       unknown_token_(unknown_token) {}
 
-void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                                     std::vector<std::string> *out_padded_tokens, int *out_cols) const {
-  int rows = tokens.size();
-  int max_cols = 0;
-  for (int i = 0; i < rows; i++) {
-    max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
-  }
-  out_padded_tokens->resize(rows * max_cols, padded_str);
-  for (int i = 0; i < rows; i++) {
-    int index = i * max_cols;
-    for (int j = 0; j < tokens[i].size(); j++) {
-      (*out_padded_tokens)[index++] = tokens[i][j];
-    }
-  }
-  *out_cols = max_cols;
-}
-
 Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
                                         bool *out_found, int *out_end) const {
   CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");
......
@@ -117,20 +100,16 @@ Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::
   if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
   }
-  std::vector<std::vector<std::string>> out_tokens(input->Size());
-  int i = 0;
+  std::vector<std::string> out_tokens;
   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
-    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
+    std::vector<std::string> temp_tokens;
+    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
+    out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
   }
-  std::vector<std::string> padded_tokens;
-  int cols = 0;
-  PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
-  std::vector<dsize_t> shapes;
-  if (input->Rank() == 1) {
-    shapes.push_back(out_tokens.size());
+  if (out_tokens.empty()) {
+    out_tokens.emplace_back("");
   }
-  shapes.push_back(cols);
-  *output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
+  *output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
   return Status::OK();
 }
......
......
@@ -48,8 +48,6 @@ class WordpieceTokenizerOp : public TensorOp {
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 
  protected:
-  void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                 std::vector<std::string> *out_padded_tokens, int *out_cols) const;
   Status AddSubword(const std::string &input_token, const int start, const int end,
                     std::vector<std::string> *out_token) const;
   Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
......
......
@@ -188,7 +188,7 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
 class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     """
-    Tokenize scalar token or 1-D tokens to subword tokens.
+    Tokenize scalar token or 1-D tokens to 1-D subword tokens.
 
     Args:
         vocab(Vocab): a Vocab object.
......
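
For context on the docstring change above, a minimal usage sketch from the Python side. It assumes the mindspore.dataset.text API of this release; the vocab contents are illustrative, not taken from the commit.

    import mindspore.dataset.text as text

    # Illustrative vocab; real BERT vocabularies are loaded from a vocab file.
    vocab = text.Vocab.from_list(["i", "am", "mak", "##ing", "mistake", "##s", "[UNK]"])
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token="[UNK]")

    # Applied through dataset.map(...), a 1-D input of word tokens such as
    # ["i", "am", "making", "mistakes"] now yields one 1-D string tensor,
    # ["i", "am", "mak", "##ing", "mistake", "##s"], instead of a 2-D tensor
    # padded with "<pad>" to the longest subword row.
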
......
@@ -35,38 +35,24 @@ test_paras = [
     dict(
         first=1,
         last=4,
-        expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
-                    [['疑'], ['是'], ['地'], ['上'], ['霜']],
-                    [['举'], ['头'], ['望'], ['明'], ['月']],
-                    [['低'], ['头'], ['思'], ['故'], ['乡']]],
+        expect_str=[['床', '前', '明', '月', '光'],
+                    ['疑', '是', '地', '上', '霜'],
+                    ['举', '头', '望', '明', '月'],
+                    ['低', '头', '思', '故', '乡']],
         vocab_list=vocab_bert
     ),
     # test english text
     dict(
         first=5,
         last=5,
-        expect_str=[[['i', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
         lower_case=True,
         vocab_list=vocab_bert
     ),
     dict(
         first=5,
         last=5,
-        expect_str=[[['I', pad],
-                     ["am", pad],
-                     ['mak', '##ing'],
-                     ['small', pad],
-                     ['mistake', '##s'],
-                     ['during', pad],
-                     ['work', '##ing'],
-                     ['hour', '##s']]],
+        expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
         lower_case=False,
         vocab_list=vocab_bert
     ),
......
@@ -75,8 +61,8 @@ test_paras = [
         first=6,
         last=7,
         expect_str=[
-            [['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
-            [['繁'], ['體'], ['字']]],
+            ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
+            ['繁', '體', '字']],
         normalization_form=nlp.utils.NormalizeForm.NFKC,
         vocab_list=vocab_bert
     ),
......
@@ -85,11 +71,11 @@ test_paras = [
         first=8,
         last=12,
         expect_str=[
-            [['[UNK]'], ['[CLS]']],
-            [['[UNK]'], ['[SEP]']],
-            [['[UNK]'], ['[UNK]']],
-            [['[UNK]'], ['[PAD]']],
-            [['[UNK]'], ['[MASK]']],
+            ['[UNK]', '[CLS]'],
+            ['[UNK]', '[SEP]'],
+            ['[UNK]', '[UNK]'],
+            ['[UNK]', '[PAD]'],
+            ['[UNK]', '[MASK]'],
         ],
         lower_case=False,
         vocab_list=vocab_bert,
......
@@ -99,7 +85,7 @@ test_paras = [
     dict(
         first=13,
         last=13,
-        expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
+        expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
         preserve_unused_token=True,
         vocab_list=vocab_bert
     ),
......
@@ -107,9 +93,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['[UNK]'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['[UNK]', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
......
@@ -118,9 +102,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['[CLS]']],
-        ],
+        expect_str=[['unused', ' ', '[CLS]']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
......
@@ -130,9 +112,7 @@ test_paras = [
     dict(
         first=8,
         last=8,
-        expect_str=[
-            [['unused'], [' '], ['['], ['CLS'], [']']],
-        ],
+        expect_str=[['unused', ' ', '[', 'CLS', ']']],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=False,
......