Commit 47060631 authored by xiefangqi

add offsets feature to tokenizer

Parent 4bdd8e16
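Background for the diff below: with this change every tokenizer op can emit two extra UINT32 tensors alongside the token tensor — offsets_start (the byte index where each token begins in the input string) and offsets_limit (the byte index one past the token's last byte). The standalone sketch below is not MindSpore code; it only illustrates that offset convention for a plain whitespace split, and the example sentence is made up for illustration.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::string text = "Welcome to Beijing!";
  std::vector<std::string> tokens;
  std::vector<uint32_t> offsets_start, offsets_limit;
  size_t start = 0;
  while (start < text.size()) {
    size_t end = text.find(' ', start);
    if (end == std::string::npos) end = text.size();
    if (end > start) {
      tokens.push_back(text.substr(start, end - start));
      offsets_start.push_back(static_cast<uint32_t>(start));  // first byte of the token
      offsets_limit.push_back(static_cast<uint32_t>(end));    // one past the last byte
    }
    start = end + 1;
  }
  // Prints one token per line: Welcome [0, 7) / to [8, 10) / Beijing! [11, 19)
  for (size_t i = 0; i < tokens.size(); ++i) {
    std::cout << tokens[i] << " [" << offsets_start[i] << ", " << offsets_limit[i] << ")\n";
  }
  return 0;
}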
@@ -601,13 +601,14 @@ void bindTensorOps4(py::module *m) {
 void bindTokenizerOps(py::module *m) {
   (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
-    .def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
-         py::arg("mode") = JiebaMode::kMix)
+    .def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>(), py::arg("hmm_path"),
+         py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix,
+         py::arg("with_offsets") = JiebaTokenizerOp::kDefWithOffsets)
     .def("add_word",
          [](JiebaTokenizerOp &self, const std::string word, int freq) { THROW_IF_ERROR(self.AddWord(word, freq)); });
   (void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
     *m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
-    .def(py::init<>());
+    .def(py::init<const bool &>(), py::arg("with_offsets") = UnicodeCharTokenizerOp::kDefWithOffsets);
   (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp",
                                                                   "Tensor operation to LookUp each word")
     .def(py::init<std::shared_ptr<Vocab>, WordIdType>(), py::arg("vocab"), py::arg("unknown"))
@@ -619,21 +620,25 @@ void bindTokenizerOps(py::module *m) {
          py::arg("separator"));
   (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
     *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
-    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
-         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
-         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
-         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
+    .def(
+      py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &>(),
+      py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
+      py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
+      py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
+      py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
 }
 void bindDependIcuTokenizerOps(py::module *m) {
 #ifdef ENABLE_ICU4C
   (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
     *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
-    .def(py::init<>());
+    .def(py::init<const bool &>(), py::arg("with_offsets") = WhitespaceTokenizerOp::kDefWithOffsets);
   (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
     *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
     .def(py::init<>())
-    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
+    .def(py::init<const bool &, const bool &>(),
+         py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace,
+         py::arg("with_offsets") = UnicodeScriptTokenizerOp::kDefWithOffsets);
   (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
     *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
     .def(py::init<>());
@@ -647,24 +652,28 @@ void bindDependIcuTokenizerOps(py::module *m) {
          py::arg("replace_all"));
   (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
     *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
-    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"), py::arg("keep_delim_pattern"));
+    .def(py::init<const std::string &, const std::string &, const bool &>(), py::arg("delim_pattern"),
+         py::arg("keep_delim_pattern"), py::arg("with_offsets") = RegexTokenizerOp::kDefWithOffsets);
   (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
     *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
-    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
+    .def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>(),
+         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
          py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
          py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
-         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
+         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
+         py::arg("with_offsets") = BasicTokenizerOp::kDefWithOffsets);
   (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
                                                                                 "Tokenizer used for Bert text process.")
-    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
-                  NormalizeForm, bool>(),
+    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &,
+                  const bool &, const NormalizeForm &, const bool &, const bool &>(),
         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
        py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
        py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
        py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
        py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
-        py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
+        py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
+        py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
 #endif
 }
...
@@ -27,10 +27,12 @@
 namespace mindspore {
 namespace dataset {
 const bool BasicTokenizerOp::kDefLowerCase = false;
 const bool BasicTokenizerOp::kDefKeepWhitespace = false;
 const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
 const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
+const bool BasicTokenizerOp::kDefWithOffsets = false;
 const char BasicTokenizerOp::kCommonPattern[] =
   "[!-/]"
   "|[:-@]"
@@ -47,11 +49,14 @@ const char BasicTokenizerOp::kCommonPattern[] =
   "|[\\x{2F800}-\\x{2FA1F}]";
 const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
 const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};
-BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
-                                   bool preserve_unused_token)
+BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace,
+                                   const NormalizeForm &normalization_form, const bool &preserve_unused_token,
+                                   const bool &with_offsets)
     : lower_case_(lower_case),
       keep_whitespace_(keep_whitespace),
       preserve_unused_token_(preserve_unused_token),
+      with_offsets_(with_offsets),
       case_fold_(std::make_unique<CaseFoldOp>()),
       nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
       normalization_form_(normalization_form),
@@ -69,7 +74,7 @@ BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, Normal
     keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
     delim_pattern = kUnusedPattern + delim_pattern;
   }
-  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
+  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern, with_offsets_);
 }
 Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,
@@ -135,9 +140,10 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor
   return Status::OK();
 }
-Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::shared_ptr<Tensor> cur_input;
@@ -145,10 +151,10 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
   if (lower_case_) {
     if (!preserve_unused_token_) {
       // to lower case
-      RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
+      RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor));
     } else {
       // to lower case except words in kUnusedWords
-      RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input, &processed_tensor));
+      RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor));
     }
     cur_input = processed_tensor;
     // strip accent characters
@@ -156,12 +162,12 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
     cur_input = processed_tensor;
     RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
   } else {
-    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
+    RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor));
   }
   // strip control characters
   cur_input = processed_tensor;
   RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
-  return regex_tokenizer_->Compute(processed_tensor, output);
+  return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output);
 }
 } // namespace dataset
 } // namespace mindspore
...
@@ -36,15 +36,18 @@ class BasicTokenizerOp : public TensorOp {
   static const bool kDefKeepWhitespace;
   static const NormalizeForm kDefNormalizationForm;
   static const bool kDefPreserveUnusedToken;
+  static const bool kDefWithOffsets;
-  explicit BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
-                            NormalizeForm normalization_form = kDefNormalizationForm,
-                            bool preserve_unused_token = kDefPreserveUnusedToken);
+  explicit BasicTokenizerOp(const bool &lower_case = kDefLowerCase, const bool &keep_whitespace = kDefKeepWhitespace,
+                            const NormalizeForm &normalization_form = kDefNormalizationForm,
+                            const bool &preserve_unused_token = kDefPreserveUnusedToken,
+                            const bool &with_offsets = kDefWithOffsets);
   ~BasicTokenizerOp() override = default;
   void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }
-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
 protected:
   Status CaseFoldWithoutUnusedWords(const std::string_view &text, const std::unordered_set<std::string> &unused_words,
@@ -55,6 +58,7 @@ class BasicTokenizerOp : public TensorOp {
   static const char kCommonPattern[];
   static const char kUnusedPattern[];
   static const std::unordered_set<std::string> kUnusedWords;
+  bool with_offsets_;
   bool lower_case_;
   bool keep_whitespace_;
   NormalizeForm normalization_form_;
...
@@ -16,9 +16,9 @@
 #include "dataset/text/kernels/bert_tokenizer_op.h"
 namespace mindspore {
 namespace dataset {
-Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  std::shared_ptr<Tensor> basic_tensor;
+Status BertTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  TensorRow basic_tensor;
   RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
   RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
   return Status::OK();
...
@@ -32,18 +32,19 @@ class BertTokenizerOp : public TensorOp {
                   const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
                   const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
                   const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
-                  bool lower_case = BasicTokenizerOp::kDefLowerCase,
-                  bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
-                  NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
-                  bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
-    : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
-      basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
+                  const bool &lower_case = BasicTokenizerOp::kDefLowerCase,
+                  const bool &keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
+                  const NormalizeForm &normalization_form = BasicTokenizerOp::kDefNormalizationForm,
+                  const bool &preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken,
+                  const bool &with_offsets = WordpieceTokenizerOp::kDefWithOffsets)
+    : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets),
+      basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token, with_offsets) {}
   ~BertTokenizerOp() override = default;
   void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }
-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
 private:
   WordpieceTokenizerOp wordpiece_tokenizer_;
...
@@ -23,35 +23,63 @@
 namespace mindspore {
 namespace dataset {
-JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, JiebaMode mode)
-    : jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path) {
+const bool JiebaTokenizerOp::kDefWithOffsets = false;
+JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, const JiebaMode &mode,
+                                   const bool &with_offsets)
+    : jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path), with_offsets_(with_offsets) {
   jieba_parser_ = std::make_unique<cppjieba::Jieba>(mp_dict_path_, hmm_model_path_, "");
 }
-Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
+Status JiebaTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
   RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
   }
   std::string_view sentence_v;
-  RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&sentence_v, {}));
   std::string sentence{sentence_v};
   std::vector<std::string> words;
+  std::vector<uint32_t> offsets_start, offsets_limit;
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
   if (sentence == "") {
     words.push_back("");
   } else {
+    std::vector<cppjieba::Word> tmp;
     if (jieba_mode_ == JiebaMode::kMp) {
-      jieba_parser_->CutSmall(sentence, words, MAX_WORD_LENGTH);
+      std::unique_ptr<cppjieba::MPSegment> mp_seg = std::make_unique<cppjieba::MPSegment>(jieba_parser_->GetDictTrie());
+      mp_seg->Cut(sentence, tmp, MAX_WORD_LENGTH);
     } else if (jieba_mode_ == JiebaMode::kHmm) {
-      jieba_parser_->CutHMM(sentence, words);
+      std::unique_ptr<cppjieba::HMMSegment> hmm_seg =
+        std::make_unique<cppjieba::HMMSegment>(jieba_parser_->GetHMMModel());
+      hmm_seg->Cut(sentence, tmp);
     } else {  // Mix
-      jieba_parser_->Cut(sentence, words, true);
+      std::unique_ptr<cppjieba::MixSegment> mix_seg =
+        std::make_unique<cppjieba::MixSegment>(jieba_parser_->GetDictTrie(), jieba_parser_->GetHMMModel());
+      mix_seg->Cut(sentence, tmp, true);
     }
+    GetStringsFromWords(tmp, words);
+    for (auto item : tmp) {
+      offsets_start.push_back(static_cast<uint32_t>(item.offset));
+      offsets_limit.push_back(static_cast<uint32_t>(item.offset + item.word.length()));
+    }
   }
-  *output = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
+  token_tensor = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
...
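In the Jieba path above, offsets come straight from cppjieba: each cut result carries the byte offset of the word within the sentence, and the limit is that offset plus the word's byte length. A standalone sketch of that bookkeeping follows; the Word struct below is a stand-in for cppjieba::Word, not the real type.

#include <cstdint>
#include <string>
#include <vector>

// Stand-in for cppjieba::Word: the token text and its byte offset in the sentence.
struct Word {
  std::string word;
  uint32_t offset;
};

// Mirrors the loop in JiebaTokenizerOp::Compute: limit = offset + byte length of the token.
void CollectOffsets(const std::vector<Word> &cut, std::vector<uint32_t> *offsets_start,
                    std::vector<uint32_t> *offsets_limit) {
  for (const auto &item : cut) {
    offsets_start->push_back(item.offset);
    offsets_limit->push_back(item.offset + static_cast<uint32_t>(item.word.length()));
  }
}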
@@ -30,15 +30,19 @@ enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 };
 class JiebaTokenizerOp : public TensorOp {
 public:
-  // deffault constant for Jieba MPSegment algorithm.
+  // default constant for Jieba MPSegment algorithm.
   static constexpr size_t MAX_WORD_LENGTH = 512;
+  // default const for set whether Jieba output offsets tensor.
+  static const bool kDefWithOffsets;
   // Constructor for JiebaTokenizerOp.
   // @param hmm_path HMM model file.
   // @param mp_path MP model file.
   // @mode tokenization mode [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, "HMM" mode will
   // tokenize with Hiddel Markov Model Segment algorithm, "MIx" model will tokenize with a mix of MPSegment and
   // HMMSegment algorithm.
-  JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, JiebaMode mode = JiebaMode::kMix);
+  // @with_offsets user set this value to choose whether output offset tensor.
+  JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix,
+                   const bool &with_offsets = kDefWithOffsets);
   ~JiebaTokenizerOp() override = default;
   void Print(std::ostream &out) const override {
@@ -46,7 +50,7 @@ class JiebaTokenizerOp : public TensorOp {
         << mp_dict_path_;
   }
-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
   // @word the word to be added to the JiebaTokenizer.
   // @freq [Default 0] the frequency fo the word to be added.
@@ -58,6 +62,7 @@ class JiebaTokenizerOp : public TensorOp {
   std::string mp_dict_path_;
   std::unique_ptr<cppjieba::Jieba> jieba_parser_;
   JiebaMode jieba_mode_;
+  bool with_offsets_;
 };
 } // namespace dataset
 } // namespace mindspore
...
@@ -22,8 +22,11 @@
 namespace mindspore {
 namespace dataset {
-Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
-                                          icu::UnicodeString *out_unicode) const {
+const bool RegexTokenizerOp::kDefWithOffsets = false;
+Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len,
+                                          std::string *out_utf8, icu::UnicodeString *out_unicode) const {
   CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
   int total_len = input.length();
   int end = start + len;
@@ -39,7 +42,9 @@ Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int s
   return Status::OK();
 }
-Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
+Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
+                                        std::vector<uint32_t> *offsets_start,
+                                        std::vector<uint32_t> *offsets_limit) const {
   UErrorCode status = U_ZERO_ERROR;
   out_tokens->clear();
   icu::RegexMatcher token_matcher(delim_pattern_, 0, status);
@@ -50,6 +55,7 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
   icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
   token_matcher.reset(utext);
+  int text_start_index = 0;
   int token_start_index = 0;
   status = U_ZERO_ERROR;
   while (token_matcher.find(status) && U_SUCCESS(status)) {
@@ -62,41 +68,70 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
     int token_len = deli_start_index - token_start_index;
     if (token_len > 0) {
       std::string token;
+      uint32_t token_offset = 0;
       RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
+      token_offset = token.length();
       out_tokens->emplace_back(std::move(token));
+      offsets_start->push_back(static_cast<uint32_t>(text_start_index));
+      offsets_limit->push_back(static_cast<uint32_t>(text_start_index + token_offset));
+      text_start_index += token_offset;
     }
     int delim_len = deli_end_index - deli_start_index;
-    if (keep_delim_ && delim_len > 0) {
+    if (delim_len > 0) {
       icu::UnicodeString delim_str;
       std::string delim_utf8_str;
+      uint32_t delim_str_offset = 0;
       RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
       delim_matcher.reset(delim_str);
-      if (delim_matcher.matches(status) && U_SUCCESS(status)) {
+      delim_str_offset = delim_utf8_str.length();
+      if (keep_delim_ && delim_matcher.matches(status) && U_SUCCESS(status)) {
         out_tokens->emplace_back(std::move(delim_utf8_str));
+        offsets_start->push_back(static_cast<uint32_t>(text_start_index));
+        offsets_limit->push_back(static_cast<uint32_t>(text_start_index + delim_str_offset));
       }
+      text_start_index += delim_str_offset;
     }
     token_start_index = deli_end_index;
   }
   if (token_start_index < utext.length()) {
     std::string temp;
+    uint32_t temp_offset = 0;
     RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
+    temp_offset = temp.length();
     out_tokens->emplace_back(std::move(temp));
+    offsets_start->push_back(static_cast<uint32_t>(text_start_index));
+    offsets_limit->push_back(static_cast<uint32_t>(text_start_index + temp_offset));
   }
   return Status::OK();
 }
-Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+Status RegexTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::string_view text;
-  RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
   std::vector<std::string> tokens;
-  RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
-  *output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
+  std::vector<uint32_t> offsets_start;
+  std::vector<uint32_t> offsets_limit;
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&text, {}));
+  RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens, &offsets_start, &offsets_limit));
+  token_tensor = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
    output->push_back(offsets_start_tensor);
    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
 } // namespace dataset
...
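Each Compute above ends with the same packing step: push the token tensor onto the output TensorRow and, when with_offsets_ is set, append the two UINT32 offset tensors. The following is a hypothetical refactoring sketch only; no such helper exists in this commit, and it assumes the internal Tensor/TensorRow/Status API exactly as it appears in the Compute methods above.

// Hypothetical helper, not part of the commit. Assumes MindSpore's internal dataset
// types (Tensor, TensorRow, TensorShape, DataType, Status, TensorImpl) and
// Tensor::CreateTensor with the signature used in the diff above.
Status AppendOffsetsTensors(bool with_offsets, std::vector<uint32_t> *offsets_start,
                            std::vector<uint32_t> *offsets_limit, TensorRow *output) {
  if (!with_offsets) {
    return Status::OK();
  }
  std::shared_ptr<Tensor> offsets_start_tensor, offsets_limit_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
                                        TensorShape({(dsize_t)offsets_start->size()}), DataType(DataType::DE_UINT32),
                                        reinterpret_cast<unsigned char *>(&(*offsets_start)[0])));
  RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
                                        TensorShape({(dsize_t)offsets_limit->size()}), DataType(DataType::DE_UINT32),
                                        reinterpret_cast<unsigned char *>(&(*offsets_limit)[0])));
  output->push_back(offsets_start_tensor);
  output->push_back(offsets_limit_tensor);
  return Status::OK();
}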
@@ -32,25 +32,31 @@ namespace dataset {
 class RegexTokenizerOp : public TensorOp {
 public:
-  RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
+  static const bool kDefWithOffsets;
+  RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern,
+                   const bool &with_offsets = kDefWithOffsets)
     : delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
       keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
+      with_offsets_(with_offsets),
       keep_delim_(!keep_delim_pattern.empty()) {}
   ~RegexTokenizerOp() override = default;
   void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }
-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
 protected:
-  Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
+  Status GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len, std::string *out_utf8,
                           icu::UnicodeString *out_unicode = nullptr) const;
-  Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
+  Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
+                        std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
 private:
   const icu::UnicodeString delim_pattern_;
   const icu::UnicodeString keep_delim_pattern_;
+  bool with_offsets_;
   const bool keep_delim_;
 };
 } // namespace dataset
...
@@ -27,26 +27,46 @@ using cppjieba::RuneStrArray;
 namespace mindspore {
 namespace dataset {
-Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+const bool UnicodeCharTokenizerOp::kDefWithOffsets = false;
+Status UnicodeCharTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::string_view str;
-  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
   RuneStrArray runes;
   if (!DecodeRunesInString(str.data(), str.size(), runes)) {
     RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
   }
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
   std::vector<std::string> splits(runes.size());
+  std::vector<uint32_t> offsets_start, offsets_limit;
   for (size_t i = 0; i < runes.size(); i++) {
+    offsets_start.push_back(runes[i].offset);
+    offsets_limit.push_back(runes[i].offset + runes[i].len);
     splits[i] = str.substr(runes[i].offset, runes[i].len);
   }
   if (splits.empty()) {
     splits.emplace_back("");
+    offsets_start.push_back(0);
+    offsets_limit.push_back(0);
   }
-  *output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
 } // namespace dataset
...
@@ -26,13 +26,18 @@ namespace dataset {
 class UnicodeCharTokenizerOp : public TensorOp {
 public:
-  UnicodeCharTokenizerOp() {}
+  static const bool kDefWithOffsets;
+  explicit UnicodeCharTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
   ~UnicodeCharTokenizerOp() override = default;
   void Print(std::ostream &out) const override { out << "UnicodeCharTokenizerOp"; }
-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+private:
+  bool with_offsets_;
 };
 } // namespace dataset
...
@@ -32,24 +32,28 @@ namespace mindspore {
 namespace dataset {
 const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
+const bool UnicodeScriptTokenizerOp::kDefWithOffsets = false;
-Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+Status UnicodeScriptTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::string_view str;
-  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
   RuneStrArray runes;
   if (!DecodeRunesInString(str.data(), str.size(), runes)) {
     RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
   }
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
   UScriptCode last_script = USCRIPT_INVALID_CODE;
   icu::ErrorCode status;
   int start = 0;
   int len = 0;
   std::vector<std::string> splits;
+  std::vector<uint32_t> offsets_start, offsets_limit;
   bool was_space = false;
   for (size_t i = 0; i < runes.size(); i++) {
@@ -66,6 +70,8 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
     if (len > 0 && (script != last_script || is_space != was_space)) {
       // 3) If keep_whitespace_ is false, all the whitespace characters will be discard
       if (keep_whitespace_ || !was_space) {
+        offsets_start.push_back(static_cast<uint32_t>(start));
+        offsets_limit.push_back(static_cast<uint32_t>(start + len));
         std::string temp(str.substr(start, len));
         splits.emplace_back(std::move(temp));
       }
@@ -79,14 +85,29 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
   }
   if (len > 0 && (keep_whitespace_ || !was_space)) {
+    offsets_start.push_back(static_cast<uint32_t>(start));
+    offsets_limit.push_back(static_cast<uint32_t>(start + len));
     std::string temp(str.substr(start, len));
     splits.emplace_back(std::move(temp));
   }
   // 4) If the input is empty scalar string, the output will be 1-D empty string.
   if (splits.empty()) {
     splits.emplace_back("");
+    offsets_start.push_back(0);
+    offsets_limit.push_back(0);
   }
-  *output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
 } // namespace dataset
...
@@ -27,17 +27,21 @@ namespace dataset {
 class UnicodeScriptTokenizerOp : public TensorOp {
 public:
   static const bool kDefKeepWhitespace;
+  static const bool kDefWithOffsets;
-  explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
+  explicit UnicodeScriptTokenizerOp(const bool &keep_whitespace = kDefKeepWhitespace,
+                                    const bool &with_offsets = kDefWithOffsets)
+    : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
   ~UnicodeScriptTokenizerOp() override = default;
   void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }
-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
 private:
   bool keep_whitespace_;  // If or not keep whitespace tokens
+  bool with_offsets_;
 };
 } // namespace dataset
 } // namespace mindspore
...
@@ -30,24 +30,33 @@ using cppjieba::RuneStrArray;
 namespace mindspore {
 namespace dataset {
-Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+const bool WhitespaceTokenizerOp::kDefWithOffsets = false;
+Status WhitespaceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::string_view str;
-  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
   RuneStrArray runes;
   if (!DecodeRunesInString(str.data(), str.size(), runes)) {
     RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
   }
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
+  std::vector<uint32_t> offsets_start, offsets_limit;
   std::vector<std::string> splits;
   int start = 0;
   int len = 0;
   for (size_t i = 0; i < runes.size(); i++) {
     if (u_isUWhiteSpace(runes[i].rune)) {
       if (len > 0) {
+        offsets_start.push_back(static_cast<uint32_t>(start));
+        offsets_limit.push_back(static_cast<uint32_t>(start + len));
         std::string temp(str.substr(start, len));
         splits.emplace_back(std::move(temp));
         len = 0;
@@ -60,13 +69,28 @@ Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std:
     }
   }
   if (len > 0) {
+    offsets_start.push_back(static_cast<uint32_t>(start));
+    offsets_limit.push_back(static_cast<uint32_t>(start + len));
     std::string temp(str.substr(start, len));
     splits.emplace_back(std::move(temp));
   }
   if (splits.empty()) {
     splits.emplace_back("");
+    offsets_start.push_back(0);
+    offsets_limit.push_back(0);
   }
-  *output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
 } // namespace dataset
...
@@ -26,13 +26,18 @@ namespace dataset {
 class WhitespaceTokenizerOp : public TensorOp {
 public:
-  WhitespaceTokenizerOp() {}
+  static const bool kDefWithOffsets;
+  explicit WhitespaceTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}
   ~WhitespaceTokenizerOp() override = default;
   void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }
-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+private:
+  bool with_offsets_;
 };
 } // namespace dataset
 } // namespace mindspore
...
@@ -24,13 +24,16 @@ namespace dataset {
 const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
 const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
 const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
+const bool WordpieceTokenizerOp::kDefWithOffsets = false;
 WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
-                                           const int &max_bytes_per_token, const std::string &unknown_token)
+                                           const int &max_bytes_per_token, const std::string &unknown_token,
+                                           const bool &with_offsets)
     : vocab_(vocab),
       suffix_indicator_(suffix_indicator),
       max_bytes_per_token_(max_bytes_per_token),
-      unknown_token_(unknown_token) {}
+      unknown_token_(unknown_token),
+      with_offsets_(with_offsets) {}
 Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
                                         bool *out_found, int *out_end) const {
@@ -52,17 +55,22 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
   return Status::OK();
 }
-Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const {
+Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, const uint32_t &basic_start,
+                                          std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
+                                          std::vector<uint32_t> *offsets_limit) const {
   out_tokens->clear();
+  offsets_start->push_back(basic_start);
   if (unknown_token_.empty()) {
     out_tokens->emplace_back(input_token);
+    offsets_limit->push_back(basic_start + input_token.length());
   } else {
     out_tokens->emplace_back(unknown_token_);
+    offsets_limit->push_back(basic_start + input_token.length());
   }
   return Status::OK();
 }
-Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int start, const int end,
+Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int &start, const int &end,
                                         std::vector<std::string> *out_tokens) const {
   CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
   std::string subword = input_token.substr(start, end - start);
@@ -73,9 +81,19 @@ Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const in
   return Status::OK();
 }
-Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const {
+Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, const uint32_t &basic_start,
+                                       std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
+                                       std::vector<uint32_t> *offsets_limit) const {
   if (input_token.size() > max_bytes_per_token_) {
-    return FoundNoToken(input_token, out_tokens);
+    offsets_start->push_back(basic_start);
+    if (!unknown_token_.empty()) {
+      offsets_limit->push_back(basic_start + unknown_token_.size());
+      out_tokens->emplace_back(unknown_token_);
+    } else {
+      out_tokens->emplace_back(input_token);
+      offsets_limit->push_back(basic_start + input_token.size());
+    }
+    return Status::OK();
   }
   RuneStrArray runes;
   if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
@@ -87,29 +105,52 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect
     RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
     if (found) {
       RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
+      offsets_start->push_back(static_cast<uint32_t>(basic_start + start));
+      offsets_limit->push_back(static_cast<uint32_t>(basic_start + end));
      start = end;
     } else {
-      return FoundNoToken(input_token, out_tokens);
+      return FoundNoToken(input_token, basic_start, out_tokens, offsets_start, offsets_limit);
     }
   }
   return Status::OK();
 }
-Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
+Status WordpieceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  if (input[0]->Rank() > 1 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
   }
+  dsize_t count = 0;
   std::vector<std::string> out_tokens;
-  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
+  std::vector<uint32_t> offsets_start, offsets_limit;
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
+  for (auto iter = input[0]->begin<std::string_view>(); iter != input[0]->end<std::string_view>(); iter++) {
+    uint32_t basic_start = 0;
     std::vector<std::string> temp_tokens;
-    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
+    if (with_offsets_ && input.size() == 3) {
+      RETURN_IF_NOT_OK(input[1]->GetItemAt<uint32_t>(&basic_start, {count, 0}));
+    }
+    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), basic_start, &temp_tokens, &offsets_start, &offsets_limit));
     out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
+    count++;
   }
   if (out_tokens.empty()) {
     out_tokens.emplace_back("");
+    offsets_start.push_back(0);
+    offsets_limit.push_back(0);
   }
-  *output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
+  token_tensor = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
...
...@@ -37,27 +37,31 @@ class WordpieceTokenizerOp : public TensorOp { ...@@ -37,27 +37,31 @@ class WordpieceTokenizerOp : public TensorOp {
static const char kDefSuffixIndicator[]; static const char kDefSuffixIndicator[];
static const int kDefMaxBytesPerToken; static const int kDefMaxBytesPerToken;
static const char kDefUnknownToken[]; static const char kDefUnknownToken[];
static const bool kDefWithOffsets;
WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator, WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
const int &max_bytes_per_token = kDefMaxBytesPerToken, const int &max_bytes_per_token = kDefMaxBytesPerToken,
const std::string &unknown_token = kDefUnknownToken); const std::string &unknown_token = kDefUnknownToken, const bool &with_offsets = kDefWithOffsets);
~WordpieceTokenizerOp() override = default; ~WordpieceTokenizerOp() override = default;
void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; } void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }
Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override; Status Compute(const TensorRow &input, TensorRow *output) override;
protected: protected:
Status AddSubword(const std::string &input_token, const int start, const int end, Status AddSubword(const std::string &input_token, const int &start, const int &end,
std::vector<std::string> *out_token) const; std::vector<std::string> *out_token) const;
Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const; Status FoundNoToken(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found, Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
int *out_end) const; int *out_end) const;
Status GetTokens(const std::string &input_token, std::vector<std::string> *out_tokens) const; Status GetTokens(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;
private: private:
const std::shared_ptr<Vocab> vocab_; const std::shared_ptr<Vocab> vocab_;
const std::string suffix_indicator_; const std::string suffix_indicator_;
const bool with_offsets_;
const int max_bytes_per_token_; const int max_bytes_per_token_;
const std::string unknown_token_; const std::string unknown_token_;
}; };
......
...@@ -52,8 +52,9 @@ import mindspore._c_dataengine as cde ...@@ -52,8 +52,9 @@ import mindspore._c_dataengine as cde
from .utils import JiebaMode, NormalizeForm, to_str from .utils import JiebaMode, NormalizeForm, to_str
from .validators import check_lookup, check_jieba_add_dict, \ from .validators import check_lookup, check_jieba_add_dict, \
check_jieba_add_word, check_jieba_init, check_ngram, check_pair_truncate, \ check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer,\
check_to_number, check_python_tokenizer check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate,\
check_to_number, check_bert_tokenizer, check_python_tokenizer
from ..core.datatypes import mstype_to_detype from ..core.datatypes import mstype_to_detype
...@@ -125,15 +126,31 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): ...@@ -125,15 +126,31 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
- JiebaMode.MP, tokenize with MPSegment algorithm. - JiebaMode.MP, tokenize with MPSegment algorithm.
- JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm. - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
- JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm. - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False)
>>> data = data.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
@check_jieba_init @check_jieba_init
def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX): def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False):
if not isinstance(mode, JiebaMode):
raise TypeError("Wrong input type for mode, should be JiebaMode.")
self.mode = mode self.mode = mode
self.__check_path__(hmm_path) self.__check_path__(hmm_path)
self.__check_path__(mp_path) self.__check_path__(mp_path)
self.with_offsets = with_offsets
super().__init__(hmm_path, mp_path, super().__init__(hmm_path, mp_path,
DE_C_INTER_JIEBA_MODE[mode]) DE_C_INTER_JIEBA_MODE[mode],
self.with_offsets)
@check_jieba_add_word @check_jieba_add_word
def add_word(self, word, freq=None): def add_word(self, word, freq=None):
...@@ -226,8 +243,26 @@ class JiebaTokenizer(cde.JiebaTokenizerOp): ...@@ -226,8 +243,26 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp): class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
""" """
Tokenize a scalar tensor of UTF-8 string to Unicode characters. Tokenize a scalar tensor of UTF-8 string to Unicode characters.
Args:
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = text.UnicodeCharTokenizer()
>>> dataset = dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.UnicodeCharTokenizer(True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
@check_with_offsets
def __init__(self, with_offsets=False):
self.with_offsets = with_offsets
super().__init__(self.with_offsets)
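As a cross-check (not from this commit), the offsets UnicodeCharTokenizer is expected to report with with_offsets=True can be reproduced in a few lines of plain Python, assuming the same UTF-8 byte-offset convention as the C++ ops above; this is handy when hand-writing expected values for the tests further down.

def char_offsets(s):
    tokens, starts, limits, pos = [], [], [], 0
    for ch in s:
        nbytes = len(ch.encode('utf-8'))  # multi-byte characters advance the offset by their UTF-8 length
        tokens.append(ch)
        starts.append(pos)
        limits.append(pos + nbytes)
        pos += nbytes
    return tokens, starts, limits

print(char_offsets("Hi 中国"))
# (['H', 'i', ' ', '中', '国'], [0, 1, 2, 3, 6], [1, 2, 3, 6, 9])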
class WordpieceTokenizer(cde.WordpieceTokenizerOp): class WordpieceTokenizer(cde.WordpieceTokenizerOp):
""" """
...@@ -239,22 +274,58 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp): ...@@ -239,22 +274,58 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100). max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split(default=100).
unknown_token (str, optional): When the token cannot be found: if 'unknown_token' is an empty string, unknown_token (str, optional): When the token cannot be found: if 'unknown_token' is an empty string,
return the token directly, else return 'unknown_token' (default='[UNK]'). return the token directly, else return 'unknown_token' (default='[UNK]').
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
>>> max_bytes_per_token=100, with_offsets=False)
>>> dataset = dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
>>> max_bytes_per_token=100, with_offsets=True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]'): @check_wordpiece_tokenizer
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100,
unknown_token='[UNK]', with_offsets=False):
self.vocab = vocab self.vocab = vocab
self.suffix_indicator = suffix_indicator self.suffix_indicator = suffix_indicator
self.max_bytes_per_token = max_bytes_per_token self.max_bytes_per_token = max_bytes_per_token
self.unknown_token = unknown_token self.unknown_token = unknown_token
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token) self.with_offsets = with_offsets
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token,
self.unknown_token, self.with_offsets)
if platform.system().lower() != 'windows': if platform.system().lower() != 'windows':
class WhitespaceTokenizer(cde.WhitespaceTokenizerOp): class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
""" """
Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n'). Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces(such as: ' ', '\\\\t', '\\\\r', '\\\\n').
Args:
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = text.WhitespaceTokenizer()
>>> dataset = dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.WhitespaceTokenizer(True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
@check_with_offsets
def __init__(self, with_offsets=False):
self.with_offsets = with_offsets
super().__init__(self.with_offsets)
class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp): class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
""" """
...@@ -262,11 +333,25 @@ if platform.system().lower() != 'windows': ...@@ -262,11 +333,25 @@ if platform.system().lower() != 'windows':
Args: Args:
keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False). keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False).
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
>>> dataset = dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
def __init__(self, keep_whitespace=False): @check_unicode_script_tokenizer
def __init__(self, keep_whitespace=False, with_offsets=False):
self.keep_whitespace = keep_whitespace self.keep_whitespace = keep_whitespace
super().__init__(self.keep_whitespace) self.with_offsets = with_offsets
super().__init__(self.keep_whitespace, self.with_offsets)
class CaseFold(cde.CaseFoldOp): class CaseFold(cde.CaseFoldOp):
...@@ -302,6 +387,9 @@ if platform.system().lower() != 'windows': ...@@ -302,6 +387,9 @@ if platform.system().lower() != 'windows':
""" """
def __init__(self, normalize_form=NormalizeForm.NFKC): def __init__(self, normalize_form=NormalizeForm.NFKC):
if not isinstance(normalize_form, NormalizeForm):
raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form] self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
super().__init__(self.normalize_form) super().__init__(self.normalize_form)
...@@ -338,12 +426,26 @@ if platform.system().lower() != 'windows': ...@@ -338,12 +426,26 @@ if platform.system().lower() != 'windows':
keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token keep_delim_pattern(str, optional): The string matched by 'delim_pattern' can be kept as a token
if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''), if it can be matched by 'keep_delim_pattern'. And the default value is empty str(''),
in this situation, delimiters will not be kept as an output token (default=''). in this situation, delimiters will not be kept as an output token (default='').
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False)
>>> dataset = dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
def __init__(self, delim_pattern, keep_delim_pattern=''): @check_regex_tokenizer
def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False):
self.delim_pattern = delim_pattern self.delim_pattern = delim_pattern
self.keep_delim_pattern = keep_delim_pattern self.keep_delim_pattern = keep_delim_pattern
super().__init__(self.delim_pattern, self.keep_delim_pattern) self.with_offsets = with_offsets
super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)
class BasicTokenizer(cde.BasicTokenizerOp): class BasicTokenizer(cde.BasicTokenizerOp):
...@@ -359,16 +461,41 @@ if platform.system().lower() != 'windows': ...@@ -359,16 +461,41 @@ if platform.system().lower() != 'windows':
only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE'). only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
preserve_unused_token(bool, optional): If True, do not split special tokens like preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True). '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = text.BasicTokenizer(lower_case=False,
>>> keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE,
>>> preserve_unused_token=True,
>>> with_offsets=False)
>>> dataset = dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.BasicTokenizer(lower_case=False,
>>> keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE,
>>> preserve_unused_token=True,
>>> with_offsets=True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
def __init__(self, lower_case=False, keep_whitespace=False, @check_basic_tokenizer
normalization_form=NormalizeForm.NONE, preserve_unused_token=True): def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
preserve_unused_token=True, with_offsets=False):
if not isinstance(normalization_form, NormalizeForm):
raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
self.lower_case = lower_case self.lower_case = lower_case
self.keep_whitespace = keep_whitespace self.keep_whitespace = keep_whitespace
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token self.preserve_unused_token = preserve_unused_token
super().__init__(self.lower_case, self.keep_whitespace, self.with_offsets = with_offsets
self.normalization_form, self.preserve_unused_token) super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form,
self.preserve_unused_token, self.with_offsets)
class BertTokenizer(cde.BertTokenizerOp): class BertTokenizer(cde.BertTokenizerOp):
...@@ -389,11 +516,33 @@ if platform.system().lower() != 'windows': ...@@ -389,11 +516,33 @@ if platform.system().lower() != 'windows':
only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE'). only effective when 'lower_case' is False. See NormalizeUTF8 for details(default='NONE').
preserve_unused_token(bool, optional): If True, do not split special tokens like preserve_unused_token(bool, optional): If True, do not split special tokens like
'[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True). '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]'(default=True).
with_offsets (bool, optional): Whether or not to output the offsets of tokens (default=False).
Examples:
>>> # If with_offsets=False, the default output is one column {["text", dtype=str]}
>>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
>>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
>>> with_offsets=False)
>>> dataset = dataset.map(operations=tokenizer_op)
>>> # If with_offsets=True, the output is three columns {["token", dtype=str],
>>> # ["offsets_start", dtype=uint32],
>>> # ["offsets_limit", dtype=uint32]}
>>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
>>> unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
>>> normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
>>> with_offsets=True)
>>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
>>> columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
""" """
def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, @check_bert_tokenizer
unknown_token='[UNK]', lower_case=False, keep_whitespace=False, def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
normalization_form=NormalizeForm.NONE, preserve_unused_token=True): lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
preserve_unused_token=True, with_offsets=False):
if not isinstance(normalization_form, NormalizeForm):
raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
self.vocab = vocab self.vocab = vocab
self.suffix_indicator = suffix_indicator self.suffix_indicator = suffix_indicator
self.max_bytes_per_token = max_bytes_per_token self.max_bytes_per_token = max_bytes_per_token
...@@ -402,8 +551,10 @@ if platform.system().lower() != 'windows': ...@@ -402,8 +551,10 @@ if platform.system().lower() != 'windows':
self.keep_whitespace = keep_whitespace self.keep_whitespace = keep_whitespace
self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form] self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
self.preserve_unused_token = preserve_unused_token self.preserve_unused_token = preserve_unused_token
self.with_offsets = with_offsets
super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token, super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
self.lower_case, self.keep_whitespace, self.normalization_form, self.preserve_unused_token) self.lower_case, self.keep_whitespace, self.normalization_form,
self.preserve_unused_token, self.with_offsets)
class TruncateSequencePair(cde.TruncateSequencePairOp): class TruncateSequencePair(cde.TruncateSequencePairOp):
......
...@@ -25,7 +25,6 @@ from mindspore._c_expression import typing ...@@ -25,7 +25,6 @@ from mindspore._c_expression import typing
from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, check_positive, \ from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, check_positive, \
INT32_MAX, check_value INT32_MAX, check_value
def check_unique_list_of_words(words, arg_name): def check_unique_list_of_words(words, arg_name):
"""Check that words is a list and each element is a str without any duplication""" """Check that words is a list and each element is a str without any duplication"""
...@@ -116,11 +115,22 @@ def check_from_dict(method): ...@@ -116,11 +115,22 @@ def check_from_dict(method):
def check_jieba_init(method): def check_jieba_init(method):
"""Wrapper method to check the parameters of jieba add word.""" """Wrapper method to check the parameters of jieba init."""
@wraps(method) @wraps(method)
def new_method(self, *args, **kwargs): def new_method(self, *args, **kwargs):
parse_user_args(method, *args, **kwargs) [hmm_path, mp_path, _, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if hmm_path is None:
raise ValueError("The dict of HMMSegment in cppjieba is not provided.")
if not isinstance(hmm_path, str):
raise TypeError("Wrong input type for hmm_path, should be string.")
if mp_path is None:
raise ValueError("The dict of MPSegment in cppjieba is not provided.")
if not isinstance(mp_path, str):
raise TypeError("Wrong input type for mp_path, should be string.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs) return method(self, *args, **kwargs)
return new_method return new_method
...@@ -152,6 +162,128 @@ def check_jieba_add_dict(method): ...@@ -152,6 +162,128 @@ def check_jieba_add_dict(method):
return new_method return new_method
def check_with_offsets(method):
"""Wrapper method to check if with_offsets is the only one parameter."""
@wraps(method)
def new_method(self, *args, **kwargs):
[with_offsets], _ = parse_user_args(method, *args, **kwargs)
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
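A minimal usage sketch of what check_with_offsets is meant to reject (hypothetical snippet; it assumes a MindSpore build that already contains this change):

import mindspore.dataset.text as text

try:
    # with_offsets must be a bool, so a string should be rejected by the validator above
    text.UnicodeCharTokenizer(with_offsets="yes")
except TypeError as err:
    print(err)  # Wrong input type for with_offsets, should be boolean.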
def check_unicode_script_tokenizer(method):
"""Wrapper method to check the parameter of UnicodeScriptTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[keep_whitespace, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if not isinstance(keep_whitespace, bool):
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_wordpiece_tokenizer(method):
"""Wrapper method to check the parameter of WordpieceTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets], _ =\
parse_user_args(method, *args, **kwargs)
if vocab is None:
raise ValueError("vocab is not provided.")
if not isinstance(vocab, cde.Vocab):
raise TypeError("Wrong input type for vocab, should be Vocab object.")
if not isinstance(suffix_indicator, str):
raise TypeError("Wrong input type for suffix_indicator, should be string.")
if not isinstance(unknown_token, str):
raise TypeError("Wrong input type for unknown_token, should be string.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
check_uint32(max_bytes_per_token)
return method(self, *args, **kwargs)
return new_method
def check_regex_tokenizer(method):
"""Wrapper method to check the parameter of RegexTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[delim_pattern, keep_delim_pattern, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if delim_pattern is None:
raise ValueError("delim_pattern is not provided.")
if not isinstance(delim_pattern, str):
raise TypeError("Wrong input type for delim_pattern, should be string.")
if not isinstance(keep_delim_pattern, str):
raise TypeError("Wrong input type for keep_delim_pattern, should be string.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_basic_tokenizer(method):
"""Wrapper method to check the parameter of RegexTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[lower_case, keep_whitespace, _, preserve_unused, with_offsets], _ =\
parse_user_args(method, *args, **kwargs)
if not isinstance(lower_case, bool):
raise TypeError("Wrong input type for lower_case, should be boolean.")
if not isinstance(keep_whitespace, bool):
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
if not isinstance(preserve_unused, bool):
raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_bert_tokenizer(method):
"""Wrapper method to check the parameter of BertTokenizer."""
@wraps(method)
def new_method(self, *args, **kwargs):
[vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, _,
preserve_unused_token, with_offsets], _ = parse_user_args(method, *args, **kwargs)
if vocab is None:
raise ValueError("vacab is not provided.")
if not isinstance(vocab, cde.Vocab):
raise TypeError("Wrong input type for vocab, should be Vocab object.")
if not isinstance(suffix_indicator, str):
raise TypeError("Wrong input type for suffix_indicator, should be string.")
if not isinstance(max_bytes_per_token, int):
raise TypeError("Wrong input type for max_bytes_per_token, should be int.")
check_uint32(max_bytes_per_token)
if not isinstance(unknown_token, str):
raise TypeError("Wrong input type for unknown_token, should be string.")
if not isinstance(lower_case, bool):
raise TypeError("Wrong input type for lower_case, should be boolean.")
if not isinstance(keep_whitespace, bool):
raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
if not isinstance(preserve_unused_token, bool):
raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
if not isinstance(with_offsets, bool):
raise TypeError("Wrong input type for with_offsets, should be boolean.")
return method(self, *args, **kwargs)
return new_method
def check_from_dataset(method): def check_from_dataset(method):
"""A wrapper that wrap a parameter checker to the original function.""" """A wrapper that wrap a parameter checker to the original function."""
......
...@@ -39,21 +39,22 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opFuntions) { ...@@ -39,21 +39,22 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opFuntions) {
std::string dataset_path = datasets_root_path_ + "/jiebadict"; std::string dataset_path = datasets_root_path_ + "/jiebadict";
std::string hmm_path = dataset_path + "/hmm_model.utf8"; std::string hmm_path = dataset_path + "/hmm_model.utf8";
std::string mp_path = dataset_path + "/jieba.dict.utf8"; std::string mp_path = dataset_path + "/jieba.dict.utf8";
std::shared_ptr<Tensor> output_tensor; TensorRow input, output;
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path)); std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("今天天气太好了我们一起去外面玩吧"); std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("今天天气太好了我们一起去外面玩吧");
Status s = op->Compute(input_tensor, &output_tensor); input.push_back(input_tensor);
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output_tensor->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
EXPECT_EQ(output_tensor->Size(), 7); EXPECT_EQ(output[0]->Size(), 7);
CheckEqual(output_tensor, {0}, "今天天气"); CheckEqual(output[0], {0}, "今天天气");
CheckEqual(output_tensor, {1}, "太好了"); CheckEqual(output[0], {1}, "太好了");
CheckEqual(output_tensor, {2}, "我们"); CheckEqual(output[0], {2}, "我们");
CheckEqual(output_tensor, {3}, "一起"); CheckEqual(output[0], {3}, "一起");
CheckEqual(output_tensor, {4}, "去"); CheckEqual(output[0], {4}, "去");
CheckEqual(output_tensor, {5}, "外面"); CheckEqual(output[0], {5}, "外面");
CheckEqual(output_tensor, {6}, "玩吧"); CheckEqual(output[0], {6}, "玩吧");
} }
TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) { TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
...@@ -61,16 +62,17 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) { ...@@ -61,16 +62,17 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
std::string dataset_path = datasets_root_path_ + "/jiebadict"; std::string dataset_path = datasets_root_path_ + "/jiebadict";
std::string hmm_path = dataset_path + "/hmm_model.utf8"; std::string hmm_path = dataset_path + "/hmm_model.utf8";
std::string mp_path = dataset_path + "/jieba.dict.utf8"; std::string mp_path = dataset_path + "/jieba.dict.utf8";
std::shared_ptr<Tensor> output_tensor; TensorRow input, output;
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path)); std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
op->AddWord("男默女泪"); op->AddWord("男默女泪");
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("男默女泪"); std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("男默女泪");
Status s = op->Compute(input_tensor, &output_tensor); input.push_back(input_tensor);
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output_tensor->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
EXPECT_EQ(output_tensor->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
CheckEqual(output_tensor, {0}, "男默女泪"); CheckEqual(output[0], {0}, "男默女泪");
} }
TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) { TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
...@@ -78,14 +80,15 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) { ...@@ -78,14 +80,15 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
std::string dataset_path = datasets_root_path_ + "/jiebadict"; std::string dataset_path = datasets_root_path_ + "/jiebadict";
std::string hmm_path = dataset_path + "/hmm_model.utf8"; std::string hmm_path = dataset_path + "/hmm_model.utf8";
std::string mp_path = dataset_path + "/jieba.dict.utf8"; std::string mp_path = dataset_path + "/jieba.dict.utf8";
std::shared_ptr<Tensor> output_tensor; TensorRow input, output;
std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path)); std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));
op->AddWord("男默女泪"); op->AddWord("男默女泪");
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(""); std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("");
Status s = op->Compute(input_tensor, &output_tensor); input.push_back(input_tensor);
Status s = op->Compute(input, &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output_tensor->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
EXPECT_EQ(output_tensor->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
CheckEqual(output_tensor, {0}, ""); CheckEqual(output[0], {0}, "");
} }
\ No newline at end of file
...@@ -45,227 +45,245 @@ class MindDataTestTokenizerOp : public UT::Common { ...@@ -45,227 +45,245 @@ class MindDataTestTokenizerOp : public UT::Common {
TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) { TEST_F(MindDataTestTokenizerOp, TestUnicodeCharTokenizerOp) {
MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp."; MS_LOG(INFO) << "Doing TestUnicodeCharTokenizerOp.";
std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp()); std::unique_ptr<UnicodeCharTokenizerOp> op(new UnicodeCharTokenizerOp(true));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Hello World!"); std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Hello World!");
std::shared_ptr<Tensor> output; TensorRow output;
Status s = op->Compute(input, &output); Status s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 12); EXPECT_EQ(output[0]->Size(), 12);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor1: " << output->ToString(); MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
CheckEqual(output, {0}, "H"); CheckEqual(output[0], {0}, "H");
CheckEqual(output, {1}, "e"); CheckEqual(output[0], {1}, "e");
CheckEqual(output, {2}, "l"); CheckEqual(output[0], {2}, "l");
CheckEqual(output, {3}, "l"); CheckEqual(output[0], {3}, "l");
CheckEqual(output, {4}, "o"); CheckEqual(output[0], {4}, "o");
CheckEqual(output, {5}, " "); CheckEqual(output[0], {5}, " ");
CheckEqual(output, {6}, "W"); CheckEqual(output[0], {6}, "W");
CheckEqual(output, {7}, "o"); CheckEqual(output[0], {7}, "o");
CheckEqual(output, {8}, "r"); CheckEqual(output[0], {8}, "r");
CheckEqual(output, {9}, "l"); CheckEqual(output[0], {9}, "l");
CheckEqual(output, {10}, "d"); CheckEqual(output[0], {10}, "d");
CheckEqual(output, {11}, "!"); CheckEqual(output[0], {11}, "!");
input = std::make_shared<Tensor>("中国 你好!"); input = std::make_shared<Tensor>("中国 你好!");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 6); EXPECT_EQ(output[0]->Size(), 6);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor2: " << output->ToString(); MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
CheckEqual(output, {0}, "中"); CheckEqual(output[0], {0}, "中");
CheckEqual(output, {1}, "国"); CheckEqual(output[0], {1}, "国");
CheckEqual(output, {2}, " "); CheckEqual(output[0], {2}, " ");
CheckEqual(output, {3}, "你"); CheckEqual(output[0], {3}, "你");
CheckEqual(output, {4}, "好"); CheckEqual(output[0], {4}, "好");
CheckEqual(output, {5}, "!"); CheckEqual(output[0], {5}, "!");
input = std::make_shared<Tensor>("中"); input = std::make_shared<Tensor>("中");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor3: " << output->ToString(); MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
CheckEqual(output, {0}, "中"); CheckEqual(output[0], {0}, "中");
input = std::make_shared<Tensor>("H"); input = std::make_shared<Tensor>("H");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor4: " << output->ToString(); MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
CheckEqual(output, {0}, "H"); CheckEqual(output[0], {0}, "H");
input = std::make_shared<Tensor>(" "); input = std::make_shared<Tensor>(" ");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 2); EXPECT_EQ(output[0]->Size(), 2);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor5: " << output->ToString(); MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
CheckEqual(output, {0}, " "); CheckEqual(output[0], {0}, " ");
CheckEqual(output, {1}, " "); CheckEqual(output[0], {1}, " ");
input = std::make_shared<Tensor>(""); input = std::make_shared<Tensor>("");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor6: " << output->ToString(); MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
CheckEqual(output, {0}, ""); CheckEqual(output[0], {0}, "");
} }
TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) { TEST_F(MindDataTestTokenizerOp, TestWhitespaceTokenizerOp) {
MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp."; MS_LOG(INFO) << "Doing TestWhitespaceTokenizerOp.";
std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp()); std::unique_ptr<WhitespaceTokenizerOp> op(new WhitespaceTokenizerOp(true));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China."); std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China.");
std::shared_ptr<Tensor> output; TensorRow output;
Status s = op->Compute(input, &output); Status s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 3); EXPECT_EQ(output[0]->Size(), 3);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor1: " << output->ToString(); MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
CheckEqual(output, {0}, "Welcome"); CheckEqual(output[0], {0}, "Welcome");
CheckEqual(output, {1}, "to"); CheckEqual(output[0], {1}, "to");
CheckEqual(output, {2}, "China."); CheckEqual(output[0], {2}, "China.");
input = std::make_shared<Tensor>(" hello"); input = std::make_shared<Tensor>(" hello");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor2: " << output->ToString(); MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
CheckEqual(output, {0}, "hello"); CheckEqual(output[0], {0}, "hello");
input = std::make_shared<Tensor>("hello"); input = std::make_shared<Tensor>("hello");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor3: " << output->ToString(); MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
CheckEqual(output, {0}, "hello"); CheckEqual(output[0], {0}, "hello");
input = std::make_shared<Tensor>("hello "); input = std::make_shared<Tensor>("hello ");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor4: " << output->ToString(); MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
CheckEqual(output, {0}, "hello"); CheckEqual(output[0], {0}, "hello");
input = std::make_shared<Tensor>(" "); input = std::make_shared<Tensor>(" ");
s = op->Compute(input, &output); output.clear();
s = op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor5: " << output->ToString(); MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
CheckEqual(output, {0}, ""); CheckEqual(output[0], {0}, "");
} }
TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) { TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer."; MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true)); std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true));
std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false)); std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京"); std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output; TensorRow output;
Status s = keep_whitespace_op->Compute(input, &output); Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 10); EXPECT_EQ(output[0]->Size(), 10);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor1: " << output->ToString(); MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
CheckEqual(output, {0}, "Welcome"); CheckEqual(output[0], {0}, "Welcome");
CheckEqual(output, {1}, " "); CheckEqual(output[0], {1}, " ");
CheckEqual(output, {2}, "to"); CheckEqual(output[0], {2}, "to");
CheckEqual(output, {3}, " "); CheckEqual(output[0], {3}, " ");
CheckEqual(output, {4}, "China"); CheckEqual(output[0], {4}, "China");
CheckEqual(output, {5}, "."); CheckEqual(output[0], {5}, ".");
CheckEqual(output, {6}, " \n "); CheckEqual(output[0], {6}, " \n ");
CheckEqual(output, {7}, "中国"); CheckEqual(output[0], {7}, "中国");
CheckEqual(output, {8}, "\t"); CheckEqual(output[0], {8}, "\t");
CheckEqual(output, {9}, "北京"); CheckEqual(output[0], {9}, "北京");
s = skip_whitespace_op->Compute(input, &output); output.clear();
s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 6); EXPECT_EQ(output[0]->Size(), 6);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor2: " << output->ToString(); MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
CheckEqual(output, {0}, "Welcome"); CheckEqual(output[0], {0}, "Welcome");
CheckEqual(output, {1}, "to"); CheckEqual(output[0], {1}, "to");
CheckEqual(output, {2}, "China"); CheckEqual(output[0], {2}, "China");
CheckEqual(output, {3}, "."); CheckEqual(output[0], {3}, ".");
CheckEqual(output, {4}, "中国"); CheckEqual(output[0], {4}, "中国");
CheckEqual(output, {5}, "北京"); CheckEqual(output[0], {5}, "北京");
input = std::make_shared<Tensor>(" Welcome to 中国. "); input = std::make_shared<Tensor>(" Welcome to 中国. ");
s = skip_whitespace_op->Compute(input, &output); output.clear();
s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 4); EXPECT_EQ(output[0]->Size(), 4);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor3: " << output->ToString(); MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
CheckEqual(output, {0}, "Welcome"); CheckEqual(output[0], {0}, "Welcome");
CheckEqual(output, {1}, "to"); CheckEqual(output[0], {1}, "to");
CheckEqual(output, {2}, "中国"); CheckEqual(output[0], {2}, "中国");
CheckEqual(output, {3}, "."); CheckEqual(output[0], {3}, ".");
s = keep_whitespace_op->Compute(input, &output); output.clear();
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 8); EXPECT_EQ(output[0]->Size(), 8);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor4: " << output->ToString(); MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
CheckEqual(output, {0}, " "); CheckEqual(output[0], {0}, " ");
CheckEqual(output, {1}, "Welcome"); CheckEqual(output[0], {1}, "Welcome");
CheckEqual(output, {2}, " "); CheckEqual(output[0], {2}, " ");
CheckEqual(output, {3}, "to"); CheckEqual(output[0], {3}, "to");
CheckEqual(output, {4}, " "); CheckEqual(output[0], {4}, " ");
CheckEqual(output, {5}, "中国"); CheckEqual(output[0], {5}, "中国");
CheckEqual(output, {6}, "."); CheckEqual(output[0], {6}, ".");
CheckEqual(output, {7}, " "); CheckEqual(output[0], {7}, " ");
input = std::make_shared<Tensor>("Hello"); input = std::make_shared<Tensor>("Hello");
s = keep_whitespace_op->Compute(input, &output); output.clear();
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor5: " << output->ToString(); MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
CheckEqual(output, {0}, "Hello"); CheckEqual(output[0], {0}, "Hello");
input = std::make_shared<Tensor>("H"); input = std::make_shared<Tensor>("H");
s = keep_whitespace_op->Compute(input, &output); output.clear();
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor6: " << output->ToString(); MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
CheckEqual(output, {0}, "H"); CheckEqual(output[0], {0}, "H");
input = std::make_shared<Tensor>(""); input = std::make_shared<Tensor>("");
s = keep_whitespace_op->Compute(input, &output); output.clear();
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor7: " << output->ToString(); MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString();
CheckEqual(output, {0}, ""); CheckEqual(output[0], {0}, "");
input = std::make_shared<Tensor>("Hello中国Hello世界"); input = std::make_shared<Tensor>("Hello中国Hello世界");
s = keep_whitespace_op->Compute(input, &output); EXPECT_TRUE(s.IsOk()); output.clear();
EXPECT_EQ(output->Size(), 4); s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Size(), 4);
MS_LOG(INFO) << "Out tensor8: " << output->ToString(); EXPECT_EQ(output[0]->Rank(), 1);
CheckEqual(output, {0}, "Hello"); MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString();
CheckEqual(output, {1}, "中国"); CheckEqual(output[0], {0}, "Hello");
CheckEqual(output, {2}, "Hello"); CheckEqual(output[0], {1}, "中国");
CheckEqual(output, {3}, "世界"); CheckEqual(output[0], {2}, "Hello");
CheckEqual(output[0], {3}, "世界");
input = std::make_shared<Tensor>(" "); input = std::make_shared<Tensor>(" ");
s = keep_whitespace_op->Compute(input, &output); output.clear();
s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor10: " << output->ToString(); MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString();
CheckEqual(output, {0}, " "); CheckEqual(output[0], {0}, " ");
input = std::make_shared<Tensor>(" "); input = std::make_shared<Tensor>(" ");
s = skip_whitespace_op->Compute(input, &output); output.clear();
s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
EXPECT_EQ(output->Size(), 1); EXPECT_EQ(output[0]->Size(), 1);
EXPECT_EQ(output->Rank(), 1); EXPECT_EQ(output[0]->Rank(), 1);
MS_LOG(INFO) << "Out tensor11: " << output->ToString(); MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString();
CheckEqual(output, {0}, ""); CheckEqual(output[0], {0}, "");
} }
TEST_F(MindDataTestTokenizerOp, TestCaseFold) { TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
...@@ -321,10 +339,10 @@ TEST_F(MindDataTestTokenizerOp, TestRegexReplace) { ...@@ -321,10 +339,10 @@ TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) { TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
MS_LOG(INFO) << "Doing TestRegexTokenizerOp."; MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "")); std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京"); std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n 中国\t北京");
std::shared_ptr<Tensor> output; TensorRow output;
Status s = regex_tokenizer_op->Compute(input, &output); Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
} }
...@@ -332,9 +350,10 @@ TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) { ...@@ -332,9 +350,10 @@ TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
MS_LOG(INFO) << "Doing TestBasicTokenizer."; MS_LOG(INFO) << "Doing TestBasicTokenizer.";
//bool lower_case, bool keep_whitespace, //bool lower_case, bool keep_whitespace,
// NormalizeForm normalization_form, bool preserve_unused_token // NormalizeForm normalization_form, bool preserve_unused_token
std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false)); std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false,
true));
std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京"); std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京");
std::shared_ptr<Tensor> output; TensorRow output;
Status s = basic_tokenizer->Compute(input, &output); Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output);
EXPECT_TRUE(s.IsOk()); EXPECT_TRUE(s.IsOk());
} }
\ No newline at end of file
...@@ -18,7 +18,7 @@ Testing BasicTokenizer op in DE ...@@ -18,7 +18,7 @@ Testing BasicTokenizer op in DE
import numpy as np import numpy as np
import mindspore.dataset as ds import mindspore.dataset as ds
from mindspore import log as logger from mindspore import log as logger
import mindspore.dataset.text as nlp import mindspore.dataset.text as text
BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt" BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
...@@ -37,47 +37,102 @@ test_paras = [ ...@@ -37,47 +37,102 @@ test_paras = [
'(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封', '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
'建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'], '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는', ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
'중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']] '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']],
expected_offsets_start=[[0, 8, 11, 18, 21, 24, 27, 30],
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42],
[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37],
[0, 3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49,
52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100],
[0, 3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51,
54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115],
[0, 10, 11, 15, 16, 20, 21, 25, 35, 36, 40, 41, 45, 46, 50, 57, 64, 74, 87, 97, 101]],
expected_offsets_limit=[[7, 10, 18, 21, 24, 27, 30, 33],
[3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45],
[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40],
[3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58,
61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 103],
[3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54,
57, 60, 63, 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115, 124],
[9, 11, 15, 16, 20, 21, 24, 34, 36, 40, 41, 45, 46, 49, 56, 63, 73, 86, 96, 100, 113]]
), ),
dict( dict(
first=7, first=7,
last=7, last=7,
expected_tokens=[['this', 'is', 'a', 'funky', 'string']], expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
expected_offsets_start=[[0, 5, 8, 10, 16]],
expected_offsets_limit=[[4, 7, 9, 15, 22]],
lower_case=True lower_case=True
), ),
] ]
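The expected_offsets_start/expected_offsets_limit values above are UTF-8 byte spans into the source line. A standalone helper for recomputing such spans (a sketch only; it assumes, for illustration, that line 7 of basic_tokenizer.txt is the plain lower-case sentence 'this is a funky string' — the data file itself is not part of this hunk):

def byte_spans(sentence, tokens):
    data = sentence.encode('utf-8')
    spans, pos = [], 0
    for tok in tokens:
        piece = tok.encode('utf-8')
        pos = data.find(piece, pos)  # next occurrence of the token, in byte positions
        spans.append((pos, pos + len(piece)))
        pos += len(piece)
    return spans

print(byte_spans("this is a funky string", ['this', 'is', 'a', 'funky', 'string']))
# [(0, 4), (5, 7), (8, 9), (10, 15), (16, 22)]  -> matches the last test case above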
def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False, def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False): lower_case=False, keep_whitespace=False,
normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False) dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
if first > 1: if first > 1:
dataset = dataset.skip(first - 1) dataset = dataset.skip(first - 1)
if last >= first: if last >= first:
dataset = dataset.take(last - first + 1) dataset = dataset.take(last - first + 1)
basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case, basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
keep_whitespace=keep_whitespace, keep_whitespace=keep_whitespace,
normalization_form=normalization_form, normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token) preserve_unused_token=preserve_unused_token)
dataset = dataset.map(operations=basic_tokenizer) dataset = dataset.map(operations=basic_tokenizer)
count = 0 count = 0
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']) token = text.to_str(i['text'])
logger.info("Out:", text) logger.info("Out:", token)
logger.info("Exp:", expected_tokens[count]) logger.info("Exp:", expected_tokens[count])
np.testing.assert_array_equal(text, expected_tokens[count]) np.testing.assert_array_equal(token, expected_tokens[count])
count = count + 1 count = count + 1
def test_basic_tokenizer(): def check_basic_tokenizer_with_offsets(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
lower_case=False, keep_whitespace=False,
normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
keep_whitespace=keep_whitespace,
normalization_form=normalization_form,
preserve_unused_token=preserve_unused_token,
with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=basic_tokenizer)
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token'])
logger.info("Out:", token)
logger.info("Exp:", expected_tokens[count])
np.testing.assert_array_equal(token, expected_tokens[count])
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count = count + 1
def test_basic_tokenizer_with_offsets():
"""
Test BasicTokenizer with with_offsets=True
"""
for paras in test_paras:
check_basic_tokenizer_with_offsets(**paras)
def test_basic_tokenizer_default():
""" """
Test BasicTokenizer with the default with_offsets=False
""" """
for paras in test_paras: for paras in test_paras:
check_basic_tokenizer(**paras) check_basic_tokenizer_default(**paras)
if __name__ == '__main__': if __name__ == '__main__':
test_basic_tokenizer() test_basic_tokenizer_default()
test_basic_tokenizer_with_offsets()
...@@ -18,7 +18,7 @@ Testing BertTokenizer op in DE ...@@ -18,7 +18,7 @@ Testing BertTokenizer op in DE
import numpy as np import numpy as np
import mindspore.dataset as ds import mindspore.dataset as ds
from mindspore import log as logger from mindspore import log as logger
import mindspore.dataset.text as nlp import mindspore.dataset.text as text
BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt" BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"
...@@ -39,6 +39,14 @@ test_paras = [ ...@@ -39,6 +39,14 @@ test_paras = [
['疑', '是', '地', '上', '霜'], ['疑', '是', '地', '上', '霜'],
['举', '头', '望', '明', '月'], ['举', '头', '望', '明', '月'],
['低', '头', '思', '故', '乡']], ['低', '头', '思', '故', '乡']],
expected_offsets_start=[[0, 3, 6, 9, 12],
[0, 3, 6, 9, 12],
[0, 3, 6, 9, 12],
[0, 3, 6, 9, 12]],
expected_offsets_limit=[[3, 6, 9, 12, 15],
[3, 6, 9, 12, 15],
[3, 6, 9, 12, 15],
[3, 6, 9, 12, 15]],
vocab_list=vocab_bert vocab_list=vocab_bert
), ),
# test english text # test english text
...@@ -46,6 +54,8 @@ test_paras = [ ...@@ -46,6 +54,8 @@ test_paras = [
first=5, first=5,
last=5, last=5,
expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']], expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
lower_case=True, lower_case=True,
vocab_list=vocab_bert vocab_list=vocab_bert
), ),
...@@ -53,6 +63,8 @@ test_paras = [ ...@@ -53,6 +63,8 @@ test_paras = [
first=5, first=5,
last=5, last=5,
expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']], expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
lower_case=False, lower_case=False,
vocab_list=vocab_bert vocab_list=vocab_bert
), ),
...@@ -63,7 +75,9 @@ test_paras = [ ...@@ -63,7 +75,9 @@ test_paras = [
expect_str=[ expect_str=[
['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'], ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
['繁', '體', '字']], ['繁', '體', '字']],
normalization_form=nlp.utils.NormalizeForm.NFKC, expected_offsets_start=[[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], [0, 3, 6]],
expected_offsets_limit=[[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], [3, 6, 9]],
normalization_form=text.utils.NormalizeForm.NFKC,
vocab_list=vocab_bert vocab_list=vocab_bert
), ),
# test preserved tokens # test preserved tokens
...@@ -79,6 +93,8 @@ test_paras = [ ...@@ -79,6 +93,8 @@ test_paras = [
['[unused1]'], ['[unused1]'],
['[unused10]'] ['[unused10]']
], ],
expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
lower_case=False, lower_case=False,
vocab_list=vocab_bert, vocab_list=vocab_bert,
preserve_unused_token=True, preserve_unused_token=True,
...@@ -95,6 +111,8 @@ test_paras = [ ...@@ -95,6 +111,8 @@ test_paras = [
['[unused1]'], ['[unused1]'],
['[unused10]'] ['[unused10]']
], ],
expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
lower_case=True, lower_case=True,
vocab_list=vocab_bert, vocab_list=vocab_bert,
preserve_unused_token=True, preserve_unused_token=True,
...@@ -104,6 +122,8 @@ test_paras = [ ...@@ -104,6 +122,8 @@ test_paras = [
first=15, first=15,
last=15, last=15,
expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']], expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
expected_offsets_start=[[0, 2, 3, 4, 5, 7, 8, 10, 11, 12]],
expected_offsets_limit=[[2, 3, 4, 5, 7, 8, 10, 11, 12, 14]],
preserve_unused_token=True, preserve_unused_token=True,
vocab_list=vocab_bert vocab_list=vocab_bert
), ),
...@@ -112,6 +132,8 @@ test_paras = [ ...@@ -112,6 +132,8 @@ test_paras = [
first=8, first=8,
last=8, last=8,
expect_str=[['[UNK]', ' ', '[CLS]']], expect_str=[['[UNK]', ' ', '[CLS]']],
expected_offsets_start=[[0, 6, 7]],
expected_offsets_limit=[[6, 7, 12]],
lower_case=False, lower_case=False,
vocab_list=vocab_bert, vocab_list=vocab_bert,
preserve_unused_token=True, preserve_unused_token=True,
...@@ -121,6 +143,8 @@ test_paras = [ ...@@ -121,6 +143,8 @@ test_paras = [
first=8, first=8,
last=8, last=8,
expect_str=[['unused', ' ', '[CLS]']], expect_str=[['unused', ' ', '[CLS]']],
expected_offsets_start=[[0, 6, 7]],
expected_offsets_limit=[[6, 7, 12]],
lower_case=False, lower_case=False,
vocab_list=vocab_bert, vocab_list=vocab_bert,
preserve_unused_token=True, preserve_unused_token=True,
...@@ -131,6 +155,8 @@ test_paras = [ ...@@ -131,6 +155,8 @@ test_paras = [
first=8, first=8,
last=8, last=8,
expect_str=[['unused', ' ', '[', 'CLS', ']']], expect_str=[['unused', ' ', '[', 'CLS', ']']],
expected_offsets_start=[[0, 6, 7, 8, 11]],
expected_offsets_limit=[[6, 7, 8, 11, 12]],
lower_case=False, lower_case=False,
vocab_list=vocab_bert, vocab_list=vocab_bert,
preserve_unused_token=False, preserve_unused_token=False,
...@@ -140,20 +166,20 @@ test_paras = [ ...@@ -140,20 +166,20 @@ test_paras = [
] ]
def check_bert_tokenizer(first, last, expect_str, def check_bert_tokenizer_default(first, last, expect_str,
vocab_list, expected_offsets_start, expected_offsets_limit,
suffix_indicator='##', vocab_list, suffix_indicator='##',
max_bytes_per_token=100, unknown_token='[UNK]', max_bytes_per_token=100, unknown_token='[UNK]',
lower_case=False, keep_whitespace=False, lower_case=False, keep_whitespace=False,
normalization_form=nlp.utils.NormalizeForm.NONE, normalization_form=text.utils.NormalizeForm.NONE,
preserve_unused_token=False): preserve_unused_token=False):
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False) dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
if first > 1: if first > 1:
dataset = dataset.skip(first - 1) dataset = dataset.skip(first - 1)
if last >= first: if last >= first:
dataset = dataset.take(last - first + 1) dataset = dataset.take(last - first + 1)
vocab = nlp.Vocab.from_list(vocab_list) vocab = text.Vocab.from_list(vocab_list)
tokenizer_op = nlp.BertTokenizer( tokenizer_op = text.BertTokenizer(
vocab=vocab, suffix_indicator=suffix_indicator, vocab=vocab, suffix_indicator=suffix_indicator,
max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token, max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
lower_case=lower_case, keep_whitespace=keep_whitespace, lower_case=lower_case, keep_whitespace=keep_whitespace,
...@@ -162,20 +188,59 @@ def check_bert_tokenizer(first, last, expect_str, ...@@ -162,20 +188,59 @@ def check_bert_tokenizer(first, last, expect_str,
dataset = dataset.map(operations=tokenizer_op) dataset = dataset.map(operations=tokenizer_op)
count = 0 count = 0
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']) token = text.to_str(i['text'])
logger.info("Out:", text) logger.info("Out:", token)
logger.info("Exp:", expect_str[count]) logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(text, expect_str[count]) np.testing.assert_array_equal(token, expect_str[count])
count = count + 1 count = count + 1
def test_bert_tokenizer(): def check_bert_tokenizer_with_offsets(first, last, expect_str,
expected_offsets_start, expected_offsets_limit,
vocab_list, suffix_indicator='##',
max_bytes_per_token=100, unknown_token='[UNK]',
lower_case=False, keep_whitespace=False,
normalization_form=text.utils.NormalizeForm.NONE,
preserve_unused_token=False):
dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = text.Vocab.from_list(vocab_list)
tokenizer_op = text.BertTokenizer(
vocab=vocab, suffix_indicator=suffix_indicator, max_bytes_per_token=max_bytes_per_token,
unknown_token=unknown_token, lower_case=lower_case, keep_whitespace=keep_whitespace,
normalization_form=normalization_form, preserve_unused_token=preserve_unused_token, with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token'])
logger.info("Out:", token)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(token, expect_str[count])
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count = count + 1
def test_bert_tokenizer_default():
"""
Test BertTokenizer when with_offsets=False
"""
for paras in test_paras:
check_bert_tokenizer_default(**paras)
def test_bert_tokenizer_with_offsets():
""" """
Test BertTokenizer when with_offsets=True
""" """
for paras in test_paras: for paras in test_paras:
check_bert_tokenizer(**paras) check_bert_tokenizer_with_offsets(**paras)
if __name__ == '__main__': if __name__ == '__main__':
test_bert_tokenizer() test_bert_tokenizer_default()
test_bert_tokenizer_with_offsets()
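One detail visible in the expectations above: for subword pieces such as 'mak', '##ing', the offsets of a continuation piece cover only its surface characters in the raw sentence (here 'ing' at bytes [8, 11)), while the '##' suffix indicator occupies no input bytes. A small sanity check of that property for the regular (non-[UNK]) pieces, written as an illustrative helper rather than code from this commit:

def piece_width(piece, suffix_indicator="##"):
    """UTF-8 byte width a wordpiece occupies in the raw text; '##' itself is not in the input."""
    surface = piece[len(suffix_indicator):] if piece.startswith(suffix_indicator) else piece
    return len(surface.encode("utf-8"))

# 'mak' spans [5, 8) and '##ing' spans [8, 11) in the English sample above
assert piece_width("mak") == 3 and piece_width("##ing") == 3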
...@@ -197,6 +197,229 @@ def test_jieba_5(): ...@@ -197,6 +197,229 @@ def test_jieba_5():
assert item == expect[index] assert item == expect[index]
def test_jieba_with_offsets_1():
"""Test jieba tokenizer with MP mode"""
data = ds.TextFileDataset(DATA_FILE)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
ret = []
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_1_1():
"""Test jieba tokenizer with HMM mode"""
data = ds.TextFileDataset(DATA_FILE)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_1_2():
"""Test jieba tokenizer with HMM MIX"""
data = ds.TextFileDataset(DATA_FILE)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_2():
"""Test add_word"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
jieba_op.add_word("男默女泪")
expect = ['男默女泪', '市', '长江大桥']
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=2)
expected_offsets_start = [0, 12, 15]
expected_offsets_limit = [12, 15, 27]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_2_1():
"""Test add_word with freq"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
jieba_op.add_word("男默女泪", 10)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=2)
expect = ['男默女泪', '市', '长江大桥']
expected_offsets_start = [0, 12, 15]
expected_offsets_limit = [12, 15, 27]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_2_2():
"""Test add_word with freq, the value of freq affects the result of segmentation"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
jieba_op.add_word("江大桥", 20000)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=2)
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_3():
"""Test add_dict with dict"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
user_dict = {
"男默女泪": 10
}
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
jieba_op.add_dict(user_dict)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=1)
expect = ['男默女泪', '市', '长江大桥']
expected_offsets_start = [0, 12, 15]
expected_offsets_limit = [12, 15, 27]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_3_1():
"""Test add_dict with dict"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
user_dict = {
"男默女泪": 10,
"江大桥": 20000
}
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
jieba_op.add_dict(user_dict)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=1)
expect = ['男默女泪', '市长', '江大桥']
expected_offsets_start = [0, 12, 18]
expected_offsets_limit = [12, 18, 27]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_4():
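"""Test add_dict with a dict file path"""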
DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
jieba_op.add_dict(DICT_FILE)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=1)
expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_5():
"""Test add dict with file path"""
DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
data = ds.TextFileDataset(DATA_FILE4)
jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
jieba_op.add_word("江大桥", 20000)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
columns_order=["token", "offsets_start", "offsets_limit"],
operations=jieba_op, num_parallel_workers=1)
expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
for i in data.create_dict_iterator():
ret = to_str(i["token"])
for index, item in enumerate(ret):
assert item == expect[index]
for index, item in enumerate(i["offsets_start"]):
assert item == expected_offsets_start[index]
for index, item in enumerate(i["offsets_limit"]):
assert item == expected_offsets_limit[index]
def gen(): def gen():
text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S') text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
yield (text,) yield (text,)
...@@ -236,3 +459,13 @@ if __name__ == "__main__": ...@@ -236,3 +459,13 @@ if __name__ == "__main__":
test_jieba_5() test_jieba_5()
test_jieba_5() test_jieba_5()
test_jieba_6() test_jieba_6()
test_jieba_with_offsets_1()
test_jieba_with_offsets_1_1()
test_jieba_with_offsets_1_2()
test_jieba_with_offsets_2()
test_jieba_with_offsets_2_1()
test_jieba_with_offsets_2_2()
test_jieba_with_offsets_3()
test_jieba_with_offsets_3_1()
test_jieba_with_offsets_4()
test_jieba_with_offsets_5()
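Since every CJK character in these sentences is 3 bytes in UTF-8, each word's limit minus its start equals 3 times its character count, and consecutive words are contiguous (one word's limit is the next word's start). A quick, hypothetical consistency check over the first expectation above:

def check_cjk_offsets(words, starts, limits):
    """Offsets are UTF-8 byte positions: span width == byte length of the word."""
    for word, start, limit in zip(words, starts, limits):
        assert limit - start == len(word.encode("utf-8"))

check_cjk_offsets(['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧'],
                  [0, 12, 21, 27, 33, 36, 42],
                  [12, 21, 27, 33, 36, 42, 48])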
...@@ -18,7 +18,7 @@ Testing UnicodeCharTokenizer op in DE ...@@ -18,7 +18,7 @@ Testing UnicodeCharTokenizer op in DE
import numpy as np import numpy as np
import mindspore.dataset as ds import mindspore.dataset as ds
from mindspore import log as logger from mindspore import log as logger
import mindspore.dataset.text as nlp import mindspore.dataset.text as text
DATA_FILE = "../data/dataset/testTokenizerData/1.txt" DATA_FILE = "../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt" NORMALIZE_FILE = "../data/dataset/testTokenizerData/normalize.txt"
...@@ -36,23 +36,48 @@ def split_by_unicode_char(input_strs): ...@@ -36,23 +36,48 @@ def split_by_unicode_char(input_strs):
return out return out
def test_unicode_char_tokenizer(): def test_unicode_char_tokenizer_default():
""" """
Test UnicodeCharTokenizer Test UnicodeCharTokenizer
""" """
input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ") input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ")
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.UnicodeCharTokenizer() tokenizer = text.UnicodeCharTokenizer()
dataset = dataset.map(operations=tokenizer) dataset = dataset.map(operations=tokenizer)
tokens = [] tokens = []
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist() token = text.to_str(i['text']).tolist()
tokens.append(text) tokens.append(token)
logger.info("The out tokens is : {}".format(tokens)) logger.info("The out tokens is : {}".format(tokens))
assert split_by_unicode_char(input_strs) == tokens assert split_by_unicode_char(input_strs) == tokens
def test_whitespace_tokenizer(): def test_unicode_char_tokenizer_with_offsets():
"""
Test UnicodeCharTokenizer with with_offsets=True
"""
input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", " ")
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
tokens = []
expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
[0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
[3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17], [1, 2]]
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token']).tolist()
tokens.append(token)
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count += 1
logger.info("The out tokens is : {}".format(tokens))
assert split_by_unicode_char(input_strs) == tokens
def test_whitespace_tokenizer_default():
""" """
Test WhitespaceTokenizer Test WhitespaceTokenizer
""" """
...@@ -61,17 +86,44 @@ def test_whitespace_tokenizer(): ...@@ -61,17 +86,44 @@ def test_whitespace_tokenizer():
["我喜欢English!"], ["我喜欢English!"],
[""]] [""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.WhitespaceTokenizer() tokenizer = text.WhitespaceTokenizer()
dataset = dataset.map(operations=tokenizer) dataset = dataset.map(operations=tokenizer)
tokens = [] tokens = []
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist() token = text.to_str(i['text']).tolist()
tokens.append(text) tokens.append(token)
logger.info("The out tokens is : {}".format(tokens)) logger.info("The out tokens is : {}".format(tokens))
assert whitespace_strs == tokens assert whitespace_strs == tokens
def test_unicode_script_tokenizer(): def test_whitespace_tokenizer_with_offsets():
"""
Test WhitespaceTokenizer with with_offsets=True
"""
whitespace_strs = [["Welcome", "to", "Beijing!"],
["北京欢迎您!"],
["我喜欢English!"],
[""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = text.WhitespaceTokenizer(with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
tokens = []
expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token']).tolist()
tokens.append(token)
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count += 1
logger.info("The out tokens is : {}".format(tokens))
assert whitespace_strs == tokens
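The offsets returned with with_offsets=True can be used to slice each token back out of the original UTF-8 bytes, which is exactly what the expected values encode. A minimal sketch (helper name is illustrative only):

def spans_from_offsets(raw_text, starts, limits):
    """Recover token surfaces from offsets_start/offsets_limit byte positions."""
    data = raw_text.encode("utf-8")
    return [data[start:limit].decode("utf-8") for start, limit in zip(starts, limits)]

# spans_from_offsets("Welcome to Beijing!", [0, 8, 11], [7, 10, 19])
# -> ['Welcome', 'to', 'Beijing!']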
def test_unicode_script_tokenizer_default():
""" """
Test UnicodeScriptTokenizer when para keep_whitespace=False Test UnicodeScriptTokenizer when para keep_whitespace=False
""" """
...@@ -80,18 +132,18 @@ def test_unicode_script_tokenizer(): ...@@ -80,18 +132,18 @@ def test_unicode_script_tokenizer():
["我喜欢", "English", "!"], ["我喜欢", "English", "!"],
[""]] [""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=False) tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False)
dataset = dataset.map(operations=tokenizer) dataset = dataset.map(operations=tokenizer)
tokens = [] tokens = []
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist() token = text.to_str(i['text']).tolist()
tokens.append(text) tokens.append(token)
logger.info("The out tokens is : {}".format(tokens)) logger.info("The out tokens is : {}".format(tokens))
assert unicode_script_strs == tokens assert unicode_script_strs == tokens
def test_unicode_script_tokenizer2(): def test_unicode_script_tokenizer_default2():
""" """
Test UnicodeScriptTokenizer when para keep_whitespace=True Test UnicodeScriptTokenizer when para keep_whitespace=True
""" """
...@@ -100,12 +152,64 @@ def test_unicode_script_tokenizer2(): ...@@ -100,12 +152,64 @@ def test_unicode_script_tokenizer2():
["我喜欢", "English", "!"], ["我喜欢", "English", "!"],
[" "]] [" "]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = nlp.UnicodeScriptTokenizer(keep_whitespace=True) tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
dataset = dataset.map(operations=tokenizer) dataset = dataset.map(operations=tokenizer)
tokens = [] tokens = []
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist() token = text.to_str(i['text']).tolist()
tokens.append(text) tokens.append(token)
logger.info("The out tokens is :", tokens)
assert unicode_script_strs2 == tokens
def test_unicode_script_tokenizer_with_offsets():
"""
Test UnicodeScriptTokenizer when para keep_whitespace=False and with_offsets=True
"""
unicode_script_strs = [["Welcome", "to", "Beijing", "!"],
["北京欢迎您", "!"],
["我喜欢", "English", "!"],
[""]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=False, with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
tokens = []
expected_offsets_start = [[0, 8, 11, 18], [0, 15], [0, 9, 16], [0]]
expected_offsets_limit = [[7, 10, 18, 19], [15, 18], [9, 16, 17], [0]]
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token']).tolist()
tokens.append(token)
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count += 1
logger.info("The out tokens is : {}".format(tokens))
assert unicode_script_strs == tokens
def test_unicode_script_tokenizer_with_offsets2():
"""
Test UnicodeScriptTokenizer when para keep_whitespace=True and with_offsets=True
"""
unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
["北京欢迎您", "!"],
["我喜欢", "English", "!"],
[" "]]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer)
tokens = []
expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token']).tolist()
tokens.append(token)
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count += 1
logger.info("The out tokens is :", tokens) logger.info("The out tokens is :", tokens)
assert unicode_script_strs2 == tokens assert unicode_script_strs2 == tokens
...@@ -116,13 +220,13 @@ def test_case_fold(): ...@@ -116,13 +220,13 @@ def test_case_fold():
""" """
expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "] expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "]
dataset = ds.TextFileDataset(DATA_FILE, shuffle=False) dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
op = nlp.CaseFold() op = text.CaseFold()
dataset = dataset.map(operations=op) dataset = dataset.map(operations=op)
lower_strs = [] lower_strs = []
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist() token = text.to_str(i['text']).tolist()
lower_strs.append(text) lower_strs.append(token)
assert lower_strs == expect_strs assert lower_strs == expect_strs
...@@ -133,13 +237,13 @@ def test_normalize_utf8(): ...@@ -133,13 +237,13 @@ def test_normalize_utf8():
def normalize(normalize_form): def normalize(normalize_form):
dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False) dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
normalize = nlp.NormalizeUTF8(normalize_form=normalize_form) normalize = text.NormalizeUTF8(normalize_form=normalize_form)
dataset = dataset.map(operations=normalize) dataset = dataset.map(operations=normalize)
out_bytes = [] out_bytes = []
out_texts = [] out_texts = []
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
out_bytes.append(i['text']) out_bytes.append(i['text'])
out_texts.append(nlp.to_str(i['text']).tolist()) out_texts.append(text.to_str(i['text']).tolist())
logger.info("The out bytes is : ", out_bytes) logger.info("The out bytes is : ", out_bytes)
logger.info("The out texts is: ", out_texts) logger.info("The out texts is: ", out_texts)
return out_bytes return out_bytes
...@@ -158,10 +262,10 @@ def test_normalize_utf8(): ...@@ -158,10 +262,10 @@ def test_normalize_utf8():
[b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87', [b's\xcc\xa3\xcc\x87', b'd\xcc\xa3\xcc\x87', b'q\xcc\xa3\xcc\x87',
b'fi', b'25', b's\xcc\xa3\xcc\x87'] b'fi', b'25', b's\xcc\xa3\xcc\x87']
] ]
assert normalize(nlp.utils.NormalizeForm.NFC) == expect_normlize_data[0] assert normalize(text.utils.NormalizeForm.NFC) == expect_normlize_data[0]
assert normalize(nlp.utils.NormalizeForm.NFKC) == expect_normlize_data[1] assert normalize(text.utils.NormalizeForm.NFKC) == expect_normlize_data[1]
assert normalize(nlp.utils.NormalizeForm.NFD) == expect_normlize_data[2] assert normalize(text.utils.NormalizeForm.NFD) == expect_normlize_data[2]
assert normalize(nlp.utils.NormalizeForm.NFKD) == expect_normlize_data[3] assert normalize(text.utils.NormalizeForm.NFKD) == expect_normlize_data[3]
def test_regex_replace(): def test_regex_replace():
...@@ -175,12 +279,12 @@ def test_regex_replace(): ...@@ -175,12 +279,12 @@ def test_regex_replace():
dataset = dataset.skip(first - 1) dataset = dataset.skip(first - 1)
if last >= first: if last >= first:
dataset = dataset.take(last - first + 1) dataset = dataset.take(last - first + 1)
replace_op = nlp.RegexReplace(pattern, replace) replace_op = text.RegexReplace(pattern, replace)
dataset = dataset.map(operations=replace_op) dataset = dataset.map(operations=replace_op)
out_text = [] out_text = []
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist() token = text.to_str(i['text']).tolist()
out_text.append(text) out_text.append(token)
logger.info("Out:", out_text) logger.info("Out:", out_text)
logger.info("Exp:", expect_str) logger.info("Exp:", expect_str)
assert expect_str == out_text assert expect_str == out_text
...@@ -191,7 +295,7 @@ def test_regex_replace(): ...@@ -191,7 +295,7 @@ def test_regex_replace():
regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "") regex_replace(7, 8, ['我不想长大', 'WelcometoShenzhen!'], "\\p{Cc}|\\p{Cf}|\\s+", "")
def test_regex_tokenizer(): def test_regex_tokenizer_default():
""" """
Test RegexTokenizer Test RegexTokenizer
""" """
...@@ -202,15 +306,15 @@ def test_regex_tokenizer(): ...@@ -202,15 +306,15 @@ def test_regex_tokenizer():
dataset = dataset.skip(first - 1) dataset = dataset.skip(first - 1)
if last >= first: if last >= first:
dataset = dataset.take(last - first + 1) dataset = dataset.take(last - first + 1)
tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern) tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
dataset = dataset.map(operations=tokenizer_op) dataset = dataset.map(operations=tokenizer_op)
out_text = [] out_text = []
count = 0 count = 0
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']).tolist() token = text.to_str(i['text']).tolist()
np.testing.assert_array_equal(text, expect_str[count]) np.testing.assert_array_equal(token, expect_str[count])
count += 1 count += 1
out_text.append(text) out_text.append(token)
logger.info("Out:", out_text) logger.info("Out:", out_text)
logger.info("Exp:", expect_str) logger.info("Exp:", expect_str)
...@@ -222,12 +326,55 @@ def test_regex_tokenizer(): ...@@ -222,12 +326,55 @@ def test_regex_tokenizer():
regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "") regex_tokenizer(3, 3, [['¥+', '¥=?']], r"[\p{N}]+", "")
def test_regex_tokenizer_with_offsets():
"""
Test RegexTokenizer with with_offsets=True
"""
def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit, delim_pattern,
keep_delim_pattern):
dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
out_text = []
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token']).tolist()
np.testing.assert_array_equal(token, expect_str[count])
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count += 1
out_text.append(token)
logger.info("Out:", out_text)
logger.info("Exp:", expect_str)
regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "")
regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]],
"\\s+", "\\s+")
regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]],
[[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}")
regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]],
r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "")
regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "")
if __name__ == '__main__': if __name__ == '__main__':
test_unicode_char_tokenizer() test_unicode_char_tokenizer_default()
test_whitespace_tokenizer() test_unicode_char_tokenizer_with_offsets()
test_unicode_script_tokenizer() test_whitespace_tokenizer_default()
test_unicode_script_tokenizer2() test_whitespace_tokenizer_with_offsets()
test_unicode_script_tokenizer_default()
test_unicode_script_tokenizer_default2()
test_unicode_script_tokenizer_with_offsets()
test_unicode_script_tokenizer_with_offsets2()
test_case_fold() test_case_fold()
test_normalize_utf8() test_normalize_utf8()
test_regex_replace() test_regex_replace()
test_regex_tokenizer() test_regex_tokenizer_default()
test_regex_tokenizer_with_offsets()
...@@ -18,7 +18,7 @@ Testing WordpieceTokenizer op in DE ...@@ -18,7 +18,7 @@ Testing WordpieceTokenizer op in DE
import numpy as np import numpy as np
import mindspore.dataset as ds import mindspore.dataset as ds
from mindspore import log as logger from mindspore import log as logger
import mindspore.dataset.text as nlp import mindspore.dataset.text as text
WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt" WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"
...@@ -38,6 +38,8 @@ test_paras = [ ...@@ -38,6 +38,8 @@ test_paras = [
last=10, last=10,
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
['era'], ['[UNK]']], ['era'], ['[UNK]']],
expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]],
expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]],
vocab_list=vocab_english vocab_list=vocab_english
), ),
dict( dict(
...@@ -45,6 +47,8 @@ test_paras = [ ...@@ -45,6 +47,8 @@ test_paras = [
last=10, last=10,
expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'], expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
['era'], ['what']], ['era'], ['what']],
expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]],
expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]],
vocab_list=vocab_english, vocab_list=vocab_english,
unknown_token="" unknown_token=""
), ),
...@@ -52,6 +56,8 @@ test_paras = [ ...@@ -52,6 +56,8 @@ test_paras = [
first=1, first=1,
last=10, last=10,
expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']], expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
expected_offsets_limit=[[2], [5], [4], [2], [4], [5], [3], [5], [3], [4]],
vocab_list=vocab_english, vocab_list=vocab_english,
max_bytes_per_token=4 max_bytes_per_token=4
), ),
...@@ -60,12 +66,16 @@ test_paras = [ ...@@ -60,12 +66,16 @@ test_paras = [
last=25, last=25,
expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
['[UNK]']], ['[UNK]']],
expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
expected_offsets_limit=[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
vocab_list=vocab_chinese, vocab_list=vocab_chinese,
), ),
dict( dict(
first=25, first=25,
last=25, last=25,
expect_str=[['您']], expect_str=[['您']],
expected_offsets_start=[[0]],
expected_offsets_limit=[[3]],
vocab_list=vocab_chinese, vocab_list=vocab_chinese,
unknown_token="" unknown_token=""
), ),
...@@ -77,37 +87,74 @@ test_paras = [ ...@@ -77,37 +87,74 @@ test_paras = [
['[UNK]'], ['[UNK]'],
['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'], ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
['[UNK]']], ['[UNK]']],
expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0],
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4],
[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
vocab_list=vocab_mix, vocab_list=vocab_mix,
), ),
] ]
def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100): def check_wordpiece_tokenizer_default(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False) dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
if first > 1: if first > 1:
dataset = dataset.skip(first - 1) dataset = dataset.skip(first - 1)
if last >= first: if last >= first:
dataset = dataset.take(last - first + 1) dataset = dataset.take(last - first + 1)
vocab = nlp.Vocab.from_list(vocab_list) vocab = text.Vocab.from_list(vocab_list)
tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token, tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
max_bytes_per_token=max_bytes_per_token) max_bytes_per_token=max_bytes_per_token)
dataset = dataset.map(operations=tokenizer_op) dataset = dataset.map(operations=tokenizer_op)
count = 0 count = 0
for i in dataset.create_dict_iterator(): for i in dataset.create_dict_iterator():
text = nlp.to_str(i['text']) token = text.to_str(i['text'])
logger.info("Out:", text) logger.info("Out:", token)
logger.info("Exp:", expect_str[count]) logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(text, expect_str[count]) np.testing.assert_array_equal(token, expect_str[count])
count = count + 1 count = count + 1
def test_wordpiece_tokenizer(): def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
if first > 1:
dataset = dataset.skip(first - 1)
if last >= first:
dataset = dataset.take(last - first + 1)
vocab = text.Vocab.from_list(vocab_list)
tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True, unknown_token=unknown_token,
max_bytes_per_token=max_bytes_per_token)
dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
count = 0
for i in dataset.create_dict_iterator():
token = text.to_str(i['token'])
logger.info("Out:", token)
logger.info("Exp:", expect_str[count])
np.testing.assert_array_equal(token, expect_str[count])
np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
count = count + 1
def test_wordpiece_tokenizer_default():
"""
Test WordpieceTokenizer with the default with_offsets=False
"""
for paras in test_paras:
check_wordpiece_tokenizer_default(**paras)
def test_wordpiece_tokenizer_with_offsets():
""" """
Test WordpieceTokenizer with with_offsets=True
""" """
for paras in test_paras: for paras in test_paras:
check_wordpiece_tokenizer(**paras) check_wordpiece_tokenizer_with_offsets(**paras)
if __name__ == '__main__': if __name__ == '__main__':
test_wordpiece_tokenizer() test_wordpiece_tokenizer_default()
test_wordpiece_tokenizer_with_offsets()
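All of the *_with_offsets tests above share the same mapping pattern: construct the tokenizer with with_offsets=True and map the single 'text' column to three output columns. A condensed sketch of that shared pattern, using the same dataset API as the tests (the file path and the choice of tokenizer are placeholders):

import mindspore.dataset as ds
import mindspore.dataset.text as text

dataset = ds.TextFileDataset("<some_text_file>", shuffle=False)
tokenizer = text.WhitespaceTokenizer(with_offsets=True)
dataset = dataset.map(input_columns=['text'],
                      output_columns=['token', 'offsets_start', 'offsets_limit'],
                      columns_order=['token', 'offsets_start', 'offsets_limit'],
                      operations=tokenizer)
for row in dataset.create_dict_iterator():
    tokens = text.to_str(row['token'])
    # row['offsets_start'][k] and row['offsets_limit'][k] delimit tokens[k]
    # as byte positions in the original UTF-8 line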