diff --git a/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt index 1f1c3c9bdd3b7d7dc3e8fbf111b5e5e3f24d2e8e..556b91ece03611bebaf7c7898870646b601f6406 100644 --- a/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt @@ -26,4 +26,5 @@ add_library(cpp-API OBJECT iterator.cc transforms.cc samplers.cc + text.cc ) diff --git a/mindspore/ccsrc/minddata/dataset/api/datasets.cc b/mindspore/ccsrc/minddata/dataset/api/datasets.cc index a4434afbb23d4cb904eea3641532b531a3b936c2..9dc1537684e2f196a0b1bbddf502896dcb4a343f 100644 --- a/mindspore/ccsrc/minddata/dataset/api/datasets.cc +++ b/mindspore/ccsrc/minddata/dataset/api/datasets.cc @@ -34,6 +34,7 @@ #include "minddata/dataset/engine/datasetops/source/voc_op.h" // Dataset operator headers (in alphabetical order) #include "minddata/dataset/engine/datasetops/batch_op.h" +#include "minddata/dataset/engine/datasetops/build_vocab_op.h" #include "minddata/dataset/engine/datasetops/concat_op.h" #include "minddata/dataset/engine/datasetops/map_op/map_op.h" #include "minddata/dataset/engine/datasetops/project_op.h" @@ -263,6 +264,37 @@ std::shared_ptr Dataset::Batch(int32_t batch_size, bool drop_remai return ds; } +// Function to create a Vocab from dataset +std::shared_ptr Dataset::BuildVocab(const std::vector &columns, + const std::pair &freq_range, int64_t top_k, + const std::vector &special_tokens, bool special_first) { + auto vocab = std::make_shared(); + auto ds = std::make_shared(vocab, columns, freq_range, top_k, special_tokens, special_first); + + if (!ds->ValidateParams()) { + return nullptr; + } + + ds->children.push_back(shared_from_this()); + + // Run tree here to starting building vocab + std::shared_ptr iter = ds->CreateIterator(); + if (iter == nullptr) { + MS_LOG(ERROR) << "Fail to run iterator in BuildVocab."; + return nullptr; + } + + // Finish building vocab by triggering GetNextRow + 
std::unordered_map> row; + iter->GetNextRow(&row); + if (vocab == nullptr) { + MS_LOG(ERROR) << "Fail to build vocab."; + return nullptr; + } + + return vocab; +} + // Function to create a Concat dataset std::shared_ptr Dataset::Concat(const std::vector> &datasets) { auto ds = std::make_shared(datasets); @@ -1450,13 +1482,52 @@ std::vector> BatchDataset::Build() { bool BatchDataset::ValidateParams() { if (batch_size_ <= 0) { - MS_LOG(ERROR) << "Batch: Batch size cannot be negative"; + MS_LOG(ERROR) << "Batch: batch_size should be positive integer, but got: " << batch_size_; return false; } return true; } +BuildVocabDataset::BuildVocabDataset(std::shared_ptr vocab, const std::vector &columns, + const std::pair &freq_range, int64_t top_k, + const std::vector &special_tokens, bool special_first) + : vocab_(vocab), + columns_(columns), + freq_range_(freq_range), + top_k_(top_k), + special_tokens_(special_tokens), + special_first_(special_first) {} + +// Function to build BuildVocabDataset +std::vector> BuildVocabDataset::Build() { + // A vector containing shared pointer to the Dataset Ops that this object will create + std::vector> node_ops; + + std::shared_ptr build_vocab_op; + build_vocab_op = std::make_shared(vocab_, columns_, freq_range_, top_k_, special_tokens_, + special_first_, num_workers_, connector_que_size_); + node_ops.push_back(build_vocab_op); + return node_ops; +} + +bool BuildVocabDataset::ValidateParams() { + if (vocab_ == nullptr) { + MS_LOG(ERROR) << "BuildVocab: vocab is null."; + return false; + } + if (top_k_ < 0) { + MS_LOG(ERROR) << "BuildVocab: top_k should be positive, but got: " << top_k_; + return false; + } + if (freq_range_.first < 0 || freq_range_.second > kDeMaxFreq || freq_range_.first > freq_range_.second) { + MS_LOG(ERROR) << "BuildVocab: freq_range [a,b] should be 0 <= a <= b (a,b are inclusive), " + << "but got [" << freq_range_.first << ", " << freq_range_.second << "]"; + return false; + } + return true; +} + // Function to 
build ConcatOp ConcatDataset::ConcatDataset(const std::vector> &datasets) : datasets_(datasets) { this->children = datasets_; diff --git a/mindspore/ccsrc/minddata/dataset/api/text.cc b/mindspore/ccsrc/minddata/dataset/api/text.cc new file mode 100644 index 0000000000000000000000000000000000000000..dfc1dbfaec3c9f86eba52e7d2fd0574dc841cf9d --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/api/text.cc @@ -0,0 +1,64 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "minddata/dataset/include/text.h" +#include "minddata/dataset/text/kernels/lookup_op.h" + +namespace mindspore { +namespace dataset { +namespace api { +namespace text { + +std::shared_ptr Lookup(const std::shared_ptr &vocab, const std::string &unknown_token) { + auto op = std::make_shared(vocab, unknown_token); + + if (!op->ValidateParams()) { + return nullptr; + } + return op; +} + +// LookupOperation +LookupOperation::LookupOperation(const std::shared_ptr &vocab, const std::string &unknown_token) + : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists) {} + +bool LookupOperation::ValidateParams() { + if (vocab_ == nullptr) { + LOG(ERROR) << "Lookup: vocab object type is incorrect or null."; + return false; + } + if (unknown_token_.empty()) { + LOG(ERROR) << "Lookup: no unknown token is specified."; + return false; + } else { + default_id_ = vocab_->Lookup(unknown_token_); + if (default_id_ == Vocab::kNoTokenExists) { + LOG(ERROR) << "Lookup: unknown_token: [" + unknown_token_ + "], does not exist in vocab."; + return false; + } + } + return true; +} + +std::shared_ptr LookupOperation::Build() { + std::shared_ptr tensor_op = std::make_shared(vocab_, default_id_); + return tensor_op; +} + +} // namespace text +} // namespace api +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/core/constants.h b/mindspore/ccsrc/minddata/dataset/core/constants.h index 55a67c8946cee5817c95a5a57b9d4583d6ae133b..be875c50286d16320f78c0f25bd2e26ad7f68e7f 100644 --- a/mindspore/ccsrc/minddata/dataset/core/constants.h +++ b/mindspore/ccsrc/minddata/dataset/core/constants.h @@ -59,6 +59,8 @@ inline void BitClear(uint32_t *bits, uint32_t bitMask) { *bits &= (~bitMask); } constexpr int32_t kDeMaxDim = std::numeric_limits::max(); // 2147483647 or 2^32 -1 constexpr int32_t kDeMaxRank = std::numeric_limits::max(); +constexpr int64_t kDeMaxFreq = std::numeric_limits::max(); // 9223372036854775807 or 
2^(64-1) +constexpr int64_t kDeMaxTopk = std::numeric_limits::max(); constexpr uint32_t kCfgRowsPerBuffer = 1; constexpr uint32_t kCfgParallelWorkers = 4; diff --git a/mindspore/ccsrc/minddata/dataset/include/datasets.h b/mindspore/ccsrc/minddata/dataset/include/datasets.h index 1f12cf0c0c6b27b604aef90e2ff3c9d239d739ec..34c81a976dcc8a2aacbefbe21075a3278fc609a5 100644 --- a/mindspore/ccsrc/minddata/dataset/include/datasets.h +++ b/mindspore/ccsrc/minddata/dataset/include/datasets.h @@ -30,6 +30,7 @@ #include "minddata/dataset/include/iterator.h" #include "minddata/dataset/include/samplers.h" #include "minddata/dataset/include/type_id.h" +#include "minddata/dataset/text/vocab.h" namespace mindspore { namespace dataset { @@ -39,6 +40,7 @@ class DatasetOp; class DataSchema; class Tensor; class TensorShape; +class Vocab; namespace api { @@ -61,6 +63,7 @@ class TextFileDataset; class VOCDataset; // Dataset Op classes (in alphabetical order) class BatchDataset; +class BuildVocabDataset; class ConcatDataset; class MapDataset; class ProjectDataset; @@ -325,6 +328,24 @@ class Dataset : public std::enable_shared_from_this { /// \return Shared pointer to the current BatchDataset std::shared_ptr Batch(int32_t batch_size, bool drop_remainder = false); + /// \brief Function to create a Vocab from source dataset + /// \notes Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab + /// which contains top_k most frequent words (if top_k is specified) + /// \param[in] columns Column names to get words from. It can be a vector of column names + /// \param[in] freq_range A tuple of integers (min_frequency, max_frequency). Words within the frequency + /// range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency + /// can be set to default, which corresponds to 0/total_words separately + /// \param[in] top_k Number of words to be built into vocab. top_k most frequent words are + // taken. 
The top_k is taken after freq_range. If not enough top_k, all words will be taken + /// \param[in] special_tokens A list of strings, each one is a special token + /// \param[in] special_first Whether special_tokens will be prepended/appended to vocab, If special_tokens + /// is specified and special_first is set to default, special_tokens will be prepended + /// \return Shared pointer to the current Vocab + std::shared_ptr BuildVocab(const std::vector &columns = {}, + const std::pair &freq_range = {0, kDeMaxFreq}, + int64_t top_k = kDeMaxTopk, const std::vector &special_tokens = {}, + bool special_first = true); + /// \brief Function to create a ConcatDataset /// \notes Concat the datasets in the input /// \param[in] datasets List of shared pointers to the dataset that should be concatenated together @@ -859,6 +880,33 @@ class BatchDataset : public Dataset { std::map>> pad_map_; }; +class BuildVocabDataset : public Dataset { + public: + /// \brief Constructor + BuildVocabDataset(std::shared_ptr vocab, const std::vector &columns, + const std::pair &freq_range, int64_t top_k, + const std::vector &special_tokens, bool special_first); + + /// \brief Destructor + ~BuildVocabDataset() = default; + + /// \brief a base class override function to create the required runtime dataset op objects for this class + /// \return The list of shared pointers to the newly created DatasetOps + std::vector> Build() override; + + /// \brief Parameters validation + /// \return bool true if all the params are valid + bool ValidateParams() override; + + private: + std::shared_ptr vocab_; + std::vector columns_; + std::pair freq_range_; + int64_t top_k_; + std::vector special_tokens_; + bool special_first_; +}; + class ConcatDataset : public Dataset { public: /// \brief Constructor diff --git a/mindspore/ccsrc/minddata/dataset/include/text.h b/mindspore/ccsrc/minddata/dataset/include/text.h new file mode 100644 index 
0000000000000000000000000000000000000000..7edcdc027cd5c917f821cd7bf8f9dfc9fdf5d18a --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/include/text.h @@ -0,0 +1,65 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_ +#define MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_ + +#include +#include +#include +#include "minddata/dataset/core/constants.h" +#include "minddata/dataset/include/transforms.h" +#include "minddata/dataset/text/vocab.h" + +namespace mindspore { +namespace dataset { +namespace api { + +// Transform operations for text +namespace text { + +// Text Op classes (in alphabetical order) +class LookupOperation; + +/// \brief Lookup operator that looks up a word to an id. +/// \param[in] vocab a Vocab object. +/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov). +/// If unknown_token is oov, runtime error will be thrown +/// \return Shared pointer to the current TensorOperation. 
+std::shared_ptr Lookup(const std::shared_ptr &vocab, const std::string &unknown_token); + +/* ####################################### Derived TensorOperation classes ################################# */ + +class LookupOperation : public TensorOperation { + public: + explicit LookupOperation(const std::shared_ptr &vocab, const std::string &unknown_token); + + ~LookupOperation() = default; + + std::shared_ptr Build() override; + + bool ValidateParams() override; + + private: + std::shared_ptr vocab_; + std::string unknown_token_; + int32_t default_id_; +}; +} // namespace text +} // namespace api +} // namespace dataset +} // namespace mindspore +#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_ diff --git a/mindspore/ccsrc/minddata/dataset/text/vocab.cc b/mindspore/ccsrc/minddata/dataset/text/vocab.cc index c1b7e6265c8a328bcd7136978aeee352be41cb92..e975f5d91888a1aafb3243883df2ef2a55911349 100644 --- a/mindspore/ccsrc/minddata/dataset/text/vocab.cc +++ b/mindspore/ccsrc/minddata/dataset/text/vocab.cc @@ -17,8 +17,10 @@ #include #include #include +#include #include "minddata/dataset/text/vocab.h" +#include "utils/log_adapter.h" namespace mindspore { namespace dataset { @@ -51,6 +53,147 @@ Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tok return Status::OK(); } +Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr *vocab) { + std::unordered_map word2id; + for (auto p : words) { + word2id[py::str(p.first)] = py::reinterpret_borrow(p.second); + } + *vocab = std::make_shared(std::move(word2id)); + return Status::OK(); +} + +void Vocab::append_word(const std::string &word) { + if (word2id_.find(word) == word2id_.end()) { + word2id_[word] = word2id_.size(); + } +} + +Status Vocab::BuildFromUnorderedMap(const std::unordered_map &words, + std::shared_ptr *vocab) { + // Validate parameters and build map + std::unordered_map word2id; + for (auto p : words) { + if (p.second < 0) { + MS_LOG(ERROR) << "index can not be negetive, 
but got " << p.second; + RETURN_STATUS_UNEXPECTED("index can not be negetive, but got " + std::to_string(p.second)); + } + word2id[p.first] = p.second; + } + *vocab = std::make_shared(std::move(word2id)); + return Status::OK(); +} + +Status Vocab::BuildFromVector(const std::vector &words, const std::vector &special_tokens, + bool prepend_special, std::shared_ptr *vocab) { + // Validate parameters + std::string duplicate_word; + for (const WordType &word : words) { + if (std::count(words.begin(), words.end(), word) > 1) { + if (duplicate_word.find(word) == std::string::npos) { + duplicate_word = duplicate_word + ", " + word; + } + } + } + if (!duplicate_word.empty()) { + MS_LOG(ERROR) << "words contains duplicate word: " << duplicate_word; + RETURN_STATUS_UNEXPECTED("words contains duplicate word: " + duplicate_word); + } + + std::string duplicate_sp; + for (const WordType &sp : special_tokens) { + if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) { + if (duplicate_sp.find(sp) == std::string::npos) { + duplicate_sp = duplicate_sp + ", " + sp; + } + } + } + if (!duplicate_sp.empty()) { + MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp; + RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp); + } + + std::unordered_map word2id; + + // if special is added in front, normal words id will start from number of special tokens + WordIdType word_id = prepend_special ? static_cast(special_tokens.size()) : 0; + for (auto word : words) { + word2id[word] = word_id++; + } + + word_id = prepend_special ? 
0 : word2id.size(); + + for (auto special_token : special_tokens) { + word2id[special_token] = word_id++; + } + + *vocab = std::make_shared(std::move(word2id)); + return Status::OK(); +} + +Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, + const std::vector &special_tokens, bool prepend_special, + std::shared_ptr *vocab) { + // Validate parameters + if (vocab_size < 0 && vocab_size != -1) { + MS_LOG(ERROR) << "vocab_size should be either -1 or positive integer, but got " << vocab_size; + RETURN_STATUS_UNEXPECTED("vocab_size should be either -1 or positive integer, but got " + + std::to_string(vocab_size)); + } + + std::string duplicate_sp; + for (const WordType &sp : special_tokens) { + if (std::count(special_tokens.begin(), special_tokens.end(), sp) > 1) { + if (duplicate_sp.find(sp) == std::string::npos) { + duplicate_sp = duplicate_sp + ", " + sp; + } + } + } + if (!duplicate_sp.empty()) { + MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp; + RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp); + } + + std::unordered_set specials; + // used to check that words in file don't contain any special token that already exists + for (auto word : special_tokens) { + specials.insert(word); + } + WordIdType word_id = prepend_special ? 
static_cast(special_tokens.size()) : 0; + std::unordered_map word2id; + std::fstream handle(path, std::ios::in); + if (!handle.good() || !handle.is_open()) { + MS_LOG(ERROR) << "fail to open:" + path; + RETURN_STATUS_UNEXPECTED("fail to open:" + path); + } + std::string word; + while (std::getline(handle, word)) { + if (!delimiter.empty()) { + // if delimiter is not found, find_first_of would return std::string::npos which is -1 + word = word.substr(0, word.find_first_of(delimiter)); + } + if (word2id.find(word) != word2id.end()) { + MS_LOG(ERROR) << "duplicate word:" + word + "."; + RETURN_STATUS_UNEXPECTED("duplicate word:" + word + "."); + } + if (specials.find(word) != specials.end()) { + MS_LOG(ERROR) << word + " is already in special_tokens."; + RETURN_STATUS_UNEXPECTED(word + " is already in special_tokens."); + } + word2id[word] = word_id++; + // break if enough row is read, if vocab_size is smaller than 0 + if (word2id.size() == vocab_size) break; + } + + word_id = prepend_special ? 
0 : word2id.size(); + + for (auto special_token : special_tokens) { + word2id[special_token] = word_id++; + } + + *vocab = std::make_shared(std::move(word2id)); + return Status::OK(); +} + Status Vocab::BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab) { // python validator checks special_tokens doesn't contain any duplicate words @@ -86,21 +229,6 @@ Status Vocab::BuildFromFile(const std::string &path, const std::string &delimite return Status::OK(); } -Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr *vocab) { - std::unordered_map word2id; - for (auto p : words) { - word2id[py::str(p.first)] = py::reinterpret_borrow(p.second); - } - *vocab = std::make_shared(std::move(word2id)); - return Status::OK(); -} - -void Vocab::append_word(const std::string &word) { - if (word2id_.find(word) == word2id_.end()) { - word2id_[word] = word2id_.size(); - } -} - const WordIdType Vocab::kNoTokenExists = -1; } // namespace dataset diff --git a/mindspore/ccsrc/minddata/dataset/text/vocab.h b/mindspore/ccsrc/minddata/dataset/text/vocab.h index 06da5f8f332348539a9851c0a329a3775396bae9..f3d46a4cb5f119a7acce0f157af52d478d693a30 100644 --- a/mindspore/ccsrc/minddata/dataset/text/vocab.h +++ b/mindspore/ccsrc/minddata/dataset/text/vocab.h @@ -57,6 +57,34 @@ class Vocab { static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, const py::list &special_tokens, bool prepend_special, std::shared_ptr *vocab); + /// \brief Build a vocab from a c++ map. id needs to start from 2, no duplicate and continuous + /// \param[in] words An unordered_map containing word, word id pair. + /// \param[out] vocab A vocab object + /// \return Error code + static Status BuildFromUnorderedMap(const std::unordered_map &words, + std::shared_ptr *vocab); + + /// \brief Build a vocab from a c++ vector. 
id needs to start from 2, no duplicate and continuous + /// \param[in] words A vector of string, used to build vocab, id starts from 2 + /// \param[in] special_tokens A vector of string contain special tokens + /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab + /// \param[out] vocab A vocab object + /// \return Error code + static Status BuildFromVector(const std::vector &words, const std::vector &special_tokens, + bool prepend_special, std::shared_ptr *vocab); + + /// \brief Build a vocab from reading a vocab file, id are automatically assigned, start from 2 + /// \param[in] path Path to vocab file , each line is assumed to contain 1 word + /// \param[in] delimiter Delimiter to break each line with + /// \param[in] vocab_size Number of words to read from file + /// \param[in] special_tokens A vector of string contain special tokens + /// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab + /// \param[out] vocab A vocab object + /// \return Error code + static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size, + const std::vector &special_tokens, bool prepend_special, + std::shared_ptr *vocab); + // Lookup the id of a word, if word doesn't exist in vocab, return default_id // @param const WordType word - word to look up // @param WordIdType default_id - word id to return to user when its not in the vocab diff --git a/tests/ut/cpp/dataset/CMakeLists.txt b/tests/ut/cpp/dataset/CMakeLists.txt index fbcd1dbc75ffd72951060604fb6cedf60d6e41af..6b5f08af99b0881daa91722374fe232cc738b002 100644 --- a/tests/ut/cpp/dataset/CMakeLists.txt +++ b/tests/ut/cpp/dataset/CMakeLists.txt @@ -97,6 +97,7 @@ SET(DE_UT_SRCS concatenate_op_test.cc cyclic_array_test.cc perf_data_test.cc + build_vocab_test.cc c_api_samplers_test.cc c_api_transforms_test.cc c_api_dataset_ops_test.cc @@ -104,12 +105,13 @@ SET(DE_UT_SRCS c_api_dataset_clue_test.cc c_api_dataset_coco_test.cc 
c_api_dataset_csv_test.cc - c_api_dataset_filetext_test.cc + c_api_dataset_textfile_test.cc c_api_dataset_manifest_test.cc c_api_dataset_randomdata_test.cc c_api_dataset_voc_test.cc c_api_datasets_test.cc c_api_dataset_iterator_test.cc + c_api_dataset_vocab.cc tensor_op_fusion_pass_test.cc sliding_window_op_test.cc epoch_ctrl_op_test.cc diff --git a/tests/ut/cpp/dataset/build_vocab_test.cc b/tests/ut/cpp/dataset/build_vocab_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3edb4a84497e83cdf90b6b4c22575178e3cdfcce --- /dev/null +++ b/tests/ut/cpp/dataset/build_vocab_test.cc @@ -0,0 +1,229 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +#include "common/common.h" +#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/include/status.h" + +using mindspore::dataset::Tensor; +using mindspore::dataset::Status; +using mindspore::dataset::Vocab; + +class MindDataTestVocab : public UT::DatasetOpTesting { + protected: +}; + +TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromUnorderedMap."; + // Build a map + std::unordered_map dict; + dict["banana"] = 0; + dict["apple"] = 1; + dict["cat"] = 2; + dict["dog"] = 3; + + // Build vocab from map + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Look up specified words + std::vector words = {"apple", "dog", "egg"}; + std::vector expected = {1, 3, -1}; + for (uint32_t i = 0; i < words.size(); ++i) { + int32_t x = vocab->Lookup(words[i]); + EXPECT_EQ(x, expected[i]); + } +} + +TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyMap."; + // Build vocab from empty map + std::unordered_map dict; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Look up specified words + // Expect that we will return -1 when word is not in vocab + std::vector words = {"apple", "dog", "egg"}; + std::vector expected = {-1, -1, -1}; + for (uint32_t i = 0; i < words.size(); ++i) { + int32_t x = vocab->Lookup(words[i]); + EXPECT_EQ(x, expected[i]); + } +} + +TEST_F(MindDataTestVocab, TestVocabFromMapFail) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromMapFail."; + // Build a map + std::unordered_map dict; + dict["banana"] = 0; + dict["apple"] = -1; + + // Expected failure: index of word can not be negative + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); 
+ EXPECT_NE(s, Status::OK()); +} + +TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorPrependSpTokens."; + // Build vocab from a vector of words, special tokens are prepended to vocab + std::vector list = {"apple", "banana", "cat", "dog", "egg"}; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(list, {""}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Look up specified words + // Expect that we will return -1 when word is not in vocab + std::vector words = {"apple", "banana", "fox"}; + std::vector expected = {1, 2, -1}; + for (uint32_t i = 0; i < words.size(); ++i) { + int32_t x = vocab->Lookup(words[i]); + EXPECT_EQ(x, expected[i]); + } +} + +TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorAppendSpTokens."; + // Build vocab from a vector of words, special tokens are appended to vocab + std::vector list = {"apple", "banana", "cat", "dog", "egg"}; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(list, {""}, false, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Look up specified words + std::vector words = {"apple", "", "fox"}; + std::vector expected = {0, 5, -1}; + for (uint32_t i = 0; i < words.size(); ++i) { + int32_t x = vocab->Lookup(words[i]); + EXPECT_EQ(x, expected[i]); + } +} + +TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorWithNoSpTokens."; + // Build vocab from a vector of words with no special tokens + std::vector list = {"apple", "banana", "cat", "dog", "egg"}; + std::vector sp_tokens = {}; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Look up specified words + std::vector words = {"apple", "banana", "fox", ""}; + std::vector expected = {0, 1, -1, -1}; + 
for (uint32_t i = 0; i < words.size(); ++i) { + int32_t x = vocab->Lookup(words[i]); + EXPECT_EQ(x, expected[i]); + } +} + +TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyVector."; + // Build vocab from empty vector + std::vector list = {}; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(list, {}, false, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Look up specified words + // Expect that we will return -1 when word is not in vocab + std::vector words = {"apple", "banana", "fox"}; + std::vector expected = {-1, -1, -1}; + for (uint32_t i = 0; i < words.size(); ++i) { + int32_t x = vocab->Lookup(words[i]); + EXPECT_EQ(x, expected[i]); + } +} + +TEST_F(MindDataTestVocab, TestVocabFromVectorFail1) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail1."; + // Build vocab from a vector of words with no special tokens + std::vector list = {"apple", "apple", "cat", "cat", "egg"}; + std::vector sp_tokens = {}; + std::shared_ptr vocab = std::make_shared(); + + // Expected failure: duplicate word apple + Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); + EXPECT_NE(s, Status::OK()); +} + +TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail2."; + // Build vocab from a vector of words with no special tokens + std::vector list = {"apple", "dog", "egg"}; + std::vector sp_tokens = {"", "", "", "", ""}; + std::shared_ptr vocab = std::make_shared(); + + // Expected failure: duplicate special token + Status s = Vocab::BuildFromVector(list, sp_tokens, true, &vocab); + EXPECT_NE(s, Status::OK()); +} + +TEST_F(MindDataTestVocab, TestVocabFromFile) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFile."; + // Build vocab from local file + std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; + std::shared_ptr vocab = std::make_shared(); + Status s = 
Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"", ""}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Look up specified words + std::vector words = {"not", "all"}; + std::vector expected = {2, 3}; + for (uint32_t i = 0; i < words.size(); ++i) { + int32_t x = vocab->Lookup(words[i]); + EXPECT_EQ(x, expected[i]); + } +} + +TEST_F(MindDataTestVocab, TestVocabFromFileFail1) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail1."; + // Build vocab from a local file which does not exist + std::string vocab_dir = datasets_root_path_ + "/testVocab/not_exist.txt"; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {}, true, &vocab); + EXPECT_NE(s, Status::OK()); +} + +TEST_F(MindDataTestVocab, TestVocabFromFileFail2) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2."; + // Build vocab from local file + std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; + std::shared_ptr vocab = std::make_shared(); + + // Expected failure: vocab_size should be either -1 or positive integer + Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -2, {}, true, &vocab); + EXPECT_NE(s, Status::OK()); +} + +TEST_F(MindDataTestVocab, TestVocabFromFileFail3) { + MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail3."; + // Build vocab from local file + std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; + std::shared_ptr vocab = std::make_shared(); + + // Expected failure: duplicate special token + Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"", ""}, true, &vocab); + EXPECT_NE(s, Status::OK()); +} diff --git a/tests/ut/cpp/dataset/c_api_dataset_filetext_test.cc b/tests/ut/cpp/dataset/c_api_dataset_textfile_test.cc similarity index 100% rename from tests/ut/cpp/dataset/c_api_dataset_filetext_test.cc rename to tests/ut/cpp/dataset/c_api_dataset_textfile_test.cc diff --git a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc 
b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc index b80f76f586cbe5c2d7bfde0abf897cc6a482ea74..17fa23198a51eeccd2057129badc5caae86f7039 100644 --- a/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc +++ b/tests/ut/cpp/dataset/c_api_dataset_voc_test.cc @@ -14,7 +14,6 @@ * limitations under the License. */ #include "common/common.h" -#include "minddata/dataset/engine/datasetops/source/voc_op.h" #include "minddata/dataset/include/datasets.h" using namespace mindspore::dataset::api; diff --git a/tests/ut/cpp/dataset/c_api_dataset_vocab.cc b/tests/ut/cpp/dataset/c_api_dataset_vocab.cc new file mode 100644 index 0000000000000000000000000000000000000000..87d5046c44cf67d59327f6d089129123cf31c110 --- /dev/null +++ b/tests/ut/cpp/dataset/c_api_dataset_vocab.cc @@ -0,0 +1,254 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +#include "common/common.h" +#include "minddata/dataset/include/datasets.h" +#include "minddata/dataset/include/status.h" +#include "minddata/dataset/include/transforms.h" +#include "minddata/dataset/include/text.h" + +using namespace mindspore::dataset::api; +using mindspore::dataset::ShuffleMode; +using mindspore::dataset::Tensor; +using mindspore::dataset::Status; +using mindspore::dataset::Vocab; + +class MindDataTestPipeline : public UT::DatasetOpTesting { + protected: +}; + +TEST_F(MindDataTestPipeline, TestVocabLookupOp) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create a vocab from vector + std::vector list = {"home", "IS", "behind", "the", "world", "ahead", "!"}; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(list, {"", ""}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create Lookup operation on ds + std::shared_ptr lookup = text::Lookup(vocab, ""); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + uint64_t i = 0; + std::vector expected = {2, 1, 4, 5, 6, 7}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind->shape() << " " << *ind; + std::shared_ptr expected_item; + Tensor::CreateScalar(expected[i], &expected_item); + EXPECT_EQ(*ind, *expected_item); + iter->GetNextRow(&row); + i++; + } +} + +TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1."; + // Create a TextFile Dataset + std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Build vocab from vector + std::vector list = {"home", "IS", "behind", "the", "world", "ahead", "!"}; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromVector(list, {}, true, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create lookup op for ds + // Expected failure: "" is not a word of vocab + std::shared_ptr lookup = text::Lookup(vocab, ""); + EXPECT_EQ(lookup, nullptr); +} + +TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail2."; + // Vocab has nothing + std::shared_ptr vocab; + + // Create lookup op + // Expected failure: vocab is null + std::shared_ptr lookup = text::Lookup(vocab, ""); + EXPECT_EQ(lookup, nullptr); +} + +TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpWithEmptyUnknownToken."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create a vocab from map + std::unordered_map dict; + dict["Home"] = 
3; + std::shared_ptr vocab = std::make_shared(); + Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); + EXPECT_EQ(s, Status::OK()); + + // Create Lookup operation on ds + // Expected failure: "" is not a word of vocab + std::shared_ptr lookup = text::Lookup(vocab, ""); + EXPECT_EQ(lookup, nullptr); +} + +TEST_F(MindDataTestPipeline, TestVocabFromDataset) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create vocab from dataset + std::shared_ptr vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits::max()}, + std::numeric_limits::max(), {"", ""}, true); + EXPECT_NE(vocab, nullptr); + + // Check if vocab has words or not + int32_t home_index = vocab->Lookup("home"); + EXPECT_EQ(home_index, 4); + + // Create Lookup operation on ds + std::shared_ptr lookup = text::Lookup(vocab, ""); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}, {"text"}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + uint64_t i = 0; + std::vector expected = {4, 5, 3, 6, 7, 2}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind->shape() << " " << *ind; + std::shared_ptr expected_item; + Tensor::CreateScalar(expected[i], &expected_item); + EXPECT_EQ(*ind, *expected_item); + iter->GetNextRow(&row); + i++; + } +} + +TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetDefault."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create vocab from dataset + std::shared_ptr vocab = ds->BuildVocab(); + EXPECT_NE(vocab, nullptr); + + // Check if vocab has words or not + int32_t home_index = vocab->Lookup("home"); + EXPECT_EQ(home_index, 2); + + // Create Lookup operation on ds + std::shared_ptr lookup = text::Lookup(vocab, "home"); + EXPECT_NE(lookup, nullptr); + + // Create Map operation on ds + ds = ds->Map({lookup}); + EXPECT_NE(ds, nullptr); + + // Create an iterator over the result of the above dataset + // This will trigger the creation of the Execution Tree and launch it. 
+ std::shared_ptr iter = ds->CreateIterator(); + EXPECT_NE(iter, nullptr); + + // Iterate the dataset and get each row + std::unordered_map> row; + iter->GetNextRow(&row); + + uint64_t i = 0; + std::vector expected = {2, 3, 1, 4, 5, 0}; + while (row.size() != 0) { + auto ind = row["text"]; + MS_LOG(INFO) << ind->shape() << " " << *ind; + std::shared_ptr expected_item; + Tensor::CreateScalar(expected[i], &expected_item); + EXPECT_EQ(*ind, *expected_item); + iter->GetNextRow(&row); + i++; + } +} + +TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail1."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create vocab from dataset + // Expected failure: top_k can not be negative + std::shared_ptr vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits::max()}, + -2, {"", ""}, true); + EXPECT_EQ(vocab, nullptr); +} + +TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail2) { + MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail2."; + + // Create a TextFile dataset + std::string data_file = datasets_root_path_ + "/testVocab/words.txt"; + std::shared_ptr ds = TextFile({data_file}, 0, ShuffleMode::kFalse); + EXPECT_NE(ds, nullptr); + + // Create vocab from dataset + // Expected failure: requency_range [a,b] should be 0 <= a <= b + std::shared_ptr vocab = ds->BuildVocab({"text"}, {4, 1}, + std::numeric_limits::max(), {"", ""}, true); + EXPECT_EQ(vocab, nullptr); +}