Commit bde9f18f authored by Zirui Wu

Update the Lookup API to take in a data type

ci

Add test case

Address some of the review comments

Address review comments
Parent 75045e3e
@@ -121,12 +121,13 @@ PYBIND_REGISTER(UnicodeCharTokenizerOp, 1, ([](const py::module *m) {
 PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) {
                   (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp")
-                    .def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word) {
+                    .def(py::init([](std::shared_ptr<Vocab> vocab, const py::object &py_word,
+                                     const DataType &data_type) {
                       if (vocab == nullptr) {
                         THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "vocab object type is incorrect or null."));
                       }
                       if (py_word.is_none()) {
-                        return std::make_shared<LookupOp>(vocab, Vocab::kNoTokenExists);
+                        return std::make_shared<LookupOp>(vocab, Vocab::kNoTokenExists, data_type);
                       }
                       std::string word = py::reinterpret_borrow<py::str>(py_word);
                       WordIdType default_id = vocab->Lookup(word);
@@ -134,7 +135,7 @@ PYBIND_REGISTER(LookupOp, 1, ([](const py::module *m) {
                         THROW_IF_ERROR(Status(StatusCode::kUnexpectedError,
                                               "default unknown token: " + word + " doesn't exist in vocab."));
                       }
-                      return std::make_shared<LookupOp>(vocab, default_id);
+                      return std::make_shared<LookupOp>(vocab, default_id, data_type);
                     }));
                 }));
......
@@ -22,8 +22,9 @@ namespace dataset {
 namespace api {
 namespace text {
 
-std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token) {
-  auto op = std::make_shared<LookupOperation>(vocab, unknown_token);
+std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
+                                        const DataType &data_type) {
+  auto op = std::make_shared<LookupOperation>(vocab, unknown_token, data_type);
   if (!op->ValidateParams()) {
     return nullptr;
@@ -32,8 +33,9 @@ std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, con
 }
 
 // LookupOperation
-LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token)
-    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists) {}
+LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
+                                 const DataType &data_type)
+    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists), data_type_(data_type) {}
 
 bool LookupOperation::ValidateParams() {
   if (vocab_ == nullptr) {
@@ -54,7 +56,7 @@ bool LookupOperation::ValidateParams() {
 }
 
 std::shared_ptr<TensorOp> LookupOperation::Build() {
-  std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_);
+  std::shared_ptr<LookupOp> tensor_op = std::make_shared<LookupOp>(vocab_, default_id_, data_type_);
   return tensor_op;
 }
......
@@ -20,9 +20,11 @@
 #include <vector>
 #include <memory>
 #include <string>
 
 #include "minddata/dataset/core/constants.h"
 #include "minddata/dataset/include/transforms.h"
 #include "minddata/dataset/text/vocab.h"
+#include "mindspore/ccsrc/minddata/dataset/core/data_type.h"
 
 namespace mindspore {
 namespace dataset {
@@ -37,15 +39,18 @@ class LookupOperation;
 /// \brief Lookup operator that looks up a word to an id.
 /// \param[in] vocab a Vocab object.
 /// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
-///    If unknown_token is oov, runtime error will be thrown
+///    If unknown_token is oov, runtime error will be thrown.
+/// \param[in] data_type type of the tensor after lookup, typically int32.
 /// \return Shared pointer to the current TensorOperation.
-std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token);
+std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
+                                        const mindspore::dataset::DataType &data_type = DataType("int32"));
 
 /* ####################################### Derived TensorOperation classes ################################# */
 
 class LookupOperation : public TensorOperation {
  public:
-  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token);
+  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token,
+                           const DataType &data_type);
 
   ~LookupOperation() = default;
 
@@ -57,6 +62,7 @@ class LookupOperation : public TensorOperation {
   std::shared_ptr<Vocab> vocab_;
   std::string unknown_token_;
   int32_t default_id_;
+  DataType data_type_;
 };
 
 }  // namespace text
 }  // namespace api
......
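For orientation, here is a minimal sketch of how the extended C++ API could be used end to end. It mirrors the TestVocabFromDatasetInt64 case added further below; the file path is a placeholder, and the snippet assumes the includes and using-declarations from that test file.

```cpp
// Sketch only: relies on the headers and usings from the pipeline test below
// (using namespace mindspore::dataset::api; using mindspore::dataset::DataType; ...).
std::shared_ptr<Dataset> ds = TextFile({"/path/to/testVocab/words.txt"}, 0, ShuffleMode::kFalse);
std::shared_ptr<Vocab> vocab = ds->BuildVocab();

// Out-of-vocabulary words fall back to the id of "home"; with the new argument the
// lookup result is emitted as int64 instead of the default int32.
std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home", DataType("int64"));
ds = ds->Map({lookup});

std::shared_ptr<Iterator> iter = ds->CreateIterator();
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);  // row["text"] now holds int64 word ids
```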
@@ -13,15 +13,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "minddata/dataset/text/kernels/lookup_op.h"
+
 #include <string>
 
-#include "minddata/dataset/text/kernels/lookup_op.h"
+#include "minddata/dataset/kernels/data/data_utils.h"
 
 namespace mindspore {
 namespace dataset {
 
-LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id)
-    : vocab_(vocab), default_id_(default_id), type_(DataType("int32")) {}
+LookupOp::LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id, const DataType &data_type)
+    : vocab_(vocab), default_id_(default_id), type_(data_type) {}
 
 Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
@@ -37,6 +38,14 @@ Status LookupOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<T
       "Lookup Error: token: " + std::string(*itr) + " doesn't exist in vocab and no unknown token is specified.");
   }
   RETURN_IF_NOT_OK(Tensor::CreateFromVector(word_ids, input->shape(), output));
+  // type cast to user's requirements if what user wants isn't int32_t
+  if ((*output)->type() != type_) {
+    std::shared_ptr<Tensor> cast_to;
+    RETURN_IF_NOT_OK(TypeCast(*output, &cast_to, type_));
+    *output = cast_to;
+  }
+
   return Status::OK();
 }
 
 Status LookupOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
......
@@ -18,9 +18,9 @@
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_TEXT_KERNELS_LOOKUP_OP_H_
 
 #include <memory>
-#include <vector>
-#include <utility>
 #include <string>
+#include <utility>
+#include <vector>
 
 #include "minddata/dataset/core/tensor.h"
 #include "minddata/dataset/kernels/tensor_op.h"
@@ -31,26 +31,27 @@ namespace mindspore {
 namespace dataset {
 
 class LookupOp : public TensorOp {
  public:
-  // constructor for lookup, takes in a vocab object
-  // @param std::shared_ptr<Vocab> vocab -
-  // @param WordIdType default_id, id to lookup if a word is not in vocab
-  explicit LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id = 1);
+  /// \brief constructor for lookup, takes in a vocab object.
+  /// \param[in] std::shared_ptr<Vocab> vocab - vocab used for lookup.
+  /// \param[in] WordIdType default_id, id to lookup if a word is not in vocab.
+  /// \param[in] DataType type of the tensor after lookup, mostly int32.
+  explicit LookupOp(std::shared_ptr<Vocab> vocab, WordIdType default_id, const DataType &data_type);
 
   ~LookupOp() = default;
 
-  // perform actual lookup on each tensor
-  // @param const std::shared_ptr<Tensor> &input
-  // @param std::shared_ptr<Tensor> *output
-  // @return error code
+  /// \brief perform actual lookup on each tensor.
+  /// \param[in] const std::shared_ptr<Tensor> &input
+  /// \param[in] std::shared_ptr<Tensor> *output
+  /// \return[out] error code.
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 
-  // print method
-  // @param std::ostream out
+  /// \brief print method.
+  /// \param[in] std::ostream out
   void Print(std::ostream &out) const override;
 
-  // @param std::vector<DataType> &inputs -
-  // @param std::vector<DataType> &outputs -
-  // @return error code
+  /// \param[in] std::vector<DataType> &inputs -
+  /// \param[in] std::vector<DataType> &outputs -
+  /// \return[out] error code.
   Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
 
   std::string Name() const override { return kLookupOp; }
......
@@ -49,6 +49,7 @@ import platform
 import numpy as np
 
 import mindspore._c_dataengine as cde
+import mindspore.common.dtype as mstype
 
 from .utils import JiebaMode, NormalizeForm, to_str, SPieceTokenizerOutType, SPieceTokenizerLoadType
 from .validators import check_lookup, check_jieba_add_dict, \
@@ -66,11 +67,12 @@ class Lookup(cde.LookupOp):
         vocab(Vocab): a Vocab object.
         unknown_token(str, optional): word to use for lookup if the word being looked up is out of Vocabulary (oov).
             If unknown_token is oov, runtime error will be thrown (default=None).
+        data_type (mindspore.dtype, optional): mindspore.dtype lookup maps string to (default=mstype.int32)
     """
 
     @check_lookup
-    def __init__(self, vocab, unknown_token=None):
-        super().__init__(vocab, unknown_token)
+    def __init__(self, vocab, unknown_token=None, data_type=mstype.int32):
+        super().__init__(vocab, unknown_token, mstype_to_detype(data_type))
 
 
 class SlidingWindow(cde.SlidingWindowOp):
@@ -103,7 +105,6 @@ class SlidingWindow(cde.SlidingWindowOp):
         super().__init__(width, axis)
 
-
 class Ngram(cde.NgramOp):
     """
     TensorOp to generate n-gram from a 1-D string Tensor.
......
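From the Python side, here is a minimal usage sketch of the new data_type argument, modeled on the test_lookup_cast_type test added below; the vocab contents and the "text" column name are made up for the example.

```python
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.common.dtype as mstype

# Small vocab; "<unk>" is prepended as the out-of-vocabulary token.
vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True)

def gen():
    for word in "w1 w2 w3 oov".split(" "):
        yield (np.array(word, dtype='S'),)

data = ds.GeneratorDataset(gen(), column_names=["text"])
# Map each word to its id; emit int64 tensors instead of the default int32.
data = data.map(input_columns=["text"], operations=text.Lookup(vocab, "<unk>", mstype.int64))

for d in data.create_dict_iterator(num_epochs=1):
    print(d["text"].dtype)  # int64
```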
@@ -44,12 +44,13 @@ def check_lookup(method):
     @wraps(method)
     def new_method(self, *args, **kwargs):
-        [vocab, unknown_token], _ = parse_user_args(method, *args, **kwargs)
+        [vocab, unknown_token, data_type], _ = parse_user_args(method, *args, **kwargs)
         if unknown_token is not None:
             type_check(unknown_token, (str,), "unknown_token")
 
         type_check(vocab, (cde.Vocab,), "vocab is not an instance of cde.Vocab.")
+        type_check(data_type, (typing.Type,), "data_type")
 
         return method(self, *args, **kwargs)
@@ -327,6 +328,7 @@ def check_from_dataset(method):
     return new_method
 
+
 def check_slidingwindow(method):
     """A wrapper that wraps a parameter checker to the original function(sliding window operation)."""
@@ -339,6 +341,7 @@ def check_slidingwindow(method):
     return new_method
 
+
 def check_ngram(method):
     """A wrapper that wraps a parameter checker to the original function."""
......
@@ -26,9 +26,10 @@
 #include "minddata/dataset/include/text.h"
 
 using namespace mindspore::dataset::api;
+using mindspore::dataset::DataType;
 using mindspore::dataset::ShuffleMode;
-using mindspore::dataset::Tensor;
 using mindspore::dataset::Status;
+using mindspore::dataset::Tensor;
 using mindspore::dataset::Vocab;
 
 class MindDataTestPipeline : public UT::DatasetOpTesting {
@@ -50,7 +51,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
   EXPECT_EQ(s, Status::OK());
 
   // Create Lookup operation on ds
-  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
+  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32"));
   EXPECT_NE(lookup, nullptr);
 
   // Create Map operation on ds
@@ -94,7 +95,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
   // Create lookup op for ds
   // Expected failure: "<unk>" is not a word of vocab
-  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
+  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32"));
   EXPECT_EQ(lookup, nullptr);
 }
@@ -105,7 +106,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
   // Create lookup op
   // Expected failure: vocab is null
-  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "");
+  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
   EXPECT_EQ(lookup, nullptr);
 }
@@ -126,7 +127,7 @@ TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) {
   // Create Lookup operation on ds
   // Expected failure: "" is not a word of vocab
-  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "");
+  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "", DataType("int32"));
   EXPECT_EQ(lookup, nullptr);
 }
@@ -148,7 +149,7 @@ TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
   EXPECT_EQ(home_index, 4);
 
   // Create Lookup operation on ds
-  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
+  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>", DataType("int32"));
   EXPECT_NE(lookup, nullptr);
 
   // Create Map operation on ds
@@ -212,12 +213,15 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
   uint64_t i = 0;
   std::vector<int32_t> expected = {2, 3, 1, 4, 5, 0};
+  std::vector<int64_t> not_expected = {2, 3, 1, 4, 5, 0};
   while (row.size() != 0) {
     auto ind = row["text"];
     MS_LOG(INFO) << ind->shape() << " " << *ind;
-    std::shared_ptr<Tensor> expected_item;
+    std::shared_ptr<Tensor> expected_item, not_expected_item;
     Tensor::CreateScalar(expected[i], &expected_item);
+    Tensor::CreateScalar(not_expected[i], &not_expected_item);
     EXPECT_EQ(*ind, *expected_item);
+    EXPECT_NE(*ind, *not_expected_item);
     iter->GetNextRow(&row);
     i++;
   }
@@ -233,8 +237,8 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) {
   // Create vocab from dataset
   // Expected failure: top_k can not be negative
-  std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()},
-                                                -2, {"<pad>", "<unk>"}, true);
+  std::shared_ptr<Vocab> vocab =
+    ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()}, -2, {"<pad>", "<unk>"}, true);
   EXPECT_EQ(vocab, nullptr);
 }
@@ -247,9 +251,9 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail2) {
   EXPECT_NE(ds, nullptr);
 
   // Create vocab from dataset
-  // Expected failure: requency_range [a,b] should be 0 <= a <= b
-  std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {4, 1},
-                                                std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true);
+  // Expected failure: frequency_range [a,b] should be 0 <= a <= b
+  std::shared_ptr<Vocab> vocab =
+    ds->BuildVocab({"text"}, {4, 1}, std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true);
   EXPECT_EQ(vocab, nullptr);
 }
@@ -266,3 +270,52 @@ TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail3) {
   std::shared_ptr<Vocab> vocab = ds->BuildVocab({"ColumnNotExist"});
   EXPECT_EQ(vocab, nullptr);
 }
+
+TEST_F(MindDataTestPipeline, TestVocabFromDatasetInt64) {
+  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetInt64.";
+
+  // Create a TextFile dataset
+  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
+  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
+  EXPECT_NE(ds, nullptr);
+
+  // Create vocab from dataset
+  std::shared_ptr<Vocab> vocab = ds->BuildVocab();
+  EXPECT_NE(vocab, nullptr);
+
+  // Check if vocab has words or not
+  int32_t home_index = vocab->Lookup("home");
+  EXPECT_EQ(home_index, 2);
+
+  // Create Lookup operation on ds
+  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home", DataType("int64"));
+  EXPECT_NE(lookup, nullptr);
+
+  // Create Map operation on ds
+  ds = ds->Map({lookup});
+  EXPECT_NE(ds, nullptr);
+
+  // Create an iterator over the result of the above dataset
+  // This will trigger the creation of the Execution Tree and launch it.
+  std::shared_ptr<Iterator> iter = ds->CreateIterator();
+  EXPECT_NE(iter, nullptr);
+
+  // Iterate the dataset and get each row
+  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
+  iter->GetNextRow(&row);
+
+  uint64_t i = 0;
+  std::vector<int64_t> expected = {2, 3, 1, 4, 5, 0};
+  std::vector<int8_t> not_expected = {2, 3, 1, 4, 5, 0};
+  while (row.size() != 0) {
+    auto ind = row["text"];
+    MS_LOG(INFO) << ind->shape() << " " << *ind;
+    std::shared_ptr<Tensor> expected_item, not_expected_item;
+    Tensor::CreateScalar(expected[i], &expected_item);
+    Tensor::CreateScalar(not_expected[i], &not_expected_item);
+    EXPECT_EQ(*ind, *expected_item);
+    EXPECT_NE(*ind, *not_expected_item);
+    iter->GetNextRow(&row);
+    i++;
+  }
+}
\ No newline at end of file
@@ -17,6 +17,7 @@ import numpy as np
 import mindspore.dataset as ds
 import mindspore.dataset.text as text
+import mindspore.common.dtype as mstype
 
 # this file contains "home is behind the world head" each word is 1 line
 DATA_FILE = "../data/dataset/testVocab/words.txt"
@@ -137,6 +138,36 @@ def test_from_file():
     assert "Input vocab_size must be greater than 0" in test_config("w1 w2", 0, [], True)
     assert "Input vocab_size must be greater than 0" in test_config("w1 w2", -1, [], True)
 
+
+def test_lookup_cast_type():
+    def gen(texts):
+        for word in texts.split(" "):
+            yield (np.array(word, dtype='S'),)
+
+    def test_config(lookup_str, data_type=None):
+        try:
+            vocab = text.Vocab.from_list(["w1", "w2", "w3"], special_tokens=["<unk>"], special_first=True)
+            data = ds.GeneratorDataset(gen(lookup_str), column_names=["text"])
+            # if data_type is None, test the default value of data_type
+            op = text.Lookup(vocab, "<unk>") if data_type is None else text.Lookup(vocab, "<unk>", data_type)
+            data = data.map(input_columns=["text"], operations=op)
+            res = []
+            for d in data.create_dict_iterator(num_epochs=1):
+                res.append(d["text"])
+            return res[0].dtype
+        except (ValueError, RuntimeError, TypeError) as e:
+            return str(e)
+
+    # test result is correct
+    assert test_config("w1", mstype.int8) == np.dtype("int8")
+    assert test_config("w2", mstype.int32) == np.dtype("int32")
+    assert test_config("w3", mstype.int64) == np.dtype("int64")
+    assert test_config("unk", mstype.float32) != np.dtype("int32")
+    assert test_config("unk") == np.dtype("int32")
+    # test exception, data_type isn't the correct type
+    assert "tldr is not of type (<class 'mindspore._c_expression.typing.Type'>,)" in test_config("unk", "tldr")
+
+
 if __name__ == '__main__':
     test_from_dict_exception()
     test_from_list_tutorial()
@@ -144,3 +175,4 @@ if __name__ == '__main__':
     test_from_dict_tutorial()
     test_from_list()
     test_from_file()
+    test_lookup_cast_type()