提交 b91e5637 编写于 作者: X xiefangqi

add randomdataset and schema

上级 2cc6230f
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "minddata/dataset/engine/datasetops/source/coco_op.h" #include "minddata/dataset/engine/datasetops/source/coco_op.h"
#include "minddata/dataset/engine/datasetops/source/image_folder_op.h" #include "minddata/dataset/engine/datasetops/source/image_folder_op.h"
#include "minddata/dataset/engine/datasetops/source/mnist_op.h" #include "minddata/dataset/engine/datasetops/source/mnist_op.h"
#include "minddata/dataset/engine/datasetops/source/random_data_op.h"
#include "minddata/dataset/engine/datasetops/source/text_file_op.h" #include "minddata/dataset/engine/datasetops/source/text_file_op.h"
#include "minddata/dataset/engine/datasetops/source/voc_op.h" #include "minddata/dataset/engine/datasetops/source/voc_op.h"
// Dataset operator headers (in alphabetical order) // Dataset operator headers (in alphabetical order)
...@@ -100,6 +101,15 @@ Dataset::Dataset() { ...@@ -100,6 +101,15 @@ Dataset::Dataset() {
worker_connector_size_ = cfg->worker_connector_size(); worker_connector_size_ = cfg->worker_connector_size();
} }
/// \brief Function to create a SchemaObj
/// \param[in] schema_file Path of schema file
/// \return Shared pointer to the current schema
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file) {
auto schema = std::make_shared<SchemaObj>(schema_file);
return schema->init() ? schema : nullptr;
}
// FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS // FUNCTIONS TO CREATE DATASETS FOR LEAF-NODE DATASETS
// (In alphabetical order) // (In alphabetical order)
...@@ -353,6 +363,163 @@ std::shared_ptr<ZipDataset> Dataset::Zip(const std::vector<std::shared_ptr<Datas ...@@ -353,6 +363,163 @@ std::shared_ptr<ZipDataset> Dataset::Zip(const std::vector<std::shared_ptr<Datas
return ds->ValidateParams() ? ds : nullptr; return ds->ValidateParams() ? ds : nullptr;
} }
// Constructor for SchemaObj: stores the schema file path; nothing is read from disk until init() is called.
// Initializers are listed in the same order as the members are declared in the class
// (num_rows_, dataset_type_, schema_file_), which is the order in which they are actually initialized.
SchemaObj::SchemaObj(const std::string &schema_file) : num_rows_(0), dataset_type_(""), schema_file_(schema_file) {}
// SchemaObj init function
bool SchemaObj::init() {
if (schema_file_ != "") {
Path schema_file(schema_file_);
if (!schema_file.Exists()) {
MS_LOG(ERROR) << "The file " << schema_file << " does not exist or permission denied!";
return false;
}
nlohmann::json js;
try {
std::ifstream in(schema_file_);
in >> js;
} catch (const std::exception &err) {
MS_LOG(ERROR) << "Schema file failed to load";
return false;
}
return from_json(js);
}
return true;
}
// Function to add a column to schema with a mstype de_type
bool SchemaObj::add_column(std::string name, TypeId de_type, std::vector<int32_t> shape) {
nlohmann::json new_column;
new_column["name"] = name;
// if de_type is mstype
DataType data_type = dataset::MSTypeToDEType(de_type);
new_column["type"] = data_type.ToString();
if (shape.size() > 0) {
new_column["shape"] = shape;
new_column["rank"] = shape.size();
} else {
new_column["rank"] = 1;
}
columns_.push_back(new_column);
return true;
}
// Function to add a column to schema with a string de_type
bool SchemaObj::add_column(std::string name, std::string de_type, std::vector<int32_t> shape) {
nlohmann::json new_column;
new_column["name"] = name;
DataType data_type(de_type);
new_column["type"] = data_type.ToString();
if (shape.size() > 0) {
new_column["shape"] = shape;
new_column["rank"] = shape.size();
} else {
new_column["rank"] = 1;
}
columns_.push_back(new_column);
return true;
}
std::string SchemaObj::to_json() {
nlohmann::json json_file;
json_file["columns"] = columns_;
if (dataset_type_ != "") {
json_file["datasetType"] = dataset_type_;
}
if (num_rows_ > 0) {
json_file["numRows"] = num_rows_;
}
return json_file.dump(2);
}
// Rebuilds columns_ from the "columns" section of a schema.
// Two layouts are accepted:
//   - a JSON array of objects, each carrying "name", "type" and optionally "shape";
//   - a JSON object keyed by column name, each value carrying "type" and optionally "shape".
// Returns false (after logging an error) when a required key is missing, when add_column() rejects
// a column, or when `columns` is neither an array nor an object. Columns added before a failure
// remain in columns_.
// NOTE(review): "name"/"type" are read with get<std::string>() and shape elements are converted to
// int32_t; nlohmann::json is expected to throw on a type mismatch, which is not caught here.
bool SchemaObj::parse_column(nlohmann::json columns) {
  columns_.clear();

  // Shared by both layouts: read "type" and the optional "shape" of one column and add it.
  const auto append_column = [this](const std::string &name, const nlohmann::json &column) -> bool {
    const auto type_it = column.find("type");
    if (type_it == column.end()) {
      MS_LOG(ERROR) << "Column's type is missing";
      return false;
    }
    const std::string de_type = type_it->get<std::string>();

    std::vector<int32_t> shape;
    const auto shape_it = column.find("shape");
    if (shape_it != column.end()) {
      shape.insert(shape.end(), shape_it->begin(), shape_it->end());
    }
    return add_column(name, de_type, shape);
  };

  if (columns.type() == nlohmann::json::value_t::array) {
    // reference to python list; iterate by const reference so each column is not deep-copied
    for (const auto &column : columns) {
      const auto name_it = column.find("name");
      if (name_it == column.end()) {
        MS_LOG(ERROR) << "Column's name is missing";
        return false;
      }
      if (!append_column(name_it->get<std::string>(), column)) {
        return false;
      }
    }
    return true;
  }

  if (columns.type() == nlohmann::json::value_t::object) {
    // reference to python dict; the key is the column name
    for (const auto &it_child : columns.items()) {
      if (!append_column(it_child.key(), it_child.value())) {
        return false;
      }
    }
    return true;
  }

  MS_LOG(ERROR) << "columns must be dict or list, columns contain name, type, shape(optional).";
  return false;
}
// Populates this schema from a parsed JSON document.
// Recognised top-level keys are "datasetType", "numRows" and "columns"; any other key is an error.
// After all keys are consumed, the schema must contain at least one column and a positive numRows.
bool SchemaObj::from_json(nlohmann::json json_obj) {
  for (const auto &entry : json_obj.items()) {
    const auto &field = entry.key();

    if (field == "datasetType") {
      dataset_type_ = entry.value();
      continue;
    }
    if (field == "numRows") {
      num_rows_ = entry.value();
      continue;
    }
    if (field != "columns") {
      MS_LOG(ERROR) << "Unknown field " << field;
      return false;
    }
    if (!parse_column(entry.value())) {
      MS_LOG(ERROR) << "parse columns failed";
      return false;
    }
  }

  if (columns_.empty()) {
    MS_LOG(ERROR) << "Columns are missing.";
    return false;
  }

  if (!(num_rows_ > 0)) {
    MS_LOG(ERROR) << "numRows must be greater than 0";
    return false;
  }

  return true;
}
// OTHER FUNCTIONS // OTHER FUNCTIONS
// (In alphabetical order) // (In alphabetical order)
...@@ -864,6 +1031,67 @@ std::vector<std::shared_ptr<DatasetOp>> MnistDataset::Build() { ...@@ -864,6 +1031,67 @@ std::vector<std::shared_ptr<DatasetOp>> MnistDataset::Build() {
return node_ops; return node_ops;
} }
// ValideParams for RandomDataset
bool RandomDataset::ValidateParams() {
if (total_rows_ < 0) {
MS_LOG(ERROR) << "RandomDataset: total_rows must be greater than 0, now get " << total_rows_;
return false;
}
return true;
}
// Draws one integer from a uniform distribution over the closed range [min, max],
// using this dataset's rand_gen_ engine.
int32_t RandomDataset::GenRandomInt(int32_t min, int32_t max) {
  using IntDistribution = std::uniform_int_distribution<int32_t>;
  IntDistribution distribution(min, max);
  const int32_t picked = distribution(rand_gen_);
  return picked;
}
// Build for RandomDataset
std::vector<std::shared_ptr<DatasetOp>> RandomDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
rand_gen_.seed(GetSeed()); // seed the random generator
// If total rows was not given, then randomly pick a number
std::shared_ptr<SchemaObj> schema_obj;
if (!schema_path_.empty()) schema_obj = std::make_shared<SchemaObj>(schema_path_);
if (schema_obj != nullptr && total_rows_ == 0) {
total_rows_ = schema_obj->get_num_rows();
}
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
std::string schema_json_string, schema_file_path;
if (schema_ != nullptr) {
schema_->set_dataset_type("Random");
if (total_rows_ != 0) {
schema_->set_num_rows(total_rows_);
}
schema_json_string = schema_->to_json();
} else {
schema_file_path = schema_path_;
}
std::unique_ptr<DataSchema> data_schema;
std::vector<std::string> columns_to_load;
if (!schema_file_path.empty() || !schema_json_string.empty()) {
data_schema = std::make_unique<DataSchema>();
if (!schema_file_path.empty()) {
data_schema->LoadSchemaFile(schema_file_path, columns_to_load);
} else if (!schema_json_string.empty()) {
data_schema->LoadSchemaString(schema_json_string, columns_to_load);
}
}
std::shared_ptr<RandomDataOp> op;
op = std::make_shared<RandomDataOp>(num_workers_, connector_que_size_, rows_per_buffer_, total_rows_,
std::move(data_schema), std::move(sampler_->Build()));
node_ops.push_back(op);
return node_ops;
}
// Constructor for TextFileDataset // Constructor for TextFileDataset
TextFileDataset::TextFileDataset(std::vector<std::string> dataset_files, int32_t num_samples, ShuffleMode shuffle, TextFileDataset::TextFileDataset(std::vector<std::string> dataset_files, int32_t num_samples, ShuffleMode shuffle,
int32_t num_shards, int32_t shard_id) int32_t num_shards, int32_t shard_id)
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
*/ */
#include "minddata/dataset/include/de_tensor.h" #include "minddata/dataset/include/de_tensor.h"
#include "minddata/dataset/include/type_id.h"
#include "minddata/dataset/core/constants.h" #include "minddata/dataset/core/constants.h"
#include "minddata/dataset/core/data_type.h" #include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h" #include "mindspore/core/ir/dtype/type_id.h"
...@@ -23,68 +24,6 @@ ...@@ -23,68 +24,6 @@
namespace mindspore { namespace mindspore {
namespace tensor { namespace tensor {
// Translates a MindSpore TypeId into the matching dataset-engine DataType.
// Type ids without a listed counterpart map to DE_UNKNOWN.
dataset::DataType MSTypeToDEType(TypeId data_type) {
  using dataset::DataType;

  auto de_type = DataType::DE_UNKNOWN;
  switch (data_type) {
    case kNumberTypeBool:
      de_type = DataType::DE_BOOL;
      break;
    case kNumberTypeInt8:
      de_type = DataType::DE_INT8;
      break;
    case kNumberTypeUInt8:
      de_type = DataType::DE_UINT8;
      break;
    case kNumberTypeInt16:
      de_type = DataType::DE_INT16;
      break;
    case kNumberTypeUInt16:
      de_type = DataType::DE_UINT16;
      break;
    case kNumberTypeInt32:
      de_type = DataType::DE_INT32;
      break;
    case kNumberTypeUInt32:
      de_type = DataType::DE_UINT32;
      break;
    case kNumberTypeInt64:
      de_type = DataType::DE_INT64;
      break;
    case kNumberTypeUInt64:
      de_type = DataType::DE_UINT64;
      break;
    case kNumberTypeFloat16:
      de_type = DataType::DE_FLOAT16;
      break;
    case kNumberTypeFloat32:
      de_type = DataType::DE_FLOAT32;
      break;
    case kNumberTypeFloat64:
      de_type = DataType::DE_FLOAT64;
      break;
    default:
      break;
  }
  return DataType(de_type);
}
// Translates a dataset-engine DataType into the matching MindSpore TypeId.
// Data types without a listed counterpart map to kTypeUnknown.
TypeId DETypeToMSType(dataset::DataType data_type) {
  using dataset::DataType;

  TypeId ms_type = kTypeUnknown;
  switch (data_type.value()) {
    case DataType::DE_BOOL:
      ms_type = mindspore::TypeId::kNumberTypeBool;
      break;
    case DataType::DE_INT8:
      ms_type = mindspore::TypeId::kNumberTypeInt8;
      break;
    case DataType::DE_UINT8:
      ms_type = mindspore::TypeId::kNumberTypeUInt8;
      break;
    case DataType::DE_INT16:
      ms_type = mindspore::TypeId::kNumberTypeInt16;
      break;
    case DataType::DE_UINT16:
      ms_type = mindspore::TypeId::kNumberTypeUInt16;
      break;
    case DataType::DE_INT32:
      ms_type = mindspore::TypeId::kNumberTypeInt32;
      break;
    case DataType::DE_UINT32:
      ms_type = mindspore::TypeId::kNumberTypeUInt32;
      break;
    case DataType::DE_INT64:
      ms_type = mindspore::TypeId::kNumberTypeInt64;
      break;
    case DataType::DE_UINT64:
      ms_type = mindspore::TypeId::kNumberTypeUInt64;
      break;
    case DataType::DE_FLOAT16:
      ms_type = mindspore::TypeId::kNumberTypeFloat16;
      break;
    case DataType::DE_FLOAT32:
      ms_type = mindspore::TypeId::kNumberTypeFloat32;
      break;
    case DataType::DE_FLOAT64:
      ms_type = mindspore::TypeId::kNumberTypeFloat64;
      break;
    default:
      break;
  }
  return ms_type;
}
MSTensor *DETensor::CreateTensor(TypeId data_type, const std::vector<int> &shape) { MSTensor *DETensor::CreateTensor(TypeId data_type, const std::vector<int> &shape) {
return new DETensor(data_type, shape); return new DETensor(data_type, shape);
} }
...@@ -100,7 +39,7 @@ DETensor::DETensor(TypeId data_type, const std::vector<int> &shape) { ...@@ -100,7 +39,7 @@ DETensor::DETensor(TypeId data_type, const std::vector<int> &shape) {
t_shape.reserve(shape.size()); t_shape.reserve(shape.size());
std::transform(shape.begin(), shape.end(), std::back_inserter(t_shape), std::transform(shape.begin(), shape.end(), std::back_inserter(t_shape),
[](int s) -> dataset::dsize_t { return static_cast<dataset::dsize_t>(s); }); [](int s) -> dataset::dsize_t { return static_cast<dataset::dsize_t>(s); });
dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), MSTypeToDEType(data_type), &this->tensor_impl_); dataset::Tensor::CreateEmpty(dataset::TensorShape(t_shape), dataset::MSTypeToDEType(data_type), &this->tensor_impl_);
} }
DETensor::DETensor(std::shared_ptr<dataset::Tensor> tensor_ptr) { this->tensor_impl_ = std::move(tensor_ptr); } DETensor::DETensor(std::shared_ptr<dataset::Tensor> tensor_ptr) { this->tensor_impl_ = std::move(tensor_ptr); }
...@@ -120,14 +59,14 @@ std::shared_ptr<dataset::Tensor> DETensor::tensor() const { ...@@ -120,14 +59,14 @@ std::shared_ptr<dataset::Tensor> DETensor::tensor() const {
TypeId DETensor::data_type() const { TypeId DETensor::data_type() const {
MS_ASSERT(this->tensor_impl_ != nullptr); MS_ASSERT(this->tensor_impl_ != nullptr);
return DETypeToMSType(this->tensor_impl_->type()); return dataset::DETypeToMSType(this->tensor_impl_->type());
} }
TypeId DETensor::set_data_type(TypeId data_type) { TypeId DETensor::set_data_type(TypeId data_type) {
MS_ASSERT(this->tensor_impl_ != nullptr); MS_ASSERT(this->tensor_impl_ != nullptr);
if (data_type != this->data_type()) { if (data_type != this->data_type()) {
std::shared_ptr<dataset::Tensor> temp; std::shared_ptr<dataset::Tensor> temp;
dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), MSTypeToDEType(data_type), dataset::Tensor::CreateFromMemory(this->tensor_impl_->shape(), dataset::MSTypeToDEType(data_type),
this->tensor_impl_->GetBuffer(), &temp); this->tensor_impl_->GetBuffer(), &temp);
this->tensor_impl_ = temp; this->tensor_impl_ = temp;
} }
......
...@@ -50,13 +50,6 @@ Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) { ...@@ -50,13 +50,6 @@ Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) {
std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_, std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_, builder_rows_per_buffer_,
builder_total_rows_, std::move(builder_data_schema_), std::move(builder_sampler_)); builder_total_rows_, std::move(builder_data_schema_), std::move(builder_sampler_));
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
// schema.
// See details of generateSchema function to learn what type of schema it will create.
if ((*out_op)->data_schema_ == nullptr) {
RETURN_IF_NOT_OK((*out_op)->GenerateSchema());
}
return Status::OK(); return Status::OK();
} }
...@@ -85,6 +78,12 @@ RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64 ...@@ -85,6 +78,12 @@ RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64
if (total_rows_ == 0) { if (total_rows_ == 0) {
total_rows_ = GenRandomInt(1, kMaxTotalRows); total_rows_ = GenRandomInt(1, kMaxTotalRows);
} }
// If the user did not provide a schema, then we will ask the op to generate a pseudo-random
// schema.
// See details of generateSchema function to learn what type of schema it will create.
if (data_schema_ == nullptr) {
GenerateSchema();
}
// Everyone is already out from the sync area. // Everyone is already out from the sync area.
all_out_.Set(); all_out_.Set();
} }
...@@ -106,11 +105,7 @@ void RandomDataOp::Print(std::ostream &out, bool show_all) const { ...@@ -106,11 +105,7 @@ void RandomDataOp::Print(std::ostream &out, bool show_all) const {
} }
// Helper function to produce a default/random schema if one didn't exist // Helper function to produce a default/random schema if one didn't exist
Status RandomDataOp::GenerateSchema() { void RandomDataOp::GenerateSchema() {
if (data_schema_ != nullptr) {
return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!");
}
// To randomly create a schema, we need to choose: // To randomly create a schema, we need to choose:
// a) how many columns // a) how many columns
// b) the type of each column // b) the type of each column
...@@ -144,8 +139,6 @@ Status RandomDataOp::GenerateSchema() { ...@@ -144,8 +139,6 @@ Status RandomDataOp::GenerateSchema() {
data_schema_->AddColumn(*newCol); data_schema_->AddColumn(*newCol);
} }
return Status::OK();
} }
// Class functor operator () override. // Class functor operator () override.
......
...@@ -213,9 +213,8 @@ class RandomDataOp : public ParallelOp { ...@@ -213,9 +213,8 @@ class RandomDataOp : public ParallelOp {
/** /**
* Helper function to produce a default/random schema if one didn't exist * Helper function to produce a default/random schema if one didn't exist
@return Status - The error code return
*/ */
Status GenerateSchema(); void GenerateSchema();
/** /**
* Performs a synchronization between workers at the end of an epoch * Performs a synchronization between workers at the end of an epoch
......
...@@ -24,9 +24,11 @@ ...@@ -24,9 +24,11 @@
#include <utility> #include <utility>
#include <string> #include <string>
#include "minddata/dataset/core/constants.h" #include "minddata/dataset/core/constants.h"
#include "minddata/dataset/engine/data_schema.h"
#include "minddata/dataset/include/tensor.h" #include "minddata/dataset/include/tensor.h"
#include "minddata/dataset/include/iterator.h" #include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/include/samplers.h" #include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/type_id.h"
namespace mindspore { namespace mindspore {
namespace dataset { namespace dataset {
...@@ -40,6 +42,7 @@ class TensorShape; ...@@ -40,6 +42,7 @@ class TensorShape;
namespace api { namespace api {
class TensorOperation; class TensorOperation;
class SchemaObj;
class SamplerObj; class SamplerObj;
// Datasets classes (in alphabetical order) // Datasets classes (in alphabetical order)
class CelebADataset; class CelebADataset;
...@@ -49,6 +52,7 @@ class CLUEDataset; ...@@ -49,6 +52,7 @@ class CLUEDataset;
class CocoDataset; class CocoDataset;
class ImageFolderDataset; class ImageFolderDataset;
class MnistDataset; class MnistDataset;
class RandomDataset;
class TextFileDataset; class TextFileDataset;
class VOCDataset; class VOCDataset;
// Dataset Op classes (in alphabetical order) // Dataset Op classes (in alphabetical order)
...@@ -63,6 +67,11 @@ class SkipDataset; ...@@ -63,6 +67,11 @@ class SkipDataset;
class TakeDataset; class TakeDataset;
class ZipDataset; class ZipDataset;
/// \brief Function to create a SchemaObj
/// \param[in] schema_file Path of schema file (default="", no file is loaded and the schema starts
///     without columns; columns can then be added with SchemaObj::add_column)
/// \return Shared pointer to the current schema, or nullptr if initializing the schema from
///     `schema_file` fails
std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");
/// \brief Function to create a CelebADataset /// \brief Function to create a CelebADataset
/// \notes The generated dataset has two columns ['image', 'attr']. /// \notes The generated dataset has two columns ['image', 'attr'].
// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type. // The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
...@@ -167,6 +176,21 @@ std::shared_ptr<MnistDataset> Mnist(std::string dataset_dir, std::shared_ptr<Sam ...@@ -167,6 +176,21 @@ std::shared_ptr<MnistDataset> Mnist(std::string dataset_dir, std::shared_ptr<Sam
std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &datasets1, std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &datasets1,
const std::shared_ptr<Dataset> &datasets2); const std::shared_ptr<Dataset> &datasets2);
/// \brief Function to create a RandomDataset
/// \tparam T Type of `schema`: a std::shared_ptr<SchemaObj>, or a value convertible to std::string
///     that is taken as the path of a schema file (RandomDataset has a constructor for each)
/// \param[in] total_rows Number of rows for the dataset to generate (default=0, no explicit row count)
/// \param[in] schema Schema describing column names, data types and shapes (default=nullptr, no schema)
/// \param[in] columns_list List of columns to be read (default={}, no column selection)
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
///     a default sampler is created when the dataset is built
/// \return Shared pointer to the current Dataset, or nullptr if the parameters fail validation
template <typename T = std::shared_ptr<SchemaObj>>
std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schema = nullptr,
                                          std::vector<std::string> columns_list = {},
                                          std::shared_ptr<SamplerObj> sampler = nullptr) {
  // `schema` is a by-value sink just like columns_list and sampler: hand it over instead of copying it.
  auto ds =
    std::make_shared<RandomDataset>(total_rows, std::move(schema), std::move(columns_list), std::move(sampler));
  return ds->ValidateParams() ? ds : nullptr;
}
/// \brief Function to create a TextFileDataset /// \brief Function to create a TextFileDataset
/// \notes The generated dataset has one column ['text'] /// \notes The generated dataset has one column ['text']
/// \param[in] dataset_files List of files to be read to search for a pattern of files. The list /// \param[in] dataset_files List of files to be read to search for a pattern of files. The list
...@@ -335,6 +359,66 @@ class Dataset : public std::enable_shared_from_this<Dataset> { ...@@ -335,6 +359,66 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
int32_t worker_connector_size_; int32_t worker_connector_size_;
}; };
/// \class SchemaObj
/// \brief Describes the columns of a dataset (name, data type, shape) together with an optional
///     dataset type and row count. Can be filled from a JSON schema file or column by column.
class SchemaObj {
 public:
  /// \brief Constructor
  /// \param[in] schema_file Path of a JSON schema file (default="", no file)
  explicit SchemaObj(const std::string &schema_file = "");

  /// \brief Destructor
  ~SchemaObj() = default;

  /// \brief SchemaObj init function
  /// \return bool true if schema init success
  bool init();

  /// \brief Add new column to the schema
  /// \param[in] name name of the column.
  /// \param[in] de_type data type of the column(TypeId).
  /// \param[in] shape shape of the column.
  /// \return bool true if the column was added
  bool add_column(std::string name, TypeId de_type, std::vector<int32_t> shape);

  /// \brief Add new column to the schema
  /// \param[in] name name of the column.
  /// \param[in] de_type data type of the column(std::string).
  /// \param[in] shape shape of the column.
  /// \return bool true if the column was added
  bool add_column(std::string name, std::string de_type, std::vector<int32_t> shape);

  /// \brief Get a JSON string of the schema
  /// \return JSON string of the schema
  std::string to_json();

  /// \brief Get a JSON string of the schema (same output as to_json)
  /// \return JSON string of the schema
  std::string to_string() { return to_json(); }

  /// \brief set a new value to dataset_type
  /// \param[in] dataset_type new dataset type
  inline void set_dataset_type(std::string dataset_type) { dataset_type_ = std::move(dataset_type); }

  /// \brief set a new value to num_rows
  /// \param[in] num_rows new number of rows
  inline void set_num_rows(int32_t num_rows) { num_rows_ = num_rows; }

  /// \brief get the current num_rows
  /// \return current number of rows
  inline int32_t get_num_rows() const { return num_rows_; }

 private:
  /// \brief Parse the columns and add it to columns
  /// \param[in] columns dataset attribution information, decoded from schema file.
  ///     support both nlohmann::json::value_t::array and nlohmann::json::value_t::object.
  /// \return bool true if every column was parsed and added
  bool parse_column(nlohmann::json columns);

  /// \brief Fill this schema from a parsed JSON document
  /// \param[in] json_obj object of json parsed.
  /// \return bool true if the JSON document describes a valid schema
  bool from_json(nlohmann::json json_obj);

  int32_t num_rows_;
  std::string dataset_type_;
  std::string schema_file_;
  nlohmann::json columns_;
};
/* ####################################### Derived Dataset classes ################################# */ /* ####################################### Derived Dataset classes ################################# */
// DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS // DERIVED DATASET CLASSES FOR LEAF-NODE DATASETS
...@@ -517,6 +601,53 @@ class MnistDataset : public Dataset { ...@@ -517,6 +601,53 @@ class MnistDataset : public Dataset {
std::shared_ptr<SamplerObj> sampler_; std::shared_ptr<SamplerObj> sampler_;
}; };
class RandomDataset : public Dataset {
public:
// Some constants to provide limits to random generation.
static constexpr int32_t kMaxNumColumns = 4;
static constexpr int32_t kMaxRank = 4;
static constexpr int32_t kMaxDimValue = 32;
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema, std::vector<std::string> columns_list,
std::shared_ptr<SamplerObj> sampler)
: total_rows_(total_rows),
schema_path_(""),
schema_(std::move(schema)),
columns_list_(columns_list),
sampler_(std::move(sampler)) {}
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector<std::string> columns_list,
std::shared_ptr<SamplerObj> sampler)
: total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {}
/// \brief Destructor
~RandomDataset() = default;
/// \brief a base class override function to create the required runtime dataset op objects for this class
/// \return The list of shared pointers to the newly created DatasetOps
std::vector<std::shared_ptr<DatasetOp>> Build() override;
/// \brief Parameters validation
/// \return bool true if all the params are valid
bool ValidateParams() override;
private:
/// \brief A quick inline for producing a random number between (and including) min/max
/// \param[in] min minimum number that can be generated.
/// \param[in] max maximum number that can be generated.
/// \return The generated random number
int32_t GenRandomInt(int32_t min, int32_t max);
int32_t total_rows_;
std::string schema_path_;
std::shared_ptr<SchemaObj> schema_;
std::vector<std::string> columns_list_;
std::shared_ptr<SamplerObj> sampler_;
std::mt19937 rand_gen_;
};
/// \class TextFileDataset /// \class TextFileDataset
/// \brief A Dataset derived class to represent TextFile dataset /// \brief A Dataset derived class to represent TextFile dataset
class TextFileDataset : public Dataset { class TextFileDataset : public Dataset {
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
#include "minddata/dataset/core/data_type.h"
#include "mindspore/core/ir/dtype/type_id.h"
namespace mindspore {
namespace dataset {
/// \brief Translate a MindSpore TypeId into the matching dataset-engine DataType
/// \param[in] data_type MindSpore type id
/// \return The corresponding DataType; DE_UNKNOWN for type ids without a listed counterpart
inline dataset::DataType MSTypeToDEType(TypeId data_type) {
  auto de_type = DataType::DE_UNKNOWN;
  switch (data_type) {
    case kNumberTypeBool:
      de_type = DataType::DE_BOOL;
      break;
    case kNumberTypeInt8:
      de_type = DataType::DE_INT8;
      break;
    case kNumberTypeUInt8:
      de_type = DataType::DE_UINT8;
      break;
    case kNumberTypeInt16:
      de_type = DataType::DE_INT16;
      break;
    case kNumberTypeUInt16:
      de_type = DataType::DE_UINT16;
      break;
    case kNumberTypeInt32:
      de_type = DataType::DE_INT32;
      break;
    case kNumberTypeUInt32:
      de_type = DataType::DE_UINT32;
      break;
    case kNumberTypeInt64:
      de_type = DataType::DE_INT64;
      break;
    case kNumberTypeUInt64:
      de_type = DataType::DE_UINT64;
      break;
    case kNumberTypeFloat16:
      de_type = DataType::DE_FLOAT16;
      break;
    case kNumberTypeFloat32:
      de_type = DataType::DE_FLOAT32;
      break;
    case kNumberTypeFloat64:
      de_type = DataType::DE_FLOAT64;
      break;
    default:
      break;
  }
  return DataType(de_type);
}
/// \brief Translate a dataset-engine DataType into the matching MindSpore TypeId
/// \param[in] data_type dataset-engine data type
/// \return The corresponding TypeId; kTypeUnknown for data types without a listed counterpart
inline TypeId DETypeToMSType(dataset::DataType data_type) {
  TypeId ms_type = kTypeUnknown;
  switch (data_type.value()) {
    case DataType::DE_BOOL:
      ms_type = mindspore::TypeId::kNumberTypeBool;
      break;
    case DataType::DE_INT8:
      ms_type = mindspore::TypeId::kNumberTypeInt8;
      break;
    case DataType::DE_UINT8:
      ms_type = mindspore::TypeId::kNumberTypeUInt8;
      break;
    case DataType::DE_INT16:
      ms_type = mindspore::TypeId::kNumberTypeInt16;
      break;
    case DataType::DE_UINT16:
      ms_type = mindspore::TypeId::kNumberTypeUInt16;
      break;
    case DataType::DE_INT32:
      ms_type = mindspore::TypeId::kNumberTypeInt32;
      break;
    case DataType::DE_UINT32:
      ms_type = mindspore::TypeId::kNumberTypeUInt32;
      break;
    case DataType::DE_INT64:
      ms_type = mindspore::TypeId::kNumberTypeInt64;
      break;
    case DataType::DE_UINT64:
      ms_type = mindspore::TypeId::kNumberTypeUInt64;
      break;
    case DataType::DE_FLOAT16:
      ms_type = mindspore::TypeId::kNumberTypeFloat16;
      break;
    case DataType::DE_FLOAT32:
      ms_type = mindspore::TypeId::kNumberTypeFloat32;
      break;
    case DataType::DE_FLOAT64:
      ms_type = mindspore::TypeId::kNumberTypeFloat64;
      break;
    default:
      break;
  }
  return ms_type;
}
} // namespace dataset
} // namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_TYPEID_H_
...@@ -100,6 +100,7 @@ SET(DE_UT_SRCS ...@@ -100,6 +100,7 @@ SET(DE_UT_SRCS
c_api_dataset_clue_test.cc c_api_dataset_clue_test.cc
c_api_dataset_coco_test.cc c_api_dataset_coco_test.cc
c_api_dataset_filetext_test.cc c_api_dataset_filetext_test.cc
c_api_dataset_randomdata_test.cc
c_api_dataset_voc_test.cc c_api_dataset_voc_test.cc
c_api_datasets_test.cc c_api_datasets_test.cc
c_api_dataset_iterator_test.cc c_api_dataset_iterator_test.cc
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/core/config_manager.h"
#include "minddata/dataset/core/global_context.h"
#include "mindspore/core/ir/dtype/type_id.h"
using namespace mindspore::dataset;
using namespace mindspore::dataset::api;
using mindspore::dataset::Tensor;
using mindspore::dataset::TensorShape;
using mindspore::dataset::DataType;
// Test fixture for the RandomDataset pipeline tests; adds nothing on top of UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {};
// RandomDataset with an explicit two-column schema: 50 rows repeated 4 times must yield 200 rows.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic1.";

  // Create a RandomDataset
  // Every pointer below is dereferenced right after its check, so the checks must be fatal
  // (ASSERT_*) rather than EXPECT_*, otherwise a failure turns into a null dereference.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_NE(schema, nullptr);
  schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
  schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
  std::shared_ptr<Dataset> ds = RandomData(50, schema);
  ASSERT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(4);
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(4);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  uint64_t i = 0;
  while (row.size() != 0) {
    auto image = row["image"];
    auto label = row["label"];
    MS_LOG(INFO) << "Tensor image shape: " << image->shape();
    MS_LOG(INFO) << "Tensor label shape: " << label->shape();

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 200);

  // Manually terminate the pipeline
  iter->Stop();
}
// RandomDataset without a schema: 10 rows repeated twice must yield 20 rows.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic2.";

  // Create a RandomDataset
  std::shared_ptr<Dataset> ds = RandomData(10);
  ASSERT_NE(ds, nullptr);

  ds = ds->SetNumWorkers(1);
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(2);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // No schema was given, so the columns are generated by the op and their names are not known
  // here. Looking up "image"/"label" with operator[] would insert null tensors into the row and
  // then dereference them, so only the number of rows is checked.
  uint64_t i = 0;
  while (row.size() != 0) {
    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 20);

  // Manually terminate the pipeline
  iter->Stop();
}
/// \brief RandomData driven by a SchemaObj loaded from a schema file.
/// \note Sets the global seed to 246 for the duration of the test, loads
///     testTFTestAllTypes/datasetSchema.json, calls RandomData(0, schema), repeats twice and
///     validates shape, rank and type of eight columns on every row.
/// \note The expected total of 984 rows is kept from the original test.
///     NOTE(review): with a row-count argument of 0 the count presumably derives from the schema
///     file and/or the seed - confirm.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic3.";

  // Puts the previous global seed back when the test body is left by any path. The fatal
  // ASSERT_* macros return from the function, which would otherwise skip a trailing restore
  // and leave seed 246 installed for whatever test runs next.
  struct SeedRestorer {
    explicit SeedRestorer(uint32_t seed) : saved_seed(seed) {}
    ~SeedRestorer() { GlobalContext::config_manager()->set_seed(saved_seed); }
    uint32_t saved_seed;
  };
  const SeedRestorer restore_seed(GlobalContext::config_manager()->seed());
  GlobalContext::config_manager()->set_seed(246);

  // Create a RandomDataset
  std::string SCHEMA_FILE = datasets_root_path_ + "/testTFTestAllTypes/datasetSchema.json";
  // Schema() returns nullptr when the schema fails to initialise; fail here instead of
  // continuing without the schema the column checks below depend on.
  std::shared_ptr<SchemaObj> schema = Schema(SCHEMA_FILE);
  ASSERT_NE(schema, nullptr);

  // Every pointer below is dereferenced right after it is checked, so the checks must be fatal.
  std::shared_ptr<Dataset> ds = RandomData(0, schema);
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(2);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  uint64_t i = 0;
  while (row.size() != 0) {
    auto col_sint16 = row["col_sint16"];
    auto col_sint32 = row["col_sint32"];
    auto col_sint64 = row["col_sint64"];
    auto col_float = row["col_float"];
    auto col_1d = row["col_1d"];
    auto col_2d = row["col_2d"];
    auto col_3d = row["col_3d"];
    auto col_binary = row["col_binary"];

    // validate shape
    ASSERT_EQ(col_sint16->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint32->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint64->shape(), TensorShape({1}));
    ASSERT_EQ(col_float->shape(), TensorShape({1}));
    ASSERT_EQ(col_1d->shape(), TensorShape({2}));
    ASSERT_EQ(col_2d->shape(), TensorShape({2, 2}));
    ASSERT_EQ(col_3d->shape(), TensorShape({2, 2, 2}));
    ASSERT_EQ(col_binary->shape(), TensorShape({1}));

    // validate Rank
    ASSERT_EQ(col_sint16->Rank(), 1);
    ASSERT_EQ(col_sint32->Rank(), 1);
    ASSERT_EQ(col_sint64->Rank(), 1);
    ASSERT_EQ(col_float->Rank(), 1);
    ASSERT_EQ(col_1d->Rank(), 1);
    ASSERT_EQ(col_2d->Rank(), 2);
    ASSERT_EQ(col_3d->Rank(), 3);
    ASSERT_EQ(col_binary->Rank(), 1);

    // validate type
    ASSERT_EQ(col_sint16->type(), DataType::DE_INT16);
    ASSERT_EQ(col_sint32->type(), DataType::DE_INT32);
    ASSERT_EQ(col_sint64->type(), DataType::DE_INT64);
    ASSERT_EQ(col_float->type(), DataType::DE_FLOAT32);
    ASSERT_EQ(col_1d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_2d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_3d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_binary->type(), DataType::DE_UINT8);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 984);

  // Manually terminate the pipeline
  iter->Stop();
  // The saved seed is restored by restore_seed when this scope ends.
}
/// \brief RandomData driven by a schema file path passed directly to RandomData.
/// \note Sets the global seed to 246 for the duration of the test, calls
///     RandomData(0, SCHEMA_FILE) with testTFTestAllTypes/datasetSchema.json, repeats twice and
///     validates shape, rank and type of eight columns on every row.
/// \note The expected total of 984 rows is kept from the original test.
///     NOTE(review): with a row-count argument of 0 the count presumably derives from the schema
///     file and/or the seed - confirm.
TEST_F(MindDataTestPipeline, TestRandomDatasetBasic4) {
  // The original message named TestRandomDatasetBasic3 (copy-paste slip), which made the log
  // of this test indistinguishable from the previous one.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetBasic4.";

  // Puts the previous global seed back when the test body is left by any path. The fatal
  // ASSERT_* macros return from the function, which would otherwise skip a trailing restore
  // and leave seed 246 installed for whatever test runs next.
  struct SeedRestorer {
    explicit SeedRestorer(uint32_t seed) : saved_seed(seed) {}
    ~SeedRestorer() { GlobalContext::config_manager()->set_seed(saved_seed); }
    uint32_t saved_seed;
  };
  const SeedRestorer restore_seed(GlobalContext::config_manager()->seed());
  GlobalContext::config_manager()->set_seed(246);

  // Create a RandomDataset
  // Every pointer below is dereferenced right after it is checked, so the checks must be fatal.
  std::string SCHEMA_FILE = datasets_root_path_ + "/testTFTestAllTypes/datasetSchema.json";
  std::shared_ptr<Dataset> ds = RandomData(0, SCHEMA_FILE);
  ASSERT_NE(ds, nullptr);

  // Create a Repeat operation on ds
  ds = ds->Repeat(2);
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  // Check if RandomDataOp read correct columns
  uint64_t i = 0;
  while (row.size() != 0) {
    auto col_sint16 = row["col_sint16"];
    auto col_sint32 = row["col_sint32"];
    auto col_sint64 = row["col_sint64"];
    auto col_float = row["col_float"];
    auto col_1d = row["col_1d"];
    auto col_2d = row["col_2d"];
    auto col_3d = row["col_3d"];
    auto col_binary = row["col_binary"];

    // validate shape
    ASSERT_EQ(col_sint16->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint32->shape(), TensorShape({1}));
    ASSERT_EQ(col_sint64->shape(), TensorShape({1}));
    ASSERT_EQ(col_float->shape(), TensorShape({1}));
    ASSERT_EQ(col_1d->shape(), TensorShape({2}));
    ASSERT_EQ(col_2d->shape(), TensorShape({2, 2}));
    ASSERT_EQ(col_3d->shape(), TensorShape({2, 2, 2}));
    ASSERT_EQ(col_binary->shape(), TensorShape({1}));

    // validate Rank
    ASSERT_EQ(col_sint16->Rank(), 1);
    ASSERT_EQ(col_sint32->Rank(), 1);
    ASSERT_EQ(col_sint64->Rank(), 1);
    ASSERT_EQ(col_float->Rank(), 1);
    ASSERT_EQ(col_1d->Rank(), 1);
    ASSERT_EQ(col_2d->Rank(), 2);
    ASSERT_EQ(col_3d->Rank(), 3);
    ASSERT_EQ(col_binary->Rank(), 1);

    // validate type
    ASSERT_EQ(col_sint16->type(), DataType::DE_INT16);
    ASSERT_EQ(col_sint32->type(), DataType::DE_INT32);
    ASSERT_EQ(col_sint64->type(), DataType::DE_INT64);
    ASSERT_EQ(col_float->type(), DataType::DE_FLOAT32);
    ASSERT_EQ(col_1d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_2d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_3d->type(), DataType::DE_INT64);
    ASSERT_EQ(col_binary->type(), DataType::DE_UINT8);

    iter->GetNextRow(&row);
    i++;
  }

  EXPECT_EQ(i, 984);

  // Manually terminate the pipeline
  iter->Stop();
  // The saved seed is restored by restore_seed when this scope ends.
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册