Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
89cd4652
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
89cd4652
编写于
8月 20, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 20, 2020
浏览文件
操作
浏览文件
下载
差异文件
!4597 [Dataset] C++ API Support for build_vocab
Merge pull request !4597 from luoyang/c-api
上级
0e65b3ba
b50ae27c
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
909 addition
and
18 deletion
+909
-18
mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt
mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt
+1
-0
mindspore/ccsrc/minddata/dataset/api/datasets.cc
mindspore/ccsrc/minddata/dataset/api/datasets.cc
+72
-1
mindspore/ccsrc/minddata/dataset/api/text.cc
mindspore/ccsrc/minddata/dataset/api/text.cc
+64
-0
mindspore/ccsrc/minddata/dataset/core/constants.h
mindspore/ccsrc/minddata/dataset/core/constants.h
+2
-0
mindspore/ccsrc/minddata/dataset/include/datasets.h
mindspore/ccsrc/minddata/dataset/include/datasets.h
+48
-0
mindspore/ccsrc/minddata/dataset/include/text.h
mindspore/ccsrc/minddata/dataset/include/text.h
+65
-0
mindspore/ccsrc/minddata/dataset/text/vocab.cc
mindspore/ccsrc/minddata/dataset/text/vocab.cc
+143
-15
mindspore/ccsrc/minddata/dataset/text/vocab.h
mindspore/ccsrc/minddata/dataset/text/vocab.h
+28
-0
tests/ut/cpp/dataset/CMakeLists.txt
tests/ut/cpp/dataset/CMakeLists.txt
+3
-1
tests/ut/cpp/dataset/build_vocab_test.cc
tests/ut/cpp/dataset/build_vocab_test.cc
+229
-0
tests/ut/cpp/dataset/c_api_dataset_textfile_test.cc
tests/ut/cpp/dataset/c_api_dataset_textfile_test.cc
+0
-0
tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
+0
-1
tests/ut/cpp/dataset/c_api_dataset_vocab.cc
tests/ut/cpp/dataset/c_api_dataset_vocab.cc
+254
-0
未找到文件。
mindspore/ccsrc/minddata/dataset/api/CMakeLists.txt
浏览文件 @
89cd4652
...
@@ -26,4 +26,5 @@ add_library(cpp-API OBJECT
...
@@ -26,4 +26,5 @@ add_library(cpp-API OBJECT
iterator.cc
iterator.cc
transforms.cc
transforms.cc
samplers.cc
samplers.cc
text.cc
)
)
mindspore/ccsrc/minddata/dataset/api/datasets.cc
浏览文件 @
89cd4652
...
@@ -34,6 +34,7 @@
...
@@ -34,6 +34,7 @@
#include "minddata/dataset/engine/datasetops/source/voc_op.h"
#include "minddata/dataset/engine/datasetops/source/voc_op.h"
// Dataset operator headers (in alphabetical order)
// Dataset operator headers (in alphabetical order)
#include "minddata/dataset/engine/datasetops/batch_op.h"
#include "minddata/dataset/engine/datasetops/batch_op.h"
#include "minddata/dataset/engine/datasetops/build_vocab_op.h"
#include "minddata/dataset/engine/datasetops/concat_op.h"
#include "minddata/dataset/engine/datasetops/concat_op.h"
#include "minddata/dataset/engine/datasetops/map_op/map_op.h"
#include "minddata/dataset/engine/datasetops/map_op/map_op.h"
#include "minddata/dataset/engine/datasetops/project_op.h"
#include "minddata/dataset/engine/datasetops/project_op.h"
...
@@ -263,6 +264,37 @@ std::shared_ptr<BatchDataset> Dataset::Batch(int32_t batch_size, bool drop_remai
...
@@ -263,6 +264,37 @@ std::shared_ptr<BatchDataset> Dataset::Batch(int32_t batch_size, bool drop_remai
return
ds
;
return
ds
;
}
}
// Function to create a Vocab from dataset
std
::
shared_ptr
<
Vocab
>
Dataset
::
BuildVocab
(
const
std
::
vector
<
std
::
string
>
&
columns
,
const
std
::
pair
<
int64_t
,
int64_t
>
&
freq_range
,
int64_t
top_k
,
const
std
::
vector
<
std
::
string
>
&
special_tokens
,
bool
special_first
)
{
auto
vocab
=
std
::
make_shared
<
Vocab
>
();
auto
ds
=
std
::
make_shared
<
BuildVocabDataset
>
(
vocab
,
columns
,
freq_range
,
top_k
,
special_tokens
,
special_first
);
if
(
!
ds
->
ValidateParams
())
{
return
nullptr
;
}
ds
->
children
.
push_back
(
shared_from_this
());
// Run tree here to starting building vocab
std
::
shared_ptr
<
Iterator
>
iter
=
ds
->
CreateIterator
();
if
(
iter
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Fail to run iterator in BuildVocab."
;
return
nullptr
;
}
// Finish building vocab by triggering GetNextRow
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
row
;
iter
->
GetNextRow
(
&
row
);
if
(
vocab
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Fail to build vocab."
;
return
nullptr
;
}
return
vocab
;
}
// Function to create a Concat dataset
// Function to create a Concat dataset
std
::
shared_ptr
<
ConcatDataset
>
Dataset
::
Concat
(
const
std
::
vector
<
std
::
shared_ptr
<
Dataset
>>
&
datasets
)
{
std
::
shared_ptr
<
ConcatDataset
>
Dataset
::
Concat
(
const
std
::
vector
<
std
::
shared_ptr
<
Dataset
>>
&
datasets
)
{
auto
ds
=
std
::
make_shared
<
ConcatDataset
>
(
datasets
);
auto
ds
=
std
::
make_shared
<
ConcatDataset
>
(
datasets
);
...
@@ -1450,13 +1482,52 @@ std::vector<std::shared_ptr<DatasetOp>> BatchDataset::Build() {
...
@@ -1450,13 +1482,52 @@ std::vector<std::shared_ptr<DatasetOp>> BatchDataset::Build() {
bool
BatchDataset
::
ValidateParams
()
{
bool
BatchDataset
::
ValidateParams
()
{
if
(
batch_size_
<=
0
)
{
if
(
batch_size_
<=
0
)
{
MS_LOG
(
ERROR
)
<<
"Batch:
Batch size cannot be negative"
;
MS_LOG
(
ERROR
)
<<
"Batch:
batch_size should be positive integer, but got: "
<<
batch_size_
;
return
false
;
return
false
;
}
}
return
true
;
return
true
;
}
}
// Constructor for BuildVocabDataset: every argument is stored in the member of the same name.
// No validation happens here; see ValidateParams().
BuildVocabDataset::BuildVocabDataset(std::shared_ptr<Vocab> vocab, const std::vector<std::string> &columns,
                                     const std::pair<int64_t, int64_t> &freq_range, int64_t top_k,
                                     const std::vector<std::string> &special_tokens, bool special_first)
    : vocab_(vocab),
      columns_(columns),
      freq_range_(freq_range),
      top_k_(top_k),
      special_tokens_(special_tokens),
      special_first_(special_first) {}
// Function to build BuildVocabDataset
std
::
vector
<
std
::
shared_ptr
<
DatasetOp
>>
BuildVocabDataset
::
Build
()
{
// A vector containing shared pointer to the Dataset Ops that this object will create
std
::
vector
<
std
::
shared_ptr
<
DatasetOp
>>
node_ops
;
std
::
shared_ptr
<
BuildVocabOp
>
build_vocab_op
;
build_vocab_op
=
std
::
make_shared
<
BuildVocabOp
>
(
vocab_
,
columns_
,
freq_range_
,
top_k_
,
special_tokens_
,
special_first_
,
num_workers_
,
connector_que_size_
);
node_ops
.
push_back
(
build_vocab_op
);
return
node_ops
;
}
bool
BuildVocabDataset
::
ValidateParams
()
{
if
(
vocab_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"BuildVocab: vocab is null."
;
return
false
;
}
if
(
top_k_
<
0
)
{
MS_LOG
(
ERROR
)
<<
"BuildVocab: top_k shoule be positive, but got: "
<<
top_k_
;
return
false
;
}
if
(
freq_range_
.
first
<
0
||
freq_range_
.
second
>
kDeMaxFreq
||
freq_range_
.
first
>
freq_range_
.
second
)
{
MS_LOG
(
ERROR
)
<<
"BuildVocab: requency_range [a,b] should be 0 <= a <= b (a,b are inclusive), "
<<
"but got ["
<<
freq_range_
.
first
<<
", "
<<
freq_range_
.
second
<<
"]"
;
return
false
;
}
return
true
;
}
// Function to build ConcatOp
// Function to build ConcatOp
ConcatDataset
::
ConcatDataset
(
const
std
::
vector
<
std
::
shared_ptr
<
Dataset
>>
&
datasets
)
:
datasets_
(
datasets
)
{
ConcatDataset
::
ConcatDataset
(
const
std
::
vector
<
std
::
shared_ptr
<
Dataset
>>
&
datasets
)
:
datasets_
(
datasets
)
{
this
->
children
=
datasets_
;
this
->
children
=
datasets_
;
...
...
mindspore/ccsrc/minddata/dataset/api/text.cc
0 → 100644
浏览文件 @
89cd4652
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "minddata/dataset/include/text.h"
#include "minddata/dataset/text/kernels/lookup_op.h"
#include "utils/log_adapter.h"
namespace
mindspore
{
namespace
dataset
{
namespace
api
{
namespace
text
{
std
::
shared_ptr
<
LookupOperation
>
Lookup
(
const
std
::
shared_ptr
<
Vocab
>
&
vocab
,
const
std
::
string
&
unknown_token
)
{
auto
op
=
std
::
make_shared
<
LookupOperation
>
(
vocab
,
unknown_token
);
if
(
!
op
->
ValidateParams
())
{
return
nullptr
;
}
return
op
;
}
// LookupOperation
// Stores the vocab and the unknown token; default_id_ starts as Vocab::kNoTokenExists and is
// resolved later by ValidateParams().
LookupOperation::LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token)
    : vocab_(vocab), unknown_token_(unknown_token), default_id_(Vocab::kNoTokenExists) {}
bool
LookupOperation
::
ValidateParams
()
{
if
(
vocab_
==
nullptr
)
{
LOG
(
ERROR
)
<<
"Lookup: vocab object type is incorrect or null."
;
return
false
;
}
if
(
unknown_token_
.
empty
())
{
LOG
(
ERROR
)
<<
"Lookup: no unknown token is specified."
;
return
false
;
}
else
{
default_id_
=
vocab_
->
Lookup
(
unknown_token_
);
if
(
default_id_
==
Vocab
::
kNoTokenExists
)
{
LOG
(
ERROR
)
<<
"Lookup: unknown_token: ["
+
unknown_token_
+
"], does not exist in vocab."
;
return
false
;
}
}
return
true
;
}
// Creates the runtime LookupOp from the stored vocab and the id cached in default_id_.
std::shared_ptr<TensorOp> LookupOperation::Build() {
  return std::make_shared<LookupOp>(vocab_, default_id_);
}
}
// namespace text
}
// namespace api
}
// namespace dataset
}
// namespace mindspore
mindspore/ccsrc/minddata/dataset/core/constants.h
浏览文件 @
89cd4652
...
@@ -59,6 +59,8 @@ inline void BitClear(uint32_t *bits, uint32_t bitMask) { *bits &= (~bitMask); }
...
@@ -59,6 +59,8 @@ inline void BitClear(uint32_t *bits, uint32_t bitMask) { *bits &= (~bitMask); }
constexpr
int32_t
kDeMaxDim
=
std
::
numeric_limits
<
int32_t
>::
max
();
// 2147483647 or 2^31 - 1
constexpr
int32_t
kDeMaxDim
=
std
::
numeric_limits
<
int32_t
>::
max
();
// 2147483647 or 2^31 - 1
constexpr
int32_t
kDeMaxRank
=
std
::
numeric_limits
<
int32_t
>::
max
();
constexpr
int32_t
kDeMaxRank
=
std
::
numeric_limits
<
int32_t
>::
max
();
constexpr
int64_t
kDeMaxFreq
=
std
::
numeric_limits
<
int64_t
>::
max
();
// 9223372036854775807 or 2^63 - 1
constexpr
int64_t
kDeMaxTopk
=
std
::
numeric_limits
<
int64_t
>::
max
();
constexpr
uint32_t
kCfgRowsPerBuffer
=
1
;
constexpr
uint32_t
kCfgRowsPerBuffer
=
1
;
constexpr
uint32_t
kCfgParallelWorkers
=
4
;
constexpr
uint32_t
kCfgParallelWorkers
=
4
;
...
...
mindspore/ccsrc/minddata/dataset/include/datasets.h
浏览文件 @
89cd4652
...
@@ -30,6 +30,7 @@
...
@@ -30,6 +30,7 @@
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/include/iterator.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/samplers.h"
#include "minddata/dataset/include/type_id.h"
#include "minddata/dataset/include/type_id.h"
#include "minddata/dataset/text/vocab.h"
namespace
mindspore
{
namespace
mindspore
{
namespace
dataset
{
namespace
dataset
{
...
@@ -39,6 +40,7 @@ class DatasetOp;
...
@@ -39,6 +40,7 @@ class DatasetOp;
class
DataSchema
;
class
DataSchema
;
class
Tensor
;
class
Tensor
;
class
TensorShape
;
class
TensorShape
;
class
Vocab
;
namespace
api
{
namespace
api
{
...
@@ -61,6 +63,7 @@ class TextFileDataset;
...
@@ -61,6 +63,7 @@ class TextFileDataset;
class
VOCDataset
;
class
VOCDataset
;
// Dataset Op classes (in alphabetical order)
// Dataset Op classes (in alphabetical order)
class
BatchDataset
;
class
BatchDataset
;
class
BuildVocabDataset
;
class
ConcatDataset
;
class
ConcatDataset
;
class
MapDataset
;
class
MapDataset
;
class
ProjectDataset
;
class
ProjectDataset
;
...
@@ -325,6 +328,24 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
...
@@ -325,6 +328,24 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
/// \return Shared pointer to the current BatchDataset
/// \return Shared pointer to the current BatchDataset
std
::
shared_ptr
<
BatchDataset
>
Batch
(
int32_t
batch_size
,
bool
drop_remainder
=
false
);
std
::
shared_ptr
<
BatchDataset
>
Batch
(
int32_t
batch_size
,
bool
drop_remainder
=
false
);
/// \brief Function to create a Vocab from source dataset
/// \notes Build a vocab from a dataset. This would collect all the unique words in a dataset and return a vocab
///    which contains top_k most frequent words (if top_k is specified)
/// \param[in] columns Column names to get words from. It can be a vector of column names
/// \param[in] freq_range A tuple of integers (min_frequency, max_frequency). Words within the frequency
///    range would be kept. 0 <= min_frequency <= max_frequency <= total_words. min_frequency/max_frequency
///    can be set to default, which corresponds to 0/total_words separately
/// \param[in] top_k Number of words to be built into vocab. top_k most frequent words are
///    taken. The top_k is taken after freq_range. If not enough top_k, all words will be taken
/// \param[in] special_tokens A list of strings, each one is a special token
/// \param[in] special_first Whether special_tokens will be prepended/appended to vocab, If special_tokens
///    is specified and special_first is set to default, special_tokens will be prepended
/// \return Shared pointer to the current Vocab, or nullptr if the parameters are invalid or the
///    pipeline iterator cannot be created
std::shared_ptr<Vocab> BuildVocab(const std::vector<std::string> &columns = {},
                                  const std::pair<int64_t, int64_t> &freq_range = {0, kDeMaxFreq},
                                  int64_t top_k = kDeMaxTopk,
                                  const std::vector<std::string> &special_tokens = {},
                                  bool special_first = true);
/// \brief Function to create a ConcatDataset
/// \brief Function to create a ConcatDataset
/// \notes Concat the datasets in the input
/// \notes Concat the datasets in the input
/// \param[in] datasets List of shared pointers to the dataset that should be concatenated together
/// \param[in] datasets List of shared pointers to the dataset that should be concatenated together
...
@@ -859,6 +880,33 @@ class BatchDataset : public Dataset {
...
@@ -859,6 +880,33 @@ class BatchDataset : public Dataset {
std
::
map
<
std
::
string
,
std
::
pair
<
TensorShape
,
std
::
shared_ptr
<
Tensor
>>>
pad_map_
;
std
::
map
<
std
::
string
,
std
::
pair
<
TensorShape
,
std
::
shared_ptr
<
Tensor
>>>
pad_map_
;
};
};
/// \brief Dataset node that builds a vocab from the words found in its child dataset.
class BuildVocabDataset : public Dataset {
 public:
  /// \brief Constructor
  /// \param[in] vocab Vocab object that is passed on to the runtime BuildVocabOp
  /// \param[in] columns Column names to get words from
  /// \param[in] freq_range Pair (min_frequency, max_frequency) of word frequencies to keep
  /// \param[in] top_k Number of words to be built into vocab
  /// \param[in] special_tokens Special tokens to add to the vocab
  /// \param[in] special_first Whether special_tokens are prepended (true) or appended (false)
  BuildVocabDataset(std::shared_ptr<Vocab> vocab, const std::vector<std::string> &columns,
                    const std::pair<int64_t, int64_t> &freq_range, int64_t top_k,
                    const std::vector<std::string> &special_tokens, bool special_first);

  /// \brief Destructor
  ~BuildVocabDataset() = default;

  /// \brief a base class override function to create the required runtime dataset op objects for this class
  /// \return The list of shared pointers to the newly created DatasetOps
  std::vector<std::shared_ptr<DatasetOp>> Build() override;

  /// \brief Parameters validation
  /// \return bool true if all the params are valid
  bool ValidateParams() override;

 private:
  std::shared_ptr<Vocab> vocab_;             // vocab object shared with the runtime op
  std::vector<std::string> columns_;         // columns to read words from
  std::pair<int64_t, int64_t> freq_range_;   // inclusive [min, max] word frequency
  int64_t top_k_;                            // number of most frequent words to keep
  std::vector<std::string> special_tokens_;  // special tokens to add
  bool special_first_;                       // prepend (true) or append (false) the special tokens
};
class
ConcatDataset
:
public
Dataset
{
class
ConcatDataset
:
public
Dataset
{
public:
public:
/// \brief Constructor
/// \brief Constructor
...
...
mindspore/ccsrc/minddata/dataset/include/text.h
0 → 100644
浏览文件 @
89cd4652
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_
#include <vector>
#include <memory>
#include <string>
#include "minddata/dataset/core/constants.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/text/vocab.h"
namespace
mindspore
{
namespace
dataset
{
namespace
api
{
// Transform operations for text
namespace
text
{
// Text Op classes (in alphabetical order)
class
LookupOperation
;
/// \brief Lookup operator that looks up a word to an id.
/// \param[in] vocab a Vocab object.
/// \param[in] unknown_token word to use for lookup if the word being looked up is out of Vocabulary (oov).
///   It must not be empty. If unknown_token itself is oov, an error is logged and nullptr is returned.
/// \return Shared pointer to the current TensorOperation, or nullptr when the parameters are invalid.
std::shared_ptr<LookupOperation> Lookup(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token);
/* ####################################### Derived TensorOperation classes ################################# */
/// \brief TensorOperation describing a vocab lookup.
class LookupOperation : public TensorOperation {
 public:
  /// \brief Constructor
  /// \param[in] vocab Vocab object used for the lookup
  /// \param[in] unknown_token Word whose id is used for out-of-vocabulary words
  explicit LookupOperation(const std::shared_ptr<Vocab> &vocab, const std::string &unknown_token);

  /// \brief Destructor
  ~LookupOperation() = default;

  /// \brief Create the runtime TensorOp for this operation
  /// \return Shared pointer to the created TensorOp
  std::shared_ptr<TensorOp> Build() override;

  /// \brief Parameters validation
  /// \return bool true if all the params are valid
  bool ValidateParams() override;

 private:
  std::shared_ptr<Vocab> vocab_;  // vocab used for the lookup
  std::string unknown_token_;     // word substituted for out-of-vocabulary words
  int32_t default_id_;            // id of unknown_token_ in vocab_
};
}
// namespace text
}
// namespace api
}
// namespace dataset
}
// namespace mindspore
#endif // MINDSPORE_CCSRC_MINDDATA_DATASET_API_TEXT_H_
mindspore/ccsrc/minddata/dataset/text/vocab.cc
浏览文件 @
89cd4652
...
@@ -17,8 +17,10 @@
...
@@ -17,8 +17,10 @@
#include <unordered_set>
#include <unordered_set>
#include <unordered_map>
#include <unordered_map>
#include <utility>
#include <utility>
#include <algorithm>
#include "minddata/dataset/text/vocab.h"
#include "minddata/dataset/text/vocab.h"
#include "utils/log_adapter.h"
namespace
mindspore
{
namespace
mindspore
{
namespace
dataset
{
namespace
dataset
{
...
@@ -51,6 +53,147 @@ Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tok
...
@@ -51,6 +53,147 @@ Status Vocab::BuildFromPyList(const py::list &words, const py::list &special_tok
return
Status
::
OK
();
return
Status
::
OK
();
}
}
// Build a vocab from a python dict: every key is converted to a string and mapped to the integer
// stored as its value. The resulting vocab is written to *vocab.
Status Vocab::BuildFromPyDict(const py::dict &words, std::shared_ptr<Vocab> *vocab) {
  std::unordered_map<WordType, WordIdType> token_to_id;
  for (auto entry : words) {
    token_to_id[py::str(entry.first)] = py::reinterpret_borrow<py::int_>(entry.second);
  }
  *vocab = std::make_shared<Vocab>(std::move(token_to_id));
  return Status::OK();
}
void
Vocab
::
append_word
(
const
std
::
string
&
word
)
{
if
(
word2id_
.
find
(
word
)
==
word2id_
.
end
())
{
word2id_
[
word
]
=
word2id_
.
size
();
}
}
// Build a vocab from a word -> id map.
// Fails (logging and returning an unexpected-error Status) as soon as a negative id is encountered;
// otherwise a copy of the map becomes the new vocab, which is written to *vocab.
Status Vocab::BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
                                    std::shared_ptr<Vocab> *vocab) {
  // Validate parameters and build map
  std::unordered_map<WordType, WordIdType> word2id;
  word2id.reserve(words.size());
  for (const auto &p : words) {
    if (p.second < 0) {
      MS_LOG(ERROR) << "index can not be negative, but got " << p.second;
      RETURN_STATUS_UNEXPECTED("index can not be negative, but got " + std::to_string(p.second));
    }
    word2id[p.first] = p.second;
  }
  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
// Build a vocab from a vector of words plus a vector of special tokens.
// Neither vector may contain a repeated entry; a violation is logged and reported through the Status.
// Ids are assigned in vector order. With prepend_special the special tokens take ids
// [0, special_tokens.size()) and the words follow; otherwise the words start at 0 and the special
// tokens follow.
// NOTE(review): a token present in both `words` and `special_tokens` is not rejected here; the
// special token's id silently replaces the word's id - confirm whether that is intended.
Status Vocab::BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
                              bool prepend_special, std::shared_ptr<Vocab> *vocab) {
  // Collects each repeated entry once (comma separated, in order of first repetition) into *listed and
  // reports whether any repetition exists. Exact matching through hash sets keeps this linear and also
  // catches a repeated empty string, which a substring search on the message text would miss.
  auto find_duplicates = [](const std::vector<WordType> &items, std::string *listed) -> bool {
    std::unordered_set<WordType> seen;
    std::unordered_set<WordType> reported;
    bool found = false;
    for (const WordType &item : items) {
      if (seen.insert(item).second || !reported.insert(item).second) {
        continue;  // first occurrence, or a duplicate that is already listed
      }
      if (found) {
        *listed += ", ";
      }
      *listed += item;
      found = true;
    }
    return found;
  };

  // Validate parameters
  std::string duplicate_word;
  if (find_duplicates(words, &duplicate_word)) {
    MS_LOG(ERROR) << "words contains duplicate word: " << duplicate_word;
    RETURN_STATUS_UNEXPECTED("words contains duplicate word: " + duplicate_word);
  }

  std::string duplicate_sp;
  if (find_duplicates(special_tokens, &duplicate_sp)) {
    MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp;
    RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp);
  }

  std::unordered_map<WordType, WordIdType> word2id;
  word2id.reserve(words.size() + special_tokens.size());

  // if special is added in front, normal words id will start from number of special tokens
  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  for (const auto &word : words) {
    word2id[word] = word_id++;
  }

  word_id = prepend_special ? 0 : static_cast<WordIdType>(word2id.size());
  for (const auto &special_token : special_tokens) {
    word2id[special_token] = word_id++;
  }

  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
// Build a vocab by reading one word per line from a text file.
// - vocab_size: -1 reads the whole file, a positive value stops after that many words; other
//   negative values are rejected.
// - delimiter: when non-empty, only the text before the first occurrence of any delimiter character
//   is kept from each line.
// - special_tokens must not contain repeated entries, and no word read from the file may equal a
//   special token or an earlier word.
// Ids are assigned in reading order. With prepend_special the special tokens take ids
// [0, special_tokens.size()) and the file words follow; otherwise the file words start at 0 and the
// special tokens follow. Every failure is logged and reported through the returned Status.
Status Vocab::BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
                               const std::vector<WordType> &special_tokens, bool prepend_special,
                               std::shared_ptr<Vocab> *vocab) {
  // Validate parameters
  if (vocab_size < 0 && vocab_size != -1) {
    MS_LOG(ERROR) << "vocab_size should be either -1 or positive integer, but got " << vocab_size;
    RETURN_STATUS_UNEXPECTED("vocab_size should be either -1 or positive integer, but got " +
                             std::to_string(vocab_size));
  }

  // `specials` is filled while checking for repeated special tokens and is reused below to check that
  // words in the file don't contain any special token. Exact matching through hash sets keeps the
  // check linear and also catches a repeated empty token.
  std::unordered_set<std::string> specials;
  std::unordered_set<std::string> reported;
  std::string duplicate_sp;
  bool has_duplicate_sp = false;
  for (const WordType &sp : special_tokens) {
    if (specials.insert(sp).second || !reported.insert(sp).second) {
      continue;  // first occurrence, or a duplicate that is already listed
    }
    if (has_duplicate_sp) {
      duplicate_sp += ", ";
    }
    duplicate_sp += sp;
    has_duplicate_sp = true;
  }
  if (has_duplicate_sp) {
    MS_LOG(ERROR) << "special_tokens contains duplicate word: " << duplicate_sp;
    RETURN_STATUS_UNEXPECTED("special_tokens contains duplicate word: " + duplicate_sp);
  }

  // if special is added in front, normal words id will start from number of special tokens
  WordIdType word_id = prepend_special ? static_cast<WordIdType>(special_tokens.size()) : 0;
  std::unordered_map<WordType, WordIdType> word2id;

  std::fstream handle(path, std::ios::in);
  if (!handle.good() || !handle.is_open()) {
    MS_LOG(ERROR) << "fail to open:" + path;
    RETURN_STATUS_UNEXPECTED("fail to open:" + path);
  }

  std::string word;
  while (std::getline(handle, word)) {
    if (!delimiter.empty()) {
      // if no delimiter character is found, find_first_of returns std::string::npos and substr keeps
      // the whole line
      word = word.substr(0, word.find_first_of(delimiter));
    }
    if (word2id.find(word) != word2id.end()) {
      MS_LOG(ERROR) << "duplicate word:" + word + ".";
      RETURN_STATUS_UNEXPECTED("duplicate word:" + word + ".");
    }
    if (specials.find(word) != specials.end()) {
      MS_LOG(ERROR) << word + " is already in special_tokens.";
      RETURN_STATUS_UNEXPECTED(word + " is already in special_tokens.");
    }
    word2id[word] = word_id++;
    // stop once enough words are read; with vocab_size == -1 this never triggers and the whole file
    // is consumed
    if (static_cast<int64_t>(word2id.size()) == vocab_size) {
      break;
    }
  }

  word_id = prepend_special ? 0 : static_cast<WordIdType>(word2id.size());
  for (const auto &special_token : special_tokens) {
    word2id[special_token] = word_id++;
  }

  *vocab = std::make_shared<Vocab>(std::move(word2id));
  return Status::OK();
}
Status
Vocab
::
BuildFromFile
(
const
std
::
string
&
path
,
const
std
::
string
&
delimiter
,
int32_t
vocab_size
,
Status
Vocab
::
BuildFromFile
(
const
std
::
string
&
path
,
const
std
::
string
&
delimiter
,
int32_t
vocab_size
,
const
py
::
list
&
special_tokens
,
bool
prepend_special
,
std
::
shared_ptr
<
Vocab
>
*
vocab
)
{
const
py
::
list
&
special_tokens
,
bool
prepend_special
,
std
::
shared_ptr
<
Vocab
>
*
vocab
)
{
// python validator checks special_tokens doesn't contain any duplicate words
// python validator checks special_tokens doesn't contain any duplicate words
...
@@ -86,21 +229,6 @@ Status Vocab::BuildFromFile(const std::string &path, const std::string &delimite
...
@@ -86,21 +229,6 @@ Status Vocab::BuildFromFile(const std::string &path, const std::string &delimite
return
Status
::
OK
();
return
Status
::
OK
();
}
}
Status
Vocab
::
BuildFromPyDict
(
const
py
::
dict
&
words
,
std
::
shared_ptr
<
Vocab
>
*
vocab
)
{
std
::
unordered_map
<
WordType
,
WordIdType
>
word2id
;
for
(
auto
p
:
words
)
{
word2id
[
py
::
str
(
p
.
first
)]
=
py
::
reinterpret_borrow
<
py
::
int_
>
(
p
.
second
);
}
*
vocab
=
std
::
make_shared
<
Vocab
>
(
std
::
move
(
word2id
));
return
Status
::
OK
();
}
void
Vocab
::
append_word
(
const
std
::
string
&
word
)
{
if
(
word2id_
.
find
(
word
)
==
word2id_
.
end
())
{
word2id_
[
word
]
=
word2id_
.
size
();
}
}
const
WordIdType
Vocab
::
kNoTokenExists
=
-
1
;
const
WordIdType
Vocab
::
kNoTokenExists
=
-
1
;
}
// namespace dataset
}
// namespace dataset
...
...
mindspore/ccsrc/minddata/dataset/text/vocab.h
浏览文件 @
89cd4652
...
@@ -57,6 +57,34 @@ class Vocab {
...
@@ -57,6 +57,34 @@ class Vocab {
static
Status
BuildFromFile
(
const
std
::
string
&
path
,
const
std
::
string
&
delimiter
,
int32_t
vocab_size
,
static
Status
BuildFromFile
(
const
std
::
string
&
path
,
const
std
::
string
&
delimiter
,
int32_t
vocab_size
,
const
py
::
list
&
special_tokens
,
bool
prepend_special
,
std
::
shared_ptr
<
Vocab
>
*
vocab
);
const
py
::
list
&
special_tokens
,
bool
prepend_special
,
std
::
shared_ptr
<
Vocab
>
*
vocab
);
/// \brief Build a vocab from a c++ map. Every word id must be non-negative
/// \param[in] words An unordered_map containing word, word id pair.
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
                                    std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from a c++ vector. Ids are assigned consecutively in vector order; neither
///     words nor special_tokens may contain duplicates
/// \param[in] words A vector of string, used to build vocab. Their ids start from the number of
///     special tokens when prepend_special is true, otherwise from 0
/// \param[in] special_tokens A vector of string contain special tokens
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
                              bool prepend_special, std::shared_ptr<Vocab> *vocab);
/// \brief Build a vocab from reading a vocab file. Ids are automatically assigned in reading order,
///     starting from the number of special tokens when prepend_special is true, otherwise from 0
/// \param[in] path Path to vocab file , each line is assumed to contain 1 word
/// \param[in] delimiter Delimiter to break each line with; only the part before it is used as the word
/// \param[in] vocab_size Number of words to read from file, -1 reads the whole file
/// \param[in] special_tokens A vector of string contain special tokens
/// \param[in] prepend_special Whether special_tokens will be prepended/appended to vocab
/// \param[out] vocab A vocab object
/// \return Error code
static Status BuildFromFileCpp(const std::string &path, const std::string &delimiter, int32_t vocab_size,
                               const std::vector<WordType> &special_tokens, bool prepend_special,
                               std::shared_ptr<Vocab> *vocab);
// Lookup the id of a word, if word doesn't exist in vocab, return default_id
// Lookup the id of a word, if word doesn't exist in vocab, return default_id
// @param const WordType word - word to look up
// @param const WordType word - word to look up
// @param WordIdType default_id - word id to return to user when its not in the vocab
// @param WordIdType default_id - word id to return to user when its not in the vocab
...
...
tests/ut/cpp/dataset/CMakeLists.txt
浏览文件 @
89cd4652
...
@@ -97,6 +97,7 @@ SET(DE_UT_SRCS
...
@@ -97,6 +97,7 @@ SET(DE_UT_SRCS
concatenate_op_test.cc
concatenate_op_test.cc
cyclic_array_test.cc
cyclic_array_test.cc
perf_data_test.cc
perf_data_test.cc
build_vocab_test.cc
c_api_samplers_test.cc
c_api_samplers_test.cc
c_api_transforms_test.cc
c_api_transforms_test.cc
c_api_dataset_ops_test.cc
c_api_dataset_ops_test.cc
...
@@ -104,12 +105,13 @@ SET(DE_UT_SRCS
...
@@ -104,12 +105,13 @@ SET(DE_UT_SRCS
c_api_dataset_clue_test.cc
c_api_dataset_clue_test.cc
c_api_dataset_coco_test.cc
c_api_dataset_coco_test.cc
c_api_dataset_csv_test.cc
c_api_dataset_csv_test.cc
c_api_dataset_
filetext
_test.cc
c_api_dataset_
textfile
_test.cc
c_api_dataset_manifest_test.cc
c_api_dataset_manifest_test.cc
c_api_dataset_randomdata_test.cc
c_api_dataset_randomdata_test.cc
c_api_dataset_voc_test.cc
c_api_dataset_voc_test.cc
c_api_datasets_test.cc
c_api_datasets_test.cc
c_api_dataset_iterator_test.cc
c_api_dataset_iterator_test.cc
c_api_dataset_vocab.cc
tensor_op_fusion_pass_test.cc
tensor_op_fusion_pass_test.cc
sliding_window_op_test.cc
sliding_window_op_test.cc
epoch_ctrl_op_test.cc
epoch_ctrl_op_test.cc
...
...
tests/ut/cpp/dataset/build_vocab_test.cc
0 → 100644
浏览文件 @
89cd4652
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include <iostream>
#include <memory>
#include <vector>
#include <string>
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
using
mindspore
::
dataset
::
Tensor
;
using
mindspore
::
dataset
::
Status
;
using
mindspore
::
dataset
::
Vocab
;
class
MindDataTestVocab
:
public
UT
::
DatasetOpTesting
{
protected:
};
// A vocab built from a word -> id map returns the mapped id for known words and -1 for unknown ones.
TEST_F(MindDataTestVocab, TestVocabFromUnorderedMap) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromUnorderedMap.";
  // Source map
  std::unordered_map<std::string, int32_t> word_ids = {{"banana", 0}, {"apple", 1}, {"cat", 2}, {"dog", 3}};

  // Build vocab from map
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromUnorderedMap(word_ids, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Look up specified words
  std::vector<std::string> queries = {"apple", "dog", "egg"};
  std::vector<int32_t> expected_ids = {1, 3, -1};
  for (uint32_t idx = 0; idx < queries.size(); ++idx) {
    int32_t found_id = vocab->Lookup(queries[idx]);
    EXPECT_EQ(found_id, expected_ids[idx]);
  }
}
// A vocab built from an empty map is valid and answers every lookup with -1.
TEST_F(MindDataTestVocab, TestVocabFromEmptyMap) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyMap.";
  // Build vocab from empty map
  std::unordered_map<std::string, int32_t> word_ids;
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromUnorderedMap(word_ids, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Look up specified words
  // Expect that we will return -1 when word is not in vocab
  std::vector<std::string> queries = {"apple", "dog", "egg"};
  std::vector<int32_t> expected_ids = {-1, -1, -1};
  for (uint32_t idx = 0; idx < queries.size(); ++idx) {
    int32_t found_id = vocab->Lookup(queries[idx]);
    EXPECT_EQ(found_id, expected_ids[idx]);
  }
}
// Building a vocab from a map that contains a negative id must fail.
TEST_F(MindDataTestVocab, TestVocabFromMapFail) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromMapFail.";
  // Source map with one invalid (negative) id
  std::unordered_map<std::string, int32_t> word_ids = {{"banana", 0}, {"apple", -1}};

  // Expected failure: index of word can not be negative
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromUnorderedMap(word_ids, &vocab);
  EXPECT_NE(rc, Status::OK());
}
// Builds a vocab from a word list with the special token "<unk>" prepended, so the
// regular words are expected to start at id 1 ("apple" -> 1, "banana" -> 2).
TEST_F(MindDataTestVocab, TestVocabFromVectorPrependSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorPrependSpTokens.";

  // Word list; special tokens go in front of it (third argument is true)
  std::vector<std::string> word_list = {"apple", "banana", "cat", "dog", "egg"};
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(word_list, {"<unk>"}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // "fox" is not in the vocab, so its lookup should yield -1
  std::vector<std::string> queries = {"apple", "banana", "fox"};
  std::vector<int32_t> want = {1, 2, -1};
  for (size_t idx = 0; idx < queries.size(); ++idx) {
    int32_t got = vocab->Lookup(queries[idx]);
    EXPECT_EQ(got, want[idx]);
  }
}
// Builds a vocab from a word list with the special token "<unk>" appended, so the
// regular words keep ids 0..4 and "<unk>" is expected at id 5.
TEST_F(MindDataTestVocab, TestVocabFromVectorAppendSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorAppendSpTokens.";

  // Word list; special tokens go after it (third argument is false)
  std::vector<std::string> word_list = {"apple", "banana", "cat", "dog", "egg"};
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(word_list, {"<unk>"}, false, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Check a regular word, the special token and a missing word
  std::vector<std::string> queries = {"apple", "<unk>", "fox"};
  std::vector<int32_t> want = {0, 5, -1};
  for (size_t idx = 0; idx < queries.size(); ++idx) {
    int32_t got = vocab->Lookup(queries[idx]);
    EXPECT_EQ(got, want[idx]);
  }
}
// Builds a vocab from a word list with an empty special-token list: words keep their
// list positions as ids, and "<pad>" (never added) is expected to miss.
TEST_F(MindDataTestVocab, TestVocabFromVectorWithNoSpTokens) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorWithNoSpTokens.";

  // Word list and an explicitly empty special-token list
  std::vector<std::string> word_list = {"apple", "banana", "cat", "dog", "egg"};
  std::vector<std::string> no_special_tokens = {};
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(word_list, no_special_tokens, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Two present words followed by two absent ones
  std::vector<std::string> queries = {"apple", "banana", "fox", "<pad>"};
  std::vector<int32_t> want = {0, 1, -1, -1};
  for (size_t idx = 0; idx < queries.size(); ++idx) {
    int32_t got = vocab->Lookup(queries[idx]);
    EXPECT_EQ(got, want[idx]);
  }
}
// Building from an empty word list with no special tokens is expected to succeed;
// every lookup on the resulting vocab must miss (-1).
TEST_F(MindDataTestVocab, TestVocabFromEmptyVector) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromEmptyVector.";

  // Both the word list and the special-token list are empty
  std::vector<std::string> word_list = {};
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(word_list, {}, false, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Nothing is in the vocab, so each lookup should yield -1
  std::vector<std::string> queries = {"apple", "banana", "fox"};
  std::vector<int32_t> want = {-1, -1, -1};
  for (size_t idx = 0; idx < queries.size(); ++idx) {
    int32_t got = vocab->Lookup(queries[idx]);
    EXPECT_EQ(got, want[idx]);
  }
}
// Negative case: a word list containing duplicates ("apple", "cat") must be rejected.
TEST_F(MindDataTestVocab, TestVocabFromVectorFail1) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail1.";

  // Word list with repeated entries and no special tokens
  std::vector<std::string> word_list = {"apple", "apple", "cat", "cat", "egg"};
  std::vector<std::string> no_special_tokens = {};
  auto vocab = std::make_shared<Vocab>();

  // Expected failure: duplicate word apple
  Status rc = Vocab::BuildFromVector(word_list, no_special_tokens, true, &vocab);
  EXPECT_NE(rc, Status::OK());
}
// Negative case: a special-token list containing duplicates ("<pad>", "<unk>") must be rejected.
TEST_F(MindDataTestVocab, TestVocabFromVectorFail2) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromVectorFail2.";

  // Valid word list paired with a special-token list that repeats entries
  std::vector<std::string> word_list = {"apple", "dog", "egg"};
  std::vector<std::string> special_tokens = {"<pad>", "<unk>", "<pad>", "<unk>", "<none>"};
  auto vocab = std::make_shared<Vocab>();

  // Expected failure: duplicate special token <pad> <unk>
  Status rc = Vocab::BuildFromVector(word_list, special_tokens, true, &vocab);
  EXPECT_NE(rc, Status::OK());
}
// Builds a vocab from the local vocab_list.txt file with "<pad>" and "<unk>" prepended,
// then checks the ids of two words taken from that file.
TEST_F(MindDataTestVocab, TestVocabFromFile) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFile.";

  // Path of the vocab file under the shared test-data root
  std::string vocab_path = datasets_root_path_ + "/testVocab/vocab_list.txt";
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromFileCpp(vocab_path, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // With two special tokens in front, the expected ids start at 2
  std::vector<std::string> queries = {"not", "all"};
  std::vector<int32_t> want = {2, 3};
  for (size_t idx = 0; idx < queries.size(); ++idx) {
    int32_t got = vocab->Lookup(queries[idx]);
    EXPECT_EQ(got, want[idx]);
  }
}
// Negative case: building a vocab from a file path that does not exist must fail.
TEST_F(MindDataTestVocab, TestVocabFromFileFail1) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail1.";

  // Path pointing at a file that is not present in the test data
  std::string vocab_path = datasets_root_path_ + "/testVocab/not_exist.txt";
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromFileCpp(vocab_path, ",", -1, {}, true, &vocab);
  EXPECT_NE(rc, Status::OK());
}
// Negative case: building a vocab from an existing file with an invalid vocab_size (-2) must fail.
TEST_F(MindDataTestVocab, TestVocabFromFileFail2) {
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail2.";

  // Path of the vocab file under the shared test-data root
  std::string vocab_path = datasets_root_path_ + "/testVocab/vocab_list.txt";
  auto vocab = std::make_shared<Vocab>();

  // Expected failure: vocab_size should be either -1 or a positive integer
  Status rc = Vocab::BuildFromFileCpp(vocab_path, ",", -2, {}, true, &vocab);
  EXPECT_NE(rc, Status::OK());
}
// Negative case: building a vocab from an existing file while passing a special-token
// list that repeats "<unk>" must fail.
TEST_F(MindDataTestVocab, TestVocabFromFileFail3) {
  // The log line previously named TestVocabFromFileFail2 (copy-paste slip), which made
  // the two tests indistinguishable in the log output.
  MS_LOG(INFO) << "Doing MindDataTestVocab-TestVocabFromFileFail3.";

  // Build vocab from a local file that does exist; only the special tokens are invalid
  std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();

  // Expected failure: duplicate special token <unk>
  Status s = Vocab::BuildFromFileCpp(vocab_dir, ",", -1, {"<unk>", "<unk>"}, true, &vocab);
  EXPECT_NE(s, Status::OK());
}
tests/ut/cpp/dataset/c_api_dataset_
filetext
_test.cc
→
tests/ut/cpp/dataset/c_api_dataset_
textfile
_test.cc
浏览文件 @
89cd4652
文件已移动
tests/ut/cpp/dataset/c_api_dataset_voc_test.cc
浏览文件 @
89cd4652
...
@@ -14,7 +14,6 @@
...
@@ -14,7 +14,6 @@
* limitations under the License.
* limitations under the License.
*/
*/
#include "common/common.h"
#include "common/common.h"
#include "minddata/dataset/engine/datasetops/source/voc_op.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/datasets.h"
using
namespace
mindspore
::
dataset
::
api
;
using
namespace
mindspore
::
dataset
::
api
;
...
...
tests/ut/cpp/dataset/c_api_dataset_vocab.cc
0 → 100644
浏览文件 @
89cd4652
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include <iostream>
#include <memory>
#include <vector>
#include <string>
#include "common/common.h"
#include "minddata/dataset/include/datasets.h"
#include "minddata/dataset/include/status.h"
#include "minddata/dataset/include/transforms.h"
#include "minddata/dataset/include/text.h"
using
namespace
mindspore
::
dataset
::
api
;
using
mindspore
::
dataset
::
ShuffleMode
;
using
mindspore
::
dataset
::
Tensor
;
using
mindspore
::
dataset
::
Status
;
using
mindspore
::
dataset
::
Vocab
;
// Fixture shared by the vocab pipeline (C++ dataset API) tests below. It adds no members
// of its own and only inherits from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
// End-to-end check of text::Lookup: reads words.txt as a TextFile dataset, maps the "text"
// column through a vocab built from a word list (with "<pad>", "<unk>" prepended) and
// compares every produced id with the expected sequence.
//
// Pointers that are dereferenced later are checked with ASSERT_* so a failed construction
// aborts the test instead of crashing on a null dereference.
TEST_F(MindDataTestPipeline, TestVocabLookupOp) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOp.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  ASSERT_NE(ds, nullptr);

  // Create a vocab from vector
  std::vector<std::string> list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {"<pad>", "<unk>"}, true, &vocab);
  ASSERT_EQ(s, Status::OK());

  // Create Lookup operation on ds; words missing from the vocab map to "<unk>"
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  ASSERT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  std::vector<int32_t> expected = {2, 1, 4, 5, 6, 7};
  while (row.size() != 0) {
    // Guard the index: more rows than expected values would read past the end of `expected`
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    // operator[] default-inserts a null pointer if the column is missing
    ASSERT_NE(ind, nullptr);
    MS_LOG(INFO) << ind->shape() << " " << *ind;
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar(expected[i], &expected_item);
    ASSERT_NE(expected_item, nullptr);
    EXPECT_EQ(*ind, *expected_item);
    iter->GetNextRow(&row);
    i++;
  }

  // Without this check the test would pass vacuously if the pipeline produced no rows
  EXPECT_EQ(i, expected.size());
}
// Negative case: text::Lookup must return nullptr when the requested unknown token
// ("<unk>") is not a word of the vocab.
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail1.";

  // Create a TextFile dataset
  std::string words_path = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build the vocab from a word list without any special tokens
  std::vector<std::string> word_list = {"home", "IS", "behind", "the", "world", "ahead", "!"};
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(word_list, {}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Expected failure: "<unk>" is not a word of vocab
  std::shared_ptr<TensorOperation> lookup_op = text::Lookup(vocab, "<unk>");
  EXPECT_EQ(lookup_op, nullptr);
}
// Negative case: text::Lookup must return nullptr when given a null vocab pointer.
TEST_F(MindDataTestPipeline, TestVocabLookupOpFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpFail2.";

  // Default-constructed shared_ptr, i.e. no vocab at all
  std::shared_ptr<Vocab> null_vocab;

  // Expected failure: vocab is null
  std::shared_ptr<TensorOperation> lookup_op = text::Lookup(null_vocab, "");
  EXPECT_EQ(lookup_op, nullptr);
}
// Negative case: text::Lookup must return nullptr when the unknown token is the empty
// string and the vocab does not contain it.
TEST_F(MindDataTestPipeline, TestVocabLookupOpWithEmptyUnknownToken) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabLookupOpWithEmptyUnknownToken.";

  // Create a TextFile dataset
  std::string words_path = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Build a single-entry vocab from a map
  std::unordered_map<std::string, int32_t> word_ids = {{"Home", 3}};
  auto vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromUnorderedMap(word_ids, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Expected failure: "" is not a word of vocab
  std::shared_ptr<TensorOperation> lookup_op = text::Lookup(vocab, "");
  EXPECT_EQ(lookup_op, nullptr);
}
// End-to-end check of Dataset::BuildVocab with explicit arguments: builds a vocab from the
// "text" column of words.txt (with "<pad>", "<unk>" prepended), verifies the id of "home",
// then maps the dataset through text::Lookup and compares every produced id.
//
// Pointers that are dereferenced later are checked with ASSERT_* so a failed construction
// aborts the test instead of crashing on a null dereference.
TEST_F(MindDataTestPipeline, TestVocabFromDataset) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDataset.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  ASSERT_NE(ds, nullptr);

  // Create vocab from dataset
  std::shared_ptr<Vocab> vocab = ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()},
                                                std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true);
  ASSERT_NE(vocab, nullptr);

  // Check if vocab has words or not
  int32_t home_index = vocab->Lookup("home");
  EXPECT_EQ(home_index, 4);

  // Create Lookup operation on ds
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "<unk>");
  ASSERT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  std::vector<int32_t> expected = {4, 5, 3, 6, 7, 2};
  while (row.size() != 0) {
    // Guard the index: more rows than expected values would read past the end of `expected`
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    // operator[] default-inserts a null pointer if the column is missing
    ASSERT_NE(ind, nullptr);
    MS_LOG(INFO) << ind->shape() << " " << *ind;
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar(expected[i], &expected_item);
    ASSERT_NE(expected_item, nullptr);
    EXPECT_EQ(*ind, *expected_item);
    iter->GetNextRow(&row);
    i++;
  }

  // Without this check the test would pass vacuously if the pipeline produced no rows
  EXPECT_EQ(i, expected.size());
}
// End-to-end check of Dataset::BuildVocab called with no arguments: builds a vocab from
// words.txt, verifies the id of "home", then maps the dataset through text::Lookup (using
// "home" as the unknown token) and compares every produced id.
//
// Pointers that are dereferenced later are checked with ASSERT_* so a failed construction
// aborts the test instead of crashing on a null dereference.
TEST_F(MindDataTestPipeline, TestVocabFromDatasetDefault) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetDefault.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  ASSERT_NE(ds, nullptr);

  // Create vocab from dataset
  std::shared_ptr<Vocab> vocab = ds->BuildVocab();
  ASSERT_NE(vocab, nullptr);

  // Check if vocab has words or not
  int32_t home_index = vocab->Lookup("home");
  EXPECT_EQ(home_index, 2);

  // Create Lookup operation on ds
  std::shared_ptr<TensorOperation> lookup = text::Lookup(vocab, "home");
  ASSERT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup});
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
  iter->GetNextRow(&row);

  uint64_t i = 0;
  std::vector<int32_t> expected = {2, 3, 1, 4, 5, 0};
  while (row.size() != 0) {
    // Guard the index: more rows than expected values would read past the end of `expected`
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    // operator[] default-inserts a null pointer if the column is missing
    ASSERT_NE(ind, nullptr);
    MS_LOG(INFO) << ind->shape() << " " << *ind;
    std::shared_ptr<Tensor> expected_item;
    Tensor::CreateScalar(expected[i], &expected_item);
    ASSERT_NE(expected_item, nullptr);
    EXPECT_EQ(*ind, *expected_item);
    iter->GetNextRow(&row);
    i++;
  }

  // Without this check the test would pass vacuously if the pipeline produced no rows
  EXPECT_EQ(i, expected.size());
}
// Negative case: Dataset::BuildVocab must return nullptr when top_k is negative (-2).
TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // ds is dereferenced below, so a null dataset must abort the test rather than crash it
  ASSERT_NE(ds, nullptr);

  // Create vocab from dataset
  // Expected failure: top_k can not be negative
  std::shared_ptr<Vocab> vocab =
    ds->BuildVocab({"text"}, {0, std::numeric_limits<int64_t>::max()}, -2, {"<pad>", "<unk>"}, true);
  EXPECT_EQ(vocab, nullptr);
}
// Negative case: Dataset::BuildVocab must return nullptr when the frequency range is
// inverted ({4, 1}).
TEST_F(MindDataTestPipeline, TestVocabFromDatasetFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVocabFromDatasetFail2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVocab/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  // ds is dereferenced below, so a null dataset must abort the test rather than crash it
  ASSERT_NE(ds, nullptr);

  // Create vocab from dataset
  // Expected failure: frequency_range [a,b] should be 0 <= a <= b
  std::shared_ptr<Vocab> vocab =
    ds->BuildVocab({"text"}, {4, 1}, std::numeric_limits<int64_t>::max(), {"<pad>", "<unk>"}, true);
  EXPECT_EQ(vocab, nullptr);
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录