未验证 提交 3f2d6a3f 编写于 作者: S Steffy-zxf 提交者: GitHub

Add FasterTokenizer Operator (#34491)

Add Tokenizer-related functionality for the Transformer model so that the training and prediction processes are consistent.

* support the text string as an input Tensor
* support the "VOCAB" unordered_map<wstring, int> as an input Tensor to look up tokens
* Tokenizer used for BERT. This tokenizer applies an end-to-end, text string to wordpiece tokenization.
* It first applies basic tokenization, followed by wordpiece tokenization.
上级 873ee4e3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Download, build, and install the utf8proc Unicode-processing library as an
# ExternalProject, then expose it as the imported static target `utf8proc`.
INCLUDE(ExternalProject)
# Staging/build directory for the external project.
SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc)
# Final install location; headers and the static library land here.
SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc)
# As we add extra features for utf8proc, we use the non-official repo
SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git)
SET(UTF8PROC_TAG v2.6.1)
IF(WIN32)
SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib")
# utf8proc symbols are dllexport'ed unless UTF8PROC_STATIC is defined,
# so consumers linking the static .lib need this definition.
add_definitions(-DUTF8PROC_STATIC)
ELSE(WIN32)
SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a")
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include)
ExternalProject_Add(
extern_utf8proc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${UTF8PROC_REPOSITORY}
GIT_TAG ${UTF8PROC_TAG}
PREFIX ${UTF8PROC_PREFIX_DIR}
UPDATE_COMMAND ""
# NOTE(review): BUILD_SHARED=ON is passed but only the static library is
# imported below — confirm the shared build is intentional.
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DBUILD_SHARED=ON
-DBUILD_STATIC=ON
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
# Declaring the byproduct lets Ninja track the generated archive.
BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES}
)
ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES})
ADD_DEPENDENCIES(utf8proc extern_utf8proc)
......@@ -124,6 +124,11 @@ function(copy_part_of_thrid_party TARGET DST)
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
set(dst_dir "${DST}/third_party/install/utf8proc")
copy(${TARGET}
SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
if (WITH_CRYPTO)
set(dst_dir "${DST}/third_party/install/cryptopp")
copy(${TARGET}
......
......@@ -210,6 +210,10 @@ include(external/threadpool)# download threadpool
include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc
include(external/utf8proc) # download, build, install utf8proc
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc)
include(external/lapack) # download, build, install lapack
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
......
......@@ -51,6 +51,8 @@ proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
data_feed_proto)
cc_library(string_array SRCS string_array.cc DEPS utf8proc)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
if(WITH_GPU)
......
......@@ -102,14 +102,18 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
VLOG(3) << "Initialize Variable " << var->Name();
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
<< " global, which pointer is " << ptr << " type is "
<< static_cast<int>(var->GetType());
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
<< " locally, which pointer is " << ptr << "Variable Type "
<< static_cast<int>(var->GetType());
}
}
} else {
......
......@@ -125,6 +125,7 @@ void DeleteUnusedTensors(const Scope &scope,
for (auto &t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else if (var->IsType<Strings>()) {
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Type %s of variable %s is not supported eager deletion.",
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <boost/variant.hpp>
#include "glog/logging.h"
namespace paddle {
......@@ -35,9 +36,24 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith(input);
auto& val = BOOST_GET(LoDTensor, feed_inputs[index]);
val.ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
val.set_lod(input.lod());
}
// Feed a batch of strings into slot `index` of the FeedList variable
// `var_name` in `scope`.
void SetFeedVariable(Scope* scope, const Strings& input,
                     const std::string& var_name, size_t index) {
  // If var_name Variable is not found in GlobalScope, a new variable will
  // be created.
  VLOG(3) << "SetFeedStringVariable name=" << var_name << " index=" << index;
  Variable* g_feed_value = scope->Var(var_name);
  auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
  if (index >= feed_inputs.size()) {
    feed_inputs.resize(index + 1);
  }
  // Unlike the LoDTensor overload, the strings are copied by value here —
  // there is no buffer sharing with the input.
  feed_inputs[index] = input;
}
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"
namespace paddle {
namespace framework {
......@@ -28,6 +29,9 @@ class Scope;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);
void SetFeedVariable(Scope* scope, const Strings& input,
const std::string& var_name, size_t index);
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index);
......
......@@ -13,14 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
using FeedType = LoDTensor;
using FeedType = boost::variant<LoDTensor, Strings>;
using FeedList = std::vector<FeedType>;
using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
......@@ -43,6 +46,13 @@ inline bool data_is_lod_tensor_array(const FetchType &data) {
return false;
}
// Return true when the feed slot holds Strings (text data) rather than a
// LoDTensor. FeedType is boost::variant<LoDTensor, Strings>, so the check
// inspects the variant's runtime type.
inline bool data_is_string_tensor(const FeedType &data) {
  // Single-expression form of the original if/return-true/return-false.
  return data.type() == typeid(Strings);
}
static const char kFeedOpType[] = "feed";
static const char kFetchOpType[] = "fetch";
......
......@@ -147,6 +147,11 @@ message VarType {
// in operators like nccl_op
RAW = 17;
TUPLE = 18;
STRING = 25;
STRINGS = 26;
VOCAB = 27;
FEED_LIST = 28;
}
required Type type = 1;
......@@ -175,6 +180,10 @@ message VarType {
message Tuple { repeated Type element_type = 1; }
optional Tuple tuple = 7;
optional TensorDesc string = 8;
optional TensorDesc strings = 9;
optional TensorDesc vocab = 10;
}
message VarDesc {
......
......@@ -76,6 +76,8 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name,
} else {
return var->Get<SelectedRows>().GetCompleteDims();
}
} else if (var->IsType<Strings>()) {
return DDim({static_cast<int64_t>(var->Get<Strings>().size())});
} else {
return DDim({-1});
}
......@@ -106,6 +108,8 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
} else {
return DataTypeToString(tensor.type());
}
} else if (var->IsType<Strings>()) {
return "strings";
} else {
return "";
}
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <utf8proc.h>
#include <exception>
#include "glog/logging.h"
#include "paddle/fluid/framework/string_array.h"
namespace paddle {
namespace framework {
// Shared converter between UTF-8 byte strings and wide strings.
// NOTE(review): a std::wstring_convert object is not thread-safe and the
// facility is deprecated since C++17 — confirm single-threaded use.
std::wstring_convert<std::codecvt_utf8<wchar_t>> kConverter;

// Convert a UTF-8 std::string to a std::wstring.
// Returns false (after logging) when `src` is not valid UTF-8.
bool ConvertStrToWstr(const std::string& src, std::wstring* res) {
  try {
    *res = kConverter.from_bytes(src);
  } catch (const std::range_error&) {
    // from_bytes throws std::range_error on invalid byte sequences.
    VLOG(3) << "Failed to convert the string " << src << " to unicode!";
    return false;
  }
  return true;
}

// Convert a std::wstring back to a UTF-8 std::string.
void ConvertWstrToStr(const std::wstring& src, std::string* res) {
  *res = kConverter.to_bytes(src);
}
// Normalization Form Canonical Decomposition.
void NFD(const std::string& s, std::string* ret) {
*ret = "";
char* result = reinterpret_cast<char*>(
utf8proc_NFD(reinterpret_cast<const unsigned char*>(s.c_str())));
if (result) {
*ret = std::move(std::string(result));
free(result);
}
}
// Serialize a std::unordered_map<std::string, int32_t> to `os`.
//
// Binary layout (all integers in native byte order):
//   [size_t map_size]
//   repeated: [size_t token_length][token bytes][int32_t token_id]
void StringMapToStream(std::ostream& os,
                       const std::unordered_map<std::string, int32_t>& data) {
  // Firstly write the number of entries.
  size_t t = data.size();
  os.write(reinterpret_cast<const char*>(&t), sizeof(t));
  // Then write each (token, token_id) pair. Iteration order of an
  // unordered_map is unspecified, so the entry order is not stable.
  for (const auto& item : data) {
    const std::string& token = item.first;  // reference — avoids a copy
    const int32_t token_id = item.second;
    // Write the token, length-prefixed.
    size_t length = token.size();
    os.write(reinterpret_cast<const char*>(&length), sizeof(length));
    os.write(token.c_str(), length);
    // Write the token_id.
    os.write(reinterpret_cast<const char*>(&token_id), sizeof(token_id));
  }
}
// Deserialize a std::unordered_map<std::string, int32_t> from `is`,
// reading the binary layout produced by StringMapToStream.
void StringMapFromStream(std::istream& is,
                         std::unordered_map<std::string, int32_t>* data) {
  // First read the map size.
  size_t map_size;
  is.read(reinterpret_cast<char*>(&map_size), sizeof(map_size));
  data->reserve(map_size);
  // Then read each (token, token_id) pair.
  for (size_t i = 0; i < map_size; ++i) {
    // Read the length-prefixed token directly into a std::string; this
    // replaces the previous raw new[]/delete[] buffer and is exception-safe.
    size_t token_length;
    is.read(reinterpret_cast<char*>(&token_length), sizeof(token_length));
    std::string token(token_length, '\0');
    is.read(&token[0], token_length);
    // Read the token_id.
    int32_t token_id;
    is.read(reinterpret_cast<char*>(&token_id), sizeof(token_id));
    data->emplace(std::move(token), token_id);
  }
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <codecvt>
#include <iostream>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace framework {
// A single text string.
using String = std::string;
// A batch of text strings (e.g. one entry per input example).
using Strings = std::vector<std::string>;
// Vocabulary mapping a wide-character token to its integer id.
using Vocab = std::unordered_map<std::wstring, std::int32_t>;
// Convert the std::string type to the std::wstring type.
bool ConvertStrToWstr(const std::string& src, std::wstring* res);
// Convert the std::wstring type to the std::string type.
void ConvertWstrToStr(const std::wstring& src, std::string* res);
// Normalization Form Canonical Decomposition.
void NFD(const std::string& s, std::string* ret);
// Write the data which is type of
// std::unordered_map<std::string, int32_t> to ostream.
void StringMapToStream(std::ostream& os,
                       const std::unordered_map<std::string, int32_t>& data);
// Read the data which is type of
// std::unordered_map<std::string, int32_t> from istream.
void StringMapFromStream(std::istream& is,
                         std::unordered_map<std::string, int32_t>* data);
}  // namespace framework
}  // namespace paddle
......@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <memory>
......@@ -22,6 +20,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -13,11 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <codecvt>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/dlpack_tensor.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -48,6 +54,14 @@ class PrintOptions {
PrintOptions() {}
};
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx);
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx);
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx,
const size_t& seek, const std::vector<int64_t>& shape);
// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
// and dst_place are two different GPU, to ensure that the operation can
// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
......
......@@ -209,6 +209,10 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
return desc_.type().lod_tensor().tensor();
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().tensor();
case proto::VarType::STRINGS:
return desc_.type().strings();
case proto::VarType::VOCAB:
return desc_.type().vocab();
default:
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_desc' is not supported by the %s type variable.",
......@@ -249,6 +253,10 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor();
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
case proto::VarType::STRINGS:
return desc_.mutable_type()->mutable_strings();
case proto::VarType::VOCAB:
return desc_.mutable_type()->mutable_vocab();
default:
PADDLE_THROW(
platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
......
......@@ -18,10 +18,12 @@
#include <string>
#include <tuple>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
......@@ -162,8 +164,8 @@ struct VarTypeRegistryImpl {
// Paddle would generate unique Ids for each registered variable types.
using VarTypeRegistry = detail::VarTypeRegistryImpl<
Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, FetchList,
Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList,
operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -177,8 +179,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId, platform::BKCLCommunicator,
#endif
int, float>;
int, float, Vocab>;
template <typename T>
struct VarTypeTrait {
static_assert(VarTypeRegistry::IsRegistered<T>(), "Must be registered type");
......@@ -208,9 +209,13 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST);
REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB);
REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING);
REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS);
/** End of variable type registration */
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -41,6 +42,10 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::STRINGS) {
var->GetMutable<Strings>();
} else if (var_type == proto::VarType::VOCAB) {
var->GetMutable<Vocab>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
......
......@@ -20,6 +20,7 @@
#include <utility>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/hooks.h"
#include "paddle/fluid/imperative/op_base.h"
......@@ -153,6 +154,15 @@ class VariableWrapper {
tensor = &(var_.Get<framework::LoDTensor>());
} else if (type_ == framework::proto::VarType::SELECTED_ROWS) {
tensor = &(var_.Get<framework::SelectedRows>().value());
} else if (type_ == framework::proto::VarType::VOCAB) {
const framework::Vocab* data = nullptr;
data = &(var_.Get<framework::Vocab>());
if (data && data->size() != 0) {
VLOG(6) << "The tensor of variable " << name_
<< " is not initialized";
return data_type_;
}
return framework::proto::VarType::VOCAB;
} else {
VLOG(6) << "Variable " << name_ << " is not initialized";
return data_type_;
......
......@@ -26,7 +26,7 @@ if(WITH_MKLDNN)
set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE)
endif()
cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc)
cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
......
......@@ -34,12 +34,14 @@ include_directories("${PADDLE_LIB}/")
set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
......@@ -151,12 +153,13 @@ if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf xxhash cryptopp
glog gflags protobuf xxhash cryptopp utf8proc
${EXTERNAL_LIB})
else()
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB})
glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static
${EXTERNAL_LIB})
set(DEPS ${DEPS} shlwapi.lib)
endif(NOT WIN32)
......
......@@ -43,15 +43,33 @@ void Tensor::Reshape(const std::vector<int> &shape) {
tensor->Resize(paddle::framework::make_ddim(shape));
}
#define EAGER_GET_TENSOR \
if (!tensor_) { \
tensor_ = FindTensor(); \
} \
auto *tensor = static_cast<paddle::framework::LoDTensor *>(tensor_);
// Resize the Strings (text) input tensor named `name_` so it can hold
// `shape` strings. Only valid for input tensors; SetName() must have been
// called first so the variable can be found in the bound scope.
void Tensor::ReshapeStrings(const size_t &shape) {
  PADDLE_ENFORCE_EQ(
      name_.empty(), false,
      paddle::platform::errors::PreconditionNotMet(
          "Need to SetName first, so that the corresponding tensor can "
          "be retrieved."));
  // Reshaping is a write operation, so it is forbidden on outputs.
  PADDLE_ENFORCE_EQ(input_or_output_, true,
                    paddle::platform::errors::PermissionDenied(
                        "Can't reshape the output tensor, it is readonly"));
  auto *scope = static_cast<paddle::framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE_NOT_NULL(
      var, paddle::platform::errors::PreconditionNotMet(
               "No tensor called [%s] in the runtime scope", name_));
  // Strings tensors are plain vectors of std::string, so "reshape" is a
  // 1-D resize of the underlying vector.
  paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>();
  tensor->resize(shape);
}
// Lazily look up the underlying variable payload of `tensor_type` in the
// scope (cached in the type-erased member tensor_) and expose it to the
// enclosing function as a typed local named `tensor`.
#define EAGER_GET_TENSOR(tensor_type)    \
  if (!tensor_) {                        \
    tensor_ = FindTensor<tensor_type>(); \
  }                                      \
  auto *tensor = static_cast<tensor_type *>(tensor_);
template <typename T>
T *Tensor::mutable_data(PlaceType place) {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GT(
tensor->numel(), 0,
paddle::platform::errors::PreconditionNotMet(
......@@ -83,7 +101,7 @@ T *Tensor::mutable_data(PlaceType place) {
template <typename T>
T *Tensor::data(PlaceType *place, int *size) const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto *res = tensor->data<T>();
if (paddle::platform::is_cpu_place(tensor->place())) {
......@@ -103,7 +121,7 @@ T *Tensor::data(PlaceType *place, int *size) const {
}
DataType Tensor::type() const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto type = tensor->type();
if (type == paddle::framework::proto::VarType::FP32) {
return DataType::FLOAT32;
......@@ -125,7 +143,7 @@ PlaceType Tensor::place() const { return place_; }
template <typename T>
void Tensor::CopyFromCpu(const T *data) {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GE(tensor->numel(), 0,
paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const "
......@@ -186,10 +204,20 @@ void Tensor::CopyFromCpu(const T *data) {
}
}
// Copy a batch of host strings into this Strings tensor.
// ReshapeStrings() must have been called first so the destination variable
// exists in the scope.
void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
  EAGER_GET_TENSOR(paddle_infer::Strings);
  // NOTE(review): size() is unsigned, so this >= 0 check can never fail;
  // it only documents the Reshape-before-copy contract.
  PADDLE_ENFORCE_GE(tensor->size(), 0,
                    paddle::platform::errors::PreconditionNotMet(
                        "You should call Tensor::ReshapeStrings(const "
                        "std::size_t &shape) function before copying "
                        "the string data from cpu."));
  *tensor = *data;
}
template <typename T>
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
void *cb_params) const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
auto t_place = tensor->place();
......@@ -371,6 +399,7 @@ Tensor::Tensor(void *scope) : scope_{scope} {
"set to the pointer of scope."));
}
template <typename T>
void *Tensor::FindTensor() const {
PADDLE_ENFORCE_EQ(
name_.empty(), false,
......@@ -382,12 +411,12 @@ void *Tensor::FindTensor() const {
PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
auto *tensor = var->GetMutable<T>();
return tensor;
}
std::vector<int> Tensor::shape() const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_NOT_NULL(
tensor_, paddle::platform::errors::PreconditionNotMet(
"Not found tensor called %s in the scope", name_));
......@@ -395,7 +424,7 @@ std::vector<int> Tensor::shape() const {
}
void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
paddle::framework::LoD lod;
for (auto &level : x) {
lod.emplace_back(level);
......@@ -404,7 +433,7 @@ void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
}
std::vector<std::vector<size_t>> Tensor::lod() const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
std::vector<std::vector<size_t>> res;
for (auto &level : tensor->lod()) {
res.emplace_back(level);
......
......@@ -36,7 +36,10 @@ template PD_INFER_DECL int64_t *Tensor::data<int64_t>(PlaceType *place,
template float *Tensor::mutable_data(PlaceType place);
template int64_t *Tensor::mutable_data(PlaceType place);
void *Tensor::FindTensor() const { return nullptr; }
// NOTE(review): this looks like the no-op stub used in builds without the
// full inference runtime — tensor lookup always reports "not found".
// Confirm against the file this TU belongs to.
template <typename T>
void *Tensor::FindTensor() const {
  return nullptr;
}
std::vector<int> Tensor::shape() const { return {}; }
......
......@@ -88,7 +88,8 @@ bool SetPlaceAndCheck(PlaceType place, size_t length) {
const std::vector<std::vector<size_t>> lod{{0, length}};
scope.Var(name);
auto tensor = CreateTensor(place, &scope, name);
tensor->Reshape({static_cast<int>(length)});
std::vector<int> shape{static_cast<int>(length)};
tensor->Reshape(shape);
tensor->mutable_data<T>(place);
tensor->SetLoD(lod);
......
......@@ -174,6 +174,14 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor {
void copy_from_cpu(const T* data) {
return CopyFromCpu(data);
}
/// \brief Experimental interface.
/// It's usually used to set the input tensor data with Strings data type.
/// \param data The pointer of the data, from which the tensor will copy.
void copy_strings_from_cpu(const paddle_infer::Strings* data) {
return CopyStringsFromCpu(data);
}
/// \brief Copy the tensor data to the host memory.
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
......
......@@ -14,10 +14,16 @@
#pragma once
#include <string>
#include "paddle_infer_declare.h" // NOLINT
namespace paddle_infer {
/// \brief Experimental.
/// Strings for text data.
using Strings = std::vector<std::string>;
typedef void (*CallbackFunc)(void*);
#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
......@@ -57,6 +63,14 @@ class PD_INFER_DECL Tensor {
/// \param shape The shape to set.
void Reshape(const std::vector<int>& shape);
/// \brief Experimental interface.
/// Reset the shape of the Strings tensor.
/// Generally it's only used for the input tensor.
/// Reshape must be called before calling
/// ZeroCopyStringTensorCreate() or PaddleInferTensorCreate()
/// \param shape The shape to set.
void ReshapeStrings(const std::size_t& shape);
/// \brief Get the memory pointer in CPU or GPU with specific data type.
/// Please Reshape the tensor first before call this.
/// It's usually used to get input data pointer.
......@@ -78,6 +92,11 @@ class PD_INFER_DECL Tensor {
template <typename T>
void CopyFromCpu(const T* data);
/// \brief Experimental interface.
/// It's usually used to set the input tensor data with Strings data type.
/// \param data The pointer of the data, from which the tensor will copy.
void CopyStringsFromCpu(const paddle_infer::Strings* data);
/// \brief Copy the tensor data to the host memory.
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
......@@ -122,7 +141,10 @@ class PD_INFER_DECL Tensor {
protected:
explicit Tensor(void* scope);
template <typename T>
void* FindTensor() const;
void SetPlace(PlaceType place, int device = -1);
void SetName(const std::string& name);
......
......@@ -17,11 +17,13 @@ limitations under the License. */
#include <algorithm>
#include <fstream>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/pybind.h"
DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
......@@ -85,10 +87,12 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->GetShape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
auto var_type = var->GetType();
new_var->SetType(var_type);
if (var->GetType() !=
framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) {
if ((var_type !=
framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) &&
(var_type != framework::proto::VarType::VOCAB)) {
new_var->SetLoDLevel(var->GetLoDLevel());
}
......
......@@ -17,6 +17,7 @@ add_subdirectory(metrics)
add_subdirectory(optimizers)
add_subdirectory(reduce_ops)
add_subdirectory(sequence_ops)
add_subdirectory(string)
add_subdirectory(jit)
if(WITH_MKLDNN)
add_subdirectory(mkldnn)
......@@ -78,10 +79,12 @@ if(WITH_UNITY_BUILD)
include(unity_build_rule.cmake)
endif()
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op
sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
op_library(save_combine_op DEPS string_array)
op_library(load_combine_op DEPS string_array)
if (WITH_GPU OR WITH_ROCM)
if(WITH_ROCM)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -29,6 +26,39 @@ class OpBase;
namespace paddle {
namespace operators {
// Visitor applied to a FeedType (boost::variant<LoDTensor, Strings>) that
// moves the fed value into the output variable `out_var` on `place`.
class FeedVariableVisitor : public boost::static_visitor<void> {
 public:
  explicit FeedVariableVisitor(framework::Variable *out_var,
                               const platform::Place &place)
      : out_var_(out_var), place_(place) {}

  // LoDTensor input: share the buffer when it already lives on `place_`,
  // otherwise copy it across devices; the LoD is propagated either way.
  void operator()(const framework::LoDTensor &in_tensor) const {
    framework::LoDTensor *out_tensor =
        out_var_->GetMutable<framework::LoDTensor>();
    if (platform::is_same_place(in_tensor.place(), place_)) {
      out_tensor->ShareDataWith(in_tensor);
    } else {
      platform::DeviceContext *context =
          platform::DeviceContextPool::Instance().Get(place_);
      framework::TensorCopy(in_tensor, place_, *context, out_tensor);
    }
    out_tensor->set_lod(in_tensor.lod());
  }

  // Strings input: a plain value copy (strings always live on the host).
  // Vector assignment resizes the destination itself, so the explicit
  // resize() the original did beforehand was redundant and is dropped.
  void operator()(const framework::Strings &in_str) const {
    framework::Strings *out_str = out_var_->GetMutable<framework::Strings>();
    *out_str = in_str;
  }

 private:
  framework::Variable *out_var_;  // destination variable (not owned)
  const platform::Place &place_;  // device the output should live on
};
class FeedOp : public framework::OperatorBase {
public:
FeedOp(const std::string &type, const framework::VariableNameMap &inputs,
......@@ -79,15 +109,9 @@ class FeedOp : public framework::OperatorBase {
col, feed_list.size()));
auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedType>();
if (platform::is_same_place(feed_item.place(), place)) {
out_item->ShareDataWith(feed_item);
} else {
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
framework::TensorCopy(feed_item, place, *dev_ctx, out_item);
}
out_item->set_lod(feed_item.lod());
FeedVariableVisitor visitor(out_var, place);
boost::apply_visitor(visitor, feed_item);
}
};
......@@ -95,17 +119,17 @@ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(vector<LoDTensor>) A feeding list of LoDTensor, which may have "
"(vector<LoDTensor>) "
"A feeding list of LoDTensor, which may have "
"different dimension and data type.");
AddOutput("Out",
"(LoDTensor) The LoDTensor which is a copy of the col-th feeding "
"(LoDTensor) The LoDTensor which is a copy "
"of the col-th feeding "
"object.");
AddAttr<int>("col", "(int) The column index of current feeding object.");
AddComment(R"DOC(
Feed Operator.
It should not be configured by users directly.
)DOC");
}
};
......
......@@ -109,6 +109,10 @@ class FetchOp : public framework::OperatorBase {
auto &src_item = fetch_var->Get<framework::LoDTensor>();
auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
DataCopy(src_item, fetch_var_name, dst_item);
} else if (fetch_var->IsType<framework::Vocab>()) {
auto &src_item = fetch_var->Get<framework::Vocab>();
auto *dst_item = &(BOOST_GET(framework::Vocab, fetch_list->at(col)));
*dst_item = src_item;
} else {
auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
framework::LoDTensorArray tmp(src_item.size());
......@@ -128,9 +132,11 @@ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X",
"(LoDTensor) The resulted LoDTensor which is expected to return "
"to users.");
AddOutput("Out",
"(vector<LoDTensor>) A fetching list of LoDTensor which may have "
"different dimension, shape and data type.");
AddOutput(
"Out",
"(vector<LoDTensor>|unordered_map<string, int32_t>) A fetching list"
" of LoDTensor|unordered_map<string, int32_t> which may have "
"different dimension, shape and data type.");
AddAttr<int>("col", "(int) The column index of fetching object.");
AddComment(R"DOC(
Fetch Operator.
......
......@@ -21,6 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......@@ -75,38 +77,57 @@ class LoadCombineOpKernel : public framework::OpKernel<T> {
out_vars[i], platform::errors::InvalidArgument(
"The variable %s to be loaded cannot be found.",
out_var_names[i]));
auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE_EQ(
static_cast<bool>(*buffer), true,
platform::errors::Unavailable(
"An error occurred while loading model parameters. "
"Please check whether the model file is complete or damaged."));
// Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = tensor->type();
auto out_dtype =
load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
out_vars[i]->Clear();
tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
if (out_vars[i]->IsType<framework::Vocab>()) {
auto *tensor = out_vars[i]->GetMutable<framework::Vocab>();
tensor->clear();
std::unordered_map<std::string, std::int32_t> data;
framework::StringMapFromStream(*buffer, &data);
for (auto it = data.begin(); it != data.end(); ++it) {
std::string tmp;
framework::NFD(it->first, &tmp);
if (tmp.empty()) {
VLOG(0) << "The string " << it->first
<< " was converted to unicode failedly! "
<< "Then dropped to load it.";
continue;
}
std::wstring token;
bool status = framework::ConvertStrToWstr(tmp, &token);
if (!status) continue;
tensor->emplace(token, it->second);
}
} else {
auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
// Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = tensor->type();
auto out_dtype =
load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
out_vars[i]->Clear();
tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
}
}
}
buffer->peek();
......
......@@ -19,11 +19,13 @@ limitations under the License. */
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
......@@ -66,34 +68,48 @@ class SaveCombineOpKernel : public framework::OpKernel<T> {
inp_vars[i],
platform::errors::InvalidArgument("Cannot find variable %s to save.",
inp_var_names[i]));
PADDLE_ENFORCE_EQ(inp_vars[i]->IsType<framework::LoDTensor>(), true,
PADDLE_ENFORCE_EQ(inp_vars[i]->IsType<framework::LoDTensor>() ||
inp_vars[i]->IsType<framework::Vocab>(),
true,
platform::errors::InvalidArgument(
"SaveCombine operator only supports saving "
"LoDTensor variable, %s has wrong type.",
"LoDTensor or Vocab variable, %s has wrong type.",
inp_var_names[i]));
auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(
tensor.IsInitialized(), true,
platform::errors::InvalidArgument(
"The Tensor of Variable(%s) to be saved is not initialized.",
inp_var_names[i]));
// Serialize tensors one by one
// Check types to see if a fp16 transformation is required
auto in_dtype = tensor.type();
auto out_dtype =
save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (inp_vars[i]->IsType<framework::LoDTensor>()) {
auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(
tensor.IsInitialized(), true,
platform::errors::InvalidArgument(
"The Tensor of Variable(%s) to be saved is not initialized.",
inp_var_names[i]));
// Serialize tensors one by one
// Check types to see if a fp16 transformation is required
auto in_dtype = tensor.type();
auto out_dtype =
save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
framework::SerializeToStream(ss, out, dev_ctx);
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::TransDataType(in_kernel_type, out_kernel_type, tensor,
&out);
framework::SerializeToStream(ss, out, dev_ctx);
} else {
framework::SerializeToStream(ss, tensor, dev_ctx);
}
} else {
framework::SerializeToStream(ss, tensor, dev_ctx);
auto &tensor = inp_vars[i]->Get<framework::Vocab>();
std::unordered_map<std::string, std::int32_t> data;
for (auto it = tensor.begin(); it != tensor.end(); ++it) {
std::string t;
framework::ConvertWstrToStr(it->first, &t);
data.emplace(t, it->second);
}
framework::StringMapToStream(ss, data);
}
}
if (save_to_memory) {
......
include(operators)
if(WITH_UNITY_BUILD)
# Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops.
include(unity_build_rule.cmake)
endif()
register_operators(DEPS op_version_registry utf8proc string_array)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <utf8proc.h>
#include <algorithm>
#include <chrono>
#include <codecvt>
#include <fstream>
#include <iostream>
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/algorithm/string.hpp>
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/operators/string/faster_tokenizer_op.h"
namespace paddle {
namespace operators {
using std::bad_cast;
using std::codecvt_utf8;
using std::endl;
using std::exception;
using std::ifstream;
using std::int64_t;
using std::min;
using std::runtime_error;
using std::unordered_map;
using std::unordered_set;
using std::shared_ptr;
using std::size_t;
using std::string;
using std::vector;
using std::wstring;

// Delimiters on which BasicTokenizer::Tokenize splits the normalized text
// (space plus the usual ASCII whitespace control characters).
const wstring kStripChars = L" \t\n\r\v\f";
// Returns true when `ch` is a control character that should be dropped from
// the input text. Tab, newline and carriage-return are explicitly excluded
// because they are handled as whitespace by IsWhiteSpace instead.
inline bool IsControl(const wchar_t& ch) {
  if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false;
  // Cc = control, Cf = format (e.g. zero-width / bidi marks).
  auto cat = utf8proc_category(ch);
  if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true;
  return false;
}
// Returns true when `ch` falls in one of the CJK ideograph code-point
// blocks (CJK Unified Ideographs, their extensions A-F, and the CJK
// Compatibility Ideographs blocks).
inline bool IsChineseChar(const wchar_t& ch) {
  static const struct {
    unsigned int lo;
    unsigned int hi;
  } kCjkRanges[] = {{0x4E00, 0x9FFF},   {0x3400, 0x4DBF},
                    {0x20000, 0x2A6DF}, {0x2A700, 0x2B73F},
                    {0x2B740, 0x2B81F}, {0x2B820, 0x2CEAF},
                    {0xF900, 0xFAFF},   {0x2F800, 0x2FA1F}};
  const unsigned int code_point = static_cast<unsigned int>(ch);
  for (const auto& range : kCjkRanges) {
    if (code_point >= range.lo && code_point <= range.hi) {
      return true;
    }
  }
  return false;
}
// Returns true when `ch` should be treated as whitespace: the four ASCII
// whitespace characters, plus anything in the Unicode "Zs" (space
// separator) category.
inline bool IsWhiteSpace(const wchar_t& ch) {
  if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true;
  auto cat = utf8proc_category(ch);
  if (cat == UTF8PROC_CATEGORY_ZS) return true;
  return false;
}
// Returns true when `ch` is punctuation. All non-alphanumeric ASCII is
// treated as punctuation (matching the BERT reference tokenizer), plus
// everything in the Unicode "P*" categories.
inline bool IsPunctuation(const wchar_t& ch) {
  // ASCII ranges: !-/  :-@  [-`  {-~
  if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) ||
      (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126))
    return true;
  auto cat = utf8proc_category(ch);
  if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS ||
      cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC ||
      cat == UTF8PROC_CATEGORY_PO  // sometimes ¶ belong SO
      || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF)
    return true;
  return false;
}
// Constructs a BasicTokenizer; when `do_lower_case` is true every code
// point is lower-cased during Tokenize.
BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */)
    : do_lower_case_(do_lower_case) {}

// Lower-cases a single code point via utf8proc (code points with no
// lowercase mapping are returned unchanged by utf8proc_tolower).
wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const {
  wchar_t new_ch = utf8proc_tolower(ch);
  return new_ch;
}
// Normalizes `text` and splits it into tokens appended to `res`:
//   1. decode UTF-8 to a wide string (silently produces no tokens on a
//      failed conversion),
//   2. drop NUL / U+FFFD / control characters, optionally lower-case,
//   3. surround every CJK char and punctuation char with spaces so each
//      becomes its own token, collapse other whitespace to a space,
//   4. split on kStripChars.
// NOTE(review): boost::split emits empty strings for consecutive
// delimiters; callers (see BertTokenizer::Tokenize) must skip
// zero-length tokens.
void BasicTokenizer::Tokenize(const string& text, vector<wstring>* res) const {
  std::wstring unicode_text;
  bool status = framework::ConvertStrToWstr(text, &unicode_text);
  if (!status) {
    // UTF-8 -> wstring conversion failed; emit no tokens.
    return;
  }
  std::wstring dest_text;
  for (auto ch : unicode_text) {
    if (ch == 0 || ch == 0xfffd || IsControl(ch)) {
      continue;
    }
    if (do_lower_case_) {
      ch = do_lower_case(ch);
    }
    if (IsChineseChar(ch) || IsPunctuation(ch)) {
      // Isolate CJK/punctuation so each is tokenized on its own.
      dest_text += ' ';
      dest_text += ch;
      dest_text += ' ';
    } else if (IsWhiteSpace(ch)) {
      dest_text += ' ';
    } else {
      dest_text += ch;
    }
  }
  boost::split(*res, dest_text, boost::is_any_of(kStripChars));
}
// Constructs a wordpiece tokenizer over `vocab`. Words longer than
// `max_input_chars_per_word` code points map to a single unk token.
// NOTE(review): `(*vocab_)[unk_token_]` uses operator[], which INSERTS
// unk_token with id 0 when it is missing from the vocab — confirm the
// vocab is always expected to contain "[UNK]".
WordPieceTokenizer::WordPieceTokenizer(
    framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/,
    const size_t max_input_chars_per_word /* = 100 */)
    : vocab_(vocab),
      unk_token_(unk_token),
      max_input_chars_per_word_(max_input_chars_per_word) {
  unk_token_id_ = (*vocab_)[unk_token_];
}
// Splits one word into wordpiece ids appended to `token_ids` using greedy
// longest-match-first lookup: the longest prefix present in the vocabulary
// is taken, then the remainder is matched with a "##" prefix, and so on.
// If any remaining piece cannot be matched, the whole word degrades to a
// single unk token (matching the reference BERT implementation).
void WordPieceTokenizer::Tokenize(const wstring& text,
                                  vector<int64_t>* token_ids) const {
  size_t len = text.size();
  // Overlong words become a single [UNK].
  if (len > max_input_chars_per_word_) {
    token_ids->emplace_back(unk_token_id_);
    return;
  }
  // Fast path: the whole word is in the vocabulary.
  auto it = vocab_->find(text);
  if (it != vocab_->end()) {
    token_ids->emplace_back(it->second);
    return;
  }

  size_t start = 0;
  vector<int64_t> wordpiece_ids;
  while (start < len) {
    size_t end = len;
    std::wstring cur_substr;
    int64_t cur_substr_id = unk_token_id_;  // defensive init; only read when
                                            // cur_substr is non-empty
    while (start < end) {
      std::wstring sub = text.substr(start, end - start);
      if (start > 0) {
        // Continuation pieces carry the "##" marker in the vocab.
        sub = L"##" + sub;
      }
      auto sub_it = vocab_->find(sub);
      if (sub_it != vocab_->end()) {
        cur_substr = sub;
        cur_substr_id = sub_it->second;
        break;
      }
      end -= 1;
    }

    if (cur_substr.empty()) {
      // No piece matched: discard any pieces found so far and emit [UNK]
      // for the whole word, as BERT does.
      token_ids->emplace_back(unk_token_id_);
      return;
    }
    start = end;
    wordpiece_ids.emplace_back(cur_substr_id);
  }
  token_ids->insert(token_ids->end(), wordpiece_ids.begin(),
                    wordpiece_ids.end());
}
// Builds a BERT tokenizer: a BasicTokenizer for text normalization plus a
// WordPieceTokenizer for subword lookup, and resolves the ids of all
// special tokens up front.
// NOTE(review): the `(*vocab_)[...]` lookups use operator[], which INSERTS
// a missing special token with id 0 — confirm all five special tokens are
// guaranteed to exist in the vocab.
BertTokenizer::BertTokenizer(framework::Vocab* vocab,
                             bool do_lower_case /* = false */,
                             const wstring& unk_token /* = L"[UNK]" */,
                             const wstring& pad_token /* = L"[PAD]" */,
                             const wstring& cls_token /* = L"[CLS]" */,
                             const wstring& mask_token /* = L"[MASK]" */,
                             const wstring& sep_token /* = L"[SEP]" */,
                             const string& padding_site /* = "right" */)
    : do_lower_case_(do_lower_case),
      unk_token_(unk_token),
      pad_token_(pad_token),
      cls_token_(cls_token),
      mask_token_(mask_token),
      sep_token_(sep_token),
      padding_site_(padding_site),
      vocab_(vocab),
      basic_tokenizer_(do_lower_case_),
      word_piece_tokenizer_(vocab_, unk_token) {
  unk_token_id_ = (*vocab_)[unk_token_];
  pad_token_id_ = (*vocab_)[pad_token_];
  cls_token_id_ = (*vocab_)[cls_token_];
  mask_token_id_ = (*vocab_)[mask_token_];
  sep_token_id_ = (*vocab_)[sep_token_];
  all_special_tokens_ = vector<wstring>(
      {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_});
  all_special_token_ids_ =
      unordered_set<int64_t>({unk_token_id_, pad_token_id_, cls_token_id_,
                              mask_token_id_, sep_token_id_});
}
// End-to-end tokenization of one text: basic tokenization first, then each
// resulting token is mapped to ids — single CJK characters are looked up
// directly in the vocab, everything else goes through wordpiece.
void BertTokenizer::Tokenize(const string& text,
                             vector<int64_t>* split_token_ids) const {
  std::vector<std::wstring> tmp_tokens;
  basic_tokenizer_.Tokenize(text, &tmp_tokens);
  if (tmp_tokens.empty()) return;
  split_token_ids->reserve(tmp_tokens.size());
  for (auto& w_token : tmp_tokens) {
    // By value: size() returns a temporary, no point binding a reference.
    const auto vec_size = w_token.size();
    if (vec_size == 0) {
      // boost::split can emit empty tokens for consecutive delimiters.
      continue;
    }
    if (vec_size == 1 && IsChineseChar(w_token[0])) {
      // A lone CJK character is its own token; look it up directly.
      auto vocab_it = vocab_->find(w_token);
      if (vocab_it != vocab_->end()) {
        split_token_ids->emplace_back(vocab_it->second);
      } else {
        split_token_ids->emplace_back(unk_token_id_);
      }
    } else {
      word_piece_tokenizer_.Tokenize(w_token, split_token_ids);
    }
  }
}
// Assembles the model input sequence with special tokens:
//   single sequence:  [CLS] token_ids_0 [SEP]
//   sequence pair:    [CLS] token_ids_0 [SEP] token_ids_1 [SEP]
// The previous contents of `inputs` are discarded.
void BertTokenizer::BuildInputsWithSpecialTokens(
    vector<int64_t>* inputs, const vector<int64_t>& token_ids_0,
    const vector<int64_t>& token_ids_1 /* = vector<int64_t>() */) const {
  const bool has_pair = !token_ids_1.empty();
  inputs->clear();
  inputs->reserve(token_ids_0.size() +
                  (has_pair ? token_ids_1.size() + 3 : 2));
  inputs->push_back(cls_token_id_);
  inputs->insert(inputs->end(), token_ids_0.begin(), token_ids_0.end());
  inputs->push_back(sep_token_id_);
  if (has_pair) {
    inputs->insert(inputs->end(), token_ids_1.begin(), token_ids_1.end());
    inputs->push_back(sep_token_id_);
  }
}
// Number of special tokens Encode will add: [CLS]+[SEP] for a single
// sequence, [CLS]+[SEP]+[SEP] for a sequence pair.
int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const {
  return pair ? 3 : 2;
}
// Builds the token-type (segment) id vector matching
// BuildInputsWithSpecialTokens: type 0 covers [CLS] token_ids_0 [SEP];
// when a second sequence exists, type 1 covers token_ids_1 and the
// trailing [SEP]. Previous contents of `token_type_ids` are replaced.
void BertTokenizer::CreateTokenTypeIdsFromSequences(
    vector<int64_t>* token_type_ids, const vector<int64_t>& token_ids_0,
    const vector<int64_t>& token_ids_1 /* = vector<int64_t>() */) const {
  token_type_ids->assign(token_ids_0.size() + 2, 0);
  if (!token_ids_1.empty()) {
    token_type_ids->insert(token_type_ids->end(), token_ids_1.size() + 1, 1);
  }
}
// Longest-first truncation: removes `num_tokens_to_remove` tokens, one at a
// time, always from the longer of the two sequences (preferring `ids` on a
// tie or when there is no pair). `stride` is accepted for interface
// compatibility but currently unused.
void BertTokenizer::TruncateSequence(
    vector<int64_t>* ids, vector<int64_t>* pair_ids,
    const size_t num_tokens_to_remove /* = 0 */,
    const size_t stride /* = 0 */) const {
  for (size_t i = 0; i < num_tokens_to_remove; i++) {
    if (pair_ids->empty() || (ids->size() > pair_ids->size())) {
      if (ids->empty()) {
        // Nothing left to remove; popping an empty vector would be UB.
        break;
      }
      ids->pop_back();
    } else {
      pair_ids->pop_back();
    }
  }
}
// Returns the vocabulary id used to fill padding positions.
int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; }
// Encodes one text (or text pair) into "input_ids" and "token_type_ids"
// entries of `encoded_inputs`. Returns 1 on success, 0 on failure (empty
// tokenization, failed UTF-8 decode, or the sequence still exceeding
// max_seq_len after truncation).
// When `is_split_into_words` is true, each code point of `text` is looked
// up in the vocab directly instead of running the tokenizers.
int BertTokenizer::Encode(
    unordered_map<string, vector<int64_t>>* encoded_inputs, const string& text,
    const string& text_pair /* = "" */, bool is_split_into_words /* = false */,
    const size_t max_seq_len /* = 0 */,
    bool pad_to_max_seq_len /* = false */) const {
  vector<int64_t> ids;
  vector<int64_t> pair_ids;
  if (!is_split_into_words) {
    Tokenize(text, &ids);
    if (ids.empty()) return 0;
    if (text_pair != "") {
      Tokenize(text_pair, &pair_ids);
      if (pair_ids.empty()) return 0;
    }
  } else {
    // Pre-tokenized input: one vocab lookup per code point of `text`
    // (text_pair is ignored in this mode).
    std::wstring unicode_text;
    bool status_a = framework::ConvertStrToWstr(text, &unicode_text);
    if (!status_a) {
      return 0;
    }
    for (size_t i = 0; i < unicode_text.size(); i++) {
      wstring token = unicode_text.substr(i, 1);
      auto it = vocab_->find(token);
      if (it != vocab_->end()) {
        ids.emplace_back(std::move(it->second));
      } else {
        ids.emplace_back(std::move(unk_token_id_));
      }
    }
  }

  bool pair = false;
  if (pair_ids.size() != 0) {
    pair = true;
  }

  size_t len_ids = ids.size();
  size_t len_pair_ids = pair_ids.size();

  // Truncation: Handle max sequence length
  // If max_seq_len == 0, then do nothing and keep the real length.
  // If max_seq_len > 0 and
  // all the input sequence len is over the max_seq_len,
  // then we truncate it.
  size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair);
  if (max_seq_len > 0 && total_len > max_seq_len) {
    TruncateSequence(&ids, &pair_ids, total_len - max_seq_len);
  }

  // Add special tokens ([CLS]/[SEP]) and the matching segment ids.
  vector<int64_t> sequence;
  BuildInputsWithSpecialTokens(&sequence, ids, pair_ids);
  size_t seq_len = sequence.size();
  vector<int64_t> token_type_ids;
  CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids);

  // Build the output dictionary.
  encoded_inputs->emplace("input_ids", sequence);
  encoded_inputs->emplace("token_type_ids", token_type_ids);

  // Check lengths
  if (max_seq_len > 0 && seq_len > max_seq_len) {
    VLOG(3) << "There is something wrong with the input sequence length."
               " Please check it.";
    // Failed.
    return 0;
  }

  // Padding (right-side only).
  // NOTE(review): padding positions of "token_type_ids" are filled with
  // pad_token_id_ rather than a separate pad token-type id (usually 0) —
  // confirm this matches what the model consuming SegmentIds expects.
  bool needs_to_be_padded = false;
  if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) {
    needs_to_be_padded = true;
  }

  if (needs_to_be_padded) {
    int64_t difference = max_seq_len - seq_len;
    // resize() zero-fills the tail; the loops below overwrite exactly the
    // last `difference` positions with the pad id.
    size_t pad_start = max_seq_len - 1 - difference;
    encoded_inputs->at("token_type_ids").resize(max_seq_len);
    for (size_t i = max_seq_len - 1; i > pad_start; i--) {
      encoded_inputs->at("token_type_ids")[i] = pad_token_id_;
    }
    encoded_inputs->at("input_ids").resize(max_seq_len);
    for (size_t i = max_seq_len - 1; i > pad_start; i--) {
      encoded_inputs->at("input_ids")[i] = pad_token_id_;
    }
  }
  return 1;
}
// Encodes a batch of texts (optionally paired). `batch_encode_inputs` must
// already be sized to batch_text.size() — each slot is assigned by index,
// which also keeps the OpenMP loop below race-free.
// When Encode fails for a sample, a minimal placeholder sequence of
// special tokens is emitted instead so the output stays rectangular.
// NOTE(review): the pair-mode placeholder is {cls, sep, cls} — verify the
// trailing cls (rather than sep) is intentional.
void BertTokenizer::BatchEncode(
    vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
    const vector<string>& batch_text,
    const vector<string>& batch_text_pair /* = vector<string>() */,
    bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */,
    bool pad_to_max_seq_len /* = false */) const {
  bool has_text_pair = false;
  if (batch_text_pair.size() != 0) {
    has_text_pair = true;
  }

  size_t batch_size = batch_text.size();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
  for (size_t i = 0; i < batch_size; i++) {
    unordered_map<string, vector<int64_t>> res;
    if (has_text_pair) {
      auto status =
          Encode(&res, batch_text[i], batch_text_pair[i], is_split_into_words,
                 max_seq_len, pad_to_max_seq_len);
      if (!status) {
        res["input_ids"] =
            std::vector<int64_t>{cls_token_id_, sep_token_id_, cls_token_id_};
        res["token_type_ids"] = std::vector<int64_t>{0, 0, 1};
      }
    } else {
      auto status = Encode(&res, batch_text[i], {}, is_split_into_words,
                           max_seq_len, pad_to_max_seq_len);
      if (!status) {
        res["input_ids"] = std::vector<int64_t>{cls_token_id_, sep_token_id_};
        res["token_type_ids"] = std::vector<int64_t>{0, 0};
      }
    }
    batch_encode_inputs->at(i) = std::move(res);
  }
}
// Shape/type inference for the faster_tokenizer op. Both outputs are 2-D
// int64 tensors whose dimensions (batch, max sequence length) are only
// known at run time, hence {-1, -1}.
class FasterTokenizerOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Text"), "Input", "Text", "Tokenizer");
    OP_INOUT_CHECK(ctx->HasInput("Vocab"), "Input", "Vocab", "Tokenizer");
    OP_INOUT_CHECK(ctx->HasOutput("InputIds"), "Output", "InputIds",
                   "Tokenizer");
    OP_INOUT_CHECK(ctx->HasOutput("SegmentIds"), "Output", "SegmentIds",
                   "Tokenizer");
    ctx->SetOutputDim("InputIds", {-1, -1});
    ctx->SetOutputDim("SegmentIds", {-1, -1});
  }

 protected:
  // Tokenization always runs on CPU and produces int64 ids.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::INT64,
                                   paddle::platform::CPUPlace());
  }

  // Inputs are strings/vocab rather than dense tensors, so no data
  // transform is applied; only the layout is forwarded.
  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const framework::Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    return framework::OpKernelType(expected_kernel_type.data_type_,
                                   expected_kernel_type.place_,
                                   tensor.layout());
  }
};
// Proto/attribute declaration for the faster_tokenizer op.
class FasterTokenizerOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Vocab",
             "(std::map<std::wstring, std::int>), The vocab to map "
             "token string to token id.");
    AddInput("Text",
             "(std::vector<std::string>), The sequence to be processed. "
             "One sequence is a string, a list of strings, "
             "or a list of integers depending on whether it "
             "has been pretokenized and converted to ids. ");
    AddInput("TextPair",
             "(std::vector<std::string>), Same as `text` argument, "
             "while it represents for the latter sequence of the "
             "sequence pair.")
        .AsDispensable();
    AddOutput("InputIds", "(Tensor), The token ids of the input text.");
    AddOutput("SegmentIds", "(Tensor), The segments ids of the input text.");
    AddAttr<bool>(
        "do_lower_case",
        "(bool), Whether or not to lowercase the input when tokenizing.")
        .SetDefault(false);
    AddAttr<bool>(
        "is_split_into_words",
        "(bool), Whether or not the input is already pre-tokenized "
        "(e.g., split into words). If set to True, the tokenizer "
        "assumes the input is already split into words (for instance, "
        "by splitting it on whitespace) which it will tokenize. This "
        "is useful for NER or token classification.")
        .SetDefault(false);
    AddAttr<int>("max_seq_len",
                 "(int), If set to a positive number, will limit the "
                 "total sequence returned so that it has a maximum length."
                 " If there are overflowing tokens, those overflowing "
                 "tokens will be added to the returned dictionary when "
                 "`return_overflowing_tokens` is `True`.")
        .SetDefault(0);
    AddAttr<bool>("pad_to_max_seq_len",
                  "(bool), If set to `True`, the returned sequences would be"
                  " padded up to `max_seq_len` specified length according to"
                  " padding side and padding token id.")
        .SetDefault(false);
    // The previous raw string contained stray literal '"' characters that
    // leaked into the generated operator documentation.
    AddComment(R"DOC(
Performs tokenization and uses the tokenized tokens to prepare model
inputs. It supports sequence or sequence pair as input, and batch input
is not allowed.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the CPU-only faster_tokenizer operator; tokenization is a data
// preprocessing step, so no gradient op is registered.
REGISTER_OPERATOR(faster_tokenizer, ops::FasterTokenizerOp,
                  ops::FasterTokenizerOpMaker);
REGISTER_OP_CPU_KERNEL(faster_tokenizer, ops::FasterTokenizerKernel<int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utf8proc.h>

#include <algorithm>
#include <cstring>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
namespace paddle {
namespace operators {
using std::endl;
using std::int64_t;
using std::size_t;
using std::string;
using std::shared_ptr;
using std::vector;
using std::unordered_map;
using std::unordered_set;
using std::wstring;
using std::wcout;

// Character-class helpers; NOTE(review): these are declared `inline` here
// but defined in faster_tokenizer_op.cc, so they are only linkable from
// that translation unit.
inline bool IsControl(const wchar_t& ch);
inline bool IsChineseChar(const wchar_t& ch);
inline bool IsWhiteSpace(const wchar_t& ch);

// Vocab maps a wide-string token to its integer id; InvVocab is the
// reverse mapping.
using Vocab = unordered_map<wstring, int>;
using InvVocab = unordered_map<int, wstring>;
// BasicTokenizer: cleans raw text (drops control chars, optionally
// lower-cases) and splits it on whitespace/punctuation/CJK boundaries
// before wordpiece tokenization.
class BasicTokenizer {
 public:
  explicit BasicTokenizer(bool do_lower_case = true);
  // Appends the normalized unicode tokens of `text` to `res`.
  void Tokenize(const string& text, vector<wstring>* res) const;

 private:
  // Lower-cases a single code point.
  wchar_t do_lower_case(wchar_t ch) const;

  bool do_lower_case_;
};
// WordPieceTokenizer: greedy longest-match-first subword tokenizer over a
// fixed vocabulary; unmatched or overlong words map to `unk_token`.
class WordPieceTokenizer {
 public:
  explicit WordPieceTokenizer(framework::Vocab* vocab,
                              const wstring& unk_token = L"[UNK]",
                              const size_t max_input_chars_per_word = 100);
  // Appends the wordpiece ids of one word to `output`.
  void Tokenize(const wstring& text, vector<int64_t>* output) const;

 private:
  framework::Vocab* vocab_;              // token -> id map (not owned)
  wstring unk_token_{L"[UNK]"};
  int64_t unk_token_id_;
  size_t max_input_chars_per_word_;      // longer words become [UNK]
};
// BertTokenizer: end-to-end BERT tokenization (basic + wordpiece), plus
// helpers to add special tokens, build segment ids, truncate and encode
// single sequences, pairs, and batches.
class BertTokenizer {
 public:
  explicit BertTokenizer(framework::Vocab* vocab, bool do_lower_case = false,
                         const wstring& unk_token = L"[UNK]",
                         const wstring& pad_token = L"[PAD]",
                         const wstring& cls_token = L"[CLS]",
                         const wstring& mask_token = L"[MASK]",
                         const wstring& sep_token = L"[SEP]",
                         const string& padding_site = "right");
  // Tokenizes `text` into vocab ids appended to `split_tokens`.
  void Tokenize(const string& text, vector<int64_t>* split_tokens) const;
  // Builds [CLS] ids_0 [SEP] (ids_1 [SEP]) into `res`.
  void BuildInputsWithSpecialTokens(
      vector<int64_t>* res, const vector<int64_t>& token_ids_0,
      const vector<int64_t>& token_ids_1 = vector<int64_t>()) const;
  // Builds the matching segment-id vector (0 for seq 0, 1 for seq 1).
  void CreateTokenTypeIdsFromSequences(
      vector<int64_t>* token_type_ids, const vector<int64_t>& token_ids_0,
      const vector<int64_t>& token_ids_1 = vector<int64_t>()) const;
  // Longest-first truncation across the two sequences.
  void TruncateSequence(vector<int64_t>* ids, vector<int64_t>* pair_ids,
                        const size_t num_tokens_to_remove = 0,
                        const size_t stride = 0) const;
  // 2 special tokens for a single sequence, 3 for a pair.
  int64_t GetNumSpecialTokensToAdd(const bool pair = false) const;
  // Encodes one text (pair) into "input_ids"/"token_type_ids".
  // Returns 1 on success, 0 on failure.
  int Encode(unordered_map<string, vector<int64_t>>* encoded_inputs,
             const string& text, const string& text_pair = "",
             bool is_split_into_words = false, const size_t max_seq_len = 0,
             bool pad_to_max_seq_len = false) const;
  // Encodes a batch; `batch_encode_inputs` must be pre-sized by the caller.
  void BatchEncode(
      vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
      const vector<string>& batch_text,
      const vector<string>& batch_text_pair = vector<string>(),
      bool is_split_into_words = false, const size_t max_seq_len = 0,
      bool pad_to_max_seq_len = false) const;
  // Id used to fill padding positions.
  int64_t GetPadTokenID() const;

 private:
  bool do_lower_case_;
  wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_;
  string padding_site_;
  framework::Vocab* vocab_;              // token -> id map (not owned)
  BasicTokenizer basic_tokenizer_;
  WordPieceTokenizer word_piece_tokenizer_;
  int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_,
      sep_token_id_;
  vector<wstring> all_special_tokens_;
  unordered_set<int64_t> all_special_token_ids_;
  InvVocab inv_vocab_;
};
// FasterTokenizerKernel: runs BERT end-to-end tokenization over a batch of
// strings on the CPU and materializes right-padded InputIds/SegmentIds
// tensors of shape (batch_size, batch_max_seq_len).
template <typename T>
class FasterTokenizerKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* text = ctx.Input<framework::Strings>("Text");
    auto* vocab = ctx.Input<framework::Vocab>("Vocab");
    auto* input_ids = ctx.Output<framework::Tensor>("InputIds");
    auto* seg_ids = ctx.Output<framework::Tensor>("SegmentIds");

    auto do_lower_case = static_cast<bool>(ctx.Attr<bool>("do_lower_case"));
    auto is_split_into_words =
        static_cast<bool>(ctx.Attr<bool>("is_split_into_words"));
    auto max_seq_len = static_cast<size_t>(ctx.Attr<int>("max_seq_len"));
    auto pad_to_max_seq_len =
        static_cast<bool>(ctx.Attr<bool>("pad_to_max_seq_len"));

    // "TextPair" is dispensable; when present it must align 1:1 with "Text".
    auto* text_pair = ctx.Input<framework::Strings>("TextPair");
    if (text_pair && text->size() != text_pair->size()) {
      // Fixed a missing space that previously rendered as "mustbe".
      VLOG(3) << "The input text(list[str]) and text pair (list[str]) must "
              << "be the same number of text sequence. Please check the input!";
      return;
    }

    // Stack-allocated tokenizer instead of new/delete: exception-safe and
    // leak-free by construction (RAII).
    BertTokenizer tokenizer(const_cast<framework::Vocab*>(vocab),
                            do_lower_case);

    size_t batch_max_seq_len = 0;
    size_t batch_size = text->size();

    vector<unordered_map<string, vector<int64_t>>> batch_encode_inputs(
        batch_size);
    if (text_pair) {
      tokenizer.BatchEncode(&batch_encode_inputs, *text, *text_pair,
                            is_split_into_words, max_seq_len,
                            pad_to_max_seq_len);
    } else {
      tokenizer.BatchEncode(&batch_encode_inputs, *text, vector<string>(),
                            is_split_into_words, max_seq_len,
                            pad_to_max_seq_len);
    }

    // The output is padded to the longest sequence in the batch.
    for (size_t i = 0; i < batch_size; ++i) {
      size_t seq_len = batch_encode_inputs[i]["input_ids"].size();
      if (seq_len > batch_max_seq_len) {
        batch_max_seq_len = seq_len;
      }
    }

    input_ids->Resize(
        framework::make_ddim({static_cast<int64_t>(batch_size),
                              static_cast<int64_t>(batch_max_seq_len)}));
    auto* input_ids_data = input_ids->mutable_data<T>(ctx.GetPlace());
    seg_ids->Resize(
        framework::make_ddim({static_cast<int64_t>(batch_size),
                              static_cast<int64_t>(batch_max_seq_len)}));
    auto* seg_ids_data = seg_ids->mutable_data<T>(ctx.GetPlace());

    auto pad_token_id = tokenizer.GetPadTokenID();
    for (size_t i = 0; i < batch_size; i++) {
      auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"];
      auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"];
      const size_t seq_len = encoder_input_ids.size();
      // Copy this sample's ids, then pad the tail up to batch_max_seq_len.
      std::memcpy(input_ids_data + i * batch_max_seq_len,
                  encoder_input_ids.data(), seq_len * sizeof(T));
      std::memcpy(seg_ids_data + i * batch_max_seq_len, encoder_seg_ids.data(),
                  seq_len * sizeof(T));
      // std::fill_n, NOT memset: memset writes pad_token_id into every BYTE
      // of each T element, which is only correct when pad_token_id == 0.
      std::fill_n(input_ids_data + i * batch_max_seq_len + seq_len,
                  batch_max_seq_len - seq_len, static_cast<T>(pad_token_id));
      std::fill_n(seg_ids_data + i * batch_max_seq_len + seq_len,
                  batch_max_seq_len - seq_len, static_cast<T>(pad_token_id));
    }
  }
};
} // namespace operators
} // namespace paddle
# This file records the Unity Build compilation rules.
# The source files in a `register_unity_group` called are compiled in a unity
# file.
# Generally, the combination rules in this file do not need to be modified.
# If there are some redefined error in compiling with the source file which
# in combination rule, you can remove the source file from the following rules.
register_unity_group(cc
faster_tokenizer_op.cc)
\ No newline at end of file
......@@ -1875,6 +1875,12 @@ void BindImperative(py::module *m_ptr) {
} else if (self.Var().IsType<framework::SelectedRows>()) {
return framework::vectorize<int>(
self.Var().Get<framework::SelectedRows>().value().dims());
} else if (self.Var().IsType<framework::Strings>()) {
return std::vector<int>{static_cast<int>(
self.Var().Get<framework::Strings>().size())};
} else if (self.Var().IsType<framework::Vocab>()) {
return std::vector<int>{
static_cast<int>(self.Var().Get<framework::Vocab>().size())};
} else {
VLOG(2) << "It is meaningless to get shape of "
"variable type "
......
......@@ -185,6 +185,18 @@ void ZeroCopyTensorCreate(
tensor.copy_from_cpu(static_cast<const T *>(data.data()));
}
/// \brief Experimental interface.
/// Create a Strings tensor from host-side data.
/// \param tensor The tensor to fill; after the call its value is the
/// same as \p data.
/// \param data The input text (a batch of strings) to copy from.
void ZeroCopyStringTensorCreate(ZeroCopyTensor &tensor,  // NOLINT
                                const paddle_infer::Strings *data) {
  // A string tensor is one-dimensional: its shape is just the string count.
  size_t shape = data->size();
  tensor.ReshapeStrings(shape);
  tensor.copy_strings_from_cpu(data);
}
template <typename T>
void PaddleInferTensorCreate(
paddle_infer::Tensor &tensor, // NOLINT
......@@ -195,6 +207,19 @@ void PaddleInferTensorCreate(
tensor.CopyFromCpu(static_cast<const T *>(data.data()));
}
/// \brief Experimental interface.
/// Create a Strings tensor from host-side data (paddle_infer::Tensor variant).
/// \param tensor The tensor to fill; after the call its value is the
/// same as \p data.
/// \param data The input text (a batch of strings) to copy from.
void PaddleInferStringTensorCreate(paddle_infer::Tensor &tensor,  // NOLINT
                                   const paddle_infer::Strings *data) {
  VLOG(3) << "Create PaddleInferTensor, dtype = Strings ";
  // A string tensor is one-dimensional: its shape is just the string count.
  size_t shape = data->size();
  tensor.ReshapeStrings(shape);
  tensor.CopyStringsFromCpu(data);
}
size_t PaddleGetDTypeSize(PaddleDType dt) {
size_t size{0};
switch (dt) {
......@@ -726,11 +751,15 @@ void BindPaddleInferPredictor(py::module *m) {
void BindZeroCopyTensor(py::module *m) {
py::class_<ZeroCopyTensor>(*m, "ZeroCopyTensor")
.def("reshape", &ZeroCopyTensor::Reshape)
.def("reshape", py::overload_cast<const std::vector<int> &>(
&ZeroCopyTensor::Reshape))
.def("reshape", py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu", &ZeroCopyTensorCreate<int32_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<int64_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<float>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<paddle_infer::float16>)
.def("copy_from_cpu", &ZeroCopyStringTensorCreate)
.def("copy_to_cpu", &ZeroCopyTensorToNumpy)
.def("shape", &ZeroCopyTensor::shape)
.def("set_lod", &ZeroCopyTensor::SetLoD)
......@@ -740,12 +769,16 @@ void BindZeroCopyTensor(py::module *m) {
void BindPaddleInferTensor(py::module *m) {
py::class_<paddle_infer::Tensor>(*m, "PaddleInferTensor")
.def("reshape", &paddle_infer::Tensor::Reshape)
.def("reshape", py::overload_cast<const std::vector<int> &>(
&paddle_infer::Tensor::Reshape))
.def("reshape", py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int32_t>)
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int64_t>)
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<float>)
.def("copy_from_cpu_bind",
&PaddleInferTensorCreate<paddle_infer::float16>)
.def("copy_from_cpu_bind", &PaddleInferStringTensorCreate)
.def("copy_to_cpu", &PaddleInferTensorToNumpy)
.def("shape", &paddle_infer::Tensor::shape)
.def("set_lod", &paddle_infer::Tensor::SetLoD)
......
......@@ -68,6 +68,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
{"faster_tokenizer", {"Text", "Vocab", "TextPair"}},
{"matrix_rank", {"X", "TolTensor"}},
{"adam",
{"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow",
......
......@@ -227,7 +227,10 @@ void BindVarDsec(pybind11::module *m) {
.value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
.value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
.value("READER", pd::proto::VarType::READER)
.value("RAW", pd::proto::VarType::RAW);
.value("RAW", pd::proto::VarType::RAW)
.value("STRING", pd::proto::VarType::STRING)
.value("STRINGS", pd::proto::VarType::STRINGS)
.value("VOCAB", pd::proto::VarType::VOCAB);
}
void BindOpDesc(pybind11::module *m) {
......
......@@ -1239,6 +1239,18 @@ All parameter, weight, gradient are variables in Paddle.
[](Variable &self) {
return py::bytes(*self.GetMutable<std::string>());
})
.def("set_string_list",
[](Variable &self, Strings str_list) {
*self.GetMutable<Strings>() = str_list;
})
.def("set_vocab", [](Variable &self,
Vocab vocab) { *self.GetMutable<Vocab>() = vocab; })
.def("get_string_tensor",
[](Variable &self) { return self.GetMutable<Strings>(); },
py::return_value_policy::reference)
.def("get_map_tensor",
[](Variable &self) { return self.GetMutable<Vocab>(); },
py::return_value_policy::reference)
.def("get_lod_rank_table",
[](Variable &self) { return self.GetMutable<LoDRankTable>(); },
py::return_value_policy::reference)
......@@ -1872,20 +1884,20 @@ All parameter, weight, gradient are variables in Paddle.
.def("__str__", string::to_string<const platform::Place &>);
py::class_<OperatorBase>(m, "Operator")
.def_static("create",
[](py::bytes protobin) {
proto::OpDesc desc;
PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin),
true,
platform::errors::InvalidArgument(
"Cannot parse user input to OpDesc"));
PADDLE_ENFORCE_EQ(desc.IsInitialized(), true,
platform::errors::InvalidArgument(
"The provided OpDesc is not "
"initialized, the reason is: %s",
desc.InitializationErrorString()));
return OpRegistry::CreateOp(desc);
})
.def_static(
"create",
[](py::bytes protobin) {
proto::OpDesc desc;
PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true,
platform::errors::InvalidArgument(
"Cannot parse user input to OpDesc"));
PADDLE_ENFORCE_EQ(
desc.IsInitialized(), true,
platform::errors::InvalidArgument(
"The provided OpDesc is not initialized, the reason is: %s",
desc.InitializationErrorString()));
return OpRegistry::CreateOp(desc);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CPUPlace &place) {
......@@ -2139,7 +2151,12 @@ All parameter, weight, gradient are variables in Paddle.
});
#endif
m.def("set_feed_variable", framework::SetFeedVariable);
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const LoDTensor &, const std::string &,
size_t)>(&framework::SetFeedVariable));
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const Strings &, const std::string &,
size_t)>(&framework::SetFeedVariable));
m.def("get_fetch_variable",
[](const Scope &scope, const std::string &var_name,
size_t index) -> py::object {
......
......@@ -799,12 +799,17 @@ def save(layer, path, input_spec=None, **configs):
# 3. share parameters from Layer to scope & record var info
for param_or_buffer in concrete_program.parameters:
# share to scope
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
#src_tensor = param_or_buffer.value().get_tensor()
src_tensor = state_var_dict[param_or_buffer.name].value(
).get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
scr_tensor = param_or_buffer.value().get_map_tensor()
tgt_var = scope.var(param_or_buffer.name)
tgt_var.set_vocab(scr_tensor)
else:
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
#src_tensor = param_or_buffer.value().get_tensor()
src_tensor = state_var_dict[param_or_buffer.name].value(
).get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
# record var info
if param_or_buffer.name not in extra_var_info:
extra_info_dict = dict()
......
......@@ -1409,13 +1409,22 @@ class Layer(core.Layer):
if state is None:
raise ValueError("{} is not found in the provided dict.".format(
key))
state_shape = state.shape() if inspect.ismethod(
state.shape) else state.shape
if list(state_shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state_shape), list(param.shape)))
return param, state
if (isinstance(state, dict) or isinstance(state, list)):
if (len(state) != len(param)):
raise ValueError("{} receieves the length of {}, "
"but the expected shape is {}".format(
key, len(state), len(param)))
else:
return param, state
else:
state_shape = state.shape() if inspect.ismethod(
state.shape) else state.shape
if list(state_shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state_shape), list(param.shape)))
return param, state
matched_param_state = []
for key, param in self.state_dict().items():
......
......@@ -133,7 +133,12 @@ def monkey_patch_math_varbase():
return int(var.numpy().flatten()[0])
def _len_(var):
return var.shape[0]
if var.type == core.VarDesc.VarType.VOCAB:
return len(var.value().get_map_tensor())
elif var.type == core.VarDesc.VarType.STRINGS:
return len(var.value().get_string_tensor())
else:
return var.shape[0]
def _index_(var):
numel = np.prod(var.shape)
......
......@@ -146,25 +146,35 @@ def monkey_patch_varbase():
out = linear(t) # call with different weight
"""
assert isinstance(value, (np.ndarray, core.VarBase)), \
"Variable set_value function, arguments type only support Variable, numpy, VarBase"
value_np = value
if isinstance(value, core.VarBase):
value_np = value.numpy()
assert isinstance(value, (np.ndarray, core.VarBase, dict, str)), \
"Variable set_value function, arguments type only support Variable, numpy, VarBase, dict, string."
if isinstance(value, (dict, str)):
assert len(self) == len(
value
), "Variable length not match, Variable [ {} ] need tensor with length {} but load set tensor with length {}".format(
self.name, len(self), len(value))
if isinstance(value, dict):
self.value().set_vocab(value)
else:
self.value().set_string_list(value)
else:
value_np = value
if isinstance(value, core.VarBase):
value_np = value.numpy()
self_tensor_np = self.numpy()
self_tensor_np = self.numpy()
assert self_tensor_np.shape == value_np.shape, \
"Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format(
self.name, self_tensor_np.shape, value_np.shape)
assert self_tensor_np.shape == value_np.shape, \
"Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format(
self.name, self_tensor_np.shape, value_np.shape)
assert self_tensor_np.dtype == value_np.dtype, \
"Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
self.name, self_tensor_np.dtype, value_np.dtype)
assert self_tensor_np.dtype == value_np.dtype, \
"Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
self.name, self_tensor_np.dtype, value_np.dtype)
self.value().get_tensor().set(value_np,
framework._current_expected_place())
self.value().get_tensor().set(value_np,
framework._current_expected_place())
@framework.dygraph_only
def backward(self, grad_tensor=None, retain_graph=False):
......
......@@ -792,9 +792,11 @@ class Executor(object):
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
var = global_block.var(feed_target_name)
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = _as_lodtensor(cur_feed, self.place, var.dtype)
check_feed_shape_type(var, cur_feed)
if var.dtype != core.VarDesc.VarType.STRINGS:
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = _as_lodtensor(cur_feed, self.place,
var.dtype)
check_feed_shape_type(var, cur_feed)
idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
......
......@@ -979,6 +979,10 @@ class Variable(object):
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
if dtype == core.VarDesc.VarType.STRINGS:
type = core.VarDesc.VarType.STRINGS
lod_level = None
self.belong_to_optimizer = belong_to_optimizer
self.error_clip = error_clip
......
......@@ -29,10 +29,14 @@ def tensor_copy_from_cpu(self, data):
'''
Support input type check based on tensor.copy_from_cpu.
'''
if not isinstance(data, np.ndarray):
if isinstance(data, np.ndarray) or (isinstance(data, list) and
len(data) > 0 and
isinstance(data[0], str)):
self.copy_from_cpu_bind(data)
else:
raise TypeError(
"In copy_from_cpu, we only support numpy ndarray data type.")
self.copy_from_cpu_bind(data)
"In copy_from_cpu, we only support numpy ndarray and list[str] data type."
)
Tensor.copy_from_cpu = tensor_copy_from_cpu
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import io
import os
import unittest
import numpy as np
import paddle
import paddle.nn as nn
from paddle.dataset.common import DATA_HOME
from paddle.fluid.framework import core, in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
import sys
sys.path.append("./tokenizer")
from tokenizer.bert_tokenizer import BertTokenizer
def to_string_tensor(string_values, name):
    """
    Create a tensor whose value holds the given list of strings.
    NOTICE: The value is held in the CPU place.
    Args:
        string_values(list[string]): The value to be set into the tensor.
        name(string): The name of the tensor.
    Returns:
        A paddle.Tensor of variable type STRINGS wrapping ``string_values``.
    """
    # Ctor args appear to be (dtype=STRING, dims=[], name, type=STRINGS,
    # persistable=False) -- assumption from the call site, TODO confirm.
    tensor = paddle.Tensor(core.VarDesc.VarType.STRING, [], name,
                           core.VarDesc.VarType.STRINGS, False)
    tensor.value().set_string_list(string_values)
    return tensor
def to_map_tensor(string_dict, name):
    """
    Create a tensor whose value holds a map whose keys are strings and
    whose values are ints.
    NOTICE: The value is held in the CPU place.
    Args:
        string_dict(dict): The value to be set into the tensor.
        name(string): The name of the tensor.
    Returns:
        A paddle.Tensor of variable type VOCAB wrapping ``string_dict``.
    """
    # Ctor args appear to be (dtype=RAW, dims=[], name, type=VOCAB,
    # persistable=True) -- assumption from the call site, TODO confirm.
    tensor = paddle.Tensor(core.VarDesc.VarType.RAW, [], name,
                           core.VarDesc.VarType.VOCAB, True)
    tensor.value().set_vocab(string_dict)
    return tensor
class FasterTokenizer(nn.Layer):
    """Layer wrapping the ``faster_tokenizer`` C++ op.

    The vocab dict is converted to a VOCAB tensor and registered as a
    persistable buffer, so it is saved and restored with the model.
    """

    def __init__(self, vocab_dict):
        super(FasterTokenizer, self).__init__()
        vocab_tensor = to_map_tensor(vocab_dict, "vocab")
        # Buffer (not parameter): carries no gradient but is persisted.
        self.register_buffer("vocab", vocab_tensor, persistable=True)

    def forward(self,
                text,
                text_pair=None,
                do_lower_case=True,
                max_seq_len=-1,
                is_split_into_words=False,
                pad_to_max_seq_len=False):
        """Tokenize ``text`` (optionally paired with ``text_pair``).

        Returns:
            (input_ids, seg_ids): token ids and segment/token-type ids,
            both int64 variables produced by the faster_tokenizer op.
        """
        if in_dygraph_mode():
            # Dygraph fast path: call the op directly; attributes are
            # passed as alternating name/value arguments.
            input_ids, seg_ids = core.ops.faster_tokenizer(
                self.vocab, text, text_pair, "do_lower_case", do_lower_case,
                "max_seq_len", max_seq_len, "pad_to_max_seq_len",
                pad_to_max_seq_len, "is_split_into_words", is_split_into_words)
            return input_ids, seg_ids

        # Static-graph path: append the op to the current program.
        attrs = {
            "do_lower_case": do_lower_case,
            "max_seq_len": max_seq_len,
            "pad_to_max_seq_len": pad_to_max_seq_len,
            "is_split_into_words": is_split_into_words,
        }
        helper = LayerHelper("faster_tokenizer")
        input_ids = helper.create_variable_for_type_inference(dtype="int64")
        seg_ids = helper.create_variable_for_type_inference(dtype="int64")
        # The op's TextPair input is optional; omit it when not provided.
        if text_pair is None:
            helper.append_op(
                type='faster_tokenizer',
                inputs={'Vocab': self.vocab,
                        'Text': text},
                outputs={'InputIds': input_ids,
                         'SegmentIds': seg_ids},
                attrs=attrs)
        else:
            helper.append_op(
                type='faster_tokenizer',
                inputs={
                    'Vocab': self.vocab,
                    'Text': text,
                    'TextPair': text_pair
                },
                outputs={'InputIds': input_ids,
                         'SegmentIds': seg_ids},
                attrs=attrs)
        return input_ids, seg_ids
class Predictor(object):
    """Thin wrapper around a paddle-inference predictor.

    Loads ``inference.pdmodel``/``inference.pdiparams`` from ``model_dir``
    and exposes a single ``predict`` call returning (input_ids,
    token_type_ids).
    """

    def __init__(self, model_dir):
        model_file = os.path.join(model_dir, "inference.pdmodel")
        params_file = os.path.join(model_dir, "inference.pdiparams")
        # Fail fast with an explicit message if either artifact is missing.
        for required_path, message in (
            (model_file, "not find model file path {}"),
            (params_file, "not find params file path {}")):
            if not os.path.exists(required_path):
                raise ValueError(message.format(required_path))
        inference_config = paddle.inference.Config(model_file, params_file)
        # fast_tokenizer op only support cpu.
        inference_config.disable_gpu()
        inference_config.set_cpu_math_library_num_threads(10)
        inference_config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(inference_config)
        # Cache one handle per declared input/output, in declaration order.
        self.input_handles = []
        for input_name in self.predictor.get_input_names():
            self.input_handles.append(
                self.predictor.get_input_handle(input_name))
        self.output_handles = []
        for output_name in self.predictor.get_output_names():
            self.output_handles.append(
                self.predictor.get_output_handle(output_name))

    def predict(self, data):
        """Feed ``data`` into the first input, run, fetch both outputs."""
        self.input_handles[0].copy_from_cpu(data)
        self.predictor.run()
        ids = self.output_handles[0].copy_to_cpu()
        segments = self.output_handles[1].copy_to_cpu()
        return ids, segments
class TestBertTokenizerOp(unittest.TestCase):
    """Compares the C++ ``faster_tokenizer`` op with the reference Python
    ``BertTokenizer``: for identical inputs, both must emit identical
    input_ids and token_type_ids."""

    def setUp(self):
        # The op-backed layer shares the reference tokenizer's vocab.
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
        self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab)
        self.init_data()
        # Paths used by test_inference for the saved static-graph model.
        self.save_path = os.path.join(DATA_HOME, "fast_tokenizer")
        self.param_path = os.path.join(self.save_path, "model.pdparams")
        self.inference_path = os.path.join(self.save_path, "inference")

    def init_data(self):
        # Single sample (batch_size = 1) plus its optional pair sentence,
        # both as Python lists and as STRINGS tensors for the op.
        self.text = [
            '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。'
            '酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,'
            '还算丰富。 服务吗,一般'
        ]
        self.text_pair = ['非常不错,服务很好,位于市中心区,交通方便,不过价格也高!']
        self.text_tensor = to_string_tensor(self.text, "text")
        self.text_pair_tensor = to_string_tensor(self.text_pair, "text_pair")
        # Batch of three samples (mixed Chinese/English) and their pairs.
        self.texts = [
            '很好的地理位置,一蹋糊涂的服务,萧条的酒店。',
            ' 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,'
            '但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般',
            'Test bert tokenizer. The first text.'
        ]
        self.text_pairs = [
            '非常不错,服务很好,位于市中心区,交通方便,不过价格也高!', '房间太小。其他的都一般。。。。。。。。。',
            'Test bert tokenizer. The second text.'
        ]
        self.texts_tensor = to_string_tensor(self.texts, "texts")
        self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs")

    def test_padding(self):
        """With pad_to_max_seq_len=True the op must match the Python
        tokenizer for single texts, text pairs, and batches of both."""
        self.max_seq_len = 128
        self.pad_to_max_seq_len = True
        self.is_split_into_words = False
        # case 1: only one text (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.text_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            text=self.text,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        # Integer ids are compared with a tight tolerance (exact match).
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 2: only one text and one text_pair (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.text_tensor,
            text_pair=self.text_pair_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            text=self.text,
            text_pair=self.text_pair,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 3: only texts (batch_size = 3)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.texts_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.texts,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = [i["input_ids"] for i in encoded_inputs]
        py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs]
        py_input_ids = np.array(py_input_ids).reshape([3, -1])
        py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 4: texts and text pairs (batch_size = 3)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.texts_tensor,
            text_pair=self.text_pairs_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.texts,
            self.text_pairs,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = [i["input_ids"] for i in encoded_inputs]
        py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs]
        py_input_ids = np.array(py_input_ids).reshape([3, -1])
        py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))

    def test_no_padding(self):
        """Same comparison as test_padding but with padding disabled."""
        self.max_seq_len = 128
        self.pad_to_max_seq_len = False
        self.is_split_into_words = False
        # case 1: only one text (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.text_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.text,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 2: only one text and one text_pair (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            self.text_tensor,
            self.text_pair_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.text,
            self.text_pair,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))

    def test_is_split_into_words(self):
        """Pre-split input: the op is fed the whole string while the
        reference tokenizer receives the list of characters."""
        self.is_split_into_words = True
        input_ids, token_type_ids = self.faster_tokenizer(
            self.text_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            list(self.text[0]), is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs["token_type_ids"]).reshape(
            [1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))

    def test_inference(self):
        """Round-trip: save params, reload, export to static graph, run
        through the inference predictor and compare with the reference."""
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path, exist_ok=True)
        paddle.save(self.faster_tokenizer.state_dict(), self.param_path)
        state_dict = paddle.load(self.param_path)
        self.faster_tokenizer.set_dict(state_dict)
        static_model = paddle.jit.to_static(
            self.faster_tokenizer,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None], dtype=core.VarDesc.VarType.STRINGS),  # texts
            ])
        # Save in static graph model.
        paddle.jit.save(static_model, self.inference_path)
        predictor = Predictor(self.save_path)
        input_ids, token_type_ids = predictor.predict(self.text)
        encoded_inputs = self.bert_tokenizer(self.text)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))

    def test_feed_string_var(self):
        """A STRINGS variable can be fed to a static-graph executor."""
        paddle.enable_static()
        x = paddle.static.data(
            name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS)
        exe = paddle.static.Executor(paddle.framework.CPUPlace())
        exe.run(paddle.static.default_main_program(), feed={'x': self.text})
        paddle.disable_static()
# Run the tokenizer-op tests when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import io
import json
import os
import six
import unicodedata
from tokenizer_utils import PretrainedTokenizer
from tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation
class BasicTokenizer(object):
    """
    Runs basic tokenization: invalid-character cleanup, CJK-character
    isolation, optional lower casing with accent stripping, and
    punctuation splitting.
    Args:
        do_lower_case (bool):
            Whether or not to lowercase the input when tokenizing.
            Defaults to `True`.
    """

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer."""
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """
        Tokenizes a piece of text using the basic tokenizer.
        Args:
            text (str): A piece of text.
        Returns:
            list(str): A list of tokens, e.g. 'He was a puppeteer' ->
            ['he', 'was', 'a', 'puppeteer'] when lower casing is on.
        """
        normalized = convert_to_unicode(text)
        normalized = self._clean_text(normalized)
        normalized = self._tokenize_chinese_chars(normalized)
        pieces = []
        for word in whitespace_tokenize(normalized):
            if self.do_lower_case:
                # Accents are only stripped on the lower-casing path.
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """
        Strips accents (Unicode combining marks, category 'Mn') from text.
        """
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """
        Splits punctuation on a piece of text; every punctuation
        character becomes its own token.
        """
        groups = []
        open_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                open_new_group = True
            else:
                if open_new_group:
                    groups.append([])
                open_new_group = False
                groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """
        Adds whitespace around any CJK character.
        """
        parts = []
        for ch in text:
            if self._is_chinese_char(ord(ch)):
                parts.extend((" ", ch, " "))
            else:
                parts.append(ch)
        return "".join(parts)

    def _is_chinese_char(self, cp):
        """
        Checks whether `cp` is the codepoint of a CJK character.
        """
        # "Chinese character" means anything in the CJK Unicode blocks:
        # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        # Note these blocks do NOT cover all Japanese/Korean characters:
        # Hangul, Hiragana and Katakana live elsewhere and are written as
        # space-separated words, so they need no special handling here.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """
        Performs invalid character removal and whitespace cleanup on text.
        """
        kept = []
        for ch in text:
            code = ord(ch)
            # Drop NUL, the replacement character, and control characters.
            if code == 0 or code == 0xfffd or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
class WordpieceTokenizer(object):
    """
    Runs WordPiece tokenization with a greedy longest-match-first
    algorithm over a fixed vocabulary.
    Args:
        vocab (Vocab|dict):
            Vocab of the word piece tokenizer.
        unk_token (str):
            A specific token to replace all unknown tokens.
        max_input_chars_per_word (int):
            Words longer than this are mapped to `unk_token` as a whole.
            Defaults to 100.
    """

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces.
        `text` should be a single token or whitespace separated tokens that
        have already been passed through `BasicTokenizer`.
        Returns:
            list (str): A list of wordpiece tokens, e.g. "unaffable" ->
            ["un", "##aff", "##able"] with a BERT vocab.
        """
        wordpieces = []
        for word in whitespace_tokenize(text):
            characters = list(word)
            # Over-long words are treated as unknown wholesale.
            if len(characters) > self.max_input_chars_per_word:
                wordpieces.append(self.unk_token)
                continue
            pieces = []
            cursor = 0
            failed = False
            while cursor < len(characters):
                # Try the longest remaining substring first, shrinking the
                # right edge until a vocab entry matches.
                stop = len(characters)
                match = None
                while cursor < stop:
                    candidate = "".join(characters[cursor:stop])
                    if cursor > 0:
                        # Non-initial pieces carry the continuation prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        match = candidate
                        break
                    stop -= 1
                if match is None:
                    # No span starting at `cursor` is in the vocab: the
                    # whole word becomes `unk_token`.
                    failed = True
                    break
                pieces.append(match)
                cursor = stop
            if failed:
                wordpieces.append(self.unk_token)
            else:
                wordpieces.extend(pieces)
        return wordpieces
class BertTokenizer(PretrainedTokenizer):
    """
    Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation
    splitting, lower casing and so on, and follows a WordPiece tokenizer to
    tokenize as subwords.
    Args:
        vocab_file (str):
            The vocabulary file path (ends with '.txt') required to instantiate
            a `WordpieceTokenizer`.
        do_lower_case (bool):
            Whether or not to lowercase the input when tokenizing.
            Defaults to `True`.
        unk_token (str):
            A special token representing the *unknown (out-of-vocabulary)* token.
            An unknown token is set to be `unk_token` in order to be converted to an ID.
            Defaults to "[UNK]".
        sep_token (str):
            A special token separating two different sentences in the same input.
            Defaults to "[SEP]".
        pad_token (str):
            A special token used to make arrays of tokens the same size for batching purposes.
            Defaults to "[PAD]".
        cls_token (str):
            A special token used for sequence classification. It is the last token
            of the sequence when built with special tokens. Defaults to "[CLS]".
        mask_token (str):
            A special token representing a masked token. This is the token used
            in the masked language modeling task which the model tries to predict the original unmasked ones.
            Defaults to "[MASK]".
    Examples:
        .. code-block::
            from paddlenlp.transformers import BertTokenizer
            berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            inputs = berttokenizer.tokenize('He was a puppeteer')
            print(inputs)
            '''
            {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]}
            '''
    """
    resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained
    # URLs of the vocabulary files for each built-in pretrained model name,
    # used by `from_pretrained` to download and cache the vocab.
    pretrained_resource_files_map = {
        "vocab_file": {
            "bert-base-uncased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt",
            "bert-large-uncased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt",
            "bert-base-cased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt",
            "bert-large-cased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt",
            "bert-base-multilingual-uncased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt",
            "bert-base-multilingual-cased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt",
            "bert-base-chinese":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
            "bert-wwm-chinese":
            "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt",
            "bert-wwm-ext-chinese":
            "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt",
            "macbert-large-chinese":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
            "macbert-base-chinese":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
            "simbert-base-chinese":
            "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt",
        }
    }
    # Default `__init__` keyword arguments for each built-in pretrained model
    # name, merged in by `from_pretrained`.
    pretrained_init_configuration = {
        "bert-base-uncased": {
            "do_lower_case": True
        },
        "bert-large-uncased": {
            "do_lower_case": True
        },
        "bert-base-cased": {
            "do_lower_case": False
        },
        "bert-large-cased": {
            "do_lower_case": False
        },
        "bert-base-multilingual-uncased": {
            "do_lower_case": True
        },
        "bert-base-multilingual-cased": {
            "do_lower_case": False
        },
        "bert-base-chinese": {
            "do_lower_case": False
        },
        "bert-wwm-chinese": {
            "do_lower_case": False
        },
        "bert-wwm-ext-chinese": {
            "do_lower_case": False
        },
        "macbert-large-chinese": {
            "do_lower_case": False
        },
        "macbert-base-chinese": {
            "do_lower_case": False
        },
        "simbert-base-chinese": {
            "do_lower_case": True
        },
    }
    # BERT pads on the right-hand side of the sequence.
    padding_side = 'right'
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]"):
        # A readable error up front: a missing vocab file would otherwise fail
        # deep inside `load_vocabulary`.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                "vocabulary from a pretrained model please use "
                "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token)
        self.do_lower_case = do_lower_case
        # Basic tokenizer handles punctuation splitting / lower casing; the
        # wordpiece tokenizer then splits each basic token into subwords.
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=unk_token)
        # Consumed by `PretrainedTokenizer.__getattr__` to expose e.g.
        # `self.unk_token` / `self.unk_token_id` attributes.
        self.special_tokens_map = {
            'unk_token': unk_token,
            'sep_token': sep_token,
            'pad_token': pad_token,
            'cls_token': cls_token,
            'mask_token': mask_token
        }
    @property
    def vocab_size(self):
        """
        Return the size of vocabulary.
        Returns:
            int: The size of vocabulary.
        """
        return len(self.vocab)
    def _tokenize(self, text):
        """
        End-to-end tokenization for BERT models: basic tokenization followed
        by wordpiece tokenization of each basic token.
        Args:
            text (str): The text to be tokenized.
        Returns:
            list: A list of string representing converted tokens.
        """
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens
    def tokenize(self, text):
        """
        Converts a string to a list of tokens.
        Args:
            text (str): The text to be tokenized.
        Returns:
            List(str): A list of string representing converted tokens.
        Examples:
            .. code-block::
                from paddlenlp.transformers import BertTokenizer
                berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                tokens = berttokenizer.tokenize('He was a puppeteer')
                '''
                ['he', 'was', 'a', 'puppet', '##eer']
                '''
        """
        return self._tokenize(text)
    def num_special_tokens_to_add(self, pair=False):
        """
        Returns the number of added tokens when encoding a sequence with special tokens.
        Args:
            pair(bool):
                Whether the input is a sequence pair or a single sequence.
                Defaults to `False` and the input is a single sequence.
        Returns:
            int: Number of tokens added to sequences.
        """
        # Build inputs from empty sequences so only the special tokens remain.
        token_ids_0 = []
        token_ids_1 = []
        return len(
            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1
                                                  if pair else None))
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens.
        A BERT sequence has the following format:
        - single sequence:      ``[CLS] X [SEP]``
        - pair of sequences:        ``[CLS] A [SEP] B [SEP]``
        Args:
            token_ids_0 (List[int]):
                List of IDs to which the special tokens will be added.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.
        Returns:
            List[int]: List of input_id with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        _cls = [self.cls_token_id]
        _sep = [self.sep_token_id]
        return _cls + token_ids_0 + _sep + token_ids_1 + _sep
    def create_token_type_ids_from_sequences(self,
                                             token_ids_0,
                                             token_ids_1=None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        ::
            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |
        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
        Args:
            token_ids_0 (List[int]):
                A list of `inputs_ids` for the first sequence.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.
        Returns:
            List[int]: List of token_type_id according to the given sequence(s).
        """
        _sep = [self.sep_token_id]
        _cls = [self.cls_token_id]
        if token_ids_1 is None:
            # [CLS] A [SEP] -> all zeros.
            return len(_cls + token_ids_0 + _sep) * [0]
        # [CLS] A [SEP] -> 0s; B [SEP] -> 1s.
        return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 +
                                                          _sep) * [1]
    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``encode`` methods.
        Args:
            token_ids_0 (List[int]):
                A list of `inputs_ids` for the first sequence.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.
            already_has_special_tokens (bool, optional): Whether or not the token list is already
                formatted with special tokens for the model. Defaults to `False`.
        Returns:
            List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Flag existing [CLS]/[SEP] positions in the given ids.
            return list(
                map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
                    token_ids_0))
        if token_ids_1 is not None:
            # [CLS] A [SEP] B [SEP] -> 1 at each special-token position.
            return [1] + ([0] * len(token_ids_0)) + [1] + (
                [0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import io
import json
import os
import unicodedata
from shutil import copyfile
from typing import Iterable, Iterator, Optional, List, Any, Callable, Union
from paddle.dataset.common import DATA_HOME
from paddle.utils.download import get_path_from_url
def convert_to_unicode(text):
    """
    Converts `text` to Unicode (if it's not already), assuming utf-8 input.

    Args:
        text (str|bytes): Text to be converted to unicode.
    Returns:
        str: converted text.
    Raises:
        ValueError: If `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, bytes):
        # Undecodable byte sequences are silently dropped ("ignore").
        return text.decode("utf-8", "ignore")
    if isinstance(text, str):
        return text
    raise ValueError("Unsupported string type: %s" % (type(text)))
def whitespace_tokenize(text):
    """
    Runs basic whitespace cleaning and splitting on a piece of text.

    Args:
        text (str): Text to be tokenized.
    Returns:
        list(str): Token list; empty if `text` is empty or all whitespace.
    """
    stripped = text.strip()
    # str.split() with no argument collapses runs of whitespace.
    return stripped.split() if stripped else []
def _is_whitespace(char):
"""
Checks whether `chars` is a whitespace character.
"""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    # Note that the CJK Unicode block is NOT all Japanese and Korean
    # characters, despite its name. The modern Korean Hangul alphabet is a
    # different block, as is Japanese Hiragana and Katakana. Those alphabets
    # are used to write space-separated words, so they are not treated
    # specially and handled like all of the other languages.
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )
    return any(low <= cp <= high for low, high in cjk_ranges)
def tokenize_chinese_chars(text):
    """Splits `text` into a list of segments in which every CJK character
    is its own item, while runs of non-CJK characters stay together."""
    segments = []
    pending = ""
    for ch in text:
        cp = ord(ch)
        # CJK Unified Ideographs blocks (see the Unicode CJK block list);
        # inlined here from `is_chinese_char` for a self-contained check.
        in_cjk_block = (0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF or
                        0x20000 <= cp <= 0x2A6DF or 0x2A700 <= cp <= 0x2B73F or
                        0x2B740 <= cp <= 0x2B81F or 0x2B820 <= cp <= 0x2CEAF or
                        0xF900 <= cp <= 0xFAFF or 0x2F800 <= cp <= 0x2FA1F)
        if in_cjk_block:
            # Flush any accumulated non-CJK run, then emit the CJK char alone.
            if pending:
                segments.append(pending)
                pending = ""
            segments.append(ch)
        else:
            pending += ch
    if pending:
        segments.append(pending)
    return segments
class PretrainedTokenizer(object):
    """
    The base class for all pretrained tokenizers. It mainly provides common methods
    for loading (construction and loading) and saving pretrained tokenizers. Loading
    and saving also rely on the following class attributes which should be overridden
    by derived classes accordingly:
    - **tokenizer_config_file** (str): Represents the file name of tokenizer
      configuration for configuration saving and loading in local file system.
      The value is `tokenizer_config.json`.
    - **resource_files_names** (dict): Represents resources to specific file
      names mapping for resource saving and loading in local file system. The
      keys of dict representing resource items should be argument names in
      tokenizer's `__init__` method, and the values are file names for saving
      and loading corresponding resources. The mostly used resources here are
      vocabulary file and sentence-piece model file.
    - **pretrained_init_configuration** (dict): Provides the tokenizer configurations
      of built-in pretrained tokenizers (contrasts to tokenizers in local file
      system). It has pretrained tokenizer names as keys (the same as pretrained
      model names, such as `bert-base-uncased`), and the values are dict preserving
      corresponding configuration for tokenizer initialization.
    - **pretrained_resource_files_map** (dict): Provides resource URLs of built-in
      pretrained tokenizers (contrasts to tokenizers in local file system). It
      has the same keys as `resource_files_names`, and the values are also `dict`
      mapping specific pretrained tokenizer names (such as `bert-base-uncased`)
      to corresponding resource URLs.
    Moreover, methods common to tokenizers for tokenization, token/id conversion
    and encoding as model inputs are also provided here.
    Besides, metaclass `InitTrackerMeta` is used to create `PretrainedTokenizer`,
    by which subclasses can track arguments for initialization automatically
    and expose special tokens initialization used as attributes.
    """
    tokenizer_config_file = "tokenizer_config.json"
    pretrained_init_configuration = {}
    resource_files_names = {} # keys are arguments of __init__
    pretrained_resource_files_map = {}
    # Which side sequences are padded on; subclasses may override.
    padding_side = 'right'
    # token_type_id used for padding positions.
    pad_token_type_id = 0
def __call__(self,
text,
text_pair=None,
max_seq_len: Optional[int]=None,
stride=0,
is_split_into_words=False,
pad_to_max_seq_len=False,
truncation_strategy="longest_first",
return_position_ids=False,
return_token_type_ids=True,
return_attention_mask=False,
return_length=False,
return_overflowing_tokens=False,
return_special_tokens_mask=False):
"""
Performs tokenization and uses the tokenized tokens to prepare model
inputs. It supports sequence or sequence pair as input, and batch input
is allowed. `self.encode()` or `self.batch_encode()` would be called
separately for single or batch input depending on input format and
`is_split_into_words` argument.
Args:
text (str, List[str] or List[List[str]]):
The sequence or batch of sequences to be processed. One sequence
is a string or a list of strings depending on whether it has been
pretokenized. If each sequence is provided as a list of strings
(pretokenized), you must set `is_split_into_words` as `True` to
disambiguate with a batch of sequences.
text_pair (str, List[str] or List[List[str]], optional):
Same as `text` argument, while it represents for the latter
sequence of the sequence pair.
max_seq_len (int, optional):
If set to a number, will limit the total sequence returned so
that it has a maximum length. If there are overflowing tokens,
those overflowing tokens will be added to the returned dictionary
when `return_overflowing_tokens` is `True`. Defaults to `None`.
stride (int, optional):
Only available for batch input of sequence pair and mainly for
question answering usage. When for QA, `text` represents questions
and `text_pair` represents contexts. If `stride` is set to a
positive number, the context will be split into multiple spans
where `stride` defines the number of (tokenized) tokens to skip
from the start of one span to get the next span, thus will produce
a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample'
and 'offset_mapping' preserving the original example and position
information will be added to the returned dictionary. Defaults to 0.
pad_to_max_seq_len (bool, optional):
If set to `True`, the returned sequences would be padded up to
`max_seq_len` specified length according to padding side
(`self.padding_side`) and padding token id. Defaults to `False`.
truncation_strategy (str, optional):
String selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence
until the input is under `max_seq_len` starting from the longest
one at each token (when there is a pair of input sequences).
- 'only_first': Only truncate the first sequence.
- 'only_second': Only truncate the second sequence.
- 'do_not_truncate': Do not truncate (raise an error if the input
sequence is longer than `max_seq_len`).
Defaults to 'longest_first'.
return_position_ids (bool, optional):
Whether to include tokens position ids in the returned dictionary.
Defaults to `False`.
return_token_type_ids (bool, optional):
Whether to include token type ids in the returned dictionary.
Defaults to `True`.
return_attention_mask (bool, optional):
Whether to include the attention mask in the returned dictionary.
Defaults to `False`.
return_length (bool, optional):
Whether to include the length of each encoded inputs in the
returned dictionary. Defaults to `False`.
return_overflowing_tokens (bool, optional):
Whether to include overflowing token information in the returned
dictionary. Defaults to `False`.
return_special_tokens_mask (bool, optional):
Whether to include special tokens mask information in the returned
dictionary. Defaults to `False`.
Returns:
dict or list[dict] (for batch input):
The dict has the following optional items:
- **input_ids** (list[int]): List of token ids to be fed to a model.
- **position_ids** (list[int], optional): List of token position ids to be
fed to a model. Included when `return_position_ids` is `True`
- **token_type_ids** (list[int], optional): List of token type ids to be
fed to a model. Included when `return_token_type_ids` is `True`.
- **attention_mask** (list[int], optional): List of integers valued 0 or 1,
where 0 specifies paddings and should not be attended to by the
model. Included when `return_attention_mask` is `True`.
- **seq_len** (int, optional): The input_ids length. Included when `return_length`
is `True`.
- **overflowing_tokens** (list[int], optional): List of overflowing tokens.
Included when if `max_seq_len` is specified and `return_overflowing_tokens`
is True.
- **num_truncated_tokens** (int, optional): The number of overflowing tokens.
Included when if `max_seq_len` is specified and `return_overflowing_tokens`
is True.
- **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1,
with 0 specifying special added tokens and 1 specifying sequence tokens.
Included when `return_special_tokens_mask` is `True`.
- **offset_mapping** (list[int], optional): list of pair preserving the
index of start and end char in original input for each token.
For a special token, the index pair is `(0, 0)`. Included when
`stride` works.
- **overflow_to_sample** (int, optional): Index of example from which this
feature is generated. Included when `stride` works.
"""
# Input type checking for clearer error
assert isinstance(text, str) or (
isinstance(text, (list, tuple)) and (len(text) == 0 or (
isinstance(text[0], str) or
(isinstance(text[0], (list, tuple)) and
(len(text[0]) == 0 or isinstance(text[0][0], str)))))
), ("text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples).")
assert (text_pair is None or isinstance(text_pair, str) or (
isinstance(text_pair, (list, tuple)) and (len(text_pair) == 0 or (
isinstance(text_pair[0], str) or
(isinstance(text_pair[0], (list, tuple)) and
(len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)))))
)), (
"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples).")
is_batched = bool(
(not is_split_into_words and isinstance(text, (list, tuple))) or
(is_split_into_words and isinstance(text, (list, tuple)) and
text and isinstance(text[0], (list, tuple))))
if is_batched:
batch_text_or_text_pairs = list(zip(
text, text_pair)) if text_pair is not None else text
return self.batch_encode(
batch_text_or_text_pairs=batch_text_or_text_pairs,
max_seq_len=max_seq_len,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_max_seq_len=pad_to_max_seq_len,
truncation_strategy="longest_first",
return_position_ids=return_position_ids,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_length=return_length,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask)
else:
return self.encode(
text=text,
text_pair=text_pair,
max_seq_len=max_seq_len,
pad_to_max_seq_len=pad_to_max_seq_len,
truncation_strategy="longest_first",
return_position_ids=return_position_ids,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_length=return_length,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask)
@property
def all_special_tokens(self):
"""
list: All the special tokens ('<unk>', '<cls>'...) corresponding to
special token arguments in `__init__` (arguments end with '_end').
"""
all_toks = []
set_attr = self.special_tokens_map
for attr_value in set_attr.values():
all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (
list, tuple)) else [attr_value])
all_toks = list(set(all_toks))
return all_toks
@property
def all_special_ids(self):
"""
list: All the token ids corresponding to all the special tokens.
"""
all_toks = self.all_special_tokens
all_ids = self.convert_tokens_to_ids(all_toks)
return all_ids
def convert_tokens_to_ids(self, tokens):
"""
Converts a sequence of tokens into ids using the `vocab` attribute (an
instance of `Vocab`). Override it if needed.
Args:
tokens (list[int]): List of token ids.
Returns:
list: Converted id list.
"""
if isinstance(tokens, list):
token_ids = []
for token in tokens:
token_id = self.vocab.get(token, self.unk_token_id)
token_ids.append(token_id)
return token_ids
elif isinstance(tokens, str):
token_id = self.vocab.get(tokens, self.unk_token_id)
token_ids.append(token_id)
return token_ids
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """
        Creates an instance of `PretrainedTokenizer`. Related resources are loaded
        by specifying name of a built-in pretrained model, or a community-contributed
        pretrained model, or a local file directory path.
        Args:
            pretrained_model_name_or_path (str): Name of pretrained model or dir path
                to load from. The string can be:
                - Name of built-in pretrained model
                - Name of a community-contributed pretrained model.
                - Local directory path which contains tokenizer related resources
                  and tokenizer config file ("tokenizer_config.json").
            *args (tuple): position arguments for model `__init__`. If provided,
                use these as position argument values for tokenizer initialization.
            **kwargs (dict): keyword arguments for model `__init__`. If provided,
                use these to update pre-defined keyword argument values for tokenizer
                initialization.
        Returns:
            PretrainedTokenizer: An instance of `PretrainedTokenizer`.
        Raises:
            RuntimeError: If the resource files can neither be found locally
                nor downloaded.
        Example:
            .. code-block::
                from paddlenlp.transformers import BertTokenizer
                # Name of built-in pretrained model
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                # Name of community-contributed pretrained model
                tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
                # Load from local directory path
                tokenizer = BertTokenizer.from_pretrained('./my_bert/')
        """
        pretrained_models = list(cls.pretrained_init_configuration.keys())
        vocab_files = {}
        init_configuration = {}
        # From built-in pretrained models: collect resource URLs and the
        # pre-defined init configuration for the given model name.
        if pretrained_model_name_or_path in pretrained_models:
            for file_id, map_list in cls.pretrained_resource_files_map.items():
                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
            init_configuration = copy.deepcopy(
                cls.pretrained_init_configuration[
                    pretrained_model_name_or_path])
        # From local dir path: resolve resource files and the tokenizer
        # config file under that directory.
        elif os.path.isdir(pretrained_model_name_or_path):
            for file_id, file_name in cls.resource_files_names.items():
                full_file_name = os.path.join(pretrained_model_name_or_path,
                                              file_name)
                vocab_files[file_id] = full_file_name
            vocab_files["tokenizer_config_file"] = os.path.join(
                pretrained_model_name_or_path, cls.tokenizer_config_file)
        # Resolve each resource: keep local paths as-is, reuse cached
        # downloads under DATA_HOME, otherwise download from the URL.
        default_root = os.path.join(DATA_HOME, pretrained_model_name_or_path)
        resolved_vocab_files = {}
        for file_id, file_path in vocab_files.items():
            if file_path is None or os.path.isfile(file_path):
                resolved_vocab_files[file_id] = file_path
                continue
            path = os.path.join(default_root, file_path.split('/')[-1])
            if os.path.exists(path):
                print("Already cached %s" % path)
                resolved_vocab_files[file_id] = path
            else:
                print("Downloading %s and saved to %s" %
                      (file_path, default_root))
                try:
                    resolved_vocab_files[file_id] = get_path_from_url(
                        file_path, default_root)
                except RuntimeError as err:
                    print(err)
                    raise RuntimeError(
                        f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n"
                        f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                        "- a correct model-identifier of built-in pretrained models,\n"
                        "- or a correct model-identifier of community-contributed pretrained models,\n"
                        "- or the correct path to a directory containing relevant tokenizer files.\n"
                    )
        # Prepare tokenizer initialization kwargs
        # Did we saved some inputs and kwargs to reload ?
        tokenizer_config_file = resolved_vocab_files.pop(
            "tokenizer_config_file", None)
        if tokenizer_config_file is not None:
            with io.open(tokenizer_config_file, encoding="utf-8") as f:
                init_kwargs = json.load(f)
        else:
            init_kwargs = init_configuration
        # position args are stored in kwargs, maybe better not include
        init_args = init_kwargs.pop("init_args", ())
        init_kwargs.pop("init_class", None)
        # Update with newly provided args and kwargs
        init_args = init_args if not args else args
        init_kwargs.update(kwargs)
        # Merge resolved_vocab_files arguments in init_kwargs if not including.
        # Maybe need more ways to load resources.
        for args_name, file_path in resolved_vocab_files.items():
            # when `pretrained_model_name_or_path` is a pretrained model name,
            # use pretrained_init_configuration as `init_kwargs` to init which
            # does not include the vocab file in it, thus add vocab file into
            # args.
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path
            # when `pretrained_model_name_or_path` is a pretrained model dir,
            # use tokenizer_config_file.json as `init_kwargs` to init which
            # does include a vocab file path in it. However, if the vocab file
            # path included in json does not exist, such as was deleted, to make
            # it still work, use the vocab file under this dir.
            elif not os.path.isfile(init_kwargs[args_name]) and os.path.isfile(
                    file_path):
                init_kwargs[args_name] = file_path
        # TODO(guosheng): avoid reduplication of position args and key word args
        tokenizer = cls(*init_args, **init_kwargs)
        return tokenizer
def save_pretrained(self, save_directory):
"""
Save tokenizer configuration and related resources to files under
`save_directory`. The tokenizer configuration would be saved into
`tokenizer_config_file` indicating file (thus `tokenizer_config.json`),
and resources would be saved into `resource_files_names` indicating files
by using `self.save_resources(save_directory)`.
The `save_directory` can be used in `from_pretrained` as argument value
of `pretrained_model_name_or_path` to re-load the tokenizer.
Args:
save_directory (str): Directory to save files into.
Example:
.. code-block::
from paddlenlp.transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.save_pretrained('trained_model')
# reload from save_directory
tokenizer = BertTokenizer.from_pretrained('trained_model')
"""
assert not os.path.isfile(
save_directory
), "Saving directory ({}) should be a directory, not a file".format(
save_directory)
os.makedirs(save_directory, exist_ok=True)
tokenizer_config_file = os.path.join(save_directory,
self.tokenizer_config_file)
# init_config is set in metaclass created `__init__`,
tokenizer_config = self.init_config
with io.open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
self.save_resources(save_directory)
def save_resources(self, save_directory):
"""
Save tokenizer related resources to `resource_files_names` indicating
files under `save_directory` by copying directly. Override it if necessary.
Args:
save_directory (str): Directory to save files into.
"""
for name, file_name in self.resource_files_names.items():
src_path = self.init_config[name]
dst_path = os.path.join(save_directory, file_name)
if os.path.abspath(src_path) != os.path.abspath(dst_path):
copyfile(src_path, dst_path)
@staticmethod
def load_vocabulary(filepath,
unk_token=None,
pad_token=None,
bos_token=None,
eos_token=None,
**kwargs):
"""
Instantiate an instance of `Vocab` from a file reserving all tokens
by using `Vocab.from_dict`. The file contains a token per line, and the
line number would be the index of corresponding token.
Args:
filepath (str): path of file to construct vocabulary.
unk_token (str): special token for unknown token. If no need, it also
could be `None`. Defaults to `None`.
pad_token (str): special token for padding token. If no need, it also
could be `None`. Defaults to `None`.
bos_token (str): special token for bos token. If no need, it also
could be `None`. Defaults to `None`.
eos_token (str): special token for eos token. If no need, it also
could be `None`. Defaults to `None`.
**kwargs (dict): keyword arguments for `Vocab.from_dict`.
Returns:
Vocab: An instance of `Vocab`.
"""
token_to_idx = {}
with io.open(filepath, 'r', encoding='utf-8') as f:
for index, line in enumerate(f):
token = line.rstrip('\n')
token_to_idx[token] = int(index)
return token_to_idx
def __getattr__(self, name):
if name.endswith('_token'):
return self.special_tokens_map[name]
elif name.endswith('_token_id'):
return self.vocab[self.special_tokens_map[name[:-3]]]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, name))
    def truncate_sequences(self,
                           ids,
                           pair_ids=None,
                           num_tokens_to_remove=0,
                           truncation_strategy='longest_first',
                           stride=0):
        """
        Truncates a sequence pair in place to the maximum length.
        Args:
            ids: list of tokenized input ids. Can be obtained from a string by chaining the
                `tokenize` and `convert_tokens_to_ids` methods.
            pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
                `tokenize` and `convert_tokens_to_ids` methods.
            num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``):
                number of tokens to remove using the truncation strategy
            truncation_strategy: string selected in the following options:
                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len
                  starting from the longest one at each token (when there is a pair of input sequences).
                  Overflowing tokens only contains overflow from the first sequence.
                - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove.
                - 'only_second': Only truncate the second sequence
                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len)
            stride (:obj:`int`, `optional`, defaults to ``0``):
                If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens
                from the main sequence returned. The value of this argument defines the number of additional tokens.
        Returns:
            tuple: `(ids, pair_ids, overflowing_tokens)` after truncation.
        """
        # Nothing to remove: return the inputs untouched.
        if num_tokens_to_remove <= 0:
            return ids, pair_ids, []
        if truncation_strategy == 'longest_first':
            overflowing_tokens = []
            # Remove one token at a time from whichever sequence is currently
            # longer; only tokens removed from `ids` are recorded as overflow.
            for _ in range(num_tokens_to_remove):
                if pair_ids is None or len(ids) > len(pair_ids):
                    overflowing_tokens = [ids[-1]] + overflowing_tokens
                    ids = ids[:-1]
                else:
                    pair_ids = pair_ids[:-1]
            # Prepend up to `stride` kept tokens so overflow spans overlap
            # with the retained sequence (QA-style span generation).
            window_len = min(len(ids), stride)
            if window_len > 0:
                overflowing_tokens = ids[-window_len:] + overflowing_tokens
        elif truncation_strategy == 'only_first':
            assert len(ids) > num_tokens_to_remove
            # Overflow window covers the removed tokens plus `stride` extras.
            window_len = min(len(ids), stride + num_tokens_to_remove)
            overflowing_tokens = ids[-window_len:]
            ids = ids[:-num_tokens_to_remove]
        elif truncation_strategy == 'only_second':
            assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
            overflowing_tokens = pair_ids[-window_len:]
            pair_ids = pair_ids[:-num_tokens_to_remove]
        elif truncation_strategy == 'do_not_truncate':
            raise ValueError(
                "Input sequence are too long for max_length. Please select a truncation strategy."
            )
        else:
            raise ValueError(
                "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']"
            )
        return (ids, pair_ids, overflowing_tokens)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens.
Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
List[int]: List of input_id with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0
return token_ids_0 + token_ids_1
def build_offset_mapping_with_special_tokens(self,
offset_mapping_0,
offset_mapping_1=None):
"""
Build offset map from a pair of offset map by concatenating and adding offsets of special tokens.
Should be overridden in a subclass if the model has a special way of building those.
Args:
offset_mapping_0 (List[tuple]):
List of char offsets to which the special tokens will be added.
offset_mapping_1 (List[tuple], optional):
Optional second list of char offsets for offset mapping pairs.
Returns:
List[tuple]: List of char offsets with the appropriate offsets of special tokens.
"""
if offset_mapping_1 is None:
return offset_mapping_0
return offset_mapping_0 + offset_mapping_1
def get_special_tokens_mask(self,
token_ids_0,
token_ids_1=None,
already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``encode`` methods.
Args:
token_ids_0 (List[int]): List of ids of the first sequence.
token_ids_1 (List[int], optional): List of ids of the second sequence.
already_has_special_tokens (bool, optional): Whether or not the token list is already
formatted with special tokens for the model. Defaults to None.
Returns:
results (List[int]): The list of integers in the range [0, 1]:
1 for a special token, 0 for a sequence token.
"""
return [0] * ((len(token_ids_1)
if token_ids_1 else 0) + len(token_ids_0))
def create_token_type_ids_from_sequences(self,
token_ids_0,
token_ids_1=None):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
Should be overridden in a subclass if the model has a special way of building those.
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (List[int]):
List of IDs.
token_ids_1 (List[int], optional):
Optional second list of IDs for sequence pairs.
Returns:
List[int]: List of token_type_id according to the given sequence(s).
"""
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
def num_special_tokens_to_add(self, pair):
"""
Returns the number of added tokens when encoding a sequence with special tokens.
Args:
pair (bool, optional):
Whether the number of added tokens should be computed in the case of a sequence pair or a single
sequence. Defaults to `False`.
Returns:
int: Number of special tokens added to sequences.
"""
token_ids_0 = []
token_ids_1 = []
return len(
self.build_inputs_with_special_tokens(token_ids_0, token_ids_1
if pair else None))
    def encode(self,
               text,
               text_pair=None,
               max_seq_len=512,
               pad_to_max_seq_len=False,
               truncation_strategy="longest_first",
               return_position_ids=False,
               return_token_type_ids=True,
               return_attention_mask=False,
               return_length=False,
               return_overflowing_tokens=False,
               return_special_tokens_mask=False):
        """
        Performs tokenization and uses the tokenized tokens to prepare model
        inputs. It supports sequence or sequence pair as input, and batch input
        is not allowed.
        Args:
            text (str, List[str] or List[int]):
                The sequence to be processed. One sequence is a string, a list
                of strings, or a list of integers depending on whether it has
                been pretokenized and converted to ids.
            text_pair (str, List[str] or List[int], optional):
                Same as `text` argument, while it represents for the latter
                sequence of the sequence pair. Defaults to `None`.
            max_seq_len (int, optional):
                If set to a number, will limit the total sequence returned so
                that it has a maximum length. If there are overflowing tokens,
                those overflowing tokens will be added to the returned dictionary
                when `return_overflowing_tokens` is `True`. Defaults to `512`.
            pad_to_max_seq_len (bool, optional):
                If set to `True`, the returned sequences would be padded up to
                `max_seq_len` specified length according to padding side
                (`self.padding_side`) and padding token id. Defaults to `False`.
            truncation_strategy (str, optional):
                String selected in the following options:
                - 'longest_first' (default) Iteratively reduce the inputs sequence
                until the input is under `max_seq_len` starting from the longest
                one at each token (when there is a pair of input sequences).
                - 'only_first': Only truncate the first sequence.
                - 'only_second': Only truncate the second sequence.
                - 'do_not_truncate': Do not truncate (raise an error if the input
                sequence is longer than `max_seq_len`).
                Defaults to 'longest_first'.
            return_position_ids (bool, optional):
                Whether to include tokens position ids in the returned dictionary.
                Defaults to `False`.
            return_token_type_ids (bool, optional):
                Whether to include token type ids in the returned dictionary.
                Defaults to `True`.
            return_attention_mask (bool, optional):
                Whether to include the attention mask in the returned dictionary.
                Defaults to `False`.
            return_length (bool, optional):
                Whether to include the length of each encoded inputs in the
                returned dictionary. Defaults to `False`.
            return_overflowing_tokens (bool, optional):
                Whether to include overflowing token information in the returned
                dictionary. Defaults to `False`.
            return_special_tokens_mask (bool, optional):
                Whether to include special tokens mask information in the returned
                dictionary. Defaults to `False`.
        Returns:
            dict:
                The dict has the following optional items:
                - **input_ids** (list[int]): List of token ids to be fed to a model.
                - **position_ids** (list[int], optional): List of token position ids to be
                  fed to a model. Included when `return_position_ids` is `True`
                - **token_type_ids** (list[int], optional): List of token type ids to be
                  fed to a model. Included when `return_token_type_ids` is `True`.
                - **attention_mask** (list[int], optional): List of integers valued 0 or 1,
                  where 0 specifies paddings and should not be attended to by the
                  model. Included when `return_attention_mask` is `True`.
                - **seq_len** (int, optional): The input_ids length. Included when `return_length`
                  is `True`.
                - **overflowing_tokens** (list[int], optional): List of overflowing tokens.
                  Included when if `max_seq_len` is specified and `return_overflowing_tokens`
                  is True.
                - **num_truncated_tokens** (int, optional): The number of overflowing tokens.
                  Included when if `max_seq_len` is specified and `return_overflowing_tokens`
                  is True.
                - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1,
                  with 0 specifying special added tokens and 1 specifying sequence tokens.
                  Included when `return_special_tokens_mask` is `True`.
        """

        # Normalize the three accepted input forms (raw string, pretokenized
        # string list, id list) down to a list of token ids.
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self._tokenize(text)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text,
                            (list, tuple)) and len(text) > 0 and isinstance(
                                text[0], str):
                return self.convert_tokens_to_ids(text)
            elif isinstance(text,
                            (list, tuple)) and len(text) > 0 and isinstance(
                                text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        ids = get_input_ids(text)
        pair_ids = get_input_ids(text_pair) if text_pair is not None else None
        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0
        encoded_inputs = {}
        # Truncation: Handle max sequence length. The special-token budget is
        # included so the final (decorated) sequence fits in max_seq_len.
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(
            pair=pair))
        if max_seq_len and total_len > max_seq_len:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
                pair_ids=pair_ids,
                num_tokens_to_remove=total_len - max_seq_len,
                truncation_strategy=truncation_strategy, )
            if return_overflowing_tokens:
                encoded_inputs["overflowing_tokens"] = overflowing_tokens
                encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len
        # Add special tokens
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids,
                                                                   pair_ids)
        # Build output dictionary
        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
        if return_special_tokens_mask:
            encoded_inputs[
                "special_tokens_mask"] = self.get_special_tokens_mask(ids,
                                                                      pair_ids)
        if return_length:
            encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"])
        # Check lengths
        assert max_seq_len is None or len(encoded_inputs[
            "input_ids"]) <= max_seq_len
        # Padding: pad up to max_seq_len on the side given by
        # self.padding_side; the attention mask marks real tokens with 1.
        needs_to_be_padded = pad_to_max_seq_len and \
            max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
        if needs_to_be_padded:
            difference = max_seq_len - len(encoded_inputs["input_ids"])
            if self.padding_side == 'right':
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
                        "input_ids"]) + [0] * difference
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] +
                        [self.pad_token_type_id] * difference)
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs[
                        "special_tokens_mask"] + [1] * difference
                encoded_inputs["input_ids"] = encoded_inputs[
                    "input_ids"] + [self.pad_token_id] * difference
            elif self.padding_side == 'left':
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [
                        1
                    ] * len(encoded_inputs["input_ids"])
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = (
                        [self.pad_token_type_id] * difference +
                        encoded_inputs["token_type_ids"])
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = [
                        1
                    ] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs["input_ids"] = [
                    self.pad_token_id
                ] * difference + encoded_inputs["input_ids"]
        else:
            # No padding needed/requested: the mask (if any) is all ones.
            if return_attention_mask:
                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
                    "input_ids"])
        if return_position_ids:
            encoded_inputs["position_ids"] = list(
                range(len(encoded_inputs["input_ids"])))
        return encoded_inputs
    def batch_encode(self,
                     batch_text_or_text_pairs,
                     max_seq_len=512,
                     pad_to_max_seq_len=False,
                     stride=0,
                     is_split_into_words=False,
                     truncation_strategy="longest_first",
                     return_position_ids=False,
                     return_token_type_ids=True,
                     return_attention_mask=False,
                     return_length=False,
                     return_overflowing_tokens=False,
                     return_special_tokens_mask=False):
        """
        Performs tokenization and uses the tokenized tokens to prepare model
        inputs. It supports batch inputs of sequence or sequence pair.
        Args:
            batch_text_or_text_pairs (list):
                The element of list can be sequence or sequence pair, and the
                sequence is a string or a list of strings depending on whether
                it has been pretokenized. If each sequence is provided as a list
                of strings (pretokenized), you must set `is_split_into_words` as
                `True` to disambiguate with a sequence pair.
            max_seq_len (int, optional):
                If set to a number, will limit the total sequence returned so
                that it has a maximum length. If there are overflowing tokens,
                those overflowing tokens will be added to the returned dictionary
                when `return_overflowing_tokens` is `True`. Defaults to `512`.
            stride (int, optional):
                Only available for batch input of sequence pair and mainly for
                question answering usage. When for QA, `text` represents questions
                and `text_pair` represents contexts. If `stride` is set to a
                positive number, the context will be split into multiple spans
                where `stride` defines the number of (tokenized) tokens to skip
                from the start of one span to get the next span, thus will produce
                a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample'
                and 'offset_mapping' preserving the original example and position
                information will be added to the returned dictionary. Defaults to 0.
            pad_to_max_seq_len (bool, optional):
                If set to `True`, the returned sequences would be padded up to
                `max_seq_len` specified length according to padding side
                (`self.padding_side`) and padding token id. Defaults to `False`.
            is_split_into_words (bool, optional):
                Whether each sequence is given as a list of pretokenized words
                (see `batch_text_or_text_pairs`). Defaults to `False`.
            truncation_strategy (str, optional):
                String selected in the following options:
                - 'longest_first' (default) Iteratively reduce the inputs sequence
                until the input is under `max_seq_len` starting from the longest
                one at each token (when there is a pair of input sequences).
                - 'only_first': Only truncate the first sequence.
                - 'only_second': Only truncate the second sequence.
                - 'do_not_truncate': Do not truncate (raise an error if the input
                sequence is longer than `max_seq_len`).
                Defaults to 'longest_first'.
            return_position_ids (bool, optional):
                Whether to include tokens position ids in the returned dictionary.
                Defaults to `False`.
            return_token_type_ids (bool, optional):
                Whether to include token type ids in the returned dictionary.
                Defaults to `True`.
            return_attention_mask (bool, optional):
                Whether to include the attention mask in the returned dictionary.
                Defaults to `False`.
            return_length (bool, optional):
                Whether to include the length of each encoded inputs in the
                returned dictionary. Defaults to `False`.
            return_overflowing_tokens (bool, optional):
                Whether to include overflowing token information in the returned
                dictionary. Defaults to `False`.
            return_special_tokens_mask (bool, optional):
                Whether to include special tokens mask information in the returned
                dictionary. Defaults to `False`.
        Returns:
            list[dict]:
                The dict has the following optional items:
                - **input_ids** (list[int]): List of token ids to be fed to a model.
                - **position_ids** (list[int], optional): List of token position ids to be
                  fed to a model. Included when `return_position_ids` is `True`
                - **token_type_ids** (list[int], optional): List of token type ids to be
                  fed to a model. Included when `return_token_type_ids` is `True`.
                - **attention_mask** (list[int], optional): List of integers valued 0 or 1,
                  where 0 specifies paddings and should not be attended to by the
                  model. Included when `return_attention_mask` is `True`.
                - **seq_len** (int, optional): The input_ids length. Included when `return_length`
                  is `True`.
                - **overflowing_tokens** (list[int], optional): List of overflowing tokens.
                  Included when if `max_seq_len` is specified and `return_overflowing_tokens`
                  is True.
                - **num_truncated_tokens** (int, optional): The number of overflowing tokens.
                  Included when if `max_seq_len` is specified and `return_overflowing_tokens`
                  is True.
                - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1,
                  with 0 specifying special added tokens and 1 specifying sequence tokens.
                  Included when `return_special_tokens_mask` is `True`.
                - **offset_mapping** (list[int], optional): list of pair preserving the
                  index of start and end char in original input for each token.
                  For a special token, the index pair is `(0, 0)`. Included when
                  `stride` works.
                - **overflow_to_sample** (int, optional): Index of example from which this
                  feature is generated. Included when `stride` works.
        """

        # Normalize the three accepted input forms (raw string, pretokenized
        # string list, id list) down to a list of token ids.
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self._tokenize(text)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text,
                            (list, tuple)) and len(text) > 0 and isinstance(
                                text[0], str):
                return self.convert_tokens_to_ids(text)
            elif isinstance(text,
                            (list, tuple)) and len(text) > 0 and isinstance(
                                text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        batch_encode_inputs = []
        for example_id, tokens_or_pair_tokens in enumerate(
                batch_text_or_text_pairs):
            # Disambiguate "single sequence" vs "sequence pair": a bare string
            # or (with is_split_into_words) a flat token list is a single
            # sequence; otherwise the element unpacks into (text, text_pair).
            if not isinstance(tokens_or_pair_tokens, (list, tuple)):
                text, text_pair = tokens_or_pair_tokens, None
            elif is_split_into_words and not isinstance(
                    tokens_or_pair_tokens[0], (list, tuple)):
                text, text_pair = tokens_or_pair_tokens, None
            else:
                text, text_pair = tokens_or_pair_tokens
            first_ids = get_input_ids(text)
            second_ids = get_input_ids(
                text_pair) if text_pair is not None else None
            if stride > 0 and second_ids is not None:
                # QA-style span splitting: the second sequence (context) is cut
                # into overlapping windows; each window becomes one example.
                max_len_for_pair = max_seq_len - len(
                    first_ids) - self.num_special_tokens_to_add(pair=True)
                token_offset_mapping = self.get_offset_mapping(text)
                token_pair_offset_mapping = self.get_offset_mapping(text_pair)
                offset = 0
                while offset < len(second_ids):
                    encoded_inputs = {}
                    length = len(second_ids) - offset
                    if length > max_len_for_pair:
                        length = max_len_for_pair
                    ids = first_ids
                    pair_ids = second_ids[offset:offset + length]
                    mapping = token_offset_mapping
                    pair_mapping = token_pair_offset_mapping[offset:offset +
                                                             length]
                    offset_mapping = self.build_offset_mapping_with_special_tokens(
                        mapping, pair_mapping)
                    sequence = self.build_inputs_with_special_tokens(ids,
                                                                     pair_ids)
                    token_type_ids = self.create_token_type_ids_from_sequences(
                        ids, pair_ids)
                    # Build output dictionary
                    encoded_inputs["input_ids"] = sequence
                    if return_token_type_ids:
                        encoded_inputs["token_type_ids"] = token_type_ids
                    if return_special_tokens_mask:
                        encoded_inputs[
                            "special_tokens_mask"] = self.get_special_tokens_mask(
                                ids, pair_ids)
                    if return_length:
                        encoded_inputs["seq_len"] = len(encoded_inputs[
                            "input_ids"])
                    # Check lengths
                    assert max_seq_len is None or len(encoded_inputs[
                        "input_ids"]) <= max_seq_len
                    # Padding: pad up to max_seq_len on the side given by
                    # self.padding_side; offset_mapping pads with (0, 0).
                    needs_to_be_padded = pad_to_max_seq_len and \
                        max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
                    encoded_inputs['offset_mapping'] = offset_mapping
                    if needs_to_be_padded:
                        difference = max_seq_len - len(encoded_inputs[
                            "input_ids"])
                        if self.padding_side == 'right':
                            if return_attention_mask:
                                encoded_inputs["attention_mask"] = [1] * len(
                                    encoded_inputs[
                                        "input_ids"]) + [0] * difference
                            if return_token_type_ids:
                                # 0 for padding token mask
                                encoded_inputs["token_type_ids"] = (
                                    encoded_inputs["token_type_ids"] +
                                    [self.pad_token_type_id] * difference)
                            if return_special_tokens_mask:
                                encoded_inputs[
                                    "special_tokens_mask"] = encoded_inputs[
                                        "special_tokens_mask"] + [1
                                                                  ] * difference
                            encoded_inputs["input_ids"] = encoded_inputs[
                                "input_ids"] + [self.pad_token_id] * difference
                            encoded_inputs['offset_mapping'] = encoded_inputs[
                                'offset_mapping'] + [(0, 0)] * difference
                        elif self.padding_side == 'left':
                            if return_attention_mask:
                                encoded_inputs["attention_mask"] = [
                                    0
                                ] * difference + [1] * len(encoded_inputs[
                                    "input_ids"])
                            if return_token_type_ids:
                                # 0 for padding token mask
                                encoded_inputs["token_type_ids"] = (
                                    [self.pad_token_type_id] * difference +
                                    encoded_inputs["token_type_ids"])
                            if return_special_tokens_mask:
                                encoded_inputs["special_tokens_mask"] = [
                                    1
                                ] * difference + encoded_inputs[
                                    "special_tokens_mask"]
                            encoded_inputs["input_ids"] = [
                                self.pad_token_id
                            ] * difference + encoded_inputs["input_ids"]
                            encoded_inputs['offset_mapping'] = [
                                (0, 0)
                            ] * difference + encoded_inputs['offset_mapping']
                    else:
                        if return_attention_mask:
                            encoded_inputs["attention_mask"] = [1] * len(
                                encoded_inputs["input_ids"])
                    if return_position_ids:
                        encoded_inputs["position_ids"] = list(
                            range(len(encoded_inputs["input_ids"])))
                    # Remember which original example this span came from.
                    encoded_inputs['overflow_to_sample'] = example_id
                    batch_encode_inputs.append(encoded_inputs)
                    if offset + length == len(second_ids):
                        break
                    # Advance by `stride` tokens (overlapping windows), never
                    # past the current span length.
                    offset += min(length, stride)
            else:
                # No span splitting: delegate to single-example encode().
                batch_encode_inputs.append(
                    self.encode(
                        first_ids,
                        second_ids,
                        max_seq_len=max_seq_len,
                        pad_to_max_seq_len=pad_to_max_seq_len,
                        truncation_strategy=truncation_strategy,
                        return_position_ids=return_position_ids,
                        return_token_type_ids=return_token_type_ids,
                        return_attention_mask=return_attention_mask,
                        return_length=return_length,
                        return_overflowing_tokens=return_overflowing_tokens,
                        return_special_tokens_mask=return_special_tokens_mask))
        return batch_encode_inputs
def get_offset_mapping(self, text):
"""
Returns the map of tokens and the start and end index of their start and end character.
Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372
Args:
text (str):
Input text.
Returns:
list: The offset map of input text.
"""
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token
if sub_token != self.unk_token else token)
normalized_text, char_mapping = '', []
for i, ch in enumerate(text):
if self.basic_tokenizer.do_lower_case:
ch = ch.lower()
ch = unicodedata.normalize('NFD', ch)
ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn'])
ch = ''.join([
c for c in ch
if not (ord(c) == 0 or ord(c) == 0xfffd or _is_control(c))
])
normalized_text += ch
char_mapping.extend([i] * len(ch))
text, token_mapping, offset = normalized_text, [], 0
for token in split_tokens:
if token[:2] == '##':
token = token[2:]
start = text[offset:].index(token) + offset
end = start + len(token)
token_mapping.append(
(char_mapping[start], char_mapping[end - 1] + 1))
offset = end
return token_mapping
......@@ -43,7 +43,10 @@ def _build_saved_state_dict(state_dict):
name_table = {}
for key, value in state_dict.items():
if isinstance(value, (Variable, core.VarBase)):
save_dict[key] = value.numpy()
if value.type == core.VarDesc.VarType.VOCAB:
save_dict[key] = value.value().get_map_tensor()
else:
save_dict[key] = value.numpy()
name_table[key] = value.name
else:
save_dict[key] = value
......@@ -938,8 +941,9 @@ def load(path, **configs):
if "StructuredToParameterName@@" in load_result:
for key in load_result["StructuredToParameterName@@"]:
load_result[key] = _ndarray_to_tensor(
load_result[key], config.return_numpy)
if isinstance(load_result[key], np.ndarray):
load_result[key] = _ndarray_to_tensor(
load_result[key], config.return_numpy)
if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
del load_result["StructuredToParameterName@@"]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册