未验证 提交 3f2d6a3f 编写于 作者: S Steffy-zxf 提交者: GitHub

Add FasterTokenizer Operator (#34491)

Add Tokenizer related functionalities for Transformer model in order that the process of training and predicting is consistent.

* support the text string as an input Tensor
* support the "VOCAB" unordered_map<wstring, int> as an input Tensor to lookup tokens
* Tokenizer used for BERT. This tokenizer applies an end-to-end, text string to wordpiece tokenization.
* It first applies basic tokenization, followed by wordpiece tokenization.
上级 873ee4e3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)

# Where the external project is built and where its artifacts are installed.
SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc)
SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc)
# As we add extra features for utf8proc, we use the non-official repo
SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git)
SET(UTF8PROC_TAG v2.6.1)

# Pick the static-library artifact name per platform; on Windows the
# UTF8PROC_STATIC define is required so headers do not dllimport symbols.
IF(WIN32)
SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib")
add_definitions(-DUTF8PROC_STATIC)
ELSE(WIN32)
SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a")
ENDIF(WIN32)

INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include)

# Download, configure, build and install utf8proc at build time.
# BUILD_BYPRODUCTS tells Ninja which files the step produces so that
# dependent targets rebuild correctly.
ExternalProject_Add(
extern_utf8proc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${UTF8PROC_REPOSITORY}
GIT_TAG ${UTF8PROC_TAG}
PREFIX ${UTF8PROC_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DBUILD_SHARED=ON
-DBUILD_STATIC=ON
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES}
)

# Expose the prebuilt static archive as an imported target named `utf8proc`
# that other targets can link against; depend on the external build step.
ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES})
ADD_DEPENDENCIES(utf8proc extern_utf8proc)
......@@ -124,6 +124,11 @@ function(copy_part_of_thrid_party TARGET DST)
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
set(dst_dir "${DST}/third_party/install/utf8proc")
copy(${TARGET}
SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
if (WITH_CRYPTO)
set(dst_dir "${DST}/third_party/install/cryptopp")
copy(${TARGET}
......
......@@ -210,6 +210,10 @@ include(external/threadpool)# download threadpool
include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc
include(external/utf8proc) # download, build, install utf8proc
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc)
include(external/lapack) # download, build, install lapack
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
......
......@@ -51,6 +51,8 @@ proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
data_feed_proto)
cc_library(string_array SRCS string_array.cc DEPS utf8proc)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
if(WITH_GPU)
......
......@@ -102,14 +102,18 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
VLOG(3) << "Initialize Variable " << var->Name();
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
<< " global, which pointer is " << ptr << " type is "
<< static_cast<int>(var->GetType());
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
<< " locally, which pointer is " << ptr << "Variable Type "
<< static_cast<int>(var->GetType());
}
}
} else {
......
......@@ -125,6 +125,7 @@ void DeleteUnusedTensors(const Scope &scope,
for (auto &t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else if (var->IsType<Strings>()) {
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Type %s of variable %s is not supported eager deletion.",
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <boost/variant.hpp>
#include "glog/logging.h"
namespace paddle {
......@@ -35,9 +36,24 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith(input);
auto& val = BOOST_GET(LoDTensor, feed_inputs[index]);
val.ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
val.set_lod(input.lod());
}
void SetFeedVariable(Scope* scope, const Strings& input,
const std::string& var_name, size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG(3) << "SetFeedStringVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index] = input;
}
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"
namespace paddle {
namespace framework {
......@@ -28,6 +29,9 @@ class Scope;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);
void SetFeedVariable(Scope* scope, const Strings& input,
const std::string& var_name, size_t index);
FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index);
......
......@@ -13,14 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
using FeedType = LoDTensor;
using FeedType = boost::variant<LoDTensor, Strings>;
using FeedList = std::vector<FeedType>;
using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
......@@ -43,6 +46,13 @@ inline bool data_is_lod_tensor_array(const FetchType &data) {
return false;
}
inline bool data_is_string_tensor(const FeedType &data) {
if (data.type() == typeid(Strings)) {
return true;
}
return false;
}
static const char kFeedOpType[] = "feed";
static const char kFetchOpType[] = "fetch";
......
......@@ -147,6 +147,11 @@ message VarType {
// in operators like nccl_op
RAW = 17;
TUPLE = 18;
STRING = 25;
STRINGS = 26;
VOCAB = 27;
FEED_LIST = 28;
}
required Type type = 1;
......@@ -175,6 +180,10 @@ message VarType {
message Tuple { repeated Type element_type = 1; }
optional Tuple tuple = 7;
optional TensorDesc string = 8;
optional TensorDesc strings = 9;
optional TensorDesc vocab = 10;
}
message VarDesc {
......
......@@ -76,6 +76,8 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name,
} else {
return var->Get<SelectedRows>().GetCompleteDims();
}
} else if (var->IsType<Strings>()) {
return DDim({static_cast<int64_t>(var->Get<Strings>().size())});
} else {
return DDim({-1});
}
......@@ -106,6 +108,8 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
} else {
return DataTypeToString(tensor.type());
}
} else if (var->IsType<Strings>()) {
return "strings";
} else {
return "";
}
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <utf8proc.h>
#include <exception>
#include "glog/logging.h"
#include "paddle/fluid/framework/string_array.h"
namespace paddle {
namespace framework {
// Shared UTF-8 <-> wide-string converter used by the helpers below.
std::wstring_convert<std::codecvt_utf8<wchar_t>> kConverter;

// Decode a UTF-8 encoded std::string into a std::wstring.
// Returns false (and logs at VLOG(3)) when the bytes are not valid UTF-8.
bool ConvertStrToWstr(const std::string& src, std::wstring* res) {
  bool ok = true;
  try {
    *res = kConverter.from_bytes(src);
  } catch (std::range_error& e) {
    VLOG(3) << "The string " << src << " was converted to unicode failedly! ";
    ok = false;
  }
  return ok;
}

// Encode a std::wstring back into a UTF-8 std::string.
void ConvertWstrToStr(const std::wstring& src, std::string* res) {
  res->assign(kConverter.to_bytes(src));
}
// Normalization Form Canonical Decomposition.
// Decomposes `s` into NFD via utf8proc. On failure (utf8proc returns NULL,
// e.g. for invalid UTF-8 or allocation failure) `*ret` is left empty.
void NFD(const std::string& s, std::string* ret) {
  ret->clear();
  // utf8proc_NFD returns a malloc'd, NUL-terminated buffer, or NULL on error.
  char* result = reinterpret_cast<char*>(
      utf8proc_NFD(reinterpret_cast<const unsigned char*>(s.c_str())));
  if (result) {
    // Plain assign; the previous `*ret = std::move(std::string(result))`
    // applied std::move to a temporary, which is a redundant pessimizing-move.
    ret->assign(result);
    free(result);
  }
}
// Serialize a std::unordered_map<std::string, int32_t> to `os`.
// Binary layout: [size_t count] followed, per entry, by
// [size_t length][length bytes of token][int32_t token_id].
void StringMapToStream(std::ostream& os,
                       const std::unordered_map<std::string, int32_t>& data) {
  // Leading element count.
  const size_t count = data.size();
  os.write(reinterpret_cast<const char*>(&count), sizeof(count));
  // Each entry: length-prefixed token followed by its id.
  for (const auto& kv : data) {
    const std::string& token = kv.first;
    const int32_t token_id = kv.second;
    const size_t length = token.size();
    os.write(reinterpret_cast<const char*>(&length), sizeof(length));
    os.write(token.c_str(), length);
    os.write(reinterpret_cast<const char*>(&token_id), sizeof(token_id));
  }
}
// Deserialize a std::unordered_map<std::string, int32_t> from `is`.
// Expects the layout produced by StringMapToStream:
// [size_t count] followed, per entry, by
// [size_t length][length bytes of token][int32_t token_id].
void StringMapFromStream(std::istream& is,
                         std::unordered_map<std::string, int32_t>* data) {
  // First read the number of entries.
  size_t map_size;
  is.read(reinterpret_cast<char*>(&map_size), sizeof(map_size));
  data->reserve(map_size);
  // Then read each length-prefixed token and its id.
  for (size_t i = 0; i < map_size; ++i) {
    size_t token_length;
    is.read(reinterpret_cast<char*>(&token_length), sizeof(token_length));
    // Read directly into a std::string. This replaces the previous manual
    // `new char[token_length]` / `delete[]` buffer, which leaked if
    // is.read() threw between allocation and deletion.
    std::string token(token_length, '\0');
    if (token_length > 0) {
      is.read(&token[0], token_length);
    }
    int32_t token_id;
    is.read(reinterpret_cast<char*>(&token_id), sizeof(token_id));
    data->emplace(std::move(token), token_id);
  }
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <codecvt>
#include <iostream>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace framework {

// Aliases for the string-typed variables used by the tokenizer machinery.
// Vocab maps a (wide) token to its integer id in the vocabulary.
using String = std::string;
using Strings = std::vector<std::string>;
using Vocab = std::unordered_map<std::wstring, std::int32_t>;

// Convert the std::string type to the std::wstring type.
// Returns false if `src` is not valid UTF-8.
bool ConvertStrToWstr(const std::string& src, std::wstring* res);
// Convert the std::wstring type to the std::string type.
void ConvertWstrToStr(const std::wstring& src, std::string* res);
// Normalization Form Canonical Decomposition.
void NFD(const std::string& s, std::string* ret);

// Write the data which is type of
// std::unordered_map<std::string, int32_t> to ostream.
void StringMapToStream(std::ostream& os,
const std::unordered_map<std::string, int32_t>& data);

// Read the data which is type of
// std::unordered_map<std::string, int32_t> from istream.
void StringMapFromStream(std::istream& is,
std::unordered_map<std::string, int32_t>* data);
} // namespace framework
} // namespace paddle
......@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <memory>
......@@ -22,6 +20,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -13,11 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <codecvt>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/dlpack_tensor.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -48,6 +54,14 @@ class PrintOptions {
PrintOptions() {}
};
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx);
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx);
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx,
const size_t& seek, const std::vector<int64_t>& shape);
// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
// and dst_place are two different GPU, to ensure that the operation can
// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
......
......@@ -209,6 +209,10 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
return desc_.type().lod_tensor().tensor();
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.type().tensor_array().tensor();
case proto::VarType::STRINGS:
return desc_.type().strings();
case proto::VarType::VOCAB:
return desc_.type().vocab();
default:
PADDLE_THROW(platform::errors::Unavailable(
"Getting 'tensor_desc' is not supported by the %s type variable.",
......@@ -249,6 +253,10 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor();
case proto::VarType::LOD_TENSOR_ARRAY:
return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
case proto::VarType::STRINGS:
return desc_.mutable_type()->mutable_strings();
case proto::VarType::VOCAB:
return desc_.mutable_type()->mutable_vocab();
default:
PADDLE_THROW(
platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
......
......@@ -18,10 +18,12 @@
#include <string>
#include <tuple>
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
......@@ -162,8 +164,8 @@ struct VarTypeRegistryImpl {
// Paddle would generate unique Ids for each registered variable types.
using VarTypeRegistry = detail::VarTypeRegistryImpl<
Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, FetchList,
Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList,
operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -177,8 +179,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId, platform::BKCLCommunicator,
#endif
int, float>;
int, float, Vocab>;
template <typename T>
struct VarTypeTrait {
static_assert(VarTypeRegistry::IsRegistered<T>(), "Must be registered type");
......@@ -208,9 +209,13 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST);
REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB);
REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING);
REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS);
/** End of variable type registration */
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -41,6 +42,10 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::STRINGS) {
var->GetMutable<Strings>();
} else if (var_type == proto::VarType::VOCAB) {
var->GetMutable<Vocab>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
......
......@@ -20,6 +20,7 @@
#include <utility>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/hooks.h"
#include "paddle/fluid/imperative/op_base.h"
......@@ -153,6 +154,15 @@ class VariableWrapper {
tensor = &(var_.Get<framework::LoDTensor>());
} else if (type_ == framework::proto::VarType::SELECTED_ROWS) {
tensor = &(var_.Get<framework::SelectedRows>().value());
} else if (type_ == framework::proto::VarType::VOCAB) {
const framework::Vocab* data = nullptr;
data = &(var_.Get<framework::Vocab>());
if (data && data->size() != 0) {
VLOG(6) << "The tensor of variable " << name_
<< " is not initialized";
return data_type_;
}
return framework::proto::VarType::VOCAB;
} else {
VLOG(6) << "Variable " << name_ << " is not initialized";
return data_type_;
......
......@@ -26,7 +26,7 @@ if(WITH_MKLDNN)
set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE)
endif()
cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc)
cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
......
......@@ -34,12 +34,14 @@ include_directories("${PADDLE_LIB}/")
set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib")
......@@ -151,12 +153,13 @@ if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf xxhash cryptopp
glog gflags protobuf xxhash cryptopp utf8proc
${EXTERNAL_LIB})
else()
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB})
glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static
${EXTERNAL_LIB})
set(DEPS ${DEPS} shlwapi.lib)
endif(NOT WIN32)
......
......@@ -43,15 +43,33 @@ void Tensor::Reshape(const std::vector<int> &shape) {
tensor->Resize(paddle::framework::make_ddim(shape));
}
#define EAGER_GET_TENSOR \
if (!tensor_) { \
tensor_ = FindTensor(); \
} \
auto *tensor = static_cast<paddle::framework::LoDTensor *>(tensor_);
void Tensor::ReshapeStrings(const size_t &shape) {
PADDLE_ENFORCE_EQ(
name_.empty(), false,
paddle::platform::errors::PreconditionNotMet(
"Need to SetName first, so that the corresponding tensor can "
"be retrieved."));
PADDLE_ENFORCE_EQ(input_or_output_, true,
paddle::platform::errors::PermissionDenied(
"Can't reshape the output tensor, it is readonly"));
auto *scope = static_cast<paddle::framework::Scope *>(scope_);
auto *var = scope->FindVar(name_);
PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
paddle_infer::Strings *tensor = var->GetMutable<paddle_infer::Strings>();
tensor->resize(shape);
}
// Lazily resolve the underlying tensor of the given type from the scope on
// first use, then expose it as a typed local named `tensor`. The cached
// pointer lives in the mutable member `tensor_`.
#define EAGER_GET_TENSOR(tensor_type) \
if (!tensor_) { \
tensor_ = FindTensor<tensor_type>(); \
} \
auto *tensor = static_cast<tensor_type *>(tensor_);
template <typename T>
T *Tensor::mutable_data(PlaceType place) {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GT(
tensor->numel(), 0,
paddle::platform::errors::PreconditionNotMet(
......@@ -83,7 +101,7 @@ T *Tensor::mutable_data(PlaceType place) {
template <typename T>
T *Tensor::data(PlaceType *place, int *size) const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto *res = tensor->data<T>();
if (paddle::platform::is_cpu_place(tensor->place())) {
......@@ -103,7 +121,7 @@ T *Tensor::data(PlaceType *place, int *size) const {
}
DataType Tensor::type() const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto type = tensor->type();
if (type == paddle::framework::proto::VarType::FP32) {
return DataType::FLOAT32;
......@@ -125,7 +143,7 @@ PlaceType Tensor::place() const { return place_; }
template <typename T>
void Tensor::CopyFromCpu(const T *data) {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_GE(tensor->numel(), 0,
paddle::platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const "
......@@ -186,10 +204,20 @@ void Tensor::CopyFromCpu(const T *data) {
}
}
// Copy a host-side Strings container into this tensor's Strings variable.
// Call ReshapeStrings() first so the variable exists in the scope.
void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
  EAGER_GET_TENSOR(paddle_infer::Strings);
  // NOTE(review): tensor->size() is unsigned, so `>= 0` is always true and
  // this enforce can never fire. The intent appears to be `> 0` (i.e. that
  // ReshapeStrings was called before copying) -- confirm before changing,
  // since `> 0` would reject legitimately empty inputs.
  PADDLE_ENFORCE_GE(tensor->size(), 0,
                    paddle::platform::errors::PreconditionNotMet(
                        "You should call Tensor::Reshape(const "
                        "std::size_t &shape)function before copying"
                        "the string data from cpu."));
  *tensor = *data;
}
template <typename T>
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
void *cb_params) const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
auto t_place = tensor->place();
......@@ -371,6 +399,7 @@ Tensor::Tensor(void *scope) : scope_{scope} {
"set to the pointer of scope."));
}
template <typename T>
void *Tensor::FindTensor() const {
PADDLE_ENFORCE_EQ(
name_.empty(), false,
......@@ -382,12 +411,12 @@ void *Tensor::FindTensor() const {
PADDLE_ENFORCE_NOT_NULL(
var, paddle::platform::errors::PreconditionNotMet(
"No tensor called [%s] in the runtime scope", name_));
auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
auto *tensor = var->GetMutable<T>();
return tensor;
}
std::vector<int> Tensor::shape() const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
PADDLE_ENFORCE_NOT_NULL(
tensor_, paddle::platform::errors::PreconditionNotMet(
"Not found tensor called %s in the scope", name_));
......@@ -395,7 +424,7 @@ std::vector<int> Tensor::shape() const {
}
void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
paddle::framework::LoD lod;
for (auto &level : x) {
lod.emplace_back(level);
......@@ -404,7 +433,7 @@ void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
}
std::vector<std::vector<size_t>> Tensor::lod() const {
EAGER_GET_TENSOR;
EAGER_GET_TENSOR(paddle::framework::LoDTensor);
std::vector<std::vector<size_t>> res;
for (auto &level : tensor->lod()) {
res.emplace_back(level);
......
......@@ -36,7 +36,10 @@ template PD_INFER_DECL int64_t *Tensor::data<int64_t>(PlaceType *place,
template float *Tensor::mutable_data(PlaceType place);
template int64_t *Tensor::mutable_data(PlaceType place);
void *Tensor::FindTensor() const { return nullptr; }
template <typename T>
void *Tensor::FindTensor() const {
return nullptr;
}
std::vector<int> Tensor::shape() const { return {}; }
......
......@@ -88,7 +88,8 @@ bool SetPlaceAndCheck(PlaceType place, size_t length) {
const std::vector<std::vector<size_t>> lod{{0, length}};
scope.Var(name);
auto tensor = CreateTensor(place, &scope, name);
tensor->Reshape({static_cast<int>(length)});
std::vector<int> shape{static_cast<int>(length)};
tensor->Reshape(shape);
tensor->mutable_data<T>(place);
tensor->SetLoD(lod);
......
......@@ -174,6 +174,14 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor {
void copy_from_cpu(const T* data) {
return CopyFromCpu(data);
}
/// \brief Experimental interface.
/// It's usually used to set the input tensor data with Strings data type.
/// \param data The pointer of the data, from which the tensor will copy.
void copy_strings_from_cpu(const paddle_infer::Strings* data) {
return CopyStringsFromCpu(data);
}
/// \brief Copy the tensor data to the host memory.
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
......
......@@ -14,10 +14,16 @@
#pragma once
#include <string>
#include "paddle_infer_declare.h" // NOLINT
namespace paddle_infer {
/// \brief Experimental.
/// Strings for text data.
using Strings = std::vector<std::string>;
typedef void (*CallbackFunc)(void*);
#if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
......@@ -57,6 +63,14 @@ class PD_INFER_DECL Tensor {
/// \param shape The shape to set.
void Reshape(const std::vector<int>& shape);
/// \brief Experimental interface.
/// Reset the shape of the Strings tensor.
/// Generally it's only used for the input tensor.
/// Reshape must be called before calling
/// ZeroCopyStringTensorCreate() or PaddleInferTensorCreate()
/// \param shape The shape to set.
void ReshapeStrings(const std::size_t& shape);
/// \brief Get the memory pointer in CPU or GPU with specific data type.
/// Please Reshape the tensor first before call this.
/// It's usually used to get input data pointer.
......@@ -78,6 +92,11 @@ class PD_INFER_DECL Tensor {
template <typename T>
void CopyFromCpu(const T* data);
/// \brief Experimental interface.
/// It's usually used to set the input tensor data with Strings data type.
/// \param data The pointer of the data, from which the tensor will copy.
void CopyStringsFromCpu(const paddle_infer::Strings* data);
/// \brief Copy the tensor data to the host memory.
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
......@@ -122,7 +141,10 @@ class PD_INFER_DECL Tensor {
protected:
explicit Tensor(void* scope);
template <typename T>
void* FindTensor() const;
void SetPlace(PlaceType place, int device = -1);
void SetName(const std::string& name);
......
......@@ -17,11 +17,13 @@ limitations under the License. */
#include <algorithm>
#include <fstream>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/pybind.h"
DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
......@@ -85,10 +87,12 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->GetShape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
auto var_type = var->GetType();
new_var->SetType(var_type);
if (var->GetType() !=
framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) {
if ((var_type !=
framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) &&
(var_type != framework::proto::VarType::VOCAB)) {
new_var->SetLoDLevel(var->GetLoDLevel());
}
......
......@@ -17,6 +17,7 @@ add_subdirectory(metrics)
add_subdirectory(optimizers)
add_subdirectory(reduce_ops)
add_subdirectory(sequence_ops)
add_subdirectory(string)
add_subdirectory(jit)
if(WITH_MKLDNN)
add_subdirectory(mkldnn)
......@@ -78,10 +79,12 @@ if(WITH_UNITY_BUILD)
include(unity_build_rule.cmake)
endif()
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op
sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
op_library(save_combine_op DEPS string_array)
op_library(load_combine_op DEPS string_array)
if (WITH_GPU OR WITH_ROCM)
if(WITH_ROCM)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -29,6 +26,39 @@ class OpBase;
namespace paddle {
namespace operators {
// FeedVariableVisitor feeds the input data into the output variable
// according to the feed data type (LoDTensor or Strings).
class FeedVariableVisitor : public boost::static_visitor<void> {
 public:
  explicit FeedVariableVisitor(framework::Variable *out_var,
                               const platform::Place &place)
      : out_var_(out_var), place_(place) {}

  // Feed a LoDTensor: share memory when the input already lives on the
  // target place, otherwise copy it across devices; the LoD is always
  // propagated to the output.
  void operator()(const framework::LoDTensor &in_tensor) const {
    framework::LoDTensor *out_tensor =
        out_var_->GetMutable<framework::LoDTensor>();
    if (platform::is_same_place(in_tensor.place(), place_)) {
      out_tensor->ShareDataWith(in_tensor);
    } else {
      platform::DeviceContext *context =
          platform::DeviceContextPool::Instance().Get(place_);
      framework::TensorCopy(in_tensor, place_, *context, out_tensor);
    }
    out_tensor->set_lod(in_tensor.lod());
  }

  // Feed string data: strings always live on the host, so plain vector
  // assignment suffices. The previous resize() before the assignment was
  // redundant -- std::vector assignment already adjusts the size.
  void operator()(const framework::Strings &in_str) const {
    framework::Strings *out_str = out_var_->GetMutable<framework::Strings>();
    *out_str = in_str;
  }

 private:
  framework::Variable *out_var_;  // destination variable (not owned)
  const platform::Place &place_;  // target device for tensor feeds
};
class FeedOp : public framework::OperatorBase {
public:
FeedOp(const std::string &type, const framework::VariableNameMap &inputs,
......@@ -79,15 +109,9 @@ class FeedOp : public framework::OperatorBase {
col, feed_list.size()));
auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedType>();
if (platform::is_same_place(feed_item.place(), place)) {
out_item->ShareDataWith(feed_item);
} else {
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
framework::TensorCopy(feed_item, place, *dev_ctx, out_item);
}
out_item->set_lod(feed_item.lod());
FeedVariableVisitor visitor(out_var, place);
boost::apply_visitor(visitor, feed_item);
}
};
......@@ -95,17 +119,17 @@ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(vector<LoDTensor>) A feeding list of LoDTensor, which may have "
"(vector<LoDTensor>) "
"A feeding list of LoDTensor, which may have "
"different dimension and data type.");
AddOutput("Out",
"(LoDTensor) The LoDTensor which is a copy of the col-th feeding "
"(LoDTensor) The LoDTensor which is a copy "
"of the col-th feeding "
"object.");
AddAttr<int>("col", "(int) The column index of current feeding object.");
AddComment(R"DOC(
Feed Operator.
It should not be configured by users directly.
)DOC");
}
};
......
......@@ -109,6 +109,10 @@ class FetchOp : public framework::OperatorBase {
auto &src_item = fetch_var->Get<framework::LoDTensor>();
auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
DataCopy(src_item, fetch_var_name, dst_item);
} else if (fetch_var->IsType<framework::Vocab>()) {
auto &src_item = fetch_var->Get<framework::Vocab>();
auto *dst_item = &(BOOST_GET(framework::Vocab, fetch_list->at(col)));
*dst_item = src_item;
} else {
auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
framework::LoDTensorArray tmp(src_item.size());
......@@ -128,9 +132,11 @@ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X",
"(LoDTensor) The resulted LoDTensor which is expected to return "
"to users.");
AddOutput("Out",
"(vector<LoDTensor>) A fetching list of LoDTensor which may have "
"different dimension, shape and data type.");
AddOutput(
"Out",
"(vector<LoDTensor>|unordered_map<string, int32_t>) A fetching list"
" of LoDTensor|unordered_map<string, int32_t> which may have "
"different dimension, shape and data type.");
AddAttr<int>("col", "(int) The column index of fetching object.");
AddComment(R"DOC(
Fetch Operator.
......
......@@ -21,6 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......@@ -75,38 +77,57 @@ class LoadCombineOpKernel : public framework::OpKernel<T> {
out_vars[i], platform::errors::InvalidArgument(
"The variable %s to be loaded cannot be found.",
out_var_names[i]));
auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE_EQ(
static_cast<bool>(*buffer), true,
platform::errors::Unavailable(
"An error occurred while loading model parameters. "
"Please check whether the model file is complete or damaged."));
// Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = tensor->type();
auto out_dtype =
load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
out_vars[i]->Clear();
tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
if (out_vars[i]->IsType<framework::Vocab>()) {
auto *tensor = out_vars[i]->GetMutable<framework::Vocab>();
tensor->clear();
std::unordered_map<std::string, std::int32_t> data;
framework::StringMapFromStream(*buffer, &data);
for (auto it = data.begin(); it != data.end(); ++it) {
std::string tmp;
framework::NFD(it->first, &tmp);
if (tmp.empty()) {
VLOG(0) << "The string " << it->first
<< " was converted to unicode failedly! "
<< "Then dropped to load it.";
continue;
}
std::wstring token;
bool status = framework::ConvertStrToWstr(tmp, &token);
if (!status) continue;
tensor->emplace(token, it->second);
}
} else {
auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
// Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = tensor->type();
auto out_dtype =
load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
out_vars[i]->Clear();
tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
}
}
}
buffer->peek();
......
......@@ -19,11 +19,13 @@ limitations under the License. */
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
......@@ -66,34 +68,48 @@ class SaveCombineOpKernel : public framework::OpKernel<T> {
inp_vars[i],
platform::errors::InvalidArgument("Cannot find variable %s to save.",
inp_var_names[i]));
PADDLE_ENFORCE_EQ(inp_vars[i]->IsType<framework::LoDTensor>(), true,
PADDLE_ENFORCE_EQ(inp_vars[i]->IsType<framework::LoDTensor>() ||
inp_vars[i]->IsType<framework::Vocab>(),
true,
platform::errors::InvalidArgument(
"SaveCombine operator only supports saving "
"LoDTensor variable, %s has wrong type.",
"LoDTensor or Vocab variable, %s has wrong type.",
inp_var_names[i]));
auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(
tensor.IsInitialized(), true,
platform::errors::InvalidArgument(
"The Tensor of Variable(%s) to be saved is not initialized.",
inp_var_names[i]));
// Serialize tensors one by one
// Check types to see if a fp16 transformation is required
auto in_dtype = tensor.type();
auto out_dtype =
save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (inp_vars[i]->IsType<framework::LoDTensor>()) {
auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(
tensor.IsInitialized(), true,
platform::errors::InvalidArgument(
"The Tensor of Variable(%s) to be saved is not initialized.",
inp_var_names[i]));
// Serialize tensors one by one
// Check types to see if a fp16 transformation is required
auto in_dtype = tensor.type();
auto out_dtype =
save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
framework::SerializeToStream(ss, out, dev_ctx);
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::TransDataType(in_kernel_type, out_kernel_type, tensor,
&out);
framework::SerializeToStream(ss, out, dev_ctx);
} else {
framework::SerializeToStream(ss, tensor, dev_ctx);
}
} else {
framework::SerializeToStream(ss, tensor, dev_ctx);
auto &tensor = inp_vars[i]->Get<framework::Vocab>();
std::unordered_map<std::string, std::int32_t> data;
for (auto it = tensor.begin(); it != tensor.end(); ++it) {
std::string t;
framework::ConvertWstrToStr(it->first, &t);
data.emplace(t, it->second);
}
framework::StringMapToStream(ss, data);
}
}
if (save_to_memory) {
......
include(operators)
if(WITH_UNITY_BUILD)
    # Load Unity Build rules for operators in paddle/fluid/operators/string.
    include(unity_build_rule.cmake)
endif()
register_operators(DEPS op_version_registry utf8proc string_array)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <utf8proc.h>
#include <algorithm>
#include <chrono>
#include <codecvt>
#include <fstream>
#include <iostream>
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/algorithm/string.hpp>
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/operators/string/faster_tokenizer_op.h"
namespace paddle {
namespace operators {
using std::bad_cast;
using std::codecvt_utf8;
using std::endl;
using std::exception;
using std::ifstream;
using std::int64_t;
using std::min;
using std::runtime_error;
using std::unordered_map;
using std::unordered_set;
using std::shared_ptr;
using std::size_t;
using std::string;
using std::vector;
using std::wstring;

// Whitespace characters treated as token separators by the basic tokenizer.
const wstring kStripChars = L" \t\n\r\v\f";
// Returns true if `ch` is a control character (Unicode categories Cc/Cf).
// '\t', '\n' and '\r' are explicitly excluded: BERT treats them as
// whitespace, not control characters.
inline bool IsControl(const wchar_t& ch) {
  if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false;
  auto cat = utf8proc_category(ch);
  if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true;
  return false;
}
// Returns true if `ch` lies in a CJK ideograph block: the Unified
// Ideographs range, extensions A-F, or the compatibility ideographs.
// NOTE: ranges above 0xFFFF are only reachable where wchar_t is 32-bit.
inline bool IsChineseChar(const wchar_t& ch) {
  return (ch >= 0x4E00 && ch <= 0x9FFF) || (ch >= 0x3400 && ch <= 0x4DBF) ||
         (ch >= 0x20000 && ch <= 0x2A6DF) || (ch >= 0x2A700 && ch <= 0x2B73F) ||
         (ch >= 0x2B740 && ch <= 0x2B81F) || (ch >= 0x2B820 && ch <= 0x2CEAF) ||
         (ch >= 0xF900 && ch <= 0xFAFF) || (ch >= 0x2F800 && ch <= 0x2FA1F);
}
// Returns true for ' ', '\t', '\n', '\r', and any code point in the
// Unicode "Separator, space" (Zs) category.
inline bool IsWhiteSpace(const wchar_t& ch) {
  if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true;
  auto cat = utf8proc_category(ch);
  if (cat == UTF8PROC_CATEGORY_ZS) return true;
  return false;
}
// Returns true if `ch` is punctuation. Like BERT's reference tokenizer,
// all non-alphanumeric ASCII characters are treated as punctuation
// (ranges 33-47, 58-64, 91-96, 123-126) even when Unicode classifies
// them otherwise (e.g. '$', '^', '`'); beyond ASCII, the Unicode P*
// categories decide.
inline bool IsPunctuation(const wchar_t& ch) {
  if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) ||
      (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126))
    return true;
  auto cat = utf8proc_category(ch);
  if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS ||
      cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC ||
      cat == UTF8PROC_CATEGORY_PO  // some symbols (e.g. '¶') are category So
      || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF)
    return true;
  return false;
}
// Basic (pre-wordpiece) tokenizer. When do_lower_case is true, every code
// point is lowercased via utf8proc before splitting.
BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */)
    : do_lower_case_(do_lower_case) {}
// Lowercases a single code point using utf8proc's Unicode-aware mapping.
wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const {
  return utf8proc_tolower(ch);
}
// Splits a UTF-8 string into whitespace-delimited wide-string tokens:
// drops control/invalid characters, optionally lowercases, and isolates
// every CJK character and punctuation mark as its own token.
void BasicTokenizer::Tokenize(const string& text, vector<wstring>* res) const {
  std::wstring unicode_text;
  bool status = framework::ConvertStrToWstr(text, &unicode_text);
  if (!status) {
    // Conversion to wstring failed: emit no tokens at all.
    return;
  }
  std::wstring dest_text;
  for (auto ch : unicode_text) {
    // Drop NUL, the Unicode replacement character, and control characters.
    if (ch == 0 || ch == 0xfffd || IsControl(ch)) {
      continue;
    }
    if (do_lower_case_) {
      ch = do_lower_case(ch);
    }
    if (IsChineseChar(ch) || IsPunctuation(ch)) {
      // Surround with spaces so these always split into 1-char tokens.
      dest_text += ' ';
      dest_text += ch;
      dest_text += ' ';
    } else if (IsWhiteSpace(ch)) {
      dest_text += ' ';
    } else {
      dest_text += ch;
    }
  }
  // NOTE: boost::split produces empty tokens for consecutive separators;
  // downstream callers (BertTokenizer::Tokenize) skip zero-length tokens.
  boost::split(*res, dest_text, boost::is_any_of(kStripChars));
}
// WordPiece tokenizer over a shared vocab (not owned). Words longer than
// max_input_chars_per_word collapse to the unk token.
// NOTE(review): operator[] inserts unk_token with id 0 if it is missing
// from the vocab — confirm callers always provide a vocab containing it.
WordPieceTokenizer::WordPieceTokenizer(
    framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/,
    const size_t max_input_chars_per_word /* = 100 */)
    : vocab_(vocab),
      unk_token_(unk_token),
      max_input_chars_per_word_(max_input_chars_per_word) {
  unk_token_id_ = (*vocab_)[unk_token_];
}
// Tokenizes one (already whitespace-split) word into wordpiece ids using
// greedy longest-match-first lookup. A word that is too long, or that
// contains any unmatchable piece, is emitted as a single unk id.
// Appends to *token_ids; never clears it.
void WordPieceTokenizer::Tokenize(const wstring& text,
                                  vector<int64_t>* token_ids) const {
  size_t len = text.size();
  if (len > max_input_chars_per_word_) {
    token_ids->emplace_back(unk_token_id_);
    return;
  }
  // Fast path: the whole word is in the vocab.
  auto it = vocab_->find(text);
  if (it != vocab_->end()) {
    token_ids->emplace_back(it->second);
    return;
  }

  size_t start = 0;
  vector<int64_t> wordpiece_ids;
  while (start < len) {
    size_t end = len;
    std::wstring cur_substr;
    int64_t cur_substr_id = 0;
    // Greedily shrink the window until the longest in-vocab piece is
    // found; non-initial pieces carry the "##" continuation prefix.
    while (start < end) {
      std::wstring sub = text.substr(start, end - start);
      if (start > 0) {
        sub = L"##" + sub;
      }
      auto it = vocab_->find(sub);
      if (it != vocab_->end()) {
        cur_substr = sub;
        cur_substr_id = it->second;
        break;
      }
      end -= 1;
    }

    if (cur_substr.empty()) {
      // No piece matched: the whole word degenerates to a single unk id
      // (any pieces matched so far are deliberately discarded, matching
      // the reference WordPiece algorithm).
      token_ids->emplace_back(unk_token_id_);
      return;
    } else {
      start = end;
      wordpiece_ids.emplace_back(cur_substr_id);
    }
  }
  token_ids->insert(token_ids->end(), wordpiece_ids.begin(),
                    wordpiece_ids.end());
}
// End-to-end BERT tokenizer: basic tokenization followed by wordpiece.
// The vocab pointer is shared, not owned, and must outlive this object.
// NOTE(review): operator[] inserts any missing special token with id 0 —
// confirm the vocab always contains all five special tokens.
BertTokenizer::BertTokenizer(framework::Vocab* vocab,
                             bool do_lower_case /* = false */,
                             const wstring& unk_token /* = L"[UNK]" */,
                             const wstring& pad_token /* = L"[PAD]" */,
                             const wstring& cls_token /* = L"[CLS]" */,
                             const wstring& mask_token /* = L"[MASK]" */,
                             const wstring& sep_token /* = L"[SEP]" */,
                             const string& padding_site /* = "right" */)
    : do_lower_case_(do_lower_case),
      unk_token_(unk_token),
      pad_token_(pad_token),
      cls_token_(cls_token),
      mask_token_(mask_token),
      sep_token_(sep_token),
      padding_site_(padding_site),
      vocab_(vocab),
      basic_tokenizer_(do_lower_case_),
      word_piece_tokenizer_(vocab_, unk_token) {
  // Resolve the ids of all special tokens once, up front.
  unk_token_id_ = (*vocab_)[unk_token_];
  pad_token_id_ = (*vocab_)[pad_token_];
  cls_token_id_ = (*vocab_)[cls_token_];
  mask_token_id_ = (*vocab_)[mask_token_];
  sep_token_id_ = (*vocab_)[sep_token_];
  all_special_tokens_ = vector<wstring>(
      {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_});
  all_special_token_ids_ =
      unordered_set<int64_t>({unk_token_id_, pad_token_id_, cls_token_id_,
                              mask_token_id_, sep_token_id_});
}
// Converts a raw string to token ids: basic-tokenize into words, then look
// up 1-char CJK tokens directly and send everything else through wordpiece.
// Appends to *split_token_ids; empty tokens from the splitter are skipped.
void BertTokenizer::Tokenize(const string& text,
                             vector<int64_t>* split_token_ids) const {
  std::vector<std::wstring> tmp_tokens;
  basic_tokenizer_.Tokenize(text, &tmp_tokens);
  if (tmp_tokens.empty()) return;
  split_token_ids->reserve(tmp_tokens.size());
  for (auto& w_token : tmp_tokens) {
    const size_t vec_size = w_token.size();
    if (vec_size == 1) {
      if (IsChineseChar(w_token[0])) {
        // A single CJK character maps straight to its vocab id (or unk).
        auto vocab_it = vocab_->find(w_token);
        if (vocab_it != vocab_->end()) {
          split_token_ids->emplace_back(vocab_it->second);
        } else {
          split_token_ids->emplace_back(unk_token_id_);
        }
      } else {
        word_piece_tokenizer_.Tokenize(w_token, split_token_ids);
      }
    } else if (vec_size > 1) {
      word_piece_tokenizer_.Tokenize(w_token, split_token_ids);
    }
    // vec_size == 0: empty token from consecutive separators; skip it.
  }
}
// Assembles the final input sequence with special tokens:
//   single sequence: [CLS] ids_0 [SEP]
//   sequence pair:   [CLS] ids_0 [SEP] ids_1 [SEP]
// *inputs is overwritten.
void BertTokenizer::BuildInputsWithSpecialTokens(
    vector<int64_t>* inputs, const vector<int64_t>& token_ids_0,
    const vector<int64_t>& token_ids_1 /* = vector<int64_t>() */) const {
  inputs->clear();
  if (token_ids_1.empty()) {
    inputs->reserve(token_ids_0.size() + 2);
    inputs->push_back(cls_token_id_);
    inputs->insert(inputs->end(), token_ids_0.begin(), token_ids_0.end());
    inputs->push_back(sep_token_id_);
  } else {
    inputs->reserve(token_ids_0.size() + token_ids_1.size() + 3);
    inputs->push_back(cls_token_id_);
    inputs->insert(inputs->end(), token_ids_0.begin(), token_ids_0.end());
    inputs->push_back(sep_token_id_);
    inputs->insert(inputs->end(), token_ids_1.begin(), token_ids_1.end());
    inputs->push_back(sep_token_id_);
  }
}
// Number of special tokens added around the input: [CLS] + [SEP] for a
// single sequence (2), plus one more [SEP] for a pair (3).
int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const {
  return pair ? 3 : 2;
}
// Builds segment ids matching BuildInputsWithSpecialTokens's layout:
// zeros over [CLS] ids_0 [SEP], ones over ids_1 [SEP] (when present).
// *token_type_ids is overwritten.
void BertTokenizer::CreateTokenTypeIdsFromSequences(
    vector<int64_t>* token_type_ids, const vector<int64_t>& token_ids_0,
    const vector<int64_t>& token_ids_1 /* = vector<int64_t>() */) const {
  const size_t seg0_len = token_ids_0.size() + 2;  // [CLS] ids_0 [SEP]
  if (token_ids_1.empty()) {
    vector<int64_t> ids(seg0_len, 0);
    token_type_ids->swap(ids);
  } else {
    // Total length: [CLS] ids_0 [SEP] ids_1 [SEP].
    vector<int64_t> ids(seg0_len + token_ids_1.size() + 1, 1);
    std::fill(ids.begin(), ids.begin() + seg0_len, 0);
    token_type_ids->swap(ids);
  }
}
// Removes num_tokens_to_remove tokens, always trimming the tail of the
// currently-longer sequence ("longest first" strategy). `stride` is
// accepted for interface compatibility but unused.
// Fix: guard pop_back() so removing more tokens than exist is a no-op
// instead of undefined behavior on an empty vector.
void BertTokenizer::TruncateSequence(
    vector<int64_t>* ids, vector<int64_t>* pair_ids,
    const size_t num_tokens_to_remove /* = 0 */,
    const size_t stride /* = 0 */) const {
  for (size_t i = 0; i < num_tokens_to_remove; i++) {
    if (pair_ids->empty() || (ids->size() > pair_ids->size())) {
      if (ids->empty()) {
        return;  // nothing left to remove in either sequence
      }
      ids->pop_back();
    } else {
      pair_ids->pop_back();
    }
  }
}
// Vocab id of the [PAD] token (used by the kernel to pad batches).
int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; }
// Encodes one sequence (or sequence pair) into the "input_ids" and
// "token_type_ids" entries of *encoded_inputs.
// Returns 1 on success, 0 on failure (tokenization produced no tokens,
// the text could not be converted to unicode, or the final sequence still
// exceeds max_seq_len after truncation).
int BertTokenizer::Encode(
    unordered_map<string, vector<int64_t>>* encoded_inputs, const string& text,
    const string& text_pair /* = "" */, bool is_split_into_words /* = false */,
    const size_t max_seq_len /* = 0 */,
    bool pad_to_max_seq_len /* = false */) const {
  vector<int64_t> ids;
  vector<int64_t> pair_ids;
  if (!is_split_into_words) {
    Tokenize(text, &ids);
    if (ids.empty()) return 0;
    if (text_pair != "") {
      Tokenize(text_pair, &pair_ids);
      if (pair_ids.empty()) return 0;
    }
  } else {
    // Pre-tokenized mode: every single character becomes one token that is
    // looked up directly in the vocab (text_pair is ignored in this mode).
    std::wstring unicode_text;
    bool status_a = framework::ConvertStrToWstr(text, &unicode_text);
    if (!status_a) {
      return 0;
    }
    for (size_t i = 0; i < unicode_text.size(); i++) {
      wstring token = unicode_text.substr(i, 1);
      auto it = vocab_->find(token);
      if (it != vocab_->end()) {
        ids.emplace_back(std::move(it->second));
      } else {
        ids.emplace_back(std::move(unk_token_id_));
      }
    }
  }

  bool pair = false;
  if (pair_ids.size() != 0) {
    pair = true;
  }

  size_t len_ids = ids.size();
  size_t len_pair_ids = pair_ids.size();

  // Truncation: Handle max sequence length
  // If max_seq_len == 0, then do nothing and keep the real length.
  // If max_seq_len > 0 and
  // all the input sequence len is over the max_seq_len,
  // then we truncate it.
  size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair);
  if (max_seq_len > 0 && total_len > max_seq_len) {
    TruncateSequence(&ids, &pair_ids, total_len - max_seq_len);
  }

  // Add special tokens: [CLS] ids [SEP] (pair_ids [SEP]).
  vector<int64_t> sequence;
  BuildInputsWithSpecialTokens(&sequence, ids, pair_ids);
  size_t seq_len = sequence.size();
  vector<int64_t> token_type_ids;
  CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids);

  // Build output dictionnary
  encoded_inputs->emplace("input_ids", sequence);
  encoded_inputs->emplace("token_type_ids", token_type_ids);

  // Check lengths
  if (max_seq_len > 0 && seq_len > max_seq_len) {
    VLOG(3) << "There is something wrong with the input sequence length."
               " Please check it.";
    // Failed.
    return 0;
  }

  // Padding (right-pad only; padding_site_ is not consulted here).
  bool needs_to_be_padded = false;
  if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) {
    needs_to_be_padded = true;
  }

  if (needs_to_be_padded) {
    // Pads indices [seq_len, max_seq_len) of both outputs.
    // NOTE(review): token_type_ids is padded with pad_token_id_ rather than
    // the conventional 0 segment id; the kernel pads the same way, so the
    // two are at least mutually consistent — confirm intent before changing.
    int64_t difference = max_seq_len - seq_len;
    size_t pad_start = max_seq_len - 1 - difference;
    encoded_inputs->at("token_type_ids").resize(max_seq_len);
    for (size_t i = max_seq_len - 1; i > pad_start; i--) {
      encoded_inputs->at("token_type_ids")[i] = pad_token_id_;
    }

    encoded_inputs->at("input_ids").resize(max_seq_len);
    for (size_t i = max_seq_len - 1; i > pad_start; i--) {
      encoded_inputs->at("input_ids")[i] = pad_token_id_;
    }
  }
  return 1;
}
// Encodes a batch of sequences (optionally paired).
// Precondition: *batch_encode_inputs is already sized to batch_text.size()
// (entry i is overwritten via at(i)). When a sample fails to encode, a
// minimal placeholder sequence of special tokens is substituted.
// NOTE(review): the pair-failure placeholder is {cls, sep, cls} — a second
// cls where one might expect sep; confirm this is intentional.
void BertTokenizer::BatchEncode(
    vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
    const vector<string>& batch_text,
    const vector<string>& batch_text_pair /* = vector<string>() */,
    bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */,
    bool pad_to_max_seq_len /* = false */) const {
  bool has_text_pair = false;
  if (batch_text_pair.size() != 0) {
    has_text_pair = true;
  }

  size_t batch_size = batch_text.size();
// Safe to parallelize: each iteration only touches element i.
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
  for (size_t i = 0; i < batch_size; i++) {
    unordered_map<string, vector<int64_t>> res;
    if (has_text_pair) {
      auto status =
          Encode(&res, batch_text[i], batch_text_pair[i], is_split_into_words,
                 max_seq_len, pad_to_max_seq_len);
      if (!status) {
        res["input_ids"] =
            std::vector<int64_t>{cls_token_id_, sep_token_id_, cls_token_id_};
        res["token_type_ids"] = std::vector<int64_t>{0, 0, 1};
      }
    } else {
      auto status = Encode(&res, batch_text[i], {}, is_split_into_words,
                           max_seq_len, pad_to_max_seq_len);
      if (!status) {
        res["input_ids"] = std::vector<int64_t>{cls_token_id_, sep_token_id_};
        res["token_type_ids"] = std::vector<int64_t>{0, 0};
      }
    }
    batch_encode_inputs->at(i) = std::move(res);
  }
}
// Operator definition for faster_tokenizer: validates inputs/outputs and
// pins the kernel to int64 on CPU (tokenization is string processing and
// has no device kernels).
class FasterTokenizerOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Text"), "Input", "Text", "Tokenizer");
    OP_INOUT_CHECK(ctx->HasInput("Vocab"), "Input", "Vocab", "Tokenizer");
    OP_INOUT_CHECK(ctx->HasOutput("InputIds"), "Output", "InputIds",
                   "Tokenizer");
    OP_INOUT_CHECK(ctx->HasOutput("SegmentIds"), "Output", "SegmentIds",
                   "Tokenizer");

    // Both output dims depend on the batch contents; only known at run time.
    ctx->SetOutputDim("InputIds", {-1, -1});
    ctx->SetOutputDim("SegmentIds", {-1, -1});
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::INT64,
                                   paddle::platform::CPUPlace());
  }

  // Keep the expected data type/place regardless of the variable's own
  // type (inputs are strings/vocab, not tensors).
  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const framework::Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override {
    return framework::OpKernelType(expected_kernel_type.data_type_,
                                   expected_kernel_type.place_,
                                   tensor.layout());
  }
};
// Proto maker for faster_tokenizer: declares inputs, outputs, attributes
// and the user-visible op documentation.
class FasterTokenizerOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Vocab",
             "(std::map<std::wstring, std::int>), The vocab to map "
             "token string to token id.");
    AddInput("Text",
             "(std::vector<std::string>), The sequence to be processed. "
             "One sequence is a string, a list of strings, "
             "or a list of integers depending on whether it "
             "has been pretokenized and converted to ids. ");
    AddInput("TextPair",
             "(std::vector<std::string>), Same as `text` argument, "
             "while it represents for the latter sequence of the "
             "sequence pair.")
        .AsDispensable();
    AddOutput("InputIds", "(Tensor), The token ids of the input text.");
    AddOutput("SegmentIds", "(Tensor), The segments ids of the input text.");
    AddAttr<bool>(
        "do_lower_case",
        "(bool), Whether or not to lowercase the input when tokenizing.")
        .SetDefault(false);
    AddAttr<bool>(
        "is_split_into_words",
        "(bool), Whether or not the input is already pre-tokenized "
        "(e.g., split into words). If set to True, the tokenizer "
        "assumes the input is already split into words (for instance, "
        "by splitting it on whitespace) which it will tokenize. This "
        "is useful for NER or token classification.")
        .SetDefault(false);
    AddAttr<int>("max_seq_len",
                 "(int), If set to a positive number, will limit the "
                 "total sequence returned so that it has a maximum length."
                 " If there are overflowing tokens, those overflowing "
                 "tokens will be added to the returned dictionary when "
                 "`return_overflowing_tokens` is `True`.")
        .SetDefault(0);
    AddAttr<bool>("pad_to_max_seq_len",
                  "(bool), If set to `True`, the returned sequences would be"
                  " padded up to `max_seq_len` specified length according to"
                  " padding side and padding token id.")
        .SetDefault(false);
    // Fix: the previous raw string carried stray '"' characters and
    // indentation (leftover string-concatenation syntax inside R"DOC()DOC"),
    // which leaked verbatim into the generated op documentation.
    AddComment(R"DOC(
FasterTokenizer Operator.

Performs tokenization and uses the tokenized tokens to prepare model inputs.
It supports sequence or sequence pair as input, and batch input is not
allowed.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// faster_tokenizer is CPU-only and always produces int64 token ids, so a
// single int64 CPU kernel is registered.
REGISTER_OPERATOR(faster_tokenizer, ops::FasterTokenizerOp,
                  ops::FasterTokenizerOpMaker);
REGISTER_OP_CPU_KERNEL(faster_tokenizer, ops::FasterTokenizerKernel<int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utf8proc.h>

#include <algorithm>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/string_array.h"
namespace paddle {
namespace operators {
using std::endl;
using std::int64_t;
using std::size_t;
using std::string;
using std::shared_ptr;
using std::vector;
using std::unordered_map;
using std::unordered_set;
using std::wstring;
using std::wcout;

// Character-class helpers implemented in faster_tokenizer_op.cc.
inline bool IsControl(const wchar_t& ch);
inline bool IsChineseChar(const wchar_t& ch);
inline bool IsWhiteSpace(const wchar_t& ch);

// Token string -> token id mapping, and its inverse.
using Vocab = unordered_map<wstring, int>;
using InvVocab = unordered_map<int, wstring>;
// First tokenization pass: cleans the text, optionally lowercases it, and
// splits on whitespace / CJK characters / punctuation.
class BasicTokenizer {
 public:
  explicit BasicTokenizer(bool do_lower_case = true);
  // Splits `text` into wide-string tokens appended to *res.
  void Tokenize(const string& text, vector<wstring>* res) const;

 private:
  // Lowercases a single code point (utf8proc-based).
  wchar_t do_lower_case(wchar_t ch) const;

  bool do_lower_case_;
};
// Second tokenization pass: greedy longest-match-first wordpiece lookup
// against a shared vocab (not owned).
class WordPieceTokenizer {
 public:
  explicit WordPieceTokenizer(framework::Vocab* vocab,
                              const wstring& unk_token = L"[UNK]",
                              const size_t max_input_chars_per_word = 100);
  // Appends the wordpiece ids of one word to *output.
  void Tokenize(const wstring& text, vector<int64_t>* output) const;

 private:
  framework::Vocab* vocab_;          // shared, not owned
  wstring unk_token_{L"[UNK]"};
  int64_t unk_token_id_;
  size_t max_input_chars_per_word_;  // longer words collapse to unk
};
// End-to-end BERT tokenizer: basic tokenization followed by wordpiece,
// plus special-token insertion, truncation, segment ids and padding.
class BertTokenizer {
 public:
  explicit BertTokenizer(framework::Vocab* vocab, bool do_lower_case = false,
                         const wstring& unk_token = L"[UNK]",
                         const wstring& pad_token = L"[PAD]",
                         const wstring& cls_token = L"[CLS]",
                         const wstring& mask_token = L"[MASK]",
                         const wstring& sep_token = L"[SEP]",
                         const string& padding_site = "right");

  // Raw text -> token ids (appended to *split_tokens).
  void Tokenize(const string& text, vector<int64_t>* split_tokens) const;
  // [CLS] ids_0 [SEP] (ids_1 [SEP]); overwrites *res.
  void BuildInputsWithSpecialTokens(
      vector<int64_t>* res, const vector<int64_t>& token_ids_0,
      const vector<int64_t>& token_ids_1 = vector<int64_t>()) const;
  // Segment ids matching the layout above; overwrites *token_type_ids.
  void CreateTokenTypeIdsFromSequences(
      vector<int64_t>* token_type_ids, const vector<int64_t>& token_ids_0,
      const vector<int64_t>& token_ids_1 = vector<int64_t>()) const;
  // Removes tokens from the tail of the longer sequence ("longest first").
  void TruncateSequence(vector<int64_t>* ids, vector<int64_t>* pair_ids,
                        const size_t num_tokens_to_remove = 0,
                        const size_t stride = 0) const;
  // 2 for a single sequence, 3 for a pair.
  int64_t GetNumSpecialTokensToAdd(const bool pair = false) const;
  // Full encode of one sample; returns 1 on success, 0 on failure.
  int Encode(unordered_map<string, vector<int64_t>>* encoded_inputs,
             const string& text, const string& text_pair = "",
             bool is_split_into_words = false, const size_t max_seq_len = 0,
             bool pad_to_max_seq_len = false) const;
  // Encodes a whole batch; *batch_encode_inputs must be pre-sized.
  void BatchEncode(
      vector<unordered_map<string, vector<int64_t>>>* batch_encode_inputs,
      const vector<string>& batch_text,
      const vector<string>& batch_text_pair = vector<string>(),
      bool is_split_into_words = false, const size_t max_seq_len = 0,
      bool pad_to_max_seq_len = false) const;

  int64_t GetPadTokenID() const;

 private:
  bool do_lower_case_;
  wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_;
  string padding_site_;      // declared but padding is currently right-only
  framework::Vocab* vocab_;  // shared, not owned
  BasicTokenizer basic_tokenizer_;
  WordPieceTokenizer word_piece_tokenizer_;
  int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_,
      sep_token_id_;
  vector<wstring> all_special_tokens_;
  unordered_set<int64_t> all_special_token_ids_;
  InvVocab inv_vocab_;
};
// CPU kernel: tokenizes a batch of strings with BertTokenizer and writes
// two [batch_size, batch_max_seq_len] int64 tensors (InputIds, SegmentIds),
// right-padding every row with the [PAD] token id.
template <typename T>
class FasterTokenizerKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* text = ctx.Input<framework::Strings>("Text");
    auto* vocab = ctx.Input<framework::Vocab>("Vocab");
    auto* input_ids = ctx.Output<framework::Tensor>("InputIds");
    auto* seg_ids = ctx.Output<framework::Tensor>("SegmentIds");

    auto do_lower_case = static_cast<bool>(ctx.Attr<bool>("do_lower_case"));
    auto is_split_into_words =
        static_cast<bool>(ctx.Attr<bool>("is_split_into_words"));
    auto max_seq_len = static_cast<size_t>(ctx.Attr<int>("max_seq_len"));
    auto pad_to_max_seq_len =
        static_cast<bool>(ctx.Attr<bool>("pad_to_max_seq_len"));

    auto* text_pair = ctx.Input<framework::Strings>("TextPair");
    if (text_pair && text->size() != text_pair->size()) {
      VLOG(3) << "The input text(list[str]) and text pair (list[str]) must"
              << "be the same number of text sequence. Please check the input!";
      return;
    }

    // Stack-allocated tokenizer: the previous raw new/delete pair leaked
    // if anything below threw.
    BertTokenizer tokenizer(const_cast<framework::Vocab*>(vocab),
                            do_lower_case);
    size_t batch_max_seq_len = 0;
    size_t batch_size = text->size();

    vector<unordered_map<string, vector<int64_t>>> batch_encode_inputs(
        batch_size);
    if (text_pair) {
      tokenizer.BatchEncode(&batch_encode_inputs, *text, *text_pair,
                            is_split_into_words, max_seq_len,
                            pad_to_max_seq_len);
    } else {
      tokenizer.BatchEncode(&batch_encode_inputs, *text, vector<string>(),
                            is_split_into_words, max_seq_len,
                            pad_to_max_seq_len);
    }

    // The output width is the longest encoded sequence in the batch.
    for (size_t i = 0; i < batch_size; ++i) {
      size_t seq_len = batch_encode_inputs[i]["input_ids"].size();
      if (seq_len > batch_max_seq_len) {
        batch_max_seq_len = seq_len;
      }
    }

    input_ids->Resize(
        framework::make_ddim({static_cast<int64_t>(batch_size),
                              static_cast<int64_t>(batch_max_seq_len)}));
    auto* input_ids_data = input_ids->mutable_data<T>(ctx.GetPlace());
    seg_ids->Resize(
        framework::make_ddim({static_cast<int64_t>(batch_size),
                              static_cast<int64_t>(batch_max_seq_len)}));
    auto* seg_ids_data = seg_ids->mutable_data<T>(ctx.GetPlace());

    auto pad_token_id = tokenizer.GetPadTokenID();
    for (size_t i = 0; i < batch_size; i++) {
      auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"];
      auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"];
      const size_t seq_len = encoder_input_ids.size();
      // Copy the real tokens, then pad the row's tail with pad_token_id.
      std::copy_n(encoder_input_ids.data(), seq_len,
                  input_ids_data + i * batch_max_seq_len);
      std::copy_n(encoder_seg_ids.data(), seq_len,
                  seg_ids_data + i * batch_max_seq_len);
      // Fix: std::memset wrote pad_token_id byte-wise into T-sized slots,
      // producing garbage for any pad id other than 0; std::fill_n writes
      // whole T values.
      std::fill_n(input_ids_data + i * batch_max_seq_len + seq_len,
                  batch_max_seq_len - seq_len, static_cast<T>(pad_token_id));
      std::fill_n(seg_ids_data + i * batch_max_seq_len + seq_len,
                  batch_max_seq_len - seq_len, static_cast<T>(pad_token_id));
    }
  }
};
} // namespace operators
} // namespace paddle
# This file records the Unity Build compilation rules.
# The source files in a `register_unity_group` call are compiled into a
# single unity file.
# Generally, the combination rules in this file do not need to be modified.
# If a redefinition error occurs when compiling a source file included in a
# combination rule, remove that source file from the following rules.
register_unity_group(cc
    faster_tokenizer_op.cc)
\ No newline at end of file
......@@ -1875,6 +1875,12 @@ void BindImperative(py::module *m_ptr) {
} else if (self.Var().IsType<framework::SelectedRows>()) {
return framework::vectorize<int>(
self.Var().Get<framework::SelectedRows>().value().dims());
} else if (self.Var().IsType<framework::Strings>()) {
return std::vector<int>{static_cast<int>(
self.Var().Get<framework::Strings>().size())};
} else if (self.Var().IsType<framework::Vocab>()) {
return std::vector<int>{
static_cast<int>(self.Var().Get<framework::Vocab>().size())};
} else {
VLOG(2) << "It is meaningless to get shape of "
"variable type "
......
......@@ -185,6 +185,18 @@ void ZeroCopyTensorCreate(
tensor.copy_from_cpu(static_cast<const T *>(data.data()));
}
/// \brief Experimental interface.
/// Fill a ZeroCopyTensor with the given list of strings.
/// \param tensor The tensor to be reshaped and populated; its value
///        becomes the same as \p data.
/// \param data The input text, one entry per string.
void ZeroCopyStringTensorCreate(ZeroCopyTensor &tensor,  // NOLINT
                                const paddle_infer::Strings *data) {
  const size_t string_num = data->size();
  tensor.ReshapeStrings(string_num);
  tensor.copy_strings_from_cpu(data);
}
template <typename T>
void PaddleInferTensorCreate(
paddle_infer::Tensor &tensor, // NOLINT
......@@ -195,6 +207,19 @@ void PaddleInferTensorCreate(
tensor.CopyFromCpu(static_cast<const T *>(data.data()));
}
/// \brief Experimental interface.
/// Fill a paddle_infer::Tensor with the given list of strings.
/// \param tensor The tensor to be reshaped and populated; its value
///        becomes the same as \p data.
/// \param data The input text, one entry per string.
void PaddleInferStringTensorCreate(paddle_infer::Tensor &tensor,  // NOLINT
                                   const paddle_infer::Strings *data) {
  VLOG(3) << "Create PaddleInferTensor, dtype = Strings ";
  const size_t string_num = data->size();
  tensor.ReshapeStrings(string_num);
  tensor.CopyStringsFromCpu(data);
}
size_t PaddleGetDTypeSize(PaddleDType dt) {
size_t size{0};
switch (dt) {
......@@ -726,11 +751,15 @@ void BindPaddleInferPredictor(py::module *m) {
void BindZeroCopyTensor(py::module *m) {
py::class_<ZeroCopyTensor>(*m, "ZeroCopyTensor")
.def("reshape", &ZeroCopyTensor::Reshape)
.def("reshape", py::overload_cast<const std::vector<int> &>(
&ZeroCopyTensor::Reshape))
.def("reshape", py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu", &ZeroCopyTensorCreate<int32_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<int64_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<float>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<paddle_infer::float16>)
.def("copy_from_cpu", &ZeroCopyStringTensorCreate)
.def("copy_to_cpu", &ZeroCopyTensorToNumpy)
.def("shape", &ZeroCopyTensor::shape)
.def("set_lod", &ZeroCopyTensor::SetLoD)
......@@ -740,12 +769,16 @@ void BindZeroCopyTensor(py::module *m) {
void BindPaddleInferTensor(py::module *m) {
py::class_<paddle_infer::Tensor>(*m, "PaddleInferTensor")
.def("reshape", &paddle_infer::Tensor::Reshape)
.def("reshape", py::overload_cast<const std::vector<int> &>(
&paddle_infer::Tensor::Reshape))
.def("reshape", py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int32_t>)
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int64_t>)
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<float>)
.def("copy_from_cpu_bind",
&PaddleInferTensorCreate<paddle_infer::float16>)
.def("copy_from_cpu_bind", &PaddleInferStringTensorCreate)
.def("copy_to_cpu", &PaddleInferTensorToNumpy)
.def("shape", &paddle_infer::Tensor::shape)
.def("set_lod", &paddle_infer::Tensor::SetLoD)
......
......@@ -68,6 +68,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
{"faster_tokenizer", {"Text", "Vocab", "TextPair"}},
{"matrix_rank", {"X", "TolTensor"}},
{"adam",
{"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow",
......
......@@ -227,7 +227,10 @@ void BindVarDsec(pybind11::module *m) {
.value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
.value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
.value("READER", pd::proto::VarType::READER)
.value("RAW", pd::proto::VarType::RAW);
.value("RAW", pd::proto::VarType::RAW)
.value("STRING", pd::proto::VarType::STRING)
.value("STRINGS", pd::proto::VarType::STRINGS)
.value("VOCAB", pd::proto::VarType::VOCAB);
}
void BindOpDesc(pybind11::module *m) {
......
......@@ -1239,6 +1239,18 @@ All parameter, weight, gradient are variables in Paddle.
[](Variable &self) {
return py::bytes(*self.GetMutable<std::string>());
})
.def("set_string_list",
[](Variable &self, Strings str_list) {
*self.GetMutable<Strings>() = str_list;
})
.def("set_vocab", [](Variable &self,
Vocab vocab) { *self.GetMutable<Vocab>() = vocab; })
.def("get_string_tensor",
[](Variable &self) { return self.GetMutable<Strings>(); },
py::return_value_policy::reference)
.def("get_map_tensor",
[](Variable &self) { return self.GetMutable<Vocab>(); },
py::return_value_policy::reference)
.def("get_lod_rank_table",
[](Variable &self) { return self.GetMutable<LoDRankTable>(); },
py::return_value_policy::reference)
......@@ -1872,20 +1884,20 @@ All parameter, weight, gradient are variables in Paddle.
.def("__str__", string::to_string<const platform::Place &>);
py::class_<OperatorBase>(m, "Operator")
.def_static("create",
[](py::bytes protobin) {
proto::OpDesc desc;
PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin),
true,
platform::errors::InvalidArgument(
"Cannot parse user input to OpDesc"));
PADDLE_ENFORCE_EQ(desc.IsInitialized(), true,
platform::errors::InvalidArgument(
"The provided OpDesc is not "
"initialized, the reason is: %s",
desc.InitializationErrorString()));
return OpRegistry::CreateOp(desc);
})
.def_static(
"create",
[](py::bytes protobin) {
proto::OpDesc desc;
PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true,
platform::errors::InvalidArgument(
"Cannot parse user input to OpDesc"));
PADDLE_ENFORCE_EQ(
desc.IsInitialized(), true,
platform::errors::InvalidArgument(
"The provided OpDesc is not initialized, the reason is: %s",
desc.InitializationErrorString()));
return OpRegistry::CreateOp(desc);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CPUPlace &place) {
......@@ -2139,7 +2151,12 @@ All parameter, weight, gradient are variables in Paddle.
});
#endif
m.def("set_feed_variable", framework::SetFeedVariable);
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const LoDTensor &, const std::string &,
size_t)>(&framework::SetFeedVariable));
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const Strings &, const std::string &,
size_t)>(&framework::SetFeedVariable));
m.def("get_fetch_variable",
[](const Scope &scope, const std::string &var_name,
size_t index) -> py::object {
......
......@@ -799,12 +799,17 @@ def save(layer, path, input_spec=None, **configs):
# 3. share parameters from Layer to scope & record var info
for param_or_buffer in concrete_program.parameters:
# share to scope
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
#src_tensor = param_or_buffer.value().get_tensor()
src_tensor = state_var_dict[param_or_buffer.name].value(
).get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
scr_tensor = param_or_buffer.value().get_map_tensor()
tgt_var = scope.var(param_or_buffer.name)
tgt_var.set_vocab(scr_tensor)
else:
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
#src_tensor = param_or_buffer.value().get_tensor()
src_tensor = state_var_dict[param_or_buffer.name].value(
).get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
# record var info
if param_or_buffer.name not in extra_var_info:
extra_info_dict = dict()
......
......@@ -1409,13 +1409,22 @@ class Layer(core.Layer):
if state is None:
raise ValueError("{} is not found in the provided dict.".format(
key))
state_shape = state.shape() if inspect.ismethod(
state.shape) else state.shape
if list(state_shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state_shape), list(param.shape)))
return param, state
if (isinstance(state, dict) or isinstance(state, list)):
if (len(state) != len(param)):
raise ValueError("{} receieves the length of {}, "
"but the expected shape is {}".format(
key, len(state), len(param)))
else:
return param, state
else:
state_shape = state.shape() if inspect.ismethod(
state.shape) else state.shape
if list(state_shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state_shape), list(param.shape)))
return param, state
matched_param_state = []
for key, param in self.state_dict().items():
......
......@@ -133,7 +133,12 @@ def monkey_patch_math_varbase():
return int(var.numpy().flatten()[0])
def _len_(var):
return var.shape[0]
if var.type == core.VarDesc.VarType.VOCAB:
return len(var.value().get_map_tensor())
elif var.type == core.VarDesc.VarType.STRINGS:
return len(var.value().get_string_tensor())
else:
return var.shape[0]
def _index_(var):
numel = np.prod(var.shape)
......
......@@ -146,25 +146,35 @@ def monkey_patch_varbase():
out = linear(t) # call with different weight
"""
assert isinstance(value, (np.ndarray, core.VarBase)), \
"Variable set_value function, arguments type only support Variable, numpy, VarBase"
value_np = value
if isinstance(value, core.VarBase):
value_np = value.numpy()
assert isinstance(value, (np.ndarray, core.VarBase, dict, str)), \
"Variable set_value function, arguments type only support Variable, numpy, VarBase, dict, string."
if isinstance(value, (dict, str)):
assert len(self) == len(
value
), "Variable length not match, Variable [ {} ] need tensor with length {} but load set tensor with length {}".format(
self.name, len(self), len(value))
if isinstance(value, dict):
self.value().set_vocab(value)
else:
self.value().set_string_list(value)
else:
value_np = value
if isinstance(value, core.VarBase):
value_np = value.numpy()
self_tensor_np = self.numpy()
self_tensor_np = self.numpy()
assert self_tensor_np.shape == value_np.shape, \
"Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format(
self.name, self_tensor_np.shape, value_np.shape)
assert self_tensor_np.shape == value_np.shape, \
"Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format(
self.name, self_tensor_np.shape, value_np.shape)
assert self_tensor_np.dtype == value_np.dtype, \
"Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
self.name, self_tensor_np.dtype, value_np.dtype)
assert self_tensor_np.dtype == value_np.dtype, \
"Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format(
self.name, self_tensor_np.dtype, value_np.dtype)
self.value().get_tensor().set(value_np,
framework._current_expected_place())
self.value().get_tensor().set(value_np,
framework._current_expected_place())
@framework.dygraph_only
def backward(self, grad_tensor=None, retain_graph=False):
......
......@@ -792,9 +792,11 @@ class Executor(object):
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
var = global_block.var(feed_target_name)
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = _as_lodtensor(cur_feed, self.place, var.dtype)
check_feed_shape_type(var, cur_feed)
if var.dtype != core.VarDesc.VarType.STRINGS:
if not isinstance(cur_feed, core.LoDTensor):
cur_feed = _as_lodtensor(cur_feed, self.place,
var.dtype)
check_feed_shape_type(var, cur_feed)
idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
......
......@@ -979,6 +979,10 @@ class Variable(object):
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
if dtype == core.VarDesc.VarType.STRINGS:
type = core.VarDesc.VarType.STRINGS
lod_level = None
self.belong_to_optimizer = belong_to_optimizer
self.error_clip = error_clip
......
......@@ -29,10 +29,14 @@ def tensor_copy_from_cpu(self, data):
'''
Support input type check based on tensor.copy_from_cpu.
'''
if not isinstance(data, np.ndarray):
if isinstance(data, np.ndarray) or (isinstance(data, list) and
len(data) > 0 and
isinstance(data[0], str)):
self.copy_from_cpu_bind(data)
else:
raise TypeError(
"In copy_from_cpu, we only support numpy ndarray data type.")
self.copy_from_cpu_bind(data)
"In copy_from_cpu, we only support numpy ndarray and list[str] data type."
)
Tensor.copy_from_cpu = tensor_copy_from_cpu
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import io
import os
import unittest
import numpy as np
import paddle
import paddle.nn as nn
from paddle.dataset.common import DATA_HOME
from paddle.fluid.framework import core, in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
import sys
sys.path.append("./tokenizer")
from tokenizer.bert_tokenizer import BertTokenizer
def to_string_tensor(string_values, name):
    """
    Build a tensor whose value is a list of strings.
    NOTICE: The value is always held on the CPU place.
    Args:
        string_values(list[string]): The strings to store in the tensor.
        name(string): The name of the tensor.
    """
    dtype = core.VarDesc.VarType.STRING
    var_type = core.VarDesc.VarType.STRINGS
    str_tensor = paddle.Tensor(dtype, [], name, var_type, False)
    str_tensor.value().set_string_list(string_values)
    return str_tensor
def to_map_tensor(string_dict, name):
    """
    Build a tensor whose value is a map with string keys and int values.
    NOTICE: The value is always held on the CPU place.
    Args:
        string_dict(dict): The dict to store in the tensor.
        name(string): The name of the tensor.
    """
    dtype = core.VarDesc.VarType.RAW
    var_type = core.VarDesc.VarType.VOCAB
    vocab_tensor = paddle.Tensor(dtype, [], name, var_type, True)
    vocab_tensor.value().set_vocab(string_dict)
    return vocab_tensor
class FasterTokenizer(nn.Layer):
    """Layer wrapping the `faster_tokenizer` op for end-to-end
    text-string -> wordpiece-id tokenization."""

    def __init__(self, vocab_dict):
        super(FasterTokenizer, self).__init__()
        # Keep the vocab as a persistable buffer so it is saved together
        # with the model parameters.
        vocab_tensor = to_map_tensor(vocab_dict, "vocab")
        self.register_buffer("vocab", vocab_tensor, persistable=True)

    def forward(self,
                text,
                text_pair=None,
                do_lower_case=True,
                max_seq_len=-1,
                is_split_into_words=False,
                pad_to_max_seq_len=False):
        if in_dygraph_mode():
            # Imperative mode: invoke the op directly.
            input_ids, seg_ids = core.ops.faster_tokenizer(
                self.vocab, text, text_pair, "do_lower_case", do_lower_case,
                "max_seq_len", max_seq_len, "pad_to_max_seq_len",
                pad_to_max_seq_len, "is_split_into_words", is_split_into_words)
            return input_ids, seg_ids

        # Static graph mode: append the op to the current program.
        helper = LayerHelper("faster_tokenizer")
        input_ids = helper.create_variable_for_type_inference(dtype="int64")
        seg_ids = helper.create_variable_for_type_inference(dtype="int64")
        # TextPair is an optional op input; feed it only when provided.
        op_inputs = {'Vocab': self.vocab, 'Text': text}
        if text_pair is not None:
            op_inputs['TextPair'] = text_pair
        helper.append_op(
            type='faster_tokenizer',
            inputs=op_inputs,
            outputs={'InputIds': input_ids,
                     'SegmentIds': seg_ids},
            attrs={
                "do_lower_case": do_lower_case,
                "max_seq_len": max_seq_len,
                "pad_to_max_seq_len": pad_to_max_seq_len,
                "is_split_into_words": is_split_into_words,
            })
        return input_ids, seg_ids
class Predictor(object):
    """Thin wrapper around a paddle-inference predictor loaded from
    a saved static-graph tokenizer model."""

    def __init__(self, model_dir):
        model_file = os.path.join(model_dir, "inference.pdmodel")
        params_file = os.path.join(model_dir, "inference.pdiparams")
        if not os.path.exists(model_file):
            raise ValueError("not find model file path {}".format(model_file))
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))
        config = paddle.inference.Config(model_file, params_file)
        # fast_tokenizer op only support cpu.
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(10)
        config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(config)
        # Cache handles for every declared input and output of the graph.
        self.input_handles = []
        for input_name in self.predictor.get_input_names():
            self.input_handles.append(
                self.predictor.get_input_handle(input_name))
        self.output_handles = []
        for output_name in self.predictor.get_output_names():
            self.output_handles.append(
                self.predictor.get_output_handle(output_name))

    def predict(self, data):
        # Feed the raw text, run the graph, then fetch both outputs.
        self.input_handles[0].copy_from_cpu(data)
        self.predictor.run()
        input_ids = self.output_handles[0].copy_to_cpu()
        token_type_ids = self.output_handles[1].copy_to_cpu()
        return input_ids, token_type_ids
class TestBertTokenizerOp(unittest.TestCase):
    # End-to-end checks that the `faster_tokenizer` op produces the same
    # encodings as the pure-Python BertTokenizer reference implementation.
    def setUp(self):
        # The reference tokenizer and the op-backed layer share one vocab.
        # NOTE(review): from_pretrained downloads the vocab — needs network.
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
        self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab)
        self.init_data()
        # Paths used by the save/load + inference round-trip test.
        self.save_path = os.path.join(DATA_HOME, "fast_tokenizer")
        self.param_path = os.path.join(self.save_path, "model.pdparams")
        self.inference_path = os.path.join(self.save_path, "inference")
    def init_data(self):
        # Single sample (batch_size = 1) and batched (batch_size = 3)
        # inputs, plus string-tensor counterparts fed to the op.
        self.text = [
            '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。'
            '酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,'
            '还算丰富。 服务吗,一般'
        ]
        self.text_pair = ['非常不错,服务很好,位于市中心区,交通方便,不过价格也高!']
        self.text_tensor = to_string_tensor(self.text, "text")
        self.text_pair_tensor = to_string_tensor(self.text_pair, "text_pair")
        self.texts = [
            '很好的地理位置,一蹋糊涂的服务,萧条的酒店。',
            ' 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,'
            '但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般',
            'Test bert tokenizer. The first text.'
        ]
        self.text_pairs = [
            '非常不错,服务很好,位于市中心区,交通方便,不过价格也高!', '房间太小。其他的都一般。。。。。。。。。',
            'Test bert tokenizer. The second text.'
        ]
        self.texts_tensor = to_string_tensor(self.texts, "texts")
        self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs")
    def test_padding(self):
        # Pad every sequence to max_seq_len and compare against the
        # reference tokenizer. The ids are integers, so rtol=0/atol=0.01
        # effectively demands exact equality.
        self.max_seq_len = 128
        self.pad_to_max_seq_len = True
        self.is_split_into_words = False
        # case 1: only one text (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.text_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            text=self.text,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 2: only one text and one text_pair (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.text_tensor,
            text_pair=self.text_pair_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            text=self.text,
            text_pair=self.text_pair,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 3: only texts (batch_size = 3)
        # Padding makes every row the same length, so the per-sample
        # encodings can be stacked into a [3, -1] array for comparison.
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.texts_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.texts,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = [i["input_ids"] for i in encoded_inputs]
        py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs]
        py_input_ids = np.array(py_input_ids).reshape([3, -1])
        py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 4: texts and text pairs (batch_size = 3)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.texts_tensor,
            text_pair=self.text_pairs_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.texts,
            self.text_pairs,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = [i["input_ids"] for i in encoded_inputs]
        py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs]
        py_input_ids = np.array(py_input_ids).reshape([3, -1])
        py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
    def test_no_padding(self):
        # Without padding, only batch_size = 1 cases are compared (batched
        # unpadded outputs would be ragged).
        self.max_seq_len = 128
        self.pad_to_max_seq_len = False
        self.is_split_into_words = False
        # case 1: only one text (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            text=self.text_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.text,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
        # case 2: only one text and one text_pair (batch_size = 1)
        input_ids, token_type_ids = self.faster_tokenizer(
            self.text_tensor,
            self.text_pair_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            self.text,
            self.text_pair,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=self.pad_to_max_seq_len,
            is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
    def test_is_split_into_words(self):
        # Pre-split input: the reference tokenizer receives the text as a
        # list of single characters (list(self.text[0])).
        self.is_split_into_words = True
        input_ids, token_type_ids = self.faster_tokenizer(
            self.text_tensor,
            do_lower_case=self.bert_tokenizer.do_lower_case,
            is_split_into_words=self.is_split_into_words)
        input_ids = input_ids.numpy()
        token_type_ids = token_type_ids.numpy()
        encoded_inputs = self.bert_tokenizer(
            list(self.text[0]), is_split_into_words=self.is_split_into_words)
        py_input_ids = np.array(encoded_inputs["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs["token_type_ids"]).reshape(
            [1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
    def test_inference(self):
        # Round trip: save params, reload them, export to a static-graph
        # inference model, then run it through the inference API and
        # compare with the reference tokenizer output.
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path, exist_ok=True)
        paddle.save(self.faster_tokenizer.state_dict(), self.param_path)
        state_dict = paddle.load(self.param_path)
        self.faster_tokenizer.set_dict(state_dict)
        static_model = paddle.jit.to_static(
            self.faster_tokenizer,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None], dtype=core.VarDesc.VarType.STRINGS), # texts
            ])
        # Save in static graph model.
        paddle.jit.save(static_model, self.inference_path)
        predictor = Predictor(self.save_path)
        input_ids, token_type_ids = predictor.predict(self.text)
        encoded_inputs = self.bert_tokenizer(self.text)
        py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1])
        py_token_type_ids = np.array(encoded_inputs[0][
            "token_type_ids"]).reshape([1, -1])
        self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01))
        self.assertTrue(
            np.allclose(
                token_type_ids, py_token_type_ids, rtol=0, atol=0.01))
    def test_feed_string_var(self):
        # Checks that a STRINGS-typed variable can be fed to an Executor
        # in static-graph mode without conversion to LoDTensor.
        paddle.enable_static()
        x = paddle.static.data(
            name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS)
        exe = paddle.static.Executor(paddle.framework.CPUPlace())
        exe.run(paddle.static.default_main_program(), feed={'x': self.text})
        paddle.disable_static()
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import io
import json
import os
import six
import unicodedata
from tokenizer_utils import PretrainedTokenizer
from tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation
class BasicTokenizer(object):
    """
    Runs basic tokenization: cleaning, CJK splitting, optional lower
    casing and accent stripping, and punctuation splitting.
    Args:
        do_lower_case (bool):
            Whether or not to lowercase the input when tokenizing.
            Defaults to `True`.
    """

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer."""
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """
        Tokenizes a piece of text using the basic tokenizer.
        Args:
            text (str): A piece of text.
        Returns:
            list(str): A list of tokens.
        """
        text = convert_to_unicode(text)
        text = self._clean_text(text)
        text = self._tokenize_chinese_chars(text)
        split_tokens = []
        for raw_token in whitespace_tokenize(text):
            if self.do_lower_case:
                # Accents are stripped only in the lower-cased path,
                # matching the original BERT behavior.
                raw_token = self._run_strip_accents(raw_token.lower())
            split_tokens.extend(self._run_split_on_punc(raw_token))
        return whitespace_tokenize(" ".join(split_tokens))

    def _run_strip_accents(self, text):
        """Strips accents (combining marks) from a piece of text."""
        normalized = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in normalized
                       if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        groups = []
        begin_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                # Every punctuation character becomes its own token.
                groups.append([ch])
                begin_new_group = True
            else:
                if begin_new_group:
                    groups.append([])
                begin_new_group = False
                groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        pieces = []
        for ch in text:
            if self._is_chinese_char(ord(ch)):
                pieces.extend((" ", ch, " "))
            else:
                pieces.append(ch)
        return "".join(pieces)

    def _is_chinese_char(self, cp):
        """Checks whether `cp` is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode
        # block: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        # Note that the CJK block is NOT all Japanese and Korean characters:
        # Hangul, Hiragana and Katakana live in other blocks and are written
        # with spaces, so they need no special handling here.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Removes invalid characters and normalizes whitespace."""
        cleaned = []
        for ch in text:
            code_point = ord(ch)
            if code_point == 0 or code_point == 0xfffd or _is_control(ch):
                continue
            cleaned.append(" " if _is_whitespace(ch) else ch)
        return "".join(cleaned)
class WordpieceTokenizer(object):
    """
    Runs WordPiece tokenization.
    Args:
        vocab (Vocab|dict):
            Vocab of the word piece tokenizer.
        unk_token (str):
            A specific token to replace all unknown tokens.
        max_input_chars_per_word (int):
            Words longer than this are mapped directly to `unk_token`.
            Defaults to 100.
    """

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces using a greedy
        longest-match-first algorithm over the given vocabulary.
        Args:
            text: A single token or whitespace separated tokens. This should
                have already been passed through `BasicTokenizer`.
        Returns:
            list (str): A list of wordpiece tokens, e.g.
                "unaffable" -> ["un", "##aff", "##able"].
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            if len(token) > self.max_input_chars_per_word:
                # Overly long words are never split; map to unk_token.
                output_tokens.append(self.unk_token)
                continue
            sub_tokens = []
            start = 0
            while start < len(token):
                # Greedily find the longest vocab entry starting at `start`;
                # non-initial pieces carry the "##" continuation prefix.
                matched = None
                end = len(token)
                while end > start:
                    piece = token[start:end]
                    if start > 0:
                        piece = "##" + piece
                    if piece in self.vocab:
                        matched = piece
                        break
                    end -= 1
                if matched is None:
                    # Some suffix has no vocab match: the whole word is unknown.
                    sub_tokens = None
                    break
                sub_tokens.append(matched)
                start = end
            if sub_tokens is None:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
class BertTokenizer(PretrainedTokenizer):
    """
    Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation
    splitting, lower casing and so on, and follows a WordPiece tokenizer to
    tokenize as subwords.

    Args:
        vocab_file (str):
            The vocabulary file path (ends with '.txt') required to instantiate
            a `WordpieceTokenizer`.
        do_lower_case (bool):
            Whether or not to lowercase the input when tokenizing.
            Defaults to `True`.
        unk_token (str):
            A special token representing the *unknown (out-of-vocabulary)* token.
            An unknown token is set to be `unk_token` in order to be converted to an ID.
            Defaults to "[UNK]".
        sep_token (str):
            A special token separating two different sentences in the same input.
            Defaults to "[SEP]".
        pad_token (str):
            A special token used to make arrays of tokens the same size for batching purposes.
            Defaults to "[PAD]".
        cls_token (str):
            A special token used for sequence classification. It is the first token
            of the sequence when built with special tokens. Defaults to "[CLS]".
        mask_token (str):
            A special token representing a masked token. This is the token used
            in the masked language modeling task which the model tries to predict the original unmasked ones.
            Defaults to "[MASK]".

    Examples:
        .. code-block::

            from paddlenlp.transformers import BertTokenizer
            berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokens = berttokenizer.tokenize('He was a puppeteer')
            print(tokens)
            '''
            ['he', 'was', 'a', 'puppet', '##eer']
            '''
    """
    resource_files_names = {"vocab_file": "vocab.txt"}  # for save_pretrained
    # Download URLs for the vocab files of the supported pretrained models.
    pretrained_resource_files_map = {
        "vocab_file": {
            "bert-base-uncased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt",
            "bert-large-uncased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt",
            "bert-base-cased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt",
            "bert-large-cased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt",
            "bert-base-multilingual-uncased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt",
            "bert-base-multilingual-cased":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt",
            "bert-base-chinese":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
            "bert-wwm-chinese":
            "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt",
            "bert-wwm-ext-chinese":
            "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt",
            "macbert-large-chinese":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
            "macbert-base-chinese":
            "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
            "simbert-base-chinese":
            "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt",
        }
    }
    # Per-model init kwargs (uncased checkpoints require lower-casing).
    pretrained_init_configuration = {
        "bert-base-uncased": {
            "do_lower_case": True
        },
        "bert-large-uncased": {
            "do_lower_case": True
        },
        "bert-base-cased": {
            "do_lower_case": False
        },
        "bert-large-cased": {
            "do_lower_case": False
        },
        "bert-base-multilingual-uncased": {
            "do_lower_case": True
        },
        "bert-base-multilingual-cased": {
            "do_lower_case": False
        },
        "bert-base-chinese": {
            "do_lower_case": False
        },
        "bert-wwm-chinese": {
            "do_lower_case": False
        },
        "bert-wwm-ext-chinese": {
            "do_lower_case": False
        },
        "macbert-large-chinese": {
            "do_lower_case": False
        },
        "macbert-base-chinese": {
            "do_lower_case": False
        },
        "simbert-base-chinese": {
            "do_lower_case": True
        },
    }
    padding_side = 'right'  # BERT pads sequences on the right

    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]"):
        # A local vocab file is mandatory here; remote checkpoints must go
        # through `from_pretrained`, which downloads the vocab first.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                "vocabulary from a pretrained model please use "
                "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        # `load_vocabulary` is provided by PretrainedTokenizer.
        self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token)
        self.do_lower_case = do_lower_case
        # Two-stage pipeline: basic (punctuation/case) then wordpiece (subword).
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=unk_token)
        self.special_tokens_map = {
            'unk_token': unk_token,
            'sep_token': sep_token,
            'pad_token': pad_token,
            'cls_token': cls_token,
            'mask_token': mask_token
        }

    @property
    def vocab_size(self):
        """
        Return the size of vocabulary.

        Returns:
            int: The size of vocabulary.
        """
        return len(self.vocab)

    def _tokenize(self, text):
        """
        End-to-end tokenization for BERT models: basic tokenization followed
        by wordpiece tokenization of each basic token.

        Args:
            text (str): The text to be tokenized.

        Returns:
            list: A list of string representing converted tokens.
        """
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def tokenize(self, text):
        """
        Converts a string to a list of tokens.

        Args:
            text (str): The text to be tokenized.

        Returns:
            List(str): A list of string representing converted tokens.

        Examples:
            .. code-block::

                from paddlenlp.transformers import BertTokenizer

                berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                tokens = berttokenizer.tokenize('He was a puppeteer')
                '''
                ['he', 'was', 'a', 'puppet', '##eer']
                '''
        """
        return self._tokenize(text)

    def num_special_tokens_to_add(self, pair=False):
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        Args:
            pair(bool):
                Whether the input is a sequence pair or a single sequence.
                Defaults to `False` and the input is a single sequence.

        Returns:
            int: Number of tokens added to sequences.
        """
        # Build the special-token template around empty sequences and count it.
        token_ids_0 = []
        token_ids_1 = []
        return len(
            self.build_inputs_with_special_tokens(token_ids_0, token_ids_1
                                                  if pair else None))

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens.

        A BERT sequence has the following format:

        - single sequence:      ``[CLS] X [SEP]``
        - pair of sequences:    ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (List[int]):
                List of IDs to which the special tokens will be added.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.

        Returns:
            List[int]: List of input_id with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        _cls = [self.cls_token_id]
        _sep = [self.sep_token_id]
        return _cls + token_ids_0 + _sep + token_ids_1 + _sep

    def create_token_type_ids_from_sequences(self,
                                             token_ids_0,
                                             token_ids_1=None):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task.

        A BERT sequence pair mask has the following format:
        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (List[int]):
                A list of `inputs_ids` for the first sequence.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.

        Returns:
            List[int]: List of token_type_id according to the given sequence(s).
        """
        _sep = [self.sep_token_id]
        _cls = [self.cls_token_id]
        if token_ids_1 is None:
            # Single sequence: all positions (incl. CLS and SEP) get type 0.
            return len(_cls + token_ids_0 + _sep) * [0]
        return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 +
                                                          _sep) * [1]

    def get_special_tokens_mask(self,
                                token_ids_0,
                                token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``encode`` methods.

        Args:
            token_ids_0 (List[int]):
                A list of `inputs_ids` for the first sequence.
            token_ids_1 (List[int], optinal):
                Optional second list of IDs for sequence pairs. Defaults to None.
            already_has_special_tokens (bool, optional): Whether or not the token list is already
                formatted with special tokens for the model. Defaults to `False`.

        Returns:
            List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Mark positions holding CLS/SEP ids in the already-formatted input.
            return list(
                map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
                    token_ids_0))

        # Masks mirror the layouts produced by `build_inputs_with_special_tokens`.
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + (
                [0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
......@@ -43,7 +43,10 @@ def _build_saved_state_dict(state_dict):
name_table = {}
for key, value in state_dict.items():
if isinstance(value, (Variable, core.VarBase)):
save_dict[key] = value.numpy()
if value.type == core.VarDesc.VarType.VOCAB:
save_dict[key] = value.value().get_map_tensor()
else:
save_dict[key] = value.numpy()
name_table[key] = value.name
else:
save_dict[key] = value
......@@ -938,8 +941,9 @@ def load(path, **configs):
if "StructuredToParameterName@@" in load_result:
for key in load_result["StructuredToParameterName@@"]:
load_result[key] = _ndarray_to_tensor(
load_result[key], config.return_numpy)
if isinstance(load_result[key], np.ndarray):
load_result[key] = _ndarray_to_tensor(
load_result[key], config.return_numpy)
if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
del load_result["StructuredToParameterName@@"]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册