From edff5b7975dcbf5ed6996f952de79be8ca15b49a Mon Sep 17 00:00:00 2001
From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com>
Date: Tue, 26 Oct 2021 20:27:08 +0800
Subject: [PATCH] [Cherry-pick] Add FasterTokenizer Operator (#36716)

* Add FasterTokenizer Operator (#34491)

Add tokenizer-related functionality for the Transformer model so that the
preprocessing used for training and for prediction stays consistent.

* support the text string as an input Tensor

* support the "VOCAB" unordered_map as an input Tensor to look up tokens

* Tokenizer used for BERT. This tokenizer applies end-to-end tokenization,
  from text string to wordpieces.

* It first applies basic tokenization, followed by wordpiece tokenization.

* optimize fast tokenizer

* remove const_cast

Co-authored-by: zhoushunjie
Co-authored-by: wawltor
---
 cmake/external/utf8proc.cmake | 51 +
 cmake/inference_lib.cmake | 5 +
 cmake/third_party.cmake | 4 +
 paddle/fluid/framework/CMakeLists.txt | 2 +
 paddle/fluid/framework/executor.cc | 8 +-
 paddle/fluid/framework/executor_gc_helper.cc | 1 +
 paddle/fluid/framework/feed_fetch_method.cc | 20 +-
 paddle/fluid/framework/feed_fetch_method.h | 4 +
 paddle/fluid/framework/feed_fetch_type.h | 12 +-
 paddle/fluid/framework/framework.proto | 9 +
 paddle/fluid/framework/operator.cc | 4 +
 paddle/fluid/framework/string_array.cc | 104 ++
 paddle/fluid/framework/string_array.h | 48 +
 paddle/fluid/framework/tensor_util.cc | 3 +-
 paddle/fluid/framework/tensor_util.h | 14 +
 paddle/fluid/framework/var_desc.cc | 8 +
 paddle/fluid/framework/var_type_traits.h | 13 +-
 paddle/fluid/framework/variable_helper.cc | 5 +
 paddle/fluid/imperative/variable_wrapper.h | 10 +
 paddle/fluid/inference/api/CMakeLists.txt | 2 +-
 .../inference/api/demo_ci/CMakeLists.txt | 7 +-
 .../inference/api/details/zero_copy_tensor.cc | 57 +-
 .../api/details/zero_copy_tensor_dummy.cc | 5 +-
 .../api/details/zero_copy_tensor_test.cc | 3 +-
 paddle/fluid/inference/api/paddle_api.h | 8 +
 paddle/fluid/inference/api/paddle_tensor.h | 22 +
 paddle/fluid/inference/io.cc | 10 +-
 paddle/fluid/operators/CMakeLists.txt | 7 +-
 paddle/fluid/operators/controlflow/feed_op.cc | 54 +-
 .../fluid/operators/controlflow/fetch_op.cc | 12 +-
 paddle/fluid/operators/load_combine_op.h | 73 +-
 paddle/fluid/operators/save_combine_op.h | 60 +-
 paddle/fluid/operators/string/CMakeLists.txt | 6 +
 .../operators/string/faster_tokenizer_op.cc | 528 +++++++
 .../operators/string/faster_tokenizer_op.h | 195 +++
 .../operators/string/unity_build_rule.cmake | 8 +
 paddle/fluid/pybind/imperative.cc | 6 +
 paddle/fluid/pybind/inference_api.cc | 37 +-
 paddle/fluid/pybind/op_function_generator.cc | 1 +
 paddle/fluid/pybind/protobuf.cc | 5 +-
 paddle/fluid/pybind/pybind.cc | 47 +-
 python/paddle/fluid/dygraph/jit.py | 17 +-
 python/paddle/fluid/dygraph/layers.py | 23 +-
 python/paddle/fluid/dygraph/math_op_patch.py | 7 +-
 .../fluid/dygraph/varbase_patch_methods.py | 40 +-
 python/paddle/fluid/executor.py | 8 +-
 python/paddle/fluid/framework.py | 4 +
 python/paddle/fluid/inference/wrapper.py | 10 +-
 .../unittests/test_faster_tokenizer_op.py | 393 ++++++
 .../tests/unittests/tokenizer/__init__.py | 13 +
 .../unittests/tokenizer/bert_tokenizer.py | 517 +++++++
 .../unittests/tokenizer/tokenizer_utils.py | 1244 +++++++++++++++++
 python/paddle/framework/io.py | 10 +-
 53 files changed, 3607 insertions(+), 157 deletions(-)
 create mode 100644 cmake/external/utf8proc.cmake
 create mode 100755 paddle/fluid/framework/string_array.cc
 create mode 100755 paddle/fluid/framework/string_array.h
 create mode 100644
paddle/fluid/operators/string/CMakeLists.txt create mode 100644 paddle/fluid/operators/string/faster_tokenizer_op.cc create mode 100644 paddle/fluid/operators/string/faster_tokenizer_op.h create mode 100644 paddle/fluid/operators/string/unity_build_rule.cmake create mode 100755 python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py create mode 100644 python/paddle/fluid/tests/unittests/tokenizer/__init__.py create mode 100755 python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py create mode 100644 python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py diff --git a/cmake/external/utf8proc.cmake b/cmake/external/utf8proc.cmake new file mode 100644 index 0000000000..a5de5c15c3 --- /dev/null +++ b/cmake/external/utf8proc.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc) +SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc) +# As we add extra features for utf8proc, we use the non-official repo +SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git) +SET(UTF8PROC_TAG v2.6.1) + +IF(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib") + add_definitions(-DUTF8PROC_STATIC) +ELSE(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a") +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_utf8proc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${UTF8PROC_REPOSITORY} + GIT_TAG ${UTF8PROC_TAG} + PREFIX ${UTF8PROC_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DBUILD_SHARED=ON + -DBUILD_STATIC=ON + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES} +) + +ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES}) +ADD_DEPENDENCIES(utf8proc extern_utf8proc) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index cb2ed614d3..4166f8c103 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -124,6 +124,11 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + set(dst_dir "${DST}/third_party/install/utf8proc") + copy(${TARGET} + SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + if (WITH_CRYPTO) set(dst_dir "${DST}/third_party/install/cryptopp") copy(${TARGET} diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 892ae27026..29a5587a07 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -210,6 +210,10 @@ include(external/threadpool)# download threadpool include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash 
include(external/warpctc) # download, build, install warpctc +include(external/utf8proc) # download, build, install utf8proc + +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) +list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc) include(external/lapack) # download, build, install lapack list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index de19c7a0e7..fc562a26c1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -50,6 +50,8 @@ proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto data_feed_proto) +cc_library(string_array SRCS string_array.cc DEPS utf8proc) + cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) if(WITH_GPU) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index de007c128d..5f681ec7ea 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -102,14 +102,18 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, if (var->Persistable()) { auto* ptr = const_cast(ancestor_scope)->Var(var->Name()); + + VLOG(3) << "Initialize Variable " << var->Name(); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; + << " global, which pointer is " << ptr << " type is " + << static_cast(var->GetType()); } else { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + << " locally, which pointer is " << ptr << "Variable Type " + << static_cast(var->GetType()); } } } else { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 43eb1ce8c7..8c64d65ff4 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -125,6 +125,7 @@ void DeleteUnusedTensors(const Scope &scope, for (auto &t : *lod_tensor_arr) { garbages.emplace_back(t.MoveMemoryHolder()); } + } else if (var->IsType()) { } else { PADDLE_THROW(platform::errors::Unimplemented( "Type %s of variable %s is not supported eager deletion.", diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3bd85b2b24..2eac65c90c 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include #include "glog/logging.h" namespace paddle { @@ -35,9 +36,24 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, feed_inputs.resize(index + 1); } // shared data with input tensor - feed_inputs[index].ShareDataWith(input); + auto& val = BOOST_GET(LoDTensor, feed_inputs[index]); + val.ShareDataWith(input); // set lod - feed_inputs[index].set_lod(input.lod()); + val.set_lod(input.lod()); +} + +void SetFeedVariable(Scope* scope, const Strings& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. 
+  VLOG(3) << "SetFeedStringVariable name=" << var_name << " index=" << index;
+  Variable* g_feed_value = scope->Var(var_name);
+  auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
+  if (index >= feed_inputs.size()) {
+    feed_inputs.resize(index + 1);
+  }
+  // share data with the input
+  feed_inputs[index] = input;
 }
 
 FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index a52ef517c8..4c2f5b9796 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/string_array.h"
 
 namespace paddle {
 namespace framework {
@@ -28,6 +29,9 @@ class Scope;
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index);
 
+void SetFeedVariable(Scope* scope, const Strings& input,
+                     const std::string& var_name, size_t index);
+
 FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index);
 
diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h
index 1996327fe8..12c111e58f 100644
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
@@ -13,14 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <string>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/string_array.h"
 #include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
-using FeedType = LoDTensor;
+using FeedType = boost::variant<LoDTensor, Strings>;
 using FeedList = std::vector<FeedType>;
 
 using FetchType = boost::variant<LoDTensor, LoDTensorArray, Vocab>;
@@ -43,6 +46,13 @@ inline bool data_is_lod_tensor_array(const FetchType &data) {
   return false;
 }
 
+inline bool data_is_string_tensor(const FeedType &data) {
+  if (data.type() == typeid(Strings)) {
+    return true;
+  }
+  return false;
+}
+
 static const char kFeedOpType[] = "feed";
 static const char kFetchOpType[] = "fetch";
 
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index eb72d9e142..300d5f6e8f 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -147,6 +147,11 @@ message VarType {
     // in operators like nccl_op
     RAW = 17;
     TUPLE = 18;
+
+    STRING = 25;
+    STRINGS = 26;
+    VOCAB = 27;
+    FEED_LIST = 28;
   }
 
   required Type type = 1;
@@ -175,6 +180,10 @@ message VarType {
 
   message Tuple { repeated Type element_type = 1; }
   optional Tuple tuple = 7;
+
+  optional TensorDesc string = 8;
+  optional TensorDesc strings = 9;
+  optional TensorDesc vocab = 10;
 }
 
 message VarDesc {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 670cb36dcc..c847467294 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -76,6 +76,8 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name,
     } else {
       return var->Get<SelectedRows>().GetCompleteDims();
     }
+  } else if (var->IsType<Strings>()) {
+    return DDim({static_cast<int64_t>(var->Get<Strings>().size())});
   } else {
     return DDim({-1});
   }
@@ -106,6 +108,8 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
     } else {
       return DataTypeToString(tensor.type());
     }
+  } else if (var->IsType<Strings>()) {
+    return "strings";
   } else {
    return "";
   }
 }
diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc
new file mode 100755
index 0000000000..3071e6bf4c
--- /dev/null
+++ b/paddle/fluid/framework/string_array.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <utf8proc.h>
+
+#include <exception>
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/string_array.h"
+
+namespace paddle {
+namespace framework {
+
+std::wstring_convert<std::codecvt_utf8<wchar_t>> kConverter;
+
+// Convert the std::string type to the std::wstring type.
+bool ConvertStrToWstr(const std::string& src, std::wstring* res) {
+  try {
+    *res = kConverter.from_bytes(src);
+  } catch (std::range_error& e) {
+    VLOG(3) << "The string " << src << " could not be converted to unicode!";
+    return false;
+  }
+  return true;
+}
+
+// Convert the std::wstring type to the std::string type.
+void ConvertWstrToStr(const std::wstring& src, std::string* res) {
+  *res = kConverter.to_bytes(src);
+}
+
+// Normalization Form Canonical Decomposition.
+void NFD(const std::string& s, std::string* ret) {
+  *ret = "";
+  char* result = reinterpret_cast<char*>(
+      utf8proc_NFD(reinterpret_cast<const unsigned char*>(s.c_str())));
+  if (result) {
+    *ret = std::move(std::string(result));
+    free(result);
+  }
+}
+
+// Write the data which is type of
+// std::unordered_map<std::string, int32_t> to ostream.
+void StringMapToStream(std::ostream& os,
+                       const std::unordered_map<std::string, int32_t>& data) {
+  {
+    // first write the data size.
+    size_t t = data.size();
+    os.write(reinterpret_cast<const char*>(&t), sizeof(t));
+  }
+  {
+    // then write the data
+    for (auto it = data.begin(); it != data.end(); ++it) {
+      std::string token = it->first;
+      int32_t token_id = it->second;
+      // write the token
+      size_t length = token.size();
+      os.write(reinterpret_cast<const char*>(&length), sizeof(length));
+      os.write(token.c_str(), length);
+      // write the token_id
+      os.write(reinterpret_cast<const char*>(&token_id), sizeof(token_id));
+    }
+  }
+}
+
+// Read the data which is type of
+// std::unordered_map<std::string, int32_t> from istream.
+void StringMapFromStream(std::istream& is,
+                         std::unordered_map<std::string, int32_t>* data) {
+  // first read the map size
+  size_t map_size;
+  is.read(reinterpret_cast<char*>(&map_size), sizeof(map_size));
+  data->reserve(map_size);
+  // then read the data
+  for (size_t i = 0; i < map_size; ++i) {
+    // read the token
+    size_t token_length;
+    is.read(reinterpret_cast<char*>(&token_length), sizeof(token_length));
+    char* tmp = new char[token_length];
+    is.read(tmp, token_length);
+    std::string token(tmp, tmp + token_length);
+    delete[] tmp;
+    // read the token_id
+    int32_t token_id;
+    is.read(reinterpret_cast<char*>(&token_id), sizeof(token_id));
+
+    data->emplace(token, token_id);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h
new file mode 100755
index 0000000000..b874fbac4c
--- /dev/null
+++ b/paddle/fluid/framework/string_array.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <codecvt>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+using String = std::string;
+using Strings = std::vector<std::string>;
+using Vocab = std::unordered_map<std::wstring, std::int32_t>;
+
+// Convert the std::string type to the std::wstring type.
+bool ConvertStrToWstr(const std::string& src, std::wstring* res);
+// Convert the std::wstring type to the std::string type.
+void ConvertWstrToStr(const std::wstring& src, std::string* res);
+// Normalization Form Canonical Decomposition.
+void NFD(const std::string& s, std::string* ret);
+
+// Write the data which is type of
+// std::unordered_map<std::string, int32_t> to ostream.
+void StringMapToStream(std::ostream& os,
+                       const std::unordered_map<std::string, int32_t>& data);
+
+// Read the data which is type of
+// std::unordered_map<std::string, int32_t> from istream.
+void StringMapFromStream(std::istream& is,
+                         std::unordered_map<std::string, int32_t>* data);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index ee30a82aff..1c43219330 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/tensor_util.h"
-
 #include <algorithm>
 #include <limits>
 #include <memory>
@@ -22,6 +20,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/profiler.h"
 #ifdef PADDLE_WITH_MKLDNN
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index f4bbbaa2e7..73829898be 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -13,11 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License.
*/ #pragma once +#include +#include +#include +#include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #ifdef PADDLE_WITH_ASCEND_CL @@ -48,6 +54,14 @@ class PrintOptions { PrintOptions() {} }; +void TensorToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx, + const size_t& seek, const std::vector& shape); + // NOTE(zcd): Because TensorCopy is an async operation, when the src_place // and dst_place are two different GPU, to ensure that the operation can // be carried out correctly, there is a src_ctx wait operation in TensorCopy. diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index c3bdd6ae7f..41fe9fbbc0 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -209,6 +209,10 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { return desc_.type().lod_tensor().tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().tensor(); + case proto::VarType::STRINGS: + return desc_.type().strings(); + case proto::VarType::VOCAB: + return desc_.type().vocab(); default: PADDLE_THROW(platform::errors::Unavailable( "Getting 'tensor_desc' is not supported by the %s type variable.", @@ -249,6 +253,10 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); + case proto::VarType::STRINGS: + return desc_.mutable_type()->mutable_strings(); + case proto::VarType::VOCAB: + return desc_.mutable_type()->mutable_vocab(); default: PADDLE_THROW( platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 473df85aa0..c8c3cf364e 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -18,10 +18,12 @@ #include #include #include +#include #include #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA #include @@ -162,8 +164,8 @@ struct VarTypeRegistryImpl { // Paddle would generate unique Ids for each registered variable types. 
using VarTypeRegistry = detail::VarTypeRegistryImpl< Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, - LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, - operators::reader::LoDTensorBlockingQueueHolder, FetchList, + Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, + operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -177,8 +179,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif - int, float>; - + int, float, Vocab>; template struct VarTypeTrait { static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); @@ -208,9 +209,13 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST); REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST); REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); +REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB); +REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING); +REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS); /** End of variable type registration */ diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index bdcdd4e64e..37ec5d7bc8 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -41,6 +42,10 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { var->GetMutable(); + } else if (var_type == proto::VarType::STRINGS) { + var->GetMutable(); + } else if (var_type == proto::VarType::VOCAB) { + var->GetMutable(); } else if (var_type == proto::VarType::PLACE_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 5fa8b89a39..559b935d7e 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -20,6 +20,7 @@ #include #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/op_base.h" @@ -153,6 +154,15 @@ class VariableWrapper { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { tensor = &(var_.Get().value()); + } else if (type_ == framework::proto::VarType::VOCAB) { + const framework::Vocab* data = nullptr; + data = &(var_.Get()); + if (data && data->size() != 0) { + VLOG(6) << "The tensor of variable " << name_ + << " is not initialized"; + return data_type_; + } + return framework::proto::VarType::VOCAB; } else { VLOG(6) << "Variable " << name_ << " is not initialized"; return data_type_; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index bbec3eab1c..53b92c1336 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -26,7 +26,7 @@ if(WITH_MKLDNN) set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE) endif() -cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer) +cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc) cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 47abe3298a..1fdc5cd730 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -34,12 +34,14 @@ include_directories("${PADDLE_LIB}/") set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") 
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") @@ -151,12 +153,13 @@ if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp + glog gflags protobuf xxhash cryptopp utf8proc ${EXTERNAL_LIB}) else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB}) + glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static + ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index a9c6ef1317..bb537f0c65 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -43,15 +43,33 @@ void Tensor::Reshape(const std::vector &shape) { tensor->Resize(paddle::framework::make_ddim(shape)); } -#define EAGER_GET_TENSOR \ - if (!tensor_) { \ - tensor_ = FindTensor(); \ - } \ - auto *tensor = static_cast(tensor_); +void Tensor::ReshapeStrings(const size_t &shape) { + PADDLE_ENFORCE_EQ( + name_.empty(), false, + paddle::platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_EQ(input_or_output_, true, + paddle::platform::errors::PermissionDenied( + "Can't reshape the output tensor, it is readonly")); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE_NOT_NULL( + var, paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); + paddle_infer::Strings *tensor = var->GetMutable(); + tensor->resize(shape); +} + +#define EAGER_GET_TENSOR(tensor_type) \ + if (!tensor_) { \ + tensor_ = FindTensor(); \ + } \ + auto *tensor = static_cast(tensor_); template T *Tensor::mutable_data(PlaceType place) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GT( tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -83,7 +101,7 @@ T *Tensor::mutable_data(PlaceType place) { template T *Tensor::data(PlaceType *place, int *size) const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto *res = tensor->data(); if (paddle::platform::is_cpu_place(tensor->place())) { @@ -103,7 +121,7 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = tensor->type(); if (type == paddle::framework::proto::VarType::FP32) { return DataType::FLOAT32; @@ -125,7 +143,7 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( "You should call Tensor::Reshape(const " @@ -186,10 +204,20 @@ void Tensor::CopyFromCpu(const T *data) { } } +void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { + EAGER_GET_TENSOR(paddle_infer::Strings); + PADDLE_ENFORCE_GE(tensor->size(), 0, + paddle::platform::errors::PreconditionNotMet( + "You should call Tensor::Reshape(const " + "std::size_t &shape)function before copying" + "the string data from cpu.")); + *tensor = *data; +} + template void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, void *cb_params) const { - 
EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto ele_num = tensor->numel(); auto *t_data = tensor->data(); auto t_place = tensor->place(); @@ -371,6 +399,7 @@ Tensor::Tensor(void *scope) : scope_{scope} { "set to the pointer of scope.")); } +template void *Tensor::FindTensor() const { PADDLE_ENFORCE_EQ( name_.empty(), false, @@ -382,12 +411,12 @@ void *Tensor::FindTensor() const { PADDLE_ENFORCE_NOT_NULL( var, paddle::platform::errors::PreconditionNotMet( "No tensor called [%s] in the runtime scope", name_)); - auto *tensor = var->GetMutable(); + auto *tensor = var->GetMutable(); return tensor; } std::vector Tensor::shape() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( "Not found tensor called %s in the scope", name_)); @@ -395,7 +424,7 @@ std::vector Tensor::shape() const { } void Tensor::SetLoD(const std::vector> &x) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); paddle::framework::LoD lod; for (auto &level : x) { lod.emplace_back(level); @@ -404,7 +433,7 @@ void Tensor::SetLoD(const std::vector> &x) { } std::vector> Tensor::lod() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); std::vector> res; for (auto &level : tensor->lod()) { res.emplace_back(level); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 1f1be13610..eb134874c3 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -36,7 +36,10 @@ template PD_INFER_DECL int64_t *Tensor::data(PlaceType *place, template float *Tensor::mutable_data(PlaceType place); template int64_t *Tensor::mutable_data(PlaceType place); -void *Tensor::FindTensor() const { return nullptr; } +template +void *Tensor::FindTensor() const { + return nullptr; +} std::vector Tensor::shape() const { return {}; } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 0c092a8684..4b6f90f3f0 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -88,7 +88,8 @@ bool SetPlaceAndCheck(PlaceType place, size_t length) { const std::vector> lod{{0, length}}; scope.Var(name); auto tensor = CreateTensor(place, &scope, name); - tensor->Reshape({static_cast(length)}); + std::vector shape{static_cast(length)}; + tensor->Reshape(shape); tensor->mutable_data(place); tensor->SetLoD(lod); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index de6b28de27..b137b7ba6f 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -174,6 +174,14 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { void copy_from_cpu(const T* data) { return CopyFromCpu(data); } + + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void copy_strings_from_cpu(const paddle_infer::Strings* data) { + return CopyStringsFromCpu(data); + } + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. /// \param[out] data The tensor will copy the data to the address. 
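The string-input plumbing above (ReshapeStrings and CopyStringsFromCpu on paddle_infer::Tensor, surfaced on ZeroCopyTensor as copy_strings_from_cpu) is what lets an inference client feed raw text straight into a model that embeds the tokenizer. A minimal usage sketch built on the existing paddle_infer predictor API, which is not part of this patch; the model directory and the input name "text" are hypothetical:

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("tokenizer_model_dir");  // hypothetical model directory
  auto predictor = paddle_infer::CreatePredictor(config);

  // Raw text samples; the in-graph faster_tokenizer op converts them to ids.
  paddle_infer::Strings text = {"这是一条测试文本", "a plain english sentence"};

  auto input = predictor->GetInputHandle("text");  // hypothetical input name
  input->ReshapeStrings(text.size());  // new Strings API from this patch
  input->CopyStringsFromCpu(&text);    // new Strings API from this patch

  return predictor->Run() ? 0 : 1;
}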
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index f6dce74c30..24a72a0b9d 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -14,10 +14,16 @@ #pragma once +#include + #include "paddle_infer_declare.h" // NOLINT namespace paddle_infer { +/// \brief Experimental. +/// Strings for text data. +using Strings = std::vector; + typedef void (*CallbackFunc)(void*); #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) @@ -57,6 +63,14 @@ class PD_INFER_DECL Tensor { /// \param shape The shape to set. void Reshape(const std::vector& shape); + /// \brief Experimental interface. + /// Reset the shape of the Strings tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling + /// ZeroCopyStringTensorCreate() or PaddleInferTensorCreate() + /// \param shape The shape to set. + void ReshapeStrings(const std::size_t& shape); + /// \brief Get the memory pointer in CPU or GPU with specific data type. /// Please Reshape the tensor first before call this. /// It's usually used to get input data pointer. @@ -78,6 +92,11 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void CopyStringsFromCpu(const paddle_infer::Strings* data); + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. /// \param[out] data The tensor will copy the data to the address. @@ -122,7 +141,10 @@ class PD_INFER_DECL Tensor { protected: explicit Tensor(void* scope); + + template void* FindTensor() const; + void SetPlace(PlaceType place, int device = -1); void SetName(const std::string& name); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index d2bc95e7c3..f976e217ba 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -17,11 +17,13 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); @@ -85,10 +87,12 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); new_var->SetDataType(var->GetDataType()); - new_var->SetType(var->GetType()); + auto var_type = var->GetType(); + new_var->SetType(var_type); - if (var->GetType() != - framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) { + if ((var_type != + framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) && + (var_type != framework::proto::VarType::VOCAB)) { new_var->SetLoDLevel(var->GetLoDLevel()); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b910b4ec73..50b83970ab 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) +add_subdirectory(string) add_subdirectory(jit) if(WITH_MKLDNN) add_subdirectory(mkldnn) @@ -78,10 +79,12 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op - sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op + recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) +op_library(save_combine_op DEPS string_array) +op_library(load_combine_op DEPS string_array) if (WITH_GPU OR WITH_ROCM) if(WITH_ROCM) diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 9597dd25ec..bc29c92b09 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,6 +26,39 @@ class OpBase; namespace paddle { namespace operators { + +// FeedVariableVisitor is to feed the variable data +// according to data type (LoDTensor or Strings). 
+class FeedVariableVisitor : public boost::static_visitor { + public: + explicit FeedVariableVisitor(framework::Variable *out_var, + const platform::Place &place) + : out_var_(out_var), place_(place) {} + + void operator()(const framework::LoDTensor &in_tensor) const { + framework::LoDTensor *out_tensor = + out_var_->GetMutable(); + if (platform::is_same_place(in_tensor.place(), place_)) { + out_tensor->ShareDataWith(in_tensor); + } else { + platform::DeviceContext *context = + platform::DeviceContextPool::Instance().Get(place_); + framework::TensorCopy(in_tensor, place_, *context, out_tensor); + } + out_tensor->set_lod(in_tensor.lod()); + } + + void operator()(const framework::Strings &in_str) const { + framework::Strings *out_str = out_var_->GetMutable(); + out_str->resize(in_str.size()); + *out_str = in_str; + } + + private: + framework::Variable *out_var_; + const platform::Place &place_; +}; + class FeedOp : public framework::OperatorBase { public: FeedOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -79,15 +109,9 @@ class FeedOp : public framework::OperatorBase { col, feed_list.size())); auto &feed_item = feed_list.at(static_cast(col)); - auto *out_item = out_var->GetMutable(); - if (platform::is_same_place(feed_item.place(), place)) { - out_item->ShareDataWith(feed_item); - } else { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - framework::TensorCopy(feed_item, place, *dev_ctx, out_item); - } - out_item->set_lod(feed_item.lod()); + FeedVariableVisitor visitor(out_var, place); + boost::apply_visitor(visitor, feed_item); } }; @@ -95,17 +119,17 @@ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(vector) A feeding list of LoDTensor, which may have " + "(vector) " + "A feeding list of LoDTensor, which may have " "different dimension and data type."); AddOutput("Out", - "(LoDTensor) The LoDTensor which is a copy of the col-th feeding " + "(LoDTensor) The LoDTensor which is a copy " + "of the col-th feeding " "object."); AddAttr("col", "(int) The column index of current feeding object."); AddComment(R"DOC( Feed Operator. - It should not be configured by users directly. - )DOC"); } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b4842..99b16d9b69 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -109,6 +109,10 @@ class FetchOp : public framework::OperatorBase { auto &src_item = fetch_var->Get(); auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col))); DataCopy(src_item, fetch_var_name, dst_item); + } else if (fetch_var->IsType()) { + auto &src_item = fetch_var->Get(); + auto *dst_item = &(BOOST_GET(framework::Vocab, fetch_list->at(col))); + *dst_item = src_item; } else { auto &src_item = fetch_var->Get(); framework::LoDTensorArray tmp(src_item.size()); @@ -128,9 +132,11 @@ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(LoDTensor) The resulted LoDTensor which is expected to return " "to users."); - AddOutput("Out", - "(vector) A fetching list of LoDTensor which may have " - "different dimension, shape and data type."); + AddOutput( + "Out", + "(vector|unordered_map) A fetching list" + " of LoDTensor|unordered_map which may have " + "different dimension, shape and data type."); AddAttr("col", "(int) The column index of fetching object."); AddComment(R"DOC( Fetch Operator. 
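The FeedVariableVisitor above replaces the old tensor-only branch in FeedOp: feeding now dispatches on the FeedType variant, so a fed variable may be a LoDTensor or a Strings list. A self-contained sketch of the same boost::static_visitor dispatch pattern, with toy types standing in for framework::LoDTensor and framework::Strings:

#include <boost/variant.hpp>
#include <iostream>
#include <string>
#include <vector>

struct ToyTensor { std::vector<float> data; };  // stand-in for LoDTensor
using ToyStrings = std::vector<std::string>;    // stand-in for Strings
using ToyFeedType = boost::variant<ToyTensor, ToyStrings>;

// One operator() per variant alternative, exactly as FeedVariableVisitor does.
struct ToyFeedVisitor : public boost::static_visitor<void> {
  void operator()(const ToyTensor& t) const {
    std::cout << "feed dense tensor, numel = " << t.data.size() << "\n";
  }
  void operator()(const ToyStrings& s) const {
    std::cout << "feed string list, size = " << s.size() << "\n";
  }
};

int main() {
  ToyFeedType dense = ToyTensor{{1.0f, 2.0f, 3.0f}};
  ToyFeedType text = ToyStrings{"hello", "world"};
  ToyFeedVisitor visitor;
  boost::apply_visitor(visitor, dense);  // prints: feed dense tensor, numel = 3
  boost::apply_visitor(visitor, text);   // prints: feed string list, size = 2
  return 0;
}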
diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h
index 589df8821b..a02b0e61d9 100644
--- a/paddle/fluid/operators/load_combine_op.h
+++ b/paddle/fluid/operators/load_combine_op.h
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/string_array.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
@@ -75,38 +77,57 @@ class LoadCombineOpKernel : public framework::OpKernel<T> {
           out_vars[i], platform::errors::InvalidArgument(
                            "The variable %s to be loaded cannot be found.",
                            out_var_names[i]));
-
-      auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
-
       // Error checking
       PADDLE_ENFORCE_EQ(
           static_cast<bool>(*buffer), true,
           platform::errors::Unavailable(
               "An error occurred while loading model parameters. "
               "Please check whether the model file is complete or damaged."));
-
-      // Get data from fin to tensor
-      DeserializeFromStream(*buffer, tensor, dev_ctx);
-
-      auto in_dtype = tensor->type();
-      auto out_dtype =
-          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        // convert to float16 tensor
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor fp16_tensor;
-        // copy LoD info to the new tensor
-        fp16_tensor.set_lod(tensor->lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                                 &fp16_tensor);
-
-        // reset output tensor
-        out_vars[i]->Clear();
-        tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(fp16_tensor.lod());
-        tensor->ShareDataWith(fp16_tensor);
+      if (out_vars[i]->IsType<framework::Vocab>()) {
+        auto *tensor = out_vars[i]->GetMutable<framework::Vocab>();
+        tensor->clear();
+        std::unordered_map<std::string, std::int32_t> data;
+        framework::StringMapFromStream(*buffer, &data);
+        for (auto it = data.begin(); it != data.end(); ++it) {
+          std::string tmp;
+          framework::NFD(it->first, &tmp);
+          if (tmp.empty()) {
+            VLOG(0) << "The string " << it->first
+                    << " could not be converted to unicode, "
+                    << "so it is dropped when loading.";
+            continue;
+          }
+          std::wstring token;
+          bool status = framework::ConvertStrToWstr(tmp, &token);
+          if (!status) continue;
+          tensor->emplace(token, it->second);
+        }
+      } else {
+        auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+
+        // Get data from fin to tensor
+        DeserializeFromStream(*buffer, tensor, dev_ctx);
+
+        auto in_dtype = tensor->type();
+        auto out_dtype =
+            load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+        if (in_dtype != out_dtype) {
+          // convert to float16 tensor
+          auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+          auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+          framework::LoDTensor fp16_tensor;
+          // copy LoD info to the new tensor
+          fp16_tensor.set_lod(tensor->lod());
+          framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                   &fp16_tensor);
+
+          // reset output tensor
+          out_vars[i]->Clear();
+          tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+          tensor->set_lod(fp16_tensor.lod());
+          tensor->ShareDataWith(fp16_tensor);
+        }
       }
     }
     buffer->peek();
diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h
index 939768693a..6e6c826a22 100644
--- a/paddle/fluid/operators/save_combine_op.h
+++ b/paddle/fluid/operators/save_combine_op.h
@@ -19,11 +19,13 @@ limitations under the License.
 */
 
 #include <sstream>
 #include <string>
 #include <vector>
+#include <unordered_map>
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/string_array.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/port.h"
 
@@ -66,34 +68,48 @@ class SaveCombineOpKernel : public framework::OpKernel<T> {
           inp_vars[i],
           platform::errors::InvalidArgument("Cannot find variable %s to save.",
                                             inp_var_names[i]));
-      PADDLE_ENFORCE_EQ(inp_vars[i]->IsType<framework::LoDTensor>(), true,
+      PADDLE_ENFORCE_EQ(inp_vars[i]->IsType<framework::LoDTensor>() ||
+                            inp_vars[i]->IsType<framework::Vocab>(),
+                        true,
                         platform::errors::InvalidArgument(
                             "SaveCombine operator only supports saving "
-                            "LoDTensor variable, %s has wrong type.",
+                            "LoDTensor or Vocab variable, %s has wrong type.",
                             inp_var_names[i]));
 
-      auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
-      PADDLE_ENFORCE_EQ(
-          tensor.IsInitialized(), true,
-          platform::errors::InvalidArgument(
-              "The Tensor of Variable(%s) to be saved is not initialized.",
-              inp_var_names[i]));
-      // Serialize tensors one by one
-      // Check types to see if a fp16 transformation is required
-      auto in_dtype = tensor.type();
-      auto out_dtype =
-          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+      if (inp_vars[i]->IsType<framework::LoDTensor>()) {
+        auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
+        PADDLE_ENFORCE_EQ(
+            tensor.IsInitialized(), true,
+            platform::errors::InvalidArgument(
+                "The Tensor of Variable(%s) to be saved is not initialized.",
+                inp_var_names[i]));
+        // Serialize tensors one by one
+        // Check types to see if a fp16 transformation is required
+        auto in_dtype = tensor.type();
+        auto out_dtype =
+            save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
 
-      if (in_dtype != out_dtype) {
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor out;
-        // copy LoD info to the new tensor
-        out.set_lod(tensor.lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-        framework::SerializeToStream(ss, out, dev_ctx);
+        if (in_dtype != out_dtype) {
+          auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+          auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+          framework::LoDTensor out;
+          // copy LoD info to the new tensor
+          out.set_lod(tensor.lod());
+          framework::TransDataType(in_kernel_type, out_kernel_type, tensor,
+                                   &out);
+          framework::SerializeToStream(ss, out, dev_ctx);
+        } else {
+          framework::SerializeToStream(ss, tensor, dev_ctx);
+        }
       } else {
-        framework::SerializeToStream(ss, tensor, dev_ctx);
+        auto &tensor = inp_vars[i]->Get<framework::Vocab>();
+        std::unordered_map<std::string, std::int32_t> data;
+        for (auto it = tensor.begin(); it != tensor.end(); ++it) {
+          std::string t;
+          framework::ConvertWstrToStr(it->first, &t);
+          data.emplace(t, it->second);
+        }
+        framework::StringMapToStream(ss, data);
       }
     }
     if (save_to_memory) {
diff --git a/paddle/fluid/operators/string/CMakeLists.txt b/paddle/fluid/operators/string/CMakeLists.txt
new file mode 100644
index 0000000000..1da2e8e455
--- /dev/null
+++ b/paddle/fluid/operators/string/CMakeLists.txt
@@ -0,0 +1,6 @@
+include(operators)
+if(WITH_UNITY_BUILD)
+    # Load Unity Build rules for operators in paddle/fluid/operators/string.
+    include(unity_build_rule.cmake)
+endif()
+register_operators(DEPS op_version_registry utf8proc string_array)
diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc
new file mode 100644
index 0000000000..42047021b4
--- /dev/null
+++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc
@@ -0,0 +1,528 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <utf8proc.h>
+
+#include <algorithm>
+#include <chrono>
+#include <codecvt>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+
+#include "paddle/fluid/framework/string_array.h"
+#include "paddle/fluid/operators/string/faster_tokenizer_op.h"
+
+namespace paddle {
+namespace operators {
+
+using std::bad_cast;
+using std::codecvt_utf8;
+using std::endl;
+using std::exception;
+using std::ifstream;
+using std::int64_t;
+using std::min;
+using std::runtime_error;
+using std::unordered_map;
+using std::unordered_set;
+using std::shared_ptr;
+using std::size_t;
+using std::string;
+using std::vector;
+using std::wstring;
+
+const wstring kStripChars = L" \t\n\r\v\f";
+
+inline bool IsControl(const wchar_t& ch) {
+  if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false;
+  auto cat = utf8proc_category(ch);
+  if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true;
+  return false;
+}
+
+inline bool IsChineseChar(const wchar_t& ch) {
+  if ((ch >= 0x4E00 && ch <= 0x9FFF) || (ch >= 0x3400 && ch <= 0x4DBF) ||
+      (ch >= 0x20000 && ch <= 0x2A6DF) || (ch >= 0x2A700 && ch <= 0x2B73F) ||
+      (ch >= 0x2B740 && ch <= 0x2B81F) || (ch >= 0x2B820 && ch <= 0x2CEAF) ||
+      (ch >= 0xF900 && ch <= 0xFAFF) || (ch >= 0x2F800 && ch <= 0x2FA1F))
+    return true;
+  return false;
+}
+
+inline bool IsWhiteSpace(const wchar_t& ch) {
+  if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true;
+  auto cat = utf8proc_category(ch);
+  if (cat == UTF8PROC_CATEGORY_ZS) return true;
+  return false;
+}
+
+inline bool IsPunctuation(const wchar_t& ch) {
+  if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) ||
+      (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126))
+    return true;
+  auto cat = utf8proc_category(ch);
+  if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS ||
+      cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC ||
+      cat == UTF8PROC_CATEGORY_PO  // sometimes ¶ belongs to SO
+      || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF)
+    return true;
+  return false;
+}
+
+BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */)
+    : do_lower_case_(do_lower_case) {}
+
+wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const {
+  wchar_t new_ch = utf8proc_tolower(ch);
+  return new_ch;
+}
+
+void BasicTokenizer::Tokenize(const string& text, vector<wstring>* res) const {
+  std::wstring unicode_text;
+  bool status = framework::ConvertStrToWstr(text, &unicode_text);
+  if (!status) {
+    // Failed to convert the string into a wstring.
+ return; + } + std::wstring cache_text = L""; + auto PushCacheText = [&]() { + if (cache_text != L"") { + res->emplace_back(cache_text); + cache_text = L""; + } + }; + for (auto& ch : unicode_text) { + if (ch == 0 || ch == 0xfffd || IsControl(ch)) { + continue; + } + if (do_lower_case_) { + ch = do_lower_case(ch); + } + if (IsChineseChar(ch) || IsPunctuation(ch)) { + PushCacheText(); + res->emplace_back(std::wstring{ch}); + } else if (IsWhiteSpace(ch)) { + PushCacheText(); + } else { + cache_text += ch; + } + } + PushCacheText(); +} + +WordPieceTokenizer::WordPieceTokenizer( + const framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, + const size_t max_input_chars_per_word /* = 100 */) + : vocab_(vocab), + unk_token_(unk_token), + max_input_chars_per_word_(max_input_chars_per_word) { + unk_token_id_ = vocab_->at(unk_token_); +} + +void WordPieceTokenizer::Tokenize(const wstring& text, + vector* token_ids) const { + size_t len = text.size(); + if (len > max_input_chars_per_word_) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } + + auto it = vocab_->find(text); + if (it != vocab_->end()) { + token_ids->emplace_back(std::move(it->second)); + return; + } + + size_t start = 0; + vector wordpiece_ids; + while (start < len) { + size_t end = len; + std::wstring cur_substr; + int64_t cur_substr_id; + while (start < end) { + std::wstring sub = text.substr(start, end - start); + if (start > 0) { + sub = L"##" + sub; + } + auto it = vocab_->find(sub); + if (it != vocab_->end()) { + cur_substr = sub; + cur_substr_id = it->second; + break; + } + end -= 1; + } + + if (cur_substr.empty()) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } else { + start = end; + wordpiece_ids.emplace_back(std::move(cur_substr_id)); + } + } + for (auto& token_id : wordpiece_ids) { + token_ids->emplace_back(std::move(token_id)); + } +} + +BertTokenizer::BertTokenizer(const framework::Vocab* vocab, + bool do_lower_case /* = false */, + const wstring& unk_token /* = L"[UNK]" */, + const wstring& pad_token /* = L"[PAD]" */, + const wstring& cls_token /* = L"[CLS]" */, + const wstring& mask_token /* = L"[MASK]" */, + const wstring& sep_token /* = L"[SEP]" */, + const string& padding_site /* = "right" */) + : do_lower_case_(do_lower_case), + unk_token_(unk_token), + pad_token_(pad_token), + cls_token_(cls_token), + mask_token_(mask_token), + sep_token_(sep_token), + padding_site_(padding_site), + vocab_(vocab), + basic_tokenizer_(do_lower_case_), + word_piece_tokenizer_(vocab_, unk_token) { + unk_token_id_ = vocab_->at(unk_token_); + pad_token_id_ = vocab_->at(pad_token_); + cls_token_id_ = vocab_->at(cls_token_); + mask_token_id_ = vocab_->at(mask_token_); + sep_token_id_ = vocab_->at(sep_token_); + + all_special_tokens_ = vector( + {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); + all_special_token_ids_ = + unordered_set({unk_token_id_, pad_token_id_, cls_token_id_, + mask_token_id_, sep_token_id_}); +} + +void BertTokenizer::Tokenize(const string& text, + vector* split_token_ids) const { + std::vector tmp_tokens; + basic_tokenizer_.Tokenize(text, &tmp_tokens); + if (tmp_tokens.empty()) return; + split_token_ids->reserve(tmp_tokens.size()); + for (auto& w_token : tmp_tokens) { + const auto& vec_size = w_token.size(); + if (vec_size == 1) { + if (IsChineseChar(w_token[0])) { + auto vocab_it = vocab_->find(w_token); + if (vocab_it != vocab_->end()) { + split_token_ids->emplace_back(std::move(vocab_it->second)); + } else { + 
split_token_ids->emplace_back(std::move(unk_token_id_)); + } + } else { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } + } else if (vec_size > 1) { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } else { + continue; + } + } +} + +void BertTokenizer::BuildInputsWithSpecialTokens( + vector* inputs, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + inputs->clear(); + inputs->resize(token_ids_0.size() + 2); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } else { + inputs->clear(); + inputs->resize(token_ids_0.size() + token_ids_1.size() + 3); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + ++i; + for (auto& token_id : token_ids_1) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } +} + +int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const { + if (pair) { + return 3; + } else { + return 2; + } +} + +void BertTokenizer::CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + vector tmp(token_ids_0.size() + 2, 0); + token_type_ids->swap(tmp); + } else { + vector tmp(token_ids_0.size() + token_ids_1.size() + 3, 0); + for (size_t i = token_ids_0.size() + 2; i < tmp.size(); i++) { + tmp[i] = 1; + } + token_type_ids->swap(tmp); + } +} + +void BertTokenizer::TruncateSequence( + vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove /* = 0 */, + const size_t stride /* = 0 */) const { + for (size_t i = 0; i < num_tokens_to_remove; i++) { + if ((pair_ids->size() == 0) || (ids->size() > pair_ids->size())) { + ids->pop_back(); + } else { + pair_ids->pop_back(); + } + } +} + +int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; } + +int BertTokenizer::Encode( + unordered_map>* encoded_inputs, const string& text, + const string& text_pair /* = "" */, bool is_split_into_words /* = false */, + const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + vector ids; + vector pair_ids; + if (!is_split_into_words) { + Tokenize(text, &ids); + if (ids.empty()) return 0; + if (text_pair != "") { + Tokenize(text_pair, &pair_ids); + if (pair_ids.empty()) return 0; + } + } else { + std::wstring unicode_text; + bool status_a = framework::ConvertStrToWstr(text, &unicode_text); + if (!status_a) { + return 0; + } + for (size_t i = 0; i < unicode_text.size(); i++) { + wstring token = unicode_text.substr(i, 1); + auto it = vocab_->find(token); + if (it != vocab_->end()) { + ids.emplace_back(std::move(it->second)); + } else { + ids.emplace_back(std::move(unk_token_id_)); + } + } + } + + bool pair = false; + if (pair_ids.size() != 0) { + pair = true; + } + + size_t len_ids = ids.size(); + size_t len_pair_ids = pair_ids.size(); + + // Truncation: Handle max sequence length + // If max_seq_len == 0, then do nothing and keep the real length. + // If max_seq_len > 0 and + // all the input sequence len is over the max_seq_len, + // then we truncate it. 
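+  // Worked example (illustrative): with 7 ids, 5 pair ids and
+  // max_seq_len = 8, total_len = 7 + 5 + 3 = 15, so 15 - 8 = 7 tokens are
+  // removed, each taken from whichever sequence is currently longer
+  // (the pair side on ties), leaving 3 ids and 2 pair ids.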
+ size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair); + if (max_seq_len > 0 && total_len > max_seq_len) { + TruncateSequence(&ids, &pair_ids, total_len - max_seq_len); + } + + // Add special tokens + vector sequence; + BuildInputsWithSpecialTokens(&sequence, ids, pair_ids); + size_t seq_len = sequence.size(); + vector token_type_ids; + CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids); + + // Build output dictionnary + encoded_inputs->emplace("input_ids", sequence); + encoded_inputs->emplace("token_type_ids", token_type_ids); + // Check lengths + if (max_seq_len > 0 && seq_len > max_seq_len) { + VLOG(3) << "There is something wrong with the input sequence length." + " Please check it."; + // Failed. + return 0; + } + + // Padding + bool needs_to_be_padded = false; + if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) { + needs_to_be_padded = true; + } + + if (needs_to_be_padded) { + int64_t difference = max_seq_len - seq_len; + size_t pad_start = max_seq_len - 1 - difference; + encoded_inputs->at("token_type_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("token_type_ids")[i] = pad_token_id_; + } + + encoded_inputs->at("input_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("input_ids")[i] = pad_token_id_; + } + } + return 1; +} + +void BertTokenizer::BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair /* = vector() */, + bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + bool has_text_pair = false; + if (batch_text_pair.size() != 0) { + has_text_pair = true; + } + + size_t batch_size = batch_text.size(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < batch_size; i++) { + unordered_map> res; + if (has_text_pair) { + auto status = + Encode(&res, batch_text[i], batch_text_pair[i], is_split_into_words, + max_seq_len, pad_to_max_seq_len); + if (!status) { + res["input_ids"] = + std::vector{cls_token_id_, sep_token_id_, cls_token_id_}; + res["token_type_ids"] = std::vector{0, 0, 1}; + } + } else { + auto status = Encode(&res, batch_text[i], {}, is_split_into_words, + max_seq_len, pad_to_max_seq_len); + + if (!status) { + res["input_ids"] = std::vector{cls_token_id_, sep_token_id_}; + res["token_type_ids"] = std::vector{0, 0}; + } + } + batch_encode_inputs->at(i) = std::move(res); + } +} + +class FasterTokenizerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Text"), "Input", "Text", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasInput("Vocab"), "Input", "Vocab", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("InputIds"), "Output", "InputIds", + "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("SegmentIds"), "Output", "SegmentIds", + "Tokenizer"); + + ctx->SetOutputDim("InputIds", {-1, -1}); + ctx->SetOutputDim("SegmentIds", {-1, -1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::INT64, + paddle::platform::CPUPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& 
expected_kernel_type) const override {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   expected_kernel_type.place_,
+                                   tensor.layout());
+  }
+};
+
+class FasterTokenizerOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Vocab",
+             "(std::map<std::wstring, std::int64_t>), The vocab to map "
+             "token strings to token ids.");
+    AddInput("Text",
+             "(std::vector<std::string>), The sequence to be processed. "
+             "One sequence is a string, a list of strings, "
+             "or a list of integers, depending on whether it "
+             "has been pretokenized and converted to ids. ");
+    AddInput("TextPair",
+             "(std::vector<std::string>), Same as the `Text` input, "
+             "but it represents the second sequence of the "
+             "sequence pair.")
+        .AsDispensable();
+    AddOutput("InputIds", "(Tensor), The token ids of the input text.");
+    AddOutput("SegmentIds", "(Tensor), The segment ids of the input text.");
+    AddAttr<bool>(
+        "do_lower_case",
+        "(bool), Whether or not to lowercase the input when tokenizing.")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "is_split_into_words",
+        "(bool), Whether or not the input is already pre-tokenized "
+        "(e.g., split into words). If set to True, the tokenizer "
+        "assumes the input is already split into words (for instance, "
+        "by splitting it on whitespace) which it will tokenize. This "
+        "is useful for NER or token classification.")
+        .SetDefault(false);
+    AddAttr<int>("max_seq_len",
+                 "(int), If set to a positive number, will limit the "
+                 "total sequence returned so that it has a maximum length."
+                 " If there are overflowing tokens, those overflowing "
+                 "tokens will be added to the returned dictionary when "
+                 "`return_overflowing_tokens` is `True`.")
+        .SetDefault(0);
+    AddAttr<bool>("pad_to_max_seq_len",
+                  "(bool), If set to `True`, the returned sequences would be"
+                  " padded up to `max_seq_len` specified length according to"
+                  " padding side and padding token id.")
+        .SetDefault(false);
+    AddComment(R"DOC(Performs tokenization and uses the tokenized tokens to
+prepare model inputs. It supports sequence or sequence pair as input,
+and batch input is not allowed.)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(faster_tokenizer, ops::FasterTokenizerOp,
+                  ops::FasterTokenizerOpMaker);
+
+REGISTER_OP_CPU_KERNEL(faster_tokenizer, ops::FasterTokenizerKernel<int64_t>);
diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h
new file mode 100644
index 0000000000..5218b7c2ea
--- /dev/null
+++ b/paddle/fluid/operators/string/faster_tokenizer_op.h
@@ -0,0 +1,195 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" + +namespace paddle { +namespace operators { + +using std::endl; +using std::int64_t; +using std::size_t; +using std::string; +using std::shared_ptr; +using std::vector; +using std::unordered_map; +using std::unordered_set; +using std::vector; +using std::wstring; +using std::wcout; + +inline bool IsControl(const wchar_t& ch); +inline bool IsChineseChar(const wchar_t& ch); +inline bool IsWhiteSpace(const wchar_t& ch); + +using Vocab = unordered_map; +using InvVocab = unordered_map; + +class BasicTokenizer { + public: + explicit BasicTokenizer(bool do_lower_case = true); + void Tokenize(const string& text, vector* res) const; + + private: + wchar_t do_lower_case(wchar_t ch) const; + + bool do_lower_case_; +}; + +class WordPieceTokenizer { + public: + explicit WordPieceTokenizer(const framework::Vocab* vocab, + const wstring& unk_token = L"[UNK]", + const size_t max_input_chars_per_word = 100); + void Tokenize(const wstring& text, vector* output) const; + + private: + const framework::Vocab* vocab_; + wstring unk_token_{L"[UNK]"}; + int64_t unk_token_id_; + size_t max_input_chars_per_word_; +}; + +class BertTokenizer { + public: + explicit BertTokenizer(const framework::Vocab* vocab, + bool do_lower_case = false, + const wstring& unk_token = L"[UNK]", + const wstring& pad_token = L"[PAD]", + const wstring& cls_token = L"[CLS]", + const wstring& mask_token = L"[MASK]", + const wstring& sep_token = L"[SEP]", + const string& padding_site = "right"); + + void Tokenize(const string& text, vector* split_tokens) const; + void BuildInputsWithSpecialTokens( + vector* res, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void TruncateSequence(vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove = 0, + const size_t stride = 0) const; + int64_t GetNumSpecialTokensToAdd(const bool pair = false) const; + int Encode(unordered_map>* encoded_inputs, + const string& text, const string& text_pair = "", + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + void BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair = vector(), + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + + int64_t GetPadTokenID() const; + + private: + bool do_lower_case_; + wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; + string padding_site_; + const framework::Vocab* vocab_; + BasicTokenizer basic_tokenizer_; + WordPieceTokenizer word_piece_tokenizer_; + int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, + sep_token_id_; + vector all_special_tokens_; + unordered_set all_special_token_ids_; + InvVocab inv_vocab_; +}; + +template +class FasterTokenizerKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* text = ctx.Input("Text"); + auto* vocab = ctx.Input("Vocab"); + + auto* input_ids = ctx.Output("InputIds"); + auto* seg_ids = ctx.Output("SegmentIds"); + + auto do_lower_case = static_cast(ctx.Attr("do_lower_case")); + auto is_split_into_words = + static_cast(ctx.Attr("is_split_into_words")); + auto max_seq_len = 
static_cast(ctx.Attr("max_seq_len")); + auto pad_to_max_seq_len = + static_cast(ctx.Attr("pad_to_max_seq_len")); + + auto* text_pair = ctx.Input("TextPair"); + if (text_pair && text->size() != text_pair->size()) { + VLOG(3) << "The input text(list[str]) and text pair (list[str]) must" + << "be the same number of text sequence. Please check the input!"; + return; + } + + BertTokenizer tokenizer(vocab, do_lower_case); + size_t batch_max_seq_len = 0; + size_t batch_size = text->size(); + + vector>> batch_encode_inputs( + batch_size); + if (text_pair) { + tokenizer.BatchEncode(&batch_encode_inputs, *text, *text_pair, + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } else { + tokenizer.BatchEncode(&batch_encode_inputs, *text, vector(), + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } + + for (size_t i = 0; i < batch_size; ++i) { + size_t seq_len = batch_encode_inputs[i]["input_ids"].size(); + if (seq_len > batch_max_seq_len) { + batch_max_seq_len = seq_len; + } + } + + input_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* input_ids_data = input_ids->mutable_data(ctx.GetPlace()); + seg_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); + + auto pad_token_id = tokenizer.GetPadTokenID(); + for (size_t i = 0; i < batch_size; i++) { + auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; + auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; + const size_t& seq_len = encoder_input_ids.size(); + // Copy the memory + std::memcpy(input_ids_data + i * batch_max_seq_len, + encoder_input_ids.data(), seq_len * sizeof(T)); + std::memcpy(seg_ids_data + i * batch_max_seq_len, encoder_seg_ids.data(), + seq_len * sizeof(T)); + std::memset(input_ids_data + i * batch_max_seq_len + seq_len, + pad_token_id, (batch_max_seq_len - seq_len) * sizeof(T)); + std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, pad_token_id, + (batch_max_seq_len - seq_len) * sizeof(T)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/string/unity_build_rule.cmake b/paddle/fluid/operators/string/unity_build_rule.cmake new file mode 100644 index 0000000000..a4b209d2df --- /dev/null +++ b/paddle/fluid/operators/string/unity_build_rule.cmake @@ -0,0 +1,8 @@ +# This file records the Unity Build compilation rules. +# The source files in a `register_unity_group` called are compiled in a unity +# file. +# Generally, the combination rules in this file do not need to be modified. +# If there are some redefined error in compiling with the source file which +# in combination rule, you can remove the source file from the following rules. 
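+#
+# Example (illustrative): a second string op added to this directory would
+# extend the group below, e.g.
+#   register_unity_group(cc faster_tokenizer_op.cc another_string_op.cc)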
+register_unity_group(cc + faster_tokenizer_op.cc) \ No newline at end of file diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index f94afaa56b..8b01f02ee2 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1875,6 +1875,12 @@ void BindImperative(py::module *m_ptr) { } else if (self.Var().IsType()) { return framework::vectorize( self.Var().Get().value().dims()); + } else if (self.Var().IsType()) { + return std::vector{static_cast( + self.Var().Get().size())}; + } else if (self.Var().IsType()) { + return std::vector{ + static_cast(self.Var().Get().size())}; } else { VLOG(2) << "It is meaningless to get shape of " "variable type " diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 1db769e828..1ecc5fea85 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -185,6 +185,18 @@ void ZeroCopyTensorCreate( tensor.copy_from_cpu(static_cast(data.data())); } +/// \brief Experimental interface. +/// Create the Strings tensor from data. +/// \param tensor The tensor will be created and +/// the tensor value is same as data. +/// \param data The input text. +void ZeroCopyStringTensorCreate(ZeroCopyTensor &tensor, // NOLINT + const paddle_infer::Strings *data) { + size_t shape = data->size(); + tensor.ReshapeStrings(shape); + tensor.copy_strings_from_cpu(data); +} + template void PaddleInferTensorCreate( paddle_infer::Tensor &tensor, // NOLINT @@ -195,6 +207,19 @@ void PaddleInferTensorCreate( tensor.CopyFromCpu(static_cast(data.data())); } +/// \brief Experimental interface. +/// Create the Strings tensor from data. +/// \param tensor The tensor will be created and +/// the tensor value is same as data. +/// \param data The input text. 
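+/// \note Illustrative Python-side usage through the "copy_from_cpu" binding:
+/// input_handle.copy_from_cpu(["hello", "world"]); the tensor is reshaped
+/// to the number of strings before the copy.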
+void PaddleInferStringTensorCreate(paddle_infer::Tensor &tensor, // NOLINT + const paddle_infer::Strings *data) { + VLOG(3) << "Create PaddleInferTensor, dtype = Strings "; + size_t shape = data->size(); + tensor.ReshapeStrings(shape); + tensor.CopyStringsFromCpu(data); +} + size_t PaddleGetDTypeSize(PaddleDType dt) { size_t size{0}; switch (dt) { @@ -726,11 +751,15 @@ void BindPaddleInferPredictor(py::module *m) { void BindZeroCopyTensor(py::module *m) { py::class_(*m, "ZeroCopyTensor") - .def("reshape", &ZeroCopyTensor::Reshape) + .def("reshape", py::overload_cast &>( + &ZeroCopyTensor::Reshape)) + .def("reshape", py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) + .def("copy_from_cpu", &ZeroCopyStringTensorCreate) .def("copy_to_cpu", &ZeroCopyTensorToNumpy) .def("shape", &ZeroCopyTensor::shape) .def("set_lod", &ZeroCopyTensor::SetLoD) @@ -740,12 +769,16 @@ void BindZeroCopyTensor(py::module *m) { void BindPaddleInferTensor(py::module *m) { py::class_(*m, "PaddleInferTensor") - .def("reshape", &paddle_infer::Tensor::Reshape) + .def("reshape", py::overload_cast &>( + &paddle_infer::Tensor::Reshape)) + .def("reshape", py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferStringTensorCreate) .def("copy_to_cpu", &PaddleInferTensorToNumpy) .def("shape", &paddle_infer::Tensor::shape) .def("set_lod", &paddle_infer::Tensor::SetLoD) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 569b6b3319..e67c1c98aa 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -72,6 +72,7 @@ std::map> op_ins_map = { {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, + {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, {"fused_feedforward", {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 99607d7f97..984f3d1a31 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -227,7 +227,10 @@ void BindVarDsec(pybind11::module *m) { .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) - .value("RAW", pd::proto::VarType::RAW); + .value("RAW", pd::proto::VarType::RAW) + .value("STRING", pd::proto::VarType::STRING) + .value("STRINGS", pd::proto::VarType::STRINGS) + .value("VOCAB", pd::proto::VarType::VOCAB); } void BindOpDesc(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a16916ab33..d4f4323238 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1213,6 +1213,18 @@ All parameter, weight, gradient are variables in Paddle. 
[](Variable &self) { return py::bytes(*self.GetMutable()); }) + .def("set_string_list", + [](Variable &self, Strings str_list) { + *self.GetMutable() = str_list; + }) + .def("set_vocab", [](Variable &self, + Vocab vocab) { *self.GetMutable() = vocab; }) + .def("get_string_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_map_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) .def("get_lod_rank_table", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) @@ -1846,20 +1858,20 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "Operator") - .def_static("create", - [](py::bytes protobin) { - proto::OpDesc desc; - PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), - true, - platform::errors::InvalidArgument( - "Cannot parse user input to OpDesc")); - PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, - platform::errors::InvalidArgument( - "The provided OpDesc is not " - "initialized, the reason is: %s", - desc.InitializationErrorString())); - return OpRegistry::CreateOp(desc); - }) + .def_static( + "create", + [](py::bytes protobin) { + proto::OpDesc desc; + PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true, + platform::errors::InvalidArgument( + "Cannot parse user input to OpDesc")); + PADDLE_ENFORCE_EQ( + desc.IsInitialized(), true, + platform::errors::InvalidArgument( + "The provided OpDesc is not initialized, the reason is: %s", + desc.InitializationErrorString())); + return OpRegistry::CreateOp(desc); + }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { @@ -2113,7 +2125,12 @@ All parameter, weight, gradient are variables in Paddle. }); #endif - m.def("set_feed_variable", framework::SetFeedVariable); + m.def("set_feed_variable", + static_cast(&framework::SetFeedVariable)); + m.def("set_feed_variable", + static_cast(&framework::SetFeedVariable)); m.def("get_fetch_variable", [](const Scope &scope, const std::string &var_name, size_t index) -> py::object { diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index d41c373bf5..2db9fb5d76 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -799,12 +799,17 @@ def save(layer, path, input_spec=None, **configs): # 3. 
share parameters from Layer to scope & record var info
         for param_or_buffer in concrete_program.parameters:
             # share to scope
-            param_or_buffer_tensor = scope.var(
-                param_or_buffer.name).get_tensor()
-            #src_tensor = param_or_buffer.value().get_tensor()
-            src_tensor = state_var_dict[param_or_buffer.name].value(
-            ).get_tensor()
-            param_or_buffer_tensor._share_data_with(src_tensor)
+            if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
+                src_tensor = param_or_buffer.value().get_map_tensor()
+                tgt_var = scope.var(param_or_buffer.name)
+                tgt_var.set_vocab(src_tensor)
+            else:
+                param_or_buffer_tensor = scope.var(
+                    param_or_buffer.name).get_tensor()
+                #src_tensor = param_or_buffer.value().get_tensor()
+                src_tensor = state_var_dict[param_or_buffer.name].value(
+                ).get_tensor()
+                param_or_buffer_tensor._share_data_with(src_tensor)
             # record var info
             if param_or_buffer.name not in extra_var_info:
                 extra_info_dict = dict()
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 30d5ee4417..65fa9bc5a6 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -1409,13 +1409,22 @@ class Layer(core.Layer):
             if state is None:
                 raise ValueError("{} is not found in the provided dict.".format(
                     key))
-            state_shape = state.shape() if inspect.ismethod(
-                state.shape) else state.shape
-            if list(state_shape) != list(param.shape):
-                raise ValueError(
-                    "{} receives a shape {}, but the expected shape is {}.".
-                    format(key, list(state_shape), list(param.shape)))
-            return param, state
+            if (isinstance(state, dict) or isinstance(state, list)):
+                if (len(state) != len(param)):
+                    raise ValueError("{} receives the length of {}, "
+                                     "but the expected length is {}".format(
+                                         key, len(state), len(param)))
+                else:
+                    return param, state
+            else:
+                state_shape = state.shape() if inspect.ismethod(
+                    state.shape) else state.shape
+
+                if list(state_shape) != list(param.shape):
+                    raise ValueError(
+                        "{} receives a shape {}, but the expected shape is {}.".
+ format(key, list(state_shape), list(param.shape))) + return param, state matched_param_state = [] for key, param in self.state_dict().items(): diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index b92e54d486..3731976ad1 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -133,7 +133,12 @@ def monkey_patch_math_varbase(): return int(var.numpy().flatten()[0]) def _len_(var): - return var.shape[0] + if var.type == core.VarDesc.VarType.VOCAB: + return len(var.value().get_map_tensor()) + elif var.type == core.VarDesc.VarType.STRINGS: + return len(var.value().get_string_tensor()) + else: + return var.shape[0] def _index_(var): numel = np.prod(var.shape) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9d8b1500d5..e2fd36448b 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -146,25 +146,35 @@ def monkey_patch_varbase(): out = linear(t) # call with different weight """ - assert isinstance(value, (np.ndarray, core.VarBase)), \ - "Variable set_value function, arguments type only support Variable, numpy, VarBase" - - value_np = value - if isinstance(value, core.VarBase): - value_np = value.numpy() + assert isinstance(value, (np.ndarray, core.VarBase, dict, str)), \ + "Variable set_value function, arguments type only support Variable, numpy, VarBase, dict, string." + + if isinstance(value, (dict, str)): + assert len(self) == len( + value + ), "Variable length not match, Variable [ {} ] need tensor with length {} but load set tensor with length {}".format( + self.name, len(self), len(value)) + if isinstance(value, dict): + self.value().set_vocab(value) + else: + self.value().set_string_list(value) + else: + value_np = value + if isinstance(value, core.VarBase): + value_np = value.numpy() - self_tensor_np = self.numpy() + self_tensor_np = self.numpy() - assert self_tensor_np.shape == value_np.shape, \ - "Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format( - self.name, self_tensor_np.shape, value_np.shape) + assert self_tensor_np.shape == value_np.shape, \ + "Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format( + self.name, self_tensor_np.shape, value_np.shape) - assert self_tensor_np.dtype == value_np.dtype, \ - "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - self.name, self_tensor_np.dtype, value_np.dtype) + assert self_tensor_np.dtype == value_np.dtype, \ + "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + self.name, self_tensor_np.dtype, value_np.dtype) - self.value().get_tensor().set(value_np, - framework._current_expected_place()) + self.value().get_tensor().set(value_np, + framework._current_expected_place()) @framework.dygraph_only def backward(self, grad_tensor=None, retain_graph=False): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8cd8bc3994..99009a4743 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -786,9 +786,11 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] var = global_block.var(feed_target_name) - if not isinstance(cur_feed, core.LoDTensor): - 
cur_feed = _as_lodtensor(cur_feed, self.place, var.dtype) - check_feed_shape_type(var, cur_feed) + if var.dtype != core.VarDesc.VarType.STRINGS: + if not isinstance(cur_feed, core.LoDTensor): + cur_feed = _as_lodtensor(cur_feed, self.place, + var.dtype) + check_feed_shape_type(var, cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d93b407c1f..d2f8862266 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -901,6 +901,10 @@ class Variable(object): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) + if dtype == core.VarDesc.VarType.STRINGS: + type = core.VarDesc.VarType.STRINGS + lod_level = None + self.belong_to_optimizer = belong_to_optimizer self.error_clip = error_clip diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py index 2c1b2c7750..6576ca785b 100644 --- a/python/paddle/fluid/inference/wrapper.py +++ b/python/paddle/fluid/inference/wrapper.py @@ -29,10 +29,14 @@ def tensor_copy_from_cpu(self, data): ''' Support input type check based on tensor.copy_from_cpu. ''' - if not isinstance(data, np.ndarray): + if isinstance(data, np.ndarray) or (isinstance(data, list) and + len(data) > 0 and + isinstance(data[0], str)): + self.copy_from_cpu_bind(data) + else: raise TypeError( - "In copy_from_cpu, we only support numpy ndarray data type.") - self.copy_from_cpu_bind(data) + "In copy_from_cpu, we only support numpy ndarray and list[str] data type." + ) Tensor.copy_from_cpu = tensor_copy_from_cpu diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py new file mode 100755 index 0000000000..496f3505ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -0,0 +1,393 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import io +import os +import unittest + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.dataset.common import DATA_HOME +from paddle.fluid.framework import core, in_dygraph_mode +from paddle.fluid.layer_helper import LayerHelper + +import sys +sys.path.append("./tokenizer") +from tokenizer.bert_tokenizer import BertTokenizer + + +def to_string_tensor(string_values, name): + """ + Create the tensor that the value holds the list of string. + NOTICE: The value will be holded in the cpu place. + + Args: + string_values(list[string]): The value will be setted to the tensor. + name(string): The name of the tensor. 
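+
+    Example (illustrative):
+        text_tensor = to_string_tensor(["hello", "world"], "text")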
+ """ + tensor = paddle.Tensor(core.VarDesc.VarType.STRING, [], name, + core.VarDesc.VarType.STRINGS, False) + tensor.value().set_string_list(string_values) + return tensor + + +def to_map_tensor(string_dict, name): + """ + Create the tensor that the value holds the map, the type of key is the string + and the value is the int. + NOTICE: The value will be holded in the cpu place. + + Args: + string_dict(dict): The value will be setted to the tensor. + name(string): The name of the tensor. + """ + tensor = paddle.Tensor(core.VarDesc.VarType.RAW, [], name, + core.VarDesc.VarType.VOCAB, True) + tensor.value().set_vocab(string_dict) + return tensor + + +class FasterTokenizer(nn.Layer): + def __init__(self, vocab_dict): + super(FasterTokenizer, self).__init__() + vocab_tensor = to_map_tensor(vocab_dict, "vocab") + self.register_buffer("vocab", vocab_tensor, persistable=True) + + def forward(self, + text, + text_pair=None, + do_lower_case=True, + max_seq_len=-1, + is_split_into_words=False, + pad_to_max_seq_len=False): + if in_dygraph_mode(): + input_ids, seg_ids = core.ops.faster_tokenizer( + self.vocab, text, text_pair, "do_lower_case", do_lower_case, + "max_seq_len", max_seq_len, "pad_to_max_seq_len", + pad_to_max_seq_len, "is_split_into_words", is_split_into_words) + return input_ids, seg_ids + + attrs = { + "do_lower_case": do_lower_case, + "max_seq_len": max_seq_len, + "pad_to_max_seq_len": pad_to_max_seq_len, + "is_split_into_words": is_split_into_words, + } + helper = LayerHelper("faster_tokenizer") + input_ids = helper.create_variable_for_type_inference(dtype="int64") + seg_ids = helper.create_variable_for_type_inference(dtype="int64") + if text_pair is None: + helper.append_op( + type='faster_tokenizer', + inputs={'Vocab': self.vocab, + 'Text': text}, + outputs={'InputIds': input_ids, + 'SegmentIds': seg_ids}, + attrs=attrs) + else: + helper.append_op( + type='faster_tokenizer', + inputs={ + 'Vocab': self.vocab, + 'Text': text, + 'TextPair': text_pair + }, + outputs={'InputIds': input_ids, + 'SegmentIds': seg_ids}, + attrs=attrs) + return input_ids, seg_ids + + +class Predictor(object): + def __init__(self, model_dir): + model_file = os.path.join(model_dir, "inference.pdmodel") + params_file = os.path.join(model_dir, "inference.pdiparams") + if not os.path.exists(model_file): + raise ValueError("not find model file path {}".format(model_file)) + if not os.path.exists(params_file): + raise ValueError("not find params file path {}".format(params_file)) + config = paddle.inference.Config(model_file, params_file) + + # fast_tokenizer op only support cpu. 
+ config.disable_gpu() + config.set_cpu_math_library_num_threads(10) + + config.switch_use_feed_fetch_ops(False) + self.predictor = paddle.inference.create_predictor(config) + self.input_handles = [ + self.predictor.get_input_handle(name) + for name in self.predictor.get_input_names() + ] + self.output_handles = [ + self.predictor.get_output_handle(name) + for name in self.predictor.get_output_names() + ] + + def predict(self, data): + + self.input_handles[0].copy_from_cpu(data) + self.predictor.run() + input_ids = self.output_handles[0].copy_to_cpu() + token_type_ids = self.output_handles[1].copy_to_cpu() + return input_ids, token_type_ids + + +class TestBertTokenizerOp(unittest.TestCase): + def setUp(self): + self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") + self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) + self.init_data() + self.save_path = os.path.join(DATA_HOME, "fast_tokenizer") + self.param_path = os.path.join(self.save_path, "model.pdparams") + self.inference_path = os.path.join(self.save_path, "inference") + + def init_data(self): + self.text = [ + '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。' + '酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,' + '还算丰富。 服务吗,一般' + ] + self.text_pair = ['非常不错,服务很好,位于市中心区,交通方便,不过价格也高!'] + self.text_tensor = to_string_tensor(self.text, "text") + self.text_pair_tensor = to_string_tensor(self.text_pair, "text_pair") + self.texts = [ + '很好的地理位置,一蹋糊涂的服务,萧条的酒店。', + ' 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,' + '但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般', + 'Test bert tokenizer. The first text.' + ] + self.text_pairs = [ + '非常不错,服务很好,位于市中心区,交通方便,不过价格也高!', '房间太小。其他的都一般。。。。。。。。。', + 'Test bert tokenizer. The second text.' + ] + self.texts_tensor = to_string_tensor(self.texts, "texts") + self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs") + + def test_padding(self): + + self.max_seq_len = 128 + self.pad_to_max_seq_len = True + self.is_split_into_words = False + + # case 1: only one text (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + text=self.text, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 2: only one text and one text_pair (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + text_pair=self.text_pair_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + text=self.text, + text_pair=self.text_pair, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + 
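+        # Note: the ids are integers, so np.allclose with rtol=0, atol=0.01
+        # below effectively asserts exact agreement between the C++ op and
+        # the Python reference tokenizer.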
py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 3: only texts (batch_size = 3) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.texts_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.texts, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = [i["input_ids"] for i in encoded_inputs] + py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs] + py_input_ids = np.array(py_input_ids).reshape([3, -1]) + py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 4: texts and text pairs (batch_size = 3) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.texts_tensor, + text_pair=self.text_pairs_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.texts, + self.text_pairs, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = [i["input_ids"] for i in encoded_inputs] + py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs] + py_input_ids = np.array(py_input_ids).reshape([3, -1]) + py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_no_padding(self): + self.max_seq_len = 128 + self.pad_to_max_seq_len = False + self.is_split_into_words = False + + # case 1: only one text (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.text, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 2: only one text and one text_pair (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + self.text_tensor, + self.text_pair_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + 
max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.text, + self.text_pair, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_is_split_into_words(self): + self.is_split_into_words = True + + input_ids, token_type_ids = self.faster_tokenizer( + self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + encoded_inputs = self.bert_tokenizer( + list(self.text[0]), is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs["token_type_ids"]).reshape( + [1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_inference(self): + if not os.path.exists(self.save_path): + os.makedirs(self.save_path, exist_ok=True) + paddle.save(self.faster_tokenizer.state_dict(), self.param_path) + state_dict = paddle.load(self.param_path) + self.faster_tokenizer.set_dict(state_dict) + + static_model = paddle.jit.to_static( + self.faster_tokenizer, + input_spec=[ + paddle.static.InputSpec( + shape=[None], dtype=core.VarDesc.VarType.STRINGS), # texts + ]) + # Save in static graph model. + paddle.jit.save(static_model, self.inference_path) + predictor = Predictor(self.save_path) + input_ids, token_type_ids = predictor.predict(self.text) + + encoded_inputs = self.bert_tokenizer(self.text) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_feed_string_var(self): + paddle.enable_static() + x = paddle.static.data( + name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS) + exe = paddle.static.Executor(paddle.framework.CPUPlace()) + exe.run(paddle.static.default_main_program(), feed={'x': self.text}) + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/tokenizer/__init__.py b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py new file mode 100644 index 0000000000..b9a7651e44 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py new file mode 100755 index 0000000000..00d5f4e772 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py @@ -0,0 +1,517 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import io +import json +import os +import six +import unicodedata + +from tokenizer_utils import PretrainedTokenizer +from tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation + + +class BasicTokenizer(object): + """ + Runs basic tokenization (punctuation splitting, lower casing, etc.). + Args: + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to `True`. + """ + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer.""" + + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """ + Tokenizes a piece of text using basic tokenizer. + Args: + text (str): A piece of text. + Returns: + list(str): A list of tokens. + Examples: + .. code-block:: + from paddlenlp.transformers import BasicTokenizer + basictokenizer = BasicTokenizer() + tokens = basictokenizer.tokenize('He was a puppeteer') + ''' + ['he', 'was', 'a', 'puppeteer'] + ''' + """ + + text = convert_to_unicode(text) + text = self._clean_text(text) + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """ + Strips accents from a piece of text. + """ + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """ + Splits punctuation on a piece of text. 
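+        For example (illustrative): "hello, world" is split into
+        ["hello", ",", " world"].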
+ """ + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """ + Adds whitespace around any CJK character. + """ + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """ + Checks whether CP is the codepoint of a CJK character. + """ + + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """ + Performs invalid character removal and whitespace cleanup on text. + """ + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """ + Runs WordPiece tokenization. + Args: + vocab (Vocab|dict): + Vocab of the word piece tokenizer. + unk_token (str): + A specific token to replace all unknown tokens. + max_input_chars_per_word (int): + If a word's length is more than + max_input_chars_per_word, it will be dealt as unknown word. + Defaults to 100. + """ + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + Returns: + list (str): A list of wordpiece tokens. + Examples: + .. 
code-block:: + from paddlenlp.transformers import BertTokenizer, WordpieceTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + vocab = berttokenizer.vocab + unk_token = berttokenizer.unk_token + wordpiecetokenizer = WordpieceTokenizer(vocab,unk_token) + inputs = wordpiecetokenizer.tokenize("unaffable") + print(inputs) + ''' + ["un", "##aff", "##able"] + ''' + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +class BertTokenizer(PretrainedTokenizer): + """ + Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + Examples: + .. 
code-block:: + from paddlenlp.transformers import BertTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + inputs = berttokenizer.tokenize('He was a puppeteer') + print(inputs) + ''' + {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "bert-base-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt", + "bert-large-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "bert-wwm-chinese": + "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt", + "bert-wwm-ext-chinese": + "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt", + "macbert-large-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "macbert-base-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "simbert-base-chinese": + "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt", + } + } + pretrained_init_configuration = { + "bert-base-uncased": { + "do_lower_case": True + }, + "bert-large-uncased": { + "do_lower_case": True + }, + "bert-base-cased": { + "do_lower_case": False + }, + "bert-large-cased": { + "do_lower_case": False + }, + "bert-base-multilingual-uncased": { + "do_lower_case": True + }, + "bert-base-multilingual-cased": { + "do_lower_case": False + }, + "bert-base-chinese": { + "do_lower_case": False + }, + "bert-wwm-chinese": { + "do_lower_case": False + }, + "bert-wwm-ext-chinese": { + "do_lower_case": False + }, + "macbert-large-chinese": { + "do_lower_case": False + }, + "macbert-base-chinese": { + "do_lower_case": False + }, + "simbert-base-chinese": { + "do_lower_case": True + }, + } + padding_side = 'right' + + def __init__(self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]"): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + .format(vocab_file)) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_lower_case = do_lower_case + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=unk_token) + self.special_tokens_map = { + 'unk_token': unk_token, + 'sep_token': sep_token, + 'pad_token': pad_token, + 'cls_token': cls_token, + 'mask_token': mask_token + } + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + Returns: + int: The size of vocabulary. 
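+                For instance (illustrative), 21128 entries for the
+                'bert-base-chinese' vocabulary.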
+ """ + + return len(self.vocab) + + def _tokenize(self, text): + """ + End-to-end tokenization for BERT models. + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def tokenize(self, text): + """ + Converts a string to a list of tokens. + Args: + text (str): The text to be tokenized. + + Returns: + List(str): A list of string representing converted tokens. + Examples: + .. code-block:: + from paddlenlp.transformers import BertTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokens = berttokenizer.tokenize('He was a puppeteer') + + ''' + ['he', 'was', 'a', 'puppet', '##eer'] + ''' + """ + + return self._tokenize(text) + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 + if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A BERT sequence has the following format: + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + :: + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + + _sep) * [1] + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. 
+
+    def get_special_tokens_mask(self,
+                                token_ids_0,
+                                token_ids_1=None,
+                                already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``encode`` methods.
+        Args:
+            token_ids_0 (List[int]):
+                A list of `inputs_ids` for the first sequence.
+            token_ids_1 (List[int], optional):
+                Optional second list of IDs for sequence pairs. Defaults to None.
+            already_has_special_tokens (bool, optional): Whether or not the token list is already
+                formatted with special tokens for the model. Defaults to `False`.
+        Returns:
+            List[int]: A list of integers that are either 0 or 1: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return list(
+                map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
+                    token_ids_0))
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + (
+                [0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
diff --git a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py
new file mode 100644
index 0000000000..7da3cd56e2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py
@@ -0,0 +1,1244 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import io
+import json
+import os
+import unicodedata
+from shutil import copyfile
+from typing import Iterable, Iterator, Optional, List, Any, Callable, Union
+
+from paddle.dataset.common import DATA_HOME
+from paddle.utils.download import get_path_from_url
+
+
+def convert_to_unicode(text):
+    """
+    Converts `text` to Unicode (if it's not already), assuming utf-8 input.
+    Args:
+        text (str|bytes): Text to be converted to unicode.
+    Returns:
+        str: converted text.
+    """
+    if isinstance(text, str):
+        return text
+    elif isinstance(text, bytes):
+        return text.decode("utf-8", "ignore")
+    else:
+        raise ValueError("Unsupported string type: %s" % (type(text)))
+
+
+def whitespace_tokenize(text):
+    """
+    Runs basic whitespace cleaning and splitting on a piece of text.
+    Args:
+        text (str): Text to be tokenized.
+    Returns:
+        list(str): Token list.
+    """
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+def _is_whitespace(char):
+    """
+    Checks whether `char` is a whitespace character.
+    """
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
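A quick check of these helpers, assuming tokenizer_utils.py above is importable as a module (the import path is illustrative):

.. code-block::

    from tokenizer_utils import whitespace_tokenize, _is_whitespace

    print(whitespace_tokenize("  He   was a\tpuppeteer\n"))
    # ['He', 'was', 'a', 'puppeteer']
    print(_is_whitespace("\u00a0"))  # True: NO-BREAK SPACE is category Zs
    print(_is_whitespace("\u200b"))  # False: ZERO WIDTH SPACE is category Cf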
+ if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + +def tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + output = [] + buff = "" + for char in text: + cp = ord(char) + if is_chinese_char(cp): + if buff != "": + output.append(buff) + buff = "" + output.append(char) + else: + buff += char + + if buff != "": + output.append(buff) + + return output + + +class PretrainedTokenizer(object): + """ + The base class for all pretrained tokenizers. It mainly provides common methods + for loading (construction and loading) and saving pretrained tokenizers. Loading + and saving also rely on the following class attributes which should be overridden + by derived classes accordingly: + - **tokenizer_config_file** (str): Represents the file name of tokenizer + configuration for configuration saving and loading in local file system. + The value is `tokenizer_config.json`. + - **resource_files_names** (dict): Represents resources to specific file + names mapping for resource saving and loading in local file system. The + keys of dict representing resource items should be argument names in + tokenizer's `__init__` method, and the values are file names for saving + and loading corresponding resources. The mostly used resources here are + vocabulary file and sentence-piece model file. + - **pretrained_init_configuration** (dict): Provides the tokenizer configurations + of built-in pretrained tokenizers (contrasts to tokenizers in local file + system). It has pretrained tokenizer names as keys (the same as pretrained + model names, such as `bert-base-uncased`), and the values are dict preserving + corresponding configuration for tokenizer initialization. + - **pretrained_resource_files_map** (dict): Provides resource URLs of built-in + pretrained tokenizers (contrasts to tokenizers in local file system). 
+
+
+class PretrainedTokenizer(object):
+    """
+    The base class for all pretrained tokenizers. It mainly provides common
+    methods for constructing, loading and saving pretrained tokenizers. Loading
+    and saving also rely on the following class attributes, which should be
+    overridden by derived classes accordingly:
+    - **tokenizer_config_file** (str): Represents the file name of the tokenizer
+      configuration for saving and loading in the local file system.
+      The value is `tokenizer_config.json`.
+    - **resource_files_names** (dict): Maps resource items to specific file
+      names for resource saving and loading in the local file system. The
+      keys of the dict, representing resource items, should be argument names
+      in the tokenizer's `__init__` method, and the values are file names for
+      saving and loading the corresponding resources. The most commonly used
+      resources here are the vocabulary file and the sentence-piece model file.
+    - **pretrained_init_configuration** (dict): Provides the tokenizer configurations
+      of built-in pretrained tokenizers (as opposed to tokenizers in the local file
+      system). It has pretrained tokenizer names as keys (the same as pretrained
+      model names, such as `bert-base-uncased`), and the values are dicts preserving
+      the corresponding configuration for tokenizer initialization.
+    - **pretrained_resource_files_map** (dict): Provides resource URLs of built-in
+      pretrained tokenizers (as opposed to tokenizers in the local file system). It
+      has the same keys as `resource_files_names`, and the values are also `dict`
+      mapping specific pretrained tokenizer names (such as `bert-base-uncased`)
+      to corresponding resource URLs.
+    Moreover, methods common to tokenizers for tokenization, token/id conversion
+    and encoding as model inputs are also provided here.
+    Besides, the metaclass `InitTrackerMeta` is used to create `PretrainedTokenizer`,
+    by which subclasses can track arguments for initialization automatically
+    and expose the special tokens given at initialization as attributes.
+    """
+    tokenizer_config_file = "tokenizer_config.json"
+    pretrained_init_configuration = {}
+    resource_files_names = {}  # keys are arguments of __init__
+    pretrained_resource_files_map = {}
+    padding_side = 'right'
+    pad_token_type_id = 0
+
+    def __call__(self,
+                 text,
+                 text_pair=None,
+                 max_seq_len: Optional[int]=None,
+                 stride=0,
+                 is_split_into_words=False,
+                 pad_to_max_seq_len=False,
+                 truncation_strategy="longest_first",
+                 return_position_ids=False,
+                 return_token_type_ids=True,
+                 return_attention_mask=False,
+                 return_length=False,
+                 return_overflowing_tokens=False,
+                 return_special_tokens_mask=False):
+        """
+        Performs tokenization and uses the tokenized tokens to prepare model
+        inputs. It supports a sequence or a sequence pair as input, and batch
+        input is allowed. `self.encode()` or `self.batch_encode()` would be
+        called separately for single or batch input depending on the input
+        format and the `is_split_into_words` argument.
+        Args:
+            text (str, List[str] or List[List[str]]):
+                The sequence or batch of sequences to be processed. One sequence
+                is a string or a list of strings depending on whether it has been
+                pretokenized. If each sequence is provided as a list of strings
+                (pretokenized), you must set `is_split_into_words` as `True` to
+                disambiguate it from a batch of sequences.
+            text_pair (str, List[str] or List[List[str]], optional):
+                Same as the `text` argument, while it represents the latter
+                sequence of the sequence pair.
+            max_seq_len (int, optional):
+                If set to a number, will limit the total sequence returned so
+                that it has a maximum length. If there are overflowing tokens,
+                those overflowing tokens will be added to the returned dictionary
+                when `return_overflowing_tokens` is `True`. Defaults to `None`.
+            stride (int, optional):
+                Only available for batch input of sequence pairs, and mainly for
+                question answering usage. For QA, `text` represents questions
+                and `text_pair` represents contexts. If `stride` is set to a
+                positive number, the context will be split into multiple spans
+                where `stride` defines the number of (tokenized) tokens to skip
+                from the start of one span to get the next span, thus producing
+                a bigger batch than the inputs to include all spans. Moreover,
+                'overflow_to_sample' and 'offset_mapping', preserving the
+                original example and position information, will be added to the
+                returned dictionary. Defaults to 0.
+            pad_to_max_seq_len (bool, optional):
+                If set to `True`, the returned sequences would be padded up to
+                the length specified by `max_seq_len` according to the padding
+                side (`self.padding_side`) and padding token id. Defaults to `False`.
+            truncation_strategy (str, optional):
+                String selected among the following options:
+                - 'longest_first' (default): Iteratively reduce the input sequences
+                  until the input is under `max_seq_len`, removing one token at a
+                  time from the longest sequence (when there is a pair of input
+                  sequences).
+                - 'only_first': Only truncate the first sequence.
+                - 'only_second': Only truncate the second sequence.
+                - 'do_not_truncate': Do not truncate (raise an error if the input
+                  sequence is longer than `max_seq_len`).
+                Defaults to 'longest_first'.
+            return_position_ids (bool, optional):
+                Whether to include token position ids in the returned dictionary.
+                Defaults to `False`.
+            return_token_type_ids (bool, optional):
+                Whether to include token type ids in the returned dictionary.
+                Defaults to `True`.
+            return_attention_mask (bool, optional):
+                Whether to include the attention mask in the returned dictionary.
+                Defaults to `False`.
+            return_length (bool, optional):
+                Whether to include the length of each encoded input in the
+                returned dictionary. Defaults to `False`.
+            return_overflowing_tokens (bool, optional):
+                Whether to include overflowing token information in the returned
+                dictionary. Defaults to `False`.
+            return_special_tokens_mask (bool, optional):
+                Whether to include special tokens mask information in the returned
+                dictionary. Defaults to `False`.
+        Returns:
+            dict or list[dict] (for batch input):
+                The dict has the following optional items:
+                - **input_ids** (list[int]): List of token ids to be fed to a model.
+                - **position_ids** (list[int], optional): List of token position ids to be
+                  fed to a model. Included when `return_position_ids` is `True`.
+                - **token_type_ids** (list[int], optional): List of token type ids to be
+                  fed to a model. Included when `return_token_type_ids` is `True`.
+                - **attention_mask** (list[int], optional): List of integers valued 0 or 1,
+                  where 0 specifies paddings and should not be attended to by the
+                  model. Included when `return_attention_mask` is `True`.
+                - **seq_len** (int, optional): The input_ids length. Included when `return_length`
+                  is `True`.
+                - **overflowing_tokens** (list[int], optional): List of overflowing tokens.
+                  Included if `max_seq_len` is specified and `return_overflowing_tokens`
+                  is `True`.
+                - **num_truncated_tokens** (int, optional): The number of overflowing tokens.
+                  Included if `max_seq_len` is specified and `return_overflowing_tokens`
+                  is `True`.
+                - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1,
+                  with 1 specifying special added tokens and 0 specifying sequence tokens.
+                  Included when `return_special_tokens_mask` is `True`.
+                - **offset_mapping** (list[int], optional): List of pairs preserving the
+                  index of the start and end char in the original input for each token.
+                  For a special token, the index pair is `(0, 0)`. Included when
+                  `stride` works.
+                - **overflow_to_sample** (int, optional): Index of the example from which this
+                  feature is generated. Included when `stride` works.
+ """ + # Input type checking for clearer error + assert isinstance(text, str) or ( + isinstance(text, (list, tuple)) and (len(text) == 0 or ( + isinstance(text[0], str) or + (isinstance(text[0], (list, tuple)) and + (len(text[0]) == 0 or isinstance(text[0][0], str))))) + ), ("text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples).") + + assert (text_pair is None or isinstance(text_pair, str) or ( + isinstance(text_pair, (list, tuple)) and (len(text_pair) == 0 or ( + isinstance(text_pair[0], str) or + (isinstance(text_pair[0], (list, tuple)) and + (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))))) + )), ( + "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples).") + + is_batched = bool( + (not is_split_into_words and isinstance(text, (list, tuple))) or + (is_split_into_words and isinstance(text, (list, tuple)) and + text and isinstance(text[0], (list, tuple)))) + + if is_batched: + batch_text_or_text_pairs = list(zip( + text, text_pair)) if text_pair is not None else text + return self.batch_encode( + batch_text_or_text_pairs=batch_text_or_text_pairs, + max_seq_len=max_seq_len, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation_strategy="longest_first", + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask) + else: + return self.encode( + text=text, + text_pair=text_pair, + max_seq_len=max_seq_len, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation_strategy="longest_first", + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask) + + @property + def all_special_tokens(self): + """ + list: All the special tokens ('', ''...) corresponding to + special token arguments in `__init__` (arguments end with '_end'). + """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, ( + list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ + list: All the token ids corresponding to all the special tokens. + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + def convert_tokens_to_ids(self, tokens): + """ + Converts a sequence of tokens into ids using the `vocab` attribute (an + instance of `Vocab`). Override it if needed. + Args: + tokens (list[int]): List of token ids. + Returns: + list: Converted id list. 
+ """ + if isinstance(tokens, list): + token_ids = [] + for token in tokens: + token_id = self.vocab.get(token, self.unk_token_id) + token_ids.append(token_id) + return token_ids + elif isinstance(tokens, str): + token_id = self.vocab.get(tokens, self.unk_token_id) + token_ids.append(token_id) + return token_ids + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + """ + Creates an instance of `PretrainedTokenizer`. Related resources are loaded + by specifying name of a built-in pretrained model, or a community-contributed + pretrained model, or a local file directory path. + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + - Name of built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains tokenizer related resources + and tokenizer config file ("tokenizer_config.json"). + *args (tuple): position arguments for model `__init__`. If provided, + use these as position argument values for tokenizer initialization. + **kwargs (dict): keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for tokenizer + initialization. + Returns: + PretrainedTokenizer: An instance of `PretrainedTokenizer`. + Example: + .. code-block:: + from paddlenlp.transformers import BertTokenizer + # Name of built-in pretrained model + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + # Name of community-contributed pretrained model + tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned') + # Load from local directory path + tokenizer = BertTokenizer.from_pretrained('./my_bert/') + """ + pretrained_models = list(cls.pretrained_init_configuration.keys()) + vocab_files = {} + init_configuration = {} + # From built-in pretrained models + if pretrained_model_name_or_path in pretrained_models: + for file_id, map_list in cls.pretrained_resource_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + init_configuration = copy.deepcopy( + cls.pretrained_init_configuration[ + pretrained_model_name_or_path]) + # From local dir path + elif os.path.isdir(pretrained_model_name_or_path): + for file_id, file_name in cls.resource_files_names.items(): + full_file_name = os.path.join(pretrained_model_name_or_path, + file_name) + vocab_files[file_id] = full_file_name + vocab_files["tokenizer_config_file"] = os.path.join( + pretrained_model_name_or_path, cls.tokenizer_config_file) + + default_root = os.path.join(DATA_HOME, pretrained_model_name_or_path) + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None or os.path.isfile(file_path): + resolved_vocab_files[file_id] = file_path + continue + path = os.path.join(default_root, file_path.split('/')[-1]) + if os.path.exists(path): + print("Already cached %s" % path) + resolved_vocab_files[file_id] = path + else: + print("Downloading %s and saved to %s" % + (file_path, default_root)) + try: + resolved_vocab_files[file_id] = get_path_from_url( + file_path, default_root) + except RuntimeError as err: + print(err) + raise RuntimeError( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained models,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory 
+
+        # Prepare tokenizer initialization kwargs
+        # Did we save some inputs and kwargs to reload?
+        tokenizer_config_file = resolved_vocab_files.pop(
+            "tokenizer_config_file", None)
+        if tokenizer_config_file is not None:
+            with io.open(tokenizer_config_file, encoding="utf-8") as f:
+                init_kwargs = json.load(f)
+        else:
+            init_kwargs = init_configuration
+        # position args are stored in kwargs, maybe better not to include them
+        init_args = init_kwargs.pop("init_args", ())
+        init_kwargs.pop("init_class", None)
+
+        # Update with newly provided args and kwargs
+        init_args = init_args if not args else args
+        init_kwargs.update(kwargs)
+
+        # Merge resolved_vocab_files arguments into init_kwargs if not included.
+        # Maybe need more ways to load resources.
+        for args_name, file_path in resolved_vocab_files.items():
+            # when `pretrained_model_name_or_path` is a pretrained model name,
+            # use pretrained_init_configuration as `init_kwargs` to init, which
+            # does not include the vocab file in it, thus add the vocab file
+            # into args.
+            if args_name not in init_kwargs:
+                init_kwargs[args_name] = file_path
+            # when `pretrained_model_name_or_path` is a pretrained model dir,
+            # use tokenizer_config_file.json as `init_kwargs` to init, which
+            # does include a vocab file path in it. However, if the vocab file
+            # path included in the json does not exist, such as when it was
+            # deleted, use the vocab file under this dir so loading still works.
+            elif not os.path.isfile(init_kwargs[args_name]) and os.path.isfile(
+                    file_path):
+                init_kwargs[args_name] = file_path
+        # TODO(guosheng): avoid reduplication of position args and key word args
+        tokenizer = cls(*init_args, **init_kwargs)
+        return tokenizer
+
+    def save_pretrained(self, save_directory):
+        """
+        Save the tokenizer configuration and related resources to files under
+        `save_directory`. The tokenizer configuration would be saved into the
+        file indicated by `tokenizer_config_file` (that is, `tokenizer_config.json`),
+        and resources would be saved into the files indicated by `resource_files_names`
+        by using `self.save_resources(save_directory)`.
+
+        The `save_directory` can be used in `from_pretrained` as the argument value
+        of `pretrained_model_name_or_path` to re-load the tokenizer.
+        Args:
+            save_directory (str): Directory to save files into.
+        Example:
+            .. code-block::
+                from paddlenlp.transformers import BertTokenizer
+                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+                tokenizer.save_pretrained('trained_model')
+                # reload from save_directory
+                tokenizer = BertTokenizer.from_pretrained('trained_model')
+        """
+        assert not os.path.isfile(
+            save_directory
+        ), "Saving directory ({}) should be a directory, not a file".format(
+            save_directory)
+        os.makedirs(save_directory, exist_ok=True)
+
+        tokenizer_config_file = os.path.join(save_directory,
+                                             self.tokenizer_config_file)
+        # init_config is set in the `__init__` created by the metaclass
+        tokenizer_config = self.init_config
+        with io.open(tokenizer_config_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(tokenizer_config, ensure_ascii=False))
+
+        self.save_resources(save_directory)
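The round-trip between `save_pretrained` and `from_pretrained` is just the JSON serialization of the tracked init kwargs. A minimal sketch of that round-trip, with a hypothetical config dict standing in for `init_config`:

.. code-block::

    import io
    import json
    import os

    init_config = {"vocab_file": "vocab.txt", "do_lower_case": True}
    os.makedirs("trained_model", exist_ok=True)
    path = os.path.join("trained_model", "tokenizer_config.json")

    with io.open(path, "w", encoding="utf-8") as f:
        f.write(json.dumps(init_config, ensure_ascii=False))  # save_pretrained

    with io.open(path, encoding="utf-8") as f:
        init_kwargs = json.load(f)  # what from_pretrained reloads
    print(init_kwargs)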
+ """ + for name, file_name in self.resource_files_names.items(): + src_path = self.init_config[name] + dst_path = os.path.join(save_directory, file_name) + if os.path.abspath(src_path) != os.path.abspath(dst_path): + copyfile(src_path, dst_path) + + @staticmethod + def load_vocabulary(filepath, + unk_token=None, + pad_token=None, + bos_token=None, + eos_token=None, + **kwargs): + """ + Instantiate an instance of `Vocab` from a file reserving all tokens + by using `Vocab.from_dict`. The file contains a token per line, and the + line number would be the index of corresponding token. + Args: + filepath (str): path of file to construct vocabulary. + unk_token (str): special token for unknown token. If no need, it also + could be `None`. Defaults to `None`. + pad_token (str): special token for padding token. If no need, it also + could be `None`. Defaults to `None`. + bos_token (str): special token for bos token. If no need, it also + could be `None`. Defaults to `None`. + eos_token (str): special token for eos token. If no need, it also + could be `None`. Defaults to `None`. + **kwargs (dict): keyword arguments for `Vocab.from_dict`. + Returns: + Vocab: An instance of `Vocab`. + """ + token_to_idx = {} + with io.open(filepath, 'r', encoding='utf-8') as f: + for index, line in enumerate(f): + token = line.rstrip('\n') + token_to_idx[token] = int(index) + return token_to_idx + + def __getattr__(self, name): + if name.endswith('_token'): + return self.special_tokens_map[name] + elif name.endswith('_token_id'): + return self.vocab[self.special_tokens_map[name[:-3]]] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, name)) + + def truncate_sequences(self, + ids, + pair_ids=None, + num_tokens_to_remove=0, + truncation_strategy='longest_first', + stride=0): + """ + Truncates a sequence pair in place to the maximum length. + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): + number of tokens to remove using the truncation strategy + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len) + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
+ """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == 'longest_first': + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == 'only_first': + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == 'only_second': + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == 'do_not_truncate': + raise ValueError( + "Input sequence are too long for max_length. Please select a truncation strategy." + ) + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, overflowing_tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + Should be overridden in a subclass if the model has a special way of building those. + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + + return token_ids_0 + token_ids_1 + + def build_offset_mapping_with_special_tokens(self, + offset_mapping_0, + offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + Should be overridden in a subclass if the model has a special way of building those. + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return offset_mapping_0 + + return offset_mapping_0 + offset_mapping_1 + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + Returns: + results (List[int]): The list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. 
+ """ + return [0] * ((len(token_ids_1) + if token_ids_1 else 0) + len(token_ids_0)) + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + Should be overridden in a subclass if the model has a special way of building those. + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (List[int]): + List of IDs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def num_special_tokens_to_add(self, pair): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + Args: + pair (bool, optional): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. Defaults to `False`. + Returns: + int: Number of special tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 + if pair else None)) + + def encode(self, + text, + text_pair=None, + max_seq_len=512, + pad_to_max_seq_len=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is not allowed. + Args: + text (str, List[str] or List[int]): + The sequence to be processed. One sequence is a string, a list + of strings, or a list of integers depending on whether it has + been pretokenized and converted to ids. + text_pair (str, List[str] or List[List[str]]): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. 
+
+    def encode(self,
+               text,
+               text_pair=None,
+               max_seq_len=512,
+               pad_to_max_seq_len=False,
+               truncation_strategy="longest_first",
+               return_position_ids=False,
+               return_token_type_ids=True,
+               return_attention_mask=False,
+               return_length=False,
+               return_overflowing_tokens=False,
+               return_special_tokens_mask=False):
+        """
+        Performs tokenization and uses the tokenized tokens to prepare model
+        inputs. It supports a sequence or a sequence pair as input, while batch
+        input is not allowed.
+        Args:
+            text (str, List[str] or List[int]):
+                The sequence to be processed. One sequence is a string, a list
+                of strings, or a list of integers depending on whether it has
+                been pretokenized and converted to ids.
+            text_pair (str, List[str] or List[int], optional):
+                Same as the `text` argument, while it represents the latter
+                sequence of the sequence pair.
+            max_seq_len (int, optional):
+                If set to a number, will limit the total sequence returned so
+                that it has a maximum length. If there are overflowing tokens,
+                those overflowing tokens will be added to the returned dictionary
+                when `return_overflowing_tokens` is `True`. Defaults to 512.
+            pad_to_max_seq_len (bool, optional):
+                If set to `True`, the returned sequences would be padded up to
+                the length specified by `max_seq_len` according to the padding
+                side (`self.padding_side`) and padding token id. Defaults to `False`.
+            truncation_strategy (str, optional):
+                String selected among the following options:
+                - 'longest_first' (default): Iteratively reduce the input sequences
+                  until the input is under `max_seq_len`, removing one token at a
+                  time from the longest sequence (when there is a pair of input
+                  sequences).
+                - 'only_first': Only truncate the first sequence.
+                - 'only_second': Only truncate the second sequence.
+                - 'do_not_truncate': Do not truncate (raise an error if the input
+                  sequence is longer than `max_seq_len`).
+                Defaults to 'longest_first'.
+            return_position_ids (bool, optional):
+                Whether to include token position ids in the returned dictionary.
+                Defaults to `False`.
+            return_token_type_ids (bool, optional):
+                Whether to include token type ids in the returned dictionary.
+                Defaults to `True`.
+            return_attention_mask (bool, optional):
+                Whether to include the attention mask in the returned dictionary.
+                Defaults to `False`.
+            return_length (bool, optional):
+                Whether to include the length of each encoded input in the
+                returned dictionary. Defaults to `False`.
+            return_overflowing_tokens (bool, optional):
+                Whether to include overflowing token information in the returned
+                dictionary. Defaults to `False`.
+            return_special_tokens_mask (bool, optional):
+                Whether to include special tokens mask information in the returned
+                dictionary. Defaults to `False`.
+        Returns:
+            dict:
+                The dict has the following optional items:
+                - **input_ids** (list[int]): List of token ids to be fed to a model.
+                - **position_ids** (list[int], optional): List of token position ids to be
+                  fed to a model. Included when `return_position_ids` is `True`.
+                - **token_type_ids** (list[int], optional): List of token type ids to be
+                  fed to a model. Included when `return_token_type_ids` is `True`.
+                - **attention_mask** (list[int], optional): List of integers valued 0 or 1,
+                  where 0 specifies paddings and should not be attended to by the
+                  model. Included when `return_attention_mask` is `True`.
+                - **seq_len** (int, optional): The input_ids length. Included when `return_length`
+                  is `True`.
+                - **overflowing_tokens** (list[int], optional): List of overflowing tokens.
+                  Included if `max_seq_len` is specified and `return_overflowing_tokens`
+                  is `True`.
+                - **num_truncated_tokens** (int, optional): The number of overflowing tokens.
+                  Included if `max_seq_len` is specified and `return_overflowing_tokens`
+                  is `True`.
+                - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1,
+                  with 1 specifying special added tokens and 0 specifying sequence tokens.
+                  Included when `return_special_tokens_mask` is `True`.
+        """
+
+        def get_input_ids(text):
+            if isinstance(text, str):
+                tokens = self._tokenize(text)
+                return self.convert_tokens_to_ids(tokens)
+            elif isinstance(text,
+                            (list, tuple)) and len(text) > 0 and isinstance(
+                                text[0], str):
+                return self.convert_tokens_to_ids(text)
+            elif isinstance(text,
+                            (list, tuple)) and len(text) > 0 and isinstance(
+                                text[0], int):
+                return text
+            else:
+                raise ValueError(
+                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
+                )
+
+        ids = get_input_ids(text)
+        pair_ids = get_input_ids(text_pair) if text_pair is not None else None
+
+        pair = pair_ids is not None
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+
+        encoded_inputs = {}
+
+        # Truncation: Handle max sequence length
+        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(
+            pair=pair))
+        if max_seq_len and total_len > max_seq_len:
+            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
+                ids,
+                pair_ids=pair_ids,
+                num_tokens_to_remove=total_len - max_seq_len,
+                truncation_strategy=truncation_strategy, )
+            if return_overflowing_tokens:
+                encoded_inputs["overflowing_tokens"] = overflowing_tokens
+                encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len
+
+        # Add special tokens
+        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+        token_type_ids = self.create_token_type_ids_from_sequences(ids,
+                                                                   pair_ids)
+
+        # Build output dictionary
+        encoded_inputs["input_ids"] = sequence
+        if return_token_type_ids:
+            encoded_inputs["token_type_ids"] = token_type_ids
+        if return_special_tokens_mask:
+            encoded_inputs[
+                "special_tokens_mask"] = self.get_special_tokens_mask(ids,
+                                                                      pair_ids)
+        if return_length:
+            encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"])
+
+        # Check lengths
+        assert max_seq_len is None or len(encoded_inputs[
+            "input_ids"]) <= max_seq_len
+
+        # Padding
+        needs_to_be_padded = pad_to_max_seq_len and \
+            max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
+
+        if needs_to_be_padded:
+            difference = max_seq_len - len(encoded_inputs["input_ids"])
+            if self.padding_side == 'right':
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
+                        "input_ids"]) + [0] * difference
+                if return_token_type_ids:
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] +
+                        [self.pad_token_type_id] * difference)
+                if return_special_tokens_mask:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs[
+                        "special_tokens_mask"] + [1] * difference
+                encoded_inputs["input_ids"] = encoded_inputs[
+                    "input_ids"] + [self.pad_token_id] * difference
+            elif self.padding_side == 'left':
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + [
+                        1
+                    ] * len(encoded_inputs["input_ids"])
+                if return_token_type_ids:
+                    encoded_inputs["token_type_ids"] = (
+                        [self.pad_token_type_id] * difference +
+                        encoded_inputs["token_type_ids"])
+                if return_special_tokens_mask:
+                    encoded_inputs["special_tokens_mask"] = [
+                        1
+                    ] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs["input_ids"] = [
+                    self.pad_token_id
+                ] * difference + encoded_inputs["input_ids"]
+        else:
+            if return_attention_mask:
+                encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[
+                    "input_ids"])
+
+        if return_position_ids:
+            encoded_inputs["position_ids"] = list(
+                range(len(encoded_inputs["input_ids"])))
+
+        return encoded_inputs
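With `pad_to_max_seq_len=True`, the padding branch fills `input_ids` with the pad token id and zeroes the tail of the attention mask. A sketch, reusing the ids from the `BertTokenizer` example earlier ([PAD] has id 0 in the bert-base-uncased vocabulary):

.. code-block::

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    out = tokenizer.encode('He was a puppeteer', max_seq_len=10,
                           pad_to_max_seq_len=True, return_attention_mask=True)
    print(out['input_ids'])
    # [101, 2002, 2001, 1037, 13997, 11510, 102, 0, 0, 0]
    print(out['attention_mask'])
    # [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]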
+
+    def batch_encode(self,
+                     batch_text_or_text_pairs,
+                     max_seq_len=512,
+                     pad_to_max_seq_len=False,
+                     stride=0,
+                     is_split_into_words=False,
+                     truncation_strategy="longest_first",
+                     return_position_ids=False,
+                     return_token_type_ids=True,
+                     return_attention_mask=False,
+                     return_length=False,
+                     return_overflowing_tokens=False,
+                     return_special_tokens_mask=False):
+        """
+        Performs tokenization and uses the tokenized tokens to prepare model
+        inputs. It supports batch inputs of a sequence or a sequence pair.
+        Args:
+            batch_text_or_text_pairs (list):
+                Each element of the list can be a sequence or a sequence pair,
+                and a sequence is a string or a list of strings depending on
+                whether it has been pretokenized. If each sequence is provided
+                as a list of strings (pretokenized), you must set
+                `is_split_into_words` as `True` to disambiguate it from a
+                sequence pair.
+            max_seq_len (int, optional):
+                If set to a number, will limit the total sequence returned so
+                that it has a maximum length. If there are overflowing tokens,
+                those overflowing tokens will be added to the returned dictionary
+                when `return_overflowing_tokens` is `True`. Defaults to 512.
+            stride (int, optional):
+                Only available for batch input of sequence pairs, and mainly for
+                question answering usage. For QA, `text` represents questions
+                and `text_pair` represents contexts. If `stride` is set to a
+                positive number, the context will be split into multiple spans
+                where `stride` defines the number of (tokenized) tokens to skip
+                from the start of one span to get the next span, thus producing
+                a bigger batch than the inputs to include all spans. Moreover,
+                'overflow_to_sample' and 'offset_mapping', preserving the
+                original example and position information, will be added to the
+                returned dictionary. Defaults to 0.
+            pad_to_max_seq_len (bool, optional):
+                If set to `True`, the returned sequences would be padded up to
+                the length specified by `max_seq_len` according to the padding
+                side (`self.padding_side`) and padding token id. Defaults to `False`.
+            truncation_strategy (str, optional):
+                String selected among the following options:
+                - 'longest_first' (default): Iteratively reduce the input sequences
+                  until the input is under `max_seq_len`, removing one token at a
+                  time from the longest sequence (when there is a pair of input
+                  sequences).
+                - 'only_first': Only truncate the first sequence.
+                - 'only_second': Only truncate the second sequence.
+                - 'do_not_truncate': Do not truncate (raise an error if the input
+                  sequence is longer than `max_seq_len`).
+                Defaults to 'longest_first'.
+            return_position_ids (bool, optional):
+                Whether to include token position ids in the returned dictionary.
+                Defaults to `False`.
+            return_token_type_ids (bool, optional):
+                Whether to include token type ids in the returned dictionary.
+                Defaults to `True`.
+            return_attention_mask (bool, optional):
+                Whether to include the attention mask in the returned dictionary.
+                Defaults to `False`.
+            return_length (bool, optional):
+                Whether to include the length of each encoded input in the
+                returned dictionary. Defaults to `False`.
+            return_overflowing_tokens (bool, optional):
+                Whether to include overflowing token information in the returned
+                dictionary. Defaults to `False`.
+            return_special_tokens_mask (bool, optional):
+                Whether to include special tokens mask information in the returned
+                dictionary. Defaults to `False`.
+        Returns:
+            list[dict]:
+                Each dict has the following optional items:
+                - **input_ids** (list[int]): List of token ids to be fed to a model.
+                - **position_ids** (list[int], optional): List of token position ids to be
+                  fed to a model. Included when `return_position_ids` is `True`.
+                - **token_type_ids** (list[int], optional): List of token type ids to be
+                  fed to a model. Included when `return_token_type_ids` is `True`.
+                - **attention_mask** (list[int], optional): List of integers valued 0 or 1,
+                  where 0 specifies paddings and should not be attended to by the
+                  model. Included when `return_attention_mask` is `True`.
+                - **seq_len** (int, optional): The input_ids length. Included when `return_length`
+                  is `True`.
+                - **overflowing_tokens** (list[int], optional): List of overflowing tokens.
+                  Included if `max_seq_len` is specified and `return_overflowing_tokens`
+                  is `True`.
+                - **num_truncated_tokens** (int, optional): The number of overflowing tokens.
+                  Included if `max_seq_len` is specified and `return_overflowing_tokens`
+                  is `True`.
+                - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1,
+                  with 1 specifying special added tokens and 0 specifying sequence tokens.
+                  Included when `return_special_tokens_mask` is `True`.
+                - **offset_mapping** (list[int], optional): List of pairs preserving the
+                  index of the start and end char in the original input for each token.
+                  For a special token, the index pair is `(0, 0)`. Included when
+                  `stride` works.
+                - **overflow_to_sample** (int, optional): Index of the example from which this
+                  feature is generated. Included when `stride` works.
+        """
+
+        def get_input_ids(text):
+            if isinstance(text, str):
+                tokens = self._tokenize(text)
+                return self.convert_tokens_to_ids(tokens)
+            elif isinstance(text,
+                            (list, tuple)) and len(text) > 0 and isinstance(
+                                text[0], str):
+                return self.convert_tokens_to_ids(text)
+            elif isinstance(text,
+                            (list, tuple)) and len(text) > 0 and isinstance(
+                                text[0], int):
+                return text
+            else:
+                raise ValueError(
+                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
+                )
+
+        batch_encode_inputs = []
+        for example_id, tokens_or_pair_tokens in enumerate(
+                batch_text_or_text_pairs):
+            if not isinstance(tokens_or_pair_tokens, (list, tuple)):
+                text, text_pair = tokens_or_pair_tokens, None
+            elif is_split_into_words and not isinstance(
+                    tokens_or_pair_tokens[0], (list, tuple)):
+                text, text_pair = tokens_or_pair_tokens, None
+            else:
+                text, text_pair = tokens_or_pair_tokens
+
+            first_ids = get_input_ids(text)
+            second_ids = get_input_ids(
+                text_pair) if text_pair is not None else None
+
+            if stride > 0 and second_ids is not None:
+                max_len_for_pair = max_seq_len - len(
+                    first_ids) - self.num_special_tokens_to_add(pair=True)
+
+                token_offset_mapping = self.get_offset_mapping(text)
+                token_pair_offset_mapping = self.get_offset_mapping(text_pair)
+
+                offset = 0
+                while offset < len(second_ids):
+                    encoded_inputs = {}
+                    length = len(second_ids) - offset
+                    if length > max_len_for_pair:
+                        length = max_len_for_pair
+
+                    ids = first_ids
+                    pair_ids = second_ids[offset:offset + length]
+
+                    mapping = token_offset_mapping
+                    pair_mapping = token_pair_offset_mapping[offset:offset +
+                                                             length]
+
+                    offset_mapping = self.build_offset_mapping_with_special_tokens(
+                        mapping, pair_mapping)
+                    sequence = self.build_inputs_with_special_tokens(ids,
+                                                                     pair_ids)
+                    token_type_ids = self.create_token_type_ids_from_sequences(
+                        ids, pair_ids)
+
+                    # Build output dictionary
+                    encoded_inputs["input_ids"] = sequence
+                    if return_token_type_ids:
+                        encoded_inputs["token_type_ids"] = token_type_ids
+                    if return_special_tokens_mask:
+                        encoded_inputs[
+                            "special_tokens_mask"] = self.get_special_tokens_mask(
+                                ids, pair_ids)
+                    if return_length:
+                        encoded_inputs["seq_len"] = len(encoded_inputs[
+                            "input_ids"])
+
+                    # Check lengths
+                    assert max_seq_len is None or len(encoded_inputs[
+                        "input_ids"]) <= max_seq_len
+
+                    # Padding
+                    needs_to_be_padded = pad_to_max_seq_len and \
+                        max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
+
+                    encoded_inputs['offset_mapping'] = offset_mapping
+
+                    if needs_to_be_padded:
+                        difference = max_seq_len - len(encoded_inputs[
+                            "input_ids"])
+                        if self.padding_side == 'right':
+                            if return_attention_mask:
+                                encoded_inputs["attention_mask"] = [1] * len(
+                                    encoded_inputs[
+                                        "input_ids"]) + [0] * difference
+                            if return_token_type_ids:
+                                # 0 for padding token mask
+                                encoded_inputs["token_type_ids"] = (
+                                    encoded_inputs["token_type_ids"] +
+                                    [self.pad_token_type_id] * difference)
+                            if return_special_tokens_mask:
+                                encoded_inputs[
+                                    "special_tokens_mask"] = encoded_inputs[
+                                        "special_tokens_mask"] + [
+                                            1
+                                        ] * difference
+                            encoded_inputs["input_ids"] = encoded_inputs[
+                                "input_ids"] + [self.pad_token_id] * difference
+                            encoded_inputs['offset_mapping'] = encoded_inputs[
+                                'offset_mapping'] + [(0, 0)] * difference
+                        elif self.padding_side == 'left':
+                            if return_attention_mask:
+                                encoded_inputs["attention_mask"] = [
+                                    0
+                                ] * difference + [1] * len(encoded_inputs[
+                                    "input_ids"])
+                            if return_token_type_ids:
+                                # 0 for padding token mask
+                                encoded_inputs["token_type_ids"] = (
+                                    [self.pad_token_type_id] * difference +
+                                    encoded_inputs["token_type_ids"])
+                            if return_special_tokens_mask:
+                                encoded_inputs["special_tokens_mask"] = [
+                                    1
+                                ] * difference + encoded_inputs[
+                                    "special_tokens_mask"]
+                            encoded_inputs["input_ids"] = [
+                                self.pad_token_id
+                            ] * difference + encoded_inputs["input_ids"]
+                            encoded_inputs['offset_mapping'] = [
+                                (0, 0)
+                            ] * difference + encoded_inputs['offset_mapping']
+                    else:
+                        if return_attention_mask:
+                            encoded_inputs["attention_mask"] = [1] * len(
+                                encoded_inputs["input_ids"])
+
+                    if return_position_ids:
+                        encoded_inputs["position_ids"] = list(
+                            range(len(encoded_inputs["input_ids"])))
+
+                    encoded_inputs['overflow_to_sample'] = example_id
+                    batch_encode_inputs.append(encoded_inputs)
+                    if offset + length == len(second_ids):
+                        break
+                    offset += min(length, stride)
+
+            else:
+                batch_encode_inputs.append(
+                    self.encode(
+                        first_ids,
+                        second_ids,
+                        max_seq_len=max_seq_len,
+                        pad_to_max_seq_len=pad_to_max_seq_len,
+                        truncation_strategy=truncation_strategy,
+                        return_position_ids=return_position_ids,
+                        return_token_type_ids=return_token_type_ids,
+                        return_attention_mask=return_attention_mask,
+                        return_length=return_length,
+                        return_overflowing_tokens=return_overflowing_tokens,
+                        return_special_tokens_mask=return_special_tokens_mask))
+
+        return batch_encode_inputs
+
+    def get_offset_mapping(self, text):
+        """
+        Returns the offset mapping: for each token, the pair of start and end
+        character indices of that token in the original text.
+        Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372
+        Args:
+            text (str):
+                Input text.
+        Returns:
+            list: The offset map of the input text.
+        """
+        split_tokens = []
+        for token in self.basic_tokenizer.tokenize(text):
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token
+                                    if sub_token != self.unk_token else token)
+
+        normalized_text, char_mapping = '', []
+
+        for i, ch in enumerate(text):
+            if self.basic_tokenizer.do_lower_case:
+                ch = ch.lower()
+                ch = unicodedata.normalize('NFD', ch)
+                ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn'])
+
+            ch = ''.join([
+                c for c in ch
+                if not (ord(c) == 0 or ord(c) == 0xfffd or _is_control(c))
+            ])
+            normalized_text += ch
+
+            char_mapping.extend([i] * len(ch))
+
+        text, token_mapping, offset = normalized_text, [], 0
+
+        for token in split_tokens:
+            if token[:2] == '##':
+                token = token[2:]
+
+            start = text[offset:].index(token) + offset
+            end = start + len(token)
+
+            token_mapping.append(
+                (char_mapping[start], char_mapping[end - 1] + 1))
+            offset = end
+
+        return token_mapping
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
index 7fdce2af64..8b72f05f36 100644
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
@@ -43,7 +43,10 @@ def _build_saved_state_dict(state_dict):
     name_table = {}
     for key, value in state_dict.items():
         if isinstance(value, (Variable, core.VarBase)):
-            save_dict[key] = value.numpy()
+            if value.type == core.VarDesc.VarType.VOCAB:
+                save_dict[key] = value.value().get_map_tensor()
+            else:
+                save_dict[key] = value.numpy()
             name_table[key] = value.name
         else:
             save_dict[key] = value
@@ -938,8 +941,9 @@ def load(path, **configs):
 
     if "StructuredToParameterName@@" in load_result:
         for key in load_result["StructuredToParameterName@@"]:
-            load_result[key] = _ndarray_to_tensor(
-                load_result[key], config.return_numpy)
+            if isinstance(load_result[key], np.ndarray):
+                load_result[key] = _ndarray_to_tensor(
+                    load_result[key], config.return_numpy)
 
         if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
             del load_result["StructuredToParameterName@@"]
-- 
GitLab
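The core of `get_offset_mapping` is the token-alignment loop: strip any "##" continuation marker, then find each token left to right in the (normalized) text, advancing past every match. A self-contained sketch of that idea with a pretend tokenization (the normalization and char_mapping steps are omitted for brevity):

.. code-block::

    text = "unaffable cat"
    split_tokens = ["unaffable", "cat"]  # pretend tokenization for the sketch
    offset, mapping = 0, []
    for token in split_tokens:
        start = text[offset:].index(token) + offset
        end = start + len(token)
        mapping.append((start, end))
        offset = end
    print(mapping)  # [(0, 9), (10, 13)]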