diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index b11eac058a1814fa9d1dcfb3358cba2879230b2c..6b76e3cc1d1a5905a027b08f7d08df4a47cc33b3 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -181,6 +181,13 @@ IF(WITH_XPU) DSTS ${dst_dir} ${dst_dir}) ENDIF() +IF(WITH_IPU) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/ipu") + copy(inference_lib_dist + SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/platform/device/ipu/libpaddle_ipu.so + DSTS ${dst_dir}) +ENDIF() + # CMakeCache Info copy(inference_lib_dist SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt @@ -189,6 +196,7 @@ copy(inference_lib_dist copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") + if(WIN32) if(WITH_STATIC_LIB) set(paddle_inference_lib $/libpaddle_inference.lib @@ -304,7 +312,7 @@ copy(fluid_lib_dist ) set(module "platform") -set(platform_lib_deps profiler_proto error_codes_proto) +set(platform_lib_deps profiler_proto errors) if(WITH_GPU) set(platform_lib_deps ${platform_lib_deps} external_error_proto) endif(WITH_GPU) @@ -317,7 +325,7 @@ copy(fluid_lib_dist set(module "string") copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat ) diff --git a/cmake/pten.cmake b/cmake/pten.cmake index 70d61027da872aa19d91c8fbc13d6acee007d048..8e1d233986209b8e4f51065db998ebd46e1290cd 100644 --- a/cmake/pten.cmake +++ b/cmake/pten.cmake @@ -243,3 +243,29 @@ function(register_kernels) endif() endforeach() endfunction() + +function(append_op_util_declare TARGET) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) + string(REGEX MATCH "(PT_REGISTER_API_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") + string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") + string(REPLACE "PT_REGISTER_API_NAME" "PT_REGISTER_API_NAME" util_declare "${util_declare}") + string(APPEND util_declare ");") + file(APPEND ${op_utils_header} "${util_declare}") +endfunction() + +function(register_op_utils TARGET_NAME) + set(utils_srcs) + set(options "") + set(oneValueArgs "") + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_op_utils "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + file(GLOB SIGNATURES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_sig.cc") + foreach(target ${SIGNATURES}) + append_op_util_declare(${target}) + list(APPEND utils_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${target}) + endforeach() + + cc_library(${TARGET_NAME} SRCS ${utils_srcs} DEPS ${register_op_utils_DEPS}) +endfunction() diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 4b88689b9b6dfa7383d79a834afa3f23debb0890..3e7669849882687510dd8193e7c2762d08f332a0 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(utils) add_subdirectory(scripts) add_subdirectory(testing) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 08c2d2e05558bd616674522a1c8d1c4c2698d196..75966399148d455debac27e2fe890ae7fa0ffb0a 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -4,7 +4,6 @@ add_subdirectory(distributed) add_subdirectory(framework) 
add_subdirectory(imperative) add_subdirectory(operators) -add_subdirectory(string) add_subdirectory(pybind) add_subdirectory(eager) # NOTE: please add subdirectory inference at last. diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 24923d72681869348ec7db816349bdef010c973d..5ae2e26e87c7b33a75325f5b585ca115bd3b6308 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -13,17 +13,7 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) endif() add_subdirectory(common) -add_subdirectory(service) -add_subdirectory(table) +add_subdirectory(ps) add_subdirectory(test) add_subdirectory(index_dataset) add_subdirectory(fleet_executor) - -get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) - -set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(fleet - SRCS fleet.cc - DEPS framework_proto ps_framework_proto ps_service variable_helper scope op_registry fs shell ${RPC_DEPS}) - -target_link_libraries(fleet z) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 6454a349505131a461d99fe90db9dd69cb916507..452c666a1523cb81f7857684896997f1ad20d20d 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -13,11 +13,13 @@ // limitations under the License. #include +#include // NOLINT #include "paddle/fluid/distributed/fleet_executor/dist_model.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" @@ -37,10 +39,110 @@ bool IsPersistable(const framework::VarDesc *var) { } return false; } + +bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, + framework::LoDTensor *input_tensor, + const platform::Place &place) { + VLOG(3) << "Loading data from DistModelTensor for " << input_data.name; + framework::DDim dims = framework::make_ddim(input_data.shape); + void *input_tensor_ptr; + if (input_data.dtype == DistModelDataType::INT64) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == DistModelDataType::FLOAT32) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == DistModelDataType::INT32) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else { + // Q(fleet exe dev): for input/output, should we support fp16 + LOG(ERROR) << "unsupported feed type " << input_data.dtype; + return false; + } + + PADDLE_ENFORCE_NOT_NULL( + input_tensor_ptr, + paddle::platform::errors::Fatal( + "LoDTensor creation failed. 
DistModel loaded data failed.")); + PADDLE_ENFORCE_NOT_NULL(input_data.data.data(), + paddle::platform::errors::InvalidArgument( + "DistModelTensor contains no data.")); + + if (platform::is_cpu_place(place)) { + VLOG(3) << "Loading data for CPU."; + std::memcpy(static_cast(input_tensor_ptr), input_data.data.data(), + input_data.data.length()); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Loading data for GPU."; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + dynamic_cast(pool.Get(place)); + auto gpu_place = place; + memory::Copy(gpu_place, static_cast(input_tensor_ptr), + platform::CPUPlace(), input_data.data.data(), + input_data.data.length(), dev_ctx->stream()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Paddle wasn't compiled with CUDA, but place is GPU.")); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "DistModel only supports CPU and GPU.")); + } + + framework::LoD dst_lod; + for (auto &src_lod : input_data.lod) { + dst_lod.emplace_back(src_lod); + } + input_tensor->set_lod(dst_lod); + return true; +} + +std::string DistModelDTypeToString(DistModelDataType dtype) { + switch (dtype) { + case DistModelDataType::FLOAT32: + return "float32"; + case DistModelDataType::FLOAT16: + return "float16"; + case DistModelDataType::INT64: + return "int64"; + case DistModelDataType::INT32: + return "int32"; + case DistModelDataType::INT8: + return "int8"; + } + return "NOT SUPPORT DTYPE"; +} + +bool IsPPFirstStage(const DistModelConfig &config) { + return config.local_rank - config.mp_degree < 0; +} + +bool IsPPLastStage(const DistModelConfig &config) { + return config.local_rank + config.mp_degree >= config.nranks; +} + +class DistModelTimer { + public: + void tic() { tic_time = std::chrono::high_resolution_clock::now(); } + double toc() { + std::chrono::high_resolution_clock::time_point toc_time = + std::chrono::high_resolution_clock::now(); + std::chrono::duration time_elapse = + std::chrono::duration_cast>(toc_time - + tic_time); + double time_elapse_in_ms = + static_cast(time_elapse.count()) * 1000.0; + return time_elapse_in_ms; + } + + private: + std::chrono::high_resolution_clock::time_point tic_time; +}; + } // namespace bool DistModel::Init() { - /* TODO(fleet exe dev): implement this funct */ + carrier_id_ = "inference"; bool init_method = (!config_.model_dir.empty() || config_.program_desc); PADDLE_ENFORCE_EQ(init_method, true, platform::errors::InvalidArgument( @@ -127,10 +229,9 @@ bool DistModel::CommInit() { InsertCommOp("mp_comm_id", mp_group_nranks, mp_group_rank, peer_endpoints, comm_init_block, config_.mp_ring_id); } - if (config_.pp_degree) { - // NOTE: the last pp stage doesn't need init pp comm + if (config_.pp_degree > 1) { VLOG(3) << "Init comm group for pp."; - if (config_.local_rank - config_.mp_degree >= 0) { + if (!IsPPFirstStage(config_)) { PADDLE_ENFORCE_EQ(config_.pp_upstream_ring_id >= 0, true, platform::errors::InvalidArgument( "pp upstream ring id must be provided for " @@ -143,7 +244,7 @@ bool DistModel::CommInit() { comm_init_block, config_.pp_upstream_ring_id); } - if (config_.local_rank + config_.mp_degree < config_.nranks) { + if (!IsPPLastStage(config_)) { PADDLE_ENFORCE_EQ(config_.pp_downstream_ring_id >= 0, true, platform::errors::InvalidArgument( "pp downstream ring id must be provided for " @@ -326,7 +427,7 @@ bool DistModel::PrepareFleetExe() { id_to_rank.insert({i, i}); } 
fleet_exe.reset(new FleetExecutor(executor_desc_)); - fleet_exe->Init("inference", *(program_.get()), scope_.get(), place_, 1, + fleet_exe->Init(carrier_id_, *(program_.get()), scope_.get(), place_, 1, {task_node_.get()}, id_to_rank); return true; } @@ -340,8 +441,27 @@ bool DistModel::PrepareFeedAndFetch() { feeds_.resize(idx + 1); } feeds_[idx] = op; - feed_names_[op->Output("Out")[0]] = idx; - idx_to_feeds_[idx] = op->Output("Out")[0]; + std::string var_name = op->Output("Out")[0]; + feed_names_[var_name] = idx; + idx_to_feeds_[idx] = var_name; + framework::VarDesc *real_var = program_->Block(0).FindVar(var_name); + if (!real_var) { + LOG(ERROR) + << "The output of feed ops [" << var_name + << "] cannot be found in the program. Check the inference program."; + return false; + } + if (real_var->GetDataType() == framework::proto::VarType::FP32) { + feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT32}); + } else if (real_var->GetDataType() == framework::proto::VarType::INT32) { + feeds_to_dtype_.insert({var_name, DistModelDataType::INT32}); + } else if (real_var->GetDataType() == framework::proto::VarType::INT64) { + feeds_to_dtype_.insert({var_name, DistModelDataType::INT64}); + } else { + LOG(ERROR) << "Don't support feed var dtype for: " + << real_var->GetDataType(); + return false; + } } else if (op->Type() == "fetch") { VLOG(3) << "fetch op with fetch var: " << op->Input("X")[0]; int idx = BOOST_GET_CONST(int, op->GetAttr("col")); @@ -349,15 +469,170 @@ bool DistModel::PrepareFeedAndFetch() { fetches_.resize(idx + 1); } fetches_[idx] = op; - id_to_fetches_[idx] = op->Input("X")[0]; + idx_to_fetches_[idx] = op->Input("X")[0]; } } + + if (config_.pp_degree == 1) { + if (feeds_.size() == 0) { + LOG(ERROR) << "No feed ops in the inf program, please check the program."; + return false; + } + if (fetches_.size() == 0) { + LOG(ERROR) << "No fetch op in the inf program, please check the program."; + return false; + } + } else { + if (IsPPFirstStage(config_)) { + if (feeds_.size() == 0) { + LOG(ERROR) << "Feed ops are needed for the first pp stage."; + return false; + } + } else { + if (feeds_.size() > 0) { + LOG(WARNING) << "Feed op is found in the non-first stage of pp."; + } else { + LOG(INFO) << "No feed ops in non-first pp stage."; + } + } + if (IsPPLastStage(config_)) { + if (fetches_.size() == 0) { + LOG(WARNING) << "No fetch op was found in the last pp stage. Make sure " + "the result has been sent to frist pp stage."; + } + } else { + if (fetches_.size() > 0) { + LOG(WARNING) << "Fetch op is found in the non-last stage of pp."; + } else { + LOG(INFO) << "No fetch op in non-last pp stage."; + } + } + } + return true; +} + +bool DistModel::FeedData(const std::vector &input_data, + framework::Scope *scope) { + VLOG(3) << "DistModel is feeding data."; + if (input_data.size() != feeds_.size()) { + LOG(ERROR) << "Should provide " << feeds_.size() << " feeds, but got " + << input_data.size() << " data."; + return false; + } + feed_tensors_.resize(feeds_.size()); + for (size_t i = 0; i < input_data.size(); ++i) { + // feed each data separately + framework::LoDTensor *input_tensor = &(feed_tensors_[i]); + if (!LoadDataFromDistModelTensor(input_data[i], input_tensor, place_)) { + LOG(ERROR) << "Fail to load data from tensor " << input_data[i].name; + return false; + } + std::string target_name = input_data[i].name; + if (feed_names_.find(target_name) == feed_names_.end()) { + LOG(ERROR) << "The input name [" << target_name + << "] cannot be found in the program." 
+ << " DistModel loads data failed."; + return false; + } + if (input_data[i].dtype != feeds_to_dtype_[target_name]) { + LOG(ERROR) << "Feed var [" << target_name << "] expected dtype is: " + << DistModelDTypeToString(feeds_to_dtype_[target_name]) + << ". But received dtype is: " + << DistModelDTypeToString(input_data[i].dtype) << "."; + return false; + } + int feed_idx = feed_names_[target_name]; + framework::SetFeedVariable(scope, *input_tensor, "feed", feed_idx); + } return true; } -void DistModel::Run(const std::vector &input_data, +bool DistModel::FetchResults(std::vector *output_data, + framework::Scope *scope) { + VLOG(3) << "DistModel is fetch results."; + output_data->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = BOOST_GET_CONST(int, fetches_[i]->GetAttr("col")); + VLOG(3) << "Fetching data for [" << idx_to_fetches_[idx] << "]"; + PADDLE_ENFORCE_EQ( + static_cast(idx), i, + platform::errors::InvalidArgument( + "Fetch op's col attr(%d) should be equal to the index(%d)", idx, + i)); + framework::FetchType &fetch_var = + framework::GetFetchVariable(*scope, "fetch", idx); + auto &fetch = BOOST_GET(framework::LoDTensor, fetch_var); + auto type = fetch.type(); + auto output = &(output_data->at(i)); + output->name = idx_to_fetches_[idx]; + bool rst = false; + if (type == framework::proto::VarType::FP32) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::FLOAT32; + } else if (type == framework::proto::VarType::INT64) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::INT64; + } else if (type == framework::proto::VarType::INT32) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::INT32; + } else { + LOG(ERROR) << "DistModel meets unknown fetch data type. 
DistModel only " + "supports float32, int64 and int32 fetch type for now."; + } + if (!rst) { + LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx]; + return false; + } + } + return true; +} + +template +bool DistModel::FetchResult(const framework::LoDTensor &fetch, + DistModelTensor *output_data) { + auto shape = framework::vectorize(fetch.dims()); + output_data->shape.assign(shape.begin(), shape.end()); + const T *data = fetch.data(); + int64_t num_elems = fetch.numel(); + output_data->data.Resize(num_elems * sizeof(T)); + // The output of fetch op is always on the cpu, no need switch on place + memcpy(output_data->data.data(), data, num_elems * sizeof(T)); + output_data->lod.clear(); + for (auto &level : fetch.lod()) { + output_data->lod.emplace_back(level.begin(), level.end()); + } + return true; +} + +bool DistModel::Run(const std::vector &input_data, std::vector *output_data) { - /* TODO(fleet exe dev): implement this funct */ + // TODO(fleet exe dev): support pipeline inf mode + VLOG(3) << "DistModel run for once."; + + DistModelTimer timer; + timer.tic(); + + if (!FeedData(input_data, scope_.get())) { + LOG(ERROR) << "DistModel failed at feeding data."; + return false; + } + double feed_elapse = timer.toc(); + VLOG(3) << "Finish loading data, cost " << feed_elapse << "ms."; + + fleet_exe->Run(carrier_id_); + double fleet_exe_elapse = timer.toc(); + VLOG(3) << "Finish FleetExe running, cost " << fleet_exe_elapse - feed_elapse + << "ms."; + + if (!FetchResults(output_data, scope_.get())) { + LOG(ERROR) << "DistModel failed at fetching result."; + return false; + } + double fetch_elapse = timer.toc(); + VLOG(3) << "Finish fetching data, cost " << fetch_elapse - fleet_exe_elapse + << "ms."; + VLOG(3) << "DistModel finish inf, cost " << fetch_elapse << "ms"; + return true; } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h index 96e9c018074b5f0079c62d0c89c45be8ec0e172b..e6ad94e266a964bdc3c6cfba39cbf86786a4acea 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -19,6 +19,7 @@ #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" @@ -57,7 +58,7 @@ class DistModel { public: explicit DistModel(const DistModelConfig& config) : config_(config) {} bool Init(); - void Run(const std::vector& input_data, + bool Run(const std::vector& input_data, std::vector* output_data); ~DistModel() = default; @@ -75,12 +76,22 @@ class DistModel { void InsertCommOp(std::string tmp_var_name, int nranks, int rank, const std::vector& peer_endpoints, framework::BlockDesc* block, int ring_id); + bool FeedData(const std::vector& input_data, + framework::Scope* scope); + bool FetchResults(std::vector* output_data, + framework::Scope* scope); + template + bool FetchResult(const framework::LoDTensor& fetch, + DistModelTensor* output_data); + std::string carrier_id_; + std::vector feed_tensors_; std::vector feeds_; std::map feed_names_; std::map idx_to_feeds_; + std::map feeds_to_dtype_; std::vector fetches_; - std::map id_to_fetches_; + std::map idx_to_fetches_; DistModelConfig config_; FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe; diff --git 
a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index 4a04633388af21277806115b77d69ce05867519a..6bdd858d6cf9ed78c1a655c28ed58574374ce3fb 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -62,7 +62,7 @@ class DistModelDataBuf { void Free(); void* data_{nullptr}; size_t length_{0}; - bool memory_owned_{false}; + bool memory_owned_{true}; }; struct DistModelTensor { diff --git a/paddle/fluid/distributed/ps/CMakeLists.txt b/paddle/fluid/distributed/ps/CMakeLists.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e97c9db1a5199175507884e29c2f53e8a5bae07a 100644 --- a/paddle/fluid/distributed/ps/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/CMakeLists.txt @@ -0,0 +1,4 @@ +set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper) +add_subdirectory(table) +add_subdirectory(service) +add_subdirectory(wrapper) diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt similarity index 73% rename from paddle/fluid/distributed/service/CMakeLists.txt rename to paddle/fluid/distributed/ps/service/CMakeLists.txt index d1f04e26ade7289bcb10988d02de01962a1889ab..ab6c2e26002743fc129c4a7d0e532a63aa1d610b 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -8,12 +8,12 @@ brpc_library(sendrecv_rpc SRCS PROTO sendrecv.proto DEPS ${BRPC_DEPS} ) -set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper) +#set_property(GLOBAL PROPERTY RPC_DEPS sendrecv_rpc ${BRPC_DEPS} string_helper) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(communicator/communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_service/service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -36,11 +36,13 @@ ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DE cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) -cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) -cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RPC_DEPS}) +cc_library(communicator SRCS communicator/communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) +cc_library(ps_service SRCS ps_service/service.cc DEPS communicator client server boost ${RPC_DEPS}) cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) -set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(graph_py_service SRCS graph_py_service.cc DEPS 
ps_service) +set_source_files_properties(ps_service/graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS ps_service/graph_py_service.cc DEPS ps_service) + +#add_subdirectory(communicator) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc similarity index 99% rename from paddle/fluid/distributed/service/brpc_ps_client.cc rename to paddle/fluid/distributed/ps/service/brpc_ps_client.cc index db1dd2ced84e53aee8a57f70a3d11301fc00b4eb..e855fcbd02553ac1ea2e753239deaa8371661b32 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -16,7 +16,7 @@ #include #include -#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/framework/archive.h" static const int max_port = 65535; diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h similarity index 98% rename from paddle/fluid/distributed/service/brpc_ps_client.h rename to paddle/fluid/distributed/ps/service/brpc_ps_client.h index d5388a5cd07c9e1d982f7e08d7a0c1c361af1d0d..70f406ee248dc3d39777297d8387b4749439cf82 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -22,8 +22,8 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc similarity index 99% rename from paddle/fluid/distributed/service/brpc_ps_server.cc rename to paddle/fluid/distributed/ps/service/brpc_ps_server.cc index dd7072be7de63ba90c55e176671c63ba1d444e09..58ce52552c9d22c56b314dfe0bccbb8a564edb5d 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include // NOLINT #include "butil/object_pool.h" #include "paddle/fluid/distributed/common/cost_timer.h" -#include "paddle/fluid/distributed/table/depends/sparse_utils.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/distributed/service/brpc_ps_server.h b/paddle/fluid/distributed/ps/service/brpc_ps_server.h similarity index 98% rename from paddle/fluid/distributed/service/brpc_ps_server.h rename to paddle/fluid/distributed/ps/service/brpc_ps_server.h index bf228a5d1b0ae58669e5f555d2f99200d6099661..4310c247438ceb9bff541fdd21e00ff70ff7b4fd 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.h @@ -17,8 +17,8 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/server.h" namespace brpc { class Controller; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc similarity index 98% rename from paddle/fluid/distributed/service/brpc_utils.cc rename to paddle/fluid/distributed/ps/service/brpc_utils.cc index 147758abfd55530d66b66bd8cad110e5202f7dc2..23b2f5545ffc2ae8939dba26e602505aa8197139 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/distributed/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" #include #include #include "paddle/fluid/platform/enforce.h" @@ -76,7 +76,7 @@ void SerializeToMultiVarMsgAndIOBuf( if (var->IsType()) { SerializeLodTensor(var, ctx, send_var_msg, &temp_iobuf); - } else if (var->IsType()) { + } else if (var->IsType()) { SerializeSelectedRows(var, ctx, send_var_msg, &temp_iobuf); } iobuf->append(temp_iobuf); @@ -127,7 +127,7 @@ void SerializeLodTensor(framework::Variable* var, void SerializeSelectedRows(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* var_msg, butil::IOBuf* iobuf) { - framework::SelectedRows* slr = var->GetMutable(); + pten::SelectedRows* slr = var->GetMutable(); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); @@ -255,7 +255,7 @@ void DeserializeSelectedRows( butil::IOBufBytesIterator& io_buffer_itr, // NOLINT const platform::DeviceContext& ctx) { const auto place = ctx.GetPlace(); - auto* slr = var->GetMutable(); + auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); slr->set_height(msg.slr_height()); std::vector tmp_rows(msg.dims()[0]); diff --git a/paddle/fluid/distributed/service/brpc_utils.h b/paddle/fluid/distributed/ps/service/brpc_utils.h similarity index 98% rename from paddle/fluid/distributed/service/brpc_utils.h rename to paddle/fluid/distributed/ps/service/brpc_utils.h index ebae710acc28b58a503bc9c0b455ef7c5ca10cff..b241f7f80121cc6920720e3d24332d3be129bd77 100644 --- a/paddle/fluid/distributed/service/brpc_utils.h +++ b/paddle/fluid/distributed/ps/service/brpc_utils.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include "brpc/channel.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3610729d74d939b47fbd6f8e7b58219934021bca --- /dev/null +++ b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt @@ -0,0 +1,8 @@ + + +get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + +set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + + +cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc similarity index 99% rename from paddle/fluid/distributed/service/communicator.cc rename to paddle/fluid/distributed/ps/service/communicator/communicator.cc index e2b81ace2914789110e9e9410e314f6db1dccf50..a73f87c1d88965ce2e1b522ac79422f94ef0ea98 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include #include "gflags/gflags.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -28,7 +28,7 @@ namespace paddle { namespace distributed { using framework::LoDTensor; -using framework::SelectedRows; +using pten::SelectedRows; const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100; @@ -293,7 +293,7 @@ void Communicator::RpcSendSparse(const std::string &var_name, int table_id, std::vector push_g_vec; auto *send_var = scope.FindVar(var_name); - auto *tensor = send_var->GetMutable(); + auto *tensor = send_var->GetMutable(); auto dim = tensor->value().dims()[1]; std::transform(tensor->rows().begin(), tensor->rows().end(), std::back_inserter(sparse_push_keys), @@ -1012,10 +1012,10 @@ void GeoCommunicator::Send(const std::vector &var_names, auto *var = scope.FindVar(table_name); - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( "Only need to send Sparse Grad in Geo mode.")); - auto &rows = var->Get().rows(); + auto &rows = var->Get().rows(); // insert ids which has not been record for (size_t j = 0; j < rows.size(); j++) { @@ -1290,7 +1290,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto cpu_ctx = paddle::platform::CPUDeviceContext(); auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); + auto *t_delta = var_delta->GetMutable(); auto *var_t_value = t_delta->mutable_value(); var_t_value->Resize({static_cast(sparse_ids.size()), dims1}); auto *t_value = var_t_value->mutable_data(cpu_ctx.GetPlace()); diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h similarity index 97% rename from paddle/fluid/distributed/service/communicator.h rename to paddle/fluid/distributed/ps/service/communicator/communicator.h index 7056c9aba62dd5618d185d16e7eb8bd168dc5a73..570e668d9d5d2b40d280ef905b12fcb0e4ada09b 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -29,7 +29,7 @@ limitations under the License. */ #include #include "gflags/gflags.h" -#include "paddle/fluid/distributed/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" @@ -41,7 +41,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" -#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" namespace paddle { namespace distributed { @@ -193,15 +193,15 @@ inline void MergeVars(const std::string &var_name, result.device(*cpu_ctx.eigen_device()) = result / static_cast(vars.size()); } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); + } else if (var0->IsType()) { + auto &slr0 = var0->Get(); + auto *out_slr = out_var->GetMutable(); out_slr->mutable_rows()->clear(); out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; + std::vector inputs; inputs.reserve(vars.size()); for (auto &var : vars) { - inputs.push_back(&var->Get()); + inputs.push_back(&var->Get()); } auto dev_ctx = paddle::platform::CPUDeviceContext(); if (merge_add) { diff --git a/paddle/fluid/distributed/communicator_common.h b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h similarity index 100% rename from paddle/fluid/distributed/communicator_common.h rename to paddle/fluid/distributed/ps/service/communicator/communicator_common.h diff --git a/paddle/fluid/distributed/service/env.cc b/paddle/fluid/distributed/ps/service/env.cc similarity index 93% rename from paddle/fluid/distributed/service/env.cc rename to paddle/fluid/distributed/ps/service/env.cc index 25bc2cc366aaacba32c22a5225d344f8618767d9..15bd31ce958685643c26af044cdc948725589105 100644 --- a/paddle/fluid/distributed/service/env.cc +++ b/paddle/fluid/distributed/ps/service/env.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/ps/service/env.h" namespace paddle { namespace distributed {} // namespace distributed diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/ps/service/env.h similarity index 100% rename from paddle/fluid/distributed/service/env.h rename to paddle/fluid/distributed/ps/service/env.h diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc similarity index 99% rename from paddle/fluid/distributed/service/graph_brpc_client.cc rename to paddle/fluid/distributed/ps/service/graph_brpc_client.cc index a9682d6a6efcc9db33e33c3e4fef1ec60f5bedf3..301708f6b7bb3d465d8dcbd2b94bbc4c217fcc77 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" #include #include #include @@ -20,8 +20,8 @@ #include #include #include "Eigen/Dense" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h similarity index 95% rename from paddle/fluid/distributed/service/graph_brpc_client.h rename to paddle/fluid/distributed/ps/service/graph_brpc_client.h index 2e5d5b6ee93cbe606ed87a4c947d993ecccfc59a..06e753d028baa2d9c0002620dc445d4204046180 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -24,10 +24,10 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc similarity index 99% rename from paddle/fluid/distributed/service/graph_brpc_server.cc rename to paddle/fluid/distributed/ps/service/graph_brpc_server.cc index c1348e4804e2badcfc02c61dbbb0f83892cedefb..441f489fb3097cda51fc62dc35e93264a1f7caef 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include // NOLINT #include #include "butil/endpoint.h" #include "iomanip" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h similarity index 96% rename from paddle/fluid/distributed/service/graph_brpc_server.h rename to paddle/fluid/distributed/ps/service/graph_brpc_server.h index ecd78d28ca812a1e4c3b1429e891b3d0b7d5dd95..aee0190850753786ce0f083257458caf50a63d26 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -20,10 +20,10 @@ #include #include -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/server.h" -#include "paddle/fluid/distributed/table/common_graph_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/server.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { class GraphBrpcServer : public PSServer { diff --git a/paddle/fluid/distributed/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc similarity index 99% rename from paddle/fluid/distributed/service/heter_client.cc rename to paddle/fluid/distributed/ps/service/heter_client.cc index 95023704f9d51522386eaadee0f5c6fc01d1764d..e9e3ec1d9df471db2c8e54e5c0eaf71f9b0e9bd3 100644 --- a/paddle/fluid/distributed/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/split.h" diff --git a/paddle/fluid/distributed/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h similarity index 95% rename from paddle/fluid/distributed/service/heter_client.h rename to paddle/fluid/distributed/ps/service/heter_client.h index 7ba47ad9a5df58a75cfe736a0c16a82f43ec9576..4f27ef75ea954dece5cd734108c64813b681c6f6 100644 --- a/paddle/fluid/distributed/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -25,9 +25,9 @@ limitations under the License. 
*/ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" diff --git a/paddle/fluid/distributed/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc similarity index 98% rename from paddle/fluid/distributed/service/heter_server.cc rename to paddle/fluid/distributed/ps/service/heter_server.cc index fee3081f0329a92bb4903d8540dcadb73d663154..01afed3f1237515cf5c5e4ad01d329b424b5079e 100644 --- a/paddle/fluid/distributed/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/string/split.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h similarity index 99% rename from paddle/fluid/distributed/service/heter_server.h rename to paddle/fluid/distributed/ps/service/heter_server.h index 094ee6036413d5f5469e5ab4bee14913d39aad97..86f83cb1fc4fe5ef881dbb2e8f88bd6d1bc67bc5 100644 --- a/paddle/fluid/distributed/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -25,8 +25,8 @@ limitations under the License. */ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc similarity index 90% rename from paddle/fluid/distributed/service/ps_client.cc rename to paddle/fluid/distributed/ps/service/ps_client.cc index d45f41a0f58de36bb1575c1b51663f8899fb215d..fd956b758de1ae00155b37bb4d2c9e8134da09e4 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_client.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" #include "glog/logging.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/ps_local_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/ps_local_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h similarity index 97% rename from paddle/fluid/distributed/service/ps_client.h rename to paddle/fluid/distributed/ps/service/ps_client.h index a408a0cc24fb51de041ecd4098b5434e9c5d91ca..7db8b0c1244594ba4483101536995f9e414382ab 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -22,10 +22,10 @@ #include #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/platform/timer.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc similarity index 98% rename from paddle/fluid/distributed/service/ps_local_client.cc rename to paddle/fluid/distributed/ps/service/ps_local_client.cc index e949b21b02e6d9842ffae377a17610757a65ae75..972cce135f189bee6dbba9e0b89baa288816827b 100644 --- a/paddle/fluid/distributed/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/ps_local_client.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/ps_local_client.h" +#include "paddle/fluid/distributed/ps/table/table.h" //#define pslib_debug_dense_compress diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h similarity index 99% rename from paddle/fluid/distributed/service/ps_local_client.h rename to paddle/fluid/distributed/ps/service/ps_local_client.h index 9d2b01a45fe929097c06fb264f470974410e7f4e..e73974ac562861d86e679ddbc213335d10731281 100644 --- a/paddle/fluid/distributed/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -15,7 +15,7 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h similarity index 95% rename from paddle/fluid/distributed/service/ps_local_server.h rename to paddle/fluid/distributed/ps/service/ps_local_server.h index 33b0b5fa796d7571e16a0f79fc6ce4de21b1e7a8..91f8bc4c9127115c9b5595270973d011778c6262 100644 --- a/paddle/fluid/distributed/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/server.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc similarity index 99% rename from paddle/fluid/distributed/service/graph_py_service.cc rename to paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index 8d7a822321a2b34aae12035c549ca23f21ad16a0..b2aece98071c146b23e897900b9c7f9736c2f2de 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include // NOLINT #include "butil/endpoint.h" #include "iomanip" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h similarity index 95% rename from paddle/fluid/distributed/service/graph_py_service.h rename to paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index a860d1f58d3a23e79ca3d3a380b6067c13e76371..71b44f36d0107fa57b3beb51f29e7509d967f995 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -32,11 +32,11 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/ps/service/ps_service/service.cc similarity index 96% rename from paddle/fluid/distributed/service/service.cc rename to paddle/fluid/distributed/ps/service/ps_service/service.cc index 698ceb1578f47eec83d0ae1efb3bbac6149de210..73793d2f9bd0ec8c5b485830059a730bb8d8559a 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/service.cc @@ -12,13 +12,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include #include #include #include -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/string/string_helper.h" using namespace std; // NOLINT diff --git a/paddle/fluid/distributed/service/service.h b/paddle/fluid/distributed/ps/service/ps_service/service.h similarity index 93% rename from paddle/fluid/distributed/service/service.h rename to paddle/fluid/distributed/ps/service/ps_service/service.h index 5c987267f9d2e581f0340afca1ec803a14ab6962..202c2407f15ae9fbf5087b55a65f6acd2957ddc5 100644 --- a/paddle/fluid/distributed/service/service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/service.h @@ -20,9 +20,9 @@ limitations under the License. 
*/ #include #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/server.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/ps/service/sendrecv.proto similarity index 100% rename from paddle/fluid/distributed/service/sendrecv.proto rename to paddle/fluid/distributed/ps/service/sendrecv.proto diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc similarity index 92% rename from paddle/fluid/distributed/service/server.cc rename to paddle/fluid/distributed/ps/service/server.cc index e44876e3d2b789580152626ea8c290db0d369509..5f1974e3e610c6772457514759bff83db944bf52 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/ps/service/server.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/ps/service/server.h" #include "glog/logging.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/ps_local_server.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_local_server.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/service/server.h b/paddle/fluid/distributed/ps/service/server.h similarity index 97% rename from paddle/fluid/distributed/service/server.h rename to paddle/fluid/distributed/ps/service/server.h index ebebedc80efb83f88a7e366b39a20e93961d0087..160d4a612829531d619c69a0cd5e9cd091f94868 100644 --- a/paddle/fluid/distributed/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -24,8 +24,8 @@ #include "google/protobuf/service.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt similarity index 100% rename from paddle/fluid/distributed/table/CMakeLists.txt rename to paddle/fluid/distributed/ps/table/CMakeLists.txt diff --git a/paddle/fluid/distributed/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h similarity index 100% rename from paddle/fluid/distributed/table/accessor.h rename to paddle/fluid/distributed/ps/table/accessor.h diff --git a/paddle/fluid/distributed/table/barrier_table.cc b/paddle/fluid/distributed/ps/table/barrier_table.cc similarity index 97% rename from 
paddle/fluid/distributed/table/barrier_table.cc rename to paddle/fluid/distributed/ps/table/barrier_table.cc index 72394d15c54af5b346c70359b4dcde0ad2cd063c..25838e7ac2f047d9ff7bf20705459c6b1d60d26f 100644 --- a/paddle/fluid/distributed/table/barrier_table.cc +++ b/paddle/fluid/distributed/ps/table/barrier_table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc similarity index 99% rename from paddle/fluid/distributed/table/common_dense_table.cc rename to paddle/fluid/distributed/ps/table/common_dense_table.cc index b34b143a3ce37ef9a61c41143a2dfcb1fc614eaa..607469e2f7b0d5df79d4cb7477e0eaa3f4a8323a 100644 --- a/paddle/fluid/distributed/table/common_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/table/common_dense_table.h b/paddle/fluid/distributed/ps/table/common_dense_table.h similarity index 91% rename from paddle/fluid/distributed/table/common_dense_table.h rename to paddle/fluid/distributed/ps/table/common_dense_table.h index c8813dc33053f0c8a42a1090b262c7fde79f5ed5..a4c0f29ddb8770c8adc0d6885929aaac8a028e90 100644 --- a/paddle/fluid/distributed/table/common_dense_table.h +++ b/paddle/fluid/distributed/ps/table/common_dense_table.h @@ -19,10 +19,10 @@ #include #include #include "Eigen/Dense" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/common_table.h" -#include "paddle/fluid/distributed/table/depends/dense.h" -#include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/depends/dense.h" +#include "paddle/fluid/distributed/ps/table/depends/initializers.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc similarity index 99% rename from paddle/fluid/distributed/table/common_graph_table.cc rename to paddle/fluid/distributed/ps/table/common_graph_table.cc index 042a4dee62bda6f80ba94d16eba8abab150aa0bc..54b98cb96ce5196bb5133f777b2571f4d3d43c6e 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include #include #include #include #include #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h similarity index 98% rename from paddle/fluid/distributed/table/common_graph_table.h rename to paddle/fluid/distributed/ps/table/common_graph_table.h index b76ab0ae9506027091ee3f0ab356f884b83346a3..4fc5b5ab633f9e0815461413829eeef7071b5718 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -36,11 +36,12 @@ #include #include #include -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/common_table.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" + namespace paddle { namespace distributed { class GraphShard { diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/ps/table/common_sparse_table.cc similarity index 99% rename from paddle/fluid/distributed/table/common_sparse_table.cc rename to paddle/fluid/distributed/ps/table/common_sparse_table.cc index 143b24cf3264774c8852307f4071cd03a41010d1..b44d08b937a96c806142f5d7f1ba2ae0bcdb0f5e 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" #include #include "glog/logging.h" diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h similarity index 92% rename from paddle/fluid/distributed/table/common_sparse_table.h rename to paddle/fluid/distributed/ps/table/common_sparse_table.h index a443710bf0fd82bc157db26184d5c2d87f191004..2e02d13e7e5aec928468dcfbde1cff5e0b9c514a 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/common_sparse_table.h @@ -24,13 +24,13 @@ #include #include #include "Eigen/Dense" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/common_table.h" -#include "paddle/fluid/distributed/table/depends/initializers.h" -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" -#include "paddle/fluid/distributed/table/depends/sparse.h" -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/depends/initializers.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/sparse.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" #define PSERVER_SAVE_SUFFIX ".shard" @@ -110,7 +110,7 @@ struct Meta { class CommonSparseTable : public SparseTable { public: - CommonSparseTable() { rwlock_.reset(new framework::RWLock); } + CommonSparseTable() { rwlock_.reset(new pten::RWLock); } virtual ~CommonSparseTable() {} // unused method begin @@ -193,7 +193,7 @@ class CommonSparseTable : public SparseTable { std::shared_ptr optimizer_; std::vector> shard_values_; std::unordered_map> pull_reservoir_; - std::unique_ptr rwlock_{nullptr}; + std::unique_ptr rwlock_{nullptr}; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h similarity index 98% rename from paddle/fluid/distributed/table/common_table.h rename to paddle/fluid/distributed/ps/table/common_table.h index bc7f17f5f245794cebf96a8a4bc69e0dce8ac997..bac826dfe0e20b42d5cc47467356bc5614383a44 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -19,7 +19,7 @@ #include // NOLINT #include -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/distributed/common/utils.h" diff --git a/paddle/fluid/distributed/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc similarity index 99% rename from paddle/fluid/distributed/table/ctr_accessor.cc rename to paddle/fluid/distributed/ps/table/ctr_accessor.cc index 23144f39ade396613ff91b033dca364dd05a1a77..866bd8114ccea329123e16585c33366e759d5df8 100644 --- a/paddle/fluid/distributed/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_accessor.h" #include #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h similarity index 98% rename from paddle/fluid/distributed/table/ctr_accessor.h rename to paddle/fluid/distributed/ps/table/ctr_accessor.h index 8be672e8e0d15e124d8babfb7dbc30b3d38f491f..1e31fec04649b19882269fa9cce5f5d7fb4978c1 100644 --- a/paddle/fluid/distributed/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -18,8 +18,8 @@ #include #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h similarity index 100% rename from paddle/fluid/distributed/table/depends/dense.h rename to paddle/fluid/distributed/ps/table/depends/dense.h diff --git a/paddle/fluid/distributed/table/depends/feature_value.h b/paddle/fluid/distributed/ps/table/depends/feature_value.h similarity index 100% rename from paddle/fluid/distributed/table/depends/feature_value.h rename to paddle/fluid/distributed/ps/table/depends/feature_value.h diff --git a/paddle/fluid/distributed/table/depends/geo_recorder.h b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h similarity index 100% rename from paddle/fluid/distributed/table/depends/geo_recorder.h rename to paddle/fluid/distributed/ps/table/depends/geo_recorder.h diff --git a/paddle/fluid/distributed/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h similarity index 100% rename from paddle/fluid/distributed/table/depends/initializers.h rename to paddle/fluid/distributed/ps/table/depends/initializers.h diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h similarity index 98% rename from paddle/fluid/distributed/table/depends/large_scale_kv.h rename to paddle/fluid/distributed/ps/table/depends/large_scale_kv.h index 3408ef5f91ad009a33c28fb4093a79075112c0bd..dc7766c7ceb06eb0f57094af1f4e11df72da18aa 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/ps/table/depends/large_scale_kv.h @@ -28,11 +28,10 @@ #include "butil/object_pool.h" #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/depends/initializers.h" -#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/distributed/ps/table/depends/initializers.h" +#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/threadpool.h" @@ -43,6 +42,7 @@ #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/backends/dynload/port.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace distributed 
{ diff --git a/paddle/fluid/distributed/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h similarity index 100% rename from paddle/fluid/distributed/table/depends/rocksdb_warpper.h rename to paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/ps/table/depends/sparse.h similarity index 99% rename from paddle/fluid/distributed/table/depends/sparse.h rename to paddle/fluid/distributed/ps/table/depends/sparse.h index 0e1d7ef03c129c2dc6f72d6e56fafb143d879bd4..d4ea7829e45f8326fdbe33ebb1c7c9cfa3d35f6f 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/ps/table/depends/sparse.h @@ -24,7 +24,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h similarity index 100% rename from paddle/fluid/distributed/table/depends/sparse_utils.h rename to paddle/fluid/distributed/ps/table/depends/sparse_utils.h diff --git a/paddle/fluid/distributed/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc similarity index 93% rename from paddle/fluid/distributed/table/graph/graph_edge.cc rename to paddle/fluid/distributed/ps/table/graph/graph_edge.cc index 0ab0d5a76d6715401dd55ce7487634b72d452ddf..d1961b655d8829716b392c24ad6f1139089eb80d 100644 --- a/paddle/fluid/distributed/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" #include namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h similarity index 100% rename from paddle/fluid/distributed/table/graph/graph_edge.h rename to paddle/fluid/distributed/ps/table/graph/graph_edge.h diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc similarity index 98% rename from paddle/fluid/distributed/table/graph/graph_node.cc rename to paddle/fluid/distributed/ps/table/graph/graph_node.cc index 52c708be88488465b9f7c7abac27b6ddc3b991c1..366e607261f0c350c5097fc76e7bcc87b04ee878 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h similarity index 98% rename from paddle/fluid/distributed/table/graph/graph_node.h rename to paddle/fluid/distributed/ps/table/graph/graph_node.h index b7a564ef7b0bb6a9f8b307edbb674ab6a32c7404..b838c2c1258d84fec8c4a25f5855209d5b428d4c 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -18,7 +18,7 @@ #include #include #include -#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc similarity index 98% rename from paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc rename to paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc index 7a46433e3defbd51b68ed9f25e9e92f64b6d1afa..8186acec1be3da2abc18775e519ab38dac9f6dfd 100644 --- a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" #include #include #include diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h similarity index 96% rename from paddle/fluid/distributed/table/graph/graph_weighted_sampler.h rename to paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h index 4a75a112697d322a2eb49a57d379889d34b6009f..c10617022decb2eaf3c8a9684fd3265e88722e76 100644 --- a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h @@ -18,7 +18,7 @@ #include #include #include -#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc similarity index 99% rename from paddle/fluid/distributed/table/memory_sparse_table.cc rename to paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 086ddcafeb48d82b576cf525df4451fce8e77c10..7ce6e9005cf56ca295a6620a209551e303c112f3 100644 --- a/paddle/fluid/distributed/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -16,7 +16,7 @@ #include #include "paddle/fluid/distributed/common/cost_timer.h" -#include "paddle/fluid/distributed/table/memory_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/framework/io/fs.h" #include "boost/lexical_cast.hpp" diff --git a/paddle/fluid/distributed/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h similarity index 94% rename from paddle/fluid/distributed/table/memory_sparse_table.h rename to 
diff --git a/paddle/fluid/distributed/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
similarity index 94%
rename from paddle/fluid/distributed/table/memory_sparse_table.h
rename to paddle/fluid/distributed/ps/table/memory_sparse_table.h
index cb552beab13717c270c4a8495a6794c9dc912b08..5770f25f8f41dec286993d6b586959c8c0d3a0c0 100644
--- a/paddle/fluid/distributed/table/memory_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
@@ -24,9 +24,9 @@
 #include
 #include
 #include "Eigen/Dense"
-#include "paddle/fluid/distributed/table/accessor.h"
-#include "paddle/fluid/distributed/table/common_table.h"
-#include "paddle/fluid/distributed/table/depends/feature_value.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
 #include "paddle/fluid/string/string_helper.h"
 
 #define PSERVER_SAVE_SUFFIX ".shard"
diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/ps/table/sparse_geo_table.cc
similarity index 97%
rename from paddle/fluid/distributed/table/sparse_geo_table.cc
rename to paddle/fluid/distributed/ps/table/sparse_geo_table.cc
index 655c4784156e84e7071b738adac8c24ade6bd08e..6ef4330113e8fee3d2cb0d3e541194ca7b600a82 100644
--- a/paddle/fluid/distributed/table/sparse_geo_table.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_geo_table.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/sparse_geo_table.h"
+#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/sparse_geo_table.h b/paddle/fluid/distributed/ps/table/sparse_geo_table.h
similarity index 77%
rename from paddle/fluid/distributed/table/sparse_geo_table.h
rename to paddle/fluid/distributed/ps/table/sparse_geo_table.h
index 4ddb1fd706069f742debe23f6b7ec1b93692dec3..6eb913a02bc475a148ccb24797618339867f1121 100644
--- a/paddle/fluid/distributed/table/sparse_geo_table.h
+++ b/paddle/fluid/distributed/ps/table/sparse_geo_table.h
@@ -24,15 +24,15 @@
 #include
 #include "Eigen/Dense"
-#include "paddle/fluid/distributed/table/accessor.h"
-#include "paddle/fluid/distributed/table/common_sparse_table.h"
-#include "paddle/fluid/distributed/table/common_table.h"
-#include "paddle/fluid/distributed/table/depends/geo_recorder.h"
-#include "paddle/fluid/distributed/table/depends/initializers.h"
-#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
-#include "paddle/fluid/distributed/table/depends/sparse.h"
-#include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/depends/geo_recorder.h"
+#include "paddle/fluid/distributed/ps/table/depends/initializers.h"
+#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h"
+#include "paddle/fluid/distributed/ps/table/depends/sparse.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/core/utils/rw_lock.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
similarity index 99%
rename from paddle/fluid/distributed/table/sparse_sgd_rule.cc
rename to paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
index 614656a5a85d3029b82b8452b403253043bbc846..3e39d6f976d129903283060fb5111bd9eea03afc 100644
--- a/paddle/fluid/distributed/table/sparse_sgd_rule.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/sparse_sgd_rule.h"
+#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
 #include
 #include "glog/logging.h"
diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h
similarity index 100%
rename from paddle/fluid/distributed/table/sparse_sgd_rule.h
rename to paddle/fluid/distributed/ps/table/sparse_sgd_rule.h
diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
similarity index 99%
rename from paddle/fluid/distributed/table/ssd_sparse_table.cc
rename to paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
index 41eca72cf80717cb5f0ad731d19a9da79009ec96..60514b4e19ffaf63f285e25f1355660fabe58d48 100644
--- a/paddle/fluid/distributed/table/ssd_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #ifdef PADDLE_WITH_HETERPS
-#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h"
 
 DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file");
diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
similarity index 93%
rename from paddle/fluid/distributed/table/ssd_sparse_table.h
rename to paddle/fluid/distributed/ps/table/ssd_sparse_table.h
index 5e85fa3ce59d13c1f996f00a4b5b7dd9114ed764..f5e8a7067e0e041f9913bef8e43ad8b35bdb2783 100644
--- a/paddle/fluid/distributed/table/ssd_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/distributed/table/common_sparse_table.h"
-#include "paddle/fluid/distributed/table/depends/rocksdb_warpper.h"
+#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h"
 #ifdef PADDLE_WITH_HETERPS
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc
similarity index 80%
rename from paddle/fluid/distributed/table/table.cc
rename to paddle/fluid/distributed/ps/table/table.cc
index ac026184b8864ddb4c0b8f9ac2dfa2cc7c4c0dc3..b9b5ff12fc97a74dc4ce7b835ba981d73ca86104 100644
--- a/paddle/fluid/distributed/table/table.cc
+++ b/paddle/fluid/distributed/ps/table/table.cc
@@ -12,22 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 
 #include "glog/logging.h"
 #include "paddle/fluid/distributed/common/registerer.h"
 
-#include "paddle/fluid/distributed/table/common_dense_table.h"
-#include "paddle/fluid/distributed/table/common_graph_table.h"
-#include "paddle/fluid/distributed/table/common_sparse_table.h"
-#include "paddle/fluid/distributed/table/sparse_geo_table.h"
+#include "paddle/fluid/distributed/ps/table/common_dense_table.h"
+#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
+#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h"
 #ifdef PADDLE_WITH_HETERPS
-#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h"
 #endif
-#include "paddle/fluid/distributed/table/ctr_accessor.h"
-#include "paddle/fluid/distributed/table/memory_sparse_table.h"
-#include "paddle/fluid/distributed/table/tensor_accessor.h"
-#include "paddle/fluid/distributed/table/tensor_table.h"
+#include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
+#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
+#include "paddle/fluid/distributed/ps/table/tensor_table.h"
 
 namespace paddle {
 namespace distributed {
@@ -83,9 +83,10 @@ int32_t Table::initialize_accessor() {
   LOG(INFO) << "accessor initializing: table_id: " << _config.table_id()
             << ", accessor_name: " << _config.accessor().accessor_class();
 
-  auto *accessor = CREATE_PSCORE_CLASS(
-      ValueAccessor,
-      _config.accessor().accessor_class()) if (accessor == NULL) {
+  auto *accessor =
+      CREATE_PSCORE_CLASS(ValueAccessor, _config.accessor().accessor_class());
+
+  if (accessor == NULL) {
     LOG(ERROR) << "accessor is unregisteg, table_id:" << _config.table_id()
                << ", accessor_name:" << _config.accessor().accessor_class();
     return -1;
diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/ps/table/table.h
similarity index 96%
rename from paddle/fluid/distributed/table/table.h
rename to paddle/fluid/distributed/ps/table/table.h
index f6568b4336fbbdee10236d4d8642cd6d1e28b2d9..da1bb668ccfa3c5f1a4f876a396847b6b3853772 100644
--- a/paddle/fluid/distributed/table/table.h
+++ b/paddle/fluid/distributed/ps/table/table.h
@@ -21,9 +21,9 @@
 #include
 #include
 #include
 #include "paddle/fluid/distributed/common/afs_warpper.h"
-#include "paddle/fluid/distributed/table/accessor.h"
-#include "paddle/fluid/distributed/table/depends/sparse_utils.h"
-#include "paddle/fluid/distributed/table/graph/graph_node.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h"
+#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/distributed/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
similarity index 98%
rename from paddle/fluid/distributed/table/tensor_accessor.cc
rename to paddle/fluid/distributed/ps/table/tensor_accessor.cc
index b1ece52c133a7169273d1a2f62da4d34a01cb029..70a580c1e53a931dc2affd29db01b72691c68a39 100644
--- a/paddle/fluid/distributed/table/tensor_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/tensor_accessor.h"
+#include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
 #include "Eigen/Dense"
 
 namespace paddle {
diff --git a/paddle/fluid/distributed/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h
similarity index 98%
rename from paddle/fluid/distributed/table/tensor_accessor.h
rename to paddle/fluid/distributed/ps/table/tensor_accessor.h
index 9f4e2bc0def4faf9b750e663bfda99e51b1a2347..5041b8fdf8733eff676b5fce1a972e39182df48e 100644
--- a/paddle/fluid/distributed/table/tensor_accessor.h
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h
@@ -20,7 +20,7 @@
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
-#include "paddle/fluid/distributed/table/accessor.h"
+#include "paddle/fluid/distributed/ps/table/accessor.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/table/tensor_table.cc b/paddle/fluid/distributed/ps/table/tensor_table.cc
similarity index 98%
rename from paddle/fluid/distributed/table/tensor_table.cc
rename to paddle/fluid/distributed/ps/table/tensor_table.cc
index 0199f0528a9098b521ca11af522c6d189cc5169a..dfe778fa61e9e003ac1b3de48bf837be1d88ea22 100644
--- a/paddle/fluid/distributed/table/tensor_table.cc
+++ b/paddle/fluid/distributed/ps/table/tensor_table.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/table/tensor_table.h"
+#include "paddle/fluid/distributed/ps/table/tensor_table.h"
 
 DECLARE_double(eager_delete_tensor_gb);
 namespace paddle {
diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h
similarity index 99%
rename from paddle/fluid/distributed/table/tensor_table.h
rename to paddle/fluid/distributed/ps/table/tensor_table.h
index 080682d131420b5b57ce470b6b570fe24a1925b3..64d81327acc55ba0655bfc33efaa0d9d9f59649e 100644
--- a/paddle/fluid/distributed/table/tensor_table.h
+++ b/paddle/fluid/distributed/ps/table/tensor_table.h
@@ -24,7 +24,7 @@
 #include
 #include "paddle/fluid/distributed/common/utils.h"
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/ps/thirdparty/round_robin.h
similarity index 100%
rename from paddle/fluid/distributed/thirdparty/round_robin.h
rename to paddle/fluid/distributed/ps/thirdparty/round_robin.h
diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6279b6aa95412cb282cbe6ad3e5edb7b33adf289
--- /dev/null
+++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
+
+set_source_files_properties(fleet.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(fleet
+    SRCS fleet.cc
+    DEPS framework_proto ps_framework_proto ps_service variable_helper scope op_registry fs shell ${RPC_DEPS})
+
+target_link_libraries(fleet z)
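With fleet.cc relocated under ps/wrapper and built as its own cc_library target, callers now pull the wrapper from the new path. A sketch of a consumer, under the assumption that FleetWrapper keeps a GetInstance() style singleton accessor (an assumption based on Paddle's wrapper convention; this diff does not show the class body):

    #include "paddle/fluid/distributed/ps/wrapper/fleet.h"

    void DemoInitFleet() {
      // Hypothetical sketch: the accessor name below is assumed, not taken
      // from this diff. The point is only the new include location.
      auto fleet = paddle::distributed::FleetWrapper::GetInstance();
      (void)fleet;  // initialization / pull / push calls would follow here
    }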
diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc
similarity index 99%
rename from paddle/fluid/distributed/fleet.cc
rename to paddle/fluid/distributed/ps/wrapper/fleet.cc
index 5caeab832a3e746720dae2104e6f91d325e101fd..0588dbdf0fc61298d33eeb6db5b3de91a6de8256 100644
--- a/paddle/fluid/distributed/fleet.cc
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/distributed/fleet.h"
-#include "paddle/fluid/distributed/service/communicator.h"
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
+#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h
similarity index 98%
rename from paddle/fluid/distributed/fleet.h
rename to paddle/fluid/distributed/ps/wrapper/fleet.h
index be7fe8ea23fac1da5c55916c5ccaa7108a2b2bf9..1ec580c4d920d45b3bf43981494fde460095bcae 100644
--- a/paddle/fluid/distributed/fleet.h
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.h
@@ -23,8 +23,8 @@ limitations under the License. */
 #include
 #include
 
-#include "paddle/fluid/distributed/communicator_common.h"
-#include "paddle/fluid/distributed/service/service.h"
+#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
+#include "paddle/fluid/distributed/ps/service/ps_service/service.h"
 #include "paddle/fluid/framework/archive.h"
 #include "paddle/fluid/framework/io/fs.h"
 #include "paddle/fluid/framework/io/shell.h"
@@ -49,7 +49,7 @@ class PSCore;
 
 using framework::LoDTensor;
 using framework::Scope;
-using framework::SelectedRows;
+using pten::SelectedRows;
 using framework::Variable;
 
 using RpcCtxMap = std::unordered_map;
diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc b/paddle/fluid/distributed/test/barrier_table_test.cc
index 8dc2aa2299be7d72e07e98dae0352a9d791d3f99..0715f777fa5cb286ff393190a3d94dd86e74518a 100644
--- a/paddle/fluid/distributed/test/barrier_table_test.cc
+++ b/paddle/fluid/distributed/test/barrier_table_test.cc
@@ -17,8 +17,8 @@ limitations under the License. */
 #include
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
-#include "paddle/fluid/distributed/table/common_table.h"
-#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index f83c7bdb15fa1cad53a033f0444a6854910475e1..d7d9d1ed1bafd95e9d6db75c1e848693a3de55b1 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -17,8 +17,8 @@ limitations under the License. */
 #include  // NOLINT
 #include "gtest/gtest.h"
-#include "paddle/fluid/distributed/service/brpc_ps_client.h"
-#include "paddle/fluid/distributed/service/brpc_ps_server.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index f9c2b55eb4fee2e9bbaa49183b23192d04e61733..4f7b608c8bfb9366e010abda8fc72e68d72fa4e3 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/distributed/ps.pb.h"
-#include "paddle/fluid/distributed/service/brpc_ps_client.h"
-#include "paddle/fluid/distributed/service/brpc_ps_server.h"
-#include "paddle/fluid/distributed/service/env.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
+#include "paddle/fluid/distributed/ps/service/env.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc
index 19198b4d207d157629dd3847040a19d9f30ba9b8..608f647d148e4243c6e683e5e600424dd79d8192 100644
--- a/paddle/fluid/distributed/test/brpc_utils_test.cc
+++ b/paddle/fluid/distributed/test/brpc_utils_test.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
-#include "paddle/fluid/distributed/service/brpc_utils.h"
+#include "paddle/fluid/distributed/ps/service/brpc_utils.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -56,7 +56,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place,
   // var 3
   framework::Variable* var3 = scope->Var("x3");
-  auto* slr = var3->GetMutable<framework::SelectedRows>();
+  auto* slr = var3->GetMutable<pten::SelectedRows>();
   slr->set_height(564);
   auto* tensor3 = slr->mutable_value();
   auto* rows = slr->mutable_rows();
@@ -111,7 +111,7 @@ void RunMultiVarMsg(platform::Place place) {
   // check var3
   framework::Variable* var3 = scope_recv.FindVar("x3");
-  auto* slr = var3->GetMutable<framework::SelectedRows>();
+  auto* slr = var3->GetMutable<pten::SelectedRows>();
   EXPECT_EQ(slr->rows().size(), 564);
   for (int i = 0; i < 564; ++i) {
     EXPECT_EQ(slr->rows()[i], i);
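The test hunks above show that pten::SelectedRows is populated through the same mutators as the old framework type: set_height for the logical row count, mutable_rows for the stored row indices, and mutable_value for the backing tensor. Pulled out of the test, the construction pattern looks like this (a sketch using only calls that appear in the hunks; the allocation comment is illustrative):

    #include "paddle/fluid/framework/selected_rows_utils.h"
    #include "paddle/fluid/framework/variable.h"

    void DemoBuildSelectedRows(paddle::framework::Variable* var) {
      auto* slr = var->GetMutable<pten::SelectedRows>();
      slr->set_height(564);              // logical row count of the dense view
      auto* rows = slr->mutable_rows();  // indices of the rows actually stored
      for (int64_t i = 0; i < 564; ++i) {
        rows->push_back(i);
      }
      auto* value = slr->mutable_value();  // tensor holding the stored rows
      (void)value;  // value->mutable_data<float>(dims, place) would allocate
    }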
*/ -#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_accessor.h" #include #include #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/dense_table_test.cc b/paddle/fluid/distributed/test/dense_table_test.cc index 2e48b791dc8db510749aec7eed2184b8ef232381..c9a038e000e149f354db2bab72b48c04a721a5f6 100644 --- a/paddle/fluid/distributed/test/dense_table_test.cc +++ b/paddle/fluid/distributed/test/dense_table_test.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc index 9bd00dcc56fc2da43135d0ffc9fc36821fb59941..32e3944d35a1c69ce375db207427a535018da481 100644 --- a/paddle/fluid/distributed/test/feature_value_test.cc +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/table/depends/feature_value.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #include #include "gtest/gtest.h" diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index c9f15db3f788e13ca2f9a8279358358f1c50131b..b148c32f4968ce5a8c6b939978f7a983f15be702 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -21,11 +21,11 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_dense_table.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/depends/sparse_utils.h" -#include "paddle/fluid/distributed/table/sparse_geo_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 714fbb1e4aa2d8abb10eebe464cd8ac11ad1dc18..e808d2a81539acc78a0c01155e1a63e357cead78 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -21,16 +21,16 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/graph_py_service.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/service.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 3a430d7a51068a3aa1fb341b3425830add5266cf..3243ebc389c851a2fb0c706280f2f6b8a24c1ef9 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -21,16 +21,16 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/service/brpc_ps_server.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/service/graph_py_service.h" -#include "paddle/fluid/distributed/service/ps_client.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/service/service.h" -#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/ps/service/ps_client.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/service/ps_service/service.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/large_scale_test.cc b/paddle/fluid/distributed/test/large_scale_test.cc index 6ce8723abeea1ef0cc15d197135d7d14dc2fa86f..13c1d132124ebefc45284c5ab2c47efac6ca6ed3 100644 --- a/paddle/fluid/distributed/test/large_scale_test.cc +++ b/paddle/fluid/distributed/test/large_scale_test.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc index 30a1107d64e3c4fcb8a0b091d4c11f11a81ad947..62992c74bfd23456959ce7531afd268e62ee9df3 100644 --- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc +++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/memory_sparse_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index e86234f1bd9c7618eab0220cc41994b9e2855c7f..c895231d93ec5e3bb12d7d4eb2769a630016e2ef 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" #include #include #include "gtest/gtest.h" diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 26bede392d6fade06dd29cf5e5a28295bb1cbc43..f13bab078a6b0c95ad580b36ad2d7c34d0b470e6 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -21,10 +21,10 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_dense_table.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/sparse_geo_table.h" -#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/distributed/ps/table/common_dense_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc index 9b12717f73087751ab08b37f5232c434e14b3c31..6a29781158b838378468b1789b9eed0408c3435d 100644 --- a/paddle/fluid/distributed/test/table_test.cc +++ b/paddle/fluid/distributed/test/table_test.cc @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/table/common_sparse_table.h" -#include "paddle/fluid/distributed/table/sparse_geo_table.h" +#include "paddle/fluid/distributed/ps/table/common_sparse_table.h" +#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/eager/legacy/infer_shape_context.h b/paddle/fluid/eager/legacy/infer_shape_context.h index 0979abc63d65870e1a2aabdc14116a55d786ed00..b43eda7abc345b0533cdc1bca017bc8311d90a79 100644 --- a/paddle/fluid/eager/legacy/infer_shape_context.h +++ b/paddle/fluid/eager/legacy/infer_shape_context.h @@ -197,9 +197,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { out_var->GetMutable(); out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = - out_var->GetMutable(); + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -368,8 +367,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { "Input variable should not be null")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " @@ -385,8 +384,8 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { void SetDim(paddle::framework::Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(paddle::platform::errors::PermissionDenied( "Variable type_id %s, expect LoDTensor/SelectedRows.")); diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index bd7e5c549872d32a93b7b1b303081a17ade167ff..3179b96807119eac9c200f79f4c7990c3026ad4f 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -32,8 +32,8 @@ const paddle::framework::Tensor* GetTensorFromVar( const paddle::framework::Variable& var) { if (var.IsType()) { return &(var.Get()); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { return nullptr; } diff --git a/paddle/fluid/eager/legacy/tensor_helper.cc b/paddle/fluid/eager/legacy/tensor_helper.cc index 2ee2f9fefa9a342238e764d198124b5d74ee1dd0..fbf3205be2fe37ea5333d4295fd2d0fb0d76f811 100644 --- a/paddle/fluid/eager/legacy/tensor_helper.cc +++ b/paddle/fluid/eager/legacy/tensor_helper.cc @@ -32,7 +32,7 @@ void InitializeVariable(paddle::framework::Variable *var, if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::SELECTED_ROWS) { - var->GetMutable(); + var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) { var->GetMutable(); } else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) { @@ -72,9 +72,9 @@ void CopyVariable(const paddle::framework::Variable 
&src_var, auto &src_tensor = src_var.Get(); tmp_grad_tensor->set_lod(src_tensor.lod()); paddle::framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); - } else if (src_var.IsType()) { - auto &src_slr = src_var.Get(); - auto *tmp_grad_slr = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); tmp_grad_slr->set_rows(src_slr.rows()); tmp_grad_slr->set_height(src_slr.height()); auto &src_t = src_slr.value(); @@ -89,8 +89,8 @@ paddle::framework::proto::VarType::Type GetDtypeFromVar( const paddle::framework::Variable &var) { if (var.IsType()) { return var.Get().type(); - } else if (var.IsType()) { - return var.Get().value().type(); + } else if (var.IsType()) { + return var.Get().value().type(); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -101,8 +101,8 @@ const paddle::platform::Place &GetPlaceFromVar( const paddle::framework::Variable &var) { if (var.IsType()) { return var.Get().place(); - } else if (var.IsType()) { - return var.Get().place(); + } else if (var.IsType()) { + return var.Get().place(); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 0220e5fd59476a836045fe0d4fcaa48bccdeb92f..ce63a58d41ae004298f239effa80fe1ce79c4eef 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -192,11 +192,11 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va IF(WITH_XPU) cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils) + pten pten_utils kernel_factory infershape_utils op_utils) ELSE() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils) + pten pten_utils kernel_factory infershape_utils op_utils) ENDIF() cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -383,7 +383,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) -cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS tensor) +cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS selected_rows) cc_test(selected_rows_utils_test SRCS selected_rows_utils_test.cc DEPS selected_rows_utils) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) @@ -393,10 +393,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) cc_test(inlined_vector_test SRCS inlined_vector_test.cc) -if (NOT WIN32) -cc_test(rw_lock_test SRCS rw_lock_test.cc) -endif (NOT WIN32) - cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) @@ -408,7 +404,7 @@ cc_test(save_load_util_test SRCS 
save_load_util_test.cc DEPS save_load_util tens cc_library(generator SRCS generator.cc DEPS enforce place) cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info) -cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference) +cc_library(infershape_utils SRCS infershape_utils.cc DEPS pten_utils attribute shape_inference op_utils) # Get the current working branch execute_process( diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 1b5db8380514d552ed56ae3c65a338a082f02bdc..df1840794af3bbef1a2bdf8c2073c89991cdf9fd 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/timer.h" #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/fleet.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #endif #if defined _WIN32 || defined __APPLE__ diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index d8c372becf1b45895920c5d2783f427c2b8d352b..22a2847c1d834fee9fc3012957ddfc70130e41d3 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -120,9 +120,9 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, tran_lod_tensor->set_format(in_lod_tensor.format()); #endif tran_lod_tensor->ShareDataWith(tensor); - } else if (in_var.IsType()) { - auto &in_selected_rows = in_var.Get(); - auto *trans_selected_rows = out_var->GetMutable(); + } else if (in_var.IsType()) { + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index 052860cd0ab40479df7672ae32ebc6e75965b97b..4511578f34ec27b31736b2a762991e52e5a66bd4 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/variable_helper.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 2e82fe22dba73149e722958d6027ee6ba52f12d8..1435a82c0f528ad90a2da7958c602670a33ad1e7 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -237,7 +237,7 @@ struct TestBroadcastOpHandle { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Variable %s is not found in scope.", varname)); - auto selected_rows = var->GetMutable(); + auto selected_rows = var->GetMutable(); auto value = selected_rows->mutable_value(); value->mutable_data(kDims, place_list_[input_scope_idx]); selected_rows->set_height(height); @@ -256,7 +256,7 @@ struct TestBroadcastOpHandle { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Variable %s is not found in scope.", varname)); - auto& selected_rows = var->Get(); + auto& selected_rows = var->Get(); 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index 2e82fe22dba73149e722958d6027ee6ba52f12d8..1435a82c0f528ad90a2da7958c602670a33ad1e7 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -237,7 +237,7 @@ struct TestBroadcastOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         var, platform::errors::NotFound("Variable %s is not found in scope.",
                                         varname));
-    auto selected_rows = var->GetMutable<framework::SelectedRows>();
+    auto selected_rows = var->GetMutable<pten::SelectedRows>();
     auto value = selected_rows->mutable_value();
     value->mutable_data(kDims, place_list_[input_scope_idx]);
     selected_rows->set_height(height);
@@ -256,7 +256,7 @@ struct TestBroadcastOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         var, platform::errors::NotFound("Variable %s is not found in scope.",
                                         varname));
-    auto& selected_rows = var->Get<framework::SelectedRows>();
+    auto& selected_rows = var->Get<pten::SelectedRows>();
     auto rt = selected_rows.value();
     PADDLE_ENFORCE_EQ(selected_rows.height(), height,
                       platform::errors::InvalidArgument(
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 59614e89c1344e76a1e7042e27dbff41fccb7799..42b87f3853c58ab336474773f7eeb2501b4fd971 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -129,9 +129,10 @@ void EagerDeletionOpHandle::RunImpl() {
     if (var->IsType()) {
       garbages.emplace_back(var->GetMutable()->MoveMemoryHolder());
-    } else if (var->IsType<framework::SelectedRows>()) {
-      garbages.emplace_back(
-          var->GetMutable<framework::SelectedRows>()->mutable_value()->MoveMemoryHolder());
+    } else if (var->IsType<pten::SelectedRows>()) {
+      garbages.emplace_back(var->GetMutable<pten::SelectedRows>()
+                                ->mutable_value()
+                                ->MoveMemoryHolder());
     } else if (var->IsType()) {
       auto *tensor_arr = var->GetMutable();
       for (auto &t : *tensor_arr) {
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 74f5deed45557c96d7d2e84034d5fddf05892079..430f55793b73606ec0087dd4e8823d80587da618 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -64,14 +64,14 @@ void GatherOpHandle::RunImpl() {
       platform::errors::NotFound("The variable '%s' is not found in the scope.",
                                  in_0_handle->name()));
 
-  PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
+  PADDLE_ENFORCE_EQ(pre_in_var->IsType<pten::SelectedRows>(), true,
                     platform::errors::Unimplemented(
                         "Currently, gather_op only supports SelectedRows."));
 
   // Wait input done, this Wait is asynchronous operation
   WaitInputVarGenerated();
 
-  auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
+  auto &pre_in_value = pre_in_var->Get<pten::SelectedRows>();
   std::vector out_rows;
   std::vector in_tensors;
@@ -85,7 +85,7 @@ void GatherOpHandle::RunImpl() {
         "The variable '%s' is not found in the scope.", in_handle->name()));
     VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
 
-    auto &in_sr_value = in_var->Get<framework::SelectedRows>();
+    auto &in_sr_value = in_var->Get<pten::SelectedRows>();
 
     auto &in_sr_rows = in_sr_value.rows();
     out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
@@ -108,7 +108,7 @@ void GatherOpHandle::RunImpl() {
       out_var,
       platform::errors::NotFound("The variable '%s' is not found in the scope.",
                                  out_var_handle->name()));
-  auto out_value = out_var->GetMutable<framework::SelectedRows>();
+  auto out_value = out_var->GetMutable<pten::SelectedRows>();
   out_value->set_height(pre_in_value.height());
   out_value->set_rows(out_rows);
   size_t rows = out_rows.size();
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 38e20127f1612e74bd4dc6117680a3df8cc8244f..b46168bf8fb314eaf0234ebf5898a790fea714e1 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -146,7 +146,7 @@ struct TestGatherOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         in_var, platform::errors::NotFound(
                     "The variable '%s' is not found in the scope.", "input"));
-    auto in_selected_rows = in_var->GetMutable<framework::SelectedRows>();
+    auto in_selected_rows = in_var->GetMutable<pten::SelectedRows>();
     auto value = in_selected_rows->mutable_value();
     value->mutable_data(kDims, gpu_list_[input_scope_idx]);
@@ -162,10 +162,10 @@ struct TestGatherOpHandle {
     PADDLE_ENFORCE_NOT_NULL(
         out_var, platform::errors::NotFound(
                      "The variable '%s' is not found in the scope.", "out"));
-    auto out_selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    auto out_selected_rows = out_var->GetMutable<pten::SelectedRows>();
 
     auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
-    auto in_selected_rows = in_var->GetMutable<framework::SelectedRows>();
+    auto in_selected_rows = in_var->GetMutable<pten::SelectedRows>();
 
     out_selected_rows->mutable_value()->ShareDataWith(
         in_selected_rows->value());
@@ -177,7 +177,7 @@ struct TestGatherOpHandle {
 
     p::CPUPlace cpu_place;
 
-    auto& out_select_rows = out_var->Get<framework::SelectedRows>();
+    auto& out_select_rows = out_var->Get<pten::SelectedRows>();
     auto rt = out_select_rows.value();
 
     PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index db3eaece3569f19cc8297cbcf94df977c4e013ce..f57136e1f0ed94b3d573a36aa8367e227f7ead24 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -321,8 +321,8 @@ void CheckVarHasNanOrInf(const std::string& op_type,
   const Tensor* tensor{nullptr};
   if (var->IsType()) {
     tensor = &var->Get();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    tensor = &var->Get<framework::SelectedRows>().value();
+  } else if (var->IsType<pten::SelectedRows>()) {
+    tensor = &var->Get<pten::SelectedRows>().value();
   } else {
     VLOG(10) << var_name << " var_name need not to check";
     return;
@@ -468,8 +468,8 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name,
   const Tensor* tensor{nullptr};
   if (var->IsType()) {
     tensor = &var->Get();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    tensor = &var->Get<framework::SelectedRows>().value();
+  } else if (var->IsType<pten::SelectedRows>()) {
+    tensor = &var->Get<pten::SelectedRows>().value();
   } else {
     VLOG(10) << var_name << " var_name need not to check";
     return;
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 27f55e237f51689bc5dfcc1d5bcc92496aa506cb..427b981e7cda27269f9da5f007464a5fd97d28c2 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -275,10 +275,8 @@ struct OpInfoFiller {
 template
 struct OpInfoFiller {
   void operator()(const char* op_type, OpInfo* info) const {
-    PADDLE_ENFORCE_EQ(
-        info->infer_shape_, nullptr,
-        platform::errors::AlreadyExists(
-            "Duplicate InferShapeFN of %s has been registered", op_type));
+    // Note: if fill InferShapeFN by this Filler, the infershape here
+    // will overwrite the op->InferShape func registered in kOperator Filler
     info->infer_shape_ = [](InferShapeContext* ctx) {
       T inference;
       inference(ctx);
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index 583c34494bca4c64c033cde17b031851ae96f209..6d136055da7824a30a086d83a5e65f9674fa9cdb 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -20,6 +20,11 @@
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
+
+namespace pten {
+class SelectedRows;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -96,10 +101,10 @@ struct ReduceBufferData {
 
 struct GatherLocalSelectedRowsFunctor {
   GatherLocalSelectedRowsFunctor(
-      const std::vector<const SelectedRows *> &src_selected_rows,
+      const std::vector<const pten::SelectedRows *> &src_selected_rows,
       const std::vector &in_places,
       const std::map &dev_ctxes,
-      const platform::Place &out_place, SelectedRows *dst_selected_rows)
+      const platform::Place &out_place, pten::SelectedRows *dst_selected_rows)
       : dev_ctxes_(dev_ctxes),
         in_places_(in_places),
         out_place_(out_place),
@@ -147,7 +152,7 @@ struct GatherLocalSelectedRowsFunctor {
   std::vector in_tensors_;
   platform::Place out_place_;
- SelectedRows *dst_selected_rows_; + pten::SelectedRows *dst_selected_rows_; }; } // namespace details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 6493ef540ccbe0f70ea47d817907a75a001a7f94..5cf84a04958b82b91367e6fec477af6467fadd4f 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -114,10 +114,10 @@ void ReduceOpHandle::RunImpl() { t_out_p = platform::CPUPlace(); } - if (pre_in_var->IsType()) { + if (pre_in_var->IsType()) { this->RunAndRecordEvent([&] { - std::vector in_selected_rows = - GetInputValues(in_var_handles, var_scopes); + std::vector in_selected_rows = + GetInputValues(in_var_handles, var_scopes); const CollectiveContext &collective_context = *CollectiveContext::GetInstance(); @@ -130,7 +130,7 @@ void ReduceOpHandle::RunImpl() { platform::is_cpu_place(t_out_p)) { GatherLocalSelectedRowsFunctor functor( in_selected_rows, in_places, dev_ctxes_, t_out_p, - out_var->GetMutable()); + out_var->GetMutable()); WaitInputVarGenerated(); functor(); return; diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index e9c913b0c8255065f5a603560c36830c119d967a..5b1267d0970831431a91a4e8bae493594b929a6d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -27,7 +27,6 @@ namespace paddle { namespace framework { -class SelectedRows; namespace details { struct VarHandle; @@ -131,11 +130,11 @@ struct ReduceOpHandle : public OpHandleBase { defined PADDLE_WITH_DISTRIBUTE template void GatherSelectedRows( - const std::vector &src_selecte_rows_, + const std::vector &src_selecte_rows_, const std::vector &in_places, const std::map &dev_ctxes, VarHandle *out_var_handle, const platform::Place &out_place, - SelectedRows *dst_selecte_rows); + pten::SelectedRows *dst_selecte_rows); #endif void Wait( diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 35dba488454725ddc889f62a1c7511e38bd570ff..4931c64fdf83f7577f5e7c427c384eca4b83ed5f 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -174,7 +174,7 @@ struct TestReduceOpHandle { PADDLE_ENFORCE_NOT_NULL( in_var, platform::errors::NotFound( "Variable %s is not found in scope.", "input")); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -190,10 +190,10 @@ struct TestReduceOpHandle { PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound( "Variable %s is not found in scope.", "out")); - auto out_selected_rows = out_var->GetMutable(); + auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); - auto in_selected_rows = in_var->GetMutable(); + auto in_selected_rows = in_var->GetMutable(); out_selected_rows->mutable_value()->ShareDataWith( in_selected_rows->value()); @@ -205,7 +205,7 @@ struct TestReduceOpHandle { p::CPUPlace cpu_place; - auto &out_select_rows = out_var->Get(); + auto &out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); PADDLE_ENFORCE_EQ(out_select_rows.height(), height, diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc 
b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 7354824aae5996da77bca2893872300f623bc91f..2efe1c9555857f6e1be27c135c3c613bb2981876 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -33,9 +33,9 @@ static void GetTensors(Variable *var, std::unordered_set<Tensor *> *tensor_set) { if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) { tensor_set->insert(var->GetMutable<LoDTensor>()); - } else if (var->IsType<SelectedRows>() && - var->Get<SelectedRows>().value().IsInitialized()) { - tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value()); + } else if (var->IsType<pten::SelectedRows>() && + var->Get<pten::SelectedRows>().value().IsInitialized()) { + tensor_set->insert(var->GetMutable<pten::SelectedRows>()->mutable_value()); } else if (var->IsType<LoDTensorArray>()) { auto *tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto &t : *tensor_arr) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 265e346a9d8dfb0925783b812174410bb11ae86d..c8a6cd25f0fcbe9724972225d03b539285b7225f 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/platform/profiler.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle {
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 4315b6b0fc245a93f6adea9224ba45c40f0a3368..9979d2ee205311517d5047012ec52e3a1d2d9559 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -33,8 +33,8 @@ template <typename Func> static void VisitVariable(Variable* var, Func* func) { if (var->IsType<LoDTensor>()) { (*func)(var->GetMutable<LoDTensor>()); - } else if (var->IsType<SelectedRows>()) { - (*func)(var->GetMutable<SelectedRows>()); + } else if (var->IsType<pten::SelectedRows>()) { + (*func)(var->GetMutable<pten::SelectedRows>()); } else { PADDLE_THROW(platform::errors::Unimplemented( "VisitVariable is not supported for type %s.", @@ -46,8 +46,8 @@ template <typename Func> static void VisitVariable(const Variable& var, Func* func) { if (var.IsType<LoDTensor>()) { (*func)(var.Get<LoDTensor>()); - } else if (var.IsType<SelectedRows>()) { - (*func)(var.Get<SelectedRows>()); + } else if (var.IsType<pten::SelectedRows>()) { + (*func)(var.Get<pten::SelectedRows>()); } else { PADDLE_THROW(platform::errors::Unimplemented( "VisitVariable is not supported for type %s.", ToTypeName(var.Type()))); @@ -59,7 +59,7 @@ struct TensorVisitor { void operator()(LoDTensor* tensor) { result_ = tensor; } - void operator()(SelectedRows* selected_rows) { + void operator()(pten::SelectedRows* selected_rows) { result_ = selected_rows->mutable_value(); } @@ -85,8 +85,8 @@ struct ShareDimsAndLoDVisitor { tensor->Resize(val.dims()); } - void operator()(const SelectedRows& val) { - auto* selected_rows = trg_->GetMutable<SelectedRows>(); + void operator()(const pten::SelectedRows& val) { + auto* selected_rows = trg_->GetMutable<pten::SelectedRows>(); selected_rows->set_rows(val.rows()); selected_rows->set_height(val.height()); selected_rows->mutable_value()->Resize(val.value().dims()); @@ -131,8 +131,8 @@ struct EnforceShapeAndDTypeEQVisitor { "The layout of the two variables' tensors tensor is not equal.")); } - void operator()(const SelectedRows& src) { - auto& selected_rows = dst_->Get<SelectedRows>(); + void operator()(const pten::SelectedRows& src) { + auto& selected_rows = dst_->Get<pten::SelectedRows>(); PADDLE_ENFORCE_EQ( src.place().GetType(), selected_rows.place().GetType(), platform::errors::PreconditionNotMet( diff --git
a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 83d5a2efa342e57a3124651324824fddb287cc01..bea23469f113a94489d3ec53206b9b68b433c8e9 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -815,8 +815,8 @@ void DownpourWorker::TrainFiles() { if (var->IsType()) { tensor = var->GetMutable(); len = tensor->numel(); - } else if (var->IsType()) { - auto selected_rows = var->GetMutable(); + } else if (var->IsType()) { + auto selected_rows = var->GetMutable(); tensor = selected_rows->mutable_value(); len = tensor->numel(); } diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 6e5578a2d12b4c29445c1ee4597431a647a13c9a..00d2149cb184b3766f4e68e179a280c0c98640e5 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -147,9 +147,10 @@ void DeleteUnusedTensors(const Scope &scope, VLOG(2) << "Erase variable " << var_name; if (var->IsType()) { garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); - } else if (var->IsType()) { - garbages.emplace_back( - var->GetMutable()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); } else if (var->IsType()) { auto *lod_tensor_arr = var->GetMutable(); for (auto &t : *lod_tensor_arr) { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3e8b0cfbc31f3551bcd6101e7ba48927b9600553..a88ffbe3d9637a8c6d3de9e065bd380d0c69c280 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -26,10 +26,10 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" #endif -#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 509b43431b572539608cd976f67d1cab90414856..b3173a1386582a27faccdcdc49d0c5013204901f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -21,9 +21,9 @@ limitations under the License. 
*/ #include "common_value.h" // NOLINT #endif #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/distributed/ps/table/depends/large_scale_kv.h" #endif -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/pten/core/utils/rw_lock.h" #include "thrust/pair.h" // #include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" @@ -81,7 +81,7 @@ class HashTable { << " push value size: " << push_grad_value_size_; } - std::unique_ptr rwlock_{nullptr}; + std::unique_ptr rwlock_{nullptr}; private: TableContainer* container_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index dec73574685585747178bd0c2c65d39090eb6943..72e628223e31782b2dcfb74567654708ffbd2d57 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -121,7 +121,7 @@ __global__ void dy_mf_update_kernel(Table* table, template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); - rwlock_.reset(new RWLock); + rwlock_.reset(new pten::RWLock); } template diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index aa01c5f769ae252ff04ef7e2526c473d6604403a..ef5cd8466f1759484f8541546235ac44dd827037 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -29,7 +29,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif -#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" @@ -43,7 +43,7 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc index cb939f38ff3d9678e09e5cae433317031a47d78f..13eb78874c395e8ff2baa01d2fd0bd9f2df5c42c 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -13,7 +13,7 @@ // limitations under the License. #if defined(PADDLE_WITH_PSCORE) -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index a4e582c8fed13d93ec54ed29ad26ebe3d109aa09..8e94bb1d0e1498bfa69db565de0de36ffce63cb3 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -11,7 +11,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_PSCORE) #include -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/platform/cpu_helper.h" diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index f4660751b582a460f8079173a9bb859e26711344..0b4c8f4a719afcb0aee39fb369516b9b47e52a71 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/lodtensor_printer.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 9a91a5208ebbcb97fc1770bc3bfd5b860716c135..08b945159ad7ee201514845af2cb8d8f5876664c 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -15,11 +15,14 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/compat/op_utils.h" #include "paddle/pten/core/compat_utils.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/infermeta_utils.h" #include "paddle/pten/core/meta_tensor.h" namespace paddle { @@ -186,5 +189,40 @@ class CompatMetaTensor : public pten::MetaTensor { bool is_runtime_; }; +pten::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, + const std::string& op_type) { + // 1. get kernel args + InitDefaultKernelSignatureMap(); + auto arg_map_fn = pten::OpUtilsMap::Instance().GetArgumentMappingFn(op_type); + PADDLE_ENFORCE_NOT_NULL( + arg_map_fn, platform::errors::NotFound( + "The ArgumentMappingFn of %s op is not found.", op_type)); + InferShapeArgumentMappingContext arg_map_context(*ctx); + auto signature = arg_map_fn(arg_map_context); + VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; + + // 2. 
build infermeta context + pten::InferMetaContext infer_meta_context(ctx->IsRuntime()); + + auto& input_names = std::get<0>(signature.args); + auto& output_names = std::get<2>(signature.args); + // TODO(chenweihang): support attrs in next pr + // auto& attr_names = std::get<1>(signature.args); + + // TODO(chenweihang): support multiple inputs and outputs + for (auto& in_name : input_names) { + infer_meta_context.EmplaceBackInput(std::make_shared<CompatMetaTensor>( + ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + } + for (auto& out_name : output_names) { + infer_meta_context.EmplaceBackOutput(std::make_shared<CompatMetaTensor>( + ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); + } + // TODO(chenweihang): support attrs later + + return infer_meta_context; +} + } // namespace framework } // namespace paddle
diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index f943989523e50d1361aebbdebe771811cdb358f3..fbfb44e27c8b104cfefb8256aedbb3af8a4caf8f 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -26,7 +26,6 @@ class InferMetaContext; namespace paddle { namespace framework { -// TODO(chenweihang): impl this function in next PR pten::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, const std::string& op_type);
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0d9c460628e17186152462c313937aff5490e723..323e743087ffbc0f979768bb9a8b8dd7eaec25b2 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -132,6 +132,22 @@ if(WITH_MKLDNN) pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) endif() +if(WITH_IPU) + pass_library(forward_graph_extract_pass base DIR ipu) + pass_library(optimizer_extract_pass base DIR ipu) + pass_library(optimizer_state_align_pass base DIR ipu) + pass_library(ipu_graph_builder_pass base DIR ipu) + pass_library(ipu_runtime_replacer_pass base DIR ipu) + pass_library(inference_process_pass base DIR ipu) + pass_library(inference_postprocess_pass base DIR ipu) + pass_library(popart_canonicalization_pass base DIR ipu) + pass_library(ipu_inplace_pass base DIR ipu) + pass_library(infer_shape_pass base DIR ipu) + pass_library(delete_scale_op_pass base DIR ipu) + pass_library(avg_shard_pass base DIR ipu) + pass_library(transfer_cast_op_pass base DIR ipu) +endif() + cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b2ab6bed36c3afbe99c8debd8547784fb455475f..83bed2a97baa7453ac84039405ad43a20a12a4bd 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -603,9 +603,9 @@ static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies( for (const auto *op_desc : block_ops) { size_t op_idx = op_id_to_idx.size(); PADDLE_ENFORCE_EQ( - op_id_to_idx.emplace(op_desc->Id(), op_idx).second, true, + op_id_to_idx.emplace(op_desc->OriginalId(), op_idx).second, true, platform::errors::InvalidArgument( - "There should not be duplicate op id: %d", op_desc->Id())); + "There should not be duplicate op id: %d", op_desc->OriginalId())); } std::vector<std::vector<ir::Node::Dep>> dep_matrix(op_num); @@ -624,9 +624,9 @@
static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies( for (const auto &pair : all_preceding_ops) { const auto *cur_op_node = pair.first; - size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->Id()); + size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->OriginalId()); for (const auto *preceding_op_node : pair.second) { - size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->Id()); + size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->OriginalId()); dep_matrix[op_idx_1][op_idx_2] = ir::Node::Dep::kAfter; dep_matrix[op_idx_2][op_idx_1] = ir::Node::Dep::kBefore; }
diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 45087036b5d17dc500d59f0413dd4f7223bc9e4c..32d3cdef4512bb072820970ed9db6d2d1289652b 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/lodtensor_printer.h" #if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #endif namespace paddle { @@ -136,7 +136,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, if (!root_var) { continue; } - if (root_var->IsType<SelectedRows>()) { + if (root_var->IsType<pten::SelectedRows>()) { continue; } LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 9230c36a0c7450dc96304e5a0f773feabe610afa..3fe9e877658dad64bfeb4737f025ea73b54840f8 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -259,7 +259,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, auto var = var_name_item.second[i]; auto& var_name = new_ins[var_name_item.first].at(i); const Tensor* tensor_in; - if (var->IsType<LoDTensor>() || var->IsType<SelectedRows>()) { + if (var->IsType<LoDTensor>() || var->IsType<pten::SelectedRows>()) { tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); } else if (var->IsType<LoDTensorArray>()) { tensor_in =
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index aea9ad20353966f3b9491d85129bfd62269cfcb0..f71a5b2c710cea76a4d5346b14af7e69b8215f95 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -676,8 +676,9 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { operators::reader:: OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // do nothing - } else if (var->IsType<SelectedRows>()) { - TensorRecordStream(*(var->GetMutable<SelectedRows>()->mutable_value())); + } else if (var->IsType<pten::SelectedRows>()) { + TensorRecordStream( + *(var->GetMutable<pten::SelectedRows>()->mutable_value())); } else if (var->IsType<LoDTensorArray>()) { auto* tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto& tensor : *tensor_arr) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc index 7beefec4487de31d2fa558153b7a0522545def72..ba81ee9166fd655cf1c6b2b0bf14486d5c274143 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc @@ -76,10 +76,12 @@ void InterpreterCoreEventGarbageCollector::Add( } else if (var->IsType()) { // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ?
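// NOTE(reviewer): hedged sketch of the eager-deletion step the hunks below
// apply to pten::SelectedRows: only the value tensor's allocation is handed
// to the collector and the row index is cleared in place. The garbage
// container type is a stand-in; MoveMemoryHolder/mutable_rows are used
// exactly as in this diff.
#include "paddle/fluid/framework/variable.h"
#include "paddle/pten/core/selected_rows.h"

template <typename GarbageList>  // e.g. the collector's queue of allocations
void CollectSelectedRows(paddle::framework::Variable* var,
                         GarbageList* garbages) {
  auto* sr = var->GetMutable<pten::SelectedRows>();
  // Only the allocation moves out; the SelectedRows object itself stays alive.
  garbages->emplace_back(sr->mutable_value()->MoveMemoryHolder());
  sr->mutable_rows()->clear();
}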
- } else if (var->IsType<SelectedRows>()) { - Add(var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder(), + } else if (var->IsType<pten::SelectedRows>()) { + Add(var->GetMutable<pten::SelectedRows>() + ->mutable_value() + ->MoveMemoryHolder(), event, ctx); - var->GetMutable<SelectedRows>()->mutable_rows()->clear(); + var->GetMutable<pten::SelectedRows>()->mutable_rows()->clear(); } else if (var->IsType<LoDTensorArray>()) { auto* tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto& t : *tensor_arr) { @@ -132,4 +134,4 @@ void InterpreterCoreEventGarbageCollector::Free( } } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc index 784cfca943ea1d88546e5d024bbdeaece2c55849..14fb8a9819b2dc4f1356b881150983937d691af6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc @@ -32,9 +32,11 @@ void InterpreterCoreFastGarbageCollector::Add(Variable* var) { } else if (var->IsType()) { // TODO(xiongkun03) in old executor, this type of variable is not support // eager deletion. so we just leave it here ? - } else if (var->IsType<SelectedRows>()) { - Add(var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder()); - var->GetMutable<SelectedRows>()->mutable_rows()->clear(); + } else if (var->IsType<pten::SelectedRows>()) { + Add(var->GetMutable<pten::SelectedRows>() + ->mutable_value() + ->MoveMemoryHolder()); + var->GetMutable<pten::SelectedRows>()->mutable_rows()->clear(); } else if (var->IsType<LoDTensorArray>()) { auto* tensor_arr = var->GetMutable<LoDTensorArray>(); for (auto& t : *tensor_arr) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 214a1d728266b03d122ffc5fdf36d4617612f22b..0371b12d009f3f15cfd649c143a81032484f49f2 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -468,8 +468,8 @@ void build_op_func_list(const platform::Place& place, if (var->IsType<LoDTensor>()) { garbages->emplace_back( var->GetMutable<LoDTensor>()->MoveMemoryHolder()); - } else if (var->IsType<SelectedRows>()) { - garbages->emplace_back(var->GetMutable<SelectedRows>() + } else if (var->IsType<pten::SelectedRows>()) { + garbages->emplace_back(var->GetMutable<pten::SelectedRows>() ->mutable_value() ->MoveMemoryHolder()); } else if (var->IsType<LoDTensorArray>()) {
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index fb29e18887b4ee74b448323fb1d14409212e9f71..6c5e98489ef5a8db4f163cc31d888e900bdbb582 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -18,7 +18,7 @@ #include #include "paddle/fluid/framework/new_executor/new_executor_defs.h" -#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/pten/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and @@ -171,9 +171,9 @@ void InterpretercoreInferShapeContext::ShareDim(const std::string& in, platform::errors::InvalidArgument( "The type of input (%s) and output (%s) are inconsistent.", in, out)); - if (in_var->IsType<SelectedRows>()) { - auto& in_sele_rows = in_var->Get<SelectedRows>(); - auto out_sele_rows = out_var->GetMutable<SelectedRows>(); + if (in_var->IsType<pten::SelectedRows>()) { + auto& in_sele_rows = in_var->Get<pten::SelectedRows>(); + auto out_sele_rows = out_var->GetMutable<pten::SelectedRows>(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -392,8 +392,8 @@ DDim InterpretercoreInferShapeContext::GetDim(Variable* var) const { var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only LoDTensor or SelectedRows support 'GetDim', but input " @@ -420,8 +420,8 @@ std::vector InterpretercoreInferShapeContext::GetRepeatedDims( void InterpretercoreInferShapeContext::SetDim(Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Variable type error, expect LoDTensor or SelectedRows, but received " diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 0ef85a25a237b5b97f4bba32dc28a436a5336174..b61b8af1e4a1b38f3db686e3b438aaf7745ed3c0 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -19,10 +19,10 @@ #include #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" +#include "paddle/pten/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h index f633538e700b242469bce6d76dfb58e89f9cdbe8..e4fd66fee2d732e51351b050c852aefa6cdb6001 100644 --- a/paddle/fluid/framework/op_call_stack.h +++ b/paddle/fluid/framework/op_call_stack.h @@ -19,12 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace platform { -struct EnforceNotMet; -} // namespace platform -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index fb2d23a5513b4fee64276ec5880ffe7729d2f500..a22adacd31a91c966fba3f77fbf914a987c409a8 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -18,12 +18,6 @@ limitations under the License. */ #include "gtest/gtest-test-part.h" #include "gtest/gtest.h" -namespace paddle { -namespace platform { -struct EnforceNotMet; -} // namespace platform -} // namespace paddle - class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { public: void Make() { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 33a4e5d2f390611a3f079bff3232a1bd5f7b3ac0..087a817d03af1c5bffd15965071dc48b4a299e9f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" +#include "paddle/pten/ops/compat/signatures.h" namespace pten { class DenseTensor; @@ -77,11 +78,11 @@ static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, if (var->IsType()) { const LoDTensor& tensor = var->Get(); return tensor.dims(); - } else if (var->IsType()) { + } else if (var->IsType()) { if (get_actual_dim) { - return var->Get().value().dims(); + return var->Get().value().dims(); } else { - return var->Get().GetCompleteDims(); + return var->Get().GetCompleteDims(); } } else if (var->IsType()) { return DDim({static_cast(var->Get().size())}); @@ -108,8 +109,8 @@ static std::string GetDtype(const ScopeBase& scope, const std::string& name) { return ""; } return DataTypeToString(tensor.type()); - } else if (var->IsType()) { - auto tensor = var->Get().value(); + } else if (var->IsType()) { + auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { @@ -139,8 +140,8 @@ static std::string GetPlace(const ScopeBase& scope, const std::string& name) { return ""; } return to_string(tensor.place()); - } else if (var->IsType()) { - auto tensor = var->Get().value(); + } else if (var->IsType()) { + auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { @@ -157,8 +158,8 @@ static int GetRowSize(const ScopeBase& scope, const std::string& name) { return -1; } - if (var->IsType()) { - return var->Get().rows().size(); + if (var->IsType()) { + return var->Get().rows().size(); } return -1; @@ -497,8 +498,8 @@ void OperatorBase::GenerateTemporaryNames() { const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { if (var.IsType()) { return static_cast(&(var.Get())); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -509,8 +510,8 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { if (var->IsType()) { return var->GetMutable(); - } else if (var->IsType()) { - return var->GetMutable()->mutable_value(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Variable type is %s, expect LoDTensor or SelectedRows.", @@ -741,9 +742,9 @@ class RuntimeInferShapeContext : public InferShapeContext { "The type of input (%s) and output (%s) are inconsistent.", in, out)); - if (in_var->IsType()) { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); out_sele_rows->set_height(in_sele_rows.height()); @@ -950,8 +951,8 @@ class RuntimeInferShapeContext : public InferShapeContext { var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only LoDTensor or SelectedRows support 'GetDim', but input " @@ -976,8 
+977,8 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDim(Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Variable type error, expect LoDTensor or SelectedRows, but received " @@ -1086,6 +1087,13 @@ bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, return use_mkldnn_ctx && this->SupportsMKLDNN(data_type); } +void OperatorWithKernel::InferShape(InferShapeContext* ctx) const { + PADDLE_THROW(platform::errors::PermissionDenied( + "The default InferShape function of OperatorWithKernel is not allowed to " + "be called, please override corresponding InferShape function in the " + "specific operator.")); +} + void OperatorWithKernel::RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const { @@ -1342,6 +1350,16 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif +#ifdef PADDLE_WITH_IPU + if (kernel_iter == kernels.end() && + platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing IPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } +#endif #ifdef PADDLE_WITH_ASCEND_CL if (kernel_iter == kernels.end() && platform::is_npu_place(expected_kernel_key.place_)) { @@ -1646,8 +1664,8 @@ void OperatorWithKernel::ParseInputDataType( t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); - } else if (var->IsType()) { - t = &(var->Get().value()); + } else if (var->IsType()) { + t = &(var->Get().value()); } else if (var->IsType()) { auto t_arr = &var->Get(); for (size_t j = 0; j < t_arr->size(); j++) { @@ -1728,8 +1746,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t = var->GetMutable(); } else if (var->IsType()) { t = var->GetMutable(); - } else if (var->IsType()) { - t = var->GetMutable()->mutable_value(); + } else if (var->IsType()) { + t = var->GetMutable()->mutable_value(); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input variable type in complex type promotion.")); @@ -1784,8 +1802,10 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar( KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( const ExecutionContext& ctx) const { - return KernelSignatureMap::Instance().Get( - pten::TransToPtenKernelName(Type())); + InitDefaultKernelSignatureMap(); + ExecutionArgumentMappingContext arg_mapping_ctx(ctx); + return pten::OpUtilsMap::Instance().GetArgumentMappingFn(Type())( + arg_mapping_ctx); } Scope* OperatorWithKernel::PreparePtenData( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 40c80ec5f2d654b57a72290398e323e1ce91e156..c280eeaa0fa5713bf52679996bbe2b3f7ac22473 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -41,6 +41,7 @@ limitations under the License. 
*/ #include "paddle/utils/flat_hash_map.h" #include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/compat/op_utils.h" #include "paddle/pten/core/kernel_context.h" #include "paddle/pten/core/kernel_factory.h" @@ -117,7 +118,7 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) { } inline bool VarIsTensor(const Variable& var) { - return var.IsType() || var.IsType(); + return var.IsType() || var.IsType(); } const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); @@ -468,12 +469,11 @@ class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext { } bool IsDenseTensorInput(const std::string& name) const override { - return ctx_.InputVar(name)->IsType() || - ctx_.InputVar(name)->IsType(); + return ctx_.InputVar(name)->IsType(); } bool IsSelectedRowsInput(const std::string& name) const override { - return ctx_.InputVar(name)->IsType(); + return ctx_.InputVar(name)->IsType(); } private: @@ -550,7 +550,7 @@ class OperatorWithKernel : public OperatorBase { bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, proto::VarType::Type data_type) const; - virtual void InferShape(InferShapeContext* ctx) const = 0; + virtual void InferShape(InferShapeContext* ctx) const; void RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const override; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index df7e3c4f6dde3b7ff8eb7d9a199f11fca45a034e..ef6c41990cd6e243cd5d7d062722ccd1555e9591 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -456,7 +456,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto* var = scope.Var("selected_rows_1"); - var->GetMutable(); + var->GetMutable(); bool caught = false; try { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 54167d95899d6fab81a8657b167012b47bf950ea..535c9ab58e295fae2048bb162adfb0384745d0ae 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -38,12 +38,12 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace framework { @@ -75,7 +75,7 @@ const CinnCompiledObject& CinnCompiler::Compile( bool exist = false; { - AutoRDLock r_guard{&rwlock_}; + pten::AutoRDLock r_guard{&rwlock_}; exist = cache_by_address_.count(cur_key_by_address) != 0; // if cannot find graph by address, checkout whether the graph structure // have been stored in cache. 
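// NOTE(reviewer): the locking discipline kept by these CinnCompiler hunks is
// plain double-checked caching with pten's RAII guards; a minimal sketch with
// placeholder cache/key types and a hypothetical Compute() step.
#include <cstdint>
#include <map>
#include "paddle/pten/core/utils/rw_lock.h"

int Compute(int64_t key);  // hypothetical expensive step, done outside locks

int LookupOrCompute(pten::RWLock* rwlock, std::map<int64_t, int>* cache,
                    int64_t key) {
  {
    pten::AutoRDLock r_guard{rwlock};  // shared: many readers may probe
    auto it = cache->find(key);
    if (it != cache->end()) return it->second;
  }
  int result = Compute(key);
  {
    pten::AutoWRLock w_guard{rwlock};  // exclusive: publish the result
    cache->emplace(key, result);  // emplace keeps the first value on a race
  }
  return result;
}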
@@ -96,13 +96,13 @@ const CinnCompiledObject& CinnCompiler::Compile( std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); - AutoWRLock w_guard{&rwlock_}; + pten::AutoWRLock w_guard{&rwlock_}; if (!cache_by_struct_.count(cur_key_by_struct)) { cache_by_address_[cur_key_by_address] = compiled_res.get(); cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); } } - AutoRDLock guard{&rwlock_}; + pten::AutoRDLock guard{&rwlock_}; const auto& cached_boj = *cache_by_address_[cur_key_by_address]; return cached_boj; } @@ -198,7 +198,7 @@ std::string CinnCompiler::ReadableKey( void CinnCompiler::Clear() { { - AutoWRLock guard{&rwlock_}; + pten::AutoWRLock guard{&rwlock_}; graphs_.clear(); cache_by_address_.clear(); cache_by_struct_.clear(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 5070eb5ce5674dfc5803c61a1eb38117432fb4c1..024dd26747b8e7db9eec15fd2998cefaeeb931fb 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -26,9 +26,9 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { @@ -102,7 +102,7 @@ class CinnCompiler { std::unique_ptr, CinnCacheKey::Hash> cache_by_struct_; std::atomic_int64_t real_compiled_num_{0}; - mutable RWLock rwlock_; + mutable pten::RWLock rwlock_; DISABLE_COPY_AND_ASSIGN(CinnCompiler); }; diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 2fd5b87b7f3fd2cfc655ca6112ef33bddedb59cf..dc20aaffec9ca7abce0096fe1d948d043cc5e044 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/core/compat/op_utils.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_factory.h" @@ -89,48 +90,6 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey( return pten::KernelKey(backend, layout, dtype); } -KernelSignatureMap* KernelSignatureMap::kernel_signature_map_ = nullptr; -std::once_flag KernelSignatureMap::init_flag_; - -KernelSignatureMap& KernelSignatureMap::Instance() { - std::call_once(init_flag_, [] { - kernel_signature_map_ = new KernelSignatureMap(); - for (const auto& pair : OpInfoMap::Instance().map()) { - const auto& op_type = pair.first; - const auto* op_proto = pair.second.proto_; - if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && - op_proto) { - KernelArgsNameMakerByOpProto maker(op_proto); - VLOG(10) << "Register kernel signature for " << op_type; - auto success = kernel_signature_map_->map_ - .emplace(pten::TransToPtenKernelName(op_type), - std::move(maker.GetKernelSignature())) - .second; - PADDLE_ENFORCE_EQ( - success, true, - platform::errors::PermissionDenied( - "Kernel signature of the operator %s has been registered.", - op_type)); - } - } - }); - return *kernel_signature_map_; -} - -bool KernelSignatureMap::Has(const std::string& op_type) const { - return map_.find(op_type) != map_.end(); -} - -const KernelSignature& KernelSignatureMap::Get( - const std::string& op_type) const { - auto it = map_.find(op_type); - PADDLE_ENFORCE_NE( - it, map_.end(), - platform::errors::NotFound( - "Operator `%s`'s kernel signature is not registered.", op_type)); - return it->second; -} - const paddle::SmallVector& KernelArgsNameMakerByOpProto::GetInputArgsNames() { for (int i = 0; i < op_proto_->inputs_size(); ++i) { @@ -196,6 +155,24 @@ KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { GetOutputArgsNames()); } +std::once_flag kernel_sig_map_init_flag; + +void InitDefaultKernelSignatureMap() { + std::call_once(kernel_sig_map_init_flag, [] { + for (const auto& pair : paddle::framework::OpInfoMap::Instance().map()) { + const auto& op_type = pair.first; + const auto* op_proto = pair.second.proto_; + if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && + op_proto) { + paddle::framework::KernelArgsNameMakerByOpProto maker(op_proto); + VLOG(10) << "Register kernel signature for " << op_type; + pten::DefaultKernelSignatureMap::Instance().Insert( + op_type, std::move(maker.GetKernelSignature())); + } + } + }); +} + void SetAllocationForOutputTenosr(pten::DenseTensor* tensor, const platform::Place& place) { if (!tensor->IsInitialized() || !(tensor->place() == place)) { diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index ab129c6313dabfecf3d7cd1968b66485e48ec211..9b1019f65823774d315b12c14c307b416ca9ff70 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -44,26 +44,6 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey( /* Kernel Args parse */ -// TODO(chenweihang): we can generate this map by proto info in compile time -class KernelSignatureMap { - public: - static KernelSignatureMap& Instance(); - - bool Has(const std::string& op_type) const; - - const KernelSignature& Get(const std::string& op_type) const; - - private: - KernelSignatureMap() = default; - DISABLE_COPY_AND_ASSIGN(KernelSignatureMap); - - private: - static KernelSignatureMap* kernel_signature_map_; - static std::once_flag init_flag_; - - paddle::flat_hash_map map_; -}; - 
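// NOTE(reviewer): net effect of this pten_utils change, sketched with only
// names that appear in the patch: the fluid-side KernelSignatureMap singleton
// is gone, and lookup goes through the idempotent initializer plus pten's
// argument-mapping registry. The mapping fn is assumed non-null here;
// BuildInferMetaContext enforces NotFound otherwise.
#include <string>
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/core/compat/op_utils.h"

auto SignatureFor(const std::string& op_type,
                  const pten::ArgumentMappingContext& ctx) {
  paddle::framework::InitDefaultKernelSignatureMap();  // std::call_once inside
  auto arg_map_fn = pten::OpUtilsMap::Instance().GetArgumentMappingFn(op_type);
  return arg_map_fn(ctx);
}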
class KernelArgsNameMaker { public: virtual ~KernelArgsNameMaker() {} @@ -72,6 +52,8 @@ class KernelArgsNameMaker { virtual const paddle::SmallVector& GetAttrsArgsNames() = 0; }; +void InitDefaultKernelSignatureMap(); + void SetAllocationForOutputTenosr(pten::DenseTensor* tensor, const platform::Place& place); @@ -86,5 +68,12 @@ struct ConvertToPtenContext { using TYPE = pten::CPUContext; }; +#ifdef PADDLE_WITH_XPU +template <> +struct ConvertToPtenContext { + using TYPE = pten::XPUContext; +}; +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h deleted file mode 100644 index 9b74a55304077c6c13a55f36ea8cf3b6dfbe5b9c..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/rw_lock.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if !defined(_WIN32) -#include -#else -#include // NOLINT -#endif // !_WIN32 - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -#if !defined(_WIN32) -struct RWLock { - RWLock() { pthread_rwlock_init(&lock_, nullptr); } - - ~RWLock() { pthread_rwlock_destroy(&lock_); } - - inline void RDLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_rdlock(&lock_), 0, - platform::errors::External("The pthread failed to acquire read lock.")); - } - - inline void WRLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, - platform::errors::External( - "The pthread failed to acquire write lock.")); - } - - inline void UNLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_unlock(&lock_), 0, - platform::errors::External("The pthread failed to unlock.")); - } - - private: - pthread_rwlock_t lock_; -}; -// TODO(paddle-dev): Support RWLock for WIN32 for correctness. -#else -// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive -// In windows, rw_lock seems like a hack. Use empty object and do nothing. 
-struct RWLock { - // FIXME(minqiyang): use mutex here to do fake lock - inline void RDLock() { mutex_.lock(); } - - inline void WRLock() { mutex_.lock(); } - - inline void UNLock() { mutex_.unlock(); } - - private: - std::mutex mutex_; -}; -#endif - -class AutoWRLock { - public: - explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - ~AutoWRLock() { UnLock(); } - - private: - inline void Lock() { lock_->WRLock(); } - - inline void UnLock() { lock_->UNLock(); } - - private: - RWLock* lock_; -}; - -class AutoRDLock { - public: - explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - ~AutoRDLock() { UnLock(); } - - private: - inline void Lock() { lock_->RDLock(); } - - inline void UnLock() { lock_->UNLock(); } - - private: - RWLock* lock_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc deleted file mode 100644 index d140e95a37d84fe34397e06092a3ec89c8dc8435..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/rw_lock_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/rw_lock.h" - -#include -#include // NOLINT - -namespace f = paddle::framework; - -void f1(f::RWLock *lock) { - lock->RDLock(); - lock->UNLock(); -} - -TEST(RWLOCK, read_read) { - f::RWLock lock; - lock.RDLock(); - std::thread t1(f1, &lock); - std::thread t2(f1, &lock); - t1.join(); - t2.join(); - lock.UNLock(); -} - -void f2(f::RWLock *lock, std::vector *result) { - lock->RDLock(); - ASSERT_EQ(result->size(), 0UL); - lock->UNLock(); -} - -void f3(f::RWLock *lock, std::vector *result) { - lock->WRLock(); - result->push_back(1); - lock->UNLock(); -} - -TEST(RWLOCK, read_write) { - f::RWLock lock; - std::vector result; - - lock.RDLock(); - std::thread t1(f2, &lock, &result); - t1.join(); - std::thread t2(f3, &lock, &result); - std::this_thread::sleep_for(std::chrono::seconds(1)); - ASSERT_EQ(result.size(), 0UL); - lock.UNLock(); - t2.join(); - ASSERT_EQ(result.size(), 1UL); -} - -void f4(f::RWLock *lock, std::vector *result) { - lock->RDLock(); - ASSERT_EQ(result->size(), 1UL); - lock->UNLock(); -} - -TEST(RWLOCK, write_read) { - f::RWLock lock; - std::vector result; - - lock.WRLock(); - std::thread t1(f4, &lock, &result); - std::this_thread::sleep_for(std::chrono::seconds(1)); - result.push_back(1); - lock.UNLock(); - t1.join(); -} diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index b2062cc51206a98889a5b584239791483b1722a4..e6a372a8e631f92bee69dfd705d23b0ea56678ac 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -34,10 +34,10 @@ PADDLE_DEFINE_EXPORTED_bool( #define SCOPE_VARS_READER_LOCK #define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock 
auto_lock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#define SCOPE_KIDS_READER_LOCK pten::AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK pten::AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK pten::AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK pten::AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b963c28d597bbb3614ccb00c4124123879dc0c84..7eb6082ce15fea2575c12d643329fe2a8bc555d7 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -26,9 +26,9 @@ extern "C" { #include #include -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/utils/rw_lock.h" namespace paddle { namespace framework { @@ -194,8 +194,8 @@ class Scope : public ScopeBase { #ifndef PADDLE_ON_INFERENCE private: - mutable RWLock kids_lock_; - mutable RWLock vars_lock_; + mutable pten::RWLock kids_lock_; + mutable pten::RWLock vars_lock_; #endif }; diff --git a/paddle/fluid/framework/selected_rows_utils.cc b/paddle/fluid/framework/selected_rows_utils.cc index c33ee655c2a98b73b517c922895f494f443dfd90..a1bffcfce19f1a0a8c9eaf954f174299790f5384 100644 --- a/paddle/fluid/framework/selected_rows_utils.cc +++ b/paddle/fluid/framework/selected_rows_utils.cc @@ -17,73 +17,8 @@ limitations under the License. */ namespace paddle { namespace framework { -struct ReAllocateVisitor { - ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor) - : dims_(dims), tensor_(tensor) {} - - template - void operator()() const { - framework::Tensor cpu_tensor; - platform::CPUPlace cpu; - T* ptr = cpu_tensor.mutable_data(dims_, cpu); - const T* old_ptr = - tensor_->memory_size() == 0 ? 
nullptr : tensor_->data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); - } - tensor_->ShareDataWith(cpu_tensor); - } - - framework::DDim dims_; - framework::Tensor* tensor_; -}; - -struct TensorCopyVisitor { - TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset, - const framework::Tensor src, int64_t src_offset, - int64_t size) - : dst_(dst), - dst_offset_(dst_offset), - src_(src), - src_offset_(src_offset), - size_(size) {} - - template - void apply() const { - // TODO(Yancey1989): support other place - platform::CPUPlace cpu; - memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, - src_.data() + src_offset_, size_ * sizeof(T)); - } - - framework::Tensor* dst_; - int64_t dst_offset_; - framework::Tensor src_; - int64_t src_offset_; - int64_t size_; -}; - -struct TensorFillVisitor { - TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size, - float value) - : dst_(dst), dst_offset_(dst_offset), size_(size) {} - - template - void apply() const { - // TODO(qiao): support other place - platform::CPUPlace cpu; - auto* tensor_data = dst_->mutable_data(cpu); - auto* start = tensor_data + dst_offset_; - auto* end = start + size_; - std::fill(start, end, static_cast(0.0)); - } - - framework::Tensor* dst_; - int64_t dst_offset_; - int64_t size_; -}; - -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version constexpr uint32_t version = 0; @@ -107,7 +42,8 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; auto place = selected_rows.place(); @@ -115,14 +51,15 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { SerializeToStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) { +void DeserializeFromStream(std::istream& os, + pten::SelectedRows* selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); DeserializeFromStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, +void DeserializeFromStream(std::istream& is, pten::SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, unit32_t version for SelectedRows @@ -151,109 +88,5 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, // the 4st field, tensor which contains the data TensorFromStream(is, selected_rows->mutable_value(), dev_ctx); } - -bool SelectedRows::HasKey(int64_t key) const { - return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? 
false - : true; -} - -int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, - bool is_test) { - if (is_test) { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - - rwlock_->RDLock(); - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - rwlock_->UNLock(); - PADDLE_ENFORCE_EQ( - auto_grown, true, - platform::errors::NotFound("Input key(%lld) is not found.", key)); - rwlock_->WRLock(); - auto map_size = id_to_index_.size(); - auto vector_size = rows_.size(); - if (map_size != vector_size) { - rwlock_->UNLock(); - PADDLE_THROW(platform::errors::InvalidArgument( - "Row map size(%zu) should be equal to rows size(%zu).", map_size, - vector_size)); - } - auto write_iter = id_to_index_.find(key); - if (write_iter == id_to_index_.end()) { - int row_num = rows_.size(); - if (row_num == value_->dims()[0]) { - rwlock_->UNLock(); - PADDLE_THROW(platform::errors::InvalidArgument( - "Selected rows is full, then length exceed the length of first " - "dimension (%d).", - row_num)); - } - // key logic to put a key into id_to_index_ - rows_.push_back(key); - auto index = static_cast(rows_.size() - 1); - id_to_index_[key] = index; - rwlock_->UNLock(); - return index; - } else { - auto index = write_iter->second; - rwlock_->UNLock(); - return index; - } - } else { - auto index = iter->second; - rwlock_->UNLock(); - return index; - } -} - -void SelectedRows::SyncIndex() { - rwlock_->WRLock(); - id_to_index_.clear(); - for (size_t i = 0; i < rows_.size(); ++i) { - id_to_index_[rows_[i]] = i; - } - rwlock_->UNLock(); -} - -void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown, bool is_test) { - PADDLE_ENFORCE_EQ(value->IsInitialized(), true, - platform::errors::InvalidArgument( - "The value tensor is not initialized.")); - if (ids.numel() == 0) { - VLOG(3) << "keys is empty, please check data!"; - } else { - int64_t value_width = value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ( - value_width, value->numel() / value->dims()[0], - platform::errors::InvalidArgument( - "Output tensor should have the same shape with table " - "except the first dimmension, excepted value width not counting " - "the first dimension is %d, actual value width is %d.", - value_width, value->numel() / value->dims()[0])); - for (int i = 0; i < ids.numel(); ++i) { - auto id = ids.data()[i]; - int64_t index = AutoGrownIndex(id, auto_grown, is_test); - if (index < 0) { - VLOG(5) << "id " << id << " not in the table, return 0"; - framework::VisitDataType( - value_->type(), - TensorFillVisitor(value, i * value_width, value_width, 0.0)); - } else { - framework::VisitDataType( - value_->type(), - TensorCopyVisitor(value, i * value_width, *value_.get(), - index * value_width, value_width)); - } - } - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_utils.h b/paddle/fluid/framework/selected_rows_utils.h index 445f446ef2f4aecac496250a1269514f1faa037b..e1b26f2bbafa3f8ed3c372010c02c44d08c81066 100644 --- a/paddle/fluid/framework/selected_rows_utils.h +++ b/paddle/fluid/framework/selected_rows_utils.h @@ -21,153 +21,28 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/selected_rows.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { - -class SelectedRows { - /* - * @brief We can use the SelectedRows structure to reproduce a sparse table. - * A sparse table is a key-value structure that the key is an `int64_t`, - * and the value is a Tensor which the first dimension is 0. - * You can use the following interface to operate the sparse table, and you - * can find - * some detail information from the comments of each interface: - * - * HasKey(key), whether the sparse table has the specified key. - * Set(key, value), set a key-value pair into the sparse table. - * Get(keys, value*), get value by given key list and apply it to the given - * value pointer - * with the specified offset. - * - */ - public: - SelectedRows(const std::vector& rows, const int64_t& height) - : rows_(rows), height_(height) { - value_.reset(new Tensor()); - rwlock_.reset(new RWLock); - } - - SelectedRows() { - height_ = 0; - value_.reset(new Tensor()); - rwlock_.reset(new RWLock); - } - - const platform::Place& place() const { return value_->place(); } - - const Tensor& value() const { return *value_; } - - Tensor* mutable_value() { return value_.get(); } - - int64_t height() const { return height_; } - - void set_height(int64_t height) { height_ = height; } - - const Vector& rows() const { return rows_; } - - Vector* mutable_rows() { return &rows_; } - - void set_rows(const Vector& rows) { rows_ = rows; } - - /* - * @brief Get the index of key in rows - * - * @return -1 if the key does not exists. - */ - int64_t Index(int64_t key) const { - auto it = std::find(rows_.begin(), rows_.end(), key); - if (it == rows_.end()) { - PADDLE_THROW(platform::errors::NotFound( - "Input id (%lld) is not in current rows table.", key)); - } - return static_cast(std::distance(rows_.begin(), it)); - } - - /* - * @brief whether has the specified key in the table. - * - * @return true if the key is exists. - */ - bool HasKey(int64_t key) const; - - /* - * @brief Get value by the key list. - * Note!!! this interface is only used when selected_rows is used as - * parameters - * for distribute lookup table. - * - * @return a list of pair which contains the non-exists key and the index in - * the value - */ - void Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown = false, bool is_test = false); - - /* - * @brief Get the index of the key from id_to_index_ map. If the key not - * exist, - * add the key into id_to_index_. - * - * Note!!! this interface is only used when selected_rows is used as - * parameters - * for distribute lookup table. - * - * @return index of the key. - */ - int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); - - /* - * @brief Get the index of the key from id_to_index_ map. 
- */ - inline int64_t GetIndexFromId(int64_t key) const { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - - void SyncIndex(); - /* - * @brief Get complete Dims before - */ - DDim GetCompleteDims() const { - std::vector dims = vectorize(value_->dims()); - dims[0] = height_; - return make_ddim(dims); - } - - private: - // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. - // SelectedRows are simply concated when adding together. Until a - // SelectedRows add a Tensor, will the duplicate rows be handled. - Vector rows_; - std::unordered_map - id_to_index_; // should not be used when rows_ has duplicate member - std::unique_ptr value_{nullptr}; - int64_t height_; // height indicates the underline tensor's height - std::unique_ptr rwlock_{nullptr}; -}; - /* * Serialize/Desiralize SelectedRows to std::ostream * You can pass ofstream or ostringstream to serilize to file * or to a in memory string. GPU tensor will be copied to CPU. */ -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx); -void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, +void DeserializeFromStream(std::istream& is, pten::SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); -void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); +void SerializeToStream(std::ostream& os, + const pten::SelectedRows& selected_rows); -void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows); +void DeserializeFromStream(std::istream& os, pten::SelectedRows* selected_rows); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index 7a9f86041d996eed67d836e52d89d5e57cc740c3..9a14f4395d9a196af67598714e3679c9d11d2289 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -24,7 +24,7 @@ class SelectedRowsTester : public ::testing::Test { std::vector rows{0, 4, 7}; int64_t height = 10; int64_t row_numel = 100; - selected_rows_.reset(new SelectedRows(rows, height)); + selected_rows_.reset(new pten::SelectedRows(rows, height)); Tensor* value = selected_rows_->mutable_value(); auto* data = value->mutable_data( @@ -36,7 +36,7 @@ class SelectedRowsTester : public ::testing::Test { protected: platform::CPUPlace place_; - std::unique_ptr selected_rows_{nullptr}; + std::unique_ptr selected_rows_{nullptr}; }; TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } @@ -50,7 +50,7 @@ TEST_F(SelectedRowsTester, complete_dims) { } TEST_F(SelectedRowsTester, SerializeAndDeseralize) { - SelectedRows dst_tensor; + pten::SelectedRows dst_tensor; platform::CPUDeviceContext cpu_ctx(place_); std::ostringstream oss; @@ -71,7 +71,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { TEST(SelectedRows, SparseTable) { platform::CPUPlace cpu; - SelectedRows table; + pten::SelectedRows table; int64_t table_size = 100; int64_t embedding_width = 8; @@ -124,7 +124,7 @@ TEST(SelectedRows, SparseTable) { } } -void f1(SelectedRows* table, int table_size) { +void f1(pten::SelectedRows* table, int table_size) { for (int i = 1000000; i > 0; --i) { auto id = i % table_size; int64_t index1 = table->AutoGrownIndex(id, true); @@ -135,7 +135,7 @@ void 
f1(SelectedRows* table, int table_size) { } } -void f2(SelectedRows* table, int table_size) { +void f2(pten::SelectedRows* table, int table_size) { for (int i = 0; i < 1000000; ++i) { auto id = i % table_size; int64_t index1 = table->AutoGrownIndex(id, true); @@ -146,7 +146,7 @@ void f2(SelectedRows* table, int table_size) { } } -void f3(SelectedRows* table, int table_size) { +void f3(pten::SelectedRows* table, int table_size) { clock_t t1 = clock(); for (int i = 100000; i > 0; --i) { auto id1 = table->AutoGrownIndex(i % table_size, true); @@ -157,7 +157,7 @@ void f3(SelectedRows* table, int table_size) { std::cout << "f3 run time:" << t2 - t1 << std::endl; } -void f4(SelectedRows* table, int table_size) { +void f4(pten::SelectedRows* table, int table_size) { clock_t t1 = clock(); for (int i = 0; i < 100000; ++i) { auto id1 = table->AutoGrownIndex(i % table_size, true); @@ -170,7 +170,7 @@ void f4(SelectedRows* table, int table_size) { TEST(SelectedRows, MultiThreadAutoIndex) { platform::CPUPlace cpu; - SelectedRows table; + pten::SelectedRows table; int64_t table_size = 100000; int64_t embedding_width = 8; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 86bf2d8ac413e388ebba81ccf5e08edd891224e5..fe376a5669c984e439fcb8b93de25b96462d21de 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -21,8 +21,8 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/variant.h" +#include "paddle/pten/core/type_defs.h" #include "paddle/utils/small_vector.h" namespace paddle { @@ -39,14 +39,6 @@ class InferNoNeedBufferVarsFN; using VariableNameMap = std::map>; using VariableValueMap = std::map>; -// The order should be as same as framework.proto -using Attribute = boost::variant< - boost::blank, int, float, std::string, std::vector, std::vector, - std::vector, bool, std::vector, BlockDesc*, int64_t, - std::vector, std::vector, std::vector>; - -using AttributeMap = std::unordered_map; - #ifdef PADDLE_WITH_ASCEND_CL using NPUAttribute = boost::variant, diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 5747df57c456854674515c1f653e4958fc9b57b4..dd1e329ac03231300cd63bd02f828c680203de6a 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -57,7 +57,7 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; case proto::VarType::SELECTED_ROWS: - visitor(var.Get()); + visitor(var.Get()); return; case proto::VarType::READER: visitor(var.Get()); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 008b6829f9fe374600f837a8f51ee82130ab1ac5..ac55abaad8d0a77d1b4decad733e32f51a994bc4 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -49,6 +49,7 @@ namespace pten { class DenseTensor; +class SelectedRows; } // namespace pten // Users should add forward declarations here @@ -76,7 +77,6 @@ class LoDRankTable; class ScopeBase; class ReaderHolder; class Scope; -class SelectedRows; } // namespace framework namespace operators { @@ -166,7 +166,7 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. 
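// A hedged sketch of what this registration guarantees (VarTypeTrait and the
// REG_PROTO_VAR_TYPE_TRAIT binding appear just below):
//
//   static_assert(VarTypeTrait<pten::SelectedRows>::kId ==
//                     proto::VarType::SELECTED_ROWS,
//                 "pten::SelectedRows must keep the SELECTED_ROWS proto id");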
 using VarTypeRegistry = detail::VarTypeRegistryImpl<
-    Tensor, SelectedRows, std::vector<Scope *>, LoDRankTable, Strings,
+    Tensor, pten::SelectedRows, std::vector<Scope *>, LoDRankTable, Strings,
     LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *,
     operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
@@ -206,7 +206,7 @@ struct VarTypeTrait {
 // Users should set some of variable type ids to be what is defined in
 // framework.proto below
 REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR);
-REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS);
+REG_PROTO_VAR_TYPE_TRAIT(pten::SelectedRows, proto::VarType::SELECTED_ROWS);
 REG_PROTO_VAR_TYPE_TRAIT(std::vector<Scope *>, proto::VarType::STEP_SCOPES);
 REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
 REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
index 812a34112a465a57687c0420edf1cef8ee760abc..bc418363bf737df2ed558a320c1b39582439815d 100644
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -92,7 +92,7 @@ bool CheckVarId(int proto_id) {
 TEST(var_type_traits, check_proto_type_id) {
   ASSERT_TRUE(CheckVarId<LoDTensor>(proto::VarType::LOD_TENSOR));
-  ASSERT_TRUE(CheckVarId<SelectedRows>(proto::VarType::SELECTED_ROWS));
+  ASSERT_TRUE(CheckVarId<pten::SelectedRows>(proto::VarType::SELECTED_ROWS));
   ASSERT_TRUE(CheckVarId<std::vector<Scope *>>(proto::VarType::STEP_SCOPES));
   ASSERT_TRUE(CheckVarId<LoDRankTable>(proto::VarType::LOD_RANK_TABLE));
   ASSERT_TRUE(CheckVarId<LoDTensorArray>(proto::VarType::LOD_TENSOR_ARRAY));
diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h
index 188b00d818de3df8ee88790dcf681d287f85833b..52bf3a12a043f63f6d370f528ac55368aed63527 100644
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -72,7 +72,7 @@ class Variable {
  private:
   // This method hides type T, so it doesn't appear as a template parameter of
   // Variable.
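// Keeping T out of the public signature matters: callers see one type-erased
// counter regardless of whether the holder is a DenseTensor or the value
// tensor inside a pten::SelectedRows, as the branches in the definition
// below show.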
- pten::TensorInplaceVersion* InplaceVersionCounter(); + pten::DenseTensor::InplaceVersion* InplaceVersionCounter(); public: void SetInplaceVersionToZero(); @@ -114,8 +114,8 @@ class Variable { std::shared_ptr holder_; }; -inline pten::TensorInplaceVersion* Variable::InplaceVersionCounter() { - pten::TensorInplaceVersion* version_counter_ptr(nullptr); +inline pten::DenseTensor::InplaceVersion* Variable::InplaceVersionCounter() { + pten::DenseTensor::InplaceVersion* version_counter_ptr(nullptr); if (IsType()) { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); @@ -123,8 +123,8 @@ inline pten::TensorInplaceVersion* Variable::InplaceVersionCounter() { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); - } else if (IsType()) { - version_counter_ptr = &GetMutable() + } else if (IsType()) { + version_counter_ptr = &GetMutable() ->mutable_value() ->InplaceVersionCounter(); } else { diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 34ab07def54c18f8377636a7990052712f215ab8..3c71987303bd40ac76f16221b9bbef134df29196 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -31,7 +31,7 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); + var->GetMutable(); } else if (var_type == proto::VarType::FEED_MINIBATCH) { var->GetMutable(); } else if (var_type == proto::VarType::FETCH_LIST) { @@ -70,9 +70,9 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) { auto &src_tensor = src_var.Get(); tmp_grad_tensor->set_lod(src_tensor.lod()); framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); - } else if (src_var.IsType()) { - auto &src_slr = src_var.Get(); - auto *tmp_grad_slr = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); tmp_grad_slr->set_rows(src_slr.rows()); tmp_grad_slr->set_height(src_slr.height()); auto &src_t = src_slr.value(); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index d1d6a0f5adf581498ec52cf21ea7c1f762a3b446..0f105ec9a308232ad4c006c208c04981839459ed 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -39,8 +39,8 @@ static const platform::Place &GetVarPlace(const framework::Variable &src) { if (src.IsType()) { return src.Get().place(); #if NCCL_VERSION_CODE >= 2212 - } else if (src.IsType()) { - return src.Get().value().place(); + } else if (src.IsType()) { + return src.Get().value().place(); #endif } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -70,8 +70,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, } #if NCCL_VERSION_CODE >= 2212 -static void AllReduce(const framework::SelectedRows &src, - framework::SelectedRows *dst, +static void AllReduce(const pten::SelectedRows &src, pten::SelectedRows *dst, const ParallelStrategy &strategy, const gpuStream_t stream, const platform::NCCLComm *comm) { @@ -191,19 +190,18 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, AllReduce(src.Get(), dst->GetMutable(), stream, comm); #if NCCL_VERSION_CODE >= 2212 - } else if (src.IsType()) { + } else if (src.IsType()) { if (&src != dst) { - if (!dst->IsType()) { + if (!dst->IsType()) { dst->Clear(); } - AllReduce(src.Get(), - dst->GetMutable(), strategy, 
stream, - comm); + AllReduce(src.Get(), + dst->GetMutable(), strategy, stream, comm); } else { // SelectedRows cannot be allreduce in-place framework::Variable tmp_dst; - AllReduce(src.Get(), - tmp_dst.GetMutable(), strategy, stream, + AllReduce(src.Get(), + tmp_dst.GetMutable(), strategy, stream, comm); // stream must synchronize to ensure accuracy of the move operation platform::GpuStreamSync(stream); diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index c50018f8236037d344448b18321827b3004c86ed..e1931a3b0f2489798ec935f03d83502b2cdb239f 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -365,12 +365,12 @@ class TracedGradOp { var_wrapper->MutableVar()->CurrentInplaceVersion()) { return var_wrapper; } else if (var_wrapper->MutableVar()->IsType() || - var_wrapper->MutableVar()->IsType()) { + var_wrapper->MutableVar()->IsType()) { auto* tensor = var_wrapper->MutableVar()->IsType() ? var_wrapper->MutableVar()->GetMutable() : var_wrapper->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); if (!tensor->IsInitialized()) { return var_wrapper; diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 1eaf0c6538043ff274b8a30f8618373deea771b0..44315e267ee78d3bccbb808529269063f3a206c5 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -72,18 +72,18 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, } AllReduce(src.Get(), dst->GetMutable()); - } else if (src.IsType()) { + } else if (src.IsType()) { if (&src != dst) { - if (!dst->IsType()) { + if (!dst->IsType()) { dst->Clear(); } - AllReduce(src.Get(), - dst->GetMutable()); + AllReduce(src.Get(), + dst->GetMutable()); } else { // SelectedRows cannot be allreduce in-place framework::Variable tmp_dst; - AllReduce(src.Get(), - tmp_dst.GetMutable()); + AllReduce(src.Get(), + tmp_dst.GetMutable()); *dst = std::move(tmp_dst); } } else { @@ -120,8 +120,8 @@ void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, break; \ } -void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, - framework::SelectedRows *dst) { +void GLOOParallelContext::AllReduce(const pten::SelectedRows &src, + pten::SelectedRows *dst) { // auto ; // int local_rank = strategy_.local_rank_; int nranks = strategy_.nranks_; diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index f13bb859eee93691510df27bc3449330344773b3..d63d48eac7e02b4dadf674a314570875b436bdd4 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -59,8 +59,7 @@ class GLOOParallelContext : public ParallelContext { private: void AllReduce(const framework::Tensor& src, framework::Tensor* dst); - void AllReduce(const framework::SelectedRows& src, - framework::SelectedRows* dst); + void AllReduce(const pten::SelectedRows& src, pten::SelectedRows* dst); private: std::unique_ptr device_; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 092872247cca56ae90bb6bcf8870de79c2535c11..9ae8b75075a1a724c31a8e03e071912ba140715a 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -55,12 +55,12 @@ static void MoveOrCopyVar(framework::Variable* dst, framework::Variable* src, auto* dst_tensor = dst->GetMutable(); 
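// TensorCopy moves only the tensor data; LoD metadata is not carried over by
// the copy itself, hence the explicit set_lod() on the next line.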
framework::TensorCopy(src_tensor, src_tensor.place(), dst_tensor); dst_tensor->set_lod(src_tensor.lod()); - } else if (src->IsType()) { - auto& src_selected_rows = src->Get(); - if (!dst->IsType()) { + } else if (src->IsType()) { + auto& src_selected_rows = src->Get(); + if (!dst->IsType()) { dst->Clear(); } - auto* dst_selected_rows = dst->GetMutable(); + auto* dst_selected_rows = dst->GetMutable(); framework::TensorCopy(src_selected_rows.value(), src_selected_rows.value().place(), dst_selected_rows->mutable_value()); @@ -243,6 +243,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { "should be equal, Otherwise, the calculation results " "will be incorrect.")); +#ifdef PADDLE_WITH_XPU + // if src and dst are in different place, copy dst to src's place + if (dst_tensor->place() != place) { + paddle::framework::TensorCopySync(*dst_tensor, place, dst_tensor); + } +#endif + #define PADDLE_TENSOR_ADD(cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ TensorAddFunctor func( \ @@ -332,7 +339,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { void SelectedRowsAddToTensor(const framework::Variable& src, framework::Variable* dst) { auto* dst_tensor = dst->GetMutable(); - auto& src_selected_rows = src.Get(); + auto& src_selected_rows = src.Get(); auto place = dst_tensor->place(); auto data_type = src_selected_rows.value().type(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -371,7 +378,7 @@ static void SelectedRowsAddTensor( const framework::Variable& src_tensor_var, framework::Variable* dst_tensor_var) { const auto& src_selected_rows = - src_selected_rows_var.Get(); + src_selected_rows_var.Get(); const auto& src_tensor = src_tensor_var.Get(); const auto& place = src_tensor.place(); auto data_type = src_tensor.type(); @@ -414,18 +421,18 @@ static void SelectedRowsAddTensor( // to one then add it to a empty selected rows, the after is correct std::shared_ptr SelectedRowsMerge( const framework::Variable& src1, const framework::Variable& src2) { - auto& src_selected_rows1 = src1.Get(); - auto& src_selected_rows2 = src2.Get(); + auto& src_selected_rows1 = src1.Get(); + auto& src_selected_rows2 = src2.Get(); auto place = src_selected_rows1.value().place(); auto data_type = src_selected_rows1.value().type(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - std::vector src_selected_rows; + std::vector src_selected_rows; src_selected_rows.emplace_back(&src_selected_rows1); src_selected_rows.emplace_back(&src_selected_rows2); auto dst_var = std::make_shared("Temp"); auto* dst_selected_rows = - dst_var->MutableVar()->GetMutable(); + dst_var->MutableVar()->GetMutable(); #define PADDLE_SELECTED_ROWS_ADD(dev_ctx_type, cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ @@ -463,7 +470,7 @@ void VariableWrapperAdd(std::shared_ptr var, if (dst->IsType()) { if (src.IsType()) { TensorAdd(src, dst); - } else if (src.IsType()) { + } else if (src.IsType()) { SelectedRowsAddToTensor(src, dst); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -481,7 +488,7 @@ void VariableWrapperAdd(std::shared_ptr var, SelectedRowsAddToTensor(*dst, src_mutable); *dst = std::move(*(var->MutableVar())); } - } else if (src.IsType()) { + } else if (src.IsType()) { auto temp = SelectedRowsMerge(src, *dst); *dst = std::move(*(temp->MutableVar())); } else { @@ -497,8 +504,8 @@ static platform::Place GetPlaceOfVar( platform::Place place; if (var->Var().IsType()) { place = 
var->Var().Get().place(); - } else if (var->Var().IsType()) { - place = var->Var().Get().place(); + } else if (var->Var().IsType()) { + place = var->Var().Get().place(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "only support LoDTensor and SelectedRows in dygraph")); @@ -530,14 +537,14 @@ void GradientAccumulator::AccumulateGrad() { if (dst->IsType()) { if (src->IsType()) { TensorAdd(*src, dst); - } else if (src->IsType()) { + } else if (src->IsType()) { SelectedRowsAddToTensor(*src, dst); } - } else if (dst->IsType()) { + } else if (dst->IsType()) { if (src->IsType()) { SelectedRowsAddToTensor(*dst, src); *dst = std::move(*src); - } else if (src->IsType()) { + } else if (src->IsType()) { auto temp = SelectedRowsMerge(*src, *dst); *dst = std::move(*(temp->MutableVar())); } @@ -657,7 +664,7 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var, // so synchronous VariableWrapper with Variable. if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (dst_var->Var().IsType()) { + } else if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } @@ -701,7 +708,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { - if (!var_info.var->Var().IsType()) { + if (!var_info.var->Var().IsType()) { continue; } @@ -744,7 +751,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } PADDLE_ENFORCE_EQ( var_info.var->Var().IsType() || - var_info.var->Var().IsType(), + var_info.var->Var().IsType(), true, platform::errors::PermissionDenied("The type of Gradient " "var must be LoDTensor " "or SelectedRows")); @@ -789,7 +796,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (dst_var->Var().IsType()) { + } else if (dst_var->Var().IsType()) { dst_var->SetType(framework::proto::VarType::SELECTED_ROWS); } } diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 6411dce4405c11795418fb8334e26b32079e7596..8896e5d0f406447f524bcdd9215db30d6d2ecc28 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -31,7 +31,7 @@ class GradientAccumulator { if (var && var->Var().IsInitialized()) { if (var->Var().IsType()) { var->SetType(framework::proto::VarType::LOD_TENSOR); - } else if (var->Var().IsType()) { + } else if (var->Var().IsType()) { var->SetType(framework::proto::VarType::SELECTED_ROWS); } else { PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 71f7fb7387effe68ae63d5a3c5236e9a9a108d2f..a39e58bba90110c122a666f97a4cf0911284e4a8 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -196,8 +196,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { auto* out_lod_tensor = out_var->GetMutable(); out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - auto& in_sele_rows = in_var->Get(); - auto out_sele_rows = out_var->GetMutable(); + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); out_sele_rows->set_rows(in_sele_rows.rows()); 
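// A SelectedRows output shares more than the value tensor's shape: the rows
// set just above and the height set just below are part of its dims as well.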
out_sele_rows->set_height(in_sele_rows.height()); @@ -365,8 +365,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { "Input variable should not be null")); if (var->IsType()) { return var->Get().dims(); - } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " @@ -382,8 +382,8 @@ class DygraphInferShapeContext : public framework::InferShapeContext { void SetDim(framework::Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); - } else if (var->IsType()) { - var->GetMutable()->set_height(dim[0]); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Variable type_id %s, expect LoDTensor/SelectedRows.")); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index f47b024973ba7899ebf5040a09702f5bab83fe32..65720c8a3cf6578f0c35a7b79be78fde14c1a9cf 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -105,9 +105,9 @@ static std::string DebugString( ss << "NOT_INITED"; } ss << ">"; - } else if (var.IsType()) { + } else if (var.IsType()) { ss << "SelectedRows<"; - auto& selected_rows = var.Get(); + auto& selected_rows = var.Get(); auto& tensor = selected_rows.value(); auto& rows = selected_rows.rows(); if (tensor.IsInitialized()) { @@ -188,9 +188,8 @@ size_t VarBase::GradOpNum() const { void VarBase::ClearGradient(bool set_to_zero) { VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { - if (grad_var_->Var().IsType()) { - auto* grad_t = - grad_var_->MutableVar()->GetMutable(); + if (grad_var_->Var().IsType()) { + auto* grad_t = grad_var_->MutableVar()->GetMutable(); if (grad_t->mutable_value()->IsInitialized()) { #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place()); @@ -248,7 +247,7 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, const bool blocking) const { PADDLE_ENFORCE_EQ( Var().IsInitialized() && (Var().IsType() || - Var().IsType()), + Var().IsType()), true, platform::errors::InvalidArgument( "Variable is not initialized or Variable's type is not " "LoDTensor or SelectedRows when getting numpy tensor")); @@ -277,12 +276,12 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, << dst_place; return new_var; } else { - auto& src_selected_rows = Var().Get(); + auto& src_selected_rows = Var().Get(); auto new_var = std::make_shared( false, "Itmp" + std::to_string(copied_counter_++)); new_var->SetType(framework::proto::VarType::SELECTED_ROWS); auto* dst_selected_rows = - new_var->MutableVar()->GetMutable(); + new_var->MutableVar()->GetMutable(); framework::TensorCopy(src_selected_rows.value(), dst_place, dst_selected_rows->mutable_value()); @@ -346,10 +345,9 @@ void VarBase::CopyFrom(const VarBase& src, const bool blocking) { dst_tensor->Resize(src_tensor.dims()); } framework::TensorCopy(src_tensor, place, dst_tensor); - } else if (src.Var().IsType()) { - auto& src_selected_rows = src.Var().Get(); - auto* dst_selected_rows = - MutableVar()->GetMutable(); + } else if (src.Var().IsType()) { + auto& src_selected_rows = src.Var().Get(); + auto* dst_selected_rows = MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); diff --git 
a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index fe60f05e1da431dc7ed7b45acebb8cffecc12941..d9a21c9247b9363b0f1cbcdf6c8d62bb6242c183 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" DECLARE_bool(check_nan_inf); DECLARE_bool(run_pten_kernel); @@ -47,8 +48,8 @@ const std::shared_ptr& GetVariableWrapper( const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { if (var.IsType()) { return &(var.Get()); - } else if (var.IsType()) { - return &(var.Get().value()); + } else if (var.IsType()) { + return &(var.Get().value()); } else { return nullptr; } @@ -369,6 +370,10 @@ static void BuildDygraphPtenKernelContext( size_t end_idx = start_idx + outs_vector.size(); for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (outs_vector[offset] == nullptr) { + kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr}); + continue; + } auto* var = outs_vector[offset]->MutableVar(); framework::Tensor* tensor_out = nullptr; if (var->template IsType()) { @@ -501,12 +506,21 @@ static void PreparedOpRunImpl( // TODO(zjl): remove scope in dygraph framework::Scope scope; - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); - op.Info().infer_shape_(&infer_shape_ctx); + { + platform::RecordEvent record_event(op.Type() + " infer_shape", + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); + op.Info().infer_shape_(&infer_shape_ctx); + } + + { + platform::RecordEvent record_event(op.Type() + " compute", + platform::EventRole::kInnerOp); - func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, - attrs, default_attrs)); + func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, + attrs, default_attrs)); + } if (FLAGS_check_nan_inf) { framework::details::CheckOpHasNanOrInfInDygraph( @@ -547,18 +561,27 @@ static void PreparedOpRunPtImpl( const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); - op.Info().infer_shape_(&infer_shape_ctx); + { + platform::RecordEvent record_event(op.Type() + " infer_shape", + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); + op.Info().infer_shape_(&infer_shape_ctx); + } - PreparePtenData(pt_kernel, pt_kernel_signature, ins); + { + platform::RecordEvent record_event(op.Type() + " compute", + platform::EventRole::kInnerOp); - pten::KernelContext pt_kernel_context; - BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, - outs, attrs, default_attrs, dev_ctx, - &pt_kernel_context); + PreparePtenData(pt_kernel, pt_kernel_signature, ins); - pt_kernel(&pt_kernel_context); + pten::KernelContext pt_kernel_context; + BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, + outs, attrs, default_attrs, dev_ctx, + &pt_kernel_context); + + pt_kernel(&pt_kernel_context); + } if (FLAGS_benchmark) { dev_ctx->Wait(); diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 
ad518eb96062d29a1c8f8f9f25a5c49c48c27b04..54e27b2bd8c313eaa3df016b48ee17957fd833f2 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -36,8 +36,7 @@ namespace imperative { void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = is_sparse_ - ? sparse_contents_->GetMutable() - ->mutable_value() + ? sparse_contents_->GetMutable()->mutable_value() : dense_contents_.GetMutable(); if (platform::is_gpu_place(tensor->place())) { @@ -775,7 +774,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { auto var_base = vars_[var_index]->GradVarBase(); // need to check tensor type PADDLE_ENFORCE_EQ( - var_base->Var().IsType(), true, + var_base->Var().IsType(), true, platform::errors::PreconditionNotMet( "The sparse parameter[%d][%s] must have a selectedrows gradient. " "Before forward pass, the parameter type is inferred to be " @@ -995,8 +994,8 @@ bool Reducer::HasGrad(size_t var_index) { if (var.Get().IsInitialized()) { return true; } - } else if (var.IsType()) { - if (var.Get().value().IsInitialized()) { + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { return true; } } else { diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 0a7df9953ad45a3d1f93a09af88d34046b0c9776..25ffab470646b3e69e02e049967f540adb776a08 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -15,6 +15,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/gradient_accumulator.h" @@ -29,8 +30,8 @@ namespace imperative { void TensorAdd(const framework::Variable& src, framework::Variable* dst); -template -int TensorddTest(Place place, T t1, T t2) { +template +int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) { framework::Variable var1; framework::Variable var2; std::vector src_data(10, t1); @@ -46,18 +47,25 @@ int TensorddTest(Place place, T t1, T t2) { auto* dst = var2.GetMutable(); src->Resize(framework::make_ddim(dims)); dst->Resize(framework::make_ddim(dims)); - auto* src_mutable = src->mutable_data(place); - auto* dst_mutable = dst->mutable_data(place); - if (!std::is_same::value) { - paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + auto* src_mutable = src->mutable_data(place1); + auto* dst_mutable = dst->mutable_data(place2); + + if (!std::is_same::value) { + paddle::memory::Copy(place1, src_mutable, src_place, src_data.data(), sizeof(T) * src_data.size()); - paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), - sizeof(T) * dst_data.size()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else { - paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + paddle::memory::Copy(place1, src_mutable, src_place, src_data.data(), sizeof(T) * src_data.size(), 0); - paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), +#endif + } + + if (!std::is_same::value) { + paddle::memory::Copy(place2, dst_mutable, src_place, dst_data.data(), + sizeof(T) * dst_data.size()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else { + paddle::memory::Copy(place2, dst_mutable, src_place, dst_data.data(), sizeof(T) * dst_data.size(), 0); #endif } @@ -80,25 +88,64 @@ TEST(test_add_functor, add_functor) { platform::CPUPlace cpu_place; int cpu_res = 1; - cpu_res = 
TensorddTest(cpu_place, 1.0, 0.0); + + // float32 + cpu_res = TensorddTest(cpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(cpu_res, 0); - cpu_res = TensorddTest(cpu_place, static_cast(1.0), - static_cast(2.0)); + // float16 + cpu_res = + TensorddTest(cpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(cpu_res, 0); - cpu_res = TensorddTest(cpu_place, static_cast(1.0), - static_cast(2.0)); + +#ifndef PADDLE_WITH_XPU + // does not support double when compiled using xpu + cpu_res = TensorddTest(cpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(cpu_res, 0); +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int gpu_res = 1; - gpu_res = TensorddTest(gpu_place, 1.0, 0.0); + gpu_res = TensorddTest(gpu_place, gpu_place, 1.0, 0.0); EXPECT_EQ(gpu_res, 0); - gpu_res = TensorddTest(gpu_place, static_cast(1.0), + gpu_res = TensorddTest(gpu_place, gpu_place, static_cast(1.0), static_cast(2.0)); EXPECT_EQ(gpu_res, 0); - gpu_res = TensorddTest(gpu_place, static_cast(1.0), - static_cast(2.0)); + gpu_res = + TensorddTest(gpu_place, gpu_place, static_cast(1.0), + static_cast(2.0)); EXPECT_EQ(gpu_res, 0); #endif + +#ifdef PADDLE_WITH_XPU + platform::XPUPlace xpu_place(0); + int xpu_res = 1; + // normal + xpu_res = TensorddTest(xpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = + TensorddTest(xpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + // different places + xpu_res = TensorddTest(cpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = TensorddTest(xpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = + TensorddTest(cpu_place, xpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = + TensorddTest(xpu_place, cpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); +#endif } TEST(test_add_functor, execption) { @@ -106,10 +153,11 @@ TEST(test_add_functor, execption) { platform::CUDAPlace cuda_place(0); platform::CPUPlace cpu_place; - ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0)); + ASSERT_ANY_THROW(TensorddTest(cpu_place, cpu_place, 1, 0)); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0)); - ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, + ASSERT_ANY_THROW( + TensorddTest(cuda_pinned_place, cuda_pinned_place, 1.0, 0.0)); + ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, cuda_pinned_place, static_cast(1.0), static_cast(2.0))); #endif @@ -124,8 +172,8 @@ static void CopyVar(const framework::Variable& var, auto* dst_tensor = dst.GetMutable(); framework::TensorCopySync(src_tensor, src_tensor.place(), dst_tensor); } else { - const auto& src_selected_rows = var.Get(); - auto* dst_selected_rows = dst.GetMutable(); + const auto& src_selected_rows = var.Get(); + auto* dst_selected_rows = dst.GetMutable(); dst_selected_rows->set_rows(src_selected_rows.rows()); dst_selected_rows->set_height(src_selected_rows.height()); framework::TensorCopySync(src_selected_rows.value(), @@ -148,8 +196,8 @@ static bool IsEqualVar(const framework::Variable& var1, framework::TensorCopySync(var2.Get(), platform::CPUPlace(), &t2); } else { - auto& s1 = var1.Get(); - auto& s2 = var2.Get(); + auto& s1 = var1.Get(); + auto& s2 = var2.Get(); if (s1.height() != s2.height()) { return false; @@ -166,9 +214,9 @@ static bool IsEqualVar(const framework::Variable& var1, return 
false; } - framework::TensorCopySync(var1.Get().value(), + framework::TensorCopySync(var1.Get().value(), platform::CPUPlace(), &t1); - framework::TensorCopySync(var2.Get().value(), + framework::TensorCopySync(var2.Get().value(), platform::CPUPlace(), &t2); } @@ -211,7 +259,7 @@ static framework::Variable RandomSelectedRows(framework::DDim dims, dims[0] = row_number; framework::Variable ret; - auto* sr = ret.GetMutable(); + auto* sr = ret.GetMutable(); auto tensor_var = RandomTensor(dims, place, low, high); sr->mutable_value()->ShareDataWith( tensor_var.template Get()); diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index 064f47f54979a135fb83f9636ebc6f5105e7c39d..c54ed34bb8108afe76459445b3ce695d73ccd0ca 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -237,7 +237,7 @@ TEST(test_layer, test_debug_string) { std::shared_ptr selected_rows( new imperative::VarBase(false, "selected_rows")); auto tensor_sr = selected_rows->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); std::string res_ui_sr = test_func(selected_rows); ASSERT_TRUE(res_ui_sr.find("NOT_INITED") != std::string::npos); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 5e269d74044d24adc7baea8875ecd9eb2d6772c1..b4ff3cff38217a57c0b1091c3e003043ca4c9673 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -101,7 +101,7 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var); TEST(test_prepare_op, test_get_tensor_from_var) { std::shared_ptr vout_error( new imperative::VarBase(false, "vout_error")); - vout_error->MutableVar()->GetMutable(); + vout_error->MutableVar()->GetMutable(); auto* ts = GetTensorFromVar(*vout_error->MutableVar()); ASSERT_TRUE(ts != nullptr); } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index f4e535de108a6a69dddd19ad4705c1b08e749e47..e845ce104534cd57ec232957cbbcce88addb60b9 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -32,6 +32,8 @@ namespace imperative { thread_local bool Tracer::has_grad_ = true; +thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; + static std::shared_ptr g_current_tracer(nullptr); const std::shared_ptr& GetCurrentTracer() { return g_current_tracer; } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 93f68f2054b9a85b65639ae6ddfdc1f7fc8911f8..bd8521dabde1f43371722bd7c8b0dc9c93787cc4 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -126,7 +126,7 @@ class Tracer { platform::Place expected_place_; GarbageCollectorMap gcs_; static thread_local bool has_grad_; - AmpLevel amp_level_{AmpLevel::O0}; + static thread_local AmpLevel amp_level_; }; // To access static variable current_tracer diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index 74fd152e72a5752af9becf729b3fde63fa6d9d35..a0258c7a8806fb4562102f7e681d292227bee5ae 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -13,47 +13,4 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once - -#include -#include -#include -#include - -namespace paddle { -namespace imperative { - -class VariableWrapper; -class SavedVariableWrapperList; -class VarBase; -class OpBase; -class GradOpNode; -class Tracer; - -using WeakNameVarBaseMap = - std::map>>; - -namespace details { -template -struct NameVarMapTrait {}; - -template <> -struct NameVarMapTrait { - using Type = std::map>>; -}; - -template <> -struct NameVarMapTrait { - using Type = std::map; -}; -} // namespace details - -template -using NameVarMap = typename details::NameVarMapTrait::Type; - -using NameVarBaseMap = NameVarMap; -using NameVariableWrapperMap = NameVarMap; - -using VariableWrapperList = std::vector>; - -} // namespace imperative -} // namespace paddle +#include "paddle/pten/core/type_defs.h" diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index c257191a546e439cedee0d2075549a45a3467423..bd96cd3f1aa1781b623c665d5263eaee0a3da244 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -104,8 +104,8 @@ class VariableWrapper { const framework::Tensor* tensor = nullptr; if (var_.IsType()) { tensor = &(var_.Get()); - } else if (var_.IsType()) { - tensor = &(var_.Get().value()); + } else if (var_.IsType()) { + tensor = &(var_.Get().value()); } else { PADDLE_THROW(platform::errors::PermissionDenied( "Only support LoDTensor and SelectedRows for gradient var")); @@ -153,7 +153,7 @@ class VariableWrapper { if (type_ == framework::proto::VarType::LOD_TENSOR) { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { - tensor = &(var_.Get().value()); + tensor = &(var_.Get().value()); } else if (type_ == framework::proto::VarType::VOCAB) { const framework::Vocab* data = nullptr; data = &(var_.Get()); @@ -193,7 +193,7 @@ class VariableWrapper { if (type_ == framework::proto::VarType::LOD_TENSOR) { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { - tensor = &(var_.Get().value()); + tensor = &(var_.Get().value()); } else { VLOG(6) << "Variable " << name_ << " is not initialized"; return place; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6ff25597125c5f0b13ee603bc17329a351074a8b..d731bfe139bac58050fdf79b420744551bfd17e8 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -36,6 +36,7 @@ endif() # fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) +set(utils_modules stringpiece pretty_log string_helper) add_subdirectory(api) @@ -46,9 +47,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
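# The utils targets split out above (stringpiece, pretty_log, string_helper)
# are bundled into paddle_inference below alongside fluid_modules and
# pten_modules.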
if(WIN32 AND WITH_GPU) - cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API}) + cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) else() - create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API}) + create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) if(WITH_IPU) target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils) endif() diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 175bc55dcff17e46aa47e1d2d187e3a8c8c4b43d..febfdec0b5cf500c30d44feccf4bed7e029feef4 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -282,6 +282,10 @@ struct Argument { DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int); DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool); + // npu related + DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool); + DECL_ARGUMENT_FIELD(npu_device_id, NPUDeviceId, int); + private: std::unordered_set valid_fields_; }; diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 06a353d5622a7093760c8680bcb8c1e245496ae8..daa18d8c78bf875ebcc6571bf955a7f634948e4f 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -22,16 +22,50 @@ namespace paddle { namespace inference { namespace analysis { -void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { - PADDLE_ENFORCE_EQ( - argument->scope_valid(), true, - platform::errors::PreconditionNotMet("The scope field should be valid")); - PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true, +#ifdef PADDLE_WITH_ASCEND_CL +void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { + if (!argument->use_npu()) return; + + auto &graph = argument->main_graph(); + std::vector repetitive_params; + + if (graph.Has(framework::ir::kRepetitiveParamAttr)) + repetitive_params = graph.Get>( + framework::ir::kRepetitiveParamAttr); + + LOG(INFO) << "Sync params from CPU to NPU"; + + PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), true, platform::errors::PreconditionNotMet( - "The use_gpu field should be valid")); + "The npu_device_id field should be valid")); + platform::Place place = platform::NPUPlace(argument->npu_device_id()); + auto *scope = argument->scope_ptr(); + std::vector all_vars = scope->LocalVarNames(); - platform::Place place; + for (auto &var_name : all_vars) { + auto *var = scope->FindLocalVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( + "The var should not be nullptr")); + + if (var->IsType() || + var->IsType()) { + auto *t = var->GetMutable(); + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + temp_tensor.mutable_data(cpu_place); + + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } + } +} + +#else + +void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. 
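// CopyParamsToGpu mirrors CopyParamsToNpu above: it walks the scope's local
// variables and copies each parameter tensor from the CPU to the device.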
if (!argument->use_gpu()) return; @@ -47,8 +81,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true, platform::errors::PreconditionNotMet( "The gpu_device_id field should be valid")); - place = platform::CUDAPlace(argument->gpu_device_id()); - + platform::Place place = platform::CUDAPlace(argument->gpu_device_id()); auto *scope = argument->scope_ptr(); std::vector all_vars = scope->LocalVarNames(); @@ -100,6 +133,22 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { } } +#endif + +void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { + PADDLE_ENFORCE_EQ( + argument->scope_valid(), true, + platform::errors::PreconditionNotMet("The scope field should be valid")); + +#ifdef PADDLE_WITH_ASCEND_CL + if (!argument->use_npu_valid()) return; + CopyParamsToNpu(argument); +#else + if (!argument->use_gpu_valid()) return; + CopyParamsToGpu(argument); +#endif +} + std::string IrParamsSyncAmongDevicesPass::repr() const { return "ir-params-sync-among-devices-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index 61990150a30db147418c4301359428cf3c6db541..d5e98ec886e65f829a1496b1431f23aad6c4bc4c 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -33,6 +33,13 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { public: void RunImpl(Argument *argument) override; std::string repr() const override; + + private: +#ifdef PADDLE_WITH_ASCEND_CL + void CopyParamsToNpu(Argument *argument); +#else + void CopyParamsToGpu(Argument *argument); +#endif }; } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a86329a2b2b25df7cb256c47200598644af84bfe..628d974c1237862c81c9e124851004c50d07d377 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -668,6 +668,9 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetIpuBatchSize(config_.ipu_batch_size_); argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_); + argument_.SetUseNpu(config_.use_npu_); + argument_.SetNPUDeviceId(config_.npu_device_id()); + if (config_.use_mkldnn_) { LOG(INFO) << "MKLDNN is enabled"; argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index fa5997d92dd231af221265601ba337e9291b6284..bd867ba54d235973663ed61deabc81eb34b76c18 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -43,7 +43,7 @@ struct TensorArrayBatchCleaner { constexpr auto kLoDTensorId = framework::VarTypeTrait::kId; constexpr auto kSelectedRowsId = - framework::VarTypeTrait::kId; + framework::VarTypeTrait::kId; constexpr auto kFetchListId = framework::VarTypeTrait::kId; valid_types_.insert(kTensorId); diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc index f3c4059b8e6456581aad49b944997530e67ef9af..7c5eaa309ef18a839ea97fd9aabd44434c1c903d 100644 --- a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -83,6 +83,8 @@ class 
ReduceOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; + // Ensure that the output type and input type are consistent. + layer->getOutput(0)->setType(layer->getInput(0)->getType()); RreplenishLayerAndOutput(layer, op_type, {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 73fb6c0b13b70221e5b4125846bab7820353eaf5..4a65a036191038e5e4b2692c41e7b4e201135d07 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1464,30 +1464,48 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "the " << op_type << " does not have attr (keep_dim or dim or " "reduce_all)"; - std::cout << "attr " << desc.HasAttr("keep_dim") << " " - << desc.HasAttr("dim") << " " << desc.HasAttr("reduce_all"); + return false; + } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr; we can't continue the analysis. " "Developers need to check whether block_desc is passed into " "the pass."; return false; } // The batch size dimension cannot be reduced if it's not dynamic shape. + auto* x_var_desc = block->FindVar(desc.Input("X")[0]); if (!with_dynamic_shape) { if (BOOST_GET_CONST(bool, desc.GetAttr("reduce_all"))) return false; std::vector dim = BOOST_GET_CONST(std::vector, desc.GetAttr("dim")); + const auto input_shape = x_var_desc->GetShape(); for (auto x : dim) { - if (!x) return false; + if (x == 0 || (x + input_shape.size() == 0)) return false; } + } else { if (BOOST_GET_CONST(bool, desc.GetAttr("reduce_all")) && !BOOST_GET_CONST(bool, desc.GetAttr("keep_dim"))) return false; } - if (desc.HasAttr("out_dtype")) { - int out_dtype = BOOST_GET_CONST(int32_t, desc.GetAttr("out_dtype")); - if (out_dtype != -1) { - return false; - } + + auto dtype = x_var_desc->GetDataType(); +#if IS_TRT_VERSION_GE(7000) + if (dtype != framework::proto::VarType::INT32 && + dtype != framework::proto::VarType::FP32) { + VLOG(3) << "reduce op input data type must be int32 or float32"; + return false; } +#else + if (dtype != framework::proto::VarType::FP32) { + VLOG(3) + << "reduce op input data type must be float32 using TensorRT < 7.0"; + return false; + } +#endif } #if IS_TRT_VERSION_GE(7000) if (op_type == "tile") { diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 97952e4b71641e00e27592380a0fd88f2c17b1a0..023b40518edf216f76642aae1577507ee2c36486 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -34,6 +34,13 @@ if (WITH_ROCM) DEPS device_context malloc) endif() +if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + nv_test(get_base_ptr_test SRCS get_base_ptr_test.cu DEPS malloc gpu_info) + set_tests_properties(get_base_ptr_test PROPERTIES + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; + FLAGS_use_stream_safe_cuda_allocator=true;") +endif() + #if (WITH_GPU) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 939ad140415df45619018536520e3ffb9d681366..c0d1934a703b66a8ab8a1eab0c1d0680d73b9e17 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -125,10 +125,3 @@ if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS 
mmap_allocator_test.cc DEPS mmap_allocator allocator) endif(NOT WIN32) - -if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info) - set_tests_properties(base_ptr_test PROPERTIES - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; - FLAGS_use_stream_safe_cuda_allocator=true;") -endif() diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 3f04d47516377251011174b1382679ba41fdca02..878633d1a62915383aa1c5306dcc7940d06282e4 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -93,14 +93,7 @@ class Allocation : public pten::Allocation { const platform::Place& place) : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} - void* base_ptr() const { - PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", - paddle::platform::errors::Unimplemented( - "base_ptr() is only implemented for auto_growth " - "strategy, not support %s strategy", - FLAGS_allocator_strategy)); - return base_ptr_; - } + void* base_ptr() const { return base_ptr_; } private: inline void RegisterDecoratedAllocator(Allocator* allocator) { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 6615bdf4b138b483761c82312841f5887f6075c7..7cdac0de6138f13325500759c0ca2a392e2000f9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -282,6 +282,10 @@ class AllocatorFacadePrivate { return iter->second; } + void* GetBasePtr(const std::shared_ptr& allocation) { + return static_cast(allocation.get())->base_ptr(); + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasCUDAAllocator(const platform::CUDAPlace& place, const gpuStream_t& stream) { @@ -821,6 +825,21 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +void* AllocatorFacade::GetBasePtr( + const std::shared_ptr& allocation) { + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only implemented for the auto_growth " + "strategy and does not support allocator strategy: %d", + static_cast(GetAllocatorStrategy()))); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()), true, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only implemented for CUDAPlace() and does " + "not support place: %s", + allocation->place())); + return m_->GetBasePtr(allocation); +} + std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { return std::shared_ptr(Alloc(place, size)); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 76e2f0b5a94f6ddae8e8fb6281bdfcf70f10b76c..a9b92e1801e4a3c74941388f864172f078d7128a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -51,6 +51,8 @@ class AllocatorFacade { const std::shared_ptr& GetAllocator(const platform::Place& place); + void* GetBasePtr(const std::shared_ptr& allocation); + // Allocate a shared allocation. 
std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/get_base_ptr_test.cu similarity index 80% rename from paddle/fluid/memory/allocation/base_ptr_test.cu rename to paddle/fluid/memory/get_base_ptr_test.cu index 5edabfcb9f5e7efab1242da5f5c091bebcf74c11..fe1d73b60284968d1e0022eb0383bcbcdc25856f 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/get_base_ptr_test.cu @@ -35,9 +35,9 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void OneByOneAllocTest() { for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -47,21 +47,21 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void BatchByBatchAllocTest() { - std::vector allocations; + std::vector> allocations; allocations.reserve(batch_size_); size_t batch_num = alloc_times_ / batch_size_; for (size_t i = 0; i < batch_num; ++i) { for (size_t j = 0; j < batch_size_; ++j) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); } @@ -70,19 +70,19 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ContinuousAllocTest() { - std::vector allocations; + std::vector> allocations; allocations.reserve(alloc_times_); for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); @@ -90,8 +90,8 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ZeroSizeAllocTest() { - AllocationPtr allocation = Alloc(place_, 0); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + auto allocation = AllocShared(place_, 0); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 3e859377e98d801e775461d9cfaaa50fe9c43e8e..63c562be97fa0728b26761ac856caf755717a64d 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -47,6 +47,10 @@ bool InSameStream(const std::shared_ptr& allocation, stream); } +void* GetBasePtr(const std::shared_ptr& allocation) { + return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& 
stream) { diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 6443e91f08cbeb7c3f504e8f4894808bffd5bbf1..855cbb775a1096ba749d93667c71268045645a15 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -44,6 +44,8 @@ extern std::shared_ptr AllocShared(const platform::Place& place, extern bool InSameStream(const std::shared_ptr& allocation, const platform::Stream& stream); +extern void* GetBasePtr(const std::shared_ptr& allocation); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 6d348ceb87c83de1bb201a6b57477d764b58a2ba..d2ab438fd2946701c70ea0bebf35ac33fbfb521e 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -57,33 +57,6 @@ void Copy(platform::IPUPlace dst_place, std::memcpy(dst, src, num); } -// NOTE: only for CPUPlace and IPUPlace. -template <> -void Copy(pten::Place dst_place, void* dst, - pten::Place src_place, const void* src, - size_t num) { - if (src_place.GetType() == pten::AllocationType::CPU && - dst_place.GetType() == pten::AllocationType::CPU) { - platform::CPUPlace place_dst, place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == pten::AllocationType::CPU && - dst_place.GetType() == pten::AllocationType::IPU) { - platform::IPUPlace place_dst(dst_place.GetDeviceId()); - platform::CPUPlace place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == pten::AllocationType::IPU && - dst_place.GetType() == pten::AllocationType::CPU) { - platform::IPUPlace place_src(src_place.GetDeviceId()); - platform::CPUPlace place_dst; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == pten::AllocationType::IPU && - dst_place.GetType() == pten::AllocationType::IPU) { - platform::IPUPlace place_src(src_place.GetDeviceId()); - platform::IPUPlace place_dst(dst_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num); - } -} - // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). template <> void Copy(pten::IPUPlace dst_place, void* dst, @@ -1039,6 +1012,24 @@ void Copy(pten::Place dst_place, void* dst, return Copy(place_dst, dst, place_src, src, num); } #endif +#ifdef PADDLE_WITH_IPU + else if (src_place.GetType() == pten::AllocationType::CPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::CPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num); + } else if (src_place.GetType() == pten::AllocationType::IPU && + dst_place.GetType() == pten::AllocationType::IPU) { + platform::IPUPlace place_src(src_place.GetDeviceId()); + platform::IPUPlace place_dst(dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num); + } +#endif } // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d18ff6f6bfe2f0b04966af9e80bc40f3bebfc593..cbc61fc804397b1f0e4ae28fc792959bf5cfe82e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -59,6 +59,10 @@ if(WITH_CINN) add_subdirectory(cinn) endif() +if(WITH_IPU) + add_subdirectory(ipu) +endif() + SET(OP_HEADER_DEPS xxhash executor) if (WITH_GPU) diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 979ae5c508c6b6685848b8eff4944aa5461a1daa..5d769214df4d15823066d6a0c2b5a5af0e06261d 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { inverse_scale = 0.0; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); + auto version = dev_ctx.xpu_version(); framework::Tensor float_x; framework::Tensor float_out; if (std::is_same::value && - (version == paddle::platform::XPUVersion::XPU1)) { + (version == pten::backends::xpu::XPUVersion::XPU1)) { float_x.mutable_data(dev_ctx.GetPlace(), x->numel() * sizeof(MPDType)); float_out.mutable_data(dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index 5fe2ebb20745b28a2c5a34b2257d741f2ca49d05..1125bbe93c37a99966d49e5da623903d6ba9bf19 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -50,9 +50,8 @@ class AssignFunctor { } } - void operator()(const framework::SelectedRows &rows) const { - framework::SelectedRows &out_rows = - *out_->GetMutable(); + void operator()(const pten::SelectedRows &rows) const { + pten::SelectedRows &out_rows = *out_->GetMutable(); out_rows.set_rows(rows.rows()); out_rows.set_height(rows.height()); auto &t = rows.value(); diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc index 3504ec37d6670b73e93a416ca2d9244b94b46b91..efc1ed9e2ee6045870d1201d686df5a145574bd8 100644 --- a/paddle/fluid/operators/assign_op_test.cc +++ b/paddle/fluid/operators/assign_op_test.cc @@ -87,7 +87,7 @@ TEST(AssignOp, AssignSelectedRows) { std::vector rows{0, 4, 7}; int64_t height = 10; - paddle::framework::SelectedRows input(rows, height); + pten::SelectedRows input(rows, height); paddle::framework::Tensor* input_tensor = input.mutable_value(); paddle::framework::DDim in_dims = paddle::framework::make_ddim({3, 4}); @@ -98,7 +98,7 @@ TEST(AssignOp, AssignSelectedRows) { assign_functor(input); - auto& out_selected_row = output.Get(); + auto& out_selected_row = output.Get(); const paddle::framework::Vector& out_rows = out_selected_row.rows(); EXPECT_EQ(rows.size(), out_rows.size()); for (size_t i = 0; i < rows.size(); ++i) { diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 534af63d2a03fb0fe71769e32e3e9377be5ba68b..0e64b461786cce845f7388a520c09101dcba9c09 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -106,7 +106,7 @@ class MLUBatchNormOpKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; 
} diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu index 368fbe836c266c8835f544b7d739797faf019a81..4d04fdc8ce2d2c658d7e39535dbd9ff2d31c216e 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cu +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -36,21 +36,22 @@ class ClipByNormKernel output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - SelectedRows* merged_input = + pten::SelectedRows* merged_input = const_cast(context.scope()) .Var() - ->GetMutable(); + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - SelectedRows* output_selected_rows = context.Output("Out"); + pten::SelectedRows* output_selected_rows = + context.Output("Out"); output_selected_rows->set_rows(merged_input->rows()); output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index adb2a2fcfa3a7050ad8fd80dcdd4acb04ce49d2d..fb21e98efec2c732b8abeb88343982f62ad07712 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -24,7 +24,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; +// using SelectedRows = pten::SelectedRows; template using EigenVector = framework::EigenVector; @@ -43,20 +43,21 @@ class ClipByNormKernel : public framework::OpKernel { output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - SelectedRows* merged_input = + pten::SelectedRows* merged_input = const_cast(context.scope()) .Var() - ->GetMutable(); + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - SelectedRows* output_selected_rows = context.Output("Out"); + pten::SelectedRows* output_selected_rows = + context.Output("Out"); output_selected_rows->set_rows(merged_input->rows()); output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index fb41dc16d65129e84df693ab9aed6af4607c0db8..5aff62656fb0f4ba0b0044e8c4a6dcabe42181d5 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -113,9 +113,9 @@ class ClipKernel : public framework::OpKernel { trans(context.template device_context(), x_data, x_data + numel, out_data, ClipFunctor(min, max)); } - } else if (x_var->IsType()) { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + } else if (x_var->IsType()) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); PADDLE_ENFORCE_NE(x, out, platform::errors::InvalidArgument( "Inplace clip is not allowed " "when x is SelectedRows")); diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f7a1cae72be5a42d15d2e89663010489f529962a --- /dev/null +++ 
b/paddle/fluid/operators/concat_op_mlu.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class ConcatMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initialized.")); + auto axis = ctx.Attr("axis"); + auto ins_size = ins.size(); + bool need_resize_out_dims = false; + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + need_resize_out_dims = true; + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + if (need_resize_out_dims) { + const size_t n = ins.size(); + std::vector ins_dims(n); + for (size_t i = 0; i < n; i++) { + ins_dims[i] = ins[i]->dims(); + } + + framework::DDim out_dims = ComputeAndCheckShape(true, ins_dims, axis); + out->Resize(out_dims); + } + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = ctx.GetPlace(); + out->mutable_data(place); + + // prepare the inputs for CNNL: + // build a tensor descriptor and a data pointer for each input + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + *ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->type()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(GetBasePtr(ins[i])); + } + // build the tensor descriptor for the output + MLUCnnlTensorDesc output_desc(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->type())); + + // launch the CNNL concat on the MLU device + MLUCnnl::Concat(ctx, ins_size_t, axis_t, desc_vector.data(), inputs.data(), + output_desc.get(), GetBasePtr(out)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(concat, ops::ConcatMLUKernel, + ops::ConcatMLUKernel, + ops::ConcatMLUKernel, + ops::ConcatMLUKernel, ops::ConcatMLUKernel, + ops::ConcatMLUKernel); diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4c9727391759b0c1865e9fc51288458e7786c878..7ad49de4eed5e26cdc24a7444ead9a50abf54453 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -251,7 +251,7 @@ struct SearchAlgorithm { args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = GetWorkspaceSize(args, algo); + workspace_size = (perf_results.get())[best_algo_idx].memory; if (workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 @@ -502,7 +502,8 @@ struct SearchAlgorithm { args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, 
&perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - workspace_size = GetWorkspaceSize(args, algo); + workspace_size = (perf_results.get())[best_algo_idx].memory; + if (workspace_size > workspace_size_limit) { workspace_size = workspace_size_limit; #if CUDNN_VERSION >= 8000 diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..88698c02dd5daf11d6c5b7d68446d292696977ec --- /dev/null +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -0,0 +1,251 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +template +class MLUConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); + std::vector ksize = framework::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_tensor(output->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_tensor.ShareDataWith(*output); + } else { + // transpose input from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + auto output_dims = output->dims(); + output_tensor.mutable_data( + {output_dims[0], output_dims[2], output_dims[3], output_dims[1]}, + ctx.GetPlace()); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true 
/*need_reshape_or_alloc*/); + + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout, + ToCnnlDataType(input_tensor.type())); + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, + ToCnnlDataType(trans_filter.type())); + MLUCnnlTensorDesc output_desc(output_tensor, data_layout, + ToCnnlDataType(output_tensor.type())); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + ToCnnlDataType()); + + MLUCnnl::ConvolutionForward( + ctx, conv_desc.get(), nullptr /*alpha*/, nullptr /*beta*/, + nullptr /*bias_desc*/, nullptr /*bias_ptr*/, input_desc.get(), + GetBasePtr(&input_tensor), filter_desc.get(), GetBasePtr(&trans_filter), + output_desc.get(), GetBasePtr(&output_tensor)); + + if (!channel_last) { + // transpose output from NHWC to NCHW + const std::vector perm_to_nchw = {0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, + false /*need_reshape_or_alloc*/); + } + } +}; + +template +class MLUConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); + + std::vector ksize = framework::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_grad_tensor(output_grad->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + const std::vector perm_to_nchw = {0, 3, 1, 2}; + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_grad_tensor.ShareDataWith(*output_grad); + } else { + // transpose input and output_grad from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + TransposeFromMLUTensor(ctx, perm_to_nhwc, output_grad, + &output_grad_tensor, + true /*need_reshape_or_alloc*/); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_grad_tensor.set_layout(DataLayout::kNHWC); + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + + auto filter_grad_dims = filter_grad->dims(); + Tensor temp_filter_grad(filter_grad->type()); + temp_filter_grad.mutable_data( + {filter_grad_dims[0], filter_grad_dims[2], filter_grad_dims[3], + filter_grad_dims[1]}, + ctx.GetPlace()); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + 
MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc temp_filter_grad_desc(temp_filter_grad, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropFilter( + ctx, conv_desc.get(), input_desc.get(), GetBasePtr(&input_tensor), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + temp_filter_grad_desc.get(), GetBasePtr(&temp_filter_grad)); + + // transpose filter_grad from MHWC to MCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &temp_filter_grad, + filter_grad, false /*need_reshape_or_alloc*/); + } + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + + Tensor input_grad_tensor(input_grad->type()); + if (channel_last) { + input_grad_tensor.ShareDataWith(*input_grad); + } else { + auto input_grad_dims = input_grad->dims(); + input_grad_tensor.mutable_data( + {input_grad_dims[0], input_grad_dims[2], input_grad_dims[3], + input_grad_dims[1]}, + ctx.GetPlace()); + } + input_grad_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc in_grad_desc(input_grad_tensor, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropInput( + ctx, conv_desc.get(), filter_desc.get(), GetBasePtr(&trans_filter), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + in_grad_desc.get(), GetBasePtr(&input_grad_tensor)); + + if (!channel_last) { + // transpose input_grad from NHWC to NCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &input_grad_tensor, + input_grad, false /*need_reshape_or_alloc*/); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(conv2d, ops::MLUConvOpKernel, + ops::MLUConvOpKernel); + +REGISTER_OP_MLU_KERNEL(conv2d_grad, ops::MLUConvGradOpKernel, + ops::MLUConvGradOpKernel); diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index cded525b030d8d88774d01488e3575195381bba4..e80797bd9b971a210efa423d4797984fb1dacf7d 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel { return; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); - if (version == paddle::platform::XPUVersion::XPU1) { + auto version = dev_ctx.xpu_version(); + if (version == pten::backends::xpu::XPUVersion::XPU1) { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm(mask->numel()); float scale = diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 0c2476fde05c2c3226105707fd2686bc61d15bc7..f462c2ea0720b600f238109704e9606a2f7d627c 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -33,34 +33,6 @@ class CPUDeviceContext; namespace paddle { namespace operators { -template -struct SameDimsElemwiseAdd< - platform::CPUDeviceContext, T, - typename std::enable_if::value>::type> { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z) { - auto blas = math::GetBlas(ctx); - blas.VADD(x->numel(), x->data(), y->data(), z->data()); - } -}; - -template -struct SameDimsElemwiseAdd< - platform::CPUDeviceContext, T, - typename std::enable_if::value>::type> { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z) { - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_y = framework::EigenVector::Flatten(*y); - auto eigen_z = framework::EigenVector::Flatten(*z); - auto &place = *ctx.template device_context() - .eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; - } -}; - class ElementwiseAddOpMaker : public ElementwiseOpMaker { protected: std::string GetName() const override { return "Add"; } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 779779b44da8d1df275b057bbb9d37828c6904ed..2326aa561eaa05986c6e58bc1f2f2c93334cf893 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -13,139 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/pten/kernels/gpu/elementwise.h" namespace ops = paddle::operators; namespace plat = paddle::platform; namespace paddle { -namespace operators { - -template -static __global__ void SimpleElemwiseAddGradCUDAKernel( - const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = gridDim.x * blockDim.x; - int loop = size / vec_size; - int remainder = size % vec_size; - const float4* dout_vec = reinterpret_cast(dout); - float4* dx_vec = reinterpret_cast(dx); - float4* dy_vec = reinterpret_cast(dy); - float4 tmp_loop; - - for (int i = tid; i < loop; i += stride) { - tmp_loop = dout_vec[i]; - dx_vec[i] = tmp_loop; - dy_vec[i] = tmp_loop; - } - - if (tid == loop && remainder != 0) { - T tmp_rem; - while (remainder) { - int idx = size - remainder; - remainder--; - tmp_rem = dout[idx]; - dx[idx] = tmp_rem; - dy[idx] = tmp_rem; - } - } -} - -template -typename std::enable_if< - std::is_same::value>::type -default_elementwise_add_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - auto* dout_data = dout->data(); - - // dx - if (dx != nullptr) { - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout->dims()) { - if (dx_data != dout_data) { - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(*dout)) { - dx->clear(); - dx->mutable_data(x->dims(), ctx.GetPlace()); - } - std::vector reduce_dims = GetReduceDim(x->dims(), out->dims(), axis); - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceFunctorImpl>( - *dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto* dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout->dims()) { - if (dy_data != dout_data) { - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dy); - } - } else { - std::vector reduce_dims = GetReduceDim(y->dims(), out->dims(), axis); - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceFunctorImpl>( - *dout, dy, kps::IdentityFunctor(), reduce_dims, stream); - } - } -} - -template -typename std::enable_if< - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - auto* dy_data = dy->mutable_data(ctx.GetPlace()); - auto* dout_data = dout->data(); - if (dx_data == dout_data && dy_data != dout_data) { - VLOG(4) << "Special case when dx_data is the same as dout_data, " - "only need copy dout to dy"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dy); - } else if (dx_data != dout_data && dy_data == dout_data) { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "only need copy dout to dx"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } else if (dx_data != dout_data && dy_data != dout_data) { - auto size = x->numel(); - int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / - PREDEFINED_BLOCK_SIZE, - 1); - SimpleElemwiseAddGradCUDAKernel< - T><<().stream()>>>( - dout->data(), size, vec_size, dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); - } else { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "and dx_data is the same as dout_data, do not need " - "any operator"; - } -} - -} // namespace operators +namespace operators {} // namespace operators } // namespace paddle REGISTER_OP_CUDA_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 5c4f791b2270c2d45909c24868e56d0bc62f86c3..73415d3fdb5c83cac1c0a8afb67548d7fa09b3c3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -18,35 +18,13 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" +// only can include the headers in paddle/pten/include dirs +#include "paddle/pten/kernels/elementwise_grad_kernel.h" #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { -template -void LaunchBroadcastElementwiseCpuKernel(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, - framework::Tensor *z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - AddFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseAddFunctor(), z); - } -} - -template -struct SameDimsElemwiseAdd { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z); -}; - template class ElementwiseAddKernel : public framework::OpKernel { public: @@ -58,128 +36,29 @@ class ElementwiseAddKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); pten::AddRawKernel( static_cast::TYPE &>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + *x, *y, axis, z); } }; -template -struct IdentityGrad { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -typename std::enable_if< - std::is_same::value>::type -default_elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, - framework::Tensor *dx, framework::Tensor *dy) { - int axis = ctx.Attr("axis"); - - ElemwiseExplicitGradCompute, - IdentityGrad>(ctx, *x, *y, *out, *dout, axis, - dx, dy, IdentityGrad(), - IdentityGrad()); -} - -template -typename std::enable_if< - std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, framework::Tensor *dx, - framework::Tensor *dy) { - auto blas = math::GetBlas(ctx); - if (dx) { - blas.VCOPY(dout->numel(), dout->data(), - dx->mutable_data(ctx.GetPlace())); - } - - if (dy) { - blas.VCOPY(dout->numel(), dout->data(), - dy->mutable_data(ctx.GetPlace())); - } -} - -template -typename std::enable_if< - !std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, framework::Tensor *dx, - framework::Tensor *dy) { - default_elementwise_add_grad(ctx, x, y, out, dout, dx, dy); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// cuda definition -template -typename std::enable_if< - std::is_same::value>::type -elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, framework::Tensor *dx, - framework::Tensor *dy); - -template -typename std::enable_if< - std::is_same::value>::type -default_elementwise_add_grad(const framework::ExecutionContext &ctx, - 
const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, - framework::Tensor *dx, framework::Tensor *dy); -#endif - template class ElementwiseAddGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - auto *x = ctx.Input("X"); auto *y = ctx.Input("Y"); auto *dout = ctx.Input(framework::GradVarName("Out")); auto *dx = ctx.Output(framework::GradVarName("X")); auto *dy = ctx.Output(framework::GradVarName("Y")); - // skip out - auto *out = dout; - - // Special case when dy is not needed and dx doesn't reduce - if (dx != nullptr && dy == nullptr && dx->dims() == dout->dims()) { - VLOG(4) << "Special case when dy is not needed and dx doesn't " - "reduce"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } else if (dx == nullptr && dy != nullptr && dy->dims() == dout->dims()) { - VLOG(4) << "Special case when dx is not needed and dy doesn't " - "reduce"; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dy); - } else if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(ctx, x, y, out, dout, dx, dy); - } else { - default_elementwise_add_grad(ctx, x, y, out, dout, dx, - dy); - } + const auto &dev_ctx = ctx.template device_context(); + int axis = ctx.Attr("axis"); + pten::AddGradKernel( + static_cast::TYPE &>(dev_ctx), + *x, *y, *dout, axis, dx, dy); } }; @@ -195,17 +74,20 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { auto *ddy = ctx.Input("DDY"); auto *ddout = ctx.Output("DDOut"); - - // ddOut = ddx + ddy - if (ddout) { - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, dout, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - ddout->mutable_data(ctx.GetPlace()); - LaunchBroadcastElementwiseCpuKernel(ctx, &ddx_safe, - &ddy_safe, ddout); + const auto &dev_ctx = ctx.template device_context(); + int axis = ctx.Attr("axis"); + paddle::optional ddx_optional = paddle::none; + paddle::optional ddy_optional = paddle::none; + if (ddx != nullptr) { + ddx_optional = *ddx; + } + if (ddy != nullptr) { + ddy_optional = *ddy; } + pten::AddDoubleGradKernel( + static_cast::TYPE &>(dev_ctx), + *y, ddx_optional, ddy_optional, *dout, axis, ddout); } }; @@ -219,32 +101,13 @@ class ElementwiseAddTripleGradKernel : public framework::OpKernel { auto *d_ddout = ctx.Input("D_DDOut"); auto *d_ddx = ctx.Output("D_DDX"); auto *d_ddy = ctx.Output("D_DDY"); - // skip out - auto *out = d_ddout; - - // Special case when d_ddy is not needed and d_ddx doesn't reduce - if (d_ddx != nullptr && d_ddy == nullptr && - d_ddx->dims() == d_ddout->dims()) { - VLOG(4) << "Special case when d_ddy is not needed and d_ddx doesn't " - "reduce"; - framework::TensorCopy( - *d_ddout, ctx.GetPlace(), - ctx.template device_context(), d_ddx); - } else if (d_ddx == nullptr && d_ddy != nullptr && - d_ddy->dims() == d_ddout->dims()) { - VLOG(4) << "Special case when d_ddx is not needed and d_ddy doesn't " - "reduce"; - framework::TensorCopy( - *d_ddout, ctx.GetPlace(), - ctx.template device_context(), d_ddy); - } else if (d_ddx != nullptr && d_ddy != nullptr && - (d_ddx->dims() == d_ddy->dims())) { - elementwise_add_grad(ctx, ddx, ddy, out, d_ddout, d_ddx, - d_ddy); - } else { - default_elementwise_add_grad(ctx, ddx, ddy, out, - d_ddout, d_ddx, d_ddy); - } + + const auto &dev_ctx = 
ctx.template device_context(); + int axis = ctx.Attr("axis"); + pten::AddTripleGradKernel( + static_cast::TYPE &>(dev_ctx), + *ddx, *ddy, *d_ddout, axis, d_ddx, d_ddy); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 0c7d12ae0ad55cedfced38705ae40d7394c07158..8923f1fd4b866252ec8048729c717e79230f1f7b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -32,7 +32,7 @@ class ElementwiseMulKernel ctx.InputName("X"))); const auto& cuda_ctx = ctx.template device_context(); - if (x_var->IsType()) { + if (x_var->IsType()) { framework::Tensor x_for_selectedrows; std::vector ins; std::vector outs; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index e7a5e48b1f1b5570d8a4c32b44aac4d8f0705d9a..40faf7cbbe8cd8b30891f5b5865a6eb17f5e27ed 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -92,20 +92,20 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); framework::Tensor x, *z; - if (x_var->IsType()) { + if (x_var->IsType()) { PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, platform::errors::InvalidArgument( "For elementwise_op, if X is Sparse, Y must be " "scalar. But reveived the size of Y = %s.", y->dims().size())); - auto& x_sele = x_var->Get(); - auto out_sele = ctx.Output("Out"); + auto& x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); x = x_sele.value(); out_sele->set_rows(x_sele.rows()); out_sele->set_height(x_sele.height()); out_sele->mutable_value()->Resize(x_sele.value().dims()); out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type()); - z = ctx.Output("Out")->mutable_value(); + z = ctx.Output("Out")->mutable_value(); z->mutable_data(ctx.GetPlace()); auto dims_equal = x.dims() == y->dims(); if (dims_equal) { diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index aaf33ca67448865abd172d7fdb9f10728ec5766d..64beac0804d0f650a65fe218d2a68495da2303f1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -354,6 +354,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { tensor.place(), tensor.layout()); } } + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + if (Type() == "elementwise_add_grad") { + if (ctx.InputVar("X")->IsType()) { + return framework::KernelSignature( + "add_grad", {"X", "Y", framework::GradVarName("Out")}, {"axis"}, + {framework::GradVarName("X"), framework::GradVarName("Y")}); + } + } + + return framework::KernelSignature("None", {"X"}, {}, {"Out"}); + } }; class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { @@ -522,11 +534,9 @@ class ElemwiseGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &context) const override { auto *dx = context.Output(framework::GradVarName("X")); - if (dx != nullptr) { - auto &dout = - *context.Input(framework::GradVarName("Out")); - dx->set_lod(dout.lod()); - } + auto &dout = + *context.Input(framework::GradVarName("Out")); + pten::funcs::ElementwiseGradPreProcess(dout, dx); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h 
b/paddle/fluid/operators/elementwise/elementwise_op_function.h index f0641dd97d87f448021fc7f7a8e02be4cb44d2ba..fdf04181de76c64ba239ce8fbd83bf9f5d1c5124 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -84,7 +84,7 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, auto *x = ctx.Input("X"); z = ctx.Output("Out"); ins->emplace_back(x); - } else if (x_var->IsType()) { + } else if (x_var->IsType()) { PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, platform::errors::InvalidArgument( "For elementwise_op, if X is Sparse, Y must be " @@ -96,15 +96,15 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, "The parameter x_for_selectedrows is excepted to " "be valid, once input varible X`s class type is " "SelectedRows.\n")); - auto &x_sele = x_var->Get(); - auto out_sele = ctx.Output("Out"); + auto &x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); *x_for_selectedrows = x_sele.value(); out_sele->set_rows(x_sele.rows()); out_sele->set_height(x_sele.height()); out_sele->mutable_value()->Resize(x_sele.value().dims()); out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x_for_selectedrows->type()); - z = ctx.Output("Out")->mutable_value(); + z = ctx.Output("Out")->mutable_value(); ins->emplace_back(x_for_selectedrows); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -158,32 +158,6 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, } } -// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. -// explicit gradient can cut off X, Y, Out from gradient op -// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse -// elementwise code. -template -void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, - const framework::Tensor &x, - const framework::Tensor &y, - const framework::Tensor &out, - const framework::Tensor &dout, int axis, - framework::Tensor *dx, framework::Tensor *dy, - DX_OP dx_op, DY_OP dy_op) { - const framework::DDim &x_dim = x.dims(); - const framework::DDim &y_dim = y.dims(); - const auto &dev_ctx = ctx.template device_context(); - if (x.dims() == y.dims()) { - pten::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, - dy_op); - } else { - pten::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, - dy_op); - } -} - // It is a common implementation to compute binary calculation with the support // of broadcast, supporting both CPU and GPU. 
// - CPU implementation cannot support the case when x needs broadcast, thus @@ -199,30 +173,20 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { + z->mutable_data(ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { #if defined(__NVCC__) || defined(__HIPCC__) - std::vector ins = {x, y}; - std::vector outs = {z}; - z->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); + pten::ElementwiseCompute(dev_ctx, *x, *y, axis, func, + z); + #endif return; } - - z->mutable_data(ctx.GetPlace()); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - const auto &dev_ctx = ctx.template device_context(); - pten::ElementwiseCompute( - dev_ctx, *pt_x.get(), *pt_y.get(), axis, func, pt_z.get()); + pten::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); } // FusedElemwiseAndAct @@ -1207,36 +1171,16 @@ template static inline void GetDoubleGradSafeTensor( const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *ddx, framework::Tensor *ddx_safe) { - if (ddx) { - *ddx_safe = *ddx; - } else { - auto &dev_ctx = ctx.template device_context(); - *ddx_safe = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - math::SetConstant set_zero; - set_zero(ctx.template device_context(), ddx_safe, - static_cast(0)); - } + const auto &dev_ctx = ctx.template device_context(); + pten::funcs::GetDoubleGradSafeTensor(dev_ctx, *x, ddx, + ddx_safe); } // for broadcast backwards static inline std::vector GetReduceDim(const framework::DDim &in, const framework::DDim &out, int axis) { - axis = - (axis == -1 ? 
std::abs(static_cast(out.size() - in.size())) : axis); - std::vector dims; - for (int i = 0; i < axis; ++i) { - dims.push_back(i); - } - for (int i = 0; i < in.size(); ++i) { - if (out[i + axis] != in[i]) { - dims.push_back(i + axis); - } - } - for (int i = axis + in.size(); i < out.size(); ++i) { - dims.push_back(i); - } - return dims; + return pten::funcs::GetReduceDim(in, out, axis); } #if defined(__NVCC__) || defined(__HIPCC__) diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 7d1749f20abf29f155e0d05931902a63aa9a1837..8fc6038ab65819dbf6e108b0f3df1a4478e915c4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -78,9 +78,11 @@ default_elementwise_sub_grad(const framework::ExecutionContext& ctx, const framework::Tensor* dout, framework::Tensor* dx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); - - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); + const auto& dev_ctx = + ctx.template device_context(); + pten::ElemwiseExplicitGradCompute, SubGradDY>( + dev_ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), + SubGradDY()); } template diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index ee7c0eb96eae5c14e68023a3ebbdc2ef4ea9ca04..c0e2b4584d0260e221b2fc45d3e7e46415a9b7b5 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -117,7 +117,7 @@ class FillConstantOp : public framework::OperatorWithKernel { const auto& str_value = ctx.Attr("str_value"); value = str_value.empty() ? "value" : "str_value"; } - if (!ctx.OutputVar("Out")->IsType()) { + if (!ctx.OutputVar("Out")->IsType()) { return framework::KernelSignature("full", {}, {shape, value}, {"Out"}); } return framework::KernelSignature("fill_constant.unregistered", {}, {}, {}); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 9e9bd2e0fbbc94c1aaa85018de3b4ed96a8f686c..c74cf2a824c830a7a3b00f90e31b8508c23aba68 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -92,8 +92,8 @@ class FillConstantKernel : public framework::OpKernel { if (out_var->IsType()) { tensor = out_var->GetMutable(); tensor->Resize(shape); - } else if (out_var->IsType()) { - tensor = out_var->GetMutable()->mutable_value(); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); tensor->Resize(shape); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/fill_constant_op_mlu.cc b/paddle/fluid/operators/fill_constant_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..d161a857d6c1778e8136702564dae2582c8a0465 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op_mlu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fill_constant_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template
+class FillConstantMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto str_value = ctx.Attr("str_value");
+    auto float_value = ctx.Attr("value");
+
+    auto *out_var = ctx.Output("Out");
+
+    T value;
+    if (str_value.empty()) {
+      value = static_cast(float_value);
+    } else {
+      // handle NaN/Inf first, which cannot be read from stream.
+      if (str_value == "inf") {
+        value = static_cast(std::numeric_limits::infinity());
+      } else if (str_value == "-inf") {
+        value = static_cast(-std::numeric_limits::infinity());
+      } else if (str_value == "nan") {
+        value = static_cast(std::numeric_limits::quiet_NaN());
+      } else {
+        std::stringstream convert_stream(str_value);
+        if (std::is_same::value) {
+          int64_t tmp_value;
+          convert_stream >> tmp_value;
+          value = static_cast(tmp_value);
+        } else {
+          double tmp_value;
+          convert_stream >> tmp_value;
+          value = static_cast(tmp_value);
+        }
+      }
+    }
+    if (ctx.HasInput("ValueTensor")) {
+      auto *value_tensor = ctx.Input("ValueTensor");
+      PADDLE_ENFORCE_EQ(
+          value_tensor->numel(), 1,
+          platform::errors::InvalidArgument(
+              "When a Tensor is used as the value in fill_constant, "
+              "the value input (ValueTensor) size must be 1, but got %d",
+              value_tensor->numel()));
+      const T *tensor_data = value_tensor->data();
+      framework::Tensor mlu_tensor;
+      auto tmp_place = value_tensor->place();
+      if (platform::is_mlu_place(tmp_place)) {
+        TensorCopySync(*value_tensor, platform::CPUPlace(), &mlu_tensor);
+        tensor_data = mlu_tensor.data();
+      }
+      value = tensor_data[0];
+    }
+
+    auto shape = GetShape(ctx);
+    out_var->mutable_data(shape, ctx.GetPlace());
+    MLUCnnlTensorDesc output_desc(*out_var, CNNL_LAYOUT_ARRAY,
+                                  ToCnnlDataType(out_var->type()));
+    MLUCnnl::Fill(ctx, value, output_desc.get(), GetBasePtr(out_var));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_MLU_KERNEL(
+    fill_constant, paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel,
+    paddle::operators::FillConstantMLUKernel);
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
index fa0cab04168d1e3ea48fc3cf7397e976a39eac2a..1402f3404fd6de57a244703305db7361879f7bf7 100644
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ b/paddle/fluid/operators/filter_by_instag_op.h
@@ -29,7 +29,7 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
-using SelectedRows = framework::SelectedRows;
+using SelectedRows = pten::SelectedRows;
 using LoDTensor = framework::LoDTensor;
 
 template
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index 4e4322947a8571284202a0fb89af8b167b1a58b9..fc782dc55117519494cb8d527672b01e5654f384 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -30,7 +30,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
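The special cases in FillConstantMLUKernel above exist because stream extraction does not reliably parse the tokens "inf", "-inf", or "nan", so they must be matched before falling back to numeric extraction. A minimal standalone sketch of the same scheme (ParseConstantValue is a hypothetical helper, not part of the patch):

```cpp
#include <cstdint>
#include <limits>
#include <sstream>
#include <string>
#include <type_traits>

// Hypothetical helper mirroring the str_value parsing in the MLU kernel above.
template <typename T>
T ParseConstantValue(const std::string &str_value, float float_value) {
  if (str_value.empty()) return static_cast<T>(float_value);
  // NaN/Inf cannot be read back through operator>>, so match them first.
  if (str_value == "inf")
    return static_cast<T>(std::numeric_limits<double>::infinity());
  if (str_value == "-inf")
    return static_cast<T>(-std::numeric_limits<double>::infinity());
  if (str_value == "nan")
    return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
  std::stringstream convert_stream(str_value);
  if (std::is_same<int64_t, T>::value) {
    int64_t tmp_value;  // integral attributes round-trip through int64_t
    convert_stream >> tmp_value;
    return static_cast<T>(tmp_value);
  }
  double tmp_value;  // floating attributes round-trip through double
  convert_stream >> tmp_value;
  return static_cast<T>(tmp_value);
}
```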
-using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -200,8 +200,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::PermissionDenied( @@ -215,7 +215,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { if (is_sparse) { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); // runtime shape d_table->set_height(table_dim[0]); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index c6205863103ff99e3d850c5acc739a400cdb5696..babf1c657f232d8316df924487a925c6b6162cf9 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -19,6 +19,8 @@ limitations under the License. */ namespace paddle { namespace operators { +#define LN_NUM_COLS 1024 + template using CudnnDataType = platform::CudnnDataType; template @@ -153,6 +155,191 @@ __global__ void FusedLayernormResidualDropoutBias( invvar); } +/* +* @brief layernorm(residual + dropout(x)); + * Conditions: + * (1) The number of cols is 1024; + * (2) layer_norm scale and bias is not null; + * (3) linear bias is null; + * @param + * rows: batch_size * seq_len + * cols: 1024 + * x_: [rows, cols], inputs + * residual_:[rows, cols] + * gamma_: [cols]: layernorm scale, not null + * beta_: [cols], layernorm bias, not null + * mask_out_: [rows, cols], dropout result + * residual_out_: [rows, cols], residual + dropout(src) + * y_: [rows, cols], layernorm result + * mean_out_: [rows]: layernorm means + * var_out_: [rows]: layernorm vars +*/ +template < + typename T, typename U, typename ScaleT = U, typename MaskType = uint8_t, + int VecSize = 8, int WARPS_M = 4, int WARPS_N = 1, int BYTES_PER_LDG = 16, + int ELTS_PER_ROW = 1024, int THREADS_PER_WARP = 32, + int THREADS_PER_ROW = WARPS_N *THREADS_PER_WARP, + int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, + int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, + int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA> +__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( + int rows, int cols, uint64_t seed, const float dropout_prob, + const bool is_upscale_in_train, const bool is_test, + const uint64_t increment, const float epsilon, const T *__restrict__ x_ptr, + const T *__restrict__ residual_ptr, const ScaleT *__restrict__ gamma_ptr, + const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, + U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, + T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { + using Vec = platform::AlignedVector; + using Vec_scale = platform::AlignedVector; + using MaskStoreT = platform::AlignedVector; + + const int tidx = threadIdx.x; + const int bidx = blockIdx.x; + const int lane = tidx % THREADS_PER_WARP; // 0, 1, ..., 31 + const int warp = tidx / THREADS_PER_WARP; // 0, 1, 2, 3 + const int warp_n = 
warp % WARPS_N; // 0 + const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 + + const int c = warp_n * THREADS_PER_WARP + lane; // lane + const int r = bidx * ROWS_PER_CTA + warp_m; // row id + + int idx = r * LN_NUM_COLS + c; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, increment, &state); + + T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); + + Vec_scale gamma[LDGS]; + Vec_scale beta[LDGS]; +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + platform::Load(beta_ptr + col * VecSize, &beta[it]); + col += THREADS_PER_ROW; + } + + constexpr U rn = 1.f / U(LN_NUM_COLS); + for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { + Vec x[LDGS]; + Vec residual[LDGS]; +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, + &x[it]); + platform::Load( + residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]); + col += THREADS_PER_ROW; + } + + MaskStoreT mask_vec[LDGS]; + if (!is_test) { +#pragma unroll + for (int it = 0; it < LDGS; it++) { + float rand[VecSize]; + RandVec(&state, rand); +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { +#pragma unroll + mask_vec[it][jt] = static_cast(rand[jt] >= dropout_prob); + } + } + } else { +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + mask_vec[it][jt] = static_cast(1); + } + } + } + + // 4 * 8 + U xf[LDGS * VecSize]; +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } + } + +// store dropout_residual_out and mask_out +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Store( + x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); + platform::Store( + mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); + col += THREADS_PER_ROW; + } + + U mu_local = 0.f; +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + mu_local += xf[it * VecSize + jt]; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); + } + mu_local *= rn; + if (lane == 0) { + mean_out_ptr[row] = mu_local; + } + U var_local = 0.f; + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + U diff = xf[it * VecSize + jt] - mu_local; + var_local += diff * diff; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); + } + U rsigma = rsqrtf(var_local * rn + epsilon); + if (lane == 0) { + // Note: the stored var is different for paddle(ln) and apex (fast ln). 
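+    // The two __shfl_xor_sync loops above are warp-level butterfly
+    // reductions: after log2(32) = 5 exchange rounds every lane holds the
+    // full 32-lane sum, so lane 0 can publish the row statistics.
+    // Worked example (values assumed): with var_local * rn = 0.25 and
+    // epsilon = 1e-5, apex's fast ln would store rsigma = rsqrtf(0.25 +
+    // 1e-5) ~= 2.0, while the store below keeps the biased variance 0.25.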
+ // var_out_ptr[row] = rsigma; + var_out_ptr[row] = var_local * rn; + } + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // use fp16 to compute + // ScaleT tmp = static_cast(rsigma * (xf[it * VecSize + jt] - + // mu_local)); + // x[it][jt] = gamma[it][jt] * tmp + beta[it][jt]; + // cast to fp32 to compute + U tmp = rsigma * (static_cast(xf[it * VecSize + jt]) - mu_local); + x[it][jt] = static_cast(static_cast(gamma[it][jt]) * tmp + + static_cast(beta[it][jt])); + } + } + +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Store(x[it], + y_ptr + row * LN_NUM_COLS + col * VecSize); + col += THREADS_PER_ROW; + } + } +} + /** * @brief layernorm(residual + dropout(src + bias)); * @param @@ -205,6 +392,13 @@ void LaunchLayernormResidualDropoutBias( return; } + bool can_call_1024_kernel = false; + if (cols == 1024 && scale != nullptr && layernorm_bias != nullptr && + bias == nullptr) { + can_call_1024_kernel = true; + } + VLOG(6) << "can_call_1024_kernel = " << can_call_1024_kernel; + const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { int blockDim = GetDesiredBlockDim(cols); @@ -215,13 +409,35 @@ void LaunchLayernormResidualDropoutBias( epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); } else { - int blockDim = GetDesiredBlockDim(cols / VecSize); - FusedLayernormResidualDropoutBias< - T, uint8_t, VecSize, U, - ScaleBiasWithSameTypeX><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, - epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, - layernorm_dst, mean, var); + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + // Note: the grid can not exceed max_grid of the gpu. 
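+      // Worked example (shapes assumed): rows = batch_size * seq_len = 512
+      // and ROWS_PER_CTA = WARPS_M = 4 give grid = ceil(512 / 4) = 128 CTAs;
+      // for T = fp16, VecSize = BYTES_PER_LDG / sizeof(T) = 16 / 2 = 8, so
+      // each thread issues LDGS = 1024 / (THREADS_PER_ROW * VecSize) = 4
+      // vector loads per row.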
+ const int grid = + static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); + fused_ln_fwd_1024_kernel< + T, U, LayerNormScaleBiasT, uint8_t, + VecSize, WARPS_M, WARPS_N, + BYTES_PER_LDG><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, scale, layernorm_bias, mask_data, + mean, var, dst, layernorm_dst); + } else { + int blockDim = GetDesiredBlockDim(cols / VecSize); + FusedLayernormResidualDropoutBias< + T, uint8_t, VecSize, U, + ScaleBiasWithSameTypeX><<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, bias, scale, layernorm_bias, + mask_data, dst, layernorm_dst, mean, var); + } } } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 57d3fc94dc88a0699b103c081642757798719332..cc14d0680d381ff2bbe73ee712e218c9c4d79185 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -66,12 +66,10 @@ struct TestFusedLayernormResidualDropoutBias { ctx = reinterpret_cast(devicectx); } - TestFusedLayernormResidualDropoutBias(int _rows, int _cols, - uint64_t _seed = 0, - float _dropout_prob = 0.0, - float _epsilon = 0.00001f, - bool _is_upscale_in_train = false, - bool _is_test = false) { + TestFusedLayernormResidualDropoutBias( + int _rows, int _cols, uint64_t _seed = 0, float _dropout_prob = 0.0, + float _epsilon = 0.00001f, bool _is_upscale_in_train = false, + bool _is_test = false, bool _has_bias = true) { rows = _rows; cols = _cols; seed = _seed; @@ -79,7 +77,7 @@ struct TestFusedLayernormResidualDropoutBias { epsilon = _epsilon; is_upscale_in_train = _is_upscale_in_train; is_test = _is_test; - has_bias = true; + has_bias = _has_bias; has_scale = true; has_layernorm_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -283,7 +281,6 @@ static void BaseTest(const bool is_fp16 = false) { } } } - TEST(FusedDropout, GPUFusedLayernormResidualDropoutBias) { BaseTest(); } TEST(FusedDropout, GPUFusedLayernormResidualDropoutBiasDouble) { @@ -330,3 +327,12 @@ TEST(FusedDropout, GPUFusedLayernormResidualDropoutLargeShape) { test.Run(); test.CheckOut(static_cast(1e-4)); } + +TEST(FusedDropout, GPUFusedLayernormResidualDropoutFp16MLperf) { + const int rows = 512; + const int cols = 1024; + TestFusedLayernormResidualDropoutBias test( + rows, cols, 0, 0, 0.00001f, false, false, false); + test.Run(); + test.CheckOut(static_cast(1e-2)); +} diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 8ce7df7eec15ead25ffb590454dd11228ffdadfc..67c265c97e46160fd824db1a8201b917d3414260 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -57,7 +57,7 @@ class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { class GetTensorFromSelectedRowsKernel { public: void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.Input("X"); + auto *x = ctx.Input("X"); auto *out = ctx.Output("Out"); out->Resize(x->value().dims()); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index a6f5fb017a752ee15fe70a3b57d0dabce3854f50..17734b9c542c830b9aab3498cabac5a8a1c8beca 100644 --- 
a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -204,7 +204,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel {
           "Custom tree must be set for sparse mode!"));
     framework::Vector real_rows = PathToRows(*path);
     auto* w_grad =
-        ctx.Output(framework::GradVarName("W"));
+        ctx.Output(framework::GradVarName("W"));
     w_grad->set_rows(real_rows);
     // Build a map of id -> row_index to speed up finding the index of one id
     w_grad->set_height(w.dims()[0]);
diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu
index 3db0fdf5e6da4e7b5ed7f0a8dbc2b96b7265cd83..72dd0fc743247116e7b9060676955dbd0ba31c76 100644
--- a/paddle/fluid/operators/interpolate_v2_op.cu
+++ b/paddle/fluid/operators/interpolate_v2_op.cu
@@ -16,39 +16,121 @@
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/fluid/platform/fast_divmod.h"
 
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
+using platform::FastDivMod;
 using DataLayout = framework::DataLayout;
 
+static inline int GetLastPow2(int n) {
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+  return std::max(1, n - (n >> 1));
+}
+
+inline platform::GpuLaunchConfig GetGpuLaunchConfig3D(
+    const platform::CUDADeviceContext& context, int num_img, int height,
+    int width) {
+  const int kThreadsPerBlock = 256;
+  int max_threads_per_block = context.GetMaxThreadsPerBlock();  // 1024
+  int max_threads = std::min(kThreadsPerBlock, max_threads_per_block);
+
+  int block_x = std::min(GetLastPow2(width), max_threads);
+  int block_y = std::min(GetLastPow2(height), max_threads / block_x);
+  int block_z = std::min(num_img, max_threads / block_x / block_y);
+
+  dim3 max_grid_dim = context.GetCUDAMaxGridDimSize();
+  int grid_x = std::min(max_grid_dim.x, platform::DivUp(width, block_x));
+  int grid_y = std::min(max_grid_dim.y, platform::DivUp(height, block_y));
+  int grid_z =
+      std::min(max_grid_dim.z, platform::DivUp(num_img, block_z * 4));
+
+  const int capability = context.GetComputeCapability();
+  platform::GpuLaunchConfig config;
+  config.compute_capability = capability;
+  config.thread_per_block = dim3(block_x, block_y, block_z);
+  config.block_per_grid = dim3(grid_x, grid_y, grid_z);
+  return config;
+}
+
+struct FastDivModForInterpolate {
+ public:
+  FastDivMod channels_div;
+  FastDivMod output_w_div;
+  FastDivMod output_wc_div;
+
+  explicit HOSTDEVICE FastDivModForInterpolate(const int channels,
+                                               const int output_w,
+                                               const int output_wc)
+      : channels_div(FastDivMod(channels)),
+        output_w_div(FastDivMod(output_w)),
+        output_wc_div(FastDivMod(output_wc)) {}
+};
+
+template
+__global__ void KeNearestNeighborInterpNCHWFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w, T* out,
+    const size_t out_img_h, const size_t out_img_w, const size_t nc,
+    const float ratio_h, const float ratio_w, const bool align_corners) {
+  int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y;
+  int nc_id = threadIdx.z + blockIdx.z * blockDim.z;
+  int nc_stride = blockDim.z * gridDim.z;
+
+  // nearest sampling: map each output (x, y) to an input pixel, then walk
+  // the NC dimension, reading from in and writing to out
+  int in_img_idx = (align_corners)
+                       ? static_cast(ratio_w * out_img_idx + 0.5)
+                       : static_cast(ratio_w * out_img_idx);
+  int in_img_idy = (align_corners)
+                       ? 
static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + out[out_index] = in[in_index]; + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + template __global__ void KeNearestNeighborInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { + const bool align_corners, FastDivModForInterpolate divmods) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) @@ -57,13 +139,8 @@ __global__ void KeNearestNeighborInterpFw( ? 
static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - if (data_layout == DataLayout::kNCHW) { - out[tid] = in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - } else { - out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; } } @@ -1292,11 +1369,25 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + platform::GpuLaunchConfig config_3d = + GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); + KeNearestNeighborInterpNCHWFw< + T><<>>( + input_data, in_h, in_w, output_data, out_h, out_w, nc, ratio_h, + ratio_w, align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, interp_divmods); + } } else if ("bilinear" == interp_method) { dim3 thread_num = config.thread_per_block; #ifdef WITH_NV_JETSON diff --git a/paddle/fluid/operators/ipu/CMakeLists.txt b/paddle/fluid/operators/ipu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..66373d4b5f6b91914e9bb1f3ed5b7fdd5dec37ea --- /dev/null +++ b/paddle/fluid/operators/ipu/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_IPU) + op_library(ipu_runtime_op DEPS ipu_backend) +endif(WITH_IPU) diff --git a/paddle/fluid/operators/ipu_runtime_op.h b/paddle/fluid/operators/ipu/ipu_runtime_op.cc similarity index 55% rename from paddle/fluid/operators/ipu_runtime_op.h rename to paddle/fluid/operators/ipu/ipu_runtime_op.cc index b6fc9ae98895d40d2e2d1c9eb02a63d200b0b1f8..3b6982d4b2b8e3fe29587e2e6cbbc16107326f78 100644 --- a/paddle/fluid/operators/ipu_runtime_op.h +++ b/paddle/fluid/operators/ipu/ipu_runtime_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,32 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. 
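Returning to the interpolate_v2 change above: the NHWC fast path replaces per-element `/` and `%` with precomputed FastDivMod divisions. A rough sketch of the index decomposition it performs, written with plain division for clarity (FastDivMod's magic-number internals are omitted, and DecomposeNHWC is an illustrative name):

```cpp
// How a flat output index splits into (n, h, w, c) for NHWC interpolation.
// Plain '/' and '%' stand in for FastDivMod::Divmod, which precomputes a
// magic multiplier so the GPU kernel avoids hardware integer division.
struct NHWCIndex {
  int n, h, w, c;
};

inline NHWCIndex DecomposeNHWC(int tid, int channels, int out_w, int out_chw) {
  NHWCIndex idx;
  idx.n = tid / out_chw;         // image id (out_id_h in the kernel)
  int out_id_w = tid % out_chw;  // offset inside one image (h * w * c)
  idx.c = tid % channels;        // channel is the fastest-moving axis
  const int wc = out_w * channels;
  idx.h = out_id_w / wc;               // row inside the image
  idx.w = (out_id_w % wc) / channels;  // column inside the row
  return idx;
}
```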
-#pragma once -#include -#include +#ifdef PADDLE_WITH_IPU #include "paddle/fluid/framework/op_registry.h" -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/framework/ipu/ipu_backend.h" -#include "paddle/fluid/framework/tensor.h" -#endif +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" namespace paddle { namespace operators { -template -class IpuRuntimeKernel : public framework::OpKernel { +class IpuRuntimeOp : public framework::OperatorBase { public: - void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef PADDLE_WITH_IPU - auto ipu_backend = framework::ipu::IpuBackend::GetInstance(); - if (!ipu_backend->DeviceIsAttached()) { - const platform::IPUDeviceContext& ipu_ctx = - reinterpret_cast( - ctx.device_context()); - ipu_backend->AttachDevice(ipu_ctx.DeviceId()); - } + IpuRuntimeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const { + auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + framework::RuntimeContext runtime_ctx(inputs_, outputs_, scope); + framework::ExecutionContext ctx(*this, scope, *dev_ctx, runtime_ctx); auto inputs = ctx.MultiInput("FeedList"); auto outputs = ctx.MultiOutput("FetchList"); auto output_names = ctx.OutputNames("FetchList"); @@ -58,12 +55,24 @@ class IpuRuntimeKernel : public framework::OpKernel { << "(" << dim << ")"; } } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Please compile WITH_IPU option to enable ipu_runtime op")); -#endif + } +}; + +class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FeedList", "FeedList of Graph").AsDuplicable(); + AddOutput("FetchList", "FetchList of Graph").AsDuplicable(); + AddComment(R"DOC( +Run graph by PopART runtime. +)DOC"); } }; } // namespace operators } // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker); + +#endif // PADDLE_WITH_IPU diff --git a/paddle/fluid/operators/ipu_runtime_op.cc b/paddle/fluid/operators/ipu_runtime_op.cc deleted file mode 100644 index 4b473da00f3318135f194dd90151fbfb39315fee..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/ipu_runtime_op.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
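A note on the design choice above: the old kernel-based op carried a `dtype` attribute purely so GetExpectedKernelType could pick a kernel, while the rewritten op derives from OperatorBase and builds its own ExecutionContext, since the IPU executes a whole compiled graph rather than one per-dtype kernel. The shape of the pattern, abbreviated (GraphRuntimeOp is an illustrative name, not the patch's class):

```cpp
// Abbreviated outline of the OperatorBase pattern used by the new
// IpuRuntimeOp above; the body that hands FeedList/FetchList to the
// backend runtime is elided.
class GraphRuntimeOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;  // inherit the 4-arg ctor

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
    framework::RuntimeContext runtime_ctx(inputs_, outputs_, scope);
    framework::ExecutionContext ctx(*this, scope, *dev_ctx, runtime_ctx);
    // ... feed ctx's inputs to the graph runtime, then fetch its outputs ...
  }
};
```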
- -#include "paddle/fluid/operators/ipu_runtime_op.h" - -namespace paddle { -namespace operators { - -class IpuRuntimeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::proto::VarType::Type(ctx.Attr("dtype")), - ctx.device_context()); - } -}; - -class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("FeedList", "FeedList of Graph").AsDuplicable(); - AddOutput("FetchList", "FetchList of Graph").AsDuplicable(); - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") - .SetDefault(framework::proto::VarType::FP32); - AddComment(R"DOC( -Run graph by PopART runtime. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker); - -REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel, - ops::IpuRuntimeKernel); diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 753b34484e41128f4c38332f4c0dd077fd42776b..c4bc3a7fda154f42a07e10b453bba70afa41c629 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -55,8 +55,8 @@ class OverflowOp : public framework::OperatorWithKernel { auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { dtype = x_var->Get().type(); - } else if (x_var->IsType()) { - dtype = x_var->Get().value().type(); + } else if (x_var->IsType()) { + dtype = x_var->Get().value().type(); } else { PADDLE_ENFORCE_EQ( true, false, diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 99db1c7e081dade476e0012275071719d4281b78..abed0e6903dd39d2b3447455f8982e3df24e73fd 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -58,8 +58,8 @@ class OverflowKernel : public framework::OpKernel { if (x->IsType()) { auto* in = ctx.Input("X"); functor(*in, out); - } else if (x->IsType()) { - auto& in = ctx.Input("X")->value(); + } else if (x->IsType()) { + auto& in = ctx.Input("X")->value(); functor(in, out); } else { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 316197ac23c850fde85b65659fe988e61d4b2173..3b48a41ed4f75eba33788f3139a3ff5ae85e300d 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -62,8 +62,8 @@ class OverflowV2Op : public framework::OperatorWithKernel { auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { dtype = x_var->Get().type(); - } else if (x_var->IsType()) { - dtype = x_var->Get().value().type(); + } else if (x_var->IsType()) { + dtype = x_var->Get().value().type(); } else { PADDLE_THROW(plat::errors::InvalidArgument( "Cannot find the input data type by all input data")); diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h index 03610d4589058e074f64940741df34bd8f66e379..15bb01a865d402f8da3fb7ed4178548c8da46b40 100644 --- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h +++ 
b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -13,241 +13,10 @@ // limitations under the License. #pragma once - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/kernels/funcs/eigen/extensions.h" +#include "paddle/pten/kernels/primitive/functor_primitives.h" namespace paddle { namespace operators { -namespace kernel_primitives { -namespace details { - -static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) { - return ::Eigen::numext::exp(x); -} - -static __device__ __forceinline__ float Exp(float x) { return expf(x); } - -static __device__ __forceinline__ double Exp(double x) { return exp(x); } - -static __device__ __forceinline__ platform::float16 Log(platform::float16 x) { - return ::Eigen::numext::log(x); -} - -static __device__ __forceinline__ float Log(float x) { return logf(x); } - -static __device__ __forceinline__ double Log(double x) { return log(x); } - -} // namespace details - -/******************************** Unary Functor *******************************/ - -/** - * @brief Default unary exp functor - */ -template -struct ExpFunctor { - HOSTDEVICE inline ExpFunctor() {} - - HOSTDEVICE explicit inline ExpFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(details::Exp(x)); - } -}; - -/** - * @brief Default unary identity functor - */ -template -struct IdentityFunctor { - HOSTDEVICE inline IdentityFunctor() {} - - HOSTDEVICE explicit inline IdentityFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(x); - } -}; - -/** - * @brief Default unary div functor. Divide by a constant - */ -template -struct DivideFunctor { - private: - using MPType = typename ::paddle::operators::details::MPTypeTrait::Type; - - public: - HOSTDEVICE inline DivideFunctor() { n_inv = static_cast(1.0f); } - - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(static_cast(x) * n_inv); - } - - private: - MPType n_inv; -}; - -/** - * @brief Default inverse functor - */ -template -struct InverseFunctor { - HOSTDEVICE inline InverseFunctor() {} - - HOSTDEVICE explicit inline InverseFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(-x); - } -}; - -/** - * @brief Default unary square functor - */ -template -struct SquareFunctor { - HOSTDEVICE inline SquareFunctor() {} - - HOSTDEVICE explicit inline SquareFunctor(int n) {} - - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(x) * static_cast(x); - } -}; - -/****************************** Binary Functor ********************************/ - -/** - * @brief Default binary min functor - */ -template -struct MinFunctor { - inline T initial() { return static_cast(std::numeric_limits::max()); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return (b < a) ? b : a; - } -}; - -/** - * @brief Default binary max functor - */ -template -struct MaxFunctor { - inline T initial() { - return static_cast(std::numeric_limits::lowest()); - } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return (b > a) ? 
b : a; - } -}; - -/** - * @brief Default binary add functor - */ -template -struct AddFunctor { - inline T initial() { return static_cast(0.0f); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b + a; - } -}; - -/** - * @brief Default binary add functor - */ -template -struct MulFunctor { - inline T initial() { return static_cast(1.0f); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b * a; - } -}; - -/** - * @brief Default binary logic or functor - */ -template -struct LogicalOrFunctor { - inline T initial() { return static_cast(false); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b || a; - } -}; - -/** - * @brief Default binary logic and functor - */ -template -struct LogicalAndFunctor { - inline T initial() { return static_cast(true); } - - __device__ __forceinline__ T operator()(const T a, const T b) const { - return b && a; - } -}; - -/** - * @brief Default binary sub functor - */ -template -struct SubFunctor { - inline T initial() { return static_cast(0.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; } -}; - -/** - * @brief Default binary div functor - */ -template -struct DivFunctor { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -template -struct DivFunctor::value>::type> { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { - // For int32/int64, need to check whether the divison is zero. - PADDLE_ENFORCE_NE(b, 0, - platform::errors::InvalidArgument( - "Integer division by zero encountered " - "in (floor) divide. Please check the input value.")); - return a / b; - } -}; - -/** - * @brief Default binary floor divide functor - */ -template -struct FloorDivFunctor { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { - PADDLE_ENFORCE_NE(b, 0, - platform::errors::InvalidArgument( - "Integer division by zero encountered " - "in (floor) divide. Please check the input value.")); - return static_cast(std::trunc(a / b)); - } -}; - -} // namespace kernel_primitives +namespace kernel_primitives = pten::kps; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 558f8c81c66428ca0561806b8021f09261e32e3b..4ec3741bc91bb58a183ee9a2ff106461c6d71d05 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -13,61 +13,10 @@ // limitations under the License. 
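The one-line replacement above keeps every existing call site compiling: a namespace alias makes `paddle::operators::kernel_primitives::Foo` resolve to `pten::kps::Foo`. A self-contained illustration of the mechanism (the functor below is a stand-in, not the real pten code):

```cpp
#include <cstdio>

namespace pten {
namespace kps {
// Stand-in for one of the functors that moved to pten.
template <typename T>
struct IdentityFunctor {
  T operator()(const T x) const { return x; }
};
}  // namespace kps
}  // namespace pten

namespace paddle {
namespace operators {
// The old spelling keeps working after the migration.
namespace kernel_primitives = pten::kps;
}  // namespace operators
}  // namespace paddle

int main() {
  paddle::operators::kernel_primitives::IdentityFunctor<int> f;
  std::printf("%d\n", f(42));  // prints 42
}
```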
#pragma once -#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" -#ifdef PADDLE_WITH_XPU2 -#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h" -#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h" -#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h" - -#define KPStream XPUStream -#define KPDevice paddle::platform::XPUDeviceContext -#define _ptr_ _global_ptr_ -#define __forceinline__ __inline__ -#define __restrict__ - -#define THREAD_ID_X core_id() -#define THREAD_ID_Y 0 -#define THREAD_ID_Z 0 - -#define BLOCK_NUM_X core_num() -#define BLOCK_NUM_Y 0 -#define BLOCK_NUM_Z 0 - -#define BLOCK_ID_X cluster_id() -#define BLOCK_ID_Y 0 -#define BLOCK_ID_Z 0 - -#define GRID_NUM_X cluster_num() -#define GRID_NUM_Y 0 -#define GRID_NUM_Z 0 -#else -#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" -#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" -#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" - -#define KPStream gpuStream_t -#define KPDevice paddle::platform::CUDADeviceContext -#define _ptr_ - -#define THREAD_ID_X threadIdx.x -#define THREAD_ID_Y threadIdx.y -#define THREAD_ID_Z threadIdx.z - -#define BLOCK_NUM_X blockDim.x -#define BLOCK_NUM_Y blockDim.y -#define BLOCK_NUM_Z blockDim.z - -#define BLOCK_ID_X blockIdx.x -#define BLOCK_ID_Y blockIdx.y -#define BLOCK_ID_Z blockIdx.z - -#define GRID_NUM_X gridDim.x -#define GRID_NUM_Y gridDim.y -#define GRID_NUM_Z gridDim.z -#endif +#include "paddle/pten/kernels/primitive/kernel_primitives.h" namespace paddle { namespace operators { -namespace kernel_primitives {} +namespace kernel_primitives = pten::kps; } } diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 0c1f58a2f30f68c184906a0cebd78da98a83d952..bc00d875cd1dd37b64ae8a38c6949054bc168c7c 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -23,6 +23,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -35,6 +36,8 @@ using CudnnDataType = platform::CudnnDataType; template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; +#define LN_NUM_COLS 1024 + inline static int GetDesiredBlockDim(int64_t block_dim) { #ifdef __HIPCC__ const int kMaxBlockDim = 256; @@ -169,6 +172,118 @@ __inline__ __device__ half rsqrt_(const half val) { } #endif +#ifdef PADDLE_WITH_CUDA +template +__global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( + int rows, int cols, const float epsilon, const T *__restrict__ x_ptr, + const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, + U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, + T *__restrict__ y_ptr) { + using Vec = platform::AlignedVector; + using Vec_scale = platform::AlignedVector; + + const int tidx = threadIdx.x; + const int bidx = blockIdx.x; + const int lane = tidx % THREADS_PER_WARP; // 0, 1, ..., 31 + const int warp = tidx / THREADS_PER_WARP; // 0, 1, 2, 3 + const int warp_n = warp % WARPS_N; // 0 + const int warp_m = warp / WARPS_N; // 0, 1, 2, 3 + + const int c = warp_n * THREADS_PER_WARP + lane; // lane + const int r = bidx * ROWS_PER_CTA + warp_m; // row id + + Vec_scale gamma[LDGS]; + Vec_scale beta[LDGS]; +#pragma 
unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + platform::Load(beta_ptr + col * VecSize, &beta[it]); + col += THREADS_PER_ROW; + } + + constexpr U rn = 1.f / U(LN_NUM_COLS); + for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { + Vec x[LDGS]; +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, + &x[it]); + col += THREADS_PER_ROW; + } + U xf[LDGS * VecSize]; + + U mu_local = 0.f; + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + xf[it * VecSize + jt] = U(x[it][jt]); + mu_local += xf[it * VecSize + jt]; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); + } + mu_local *= rn; + if (lane == 0) { + mean_out_ptr[row] = mu_local; + } + U var_local = 0.f; + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + U diff = xf[it * VecSize + jt] - mu_local; + var_local += diff * diff; + } + } + +#pragma unroll + for (int it = 1; it < THREADS_PER_WARP; it *= 2) { + var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); + } + // Note: to assure if it is right for double + U rsigma = rsqrtf(var_local * rn + epsilon); + if (lane == 0) { + var_out_ptr[row] = var_local * rn; + } + +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // use fp16 to compute + // ScaleT tmp = static_cast(rsigma * (xf[it * VecSize + jt] - + // mu_local)); + // x[it][jt] = gamma[it][jt] * tmp + beta[it][jt]; + // cast to fp32 to compute + U tmp = (rsigma * (static_cast(xf[it * VecSize + jt]) - mu_local)); + x[it][jt] = static_cast(static_cast(gamma[it][jt]) * tmp + + static_cast(beta[it][jt])); + } + } + +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + platform::Store(x[it], + y_ptr + row * LN_NUM_COLS + col * VecSize); + col += THREADS_PER_ROW; + } + } +} +#endif + template using LayerNormScaleBiasT = typename std::conditional::type; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 7725f336416dbb80e0f65a38b6a4f16c88fb799f..ef4f0c6ba7063d4ff39732aed85ab5bbe007e7ca 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -112,11 +112,49 @@ class LayerNormKernel } \ } while (0) - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); +#ifdef PADDLE_WITH_CUDA + bool can_call_1024_kernel = false; + if (feature_size == 1024 && scale != nullptr && bias != nullptr) { + can_call_1024_kernel = true; + } + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + const int grid = static_cast( + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); + if (is_scale_bias_same_dtype_with_x) { + ln_fwd_1024_kernel<<>>( + batch_size, feature_size, epsilon, x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), mean_data, var_data, + y_data); + } else { + ln_fwd_1024_kernel<<>>( + batch_size, feature_size, epsilon, x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), mean_data, var_data, + y_data); + } } else { - 
PADDLE_LAUNCH_LAYERNORM_FWD(U, false); +#endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA } +#endif + #undef PADDLE_LAUNCH_LAYERNORM_FWD } }; diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 66160695c3d5aa9f7b18ea84156236752e42ae8e..89ad4325a5a534bb246c7017cafab3b96239b463 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -50,7 +50,7 @@ class LoadOpKernel : public framework::OpKernel { if (out_var->IsType()) { LoadLodTensor(fin, place, out_var, ctx); - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { LoadSelectedRows(fin, place, out_var); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,7 +105,7 @@ class LoadOpKernel : public framework::OpKernel { void LoadSelectedRows(std::istream &fin, const platform::Place &place, framework::Variable *var) const { - auto *selectedRows = var->GetMutable(); + auto *selectedRows = var->GetMutable(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index 70aad1d3238f2f8fe65c9a3e8bedeb1fd0762e1a..475d0922ccc693bab14000c24413c55b626833e1 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -29,7 +29,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; template diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 89c84d9e14377315659efc1f3b8a5a9d0406b336..7a32e13122852c7c8e4f00cb03a7d0c85e727e05 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -151,7 +151,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto *ids = context.Input("Ids"); auto *table = context.Input("W"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index a89d5fb7cb6e5db4546e0ff2e90bf9d722e7cd82..91b7f91c8e3bc5319db28a9585c029230d7a33e8 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -28,7 +28,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -82,8 +82,8 @@ class LookupTableKernel : public framework::OpKernel { } } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); int64_t row_width = table_t.value().dims()[1]; const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); @@ -155,8 +155,8 @@ class LookupTableGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = 
context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -171,7 +171,8 @@ class LookupTableGradKernel : public framework::OpKernel { if (is_sparse) { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 44a6151f1b6ce665932a5c9b5f84c9cd2c817ab3..74ad0e4978b4ec6b3aa5553fc0a6202286ea6ffd 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -152,7 +152,8 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { auto *ids = context.Input("Ids"); auto *table = context.Input("W"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 54564395c6d04cebd8861c378c5aa34c899ffd7f..6ea9e58198fbffff5729ed7799a38f5dfece4b35 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -29,7 +29,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; @@ -86,8 +86,8 @@ class LookupTableV2Kernel : public framework::OpKernel { row_width * sizeof(T)); } } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); int64_t row_width = table_t.value().dims()[1]; const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); @@ -132,8 +132,8 @@ class LookupTableV2GradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -148,7 +148,8 @@ class LookupTableV2GradKernel : public framework::OpKernel { if (is_sparse) { auto *ids_t = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); - auto *d_table = context.Output(framework::GradVarName("W")); + auto *d_table = + context.Output(framework::GradVarName("W")); int64_t ids_num = ids_t->numel(); std::vector ids; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index f2d1e79f03524a8cd7d20ec9aefb205c5e12bb0b..2672d02db008e7aadd00d79669e4ab07c36011b5 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/kernels/funcs/eigen/common.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -52,6 +53,18 @@ template struct SetConstant>; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + #ifdef PADDLE_WITH_XPU template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 7c50ba630dbd91ef8c6d51cbde862336b5ab83cb..a94bb594be5f9d8c3c5eeb58f524161287dc0607 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -227,11 +227,11 @@ template struct MatrixBitCodeFunctorMulGradWeightSR : public boost::static_visitor { const framework::Tensor &tmat_; - framework::SelectedRows *weight_; + pten::SelectedRows *weight_; const framework::Tensor &input_; MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, - framework::SelectedRows *weight, + pten::SelectedRows *weight, const framework::Tensor &input) : tmat_(tmat), weight_(weight), input_(input) {} @@ -274,7 +274,7 @@ struct MatrixBitCodeFunctorMulGradWeightSR template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, - framework::SelectedRows *weight, + pten::SelectedRows *weight, const framework::Tensor &input) { MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); code_table_.apply_visitor(func); diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 71d905214ab9f57013bb553179ab6e75116af76d..13ddd27cbf0d7bd1dc1adbd8bfa278827107787a 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -252,8 +252,7 @@ class MatrixBitCodeFunctor { /* For SelectedRows Weight, For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::Tensor& tmat, - framework::SelectedRows* weight, + void MulGradWeight(const framework::Tensor& tmat, pten::SelectedRows* weight, const framework::Tensor& input); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f6178eb0a1eb6e8a4d1886443ec77b945c3b182f..8cd3e1367d86d9bc31e4b12af8baa25144cd14f2 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -24,9 +24,9 @@ namespace math { template struct SelectedRowsAdd { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output) { + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, + pten::SelectedRows* output) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2.height(), @@ -94,7 +94,7 @@ template struct SelectedRowsAdd; template struct SelectedRowsAddTensor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& 
input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); auto in2_dims = input2.dims(); @@ -154,9 +154,8 @@ template struct SelectedRowsAddTensor; template struct SelectedRowsAddTo { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, - framework::SelectedRows* input2) { + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2->height(), @@ -198,9 +197,9 @@ template struct SelectedRowsAddTo; template struct SelectedRowsSumTo { void operator()(const platform::CPUDeviceContext& context, - const std::vector& input1, + const std::vector& input1, const std::vector& input2_offsets, - framework::SelectedRows* input2) { + pten::SelectedRows* input2) { // Ensure all selected rows have the same height size_t size = 0u; for (auto iter = input1.begin(); iter != input1.end(); ++iter) { @@ -242,8 +241,7 @@ template struct SelectedRowsSumTo; template struct SelectedRowsAddToTensor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + const pten::SelectedRows& input1, framework::Tensor* input2) { if (UNLIKELY(input1.rows().size() == 0)) { LOG(WARNING) << "input selected rows is empty!"; return; @@ -313,7 +311,7 @@ typename std::enable_if::value>::type elementwise_add_to( template typename std::enable_if::value>::type -add_sparse_inputs(const std::vector& inputs, +add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, int64_t input_width, const platform::CPUDeviceContext& context, T* out_data) { @@ -347,7 +345,7 @@ add_sparse_inputs(const std::vector& inputs, template typename std::enable_if::value>::type -add_sparse_inputs(const std::vector& inputs, +add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, int64_t input_width, const platform::CPUDeviceContext& context, T* out_data) { @@ -371,32 +369,31 @@ add_sparse_inputs(const std::vector& inputs, template struct MergeAdd { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CPUDeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out, sorted_result); return out; } void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { - std::vector inputs; + std::vector inputs; inputs.push_back(&input); (*this)(context, inputs, output, sorted_result); } void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! 
return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -409,7 +406,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -480,24 +477,23 @@ struct MergeAdd { #ifdef PADDLE_WITH_XPU template struct MergeAdd { - framework::SelectedRows operator()(const platform::XPUDeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::XPUDeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out, sorted_result); return out; } void operator()(const platform::XPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; } - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); auto input_width = input.value().dims()[1]; @@ -537,14 +533,14 @@ struct MergeAdd { } void operator()(const platform::XPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -557,7 +553,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -628,29 +624,28 @@ struct MergeAdd { #endif template struct MergeAverage { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CPUDeviceContext& context, + const pten::SelectedRows& input) { + pten::SelectedRows out; (*this)(context, input, &out); return out; } void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output) { - std::vector inputs; + const pten::SelectedRows& input, pten::SelectedRows* output) { + std::vector inputs; inputs.push_back(&input); (*this)(context, inputs, output); } void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output) { + const std::vector& inputs, + pten::SelectedRows* output) { if (inputs.size() == 0) { VLOG(3) << "no input! 
return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -663,7 +658,7 @@ struct MergeAverage { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; size_t row_num = 0; for (auto* input : inputs) { @@ -750,7 +745,7 @@ template struct MergeAverage; template struct UpdateToTensor { void operator()(const platform::CPUDeviceContext& context, - const ScatterOps& op, const framework::SelectedRows& input1, + const ScatterOps& op, const pten::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 654a5653cbed1f5595c7e24e0f3da0516d582926..2ae2aaebb6c5324b82e1347d464835c3f0bc4068 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -26,9 +26,9 @@ namespace math { template struct SelectedRowsAdd { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output) { + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, + pten::SelectedRows* output) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2.height(), @@ -117,7 +117,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, template struct SelectedRowsAddTensor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); auto in2_dims = input2.dims(); @@ -182,9 +182,8 @@ template struct SelectedRowsAddTensor struct SelectedRowsAddTo { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, - framework::SelectedRows* input2) { + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2) { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ( in1_height, input2->height(), @@ -250,8 +249,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, template struct SelectedRowsAddToTensor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + const pten::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( @@ -320,24 +318,23 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, template struct MergeAdd { - framework::SelectedRows operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false) { - framework::SelectedRows out; + pten::SelectedRows operator()(const platform::CUDADeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false) { + pten::SelectedRows out; (*this)(context, input, &out); return out; } void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, + 
const pten::SelectedRows& input, pten::SelectedRows* output, const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; } - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows_cpu(row_set.begin(), row_set.end()); framework::Vector merge_rows(merge_rows_cpu); @@ -368,14 +365,14 @@ struct MergeAdd { } void operator()(const platform::CUDADeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { VLOG(3) << "no input! return"; return; } - const framework::SelectedRows* has_value_input = nullptr; + const pten::SelectedRows* has_value_input = nullptr; for (auto* in : inputs) { if (in->rows().size() > 0) { has_value_input = in; @@ -388,7 +385,7 @@ struct MergeAdd { } auto input_width = has_value_input->value().dims()[1]; auto input_height = has_value_input->height(); - framework::SelectedRows& out = *output; + pten::SelectedRows& out = *output; std::set merged_row_set; for (auto* input : inputs) { if (input->rows().size() == 0) { @@ -499,7 +496,7 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, template struct UpdateToTensor { void operator()(const platform::CUDADeviceContext& context, - const ScatterOps& op, const framework::SelectedRows& input1, + const ScatterOps& op, const pten::SelectedRows& input1, framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 8ba7851d7b979aec33318a237a6c74a15d296e1a..690082036c5e0a4b8da99abc2a4aae588ab6fe31 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -35,15 +35,14 @@ namespace math { template struct SelectedRowsAdd { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* output); + const pten::SelectedRows& input1, + const pten::SelectedRows& input2, pten::SelectedRows* output); }; template struct SelectedRowsAddTensor { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, + const pten::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output); }; @@ -51,17 +50,17 @@ struct SelectedRowsAddTensor { template struct SelectedRowsAddTo { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const int64_t input2_offset, framework::SelectedRows* input2); + const pten::SelectedRows& input1, const int64_t input2_offset, + pten::SelectedRows* input2); }; // input2 = [all input in input1] + input2 template struct SelectedRowsSumTo { void operator()(const DeviceContext& context, - const std::vector& input1, + const std::vector& input1, const std::vector& input2_offsets, - framework::SelectedRows* input2); + pten::SelectedRows* input2); }; // FIXME: The result of SelectedRowsAddToTensor maybe non deterministic, @@ -70,8 +69,7 @@ struct SelectedRowsSumTo { template struct SelectedRowsAddToTensor { void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - framework::Tensor* input2); + const pten::SelectedRows& input1, framework::Tensor* input2); }; namespace scatter { 
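The scatter::MergeAdd declarations in the next hunk are the most widely re-typed API in this patch, so it is worth pinning down what the functor computes: duplicated row indices in the input SelectedRows are collapsed and their value rows summed. A minimal reference sketch using only STL types (MergeAddReference is a hypothetical name, not code from this patch):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// Row-merge semantics of scatter::MergeAdd, shown for row_numel == 1:
// rows {0, 4, 4, 7} with values {1, 2, 3, 4} merge to rows {0, 4, 7}
// with values {1, 5, 4}.
std::vector<std::pair<int64_t, float>> MergeAddReference(
    const std::vector<int64_t>& rows, const std::vector<float>& values) {
  std::map<int64_t, float> merged;  // ordered keys give sorted output rows
  for (size_t i = 0; i < rows.size(); ++i) {
    merged[rows[i]] += values[i];  // duplicated row indices accumulate
  }
  return {merged.begin(), merged.end()};
}

The sorted_result flag in the real interface controls whether the merged rows must come back ordered; the std::map above always sorts, which corresponds to the sorted_result == true case.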
@@ -80,29 +78,25 @@ template struct MergeAdd { // unary functor, merge by adding duplicated rows in // the input SelectedRows object. - framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input, - const bool sorted_result = false); + pten::SelectedRows operator()(const DeviceContext& context, + const pten::SelectedRows& input, + const bool sorted_result = false); + void operator()(const DeviceContext& context, const pten::SelectedRows& input, + pten::SelectedRows* output, const bool sorted_result = false); void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output, - const bool sorted_result = false); - void operator()(const DeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output, - const bool sorted_result = false); + const std::vector& inputs, + pten::SelectedRows* output, const bool sorted_result = false); }; template struct MergeAverage { - framework::SelectedRows operator()(const DeviceContext& context, - const framework::SelectedRows& input); - void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output); + pten::SelectedRows operator()(const DeviceContext& context, + const pten::SelectedRows& input); + void operator()(const DeviceContext& context, const pten::SelectedRows& input, + pten::SelectedRows* output); void operator()(const DeviceContext& context, - const std::vector& inputs, - framework::SelectedRows* output); + const std::vector& inputs, + pten::SelectedRows* output); }; enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; @@ -111,8 +105,7 @@ enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; template struct UpdateToTensor { void operator()(const DeviceContext& context, const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2); + const pten::SelectedRows& input1, framework::Tensor* input2); }; } // namespace scatter diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index e0b368164906626e99a35cffc406a2f30edcc388..19e70f924f15e7d2a7d33a17911b711fc812b501 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -27,8 +27,8 @@ TEST(selected_rows_functor, cpu_add) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -37,8 +37,8 @@ functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -46,8 +46,7 @@ cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; auto* out_value = output->mutable_value(); // simply concat two SelectedRows @@ -130,8 +129,8 @@ TEST(selected_rows_functor, cpu_add_to) { int64_t row_numel = 10; std::vector
rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -140,8 +139,8 @@ TEST(selected_rows_functor, cpu_add_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -149,8 +148,7 @@ TEST(selected_rows_functor, cpu_add_to) { cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); @@ -230,8 +228,8 @@ TEST(selected_rows_functor, cpu_merge_average_float) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -242,8 +240,7 @@ TEST(selected_rows_functor, cpu_merge_average_float) { paddle::operators::math::scatter::MergeAverage< paddle::platform::CPUDeviceContext, float> merge_average_functor; - paddle::framework::SelectedRows output = - merge_average_functor(ctx, *selected_rows); + pten::SelectedRows output = merge_average_functor(ctx, *selected_rows); auto out_height = output.height(); EXPECT_EQ(out_height, height); @@ -270,8 +267,8 @@ TEST(selected_rows_functor, cpu_merge_add_float) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -279,8 +276,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { cpu_place); functor(ctx, in_value, 1.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; paddle::operators::math::scatter::MergeAdd @@ -311,8 +307,8 @@ TEST(selected_rows_functor, cpu_merge_add_int) { int64_t row_numel = 10; std::vector rows{0, 4, 4, 7}; - std::unique_ptr selected_rows{ - new paddle::framework::SelectedRows(rows, height)}; + std::unique_ptr selected_rows{ + new pten::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( paddle::framework::make_ddim( @@ -320,8 +316,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { cpu_place); functor(ctx, in_value, 1); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; paddle::operators::math::scatter::MergeAdd @@ -354,8 +349,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { int64_t row_numel = 8; std::vector rows1{5, 2, 5, 3, 5}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -364,8 +359,8 @@ 
TEST(selected_rows_functor, cpu_merge_add_multi) { set_const(ctx, in1_value, 1.0); std::vector rows2{2, 5, 3, 5, 3}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -373,14 +368,13 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { cpu_place); set_const(ctx, in2_value, 1.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); @@ -411,8 +405,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { int64_t row_numel = 8; std::vector rows1{1, 3, 5, 7, 9}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -421,8 +415,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { set_const(ctx, in1_value, 1.0); std::vector rows2{0, 2, 4, 6, 8}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -430,14 +424,13 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { cpu_place); set_const(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); @@ -472,8 +465,8 @@ TEST(selected_rows_functor, cpu_sum_to) { int64_t height = 10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -482,8 +475,8 @@ TEST(selected_rows_functor, cpu_sum_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -491,8 +484,7 @@ TEST(selected_rows_functor, cpu_sum_to) { cpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); // simply concat two SelectedRows @@ -501,7 +493,7 @@ TEST(selected_rows_functor, cpu_sum_to) { paddle::operators::math::SelectedRowsSumTo sum_to_functor; - sum_to_functor(ctx, std::vector( + sum_to_functor(ctx, std::vector(
{selected_rows1.get(), selected_rows2.get()}), std::vector({0, in1_value->numel()}), output.get()); auto out_height = output->height(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index ebcd97b32c4a30d76b844546b4e5cd7d177be192..e826c2a7244f719df28ea57a074093d211fe5e6e 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -29,8 +29,8 @@ TEST(selected_rows_functor, gpu_add) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -48,8 +48,8 @@ TEST(selected_rows_functor, gpu_add) { #endif std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -57,8 +57,7 @@ TEST(selected_rows_functor, gpu_add) { gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; auto* out_value = output->mutable_value(); // simply concat two SelectedRows @@ -152,8 +151,8 @@ TEST(selected_rows_functor, gpu_add_to) { int64_t row_numel = 10; std::vector rows1{0, 4, 7}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -162,8 +161,8 @@ TEST(selected_rows_functor, gpu_add_to) { functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -171,8 +170,7 @@ TEST(selected_rows_functor, gpu_add_to) { gpu_place); functor(ctx, in2_value, 2.0); - std::unique_ptr output{ - new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); auto* out_value = output->mutable_value(); @@ -264,8 +262,8 @@ TEST(selected_rows_functor, gpu_merge_add) { int64_t row_numel = 8; std::vector rows1{5, 2, 5, 3, 5}; - std::unique_ptr selected_rows1{ - new paddle::framework::SelectedRows(rows1, height)}; + std::unique_ptr selected_rows1{ + new pten::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( paddle::framework::make_ddim( @@ -274,8 +272,8 @@ TEST(selected_rows_functor, gpu_merge_add) { set_const(ctx, in1_value, 1.0); std::vector rows2{2, 5, 3, 5, 3}; - std::unique_ptr selected_rows2{ - new paddle::framework::SelectedRows(rows2, height)}; + std::unique_ptr selected_rows2{ + new pten::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( paddle::framework::make_ddim( @@ -283,14 +281,13 @@ TEST(selected_rows_functor, gpu_merge_add) { gpu_place); set_const(ctx, in2_value, 1.0); - std::unique_ptr output{ - 
new paddle::framework::SelectedRows()}; + std::unique_ptr output{new pten::SelectedRows()}; output->set_height(height); paddle::operators::math::scatter::MergeAdd< paddle::platform::CUDADeviceContext, float> merge_add_functor; - std::vector inputs; + std::vector inputs; inputs.push_back(selected_rows1.get()); inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index e1b81c0c59241a9e0fbc4c5615d74aba47074764..bdedb8e7d29458fec231865879a4aa706bdbedbb 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -51,7 +51,7 @@ class MemcpyD2HFunctor { } } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT supported yet.")); diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 7f4870010403070b7b07e44c80f32e3162179795..c9995eeca16cd42aaf8d69229a15dde7d949ea72 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -59,7 +59,7 @@ class MemcpyH2DFunctor { out_tensor.set_lod(lod_tensor.lod()); } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT supported yet.")); diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index ac4a0d1ab111ed250edf620faefe0e98a28ea78d..40c7aceda5116075d9498903fd788f3a10e080ad 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -75,7 +75,7 @@ class MemcpyFunctor { out_tensor.set_lod(lod_tensor.lod()); } - void operator()(const framework::SelectedRows &rows) const { + void operator()(const pten::SelectedRows &rows) const { // (JZ-LIANG) to support SelectedRows PADDLE_THROW(platform::errors::Unimplemented( "Memcpy for SelectedRows is NOT supported yet.")); diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h index 4c977e94b175c988e4253b273365b0cabc4b87aa..0fe262dea3b1352e590f9018b9152e7537299108 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.h +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -24,8 +24,8 @@ template class MergeSelectedRowsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); math::scatter::MergeAdd merge_func; merge_func(context.template device_context(), *x, out); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 67b6b3ec1614dd51adc62cf418d9eadadf276ca9..82d7c56aea1234bec8c1d22cc10717c100fbe369 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1137,5 +1137,28 @@ class MLUCnnl { void* output); }; +template +inline void TransposeFromMLUTensor(const ExecutionContext& ctx, + const std::vector perm, + const Tensor* transformed_input, + Tensor* transformed_output, + bool need_reshape_or_alloc) { + auto in_dims_vec = framework::vectorize(transformed_input->dims()); + if (need_reshape_or_alloc) {
transformed_output->mutable_data( + {in_dims_vec[perm[0]], in_dims_vec[perm[1]], in_dims_vec[perm[2]], + in_dims_vec[perm[3]]}, + ctx.GetPlace()); + } + MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + MLUCnnl::Transpose(ctx, perm, in_dims_vec.size(), trans_in_desc.get(), + GetBasePtr(transformed_input), trans_out_desc.get(), + GetBasePtr(transformed_output)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 55f684b66485bb3d23b443000fdbe35c35332486..edd2ae4ca9c87c0c06e41913a4829a5ff057c82a 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -31,7 +31,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using Sampler = math::Sampler; using DDim = framework::DDim; @@ -364,8 +364,8 @@ class NCEGradKernel : public framework::OpKernel { DDim table_dim; if (table_var->IsType()) { table_dim = context.Input("Weight")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("Weight"); + } else if (table_var->IsType()) { + auto *table_t = context.Input("Weight"); table_dim = table_t->value().dims(); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -373,7 +373,8 @@ class NCEGradKernel : public framework::OpKernel { "must be either LoDTensor or SelectedRows")); } - auto d_w = context.Output(framework::GradVarName("Weight")); + auto d_w = + context.Output(framework::GradVarName("Weight")); d_w->set_rows(labels); d_w->set_height(table_dim[0]); diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 255dc5bb083114c4bc85739c621f3558d153cc93..31d3e1208dadb72ed9add4d90ad68ca189411f8f 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -111,7 +111,7 @@ size_t FindPos(const std::vector& rows, int64_t value) { template struct SparseAdagradFunctor { void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& grad, + const pten::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 8b939b7c6b3ba275ef050abc08636ea9c8740621..a7c32255bd1ee060435abf1e4d80cf05e4d979ed 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -72,7 +72,7 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, template struct SparseAdagradFunctor { void operator()(const platform::CUDADeviceContext& context, - const framework::SelectedRows& grad, + const pten::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. 
g_m.rows = set(g.rows) diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 057bd4e863ddf7ae27b54ee784174e1452619395..c2dc3f095ed99de2917f70aed27dc1a6b2a8bb4c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -22,16 +22,15 @@ namespace operators { template struct SparseAdagradFunctor { - void operator()(const DeviceContext &context, - const framework::SelectedRows &grad, + void operator()(const DeviceContext &context, const pten::SelectedRows &grad, const framework::Tensor &learning_rate, T epsilon, framework::Tensor *moment, framework::Tensor *param); }; template -framework::SelectedRows SquareSelectedRows( - const DeviceContext &context, const framework::SelectedRows &input) { - framework::SelectedRows out; +pten::SelectedRows SquareSelectedRows(const DeviceContext &context, + const pten::SelectedRows &input) { + pten::SelectedRows out; out.set_rows(input.rows()); out.set_height(input.height()); out.mutable_value()->mutable_data(input.value().dims(), @@ -88,7 +87,7 @@ class AdagradOpKernel : public framework::OpKernel { param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { auto *param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor, platform::errors::InvalidArgument( @@ -101,7 +100,7 @@ class AdagradOpKernel : public framework::OpKernel { SparseAdagradFunctor functor; functor(ctx.template device_context(), - *ctx.Input("Grad"), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 1ef46ef085c5d73b63ed25ef353cb1477a17776c..c7ffb53a0588267ef205971d5899d5aa36168072 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -314,8 +314,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -330,8 +330,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index bb044b4b4986e3ec9cecc38cc52cf53cddcb45f9..bcc314cd57c017b577d8370a6e593366364dbdd9 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -521,8 +521,8 @@ class AdamOpKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())[0] = beta2 * beta2_pow->data()[0]; } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -537,8 +537,8 @@ class AdamOpKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* 
grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index e462c20c7f51db8195c3acba019d0aa225005dce..fd83b76e02a24f86899c851efe3f773873dc50ce 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -195,8 +195,8 @@ class AdamOpXPUKernel : public framework::OpKernel { xpu_wait(dev_ctx.x_context()->xpu_stream); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); auto& dev_ctx = ctx.template device_context(); if (grad->rows().size() == 0) { @@ -213,8 +213,8 @@ class AdamOpXPUKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index a8b16e73dbfffe69e2b4b10371b30f3c77305696..8bce415cb1ab9835d6c87c9617e9147187b2a2c8 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -337,8 +337,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { beta2_pow_out->mutable_data(ctx.GetPlace())); } } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; return; @@ -353,8 +353,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = grad; } else { diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 6bf8c8d724fb892ed934d4c1ee9305e641b62851..9c9355921d8273ea1e7f587e43eaa0bdc6a80838 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -180,11 +180,11 @@ class FTRLOpKernel : public framework::OpKernel { } s_acc_out.device(place) = sq_accum + g * g; - } else if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); + } else if (grad_var->IsType()) { + auto grad = ctx.Input("Grad"); - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows* merged_grad = &tmp_merged_grad; + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index 9a3eaa66caa8e870f2692c67aea29535dbd7492a..f1158703f028b6c2ebbb9e1596b240e81b5b0b2b 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -552,7 +552,7 @@ class LambOpKernel : public framework::OpKernel { trust_ratio_div_ptr, skip_update_flag); for_range(moment_update_functor); } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { PADDLE_ENFORCE_EQ(IsMultiPrecision, false, platform::errors::Unimplemented( "SelectedRows gradient is not supported when " @@ -562,7 +562,7 @@ class LambOpKernel : public framework::OpKernel { 
platform::errors::Unimplemented( "SelectedRows gradient is not supported when " "multi_precision=True.")); - auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), + auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Lamb"); if (grad.rows().size() == 0) { VLOG(3) << "grad row size is 0!!"; @@ -578,8 +578,8 @@ class LambOpKernel : public framework::OpKernel { } } - framework::SelectedRows tmp_grad_merge; - const framework::SelectedRows* grad_merge_ptr; + pten::SelectedRows tmp_grad_merge; + const pten::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc index 450ef376799d3d383f3fa55f65850ca73e6c51a3..ee3111c7dd6a09c22682b738ec6a1ea9525134d9 100644 --- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc +++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc @@ -48,7 +48,7 @@ class SGDOneDNNKernel : public SGDOpKernel { VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 2d713308fd938996d45badf1549d2a60d6c8c4ec..79d76d52f48c8c2d5f1c62f4cd08977ce268c573 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -27,7 +27,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using framework::SelectedRows; +using pten::SelectedRows; struct NoNesterov; struct UseNesterov; @@ -545,9 +545,9 @@ class MomentumOpKernel : public framework::OpKernel { } } - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // sparse update embedding with selectedrows - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); // sparse update maybe empty. 
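// Reviewer aside: the tmp_grad_merge / grad_merge_ptr sequence repeated in
// the Adam, AdamW, and LAMB hunks above (FTRL, RMSProp, and the momentum
// path below merge unconditionally) is one pattern: reuse the gradient
// as-is when its rows are already unique and sorted, otherwise materialize
// a merged copy first. A minimal sketch with hypothetical names; this is
// not code from the patch, and the merge functor is abstracted away:
template <typename SelectedRowsT, typename MergeAddFn>
const SelectedRowsT* ResolveMergedGrad(const SelectedRowsT* grad,
                                       bool is_strict_sorted,
                                       SelectedRowsT* tmp_grad_merge,
                                       MergeAddFn merge_add) {
  if (is_strict_sorted) {
    return grad;  // rows unique and ordered: skip the extra merge kernel
  }
  merge_add(*grad, tmp_grad_merge);  // sum duplicated rows into the temporary
  return tmp_grad_merge;
}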
if (grad->rows().size() == 0) { @@ -555,8 +555,8 @@ class MomentumOpKernel { return; } - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows* merged_grad = &tmp_merged_grad; + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc index e3f0e5cc04d9ee970de19a9a1f1724f24fb4eb15..a71847c4690821e33eb5dfa4240a748fe9bd9472 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc @@ -74,7 +74,7 @@ class NPUMomentumOpKernel : public framework::OpKernel { regularized_grad, mu_tensor}, {*param_out}, {{"use_nesterov", use_nesterov}}); runner.Run(dev_ctx.stream()); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( "Unsupported SparseMomentum")); } else { diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 9971cb92306a2710e02998fade05c8a498e88627..a01f84b37c4eb236c7aff591a49e4d76c55f152d 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -218,10 +218,10 @@ class RmspropOpKernel : public framework::OpKernel { rho, epsilon, momentum, grad_func)); } } - } else if (grad_var->IsType()) { - auto &grad = grad_var->Get(); - framework::SelectedRows tmp_merged_grad; - framework::SelectedRows *merged_grad = &tmp_merged_grad; + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + pten::SelectedRows tmp_merged_grad; + pten::SelectedRows *merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(dev_ctx, grad, merged_grad); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 28f73e0618c2ae6cfd5ec67bc2372cc5584f5586..08c40e02b1702b069052eb0e086111c40739c04e 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -80,7 +80,7 @@ class SGDOp : public framework::OperatorWithKernel { // supported cases bool dense_param_sparse_grad = param_var->IsType() && - grad_var->IsType(); + grad_var->IsType(); bool dense_param_and_grad = param_var->IsType() && grad_var->IsType(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 5e3ae6c017bcac71ea2668e914378627ea39b1a2..7ecd84f4ff16a36a1e2e27f45a0ca46a05c35cda 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -112,7 +112,7 @@ class SGDOpKernel param->numel(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution.
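The sgd_op.cu and sgd_op.h hunks in this stretch all re-type one computation, the dense-parameter / sparse-gradient update, in which only the rows named by the gradient are touched. A reference sketch with plain STL types standing in for Paddle tensors (SparseSgdReference is a hypothetical name, not part of the patch):

#include <cstdint>
#include <vector>

// Dense param, SelectedRows grad: param.row(rows[i]) -= lr * grad.row(i).
// grad_value holds one value row per entry of grad_rows; parameter rows
// absent from grad_rows are left untouched.
void SparseSgdReference(float lr, const std::vector<int64_t>& grad_rows,
                        const std::vector<std::vector<float>>& grad_value,
                        std::vector<std::vector<float>>* param) {
  for (size_t i = 0; i < grad_rows.size(); ++i) {
    std::vector<float>& param_row =
        (*param)[static_cast<size_t>(grad_rows[i])];
    for (size_t j = 0; j < param_row.size(); ++j) {
      param_row[j] -= lr * grad_value[i][j];
    }
  }
}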
@@ -121,7 +121,7 @@ class SGDOpKernel platform::errors::InvalidArgument( "The input tensor Param of SgdOp should be equal with ParamOut " "if variable's type is SelectedRows.")); - auto* grad = ctx.Input("Grad"); + auto* grad = ctx.Input("Grad"); auto in_height = grad->height(); auto out_dims = param_out->dims(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 9d98e745a01aec9ec02e754ec9186ff66f58f53d..7df6bbf410d2d731367c9a6537ab63664f868c82 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -60,13 +60,13 @@ struct sgd_dense_param_kernel< // SelectedRows template struct sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId> { + T, framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); const auto *param = ctx.Input("Param"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); @@ -114,12 +114,12 @@ struct sgd_dense_param_kernel< // SelectedRows template <> struct sgd_dense_param_kernel< - platform::bfloat16, framework::VarTypeTrait::kId> { + platform::bfloat16, framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; const auto *learning_rate = ctx.Input("LearningRate"); auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); const auto &grad_rows = grad->rows(); @@ -163,7 +163,7 @@ class SGDOpKernel if (param_var->IsType()) { invoke_dense_param_kernel(ctx); - } else if (param_var->IsType()) { + } else if (param_var->IsType()) { sparse_param_and_grad_kernel(ctx); } else { PADDLE_ENFORCE_EQ( @@ -200,7 +200,7 @@ class SGDOpKernel grad->numel(), sz)); dense_param_and_grad_kernel(ctx); - } else if (grad_var->IsType()) { + } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. @@ -209,7 +209,7 @@ class SGDOpKernel "The input tensor Param of SgdOp " "should be equal with ParamOut if variable's " "type is SelectedRows. ")); - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); // for distributed training, a sparse var may be empty, // just skip updating. @@ -259,13 +259,13 @@ class SGDOpKernel const auto *param_var = ctx.InputVar("Param"); const auto *grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, platform::errors::InvalidArgument( "When param is SelectedRows, gradient should also " "be SelectedRows")); - const auto ¶m = param_var->Get(); - auto *param_out = ctx.Output("ParamOut"); - const auto &grad = grad_var->Get(); + const auto ¶m = param_var->Get(); + auto *param_out = ctx.Output("ParamOut"); + const auto &grad = grad_var->Get(); // for distributed training, a sparse var may be empty, // just skip updating. 
@@ -309,7 +309,7 @@ class SGDOpKernel virtual void dense_param_sparse_grad_kernel( const framework::ExecutionContext &ctx) const { detail::sgd_dense_param_kernel< - T, framework::VarTypeTrait::kId>()(ctx); + T, framework::VarTypeTrait::kId>()(ctx); } }; diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index 88e94ba039ac277adc8ae4597886da16a0894465..c0bd906685d4d8c5fcb561ffaeab1cdc1b4d1e44 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -76,22 +76,13 @@ struct AbsFunctor { } }; -template +template struct UnsignedPowFunctor { HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(inline_pow(inline_abs(x), static_cast(porder))); - } - float porder; -}; - -template -struct PowFunctor { - HOSTDEVICE explicit inline PowFunctor(float porder) { this->porder = porder; } - HOSTDEVICE inline Ty operator()(const Tx x) const { - return static_cast(inline_pow(x, static_cast(porder))); + HOSTDEVICE inline T operator()(const T x) const { + return static_cast(inline_pow(inline_abs(x), static_cast(porder))); } float porder; }; @@ -105,13 +96,11 @@ class PnormCUDAKernel : public framework::OpKernel { const T* x = in_x->data(); T* norm = out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); float porder = ctx.Attr("porder"); bool asvector = ctx.Attr("asvector"); int axis = ctx.Attr("axis"); std::vector reduce_axis = {axis}; reduce_axis = GetReduceDim(reduce_axis, xdim.size(), asvector); - auto stream = ctx.cuda_device_context().stream(); using MT = typename details::MPTypeTrait::Type; @@ -125,29 +114,17 @@ class PnormCUDAKernel : public framework::OpKernel { TensorReduceFunctorImpl>( *in_x, out_norm, AbsFunctor(), reduce_axis, stream); } else { - framework::Tensor tmp_x; - tmp_x.mutable_data(xdim, ctx.GetPlace()); - std::vector ins = {in_x}; - std::vector outs = {&tmp_x}; - auto func = UnsignedPowFunctor(porder); + TensorReduceFunctorImpl>( + *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis, stream); + + const framework::Tensor* tmp_norm = out_norm; + std::vector ins = {tmp_norm}; + std::vector outs = {out_norm}; const auto& cuda_ctx = ctx.template device_context(); - - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func); - framework::Tensor tmp_y; - tmp_y.mutable_data(ndim, ctx.GetPlace()); - TensorReduceFunctorImpl>( - tmp_x, &tmp_y, kps::IdentityFunctor(), reduce_axis, stream); - const framework::Tensor* tmp_norm = &tmp_y; - ins = {tmp_norm}; - outs = {out_norm}; - auto func_inverse = UnsignedPowFunctor(1. / porder); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, func_inverse); + ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( + cuda_ctx, ins, &outs, UnsignedPowFunctor(1. 
/ porder)); } } }; @@ -158,29 +135,25 @@ struct AbsMaxAndMinGradFunctor { typename DY, typename Dim> void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - auto equals = ((*x).abs() == y->broadcast(dim)); - auto ones = dx->constant(static_cast(1.)); - auto negs = dx->constant(static_cast(-1.)); - auto zeros = dx->constant(static_cast(0.)); - auto positives = (*x) > zeros; - dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros) * - positives.select(ones, negs); + dx->device(place) = dy->broadcast(dim) * (*x).sign() * + ((*x).abs() == y->broadcast(dim)).template cast(); } }; template -struct PNormPostGradFunctor { +struct PNormGradFunctor { + HOSTDEVICE explicit inline PNormGradFunctor(float porder) { + this->porder = static_cast(porder - 1.); + } template void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - auto ones = dx->constant(static_cast(1.)); - auto negs = dx->constant(static_cast(-1.)); - auto zeros = dx->constant(static_cast(0.)); - auto positives = (*x) > zeros; - dx->device(place) = (*dx) * dy->broadcast(dim) * y->broadcast(dim) * - positives.select(ones, negs); + dx->device(place) = (*x).abs().pow(this->porder) * (*x).sign() * + dy->broadcast(dim) * + (*y).pow(-this->porder).broadcast(dim); } + T porder; }; template @@ -207,26 +180,13 @@ class PnormGradCUDAKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(cuda_ctx, out_dx, static_cast(0)); } else if (porder == INFINITY || porder == -INFINITY) { + AbsMaxAndMinGradFunctor functor; LaunchReduceGradKernel>( - ctx, in_x, in_norm, in_norm_dy, out_dx, dims, reduce_all); + ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all); } else { - framework::Tensor tmp_norm; - tmp_norm.mutable_data(in_norm->dims(), ctx.GetPlace()); - std::vector ins = {in_norm}; - std::vector outs = {&tmp_norm}; - auto pow_functor = PowFunctor(1. 
- porder); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, PowFunctor>(cuda_ctx, ins, &outs, - pow_functor); - ins = {in_x}; - outs = {out_dx}; - auto unsigned_pow = UnsignedPowFunctor(porder - 1.); - paddle::operators::LaunchSameDimsElementwiseCudaKernel< - ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( - cuda_ctx, ins, &outs, unsigned_pow); - const framework::Tensor* tmp_norm_const = &tmp_norm; - LaunchReduceGradKernel>( - ctx, in_x, tmp_norm_const, in_norm_dy, out_dx, dims, reduce_all); + auto functor = PNormGradFunctor(porder); + LaunchReduceGradKernel>( + ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all); } } }; diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index 292db60079e4cde97ba992b3c9d7151ecae9434a..d715bf34a49ef10de11affacde4ac892be259da8 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -13,8 +13,8 @@ #include #include #include -#include "paddle/fluid/distributed/fleet.h" -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -40,8 +40,8 @@ class DistributedLookupTableKernel : public framework::OpKernel { if (var->IsType()) { emb_dim = var->Get().dims()[1]; - } else if (var->IsType()) { - emb_dim = var->Get().value().dims()[1]; + } else if (var->IsType()) { + emb_dim = var->Get().value().dims()[1]; } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of `W` must be Tensor, SelectedRows.But got " diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h index a232d52dec8d62fb42a6d662e94a9e29c5d935f7..f19ba5f2e41da3de710c726bc7899f12cbbc92dc 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h @@ -13,8 +13,8 @@ #include #include #include -#include "paddle/fluid/distributed/fleet.h" -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index cb27dc75eb2faf15746e596265d1b1e4b3717e52..b3a745fc99538edf2a0b387a67d28cb7722709f0 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -39,8 +39,8 @@ class FakeInitOp : public framework::OperatorBase { if (out_var.IsType()) { tensor = out_var.GetMutable(); tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(Attr>("shape"))); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h 
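Stepping back to the p_norm_op.cu hunks above: the rewrite drops the intermediate tmp_x / tmp_y / tmp_norm tensors by fusing the elementwise pow stages into the reduce kernels, and PNormGradFunctor (which stores porder already decremented to p - 1) evaluates the whole backward expression in one pass. As a sketch of the identity being exploited, for finite nonzero p and y = ||x||_p:

\[
y = \Big(\sum_j |x_j|^p\Big)^{1/p},
\qquad
\frac{\partial y}{\partial x_i}
  = \operatorname{sign}(x_i)\,\frac{|x_i|^{\,p-1}}{y^{\,p-1}},
\]

which matches the functor's dx = |x|^{p-1} * sign(x) * dy * y^{-(p-1)}; the infinity-norm branch keeps its own AbsMaxAndMinGradFunctor, now reduced to sign(x) masked by the comparison |x| == y.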
index 77c755581f9de2830c4ae2ab9c281321f7fb986f..2d2d8abe7062788b14b543bad22d699a1f41bd2d 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h @@ -23,9 +23,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/distributed/service/brpc_utils.h" -#include "paddle/fluid/distributed/service/heter_server.h" -#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index c870e758e96afc1c70a26236b0d20ac05d77aaf1..a195b8dee3c2f5580be5f7c094194576b9eccb88 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -18,8 +18,8 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 5029aa0ebdcc0c547c394053ce110dbc9f401a3f..7914e9d9a1058ab15a08e3b0dee8725e7a74bb38 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -18,8 +18,8 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" namespace framework = paddle::framework; @@ -52,7 +52,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -123,7 +123,7 @@ void InitTensorsOnClient2(framework::Scope* scope, platform::CPUPlace* place, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 46f22bcc8b26bc0b4f782ed9459491d471ad219d..980351e12a030760b6793ab665d80db737bfa9d5 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include // NOLINT #include -#include "paddle/fluid/distributed/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc index 6b1ab77b45d35dfb4439cb4e1927cc928d7ffd4c..07fe44601ca08831a9e4372d04c097a8e56644f2 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -55,7 +55,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -76,7 +76,7 @@ void CreateVarsOnScope(framework::Scope* scope) { void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 3a361360e2ed7e7de3c995b60ecf6e8c0f33e415..21f21cdc95606ec98700736f51a2f50af6364e1a 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/service/heter_client.h" -#include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/memory/memcpy.h" @@ -59,7 +59,7 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { void CreateVarsOnScope(framework::Scope* scope) { auto w_var = scope->Var("w"); - w_var->GetMutable(); + w_var->GetMutable(); auto out_var = scope->Var("out"); out_var->GetMutable(); @@ -121,7 +121,7 @@ void InitTensorsOnClient(framework::Scope* scope, int64_t rows_numel, void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope); - auto w = scope->Var("w")->GetMutable(); + auto w = scope->Var("w")->GetMutable(); auto w_value = w->mutable_value(); w_value->Resize({rows_numel, 10}); for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); diff --git a/paddle/fluid/operators/pscore/send_barrier_op.cc b/paddle/fluid/operators/pscore/send_barrier_op.cc index 1def919ffdf9fdb8976d6745ac718977eb57df73..fe850bb25d67f33a6dfa076f9a75c0b36cd82e5c 100644 --- a/paddle/fluid/operators/pscore/send_barrier_op.cc +++ b/paddle/fluid/operators/pscore/send_barrier_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/pscore/send_op.cc b/paddle/fluid/operators/pscore/send_op.cc index 482c6ba60d26fdab5776e99d036162a7c67b21f8..bbb3c76beca20b4a20d3ec664ed4fc47ce542414 100644 --- a/paddle/fluid/operators/pscore/send_op.cc +++ b/paddle/fluid/operators/pscore/send_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/distributed/fleet.h" -#include "paddle/fluid/distributed/service/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index 06c9f23dd2c26fae8fefe6ae7da7df7aa5e67563..4490f08b2129ad0a1dfcd42602ce1ad6f694d1f7 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -139,26 +139,27 @@ class LogsumexpGradKernel : public framework::OpKernel { broadcast_dim[0]); } else { int rank = input->dims().size(); + LogsumexpGradFunctor functor; switch (rank) { case 1: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 2: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 3: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; case 4: ReduceGradFunctor( context.template device_context(), *input, *output, - *output_grad, input_grad, axis); + *output_grad, input_grad, functor, axis); break; } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index 197ced2beaac26299ae1ed705ae49d0055dc3c02..30a699e979efc40190a5c83850340f1f15dd918a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -22,4 +22,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ReduceCudaKernel, ops::ReduceCudaKernel, - ops::ReduceCudaKernel); + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 2e5bd7a42b1d1a4224f6aa516e7b6adb28b4f17a..87f51e4b8002f277a50ca0af5abf1e0f43214758 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -143,7 +143,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, const framework::Tensor* x, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, - const std::vector& dims) { + Functor functor, const std::vector& dims) { const int64_t unreduced = out->numel(); const int64_t reduced = x->numel() / unreduced; DDim out_dim(out->dims()); @@ -157,7 +157,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, dx->Resize({unreduced, reduced}); ReduceGradFunctor( context.template device_context(), shuffled_x, *out, *dout, - dx, {1}); + dx, functor, {1}); // transpose dX std::vector origin_axis(x_dim.size()); GetOriginDimFromShuffled(x_dim, dims, &origin_axis); @@ -333,7 +333,7 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, const framework::Tensor* input0, const framework::Tensor* input1, const framework::Tensor* input2, - paddle::framework::Tensor* output, + paddle::framework::Tensor* output, Functor functor, const std::vector& dims, bool reduce_all = false) { if (reduce_all) { @@ -345,7 +345,6 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, *context.template device_context().eigen_device(); auto broadcast_dim = 
Eigen::array({{static_cast(input0->numel())}}); - Functor functor; functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { @@ -354,36 +353,36 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context, case 1: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 2: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 3: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 4: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 5: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; case 6: ReduceGradFunctor( context.template device_context(), *input0, *input1, - *input2, output, dims); + *input2, output, functor, dims); break; default: - HandleLargeDimGrad(context, input0, input1, - input2, output, dims); + HandleLargeDimGrad( + context, input0, input1, input2, output, functor, dims); break; } } @@ -430,8 +429,10 @@ class ReduceGradKernel : public framework::OpKernel { // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and // not be set as Input in grad Maker, use Out_grad to replace here if (!input1) input1 = input2; - LaunchReduceGradKernel( - context, input0, input1, input2, output, const_dims, reduce_all); + Functor functor; + LaunchReduceGradKernel(context, input0, input1, + input2, output, functor, + const_dims, reduce_all); } void Compute(const framework::ExecutionContext& context) const override { @@ -556,7 +557,7 @@ class ReduceOp : public framework::OperatorWithKernel { if (ctx.InputVar("X")->IsType()) { if (!reduce_all) { return framework::KernelSignature( - "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"}); + "sum", {"X"}, {"dim", "out_dtype", "keep_dim"}, {"Out"}); } return framework::KernelSignature( "sum_raw", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"}, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index 3da27bc8ac8d448471b9ff3779ac6aca59fac523..1f3839c8dc7e6d1285462c0e442a5f856dd50066 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -74,7 +74,7 @@ void ReduceGradFunctor(const DeviceContext& context, const framework::Tensor& input0, const framework::Tensor& input1, const framework::Tensor& input2, - framework::Tensor* output, + framework::Tensor* output, Functor functor, const std::vector& dims) { auto x = EigenTensor::From(input0); auto x_grad = EigenTensor::From(*output); @@ -100,7 +100,6 @@ void ReduceGradFunctor(const DeviceContext& context, auto& place = *context.eigen_device(); - Functor functor; functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broad_cats_times); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index dc82d7c6c1ee49d6b3f74dbd5d0b1c835819266e..6c2d5ebcc7d880aa33786df153f270db685f3525 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -38,33 +38,6 @@ namespace operators { using Tensor = framework::Tensor; -inline std::vector get_new_shape( - const std::vector 
&list_new_shape_tensor) { - // get tensor from - std::vector vec_new_shape; - for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { - auto tensor = list_new_shape_tensor[i]; - PADDLE_ENFORCE_EQ( - tensor->dims(), framework::make_ddim({1}), - platform::errors::InvalidArgument( - "If the element type of 'shape' in ReshapeOp is Tensor, " - "the element's shape must be [1]. But received the element's shape " - "is [%s]", - tensor->dims())); - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - - vec_new_shape.push_back(static_cast(*temp.data())); - } else { - vec_new_shape.push_back(static_cast(*tensor->data())); - } - } - - return vec_new_shape; -} - class ReshapeOp : public framework::OperatorWithKernel { public: ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -370,30 +343,6 @@ class ReshapeKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *out = ctx.Output("Out"); auto *in = ctx.Input("X"); - // framework::DDim out_dims = out->dims(); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*in); - - // we can't MakePtenDenseTensor by out, because the out of reshape may have - // multiple states, some can MakePtenDenseTensor but other's cannot: - // 1. out tensor is not initialized - // 2. out tensor is input (complete inplace) - // 3. out tensor is view of input - // We can't MakePtenDenseTensor for case 2, so we solve this case by - // creating a temporary tensor here: - pten::DenseTensorMeta meta{pten::TransToPtenDataType(in->type()), - in->dims(), in->layout()}; - auto pt_out_tmp = std::make_shared( - pten::make_intrusive( - ctx.GetPlace()), - std::move(meta)); - pten::DenseTensor *pt_out = nullptr; - if (in != nullptr && out != nullptr && in->Holder() != nullptr && - out->Holder() != nullptr && - in->Holder()->ptr() == out->Holder()->ptr()) { - pt_out = pt_x.get(); - } else { - pt_out = pt_out_tmp.get(); - } auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); @@ -410,54 +359,46 @@ class ReshapeKernel { framework::Tensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - pt_vec_shape.push_back( - std::move(*(paddle::experimental::MakePtenDenseTensor(temp)))); + pt_vec_shape.push_back(std::move(temp)); } else { - pt_vec_shape.push_back( - std::move(*(paddle::experimental::MakePtenDenseTensor(*tensor)))); + pt_vec_shape.push_back(*tensor); } } pt_scalar_shape = pten::ScalarArray(pt_vec_shape); } else if (shape_tensor) { - std::unique_ptr pt_shape; + pten::DenseTensor pt_shape; if (platform::is_gpu_place(shape_tensor->place()) || platform::is_xpu_place(shape_tensor->place())) { framework::Tensor temp; paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), &temp); - pt_shape = paddle::experimental::MakePtenDenseTensor(temp); + pt_shape = std::move(temp); } else { - pt_shape = paddle::experimental::MakePtenDenseTensor(*shape_tensor); + pt_shape = *shape_tensor; } - pt_scalar_shape = pten::ScalarArray(*pt_shape.get()); + pt_scalar_shape = pten::ScalarArray(pt_shape); } else { auto &shape_attr = ctx.Attr>("shape"); pt_scalar_shape = pten::ScalarArray(shape_attr); } if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(static_cast(dev_ctx), - *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(static_cast(dev_ctx), *in, + pt_scalar_shape, out); } #if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(static_cast(dev_ctx), *in, + pt_scalar_shape, out); } #endif - // non-inplace need move all result from pt_out to out, inplace need set - // result dims. - if (in != out) { - paddle::experimental::SharesStorage(pt_out, static_cast(out)); - } else { - out->Resize(pt_out->dims()); - } } }; @@ -468,24 +409,22 @@ class ReshapeGradKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); - auto pt_d_out = paddle::experimental::MakePtenDenseTensor(*d_out); - if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); pten::ReshapeGradKernel(static_cast(dev_ctx), - *pt_d_out.get(), pt_d_x.get()); + *d_out, d_x); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + pten::ReshapeGradKernel(dev_ctx, *d_out, d_x); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + pten::ReshapeGradKernel(static_cast(dev_ctx), + *d_out, d_x); } #endif } @@ -498,25 +437,22 @@ class ReshapeDoubleGradKernel { auto *dd_out = ctx.Output("DDOut"); dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); - auto pt_dd_x = paddle::experimental::MakePtenDenseTensor(*dd_x); - auto pt_dd_out = paddle::experimental::MakePtenDenseTensor(*dd_out); - if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); pten::ReshapeDoubleGradKernel( - static_cast(dev_ctx), *pt_dd_x.get(), - pt_dd_out.get()); + static_cast(dev_ctx), *dd_x, dd_out); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + pten::ReshapeDoubleGradKernel( + static_cast(dev_ctx), *dd_x, dd_out); } #endif } diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index c130dbb35a0daa551f51c9a0315be90a4415a98e..a97876957abd38124c164ba934a0ce1378188659 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -46,7 +46,7 @@ using ProgramDesc = framework::ProgramDesc; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; namespace details { @@ -86,21 +86,21 @@ static void CheckOutputVarStatus(const Variable &src_var, "RunProgram(Grad)Op's internal " "scope is not initialized.", var_name)); - } else if (dst_var.IsType()) { + } else if (dst_var.IsType()) { PADDLE_ENFORCE_EQ( - 
src_var.IsType(), true, + src_var.IsType(), true, platform::errors::InvalidArgument( "The output variable %s get from " "RunProgram(Grad)Op's internal scope holds " "wrong type. Expect type is SelectedRows, but receive type is %s.", var_name, platform::demangle(framework::ToTypeName(src_var.Type())))); - PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), true, - platform::errors::InvalidArgument( - "The tensor in output variable %s get from " - "RunProgram(Grad)Op's " - "internal scope is not initialized.", - var_name)); + PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), + true, platform::errors::InvalidArgument( + "The tensor in output variable %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + var_name)); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -118,12 +118,12 @@ static void VariableShare(const Variable &src_var, Variable *dst_var) { auto *lod_tensor = dst_var->GetMutable(); lod_tensor->ShareDataWith(src_var.Get()); lod_tensor->set_lod(src_var.Get().lod()); - } else if (src_var.IsType()) { - auto *selected_rows = dst_var->GetMutable(); + } else if (src_var.IsType()) { + auto *selected_rows = dst_var->GetMutable(); selected_rows->mutable_value()->ShareDataWith( - src_var.Get().value()); - selected_rows->set_rows(src_var.Get().rows()); - selected_rows->set_height(src_var.Get().height()); + src_var.Get().value()); + selected_rows->set_rows(src_var.Get().rows()); + selected_rows->set_height(src_var.Get().height()); } } diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index 5ed71a26c8aa3563656baab3c7751a150d5f105f..2a61d7ce0c25b786bb5713e342d371d48ad2d04d 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -56,7 +56,7 @@ class SaveOpKernel : public framework::OpKernel { if (input_var->IsType()) { SaveLodTensor(ctx, place, input_var, filename); - } else if (input_var->IsType()) { + } else if (input_var->IsType()) { SaveSelectedRows(ctx, place, input_var, filename); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,7 +105,7 @@ class SaveOpKernel : public framework::OpKernel { const platform::Place &place, const framework::Variable *var, const std::string &filename) const { - auto &selectedRows = var->Get(); + auto &selectedRows = var->Get(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 86f4e1b3ac3ba0b3cfef98e322f89627d4e927da..a195452791048d9875602285551a00cf6e42c7a8 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" #include #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/ops/compat/scale_args_fn.h" namespace paddle { namespace framework { @@ -71,12 +70,6 @@ class ScaleOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } - - framework::KernelSignature GetExpectedPtenKernelArgs( - const framework::ExecutionContext &ctx) const override { - framework::ExecutionArgumentMappingContext arg_mapping_ctx(ctx); - return pten::ScaleOpArgumentMapping(arg_mapping_ctx); - } }; class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index a04837b6949e0f3ac0e0dda913c878166fb63311..2a30d3f0b08842b5b876847e22dd0ccea2956914 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -55,9 +55,9 @@ class ScaleKernel : public framework::OpKernel { } auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc index 8d9690a866ae26abe0817d98636a45d58735aefd..1e1187845ce477f939e8cf21650076c875861f3d 100644 --- a/paddle/fluid/operators/scale_op_mlu.cc +++ b/paddle/fluid/operators/scale_op_mlu.cc @@ -57,9 +57,9 @@ class ScaleMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, bias, bias_desc.get(), GetBasePtr(&bias_tensor)); auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index 4960f720ee39aaa130544befc9b0a6449d5381d9..026a5dda89b5f07423090cb83bfb73e706cba7b7 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" #include -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/pten/kernels/scale_kernel.h" namespace paddle { namespace operators { @@ -32,30 +32,21 @@ class ScaleXPUKernel : public framework::OpKernel { auto bias = static_cast(ctx.Attr("bias")); auto bias_after_scale = ctx.Attr("bias_after_scale"); auto* out_var = ctx.OutputVar("Out"); - if (in_var->IsType() && in_var != out_var) { - auto& in_slr = in_var->Get(); - auto* out_slr = out_var->GetMutable(); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); out_slr->set_rows(in_slr.rows()); out_slr->set_height(in_slr.height()); } auto* out = framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); out->mutable_data(in->place()); - PADDLE_ENFORCE_EQ( - in->dims(), out->dims(), - platform::errors::InvalidArgument("In and out should have the same dim," - " expected %s, but got %s.", - in->dims().to_str().c_str(), - out->dims().to_str().c_str())); auto& dev_ctx = ctx.template device_context(); - int r = xpu::scale(dev_ctx.x_context(), - reinterpret_cast(in->data()), - reinterpret_cast(out->data()), in->numel(), - bias_after_scale, scale, bias); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU scale kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + // call pten kernel + pten::ScaleKernel( + static_cast::TYPE&>(dev_ctx), + *in, scale, bias, bias_after_scale, out); } }; diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h index 1f90c041c095331db427ddd5f9a656e948947e46..cac8c10c207a51e7de1bc2ca3346394f39da8ddf 100644 --- a/paddle/fluid/operators/shape_op.h +++ b/paddle/fluid/operators/shape_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; template class ShapeKernel : public framework::OpKernel { @@ -29,8 +29,8 @@ class ShapeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_var = ctx.InputVar("Input"); framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); } diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 94f4737191d11a7ca8a3dd3e7f40399d08813486..89a1e952d1dc558dfad55604713ec601d7ccc125 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -22,7 +22,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; template class ShapeNPUKernel : public framework::OpKernel { @@ -30,8 +29,8 @@ class ShapeNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_var = ctx.InputVar("Input"); framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); } diff --git a/paddle/fluid/operators/share_data_op.h b/paddle/fluid/operators/share_data_op.h index d876b4fabd5c09bf32322cf1a63e0c0fe7ed7d25..f668a1cf01dfc5c141f94997aafb18f1a730707d 100644 --- a/paddle/fluid/operators/share_data_op.h +++ 
b/paddle/fluid/operators/share_data_op.h @@ -29,9 +29,8 @@ class ShareDataKernel : public framework::OpKernel { auto *detach_tensor = out_var->GetMutable(); detach_tensor->ShareDataWith(origin_tensor); } else { - const auto &origin_selected_rows = in_var->Get(); - auto *detach_selected_rows = - out_var->GetMutable(); + const auto &origin_selected_rows = in_var->Get(); + auto *detach_selected_rows = out_var->GetMutable(); detach_selected_rows->mutable_value()->ShareDataWith( origin_selected_rows.value()); } diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc index 7e21cba14b7dcaad215aa040958a656e9b3058ec..6395aa1caa01b9578d55e1155b0d6cd0d2295e36 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -18,6 +18,7 @@ #include #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { @@ -41,24 +42,41 @@ class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); // attrs - bool normalize = context.Attr("normalize"); - PADDLE_ENFORCE_EQ( - normalize, false, - platform::errors::InvalidArgument("normalize only support true now.")); int ignore_index = context.Attr("ignore_index"); - PADDLE_ENFORCE_EQ(ignore_index, kIgnoreIndex, - platform::errors::InvalidArgument( - "ignore_index only support %d now.", kIgnoreIndex)); + bool normalize = context.Attr("normalize"); + + // allocate temp memory + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int* hit = RAII_GUARD.alloc_l3_or_gm(input->numel()); + PADDLE_ENFORCE_NOT_NULL( + hit, platform::errors::External("XPU alloc_l3_or_gm returns nullptr")); int r = xpu::sigmoid_cross_entropy_with_logits( dev_ctx.x_context(), reinterpret_cast(input->data()), reinterpret_cast(label->data()), - reinterpret_cast(output->data()), 1, input->numel()); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU sigmoid_cross_entropy_with_logits " - "kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + reinterpret_cast(output->data()), 1, input->numel(), hit, + ignore_index); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits"); + if (normalize) { + int* non_zero = RAII_GUARD.alloc_l3_or_gm(1); + PADDLE_ENFORCE_NOT_NULL( + non_zero, + platform::errors::External("XPU alloc_l3_or_gm returns nullptr")); + int r = xpu::nonzero_count(dev_ctx.x_context(), + reinterpret_cast(hit), + non_zero, input->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count"); + int non_zero_cpu = 0; + memory::Copy(platform::CPUPlace(), static_cast(&non_zero_cpu), + context.GetPlace(), static_cast(non_zero), + sizeof(int)); + r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(output->data()), + reinterpret_cast(output->data()), + input->numel(), false, + 1.0f / static_cast(non_zero_cpu), 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); + } } }; @@ -81,16 +99,42 @@ class SigmoidCrossEntropyWithLogitsGradXPUKernel dx->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); + // attrs + int ignore_index = context.Attr("ignore_index"); + bool normalize = context.Attr("normalize"); + + // allocate temp memory + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int* hit = RAII_GUARD.alloc_l3_or_gm(input->numel()); + 
PADDLE_ENFORCE_NOT_NULL(
+        hit, platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
+
     int r = xpu::sigmoid_cross_entropy_with_logits_grad(
         dev_ctx.x_context(), reinterpret_cast(input->data()),
         reinterpret_cast(label->data()),
         reinterpret_cast(dy->data()),
-        reinterpret_cast(dx->data()), 1, input->numel());
-    PADDLE_ENFORCE_EQ(
-        r, XPU_SUCCESS,
-        platform::errors::External("XPU sigmoid_cross_entropy_with_logits_grad "
-                                   "kernel return wrong value[%d %s]",
-                                   r, XPUAPIErrorMsg[r]));
+        reinterpret_cast(dx->data()), 1, input->numel(), hit,
+        ignore_index);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits_grad");
+    if (normalize) {
+      int* non_zero = RAII_GUARD.alloc_l3_or_gm(1);
+      PADDLE_ENFORCE_NOT_NULL(
+          non_zero,
+          platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
+      int r = xpu::nonzero_count(dev_ctx.x_context(),
+                                 reinterpret_cast(hit),
+                                 non_zero, input->numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
+      int non_zero_cpu = 0;
+      memory::Copy(platform::CPUPlace(), static_cast(&non_zero_cpu),
+                   context.GetPlace(), static_cast(non_zero),
+                   sizeof(int));
+      r = xpu::scale(dev_ctx.x_context(),
+                     reinterpret_cast(dx->data()),
+                     reinterpret_cast(dx->data()), input->numel(),
+                     false, 1.0f / static_cast(non_zero_cpu), 0.0f);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+    }
   }
 };
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index 6207c33f9d6299605d24f11c13820eac47ee6c98..f36124078054e87f8218f2bb82ff4e58b22fc0ae 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -14,7 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/sign_op.h"
 #include
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/core/infermeta_utils.h"
+#include "paddle/pten/infermeta/unary.h"
 namespace paddle {
 namespace operators {
@@ -22,14 +25,6 @@ namespace operators {
 class SignOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "sign");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sign");
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
 };
 template
@@ -64,9 +59,12 @@ class SignGradMaker : public framework::SingleGradOpMaker {
 namespace ops = paddle::operators;
+DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor,
+                            PT_INFER_META(pten::UnchangedInferMetaNew));
 REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker,
                   ops::SignGradMaker,
-                  ops::SignGradMaker);
+                  ops::SignGradMaker,
+                  SignInferShapeFunctor);
 REGISTER_OP_CPU_KERNEL(
     sign, ops::SignKernel,
     ops::SignKernel);
diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc
index 0adc12e684c3a4c816bddb29043e638fbb368ae9..a0d4b4c4eb4604ef699acd20807856ccada8717d 100644
--- a/paddle/fluid/operators/softmax_op_xpu.cc
+++ b/paddle/fluid/operators/softmax_op_xpu.cc
@@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel {
     auto& dev_ctx = context.template device_context();
     int r = XPU_SUCCESS;
-    paddle::platform::XPUVersion version = dev_ctx.xpu_version();
-    if (version == paddle::platform::XPUVersion::XPU1) {
+    auto version = dev_ctx.xpu_version();
+    if (version == pten::backends::xpu::XPUVersion::XPU1) {
       xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
       XPUType* clip_x_data_l3 =
           RAII_GUARD.alloc_l3_or_gm(x->numel());
       r = xpu::clip_v2(dev_ctx.x_context(),
diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c569c9bf091335a01dfb2d70808cb6ce0bb66812
--- /dev/null
+++ b/paddle/fluid/operators/split_op_mlu.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template
+class SplitMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // init parameter
+    auto* in = ctx.Input("X");
+    auto outs = ctx.MultiOutput("Out");
+    int num = ctx.Attr("num");
+    std::vector sections = ctx.Attr>("sections");
+    int axis = ctx.Attr("axis");
+    auto in_dims = in->dims();
+    auto out_size = outs.size();
+    auto num_tensor = num == 0 ? out_size : num;
+
+    bool need_resize_outs_dims = false;
+    if (ctx.HasInput("AxisTensor")) {
+      auto* axis_tensor = ctx.Input("AxisTensor");
+      axis = GetDataFromTensor(axis_tensor)[0];
+      need_resize_outs_dims = true;
+    }
+    auto sections_tensor_list =
+        ctx.MultiInput("SectionsTensorList");
+    if (sections_tensor_list.size() > 0) {
+      sections = GetDataFromTensorList(sections_tensor_list);
+      need_resize_outs_dims = true;
+    }
+    if (need_resize_outs_dims) {
+      std::vector outs_dims =
+          UpdateOutsDims(true, true, in_dims, num, sections, axis, out_size);
+      for (size_t j = 0; j < outs.size(); ++j) {
+        outs[j]->Resize(outs_dims[j]);
+      }
+    }
+
+    // init out tensors
+    std::vector vct_tensor;
+    std::vector output_descs;
+    std::vector desc_vector;
+    auto place = ctx.GetPlace();
+    for (size_t i = 0; i < outs.size(); i++) {
+      outs[i]->mutable_data(ctx.GetPlace());
+      output_descs.emplace_back(MLUCnnlTensorDesc(
+          *outs[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(outs[i]->type())));
+      desc_vector.push_back(output_descs.back().get());
+      vct_tensor.push_back(GetBasePtr(outs[i]));
+    }
+    // init in tensors
+    MLUCnnlTensorDesc input_desc(*in, CNNL_LAYOUT_ARRAY,
+                                 ToCnnlDataType(in->type()));
+
+    // let the MLU backend perform the actual split
+    MLUCnnl::Split(ctx, num_tensor, axis, input_desc.get(), GetBasePtr(in),
+                   desc_vector.data(), vct_tensor.data());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(split, ops::SplitMLUKernel,
+                       ops::SplitMLUKernel, ops::SplitMLUKernel,
+                       ops::SplitMLUKernel,
+                       ops::SplitMLUKernel);
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 0f520adba57a203fae5d3b34fb67067d01691bed..00aab6b75006aec9b2ff397f2589174aeee615f9 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -165,9 +165,9 @@ class SumOp : public framework::OperatorWithKernel {
       return framework::OpKernelType(data_type,
ctx.GetPlace(), layout, library); - } else if (x_vars[0]->IsType()) { + } else if (x_vars[0]->IsType()) { for (auto& var : x_vars) { - auto& value = var->Get().value(); + auto& value = var->Get().value(); if (value.IsInitialized()) { return framework::OpKernelType(value.type(), ctx.device_context(), layout, library); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 4288e9415aa8699d1f98d186d74801bf890d757e..9de9b0b6338dfc78ba06d750ce2c18823d0eda53 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -151,7 +151,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { if (lod_length && in_i.IsInitialized()) { in_data.emplace_back(in_i.data()); } - } else if (in_vars[i]->IsType()) { + } else if (in_vars[i]->IsType()) { selectrow_index.push_back(i); } } @@ -162,7 +162,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { size_t rows = 0; int64_t length = 0; for (auto index : selectrow_index) { - auto &sr = in_vars[index]->Get(); + auto &sr = in_vars[index]->Get(); auto &sr_value = sr.value(); auto &sr_rows = sr.rows(); @@ -235,7 +235,7 @@ class SumKernel if (out_var->IsType()) { SumToLoDTensor(context); - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { SelectedRowsCompute(context); } else if (out_var->IsType()) { LodTensorArrayCompute(context); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 61a9c8b11508f2c9e300c16a661f744bc0248c08..4e108b56a404d590b02c098c845d08b958f15f9a 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using SelectedRows = framework::SelectedRows; +using SelectedRows = pten::SelectedRows; using LoDTensor = framework::LoDTensor; template @@ -37,32 +37,32 @@ void SelectedRowsCompute(const framework::ExecutionContext &context) { return; } - std::vector inputs; + std::vector inputs; SelectedRows temp_in0; if (in_place) { - auto &in0 = in_vars[0]->Get(); + auto &in0 = in_vars[0]->Get(); temp_in0.set_height(in0.height()); temp_in0.set_rows(in0.rows()); framework::TensorCopy(in0.value(), in0.place(), context.device_context(), temp_in0.mutable_value()); inputs.push_back(&temp_in0); for (size_t i = 1; i < in_vars.size(); ++i) { - auto &in = in_vars[i]->Get(); + auto &in = in_vars[i]->Get(); if (in.rows().size() > 0) { inputs.push_back(&in); } } } else { for (auto &in_var : in_vars) { - auto &in = in_var->Get(); + auto &in = in_var->Get(); if (in.rows().size() > 0) { - inputs.push_back(&in_var->Get()); + inputs.push_back(&in_var->Get()); } } } - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_rows()->clear(); bool has_data = false; @@ -183,8 +183,8 @@ class SumKernel : public framework::OpKernel { } auto in = EigenVector::Flatten(in_t); result.device(place) = result + in; - } else if (in_vars[i]->IsType()) { - auto &in_t = in_vars[i]->Get(); + } else if (in_vars[i]->IsType()) { + auto &in_t = in_vars[i]->Get(); functor(context.template device_context(), in_t, out); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -194,7 +194,7 @@ class SumKernel : public framework::OpKernel { framework::ToTypeName(in_vars[i]->Type()))); } } - } else if (out_var->IsType()) { + } else if (out_var->IsType()) { SelectedRowsCompute(context); } else if (out_var->IsType()) { LodTensorArrayCompute(context); diff --git a/paddle/fluid/operators/sum_op_mlu.cc 
b/paddle/fluid/operators/sum_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2cd649722b2444bb7a032eac18760e582db71d8
--- /dev/null
+++ b/paddle/fluid/operators/sum_op_mlu.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template
+class SumMLUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto out_var = ctx.OutputVar("Out");
+    if (out_var->IsType()) {
+      // init
+      auto *out = out_var->GetMutable();
+      auto ins = ctx.MultiInput("X");
+      out->mutable_data(ctx.GetPlace());
+      auto place = ctx.GetPlace();
+      int ins_size = static_cast(ins.size());
+      if (ins_size == 1) {
+        TensorCopy(*ins[0], place, out);
+        return;
+      }
+
+      // accumulate the inputs on the MLU with MLUCnnl::AddN
+      std::vector inputs;
+      std::vector input_descs;
+      std::vector desc_vector;
+      for (int i = 0; i < ins_size; i++) {
+        input_descs.emplace_back(MLUCnnlTensorDesc(
+            *ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->type())));
+        desc_vector.push_back(input_descs.back().get());
+        inputs.push_back(GetBasePtr(ins[i]));
+      }
+      // init out tensors
+      MLUCnnlTensorDesc output_desc(*out, CNNL_LAYOUT_ARRAY,
+                                    ToCnnlDataType(out->type()));
+      uint32_t ins_size_t = static_cast(ins_size);
+      MLUCnnl::AddN(ctx, ins_size_t, desc_vector.data(), inputs.data(),
+                    output_desc.get(), GetBasePtr(out));
+
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(Out) must be LoDTensor, but got "
+          "unsupported type: %s.",
+          framework::ToTypeName(out_var->Type())));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_MLU_KERNEL(
+    sum, ops::SumMLUKernel,
+    ops::SumMLUKernel);
diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc
index affe5a4bc6c2dc603fe5a4cc4ef91c297ec81d59..e5064ed90d5d718c63aaf68c5630f12b7032483a 100644
--- a/paddle/fluid/operators/top_k_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_op_mlu.cc
@@ -33,8 +33,7 @@ class TopkMLUKernel : public framework::OpKernel {
       auto k_t_ptr = static_cast(k_t->data());
       auto size = k_t->numel() * sizeof(int);
       memory::Copy(platform::CPUPlace(), reinterpret_cast(&k),
-                   BOOST_GET_CONST(platform::MLUPlace, k_t->place()), k_t_ptr,
-                   size, nullptr);
+                   k_t->place(), k_t_ptr, size, nullptr);
       framework::DDim output_dims = output->dims();
       output_dims[output_dims.size() - 1] = k;
       output->Resize(output_dims);
diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc
index 08c960186bafeb59ab6657e2445a3d5a9c58b6ab..cc05e11495b7bbe278cd79aa09cb35077e659d05 100644
--- a/paddle/fluid/operators/top_k_v2_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc
@@ -43,8 +43,7 @@ class TopkV2MLUKernel : public framework::OpKernel {
       auto k_t_ptr = static_cast(k_t->data());
       auto size = k_t->numel() * sizeof(int);
       memory::Copy(platform::CPUPlace(), reinterpret_cast(&k),
-                   BOOST_GET_CONST(platform::MLUPlace, k_t->place()), k_t_ptr,
-                   size, nullptr);
+                   k_t->place(), k_t_ptr, size, nullptr);
       framework::DDim output_dims = output->dims();
       // according to axis, set the K value in the output dims
       output_dims[axis] = k;
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index cdb4ad7c40826b94e00dbeba947025b7edf6cfeb..8c603a7c5d8c8f30c769ac53a914bff8305b24cb 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -73,8 +73,8 @@ class CPUUniformRandomKernel : public framework::OpKernel {
       }
     }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto *selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = ctx.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 63eecd15c2d69bab3a4e8230f6fa947e3662f22d..5278bdd2f1c7255e4e407fb98de80e79360e5430 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -111,8 +111,8 @@ class GPUUniformRandomKernel : public framework::OpKernel {
       }
     }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto* selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto* selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = context.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc
index 1c2f2b07ce897524467ae1877f4a3252571d0106..6812a2b0b7085c6be68325ec506860d3e1b2c4e6 100644
--- a/paddle/fluid/operators/uniform_random_op_npu.cc
+++ b/paddle/fluid/operators/uniform_random_op_npu.cc
@@ -40,8 +40,8 @@ class NPUUniformRandomKernel : public framework::OpKernel {
       }
     }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto *selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = ctx.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc
index fed0accd8a14cd7f2434e117b3145e81cfccafd4..848b72727bd28295e8e1a2b3d9e231b3c34c733d 100644
--- a/paddle/fluid/operators/uniform_random_op_xpu.cc
+++ b/paddle/fluid/operators/uniform_random_op_xpu.cc
@@ -41,8 +41,8 @@ class XPUUniformRandomKernel : public framework::OpKernel {
      }
    }
-    if (out_var->IsType<framework::SelectedRows>()) {
-      auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
+    if (out_var->IsType<pten::SelectedRows>()) {
+      auto *selected_rows = out_var->GetMutable<pten::SelectedRows>();
       tensor = selected_rows->mutable_value();
       auto shape = ctx.Attr>("shape");
       if (!new_shape.empty()) shape = new_shape;
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 21531a3efd64f33dbd90c8d3114fc54020db427f..eb7057bcd50addd8053738b81b79cf6d0a915941 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,9 +1,7 @@
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
-proto_library(error_codes_proto SRCS error_codes.proto)
 if(WITH_GPU)
   proto_library(external_error_proto SRCS external_error.proto)
 endif(WITH_GPU)
-
 if (WITH_PYTHON)
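# Sketch of the dependency rewiring implied by the deletion above (an
# assumption drawn from the enforce_deps change just below, which swaps in
# pten_enforce): with error_codes.proto gone, the errors target is no longer
# generated from a proto, so a consumer that previously wrote
#   cc_library(my_checker SRCS my_checker.cc DEPS error_codes_proto errors)
# would now only need
#   cc_library(my_checker SRCS my_checker.cc DEPS errors enforce)
# where my_checker is a hypothetical example target, not one from this patch.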
py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -28,10 +26,9 @@ endif() cc_library(flags SRCS flags.cc DEPS gflags boost) cc_library(denormal SRCS denormal.cc DEPS) -cc_library(errors SRCS errors.cc DEPS error_codes_proto) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -set(enforce_deps flags errors boost flags) +set(enforce_deps flags errors boost flags pten_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -75,7 +72,7 @@ IF(WITH_GPU OR WITH_ROCM) ENDIF() IF(WITH_IPU) - set(IPU_CTX_DEPS ipu_backend) + set(IPU_CTX_DEPS ipu_info) ELSE() set(IPU_CTX_DEPS) ENDIF(WITH_IPU) @@ -123,7 +120,10 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context) + ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context) +if(WITH_XPU) + target_link_libraries(device_context xpu_context) +endif() cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) if(WITH_ASCEND_CL) diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index 43408ca207d1d2c10ba29b32b487e8a7ea99917f..4f8bbb2d2689eb6ffee1119c6eb14ef27de7a2c8 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -34,3 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 883767348f06a99c32664ca2575880737b7418b5..d07ef73a49e7991d43d056da7d41eb83792a402b 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -165,8 +165,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D( return config; } -// TODO(wangchaochaohu): 3D will add later - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 5f711937a8098b1d8d83ac0d9f284883191fc796..d54c6a33ecbf53071956aaf4b9d342efa5746f65 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,19 +1,22 @@ IF(WITH_IPU) FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc) list(APPEND PADDLE_IPU_SRC ${POPART_CANONICALIZATION_SRC}) - set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") - set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") set(IPU_BACKEND_SRC - "ipu_device.cc" "ipu_strategy.cc" "ipu_executor.cc" "ipu_compiler.cc" "ipu_backend.cc" "ipu_utils.cc" ) + set(IPU_INFO_SRC + "ipu_info.cc" + "ipu_device.cc" + ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph framework_proto enforce graph_helper timer) - cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend) - cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph graph_helper) + cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart enforce) + cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart graph_helper) add_dependencies(paddle_ipu ipu_backend) + set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") + set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") ENDIF() diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index cd2a628c9abe2bf8e391fcfc7b9d37b293d19936..2459f5140eb5b25af82381366f25c714beb69aaf 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -13,12 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_device.h" -#include "paddle/fluid/platform/device/ipu/ipu_utils.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { namespace ipu { +// TODO(alleng) merge with ipu_utils +static bool GetBoolEnv(std::string str) { + char* str_val = getenv(str.c_str()); + if (str_val == NULL) { + return false; + } else { + bool val = false; + if (strcmp(str_val, "1") == 0 || strcmp(str_val, "true") == 0 || + strcmp(str_val, "True") == 0 || strcmp(str_val, "TRUE") == 0) + val = true; + return val; + } +} + int GetNumDevices() { bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); if (ipu_model) { diff --git a/paddle/fluid/platform/device/ipu/ipu_device.h b/paddle/fluid/platform/device/ipu/ipu_device.h index 3da13a522e19a3f6526751e48c70bdd8562d1b6c..d39feffc92655b52dae1792fab0a5ef95bb6075f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.h +++ b/paddle/fluid/platform/device/ipu/ipu_device.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc index 4506bfbf972248fd0539927c483b3e23114a6750..9e6951c37139db2bbca6a1eab7f521e850dba6db 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.cc +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -16,12 +16,10 @@ namespace paddle { namespace platform { //! Get a list of device ids from environment variable or use all. -std::vector GetSelectedIPUDevices() { - return platform::ipu::GetDeviceIds(); -} +std::vector GetSelectedIPUDevices() { return ipu::GetDeviceIds(); } //! Get the total number of IPU devices in system. -int GetIPUDeviceCount() { return platform::ipu::GetNumDevices(); } +int GetIPUDeviceCount() { return ipu::GetNumDevices(); } } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index 67012e8d4b92d8d6336f1b192a7b19828511c08e..d4a14a6d8409f9b50247f747016f5284f11037da 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -32,7 +32,7 @@ Node *mean_handler(Graph *graph, Node *node) { Node *pow_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) { + if (!op->Input("FactorTensor").empty()) { return CreateBaseOp( graph, node, "popart_pow", {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)}, @@ -161,7 +161,7 @@ Node *scale_handler(Graph *graph, Node *node) { static_cast(framework::proto::VarType::FP32)); Node *result = nullptr; - if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) { + if (!op->Input("ScaleTensor").empty()) { auto scale = GetInputVarNode("ScaleTensor", node); if (is_float_equal(bias_, 0.0)) { result = CreateBaseOp(graph, node, "popart_mul", diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index b7412000107d3157c6b5c38d7c456af3bd36aabd..b731ba532d60c743278b73754deb884c800fe4d1 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -34,7 +34,7 @@ Node *conv2d_handler(Graph *graph, Node *node) { 
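    // Note on the guard simplification above and in the handlers that follow:
    // `op->HasInput("Bias") && !op->Input("Bias").empty()` collapses to
    // `!op->Input("Bias").empty()`. This assumes every op reaching these
    // canonicalization handlers declares the optional input slot, so Input()
    // returns an empty name list rather than failing when nothing is wired to
    // it. A sketch of that assumed accessor contract (illustrative only):
    //   auto bias_vars = op->Input("Bias");  // slot exists, possibly empty
    //   if (!bias_vars.empty()) { /* a Bias tensor was provided */ }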
auto pads = std::vector{pads_.begin(), pads_.end()}; auto stride_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); auto stride = std::vector{stride_.begin(), stride_.end()}; - if (op->HasInput("Bias") && !op->Input("Bias").empty()) { + if (!op->Input("Bias").empty()) { return CreateConv( graph, node, { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc index 662660c23b4a6a357d27565a8c6b37b25db9c9be..539053f2fb67bae4652e61a52bc3254f233d3417 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc @@ -65,7 +65,7 @@ Node *topk_handler(Graph *graph, Node *node) { Node *var_x = GetInputVarNode("X", node); Node *var_k = nullptr; - if (op->HasInput("K") && !op->Input("K").empty()) { + if (!op->Input("K").empty()) { var_k = GetInputVarNode("K", node); } else { auto k = BOOST_GET_CONST(int, op->GetAttr("k")); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 296668890ebe5a0f1550e41aff4424b0f87b4f95..db429d2f6228455bd4ca1a47d117ddf2ad286e65 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -23,7 +23,7 @@ namespace { Node *fill_constant_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("ShapeTensor") && !op->Input("ShapeTensor").empty()) { + if (!op->Input("ShapeTensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("op fill_constant with ShapeTensor")); } @@ -328,7 +328,7 @@ Node *shape_handler(Graph *graph, Node *node) { Node *slice_handler(Graph *graph, Node *node) { auto *op = node->Op(); Node *starts = nullptr; - if (op->HasInput("StartsTensor") && !op->Input("StartsTensor").empty()) { + if (!op->Input("StartsTensor").empty()) { starts = GetInputVarNode("StartsTensor", node); } else { auto starts_ = BOOST_GET_CONST(std::vector, op->GetAttr("starts")); @@ -338,7 +338,7 @@ Node *slice_handler(Graph *graph, Node *node) { starts = starts->outputs[0]; } Node *ends = nullptr; - if (op->HasInput("EndsTensor") && !op->Input("EndsTensor").empty()) { + if (!op->Input("EndsTensor").empty()) { ends = GetInputVarNode("EndsTensor", node); } else { auto ends_ = BOOST_GET_CONST(std::vector, op->GetAttr("ends")); @@ -384,14 +384,13 @@ Node *slice_handler(Graph *graph, Node *node) { Node *expand_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("expand_times_tensor") && - !op->Input("expand_times_tensor").empty()) { + if (!op->Input("expand_times_tensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("Expand op with expand_times_tensor")); } Node *expand_times = nullptr; - if (op->HasInput("ExpandTimes") && !op->Input("ExpandTimes").empty()) { + if (!op->Input("ExpandTimes").empty()) { // cast to int64 expand_times = CreateCast(graph, node, {GetInputVarNode("ExpandTimes", node)}, {}, diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 9ef4439f39b6a553e83747452b32d6dd6a2e999b..a4584f54637a615d79995e1e27303128b4202b5e 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -5,6 +5,6 @@ IF(WITH_MLU) cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info 
stream_callback_manager) - cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream ) + cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream eigen3) cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context) ENDIF() diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index f89c8c193ae7cf37bc7d2c3b8dc4171badb4a4b4..d292ce130eb34a3c3dfd7e5496f2fbe5112064af 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -4,7 +4,7 @@ endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index 4c85168f68dd3a5eed07bb64912dbab5f018f2ab..ae5ec8e851d688e191fd3ed086a48cde54087d1f 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -15,177 +15,36 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/device/xpu/xpu_header.h" -#include "paddle/fluid/platform/enforce.h" -#include "xpu/bkcl.h" + +#include "paddle/pten/backends/xpu/enforce_xpu.h" namespace paddle { namespace platform { // Note: XPU runtime api return int, not XPUError_t inline const char* xpuGetErrorString(int stat) { - switch (stat) { - case XPU_SUCCESS: - return "Success"; - case XPUERR_INVALID_DEVICE: - return "Invalid XPU device"; - case XPUERR_UNINIT: - return "XPU runtime not properly inited"; - case XPUERR_NOMEM: - return "Device memory not enough"; - case XPUERR_NOCPUMEM: - return "CPU memory not enough"; - case XPUERR_INVALID_PARAM: - return "Invalid parameter"; - case XPUERR_NOXPUFUNC: - return "Cannot get XPU Func"; - case XPUERR_LDSO: - return "Error loading dynamic library"; - case XPUERR_LDSYM: - return "Error loading func from dynamic library"; - case XPUERR_SIMULATOR: - return "Error from XPU Simulator"; - case XPUERR_NOSUPPORT: - return "Operation not supported"; - case XPUERR_ABNORMAL: - return "Device abnormal due to previous error"; - case XPUERR_KEXCEPTION: - return "Exception in kernel execution"; - case XPUERR_TIMEOUT: - return "Kernel execution timed out"; - case XPUERR_BUSY: - return "Resource busy"; - case XPUERR_USEAFCLOSE: - return "Use a stream after closed"; - case XPUERR_UCECC: - return "Uncorrectable ECC"; - case XPUERR_OVERHEAT: - return "Overheat"; - case XPUERR_UNEXPECT: - return "Execution error, reach unexpected control flow"; - case XPUERR_DEVRESET: - return "Device is being reset, try again later"; - case XPUERR_HWEXCEPTION: - return "Hardware module exception"; - case XPUERR_HBM_INIT: - return "Error init HBM"; - case XPUERR_DEVINIT: - return "Error init device"; - case XPUERR_PEERRESET: - return "Device is being reset, try again later"; - case XPUERR_MAXDEV: - return "Device count exceed limit"; - case XPUERR_NOIOC: - return "Unknown IOCTL command"; - case XPUERR_DMATIMEOUT: - return "DMA timed out, a reboot maybe needed"; - case XPUERR_DMAABORT: - return "DMA aborted due to error, possibly wrong address or hardware " - "state"; - case XPUERR_MCUUNINIT: - return "Firmware not initialized"; - case XPUERR_OLDFW: - return 
"Firmware version too old (<15), please update."; - case XPUERR_PCIE: - return "Error in PCIE"; - case XPUERR_FAULT: - return "Error copy between kernel and user space"; - case XPUERR_INTERRUPTED: - return "Execution interrupted by user"; - default: - return "unkonwn error"; - } + return pten::backends::xpu::xpuGetErrorString(stat); } inline const char* bkclGetErrorString(BKCLResult_t stat) { - switch (stat) { - case BKCL_SUCCESS: - return "BKCL_SUCCESS"; - case BKCL_INVALID_ARGUMENT: - return "BKCL_INVALID_ARGUMENT"; - case BKCL_RUNTIME_ERROR: - return "BKCL_RUNTIME_ERROR"; - case BKCL_SYSTEM_ERROR: - return "BKCL_SYSTEM_ERROR"; - case BKCL_INTERNAL_ERROR: - return "BKCL_INTERNAL_ERROR"; - default: - return "Unknown BKCL status"; - } + return pten::backends::xpu::bkclGetErrorString(stat); } inline const char* xdnnGetErrorString(int stat) { - switch (stat) { - case xpu::Error_t::SUCCESS: - return "XDNN_SUCCESS"; - case xpu::Error_t::INVALID_PARAM: - return "XDNN_INVALID_PARAM"; - case xpu::Error_t::RUNTIME_ERROR: - return "XDNN_RUNTIME_ERROR"; - case xpu::Error_t::NO_ENOUGH_WORKSPACE: - return "XDNN_NO_ENOUGH_WORKSPACE"; - case xpu::Error_t::NOT_IMPLEMENT: - return "XDNN_NOT_IMPLEMENT"; - default: - return "Unknown XDNN status"; - } + return pten::backends::xpu::xdnnGetErrorString(stat); } inline std::string build_xpu_error_msg(int stat) { - std::string msg("XPU Error <" + std::to_string(stat) + ">, "); - return msg + xpuGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_error_msg(BKCLResult_t stat) { - std::string msg("BKCL Error, "); - return msg + bkclGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) { - return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_xdnn_error_msg(stat, msg); } -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); - -#undef DEFINE_EXTERNAL_API_TYPE - -} // namespace details - -#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __XPU_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __XPU_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index f83e3f6d0db3b3fbbb07a70ce6e9e40d4b675cf3..8764458433072061fd35f264549721a36c60e0d3 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -292,6 +292,10 @@ 
XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace())})}, {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_cross_entropy_with_logits_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_cross_entropy_with_logits", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device/xpu/xpu_header.h b/paddle/fluid/platform/device/xpu/xpu_header.h index 1177fd63742b3b4f104c6943a3e59022677f26d9..6b5c32fd511b3685291a1e7a027834be922ed872 100644 --- a/paddle/fluid/platform/device/xpu/xpu_header.h +++ b/paddle/fluid/platform/device/xpu/xpu_header.h @@ -15,42 +15,5 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_XPU -#include -#include -#include - -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" - -#include "xpu/runtime.h" -#include "xpu/runtime_ex.h" -#include "xpu/xdnn.h" - -namespace xpu = baidu::xpu::api; - -static std::map XPUAPIErrorMsg = { - {xpu::Error_t::SUCCESS, "xpu api success"}, - {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"}, - {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, - {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; - -template -class XPUTypeTrait { - public: - using Type = T; -}; - -template <> -class XPUTypeTrait { - public: - using Type = float16; -}; - -template <> -class XPUTypeTrait { - public: - using Type = bfloat16; -}; - +#include "paddle/pten/backends/xpu/xpu_header.h" #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index a8c6ee8f3b0353487f9e09c59e0df8baa01a868d..cf08f9ada6b300dd93bcb7b3dd3fb8c0ecb65f44 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -14,22 +14,14 @@ limitations under the License. */ #include #include #include "gflags/gflags.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -PADDLE_DEFINE_EXPORTED_string( - selected_xpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (XPU). If you want to use " - "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between XPU devices, use XPU_VISIBLE_DEVICES can only use" - "share-memory only."); + +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -37,101 +29,40 @@ namespace platform { /**************************** Version Management **************************/ //! 
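The `XPUTypeTrait` template deleted from `xpu_header.h` above relocates to `paddle/pten/backends/xpu/xpu_header.h`. A compile-time miniature of the idea it implements, mapping framework element types to the device API's types with an identity default (the fp16 structs below are stand-ins, not Paddle's real `float16`):

```cpp
#include <cstdint>
#include <type_traits>

struct float16 { uint16_t x; };      // stand-in for the framework type
struct device_fp16 { uint16_t x; };  // stand-in for the device API type

// Identity by default; specialized where the device expects another type.
template <typename T>
struct XPUTypeTrait { using Type = T; };

template <>
struct XPUTypeTrait<float16> { using Type = device_fp16; };

static_assert(std::is_same<XPUTypeTrait<float>::Type, float>::value,
              "plain types pass through");
static_assert(std::is_same<XPUTypeTrait<float16>::Type, device_fp16>::value,
              "fp16 is remapped");

int main() { return 0; }
```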
Get the version of XPU Driver -int GetDriverVersion() { - uint32_t driver_version_major = 0; - uint32_t driver_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_driver_version(&driver_version_major, &driver_version_minor)); - int driver_version = driver_version_major * 10 + driver_version_minor; - return driver_version; -} +int GetDriverVersion() { return pten::backends::xpu::GetDriverVersion(); } //! Get the version of XPU Runtime -int GetRuntimeVersion() { - uint32_t rumtime_version_major = 0; - uint32_t rumtime_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor)); - int runtime_version = rumtime_version_major * 10 + rumtime_version_minor; - return runtime_version; -} +int GetRuntimeVersion() { return pten::backends::xpu::GetRuntimeVersion(); } /**************************** Device Management **************************/ -static int GetDeviceCountImpl() { - const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); - if (xpu_visible_devices != nullptr) { - std::string xpu_visible_devices_str(xpu_visible_devices); - if (std::all_of(xpu_visible_devices_str.begin(), - xpu_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected."; - return 0; - } - } - - int count = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count)); - return count; -} - -int GetXPUDeviceCount() { - static auto dev_cnt = GetDeviceCountImpl(); - return dev_cnt; -} +int GetXPUDeviceCount() { return pten::backends::xpu::GetXPUDeviceCount(); } int GetXPUCurrentDeviceId() { - int dev_id; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - return dev_id; + return pten::backends::xpu::GetXPUCurrentDeviceId(); } -void SetXPUDeviceId(int id) { - PADDLE_ENFORCE_LT( - id, GetXPUDeviceCount(), - platform::errors::InvalidArgument("id must less than XPU count")); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); -} +void SetXPUDeviceId(int id) { pten::backends::xpu::SetXPUDeviceId(id); } //! Get a list of device ids from environment variable or use all. std::vector GetXPUSelectedDevices() { // use user specified XPUs in single-node multi-process mode. 
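The deleted `GetDeviceCountImpl`/`GetXPUDeviceCount` pair above shows the idiom worth keeping in mind when reading the pten replacement: the environment probe runs once, memoized through a function-local static. A sketch with the runtime query stubbed out:

```cpp
#include <algorithm>
#include <cstdlib>
#include <string>

// Probe once: an XPU_VISIBLE_DEVICES value that is all spaces means
// "no devices"; otherwise ask the runtime (stubbed here).
static int GetDeviceCountImpl() {
  if (const char* visible = std::getenv("XPU_VISIBLE_DEVICES")) {
    std::string s(visible);
    if (std::all_of(s.begin(), s.end(), [](char c) { return c == ' '; })) {
      return 0;
    }
  }
  return 1;  // stand-in for xpu_device_count(&count)
}

int GetXPUDeviceCount() {
  static int dev_cnt = GetDeviceCountImpl();  // computed exactly once
  return dev_cnt;
}

int main() { return GetXPUDeviceCount() >= 0 ? 0 : 1; }
```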
- std::vector devices; - if (!FLAGS_selected_xpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetXPUDeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; + return pten::backends::xpu::GetXPUSelectedDevices(); } /**************************** Memory Management **************************/ void MemcpySyncH2D(void* dst, const void* src, size_t count, const platform::XPUPlace& dst_place) { - platform::XPUDeviceGuard guard(dst_place.device); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + pten::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place); } void MemcpySyncD2H(void* dst, const void* src, size_t count, const platform::XPUPlace& src_place) { - platform::XPUDeviceGuard guard(src_place.device); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.GetByPlace(src_place); dev_ctx->Wait(); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + pten::backends::xpu::MemcpySyncD2H(dst, src, count, src_place, *dev_ctx); } // if src.device == dst.device and you need sync , after call this function, @@ -139,33 +70,16 @@ void MemcpySyncD2H(void* dst, const void* src, size_t count, void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, const void* src, const platform::XPUPlace& src_place, size_t count) { - int dev_id = GetXPUCurrentDeviceId(); - if (dst_place.device == dev_id && src_place.device == dev_id) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetByPlace(src_place); - PADDLE_ENFORCE_XDNN_SUCCESS( - xpu::copy(dev_ctx->x_context(), static_cast(src), - static_cast(dst), count), - "copy "); - } else { - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count)); - } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(src_place); + pten::backends::xpu::MemcpySyncD2D(dst, dst_place, src, src_place, count, + *dev_ctx); } /**************************** Others **************************/ -XPUVersion get_xpu_version(int dev_id) { - uint64_t v = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); - - if (v == K100 || v == K200) { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; - return XPU1; - } else { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; - return XPU2; - } +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id) { + return pten::backends::xpu::get_xpu_version(dev_id); } } // namespace platform diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 220bebb9e6b055319a0f642e8f711ccf8302ea43..03082e8dc50eca7e85d22a327600068099ee4567 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -13,6 +13,7 @@ limitations under the License. 
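The removed `GetXPUSelectedDevices` body documents the selection contract the pten backend now owns: a comma-separated id list from the flag wins, and an empty flag means all devices. A standalone sketch of that parse (using a stringstream split instead of `paddle::string::Split`):

```cpp
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

std::vector<int> ParseSelectedDevices(const std::string& flag, int count) {
  std::vector<int> devices;
  if (!flag.empty()) {
    std::stringstream ss(flag);
    std::string id;
    while (std::getline(ss, id, ',')) {
      devices.push_back(std::atoi(id.c_str()));  // "0,1,3" -> {0, 1, 3}
    }
  } else {
    for (int i = 0; i < count; ++i) devices.push_back(i);  // use all
  }
  return devices;
}

int main() { return ParseSelectedDevices("0,1,3", 8).size() == 3 ? 0 : 1; }
```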
*/ #ifdef PADDLE_WITH_XPU #include #include "paddle/fluid/platform/place.h" +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -50,31 +51,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, const void *src, const platform::XPUPlace &src_place, size_t count); -class XPUDeviceGuard { - public: - explicit inline XPUDeviceGuard(int dev_id) { - int prev_id = platform::GetXPUCurrentDeviceId(); - if (prev_id != dev_id) { - prev_id_ = prev_id; - platform::SetXPUDeviceId(dev_id); - } - } +using XPUDeviceGuard = pten::backends::xpu::XPUDeviceGuard; - inline ~XPUDeviceGuard() { - if (prev_id_ != -1) { - platform::SetXPUDeviceId(prev_id_); - } - } - - XPUDeviceGuard(const XPUDeviceGuard &o) = delete; - XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete; - - private: - int prev_id_{-1}; -}; - -enum XPUVersion { XPU1, XPU2 }; -XPUVersion get_xpu_version(int dev_id); +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 36be4a55d0a6f1f1e85073c35b8d2d4e9092e491..e9b494024bd699d5176226b6535758f1ff2e0c39 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -24,7 +24,7 @@ namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == XPU2) { + if (v == pten::backends::xpu::XPUVersion::XPU2) { ops = get_kl2_ops(); } @@ -74,10 +74,11 @@ bool is_in_xpu_black_list(const std::string& op_name) { return false; } -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version) { +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version) { std::vector res; - auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() + : get_kl2_ops(); if (ops.find(op_name) != ops.end()) { XPUKernelSet& type_set = ops[op_name]; for (auto& item : type_set) { @@ -87,9 +88,10 @@ std::vector get_xpu_op_support_type(const std::string& op_name, return res; } -XPUOpListMap get_xpu_op_list(XPUVersion version) { +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) { XPUOpListMap res; - auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? 
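`XPUDeviceGuard` becomes a using-alias above, but the class it now points to has the same RAII contract as the deleted one: switch devices on construction only when needed, restore on destruction, non-copyable. A self-contained miniature with the set/get calls stubbed:

```cpp
#include <cassert>

static int g_current_device = 0;  // stand-in device state
int GetCurrentDeviceId() { return g_current_device; }
void SetDeviceId(int id) { g_current_device = id; }

class DeviceGuard {
 public:
  explicit DeviceGuard(int dev_id) {
    int prev = GetCurrentDeviceId();
    if (prev != dev_id) {  // only record/switch when it differs
      prev_id_ = prev;
      SetDeviceId(dev_id);
    }
  }
  ~DeviceGuard() {
    if (prev_id_ != -1) SetDeviceId(prev_id_);  // restore on scope exit
  }
  DeviceGuard(const DeviceGuard&) = delete;
  DeviceGuard& operator=(const DeviceGuard&) = delete;

 private:
  int prev_id_{-1};
};

int main() {
  {
    DeviceGuard guard(3);
    assert(GetCurrentDeviceId() == 3);
  }
  assert(GetCurrentDeviceId() == 0);
  return 0;
}
```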
get_kl1_ops() + : get_kl2_ops(); for (auto& op : ops) { std::vector op_vartypes; for (auto& item : op.second) { diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index 3672d68492a6f5485a6c5a48751905e3f6cbbf30..4c3eb097a147ee11fb84c817614ef7c1002bddd5 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -27,9 +27,9 @@ using XPUOpListMap = bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); bool is_in_xpu_black_list(const std::string& op_name); -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version); -XPUOpListMap get_xpu_op_list(XPUVersion version); +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version); +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6ffeaf101feca795f8a330b72206dffa2d68904c..142e30d161ccadf3c3cb55eee430597e60d50624 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -21,9 +21,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/device/mlu/device_context_allocator.h" #endif -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/ipu/ipu_backend.h" -#endif #include "glog/logging.h" #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/profiler.h" @@ -230,14 +227,10 @@ CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} #ifdef PADDLE_WITH_IPU -IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) { - int id = place.GetDeviceId(); - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - device_ = ipu_backend->GetDevice(id); -} +IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} Place IPUDeviceContext::GetPlace() const { return place_; } + void IPUDeviceContext::Wait() const { /*! \brief Wait for all operations completion in the stream. 
*/ } @@ -246,52 +239,14 @@ IPUDeviceContext::~IPUDeviceContext() {} #endif #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() { - context_ = xpu::create_context(); - xpu_version_ = get_xpu_version(place_.device); -} +XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} XPUDeviceContext::~XPUDeviceContext() {} -XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { - platform::XPUDeviceGuard guard(place.device); - +XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); - - context_ = xpu::create_context(); - const int MAX_XPU_NUM = 16; - static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; - - int l3_size = 13.5 * 1024 * 1024; - if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { - l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); - } - - auto selected_xpus = GetXPUSelectedDevices(); - for (unsigned int i = 0; i < selected_xpus.size(); i++) { - if (place.device == selected_xpus[i]) { - if (l3ptrs[place.device] == nullptr) { - xpu_malloc(static_cast(&l3ptrs[place.device]), l3_size, - XPU_MEM_L3); - } - if (l3ptrs[place.device] != nullptr) { - context_->_l3_mgr.set(l3ptrs[place.device], l3_size); - VLOG(3) << "xpu place " << place.device << " set l3 size " << l3_size; - } - break; - } - } + << static_cast(place.device); } - -void XPUDeviceContext::Wait() const { - platform::SetXPUDeviceId(place_.device); - xpu_wait(context_->xpu_stream); -} - -Place XPUDeviceContext::GetPlace() const { return place_; } - -xpu::Context* XPUDeviceContext::x_context() const { return context_; } #endif #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 78c09dca5b4886cfa03f18065df393c3861eed8f..17b22907b15328ef8fe610ce126639b0a5f927e7 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -65,9 +65,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/device/ipu/device.h" -#endif #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -78,6 +75,7 @@ struct GpuDevice; #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/pten/backends/xpu/xpu_context.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -150,11 +148,9 @@ class IPUDeviceContext : public DeviceContext { Place GetPlace() const override; /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - int DeviceId() const { return device_.getId(); } private: IPUPlace place_; - platform::ipu::Device device_; }; template <> struct DefaultDeviceContextType { @@ -171,39 +167,12 @@ struct DefaultDeviceContextType; #ifdef PADDLE_WITH_XPU namespace xpu = baidu::xpu::api; -class XPUDeviceContext : public DeviceContext { +class XPUDeviceContext : public pten::XPUContext { public: XPUDeviceContext(); explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - XPUVersion xpu_version() const { return xpu_version_; } - Place GetPlace() const override; - xpu::Context* x_context() const; - - /*! \brief Wait for all operations completion in the stream. */ - void Wait() const override; - -#ifdef PADDLE_WITH_XPU_BKCL - /*! \brief Return bkcl context. 
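The L3-cache setup deleted from `XPUDeviceContext`'s constructor (now presumably `pten::XPUContext`'s job) sized a per-device scratch buffer from an env override. The sizing rule, extracted for reference; the actual `xpu_malloc`/`_l3_mgr` wiring is omitted:

```cpp
#include <cstdlib>

// Default 13.5 MiB, overridable in bytes via XPU_PADDLE_L3_SIZE.
int GetL3Size() {
  int l3_size = static_cast<int>(13.5 * 1024 * 1024);
  if (const char* env = std::getenv("XPU_PADDLE_L3_SIZE")) {
    l3_size = std::atoi(env);
  }
  return l3_size;
}

int main() { return GetL3Size() > 0 ? 0 : 1; }
```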
*/ - BKCLContext_t bkcl_context() const { return bkcl_context_; } - - /*! \brief Set bkcl context. */ - void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; } -#endif - - private: - XPUPlace place_; - XPUVersion xpu_version_; - xpu::Context* context_; -#ifdef PADDLE_WITH_XPU_BKCL - BKCLContext_t bkcl_context_; -#endif - - // Need to be the same with other DeviceContext, - // Eventhough eigen_device_ is not used in XPU - std::unique_ptr eigen_device_; - DISABLE_COPY_AND_ASSIGN(XPUDeviceContext); }; template <> diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 32f233e44e952f6c78b7bfbfd3b0c600ac50d5e4..c751ee1e69b2bdcb85de0f9657f679356796ef33 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -95,17 +95,16 @@ limitations under the License. */ // Note: these headers for simplify demangle type string #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/pten/core/enforce.h" // Note: this header for simplify HIP and CUDA type string #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/fluid/platform/flags.h" -namespace paddle { -namespace platform { +namespace pten { class ErrorSummary; -} // namespace platform -} // namespace paddle +} // namespace pten #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_int64(gpu_allocator_retry_time); @@ -114,6 +113,7 @@ DECLARE_int32(call_stack_level); namespace paddle { namespace platform { +using namespace ::pten::enforce; // NOLINT /** HELPER MACROS AND FUNCTIONS **/ @@ -121,478 +121,6 @@ namespace platform { #define PADDLE_MAY_THROW noexcept(false) #endif -// Because most enforce conditions would evaluate to true, we can use -// __builtin_expect to instruct the C++ compiler to generate code that -// always forces branch prediction of true. -// This generates faster binary code. __builtin_expect is since C++11. -// For more details, please check https://stackoverflow.com/a/43870188/724872. -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition) -#endif - -#if !defined(_WIN32) -#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) -#else -// there is no equivalent intrinsics in msvc. -#define LIKELY(condition) (condition) -#endif - -#if defined _WIN32 && defined PADDLE_ON_INFERENCE && defined PADDLE_NO_PYTHON -#define HANDLE_THE_ERROR try { -#define END_HANDLE_THE_ERROR \ - } \ - catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } -#else -#define HANDLE_THE_ERROR -#define END_HANDLE_THE_ERROR -#endif - -#ifdef __GNUC__ -inline std::string demangle(std::string name) { - int status = -4; // some arbitrary value to eliminate the compiler warning - std::unique_ptr res{ - abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; - return (status == 0) ? 
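The `LIKELY`/`UNLIKELY` macros deleted from `enforce.h` above now come in through `paddle/pten/core/enforce.h`; the removed definitions are worth restating since the rest of the header depends on them:

```cpp
#include <cstdio>

// Branch-prediction hints: most enforce conditions hold, so the compiler
// lays out code for the "no error" path. MSVC has no equivalent
// intrinsic, so there the macros degrade to the raw condition.
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
#define UNLIKELY(condition) (condition)
#define LIKELY(condition) (condition)
#endif

int main() {
  if (UNLIKELY(1 + 1 != 2)) {
    std::puts("impossible");
    return 1;
  }
  return 0;
}
```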
res.get() : name; -} -#else -inline std::string demangle(std::string name) { return name; } -#endif - -namespace details { -template -inline constexpr bool IsArithmetic() { - return std::is_arithmetic::value; -} - -template -struct TypeConverterImpl { - using Type1 = typename std::common_type::type; - using Type2 = Type1; -}; - -template -struct TypeConverterImpl { - using Type1 = T1; - using Type2 = T2; -}; - -template -struct TypeConverter { - static constexpr bool kIsArithmetic = - IsArithmetic() && IsArithmetic(); - using Type1 = typename TypeConverterImpl::Type1; - using Type2 = typename TypeConverterImpl::Type2; -}; - -template -using CommonType1 = typename std::add_lvalue_reference< - typename std::add_const::Type1>::type>::type; - -template -using CommonType2 = typename std::add_lvalue_reference< - typename std::add_const::Type2>::type>::type; - -// Here, we use SFINAE to check whether T can be converted to std::string -template -struct CanToString { - private: - using YesType = uint8_t; - using NoType = uint16_t; - - template - static YesType Check(decltype(std::cout << std::declval())) { - return 0; - } - - template - static NoType Check(...) { - return 0; - } - - public: - static constexpr bool kValue = - std::is_same(std::cout))>::value; -}; - -template -struct BinaryCompareMessageConverter { - template - static std::string Convert(const char* expression, const T& value) { - return expression + std::string(":") + string::to_string(value); - } -}; - -template <> -struct BinaryCompareMessageConverter { - template - static const char* Convert(const char* expression, const T& value) { - return expression; - } -}; -} // namespace details - -template -inline std::string ReplaceComplexTypeStr(std::string str, - const std::string& type_name) { - auto demangle_type_str = demangle(typeid(T).name()); - size_t start_pos = 0; - while ((start_pos = str.find(demangle_type_str, start_pos)) != - std::string::npos) { - str.replace(start_pos, demangle_type_str.length(), type_name); - start_pos += type_name.length(); - } - return str; -} - -#define __REPLACE_COMPLEX_TYPE_STR__(__TYPENAME, __STR) \ - do { \ - __STR = paddle::platform::ReplaceComplexTypeStr<__TYPENAME>(__STR, \ - #__TYPENAME); \ - } while (0) - -inline std::string SimplifyDemangleStr(std::string str) { - // the older is important, you have to put complex types in front - __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::AttributeMap, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::Attribute, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVariableWrapperMap, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVarBaseMap, str); - __REPLACE_COMPLEX_TYPE_STR__(std::string, str); - return str; -} - -inline std::string GetCurrentTraceBackString(bool for_signal = false) { - std::ostringstream sout; - - if (!for_signal) { - sout << "\n\n--------------------------------------\n"; - sout << "C++ Traceback (most recent call last):"; - sout << "\n--------------------------------------\n"; - } -#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) - static constexpr int TRACE_STACK_LIMIT = 100; - - void* call_stack[TRACE_STACK_LIMIT]; - auto size = backtrace(call_stack, TRACE_STACK_LIMIT); - auto symbols = backtrace_symbols(call_stack, size); - Dl_info info; - int idx = 0; - // `for_signal` used to remove the stack trace introduced by - // obtaining the error stack trace when the signal error occurred, - // that is not related to the signal error self, remove it to - // avoid misleading users and developers - int 
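The deleted `CanToString` trait above decides at compile time whether `operator<<` accepts a type, so failed comparisons can print values when possible and fall back to the expression text otherwise. The same probe in the terser void_t-style detection idiom (equivalent in effect, not the original's two-overload form):

```cpp
#include <iostream>
#include <type_traits>
#include <utility>

template <typename T, typename = void>
struct CanToString : std::false_type {};

// Matches only when `std::cout << T` is a valid expression.
template <typename T>
struct CanToString<T, decltype(void(std::cout << std::declval<T>()))>
    : std::true_type {};

struct Opaque {};  // deliberately has no operator<<

static_assert(CanToString<int>::value, "int is streamable");
static_assert(!CanToString<Opaque>::value, "Opaque is not");

int main() { return 0; }
```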
end_idx = for_signal ? 2 : 0; - for (int i = size - 1; i >= end_idx; --i) { - if (dladdr(call_stack[i], &info) && info.dli_sname) { - auto demangled = demangle(info.dli_sname); - std::string path(info.dli_fname); - // C++ traceback info are from core.so - if (path.substr(path.length() - 3).compare(".so") == 0) { - sout << string::Sprintf("%-3d %s\n", idx++, - SimplifyDemangleStr(demangled)); - } - } - } - free(symbols); -#else - sout << "Not support stack backtrace yet.\n"; -#endif - return sout.str(); -} - -template -inline std::string GetErrorSumaryString(StrType&& what, const char* file, - int line) { - std::ostringstream sout; - if (FLAGS_call_stack_level > 1) { - sout << "\n----------------------\nError Message " - "Summary:\n----------------------\n"; - } - sout << string::Sprintf("%s (at %s:%d)", std::forward(what), file, - line) - << std::endl; - return sout.str(); -} - -template -inline std::string GetTraceBackString(StrType&& what, const char* file, - int line) { - if (FLAGS_call_stack_level > 1) { - // FLAGS_call_stack_level>1 means showing c++ call stack - return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line); - } else { - return GetErrorSumaryString(what, file, line); - } -} - -inline std::string SimplifyErrorTypeFormat(const std::string& str) { - std::ostringstream sout; - size_t type_end_pos = str.find(":", 0); - if (type_end_pos == std::string::npos) { - sout << str; - } else { - // Remove "Error:", add "()"" - sout << "(" << str.substr(0, type_end_pos - 5) << ")" - << str.substr(type_end_pos + 1); - } - return sout.str(); -} - -inline bool is_error(bool stat) { return !stat; } - -// Note: This Macro can only be used within enforce.h -#define __THROW_ERROR_INTERNAL__(__ERROR_SUMMARY) \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(__ERROR_SUMMARY, __FILE__, \ - __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -/** ENFORCE EXCEPTION AND MACROS **/ - -struct EnforceNotMet : public std::exception { - public: - EnforceNotMet(std::exception_ptr e, const char* file, int line) { - try { - std::rethrow_exception(e); - } catch (platform::EnforceNotMet& e) { - code_ = e.code(); - err_str_ = GetTraceBackString(e.what(), file, line); - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } catch (std::exception& e) { - err_str_ = GetTraceBackString(e.what(), file, line); - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - } - - EnforceNotMet(const std::string& str, const char* file, int line) - : err_str_(GetTraceBackString(str, file, line)) { - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - - EnforceNotMet(const ErrorSummary& error, const char* file, int line) - : code_(error.code()), - err_str_(GetTraceBackString(error.to_string(), file, line)) { - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - - const char* what() const noexcept override { - if (FLAGS_call_stack_level > 1) { - return err_str_.c_str(); - } else { - return simple_err_str_.c_str(); - } - } - - error::Code code() const { return code_; } - - const std::string& error_str() const { return err_str_; } - - const std::string& simple_error_str() const { return simple_err_str_; } - - void set_error_str(std::string str) { - if (FLAGS_call_stack_level > 1) { - err_str_ = str; - } else { - simple_err_str_ = str; - } - } - - private: - // Used to determine the final type of exception thrown - error::Code code_ = error::LEGACY; - // Complete error message - // e.g. 
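`SimplifyErrorTypeFormat`, also removed above, produces the compact message used when `FLAGS_call_stack_level <= 1` by rewriting the error-type prefix. Extracted in miniature so the transformation is visible:

```cpp
#include <iostream>
#include <sstream>
#include <string>

// "InvalidArgumentError: msg" -> "(InvalidArgument) msg"; strings without
// a type prefix pass through untouched.
std::string SimplifyErrorTypeFormat(const std::string& str) {
  std::ostringstream sout;
  size_t type_end_pos = str.find(':', 0);
  if (type_end_pos == std::string::npos) {
    sout << str;
  } else {
    // Drop the trailing "Error" (5 chars) and the colon, add parentheses.
    sout << "(" << str.substr(0, type_end_pos - 5) << ")"
         << str.substr(type_end_pos + 1);
  }
  return sout.str();
}

int main() {
  std::cout << SimplifyErrorTypeFormat("InvalidArgumentError: bad dims");
  return 0;  // prints "(InvalidArgument) bad dims"
}
```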
InvalidArgumentError: *** - std::string err_str_; - // Simple errror message used when no C++ stack and python compile stack - // e.g. (InvalidArgument) *** - std::string simple_err_str_; -}; - -#define PADDLE_THROW(...) \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet( \ - ::paddle::platform::ErrorSummary(__VA_ARGS__), __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -#if defined(__CUDA_ARCH__) -// For cuda, the assertions can affect performance and it is therefore -// recommended to disable them in production code -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", __FILE__, \ - __LINE__, #_IS_NOT_ERROR, ##__VA_ARGS__); \ - asm("trap;"); \ - } \ - } while (0) -#elif defined(__HIPCC__) -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", __FILE__, \ - __LINE__, #_IS_NOT_ERROR, ##__VA_ARGS__); \ - abort(); \ - } \ - } while (0) -#else -#define PADDLE_ENFORCE(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ - __THROW_ERROR_INTERNAL__(::paddle::platform::ErrorSummary(__VA_ARGS__)); \ - } \ - } while (0) -#endif - -/* - * Some enforce helpers here, usage: - * int a = 1; - * int b = 2; - * PADDLE_ENFORCE_EQ(a, b); - * - * will raise an expression described as follows: - * "Expected input a == b, but received a(1) != b(2)." - * with detailed stack information. - * - * extra messages is also supported, for example: - * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) - */ - -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ - do { \ - if (UNLIKELY(nullptr == (__VAL))) { \ - auto __summary__ = ::paddle::platform::ErrorSummary(__VA_ARGS__); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: " #__VAL " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ - } \ - } while (0) - -#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) 
\ - do { \ - auto __val1 = (__VAL1); \ - auto __val2 = (__VAL2); \ - using __TYPE1__ = decltype(__val1); \ - using __TYPE2__ = decltype(__val2); \ - using __COMMON_TYPE1__ = \ - ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ - using __COMMON_TYPE2__ = \ - ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ - bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ - static_cast<__COMMON_TYPE2__>(__val2)); \ - if (UNLIKELY(!__is_not_error)) { \ - auto __summary__ = ::paddle::platform::ErrorSummary(__VA_ARGS__); \ - constexpr bool __kCanToString__ = \ - ::paddle::platform::details::CanToString<__TYPE1__>::kValue && \ - ::paddle::platform::details::CanToString<__TYPE2__>::kValue; \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: Expected %s " #__CMP \ - " %s, but received %s " #__INV_CMP " %s.]", \ - __summary__.error_message(), #__VAL1, #__VAL2, \ - ::paddle::platform::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL1, __val1), \ - ::paddle::platform::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL2, __val2)); \ - __THROW_ERROR_INTERNAL__( \ - ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) - -/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/ - -/* - * Summary: This macro is used to get Variable or internal type - * data (such as LoDTensor or SelectedRows) of the Input and - * Output in op, generally used when call scope.FindVar(Input/ - * Output("Name")) or ctx.Input(). - * Firstly this macro check whether the obtained pointer is null, - * and then return data if it is not null. - * - * Note: This macro is only suitable for specific scenarios and - * does not intended to be widely used. If it cannot meet the - * requirements, please use other PADDLE_ENFORCE** check macro. - * - * Parameters: - *     __PTR: pointer - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type - *   - * Return: The data pointed to by the pointer. - * - * Examples: - * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); - */ -#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ - (([&]() -> std::add_lvalue_reference::type { \ - auto* __ptr = (__PTR); \ - if (UNLIKELY(nullptr == __ptr)) { \ - auto __summary__ = paddle::platform::errors::NotFound( \ - "Unable to get %s data of %s %s in operator %s. " \ - "Possible reasons are:\n" \ - " 1. The %s is not the %s of operator %s;\n" \ - " 2. The %s has no corresponding variable passed in;\n" \ - " 3. 
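The `__PADDLE_BINARY_COMPARE` machinery above, now inherited from pten, is easier to audit against a stripped-down model: evaluate both operands once, compare, and on failure throw with both the expression text and the runtime values. This sketch omits the common-type promotion and streamability fallback the real macro performs:

```cpp
#include <sstream>
#include <stdexcept>

// Evaluate each operand exactly once, compare, and report both the
// expression text and the runtime values on failure.
#define MINI_ENFORCE_EQ(a, b)                                  \
  do {                                                         \
    auto va = (a);                                             \
    auto vb = (b);                                             \
    if (!(va == vb)) {                                         \
      std::ostringstream oss;                                  \
      oss << "Expected " #a " == " #b ", but received " << va  \
          << " != " << vb << ".";                              \
      throw std::runtime_error(oss.str());                     \
    }                                                          \
  } while (0)

int main() {
  try {
    MINI_ENFORCE_EQ(1 + 1, 3);
  } catch (const std::runtime_error&) {
    return 0;  // what(): "Expected 1 + 1 == 3, but received 2 != 3."
  }
  return 1;
}
```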
The %s corresponding variable is not initialized.", \ - paddle::platform::demangle( \ - typeid(std::add_lvalue_reference::type) \ - .name()), \ - __ROLE, __NAME, __OP_TYPE, __NAME, __ROLE, __OP_TYPE, __NAME, \ - __NAME); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: pointer " #__PTR " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ - } \ - return *__ptr; \ - })()) - -/* - * Summary: This macro is used to check whether op has specified - * Input or Output Variables. Because op's Input and Output - * checking are written similarly, so abstract this macro. - * - * Parameters: - *     __EXPR: (bool), the bool expression - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type - * - * Examples: - * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); - */ -#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ - do { \ - PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound( \ - "No %s(%s) found for %s operator.", \ - __ROLE, __NAME, __OP_TYPE)); \ - } while (0) - /* * Summary: This BOOST_GET(_**) series macros are used to call boost::get * safely. boost::get is not a completely safe api, although it will not @@ -616,6 +144,8 @@ struct EnforceNotMet : public std::exception { */ namespace details { +using namespace pten::enforce::details; // NOLINT + #define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr, \ __FuncName) \ template \ @@ -627,13 +157,12 @@ namespace details { return boost::get(input); \ } catch (boost::bad_get&) { \ HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet( \ - ::paddle::platform::errors::InvalidArgument( \ + throw ::pten::enforce::EnforceNotMet( \ + pten::errors::InvalidArgument( \ "boost::get failed, cannot get value " \ "(%s) by type %s, its type is %s.", \ - expression, \ - paddle::platform::demangle(typeid(OutputType).name()), \ - paddle::platform::demangle(input.type().name())), \ + expression, pten::enforce::demangle(typeid(OutputType).name()), \ + pten::enforce::demangle(input.type().name())), \ file, line); \ END_HANDLE_THE_ERROR \ } \ @@ -647,44 +176,43 @@ DEFINE_SAFE_BOOST_GET(InputType&&, OutputType, OutputType*, } // namespace details -#define BOOST_GET(__TYPE, __VALUE) \ - ::paddle::platform::details::SafeBoostGet<__TYPE>(__VALUE, #__VALUE, \ - __FILE__, __LINE__) -#define BOOST_GET_CONST(__TYPE, __VALUE) \ - ::paddle::platform::details::SafeBoostGetConst<__TYPE>(__VALUE, #__VALUE, \ +#define BOOST_GET(__TYPE, __VALUE) \ + paddle::platform::details::SafeBoostGet<__TYPE>(__VALUE, #__VALUE, __FILE__, \ + __LINE__) +#define BOOST_GET_CONST(__TYPE, __VALUE) \ + paddle::platform::details::SafeBoostGetConst<__TYPE>(__VALUE, #__VALUE, \ + __FILE__, __LINE__) +#define BOOST_GET_MUTABLE(__TYPE, __VALUE) \ + paddle::platform::details::SafeBoostGetMutable<__TYPE>(__VALUE, #__VALUE, \ __FILE__, __LINE__) -#define BOOST_GET_MUTABLE(__TYPE, __VALUE) \ - ::paddle::platform::details::SafeBoostGetMutable<__TYPE>(__VALUE, #__VALUE, \ - __FILE__, __LINE__) /** OTHER EXCEPTION AND ENFORCE **/ struct EOFException : public std::exception { std::string err_str_; EOFException(const char* err_msg, const char* file, int line) { - err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line); + err_str_ = paddle::string::Sprintf("%s at [%s:%d]", err_msg, file, line); } const char* what() const noexcept override { return err_str_.c_str(); } 
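`DEFINE_SAFE_BOOST_GET` above is re-pointed at `pten::enforce::EnforceNotMet` but keeps its job: convert `boost::bad_get` into a located, typed error. The essence, minus the `__FILE__`/`__LINE__` capture and name demangling:

```cpp
#include <boost/variant.hpp>
#include <stdexcept>
#include <string>

template <typename T, typename V>
const T& SafeGetConst(const V& v, const char* expr) {
  try {
    return boost::get<T>(v);
  } catch (boost::bad_get&) {
    // The real macro raises EnforceNotMet with demangled type names here.
    throw std::runtime_error(std::string("boost::get failed for ") + expr);
  }
}

#define MINI_GET_CONST(T, v) SafeGetConst<T>(v, #v)

int main() {
  boost::variant<int, std::string> v = 42;
  return MINI_GET_CONST(int, v) == 42 ? 0 : 1;
}
```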
}; -#define PADDLE_THROW_EOF() \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ - __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -#define PADDLE_THROW_BAD_ALLOC(...) \ +#define PADDLE_THROW_EOF() \ do { \ HANDLE_THE_ERROR \ - throw ::paddle::memory::allocation::BadAlloc( \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).to_string(), __FILE__, \ - __LINE__); \ + throw paddle::platform::EOFException("There is no next data.", __FILE__, \ + __LINE__); \ END_HANDLE_THE_ERROR \ } while (0) +#define PADDLE_THROW_BAD_ALLOC(...) \ + do { \ + HANDLE_THE_ERROR \ + throw ::paddle::memory::allocation::BadAlloc( \ + pten::ErrorSummary(__VA_ARGS__).to_string(), __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } while (0) + /**************************************************************************/ /**************************** NVIDIA ERROR ********************************/ #ifdef PADDLE_WITH_CUDA @@ -970,7 +498,7 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_nvidia_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -1016,7 +544,7 @@ inline void retry_sleep(unsigned milliseconds) { ++retry_count; \ } \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_nvidia_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -1176,7 +704,7 @@ DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_rocm_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -1204,7 +732,7 @@ inline void retry_sleep(unsigned millisecond) { ++retry_count; \ } \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ + auto __summary__ = pten::errors::External( \ ::paddle::platform::build_rocm_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ diff --git a/paddle/fluid/platform/error_codes.proto b/paddle/fluid/platform/error_codes.proto deleted file mode 100644 index 90ab93dd11d0ad1706f6199308ad2a6cb3ffa650..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/error_codes.proto +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto2"; -option optimize_for = LITE_RUNTIME; -package paddle.platform.error; - -enum Code { - // Legacy error. 
- // Error type string: "Error" - LEGACY = 0; - - // Client specified an invalid argument. - // Error type string: "InvalidArgumentError" - INVALID_ARGUMENT = 1; - - // Some requested entity (e.g., file or directory) was not found. - // Error type string: "NotFoundError" - NOT_FOUND = 2; - - // Operation tried to iterate past the valid input range. E.g., seeking or - // reading past end of file. - // Error type string: "OutOfRangeError" - OUT_OF_RANGE = 3; - - // Some entity that we attempted to create (e.g., file or directory) - // already exists. - // Error type string: "AlreadyExistsError" - ALREADY_EXISTS = 4; - - // Some resource has been exhausted, perhaps a per-user quota, or - // perhaps the entire file system is out of space. - // Error type string: "ResourceExhaustedError" - RESOURCE_EXHAUSTED = 5; - - // Operation was rejected because the system is not in a state - // required for the operation's execution. - // Error type string: "PreconditionNotMetError" - PRECONDITION_NOT_MET = 6; - - // The caller does not have permission to execute the specified - // operation. - // Error type string: "PermissionDeniedError" - PERMISSION_DENIED = 7; - - // Deadline expired before operation could complete. - // Error type string: "ExecutionTimeout" - EXECUTION_TIMEOUT = 8; - - // Operation is not implemented or not supported/enabled in this service. - // Error type string: "UnimpelmentedError" - UNIMPLEMENTED = 9; - - // The service is currently unavailable. This is a most likely a - // transient condition and may be corrected by retrying with - // a backoff. - // Error type string: "UnavailableError" - UNAVAILABLE = 10; - - // Fatal errors. Means some invariant expected by the underlying - // system has been broken. If you see one of these errors, - // something is very broken. - // Error type string: "FatalError" - FATAL = 11; - - // Third-party library error. - // Error type string: "ExternalError" - EXTERNAL = 12; -} diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h index 6bcd5cf39f2e0b051595de2ae8c5e41e03abb62a..30e532f0491cbc9b821014bf3d0dbfe2263cce50 100644 --- a/paddle/fluid/platform/errors.h +++ b/paddle/fluid/platform/errors.h @@ -13,76 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/error_codes.pb.h" -#include "paddle/fluid/string/printf.h" - +#include "paddle/pten/core/errors.h" namespace paddle { namespace platform { - -typedef ::paddle::platform::error::Code Code; - -class ErrorSummary { - public: - // Note(chenweihang): Final deprecated constructor - // This constructor is used to be compatible with - // current existing untyped PADDLE_ENFORCE_* - // PADDLE_ENFORCE - // Note(chenweihang): Windows openblas need this - // constructor for compiling PADDLE_ENFORCE in *.cu, - // this is a bug cause we can't remove this - // constructor now. - template - explicit ErrorSummary(Args... 
args) { - code_ = paddle::platform::error::LEGACY; - msg_ = paddle::string::Sprintf(args...); - } - - // Note(chenweihang): Only recommended constructor - // No longer supports PADDLE_ENFORCE without type or without error message - explicit ErrorSummary(Code code, std::string msg) : code_(code), msg_(msg) {} - - Code code() const { return code_; } - - const std::string& error_message() const { return msg_; } - - std::string to_string() const; - - private: - Code code_; - std::string msg_; -}; - -namespace errors { - -#define REGISTER_ERROR(FUNC, CONST, ...) \ - template \ - ::paddle::platform::ErrorSummary FUNC(Args... args) { \ - return ::paddle::platform::ErrorSummary( \ - ::paddle::platform::error::CONST, ::paddle::string::Sprintf(args...)); \ - } - -REGISTER_ERROR(InvalidArgument, INVALID_ARGUMENT) -REGISTER_ERROR(NotFound, NOT_FOUND) -REGISTER_ERROR(OutOfRange, OUT_OF_RANGE) -REGISTER_ERROR(AlreadyExists, ALREADY_EXISTS) -REGISTER_ERROR(ResourceExhausted, RESOURCE_EXHAUSTED) -REGISTER_ERROR(PreconditionNotMet, PRECONDITION_NOT_MET) -REGISTER_ERROR(PermissionDenied, PERMISSION_DENIED) -REGISTER_ERROR(ExecutionTimeout, EXECUTION_TIMEOUT) -REGISTER_ERROR(Unimplemented, UNIMPLEMENTED) -REGISTER_ERROR(Unavailable, UNAVAILABLE) -REGISTER_ERROR(Fatal, FATAL) -REGISTER_ERROR(External, EXTERNAL) - -#undef REGISTER_ERROR - -} // namespace errors -} // namespace platform -} // namespace paddle +namespace errors = ::pten::errors; +using error = ::pten::ErrorCode; +} +} diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h new file mode 100755 index 0000000000000000000000000000000000000000..05190bc4666941f8403cbb55589a53bb26aeb690 --- /dev/null +++ b/paddle/fluid/platform/profiler/event_node.h @@ -0,0 +1,207 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
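The deleted `REGISTER_ERROR` block above (reborn in `paddle/pten/core/errors.h`) stamps out one typed factory per error code. A miniature with the `Sprintf` formatting simplified to a plain string:

```cpp
#include <string>

enum class Code { INVALID_ARGUMENT, NOT_FOUND };

struct ErrorSummary {
  Code code;
  std::string msg;
};

// One macro invocation per error kind yields a typed constructor.
#define REGISTER_ERROR(FUNC, CONST)                  \
  inline ErrorSummary FUNC(const std::string& msg) { \
    return ErrorSummary{Code::CONST, msg};           \
  }

REGISTER_ERROR(InvalidArgument, INVALID_ARGUMENT)
REGISTER_ERROR(NotFound, NOT_FOUND)

int main() {
  return InvalidArgument("bad dims").code == Code::INVALID_ARGUMENT ? 0 : 1;
}
```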
*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler/output_logger.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { +namespace platform { + +class DeviceTraceEventNode { + public: + // constructor + explicit DeviceTraceEventNode(const DeviceTraceEvent& device_event) + : device_event_(device_event) {} + // destructor + ~DeviceTraceEventNode() {} + // getter + std::string name() const { return device_event_.name; } + TracerEventType type() const { return device_event_.type; } + uint64_t start_ns() const { return device_event_.start_ns; } + uint64_t end_ns() const { return device_event_.end_ns; } + uint64_t device_id() const { return device_event_.device_id; } + uint64_t context_id() const { return device_event_.context_id; } + uint64_t stream_id() const { return device_event_.stream_id; } + uint64_t duration() const { + return device_event_.end_ns - device_event_.start_ns; + } + uint32_t correlation_id() const { return device_event_.correlation_id; } + KernelEventInfo kernel_info() const { + PADDLE_ENFORCE_EQ( + device_event_.type, TracerEventType::Kernel, + platform::errors::Unavailable( + "Can not kernel_info, " + "TracerEventType in node must be TracerEventType::Kernel.")); + return device_event_.kernel_info; + } + MemcpyEventInfo memcpy_info() const { + PADDLE_ENFORCE_EQ( + device_event_.type, TracerEventType::Memcpy, + platform::errors::Unavailable( + "Can not get memcpy_info, " + "TracerEventType in node must be TracerEventType::Memcpy.")); + return device_event_.memcpy_info; + } + MemsetEventInfo memset_info() const { + PADDLE_ENFORCE_EQ( + device_event_.type, TracerEventType::Memset, + platform::errors::Unavailable( + "Can not get memset_info, " + "TracerEventType in node must be TracerEventType::Memset.")); + return device_event_.memset_info; + } + + // member function + void LogMe(BaseLogger* logger) { logger->LogDeviceTraceEventNode(*this); } + + private: + // data + DeviceTraceEvent device_event_; +}; + +class CudaRuntimeTraceEventNode { + public: + // constructor + explicit CudaRuntimeTraceEventNode(const RuntimeTraceEvent& runtime_event) + : runtime_event_(runtime_event) {} + // destructor + ~CudaRuntimeTraceEventNode(); + // getter + std::string name() const { return runtime_event_.name; } + TracerEventType type() const { return runtime_event_.type; } + uint64_t start_ns() const { return runtime_event_.start_ns; } + uint64_t end_ns() const { return runtime_event_.end_ns; } + uint64_t process_id() const { return runtime_event_.process_id; } + uint64_t thread_id() const { return runtime_event_.thread_id; } + uint64_t duration() const { + return runtime_event_.end_ns - runtime_event_.start_ns; + } + uint32_t correlation_id() const { return runtime_event_.correlation_id; } + uint32_t callback_id() const { return runtime_event_.callback_id; } + // member function + void AddDeviceTraceEventNode(DeviceTraceEventNode* node) { + device_node_ptrs_.push_back(node); + } + void LogMe(BaseLogger* logger) { logger->LogRuntimeTraceEventNode(*this); } + std::vector& GetDeviceTraceEventNodes() { + return device_node_ptrs_; + } + + private: + // data + RuntimeTraceEvent runtime_event_; + // device events called by this + std::vector device_node_ptrs_; +}; + +class HostTraceEventNode { + public: + // constructor + explicit HostTraceEventNode(const HostTraceEvent& host_event) + : host_event_(host_event) {} + + // destructor + ~HostTraceEventNode(); + + // 
getter + std::string name() const { return host_event_.name; } + TracerEventType type() const { return host_event_.type; } + uint64_t start_ns() const { return host_event_.start_ns; } + uint64_t end_ns() const { return host_event_.end_ns; } + uint64_t process_id() const { return host_event_.process_id; } + uint64_t thread_id() const { return host_event_.thread_id; } + uint64_t duration() const { + return host_event_.end_ns - host_event_.start_ns; + } + + // member function + void AddChild(HostTraceEventNode* node) { children_.push_back(node); } + void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) { + runtime_node_ptrs_.push_back(node); + } + std::vector& GetChildren() { return children_; } + std::vector& GetRuntimeTraceEventNodes() { + return runtime_node_ptrs_; + } + void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); } + + private: + // data + HostTraceEvent host_event_; + // cuda runtime events called by this + std::vector runtime_node_ptrs_; + // host events called by this + std::vector children_; +}; + +class NodeTrees { + public: + // constructor + NodeTrees(const std::list& host_events, + const std::list& runtime_events, + const std::list& device_events) { + std::vector host_event_nodes; + std::vector runtime_event_nodes; + std::vector device_event_nodes; + // encapsulate event into nodes + for (auto it = host_events.begin(); it != host_events.end(); ++it) { + host_event_nodes.push_back(new HostTraceEventNode(*it)); + } + for (auto it = runtime_events.begin(); it != runtime_events.end(); ++it) { + runtime_event_nodes.push_back(new CudaRuntimeTraceEventNode(*it)); + } + for (auto it = device_events.begin(); it != device_events.end(); ++it) { + device_event_nodes.push_back(new DeviceTraceEventNode(*it)); + } + // build tree + BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes); + } + + explicit NodeTrees( + const std::map& thread_event_trees_map) + : thread_event_trees_map_(thread_event_trees_map) {} + + // destructor + ~NodeTrees(); + + void LogMe(BaseLogger* logger); + void HandleTrees(std::function, + std::function, + std::function); + std::map GetNodeTrees() { + return thread_event_trees_map_; + } + std::map> Traverse(bool bfs) const; + + private: + std::map thread_event_trees_map_; + void BuildTrees(const std::vector&, + std::vector&, + const std::vector&); + HostTraceEventNode* BuildTreeRelationship( + std::vector host_event_nodes, + std::vector runtime_event_nodes); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h new file mode 100755 index 0000000000000000000000000000000000000000..2241cf9e49e7e8d50cd0bfda575675559577323f --- /dev/null +++ b/paddle/fluid/platform/profiler/event_python.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
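`NodeTrees::BuildTrees` itself is not in this patch (only the header is), but the structure implies host events nest by time containment, with runtime and device nodes hung off their callers via `AddCudaRuntimeNode`/`AddDeviceTraceEventNode`. A guess at the core predicate such a builder would use, offered only as a reading aid:

```cpp
#include <cstdint>

struct Interval {
  uint64_t start_ns;
  uint64_t end_ns;
};

// A child event belongs under a parent if its span lies inside the
// parent's span (assumption: that is how BuildTrees nests host events).
bool Contains(const Interval& parent, const Interval& child) {
  return parent.start_ns <= child.start_ns && child.end_ns <= parent.end_ns;
}

int main() {
  Interval op{100, 900}, kernel_launch{200, 300};
  return Contains(op, kernel_launch) ? 0 : 1;
}
```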
+
+#pragma once
+
+#include <map>
+
+#include "paddle/fluid/platform/profiler/event_node.h"
+
+namespace paddle {
+namespace platform {
+
+struct DevicePythonNode {
+  DevicePythonNode() = default;
+  ~DevicePythonNode() {}
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // device id
+  uint64_t device_id;
+  // context id
+  uint64_t context_id;
+  // stream id
+  uint64_t stream_id;
+};
+
+struct HostPythonNode {
+  HostPythonNode() = default;
+  ~HostPythonNode();
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // process id of the record
+  uint64_t process_id;
+  // thread id of the record
+  uint64_t thread_id;
+  // children node
+  std::vector<HostPythonNode*> children_node_ptrs;
+  // runtime node
+  std::vector<HostPythonNode*> runtime_node_ptrs;
+  // device node
+  std::vector<DevicePythonNode*> device_node_ptrs;
+};
+
+class ProfilerResult {
+ public:
+  ProfilerResult() : tree_(nullptr) {}
+  explicit ProfilerResult(NodeTrees* tree);
+  ~ProfilerResult();
+  std::map<uint64_t, HostPythonNode*> GetData() {
+    return thread_event_trees_map;
+  }
+  void Save(const std::string& file_name);
+
+ private:
+  std::map<uint64_t, HostPythonNode*> thread_event_trees_map;
+  NodeTrees* tree_;
+  HostPythonNode* CopyTree(HostTraceEventNode* node);
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h
new file mode 100755
index 0000000000000000000000000000000000000000..6901ed0c44479459cbe920cd906c2ef16e20844e
--- /dev/null
+++ b/paddle/fluid/platform/profiler/output_logger.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+
+namespace paddle {
+namespace platform {
+
+class DeviceTraceEventNode;       // forward declaration
+class HostTraceEventNode;         // forward declaration
+class CudaRuntimeTraceEventNode;  // forward declaration
+class NodeTrees;                  // forward declaration
+
+class BaseLogger {
+ public:
+  BaseLogger() {}
+  virtual ~BaseLogger() {}
+  virtual void LogDeviceTraceEventNode(const DeviceTraceEventNode&) {}
+  virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
+  virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
+  virtual void LogNodeTrees(const NodeTrees&) {}
+  virtual void LogMetaInfo() {}
+};
+
+}  // namespace platform
+}  // namespace paddle
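BaseLogger above is a visitor: each node's LogMe dispatches back into the logger, so a new output format only has to override the hooks it needs. A hedged sketch of a custom sink (EventCountLogger is illustrative, not part of this patch):

    #include <cstdint>

    #include "paddle/fluid/platform/profiler/output_logger.h"

    namespace paddle {
    namespace platform {

    // Illustrative only: counts nodes instead of serializing them.
    class EventCountLogger : public BaseLogger {
     public:
      void LogHostTraceEventNode(const HostTraceEventNode&) override {
        ++host_events_;
      }
      void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override {
        ++runtime_events_;
      }
      void LogDeviceTraceEventNode(const DeviceTraceEventNode&) override {
        ++device_events_;
      }
      uint64_t total() const {
        return host_events_ + runtime_events_ + device_events_;
      }

     private:
      uint64_t host_events_{0};
      uint64_t runtime_events_{0};
      uint64_t device_events_{0};
    };

    }  // namespace platform
    }  // namespace paddle

Handing such a logger to NodeTrees::LogMe, or to an individual node's LogMe, exercises exactly the double-dispatch path the classes above define.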
diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h
new file mode 100644
index 0000000000000000000000000000000000000000..e676942c4581688f6854918a3e5a1465fab8d00b
--- /dev/null
+++ b/paddle/fluid/platform/profiler/trace_event.h
@@ -0,0 +1,228 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace platform {
+
+enum class TracerEventType {
+  // Used to mark operator record
+  Operator = 0,
+  // Used to mark dataloader record
+  Dataloader = 1,
+  // Used to mark profile step record
+  ProfileStep = 2,
+  // Used to mark cuda runtime record returned by cupti
+  CudaRuntime = 3,
+  // Used to mark kernel computation record returned by cupti
+  Kernel = 4,
+  // Used to mark memcpy record returned by cupti
+  Memcpy = 5,
+  // Used to mark memset record returned by cupti
+  Memset = 6,
+  // Used to mark record defined by user
+  UserDefined = 7,
+  // A flag to denote the number of current types
+  NumTypes
+};
+
+struct KernelEventInfo {
+  // The X-dimension block size for the kernel.
+  uint32_t block_x;
+  // The Y-dimension block size for the kernel.
+  uint32_t block_y;
+  // The Z-dimension block size for the kernel.
+  uint32_t block_z;
+  // X-dimension of a grid.
+  uint32_t grid_x;
+  // Y-dimension of a grid.
+  uint32_t grid_y;
+  // Z-dimension of a grid.
+  uint32_t grid_z;
+  // The dynamic shared memory reserved for the kernel, in bytes.
+  uint32_t dynamic_shared_memory;
+  // The static shared memory allocated for the kernel, in bytes.
+  uint32_t static_shared_memory;
+  // The number of registers required for each thread executing the kernel.
+  uint32_t registers_per_thread;
+  // The amount of local memory reserved for each thread, in bytes.
+  uint32_t local_memory_per_thread;
+  // The total amount of local memory reserved for the kernel, in bytes.
+  uint32_t local_memory_total;
+  // The timestamp when the kernel is queued up in the command buffer, in ns.
+  // This timestamp is not collected by default. Use API
+  // cuptiActivityEnableLatencyTimestamps() to enable collection.
+  uint64_t queued;
+  // The timestamp when the command buffer containing the kernel launch is
+  // submitted to the GPU, in ns.
+  // This timestamp is not collected by default. Use API
+  // cuptiActivityEnableLatencyTimestamps() to enable collection.
+  uint64_t submitted;
+  // The completed timestamp for the kernel execution, in ns.
+  uint64_t completed;
+};
+
+struct MemcpyEventInfo {
+  // The number of bytes transferred by the memory copy.
+  uint64_t num_bytes;
+  // The kind of the memory copy.
+  // Each kind represents the source and destination targets of a memory copy.
+  // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind
+  std::string copy_kind;
+  // The source memory kind read by the memory copy.
+  // Each kind represents the type of the memory accessed by a memory
+  // operation/copy. Refer to CUpti_ActivityMemoryKind
+  std::string src_kind;
+  // The destination memory kind read by the memory copy.
+  std::string dst_kind;
+};
+
+struct MemsetEventInfo {
+  // The number of bytes being set by the memory set.
+  uint64_t num_bytes;
+  // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind
+  std::string memory_kind;
+  // The value being assigned to memory by the memory set.
+  uint32_t value;
+};
+
+struct HostTraceEvent {
+  HostTraceEvent() = default;
+  HostTraceEvent(const std::string& name, TracerEventType type,
+                 uint64_t start_ns, uint64_t end_ns, uint64_t process_id,
+                 uint64_t thread_id)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        process_id(process_id),
+        thread_id(thread_id) {}
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // process id of the record
+  uint64_t process_id;
+  // thread id of the record
+  uint64_t thread_id;
+};
+
+struct RuntimeTraceEvent {
+  RuntimeTraceEvent() = default;
+  RuntimeTraceEvent(const std::string& name, uint64_t start_ns,
+                    uint64_t end_ns, uint64_t process_id, uint64_t thread_id,
+                    uint32_t correlation_id, uint32_t callback_id)
+      : name(name),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        process_id(process_id),
+        thread_id(thread_id),
+        correlation_id(correlation_id),
+        callback_id(callback_id) {}
+
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type{TracerEventType::CudaRuntime};
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // process id of the record
+  uint64_t process_id;
+  // thread id of the record
+  uint64_t thread_id;
+  // correlation id, used for correlating async activities happened on device
+  uint32_t correlation_id;
+  // callback id, used to identify which cuda runtime api is called
+  uint32_t callback_id;
+};
+
+struct DeviceTraceEvent {
+  DeviceTraceEvent() = default;
+  DeviceTraceEvent(const std::string& name, TracerEventType type,
+                   uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
+                   uint64_t context_id, uint64_t stream_id,
+                   uint32_t correlation_id, const KernelEventInfo& kernel_info)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        device_id(device_id),
+        context_id(context_id),
+        stream_id(stream_id),
+        correlation_id(correlation_id),
+        kernel_info(kernel_info) {}
+  DeviceTraceEvent(const std::string& name, TracerEventType type,
+                   uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
+                   uint64_t context_id, uint64_t stream_id,
+                   uint32_t correlation_id, const MemcpyEventInfo& memcpy_info)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        device_id(device_id),
+        context_id(context_id),
+        stream_id(stream_id),
+        correlation_id(correlation_id),
+        memcpy_info(memcpy_info) {}
+  DeviceTraceEvent(const std::string& name, TracerEventType type,
+                   uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
+                   uint64_t context_id, uint64_t stream_id,
+                   uint32_t correlation_id, const MemsetEventInfo& memset_info)
+      : name(name),
+        type(type),
+        start_ns(start_ns),
+        end_ns(end_ns),
+        device_id(device_id),
+        context_id(context_id),
+        stream_id(stream_id),
+        correlation_id(correlation_id),
+        memset_info(memset_info) {}
+  // record name
+  std::string name;
+  // record type, one of TracerEventType
+  TracerEventType type;
+  // start timestamp of the record
+  uint64_t start_ns;
+  // end timestamp of the record
+  uint64_t end_ns;
+  // device id
+  uint64_t device_id;
+  // context id
+  uint64_t context_id;
+  // stream id
+  uint64_t stream_id;
+  // correlation id, used for correlating async activities happened on device
+  uint32_t correlation_id;
+  // union, specific device record type has different detail information
+  union {
+    // used for TracerEventType::Kernel
+    KernelEventInfo kernel_info;
+    // used for TracerEventType::Memcpy
+    MemcpyEventInfo memcpy_info;
+    // used for TracerEventType::Memset
+    MemsetEventInfo memset_info;
+  };
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 4feba4ab19b785491bc611b00b1749f253433b29..922b818b2363bb1b29585f31f53c71ebece8e887 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,7 +2,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
   feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper
   infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator
-  cost_model cuda_graph_with_memory_pool fleet_executor global_utils)
+  cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils)
 
 if (WITH_PSCORE)
   set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
@@ -293,6 +293,10 @@ if(WITH_PYTHON)
       target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
     endif()
 
+    if(WITH_IPU)
+      target_link_libraries(paddle_pybind paddle_ipu)
+    endif()
+
     get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(paddle_pybind ${os_dependency_modules})
     add_dependencies(paddle_pybind op_function_generator_cmd)
diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc
index 450939dd0ff8bf8d48262bc4b59e53e0123a3dae..72ee451fe7c31deeec714ba899b6cf2535edc88a 100644
--- a/paddle/fluid/pybind/bind_fleet_executor.cc
+++ b/paddle/fluid/pybind/bind_fleet_executor.cc
@@ -162,7 +162,12 @@ void BindFleetExecutor(py::module* m) {
   py::class_<DistModel>(*m, "DistModel")
       .def(py::init<const DistModelConfig&>())
       .def("init", &DistModel::Init)
-      .def("run", &DistModel::Run, py::call_guard<py::gil_scoped_release>());
+      .def("run",
+           [](DistModel& self, const std::vector<DistModelTensor>& inputs) {
+             std::vector<DistModelTensor> outputs;
+             self.Run(inputs, &outputs);
+             return outputs;
+           });
 
   py::class_<DistModelDataBuf>(*m, "DistModelDataBuf")
       .def(py::init<size_t>())
diff --git a/paddle/fluid/pybind/communicator_py.cc b/paddle/fluid/pybind/communicator_py.cc
index 07ba7061678d97f11fa6541e7cb7c304d64eb945..723d7f3197230aa5218b19bebdf97bb9a7167e75 100644
--- a/paddle/fluid/pybind/communicator_py.cc
+++ b/paddle/fluid/pybind/communicator_py.cc
@@ -23,8 +23,8 @@ limitations under the License. */
 #include "pybind11/pybind11.h"
 
 #include "paddle/fluid/operators/distributed/communicator.h"
-#include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/large_scale_kv.h"
+#include "paddle/fluid/operators/distributed/ps/service/communicator/communicator_common.h"
 
 namespace py = pybind11;
diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc
index aeb4f533f49395ed43fface1a5c11cee508837d4..73c8f362d145db078ac4c84c91372dcdd61c47af 100644
--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
@@ -29,15 +29,15 @@ limitations under the License.
*/ #include #include "paddle/fluid/distributed/common/sparse_sharding_merge.h" -#include "paddle/fluid/distributed/communicator_common.h" -#include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" -#include "paddle/fluid/distributed/service/communicator.h" -#include "paddle/fluid/distributed/service/env.h" -#include "paddle/fluid/distributed/service/graph_brpc_client.h" -#include "paddle/fluid/distributed/service/graph_py_service.h" -#include "paddle/fluid/distributed/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" +#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" namespace py = pybind11; using paddle::distributed::CommContext; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index a3f0a0c87fd803880b6ea19df3d79761bce59daf..780ef741c6aca5ca53224fcdadaf4a4b2e6a6205 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -697,10 +697,10 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT platform::DeviceContextPool::Instance().Get(src_device)->Wait(); } } - } else if (src->Var().IsType()) { - auto &src_selected_rows = src->Var().Get(); + } else if (src->Var().IsType()) { + auto &src_selected_rows = src->Var().Get(); auto *dst_selected_rows = - dst.MutableVar()->GetMutable(); + dst.MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); framework::TensorCopy(src_selected_rows.value(), dst_device, @@ -1392,7 +1392,7 @@ void BindImperative(py::module *m_ptr) { PADDLE_ENFORCE_EQ( self.Var().IsType() || - self.Var().IsType(), + self.Var().IsType(), true, platform::errors::InvalidArgument( "Type of Tensor[%s] must be LoDTensor or SelectedRows!", @@ -1423,15 +1423,14 @@ void BindImperative(py::module *m_ptr) { detach_tensor->ShareInplaceVersionCounterWith(origin_tensor); } else { const auto &origin_selected_rows = - self.Var().Get(); + self.Var().Get(); PADDLE_ENFORCE_EQ( origin_selected_rows.value().IsInitialized(), true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); auto *detach_selected_rows = - detach_var->MutableVar() - ->GetMutable(); + detach_var->MutableVar()->GetMutable(); detach_selected_rows->set_height(origin_selected_rows.height()); detach_selected_rows->set_rows(origin_selected_rows.rows()); detach_selected_rows->mutable_value()->ShareDataWith( @@ -1597,7 +1596,7 @@ void BindImperative(py::module *m_ptr) { ? 
grad_var->MutableVar() ->GetMutable() : grad_var->MutableVar() - ->GetMutable() + ->GetMutable() ->mutable_value(); if (tensor->IsInitialized()) { @@ -1613,7 +1612,7 @@ void BindImperative(py::module *m_ptr) { }) .def("_is_sparse", [](imperative::VarBase &self) { - return self.Var().IsType(); + return self.Var().IsType(); }) .def("_allreduce", [](imperative::VarBase &self, @@ -1623,7 +1622,7 @@ void BindImperative(py::module *m_ptr) { #if NCCL_VERSION_CODE >= 2212 imperative::AllReduce(self.Var(), self.MutableVar(), strategy); #else - if (!self.Var().IsType()) { + if (!self.Var().IsType()) { imperative::AllReduce(self.Var(), self.MutableVar(), strategy); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -2126,11 +2125,10 @@ void BindImperative(py::module *m_ptr) { .Get() .dims()); } else if (self.Var() - .IsType< - framework::SelectedRows>()) { + .IsType()) { return framework::vectorize( self.Var() - .Get() + .Get() .value() .dims()); } else if (self.Var() diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc index 88a43f9428b227d217074765de6d94733d71213f..0bd1e94a09cdbe979228ba0cc30149416458dbf9 100644 --- a/paddle/fluid/pybind/io.cc +++ b/paddle/fluid/pybind/io.cc @@ -49,35 +49,33 @@ void BindIO(pybind11::module *m) { return tellg; }); - m->def("save_selected_rows", - [](const paddle::framework::SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save SelectedRows.", str_file_name)); - - paddle::framework::SerializeToStream(fout, selected_rows); - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); + m->def("save_selected_rows", [](const pten::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable("Cannot open %s to save SelectedRows.", + str_file_name)); - m->def("load_selected_rows", - [](paddle::framework::SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load SelectedRows.", str_file_name)); + paddle::framework::SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); - paddle::framework::DeserializeFromStream(fin, &selected_rows); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); + m->def("load_selected_rows", [](pten::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable("Cannot open %s to load SelectedRows.", + str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); m->def("save_lod_tensor_to_memory", [](const paddle::framework::LoDTensor &tensor) -> py::bytes { @@ -93,14 +91,14 @@ void BindIO(pybind11::module *m) { }); m->def("save_selected_rows_to_memory", - [](const paddle::framework::SelectedRows &selected_rows) -> py::bytes { + [](const pten::SelectedRows &selected_rows) -> py::bytes { std::ostringstream ss; paddle::framework::SerializeToStream(ss, selected_rows); return ss.str(); }); m->def("load_selected_rows_from_memory", - 
[](paddle::framework::SelectedRows &selected_rows, + [](pten::SelectedRows &selected_rows, const std::string &selected_rows_bytes) { std::istringstream fin(selected_rows_bytes, std::ios::in | std::ios::binary); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 5587952facc530c6847a8949ae17c08c7cb09a9c..957c0b0ee6d1d09fa6b4ed78595295e5b43544f5 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -123,6 +123,7 @@ static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs) PyThreadState *tstate = nullptr; try { + platform::RecordEvent op_type_record_event("%s pybind_imperative_func"); %s framework::AttributeMap attrs; ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs); @@ -371,8 +372,8 @@ std::string GenerateOpFunctionsBody( // generate op funtcion body auto op_function_str = paddle::string::Sprintf( - OP_FUNCTION_TEMPLATE, func_name, ins_cast_str, op_type, input_args_num, - inplace_strategy_str, outs_initializer, ins_initializer, + OP_FUNCTION_TEMPLATE, func_name, op_type, ins_cast_str, op_type, + input_args_num, inplace_strategy_str, outs_initializer, ins_initializer, ins_initializer_with_null + outs_initializer_with_null + view_strategy_str, op_type, inplace_mapping_str, return_str); @@ -461,6 +462,7 @@ int main(int argc, char* argv[]) { #endif std::vector headers{"\"paddle/fluid/imperative/tracer.h\"", + "\"paddle/fluid/platform/profiler.h\"", "\"pybind11/detail/common.h\"", ""}; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 454e3b524f5f14f3aa5b780eec2eac2305a1e1ed..d3d7e5794e7b192d6aacee4adccfab554555187b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -50,6 +50,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/save_load_util.h" #include "paddle/fluid/framework/scope_pool.h" @@ -133,9 +134,10 @@ limitations under the License. 
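The io.cc bindings above now stream pten::SelectedRows through the same SerializeToStream/DeserializeFromStream helpers as before; only the bound type changed. A hedged sketch of the equivalent round trip in plain C++ (the header path is an assumption; the two-argument stream helpers are exactly the calls the bindings make):

    #include <sstream>

    #include "paddle/fluid/framework/selected_rows_utils.h"  // assumed path

    // Write a SelectedRows to an in-memory stream and read it back, mirroring
    // save_selected_rows_to_memory / load_selected_rows_from_memory above.
    void RoundTripSelectedRows(const pten::SelectedRows& src,
                               pten::SelectedRows* dst) {
      std::ostringstream os;
      paddle::framework::SerializeToStream(os, src);
      std::istringstream is(os.str());
      paddle::framework::DeserializeFromStream(is, dst);
    }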
*/ #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + #ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/ipu/ipu_backend.h" -#include "paddle/fluid/platform/ipu_info.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif #ifdef PADDLE_WITH_MLU @@ -1216,23 +1218,27 @@ PYBIND11_MODULE(core_noavx, m) { })); #endif - py::class_(m, "SelectedRows") + py::class_(m, "SelectedRows") .def("__init__", - [](SelectedRows &instance) { new (&instance) SelectedRows(); }) + [](pten::SelectedRows &instance) { + new (&instance) pten::SelectedRows(); + }) .def("__init__", - [](SelectedRows &instance, const std::vector rows, + [](pten::SelectedRows &instance, const std::vector rows, const int64_t &height) { - new (&instance) SelectedRows(rows, height); + new (&instance) pten::SelectedRows(rows, height); }) .def("get_tensor", - [](SelectedRows &self) { return self.mutable_value(); }, + [](pten::SelectedRows &self) { return self.mutable_value(); }, py::return_value_policy::reference) .def("numel", - [](SelectedRows &self) -> int64_t { return self.value().numel(); }) - .def("set_height", &SelectedRows::set_height) - .def("height", &SelectedRows::height) + [](pten::SelectedRows &self) -> int64_t { + return self.value().numel(); + }) + .def("set_height", &pten::SelectedRows::set_height) + .def("height", &pten::SelectedRows::height) .def("set_rows", - [](SelectedRows &self, std::vector rows) { + [](pten::SelectedRows &self, std::vector rows) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) self.set_rows(rows); #else @@ -1240,8 +1246,9 @@ PYBIND11_MODULE(core_noavx, m) { self.set_rows(new_rows); #endif }) - .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); }) - .def("rows", [](SelectedRows &self) { + .def("sync_index", + [](pten::SelectedRows &instance) { instance.SyncIndex(); }) + .def("rows", [](pten::SelectedRows &self) { auto rows = self.rows(); std::vector new_rows; new_rows.reserve(rows.size()); @@ -1290,8 +1297,8 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) .def("get_selected_rows", - [](Variable &self) -> SelectedRows * { - return self.GetMutable(); + [](Variable &self) -> pten::SelectedRows * { + return self.GetMutable(); }, py::return_value_policy::reference) .def("get_lod_tensor_array", @@ -1756,27 +1763,30 @@ All parameter, weight, gradient are variables in Paddle. 
.def("__repr__", string::to_string) .def("__str__", string::to_string); #ifdef PADDLE_WITH_XPU - py::enum_(m, "XPUVersion", py::arithmetic()) - .value("XPU1", platform::XPUVersion::XPU1) - .value("XPU2", platform::XPUVersion::XPU2) + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", pten::backends::xpu::XPUVersion::XPU1) + .value("XPU2", pten::backends::xpu::XPUVersion::XPU2) .export_values(); m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_version", [](int device_id) { return platform::get_xpu_version(device_id); }); - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, platform::XPUVersion version) { - return platform::get_xpu_op_support_type(op_name, version); - }); - m.def("get_xpu_device_op_list", [](platform::XPUVersion version) { + m.def( + "get_xpu_device_op_support_types", + [](const std::string &op_name, pten::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); + m.def("get_xpu_device_op_list", [](pten::backends::xpu::XPUVersion version) { return platform::get_xpu_op_list(version); }); m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); #endif diff --git a/paddle/fluid/string/piece.h b/paddle/fluid/string/piece.h index 8dda484eaac4d62b758e57ac5e81bfe68a5c60d4..09dee0a31cbc71ba946a05d55714ba1c302bf788 100644 --- a/paddle/fluid/string/piece.h +++ b/paddle/fluid/string/piece.h @@ -17,89 +17,4 @@ #include #include -namespace paddle { -namespace string { - -// Piece points into a std::string object but doesn't own the -// string. It is for efficient access to strings. Like Go's string -// type. Not that Piece doesn't mutate the underlying string, -// so it is thread-safe given that the underlying string doesn't -// change. Because Piece contains a little data members, and -// its syntax is simple as it doesn't own/manage the string, it is -// cheap to construct Pieces and pass them around. -class Piece { - public: - static const size_t npos = static_cast(-1); - - // We provide non-explicit singleton constructors so users can - // pass in a "const char*" or a "string" wherever a "Piece" - // is expected. These constructors ensure that if data_ is NULL, - // size_ is 0. - Piece(); - Piece(const char* d, size_t n); - Piece(const char* d); // NOLINT: accept C string into Piece. - Piece(const std::string& s); // NOLINT: accept C++ string into Piece. - - const char* data() const { return data_; } - size_t len() const { return size_; } - - char operator[](size_t n) const; - - // Piece doesn't own the string, so both iterator and const - // iterator are const char* indeed. - typedef const char* const_iterator; - typedef const char* iterator; - iterator begin() const { return data_; } - iterator end() const { return data_ + size_; } - - // Return a string that contains the copy of the referenced data. 
- std::string ToString() const { return std::string(data_, size_); } - - private: - const char* data_; - size_t size_; - - // Intentionally copyable -}; - -int Compare(Piece a, Piece b); - -bool operator==(Piece x, Piece y); -bool operator!=(Piece x, Piece y); -bool operator<(Piece x, Piece y); -bool operator>(Piece x, Piece y); -bool operator<=(Piece x, Piece y); -bool operator>=(Piece x, Piece y); - -bool HasPrefix(Piece s, Piece prefix); -bool HasSuffix(Piece s, Piece suffix); - -Piece SkipPrefix(Piece s, size_t n); -Piece SkipSuffix(Piece s, size_t n); - -// Skip the prefix (or suffix) if it matches with the string. -Piece TrimPrefix(Piece s, Piece prefix); -Piece TrimSuffix(Piece s, Piece suffix); - -// Returns if s contains sub. Any s except for empty s contains an -// empty sub. -bool Contains(Piece s, Piece sub); - -// Return the first occurrence of sub in s, or npos. If both s and -// sub is empty, it returns npos; otherwise, if only sub is empty, it -// returns 0. -size_t Index(Piece s, Piece sub); - -// Return the first occurrence of c in s[pos:end], or npos. -size_t Find(Piece s, char c, size_t pos); - -// Search range is [0..pos] inclusive. If pos == npos, search everything. -size_t RFind(Piece s, char c, size_t pos); - -Piece SubStr(Piece s, size_t pos, size_t n); - -// allow Piece to be logged -std::ostream& operator<<(std::ostream& o, Piece piece); - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/piece.h" diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index 696e2bb04f010dcbbd8eb930cb64d3e5c6a595ce..45fe89e8b5b14ef7afe7ccb4806b025f0a5eac39 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -19,70 +19,4 @@ #include #include "gflags/gflags.h" -#include "paddle/fluid/string/printf.h" - -DECLARE_bool(color); - -namespace paddle { - -namespace string { - -inline std::string black() { return FLAGS_color ? "\e[30m" : ""; } -inline std::string red() { return FLAGS_color ? "\e[31m" : ""; } -inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; } -inline std::string green() { return FLAGS_color ? "\e[32m" : ""; } -inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; } -inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; } -inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; } -inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; } -inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; } -inline std::string white() { return FLAGS_color ? "\e[37m" : ""; } -inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; } -inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; } -inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; } -inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; } -inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; } -inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; } - -using TextBlock = std::pair; - -struct Style { - static std::string info() { return black(); } - static std::string warn() { return b_red(); } - static std::string suc() { return green(); } - static std::string H1() { return bold() + purple(); } - static std::string H2() { return green(); } - static std::string H3() { return green(); } - static std::string detail() { return light_gray(); } -}; - -template -static void PrettyLogEndl(const std::string &style, const char *fmt, - const Args &... args) { - std::cerr << style << Sprintf(fmt, args...) 
<< reset() << std::endl; -} -template -static void PrettyLog(const std::string &style, const char *fmt, - const Args &... args) { - std::cerr << style << Sprintf(fmt, args...) << reset(); -} - -template -static void PrettyLogInfo(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::info(), fmt, args...); -} -template -static void PrettyLogDetail(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::detail(), fmt, args...); -} -template -static void PrettyLogH1(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::H1(), fmt, args...); -} -template -static void PrettyLogH2(const char *fmt, const Args &... args) { - PrettyLogEndl(Style::H2(), fmt, args...); -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/pretty_log.h" diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 66b768665b6d0b97b4ca1470020132bfc9576bbb..40cc5450f415911b9f15ef39d24d8b04914a6baf 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -12,113 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Compared with std::stringstream, there are primary purpose of -// string::Printf: -// -// 1. Type-safe printing, with why and how explained in -// http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999. -// Implementation includes -// -// https://github.com/c42f/tinyformat -// boost::format -// std::stringstream -// -// std::stringstream is not convenient enough in many cases. For example: -// -// std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n"; -// -// boost::format is the most convenient one. We can have -// -// std::cout << format("%2% %1%") % 36 % 77; -// -// or -// -// format fmter("%2% %1%"); -// fmter % 36; fmter % 77; -// std::cout << fmter.c_str(); -// -// But the overloading of % might be overkilling and it would be -// more efficient if it can write to std::cout directly. -// -// tinyformat has an interface compatible with the C-printf style, -// and it can writes to a stream or returns a std::string: -// -// std::cout << tfm::printf( -// "%s, %s %d, %.2d:%.2d\n", -// weekday, month, day, hour, min); -// -// or -// -// tfm::format(std::cout, -// "%s, %s %d, %.2d:%.2d\n", -// weekday, month, day, hour, min); -// -// 2. High-performance -- most printed strings are not too long and -// doens't need dynamic memory allocation. Many StringPrintf -// implementations doesn't enforce type-safe, but are -// high-performance, including -// -// https://developers.google.com/optimization/reference/base/stringprintf/ -// https://github.com/adobe/chromium/blob/master/base/stringprintf.h -// https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h -// -// According to -// https://github.com/c42f/tinyformat#compile-time-and-code-bloat, -// boost::format runs too slow and results in large executable binary -// files. So here we port tinyformat. - #pragma once - -#include -#include -#include -#include - -#include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat - -namespace paddle { -namespace string { - -template -void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { - tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...)); -} - -inline std::string Sprintf() { return ""; } - -template -std::string Sprintf(const Args&... 
args) { - std::ostringstream oss; - Fprintf(oss, "%s", args...); - return oss.str(); -} - -template -std::string Sprintf(const char* fmt, const Args&... args) { - std::ostringstream oss; - Fprintf(oss, fmt, args...); - return oss.str(); -} - -template -void Printf(const char* fmt, const Args&... args) { - Fprintf(std::cout, fmt, args...); -} - -inline std::string HumanReadableSize(double f_size) { - size_t i = 0; - double orig = f_size; - const std::vector units( - {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); - while (f_size >= 1024) { - f_size /= 1024; - i++; - } - if (i >= units.size()) { - return Sprintf("%fB", orig); - } - return Sprintf("%f%s", f_size, units[i]); -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/printf.h" diff --git a/paddle/fluid/string/split.h b/paddle/fluid/string/split.h index ccb96b8a9cb68f03acbca592a2149ba5001f34d2..d2a6f67ca75c15e746586ab0db97528c3fc88117 100644 --- a/paddle/fluid/string/split.h +++ b/paddle/fluid/string/split.h @@ -17,21 +17,4 @@ limitations under the License. */ #include #include -namespace paddle { -namespace string { - -static inline std::vector Split(std::string const& original, - char separator) { - std::vector results; - std::string token; - std::istringstream is(original); - while (std::getline(is, token, separator)) { - if (!token.empty()) { - results.push_back(token); - } - } - return results; -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/split.h" diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index c52b7a99a777a7eac714e6533101368e35844c21..08a715bfbc76431362fadf5376cc4647f1487ebc 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -14,219 +14,4 @@ #pragma once -#include -#include -#include -#include -#include -#include - -#include "glog/logging.h" - -namespace paddle { -namespace string { - -inline size_t count_spaces(const char* s) { - size_t count = 0; - - while (*s != 0 && isspace(*s++)) { - count++; - } - - return count; -} - -inline size_t count_nonspaces(const char* s) { - size_t count = 0; - - while (*s != 0 && !isspace(*s++)) { - count++; - } - - return count; -} - -template -void format_string_append(std::string& str, const char* fmt, // NOLINT - ARGS&&... args) { - int len = snprintf(NULL, 0, fmt, args...); - CHECK_GE(len, 0); - size_t oldlen = str.length(); - str.resize(oldlen + len + 1); - - CHECK(snprintf(&str[oldlen], (size_t)len + 1, fmt, args...) == // NOLINT - len); - str.resize(oldlen + len); -} - -template -void format_string_append(std::string& str, const std::string& fmt, // NOLINT - ARGS&&... args) { - format_string_append(str, fmt.c_str(), args...); -} - -template -std::string format_string(const char* fmt, ARGS&&... args) { - std::string str; - format_string_append(str, fmt, args...); - return str; -} - -template -std::string format_string(const std::string& fmt, ARGS&&... args) { - return format_string(fmt.c_str(), args...); -} - -// remove leading and tailing spaces -std::string trim_spaces(const std::string& str); - -// erase all spaces in str -std::string erase_spaces(const std::string& str); - -inline int str_to_float(const char* str, float* v) { - const char* head = str; - char* cursor = NULL; - int index = 0; - while (*(head += count_spaces(head)) != 0) { - v[index++] = std::strtof(head, &cursor); - if (head == cursor) { - break; - } - head = cursor; - } - return index; -} - -// checks whether the test string is a suffix of the input string. 
-bool ends_with(std::string const& input, std::string const& test); - -// split string by delim -template -std::vector split_string(const std::string& str, const std::string& delim) { - size_t pre_pos = 0; - size_t pos = 0; - std::string tmp_str; - std::vector res_list; - res_list.clear(); - if (str.empty()) { - return res_list; - } - while ((pos = str.find(delim, pre_pos)) != std::string::npos) { - tmp_str.assign(str, pre_pos, pos - pre_pos); - res_list.push_back(tmp_str); - pre_pos = pos + 1; - } - tmp_str.assign(str, pre_pos, str.length() - pre_pos); - if (!tmp_str.empty()) { - res_list.push_back(tmp_str); - } - return res_list; -} - -// split string by spaces. Leading and tailing spaces are ignored. Consecutive -// spaces are treated as one delim. -template -std::vector split_string(const std::string& str) { - std::vector list; - const char* p; - int pre_pos = 0; - int pos = 0; - std::string tmp_str; - if (str.empty()) { - return list; - } - for (p = str.c_str(); *p != 0;) { - if (!isspace(*p)) { - pos = pre_pos; - p++; - - while (*p != 0 && !isspace(*p)) { - pos++; - p++; - } - tmp_str.assign(str, pre_pos, pos - pre_pos + 1); - list.push_back(tmp_str); - pre_pos = pos + 1; - } else { - pre_pos++; - p++; - } - } - return list; -} - -template -std::string join_strings(const Container& strs, char delim) { - std::string str; - - size_t i = 0; - for (auto& elem : strs) { - if (i > 0) { - str += delim; - } - - std::stringstream ss; - ss << elem; - str += ss.str(); - ++i; - } - - return str; -} - -template -std::string join_strings(const Container& strs, const std::string& delim) { - std::string str; - - size_t i = 0; - for (auto& elem : strs) { - if (i > 0) { - str += delim; - } - - std::stringstream ss; - ss << elem; - str += ss.str(); - ++i; - } - - return str; -} - -template -std::string join_strings(const Container& strs, DelimT&& delim, - ConvertFunc&& func) { - std::stringstream ss; - size_t i = 0; - for (const auto& elem : strs) { - if (i > 0) { - ss << delim; - } - ss << func(elem); - ++i; - } - - return ss.str(); -} - -// A helper class for reading lines from file. A line buffer is maintained. It -// doesn't need to know the maximum possible length of a line. - -class LineFileReader { - public: - LineFileReader() {} - LineFileReader(LineFileReader&&) = delete; - LineFileReader(const LineFileReader&) = delete; - ~LineFileReader() { ::free(_buffer); } - char* getline(FILE* f) { return this->getdelim(f, '\n'); } - char* getdelim(FILE* f, char delim); - char* get() { return _buffer; } - size_t length() { return _length; } - - private: - char* _buffer = NULL; - size_t _buf_size = 0; - size_t _length = 0; -}; -} // end namespace string -} // end namespace paddle +#include "paddle/utils/string/string_helper.h" diff --git a/paddle/fluid/string/to_string.h b/paddle/fluid/string/to_string.h index 7b3332861e0fa3edbbb8915e3e3f068fed3b412f..72d9c0379fd3aa3f9d1ee156cd19c13eb7001efa 100644 --- a/paddle/fluid/string/to_string.h +++ b/paddle/fluid/string/to_string.h @@ -13,48 +13,4 @@ See the License for the specific language governing permissions and limitations under the License. 
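Each paddle/fluid/string header above is reduced to a forwarding shim: its body is deleted and replaced by a single include of the relocated paddle/utils/string header, so existing call sites keep compiling. A minimal sketch of an unaffected consumer (the call uses Printf as declared in the removed code above):

    // A consumer written against the old location...
    #include "paddle/fluid/string/printf.h"

    int main() {
      // ...still compiles unchanged: the old header now only forwards to
      // paddle/utils/string/printf.h.
      paddle::string::Printf("%s %d\n", "forwarded", 42);
      return 0;
    }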
*/ #pragma once -#include -#include -#include -#include -#include - -namespace paddle { -namespace string { -inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) { - s << t.name(); - return s; -} - -template ::value, int>::type = 0> -inline std::string to_string(T v) { - std::ostringstream sout; - sout << v; - return sout.str(); -} - -template ::value, int>::type = 0> -inline std::string to_string(T v) { - return std::to_string(static_cast(v)); -} - -template <> -inline std::string to_string(std::type_index t) { - return t.name(); -} - -// Faster std::string/const char* type -template <> -inline std::string to_string(std::string v) { - return v; -} - -template <> -inline std::string to_string(const char* v) { - return std::string(v); -} - -} // namespace string -} // namespace paddle +#include "paddle/utils/string/to_string.h" diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 671ed28313af9efe18e8aa8e7b525ae289446e5f..78e86c12cb4bbb10de52cc2aa46a7d0ff6ce7cd3 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -21,7 +21,7 @@ add_subdirectory(ops) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils) +set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos) get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) # keep this message for debug, remove it later if needless message(STATUS "All standard pten kernels: ${pten_kernels}") diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h index b8e7b0d75bc6cb5d8458c4e0663bc4ff1cd1a732..863adbea36aa4ddc04bea5c76959a85e3d8acfb7 100644 --- a/paddle/pten/api/include/kernel_signature.h +++ b/paddle/pten/api/include/kernel_signature.h @@ -102,8 +102,8 @@ using scale_kernel = void (*)(const DeviceContext&, using sum_kernel = void (*)(const DeviceContext&, const DenseTensor&, const std::vector&, - bool, DataType, + bool, DenseTensor*); using subtract_kernel = void (*)(const DeviceContext&, diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index d2afd703eaf2a1827143fd6b6f47c6f42941c250..e93b9be7046a3cf9592898408575c2bdb4f378c2 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -505,6 +505,12 @@ class PADDLE_API Tensor final { * in the development of new dygraph. It may be removed in the future. */ std::string name_{""}; + + /** + * Place type: Return the expected memory location if the Tensor is + * uninitialized. 
+ */ + PlaceType place_{PlaceType::kUNK}; }; } // namespace experimental diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index 1e645a68edfdfa8b09216860cb905a171a0258aa..d3088c4483427f93a47b54532cb186a71f9546f8 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -3,11 +3,11 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce) + nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) elseif (WITH_ROCM) - hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce) + hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) else() - cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils enforce) + cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) endif() cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context kernel_factory) @@ -38,7 +38,7 @@ endif() add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml - COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} + COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} --api_header_path ${api_header_file_tmp} --api_source_path ${api_source_file_tmp} @@ -51,7 +51,7 @@ add_custom_command( # generate backward api add_custom_command( OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} ${bw_api_source_file_tmp} - COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file} + COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path ${bw_api_yaml_file} --backward_header_path ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp} @@ -63,4 +63,4 @@ add_custom_command( cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch) cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch) -cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta) +cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_function_api) diff --git a/paddle/pten/api/lib/op_meta_info.cc b/paddle/pten/api/lib/op_meta_info.cc index aa2e33afb94b84ca7fb34ebb6342d792b3afec44..82d465b4c21fcac5cb593c4d246421dd3378a275 100644 --- a/paddle/pten/api/lib/op_meta_info.cc +++ b/paddle/pten/api/lib/op_meta_info.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/custom_operator.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace paddle { @@ -74,7 +74,7 @@ OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name, size_t index) { PADDLE_ENFORCE_EQ( info_vector.size(), index_, - platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The operator %s's meta info register failed. 
" "Please make sure you call marcos as order `PD_BUILD_OP`, " "`PD_BUILD_GRAD_OP`, `PD_BUILD_DOUBLE_GRAD_OP`.", @@ -88,7 +88,7 @@ OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name, size_t index) { case 2: name_ = name_ + "_grad_grad"; default: - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(pten::errors::InvalidArgument( "Not support index `%d` when construct OpMetaInfoBuilder, " "now only support `0, 1, 2`.", index_)); @@ -130,7 +130,7 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) { PADDLE_ENFORCE_EQ( index_, 0UL, - platform::errors::Unimplemented( + pten::errors::Unimplemented( "Currently, the InferDtypeFn setting of Grad Op is not supported, " "And backward Tensor `X@GRAD` will use the dtype of forward Tensor " "`X` by default.")); diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 3389dacec36a5c5515fd95c66f7a39ea27d5fc40..02fd918d799be6c226da73813efecd930b9bb56b 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -48,12 +48,12 @@ limitations under the License. */ * or the corresponding components will be re-implemented. */ #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" #include "paddle/pten/core/ddim.h" +#include "paddle/pten/core/enforce.h" namespace paddle { namespace experimental { @@ -68,7 +68,7 @@ Tensor cast(const Tensor &x, DataType out_dtype); Tensor::Tensor(std::shared_ptr tensor_impl) : impl_(std::move(tensor_impl)) { PADDLE_ENFORCE_NOT_NULL(impl_, - platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "TensorImpl with nullptr is not supported")); } @@ -78,7 +78,8 @@ Tensor::Tensor(const PlaceType &place) ConvertExtPlaceToInnerPlace(place))), std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED, framework::make_ddim({}), - pten::DataLayout::NCHW))))) {} + pten::DataLayout::NCHW))))), + place_{place} {} Tensor::Tensor(const PlaceType &place, const std::vector &shape) : impl_(std::move(std::make_shared( @@ -86,7 +87,8 @@ Tensor::Tensor(const PlaceType &place, const std::vector &shape) ConvertExtPlaceToInnerPlace(place))), std::move(pten::DenseTensorMeta(pten::DataType::UNDEFINED, framework::make_ddim(shape), - pten::DataLayout::NCHW))))) {} + pten::DataLayout::NCHW))))), + place_{place} {} /* Part 2: Dimension, DataType and DataLayout methods */ @@ -113,7 +115,7 @@ void Tensor::reshape(const std::vector &shape) { std::dynamic_pointer_cast(impl_)->set_meta( pten::DenseTensorMeta(dtype(), framework::make_ddim(shape))); } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); } } @@ -131,17 +133,23 @@ bool Tensor::is_dense_tensor() const { /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { - return ConvertInnerPlaceToExtPlace(impl_->place()); + if (!impl_->initialized()) { + return place_; + } else { + return ConvertInnerPlaceToExtPlace(impl_->place()); + } } -paddle::platform::Place Tensor::inner_place() const { return impl_->place(); } +paddle::platform::Place Tensor::inner_place() const { + return ConvertExtPlaceToInnerPlace(place()); +} bool Tensor::is_cpu() const { - return paddle::platform::is_cpu_place(impl_->place()); + return paddle::platform::is_cpu_place(inner_place()); } bool Tensor::is_cuda() const { - 
return paddle::platform::is_gpu_place(impl_->place()); + return paddle::platform::is_gpu_place(inner_place()); } /* Part 4: Data Access methods */ @@ -177,8 +185,8 @@ T *Tensor::mutable_data(const PlaceType &place) { PADDLE_ENFORCE_EQ( platform::is_same_place(inner_place, impl_->place()), true, - platform::errors::Unimplemented("Modification of tensor place through " - "mutable_data is not supported now")); + pten::errors::Unimplemented("Modification of tensor place through " + "mutable_data is not supported now")); } if (is_dense_tensor()) { return std::dynamic_pointer_cast(impl_)->mutable_data( @@ -236,7 +244,7 @@ Tensor::data() const; template T *Tensor::data() { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "It is not currently supported to directly obtain the modifiable data " "address through the tensor::data() method, please use the " "tensor::mutable_data() method.")); @@ -267,7 +275,7 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { begin_idx, end_idx)))); } else { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(pten::errors::Unimplemented( "Only support slice operation on DenseTensor now.")); } } diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index e9f5ec2d05727adde9cee1c7ad32595f914bbdde..971476a55db935af616257168b2925d1a23cb603 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -261,10 +261,10 @@ std::unique_ptr MakePtenTensorBaseFromVar( } else { return MakePtenDenseTensor(tensor); } - } else if (variable.IsType()) { + } else if (variable.IsType()) { // TODO(chenweihang): now we don't deal with row and height // by xiaowei's advice - const auto& tensor = variable.Get(); + const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.value().place(), expected_place)) { framework::Tensor tmp_tensor; paddle::framework::TensorCopySync( @@ -289,8 +289,8 @@ std::unique_ptr MakePtenTensorBaseFromVar( if (variable->template IsType()) { auto* tensor = variable->template GetMutable(); return MakePtenDenseTensor(*tensor, arg_def); - } else if (variable->template IsType()) { - auto* tensor = variable->template GetMutable(); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); // TODO(chenweihang): adapt SelectedRows by xiaowei's design, // here the row and height will lost in output! 
return MakePtenDenseTensor(tensor->value(), arg_def); @@ -389,8 +389,8 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, tensor->set_type(dtype); } - } else if (variable->IsType()) { - auto* tensor = variable->GetMutable(); + } else if (variable->IsType()) { + auto* tensor = variable->GetMutable(); auto dtype = pten::TransToProtoVarType(src->dtype()); if (!tensor->value().IsInitialized()) { diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index 3587910ff506e572ebeead963015a8c9591388b7..e9f222d642ea0438bbee1532bf746bd0324d3f4b 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -2,4 +2,12 @@ add_subdirectory(dynload) add_subdirectory(cpu) -cc_library(pten_context SRCS all_context.cc DEPS device_context) +if(WITH_XPU) + add_subdirectory(xpu) +endif() + +cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) + +if(WITH_XPU) + add_dependencies(pten_context xpu_context) +endif() diff --git a/paddle/pten/backends/cpu/CMakeLists.txt b/paddle/pten/backends/cpu/CMakeLists.txt index 62eff2dedc99c8dcc54c0f1372e3b65e36c3e9f9..965b33f3800edf9597b07ad2446637d2c505fe0f 100644 --- a/paddle/pten/backends/cpu/CMakeLists.txt +++ b/paddle/pten/backends/cpu/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_MKLDNN) # TODO(wilber): support mkldnn context. - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context mkldnn) + cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context mkldnn eigen3) else() - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context) + cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context eigen3) endif() diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index e749dfb9bd70e3594766f5399848c4114ee83ca2..efce128596b8123029787b6e4ba187c464d26cb9 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -18,16 +18,11 @@ // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. -#include "paddle/pten/core/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace pten { struct CPUContext::CPUImpl { - Eigen::DefaultDevice* device_{nullptr}; - CPUContextResource res_; - CPUPlace place_; - CPUImpl() { device_ = new Eigen::DefaultDevice(); } // Users need to manage external resources. 
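The hunks around this point change CPUContext::CPUImpl so the Eigen device is deleted only when it was created internally, that is, when res_.device is null. A hedged sketch of the externally managed path (the CPUContextResource field name is inferred from the destructor check below; the header path is an assumption):

    #include "paddle/pten/backends/cpu/cpu_context.h"  // assumed header path
    #include "unsupported/Eigen/CXX11/Tensor"

    void UseExternallyOwnedDevice() {
      Eigen::DefaultDevice device;   // caller-owned resource
      pten::CPUContextResource res;
      res.device = &device;          // non-null: CPUImpl must not delete it
      pten::CPUContext ctx(res);     // uses the caller's Eigen device
      // ... run CPU work through ctx ...
    }  // ctx is destroyed first, then device, matching the ownership rule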
@@ -36,7 +31,7 @@ struct CPUContext::CPUImpl {
   }
 
   ~CPUImpl() {
-    if (res_.device == nullptr) {
+    if (res_.device == nullptr && device_ != nullptr) {
       delete device_;
       device_ = nullptr;
     }
@@ -56,27 +51,28 @@ struct CPUContext::CPUImpl {
   }
 
   Place GetPlace() const { return place_; }
+
+  Eigen::DefaultDevice* device_{nullptr};
+  CPUContextResource res_;
+  CPUPlace place_;
 };
 
-CPUContext::CPUContext() : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext() : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>();
 }
 
-CPUContext::CPUContext(const CPUContext& other)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(const CPUContext& other) : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>();
   cpu_impl_->SetEigenDevice(other.eigen_device());
 }
 
-CPUContext::CPUContext(CPUContext&& other)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(CPUContext&& other) : DeviceContext() {
   cpu_impl_ = std::move(other.cpu_impl_);
 }
 
 CPUContext::~CPUContext() = default;
 
-CPUContext::CPUContext(const CPUContextResource& ctx_res)
-    : DeviceContext(), cpu_impl_(nullptr) {
+CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() {
   cpu_impl_ = std::make_unique<CPUImpl>(ctx_res);
 }
diff --git a/paddle/pten/backends/xpu/CMakeLists.txt b/paddle/pten/backends/xpu/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..65341dd206fd30c318eb72cb74c4ad3ac4ae212b
--- /dev/null
+++ b/paddle/pten/backends/xpu/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place)
+cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info)
diff --git a/paddle/pten/backends/xpu/enforce_xpu.h b/paddle/pten/backends/xpu/enforce_xpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..38aeff198d44bf98dd44edb640f9f46a6d8bd123
--- /dev/null
+++ b/paddle/pten/backends/xpu/enforce_xpu.h
@@ -0,0 +1,194 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
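The new enforce_xpu.h below maps raw XPU, BKCL, and XDNN status codes to readable strings. A hedged sketch of how a call site might surface them through the existing enforce machinery (CheckXpuStatus is illustrative, not part of this patch):

    #include "paddle/pten/backends/xpu/enforce_xpu.h"

    inline void CheckXpuStatus(int stat) {
      if (stat != XPU_SUCCESS) {
        // build_xpu_error_msg(stat) yields e.g. "XPU Error <N>, <reason>".
        PADDLE_THROW(paddle::platform::errors::External(
            "%s", pten::backends::xpu::build_xpu_error_msg(stat)));
      }
    }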
+
+#pragma once
+
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "xpu/bkcl.h"
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+namespace backends {
+namespace xpu {
+
+// Note: XPU runtime APIs return int, not XPUError_t
+inline const char* xpuGetErrorString(int stat) {
+  switch (stat) {
+    case XPU_SUCCESS:
+      return "Success";
+    case XPUERR_INVALID_DEVICE:
+      return "Invalid XPU device";
+    case XPUERR_UNINIT:
+      return "XPU runtime not properly inited";
+    case XPUERR_NOMEM:
+      return "Device memory not enough";
+    case XPUERR_NOCPUMEM:
+      return "CPU memory not enough";
+    case XPUERR_INVALID_PARAM:
+      return "Invalid parameter";
+    case XPUERR_NOXPUFUNC:
+      return "Cannot get XPU Func";
+    case XPUERR_LDSO:
+      return "Error loading dynamic library";
+    case XPUERR_LDSYM:
+      return "Error loading func from dynamic library";
+    case XPUERR_SIMULATOR:
+      return "Error from XPU Simulator";
+    case XPUERR_NOSUPPORT:
+      return "Operation not supported";
+    case XPUERR_ABNORMAL:
+      return "Device abnormal due to previous error";
+    case XPUERR_KEXCEPTION:
+      return "Exception in kernel execution";
+    case XPUERR_TIMEOUT:
+      return "Kernel execution timed out";
+    case XPUERR_BUSY:
+      return "Resource busy";
+    case XPUERR_USEAFCLOSE:
+      return "Use a stream after closed";
+    case XPUERR_UCECC:
+      return "Uncorrectable ECC";
+    case XPUERR_OVERHEAT:
+      return "Overheat";
+    case XPUERR_UNEXPECT:
+      return "Execution error, reach unexpected control flow";
+    case XPUERR_DEVRESET:
+      return "Device is being reset, try again later";
+    case XPUERR_HWEXCEPTION:
+      return "Hardware module exception";
+    case XPUERR_HBM_INIT:
+      return "Error init HBM";
+    case XPUERR_DEVINIT:
+      return "Error init device";
+    case XPUERR_PEERRESET:
+      return "Device is being reset, try again later";
+    case XPUERR_MAXDEV:
+      return "Device count exceed limit";
+    case XPUERR_NOIOC:
+      return "Unknown IOCTL command";
+    case XPUERR_DMATIMEOUT:
+      return "DMA timed out, a reboot maybe needed";
+    case XPUERR_DMAABORT:
+      return "DMA aborted due to error, possibly wrong address or hardware "
+             "state";
+    case XPUERR_MCUUNINIT:
+      return "Firmware not initialized";
+    case XPUERR_OLDFW:
+      return "Firmware version too old (<15), please update.";
+    case XPUERR_PCIE:
+      return "Error in PCIE";
+    case XPUERR_FAULT:
+      return "Error copy between kernel and user space";
+    case XPUERR_INTERRUPTED:
+      return "Execution interrupted by user";
+    default:
+      return "unknown error";
+  }
+}
+
+inline const char* bkclGetErrorString(BKCLResult_t stat) {
+  switch (stat) {
+    case BKCL_SUCCESS:
+      return "BKCL_SUCCESS";
+    case BKCL_INVALID_ARGUMENT:
+      return "BKCL_INVALID_ARGUMENT";
+    case BKCL_RUNTIME_ERROR:
+      return "BKCL_RUNTIME_ERROR";
+    case BKCL_SYSTEM_ERROR:
+      return "BKCL_SYSTEM_ERROR";
+    case BKCL_INTERNAL_ERROR:
+      return "BKCL_INTERNAL_ERROR";
+    default:
+      return "Unknown BKCL status";
+  }
+}
+
+inline const char* xdnnGetErrorString(int stat) {
+  switch (stat) {
+    case baidu::xpu::api::Error_t::SUCCESS:
+      return "XDNN_SUCCESS";
+    case baidu::xpu::api::Error_t::INVALID_PARAM:
+      return "XDNN_INVALID_PARAM";
+    case baidu::xpu::api::Error_t::RUNTIME_ERROR:
+      return "XDNN_RUNTIME_ERROR";
+    case baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE:
+      return "XDNN_NO_ENOUGH_WORKSPACE";
+    case baidu::xpu::api::Error_t::NOT_IMPLEMENT:
+      return "XDNN_NOT_IMPLEMENT";
+    default:
+      return "Unknown XDNN status";
+  }
+}
+
+inline std::string build_xpu_error_msg(int stat) {
+  std::string msg("XPU Error <" + std::to_string(stat) + ">, ");
+  return msg + xpuGetErrorString(stat) + " ";
+}
+
+inline std::string build_xpu_error_msg(BKCLResult_t stat) {
+  std::string msg("BKCL Error, ");
+  return msg + bkclGetErrorString(stat) + " ";
+}
+
+inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) {
+  return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " ";
+}
+
+namespace details {
+
+template <typename T>
+struct ExternalApiType {};
+
+#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \
+  template <>                                         \
+  struct ExternalApiType<type> {                      \
+    using Type = type;                                \
+    static constexpr Type kSuccess = success_value;   \
+  }
+
+DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS);
+DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS);
+
+#undef DEFINE_EXTERNAL_API_TYPE
+
+}  // namespace details
+
+#define PADDLE_ENFORCE_XPU_SUCCESS(COND)                           \
+  do {                                                             \
+    auto __cond__ = (COND);                                        \
+    using __XPU_STATUS_TYPE__ = decltype(__cond__);                \
+    constexpr auto __success_type__ =                              \
+        ::pten::backends::xpu::details::ExternalApiType<           \
+            __XPU_STATUS_TYPE__>::kSuccess;                        \
+    if (UNLIKELY(__cond__ != __success_type__)) {                  \
+      auto __summary__ = paddle::platform::errors::External(       \
+          ::pten::backends::xpu::build_xpu_error_msg(__cond__));   \
+      __THROW_ERROR_INTERNAL__(__summary__);                       \
+    }                                                              \
+  } while (0)
+
+#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG)                             \
+  do {                                                                     \
+    auto __cond__ = (COND);                                                \
+    if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) {         \
+      auto __summary__ = paddle::platform::errors::External(               \
+          ::pten::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \
+      __THROW_ERROR_INTERNAL__(__summary__);                               \
+    }                                                                      \
+  } while (0)
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/forwards.h b/paddle/pten/backends/xpu/forwards.h
new file mode 100644
index 0000000000000000000000000000000000000000..805a74865b6d8c62019f593160c19cc661962b01
--- /dev/null
+++ b/paddle/pten/backends/xpu/forwards.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// Forward-declares.
+#pragma once
+
+// Forward declaration of xpu context.
+namespace baidu {
+namespace xpu {
+namespace api {
+
+struct Context;
+typedef void* BKCLContext_t;
+
+}  // namespace api
+}  // namespace xpu
+}  // namespace baidu
diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af4478662a53b8b657bab02e21eb9282fd4189ac
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_context.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/backends/xpu/xpu_context.h"
+#include <memory>
+#include "paddle/pten/api/ext/exception.h"
+
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
+
+namespace xpu = baidu::xpu::api;
+
+namespace pten {
+
+struct XPUContext::XPUImpl {
+  void SetL3Cache() {
+    const int MAX_XPU_NUM = 16;
+    static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
+
+    int l3_size = 13.5 * 1024 * 1024;
+    if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
+      l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
+    }
+
+    auto selected_xpus = backends::xpu::GetXPUSelectedDevices();
+    for (unsigned int i = 0; i < selected_xpus.size(); i++) {
+      if (place_.GetDeviceId() == selected_xpus[i]) {
+        if (l3ptrs[place_.GetDeviceId()] == nullptr) {
+          xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
+                     l3_size,
+                     XPU_MEM_L3);
+        }
+        if (l3ptrs[place_.GetDeviceId()] != nullptr) {
+          context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size);
+          VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size "
+                  << l3_size;
+        }
+        break;
+      }
+    }
+  }
+
+  XPUImpl() {
+    context_ = xpu::create_context();
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+  }
+
+  explicit XPUImpl(XPUPlace place) : place_(place) {
+    backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
+
+    LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
+                            << static_cast<int>(place_.device);
+
+    context_ = xpu::create_context();
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+    SetL3Cache();
+  }
+
+  // Users need to manage external resources.
+  explicit XPUImpl(const XPUContextResource& ctx_res,
+                   const XPUPlace& place = XPUPlace(0))
+      : res_(ctx_res), place_(place) {
+    context_ = res_.context;
+    xpu_version_ = backends::xpu::get_xpu_version(place_.device);
+    SetL3Cache();
+  }
+
+  ~XPUImpl() {
+    if (res_.context == nullptr && context_ != nullptr) {
+      xpu::destroy_context(context_);
+      context_ = nullptr;
+    }
+  }
+
+  Place GetPlace() const { return place_; }
+
+  backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; }
+
+  xpu::Context* GetXContext() const {
+    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
+    return context_;
+  }
+
+  xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; }
+
+  void Wait() const {
+    backends::xpu::SetXPUDeviceId(place_.GetDeviceId());
+    PD_CHECK(context_ != nullptr, "the xpu context is nullptr.");
+    xpu_wait(context_->xpu_stream);
+  }
+
+  void SetXContext(xpu::Context* context) {
+    if (context == nullptr) {
+      return;
+    }
+    res_.context = context;
+    context_ = context;
+  }
+
+  void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
+
+  XPUContextResource res_;
+  XPUPlace place_;
+  backends::xpu::XPUVersion xpu_version_;
+  xpu::Context* context_{nullptr};
+  // NOTE: Distributed communicator; the distributed framework manages its
+  // resources, XPUContext only holds references.
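+  // (For example, a collective-training component would create the BKCL
+  // communicator itself and inject it through SetBkclContext(); note that
+  // ~XPUImpl above never destroys bkcl_context_.)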
+  xpu::BKCLContext_t bkcl_context_{nullptr};
+};
+
+XPUContext::XPUContext() : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>();
+}
+
+XPUContext::XPUContext(const XPUPlace& place) {
+  impl_ = std::make_unique<XPUImpl>(place);
+}
+
+XPUContext::XPUContext(const XPUContext& other) : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>();
+  impl_->SetXContext(other.x_context());
+  impl_->SetBkclContext(other.bkcl_context());
+}
+
+XPUContext::XPUContext(XPUContext&& other) : DeviceContext() {
+  impl_ = std::move(other.impl_);
+}
+
+XPUContext::~XPUContext() = default;
+
+XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() {
+  impl_ = std::make_unique<XPUImpl>(ctx_res);
+}
+
+Place XPUContext::GetPlace() const { return impl_->GetPlace(); }
+
+backends::xpu::XPUVersion XPUContext::xpu_version() const {
+  return impl_->GetXpuVersion();
+}
+
+xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); }
+
+xpu::BKCLContext_t XPUContext::bkcl_context() const {
+  return impl_->GetBkclContext();
+}
+
+void XPUContext::Wait() const { impl_->Wait(); }
+
+void XPUContext::set_x_context(xpu::Context* context) {
+  impl_->SetXContext(context);
+}
+
+void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) {
+  impl_->SetBkclContext(context);
+}
+
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h
index 94d2a1532f6365bdb4e916adc54a32f3b5f492f3..4ae5786211dd21718a0e72d53f742fd6ae599170 100644
--- a/paddle/pten/backends/xpu/xpu_context.h
+++ b/paddle/pten/backends/xpu/xpu_context.h
@@ -14,13 +14,60 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_XPU
+#include <memory>
+#include "paddle/pten/backends/xpu/forwards.h"
+#include "paddle/pten/common/place.h"
+#include "paddle/pten/core/device_context.h"
 
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "paddle/pten/backends/xpu/xpu_info.h"
+
+namespace xpu = baidu::xpu::api;
 
 namespace pten {
 
-using XPUContext = paddle::platform::XPUDeviceContext;
-}  // namespace pten
-#endif  // PADDLE_WITH_XPU
+struct XPUContextResource {
+  xpu::Context* context{nullptr};
+};
+
+class XPUContext : public DeviceContext {
+ public:
+  // NOTE: DeviceContext holds resources. Used in training scenarios.
+  XPUContext();
+
+  explicit XPUContext(const XPUPlace&);
+
+  // NOTE: Shares the same underlying resources; please make sure the
+  // resources are not released early.
+  XPUContext(const XPUContext&);
+
+  XPUContext(XPUContext&&);
+
+  virtual ~XPUContext();
+
+  Place GetPlace() const override;
+
+  backends::xpu::XPUVersion xpu_version() const;
+
+  xpu::Context* x_context() const;
+
+  // Return bkcl context.
+  xpu::BKCLContext_t bkcl_context() const;
+
+  // Wait for completion of all operations in the stream.
+  void Wait() const override;
+
+ public:
+  // NOTE: External users manage the resources. Used in inference scenarios.
+  explicit XPUContext(const XPUContextResource&);
+
+  void set_x_context(xpu::Context*);
+
+  void set_bkcl_context(xpu::BKCLContext_t context);
+
+ private:
+  struct XPUImpl;
+  std::unique_ptr<XPUImpl> impl_;
+};
+
+}  // namespace pten
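A sketch of the two ownership modes this class supports (illustration only, not part of the patch; `external_context` stands for an xpu::Context created elsewhere):

  // Training: the context creates and owns its xpu::Context.
  pten::XPUContext owned(pten::XPUPlace(0));

  // Inference: the caller supplies the xpu::Context via XPUContextResource;
  // XPUContext borrows it and never destroys it (see ~XPUImpl above).
  pten::XPUContextResource res;
  res.context = external_context;
  pten::XPUContext borrowed(res);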
diff --git a/paddle/pten/backends/xpu/xpu_header.h b/paddle/pten/backends/xpu/xpu_header.h
new file mode 100644
index 0000000000000000000000000000000000000000..99e4a06720f22b2993b395ab4ce7ec9585bf3ea2
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_header.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/float16.h"
+
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
+
+namespace xpu = baidu::xpu::api;
+
+static std::map<int, std::string> XPUAPIErrorMsg = {
+    {xpu::Error_t::SUCCESS, "xpu api success"},
+    {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"},
+    {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
+    {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
+
+template <typename T>
+class XPUTypeTrait {
+ public:
+  using Type = T;
+};
+
+template <>
+class XPUTypeTrait<pten::dtype::float16> {
+ public:
+  using Type = float16;
+};
+
+template <>
+class XPUTypeTrait<pten::dtype::bfloat16> {
+ public:
+  using Type = bfloat16;
+};
+
+#endif
diff --git a/paddle/pten/backends/xpu/xpu_info.cc b/paddle/pten/backends/xpu/xpu_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01d23be848bde82445498bab23ff56ce971660f8
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_info.cc
@@ -0,0 +1,199 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/pten/backends/xpu/xpu_info.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+
+#include "paddle/pten/backends/xpu/enforce_xpu.h"
+#include "paddle/pten/backends/xpu/xpu_context.h"
+#include "paddle/pten/backends/xpu/xpu_header.h"
+#include "paddle/pten/common/place.h"
+
+// TODO(wilber): The pten computing library requires a component to manage
+// flags.
+#include "paddle/fluid/platform/flags.h"
+
+PADDLE_DEFINE_EXPORTED_string(
+    selected_xpus,
+    "",
+    "A list of device ids separated by comma, like: 0,1,2,3. "
+    "This option is useful when doing multi process training and "
+    "each process has only one device (XPU). If you want to use "
+    "all visible devices, set this to an empty string. NOTE: the "
+    "reason for doing this is that we want to use P2P communication "
+    "between XPU devices, while XPU_VISIBLE_DEVICES only supports "
+    "shared memory.");
+
+namespace pten {
+class XPUContext;
+
+namespace backends {
+namespace xpu {
+
+/**************************** Version Management **************************/
+
+//! Get the version of XPU Driver
+int GetDriverVersion() {
+  uint32_t driver_version_major = 0;
+  uint32_t driver_version_minor = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_get_driver_version(&driver_version_major, &driver_version_minor));
+  int driver_version = driver_version_major * 10 + driver_version_minor;
+  return driver_version;
+}
+
+//! Get the version of XPU Runtime
+int GetRuntimeVersion() {
+  uint32_t runtime_version_major = 0;
+  uint32_t runtime_version_minor = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_get_runtime_version(&runtime_version_major, &runtime_version_minor));
+  int runtime_version = runtime_version_major * 10 + runtime_version_minor;
+  return runtime_version;
+}
+
+/**************************** Device Management **************************/
+
+static int GetDeviceCountImpl() {
+  const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES");
+  if (xpu_visible_devices != nullptr) {
+    std::string xpu_visible_devices_str(xpu_visible_devices);
+    if (std::all_of(xpu_visible_devices_str.begin(),
+                    xpu_visible_devices_str.end(),
+                    [](char ch) { return ch == ' '; })) {
+      VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected.";
+      return 0;
+    }
+  }
+
+  int count = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count));
+  return count;
+}
+
+int GetXPUDeviceCount() {
+  static auto dev_cnt = GetDeviceCountImpl();
+  return dev_cnt;
+}
+
+int GetXPUCurrentDeviceId() {
+  int dev_id;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id));
+  if (dev_id >= 64) {
+    // if dev_id >= 64, the device is a simulator device; subtract 64 to get
+    // the real dev_id
+    dev_id -= 64;
+  }
+  return dev_id;
+}
+
+void SetXPUDeviceId(int id) {
+  PADDLE_ENFORCE_LT(
+      id,
+      GetXPUDeviceCount(),
+      paddle::platform::errors::InvalidArgument(
+          "id must be less than XPU count"));
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id));
+}
+
+static inline std::vector<std::string> Split(std::string const& original,
+                                             char separator) {
+  std::vector<std::string> results;
+  std::string token;
+  std::istringstream is(original);
+  while (std::getline(is, token, separator)) {
+    if (!token.empty()) {
+      results.push_back(token);
+    }
+  }
+  return results;
+}
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices() {
+  // use user specified XPUs in single-node multi-process mode.
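+  // e.g. FLAGS_selected_xpus="0,2" selects devices 0 and 2; when the flag is
+  // empty, every detected device is used (see the fallback branch below).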
+  std::vector<int> devices;
+  if (!FLAGS_selected_xpus.empty()) {
+    auto devices_str = Split(FLAGS_selected_xpus, ',');
+    for (auto id : devices_str) {
+      devices.push_back(atoi(id.c_str()));
+    }
+  } else {
+    int count = GetXPUDeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  }
+  return devices;
+}
+
+/**************************** Memory Management **************************/
+
+void MemcpySyncH2D(void* dst,
+                   const void* src,
+                   size_t count,
+                   const pten::XPUPlace& dst_place) {
+  XPUDeviceGuard guard(dst_place.device);
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+}
+
+void MemcpySyncD2H(void* dst,
+                   const void* src,
+                   size_t count,
+                   const pten::XPUPlace& src_place,
+                   const pten::XPUContext& dev_ctx) {
+  XPUDeviceGuard guard(src_place.GetDeviceId());
+  dev_ctx.Wait();
+  PADDLE_ENFORCE_XPU_SUCCESS(
+      xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
+}
+
+// If src.device == dst.device and you need synchronization, call xpu_wait()
+// after this function returns.
+void MemcpySyncD2D(void* dst,
+                   const pten::XPUPlace& dst_place,
+                   const void* src,
+                   const pten::XPUPlace& src_place,
+                   size_t count,
+                   const pten::XPUContext& dev_ctx) {
+  int dev_id = GetXPUCurrentDeviceId();
+  if (dst_place.device == dev_id && src_place.device == dev_id) {
+    PADDLE_ENFORCE_XDNN_SUCCESS(
+        baidu::xpu::api::copy(dev_ctx.x_context(),
+                              static_cast<const int8_t*>(src),
+                              static_cast<int8_t*>(dst),
+                              count),
+        "copy ");
+  } else {
+    PADDLE_ENFORCE_XPU_SUCCESS(
+        xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
+  }
+}
+
+/**************************** Others **************************/
+
+XPUVersion get_xpu_version(int dev_id) {
+  uint64_t v = 0;
+  PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
+
+  if (v == K100 || v == K200) {
+    VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
+    return XPU1;
+  } else {
+    VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
+    return XPU2;
+  }
+}
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/backends/xpu/xpu_info.h b/paddle/pten/backends/xpu/xpu_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cf836ba16dc6a4ff1e5408bb92b8e60758895b1
--- /dev/null
+++ b/paddle/pten/backends/xpu/xpu_info.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/pten/common/place.h"
+
+namespace pten {
+
+class XPUContext;
+
+namespace backends {
+namespace xpu {
+
+/***** Version Management *****/
+
+//! Get the version of XPU Driver
+int GetDriverVersion();
+
+//! Get the version of XPU Runtime
+int GetRuntimeVersion();
+
+/***** Device Management *****/
+
+//! Get the total number of XPU devices in system.
+int GetXPUDeviceCount();
+
+//! Set the XPU device id for next execution.
+void SetXPUDeviceId(int device_id);
+
+//! Get the current XPU device id in system.
+int GetXPUCurrentDeviceId();
+
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetXPUSelectedDevices();
+
+/***** Memory Management *****/
+
+//! Copy memory from address src to dst synchronously.
+void MemcpySyncH2D(void *dst,
+                   const void *src,
+                   size_t count,
+                   const pten::XPUPlace &dst_place);
+void MemcpySyncD2H(void *dst,
+                   const void *src,
+                   size_t count,
+                   const pten::XPUPlace &src_place,
+                   const pten::XPUContext &dev_ctx);
+void MemcpySyncD2D(void *dst,
+                   const pten::XPUPlace &dst_place,
+                   const void *src,
+                   const pten::XPUPlace &src_place,
+                   size_t count,
+                   const pten::XPUContext &dev_ctx);
+
+class XPUDeviceGuard {
+ public:
+  explicit inline XPUDeviceGuard(int dev_id) {
+    int prev_id = GetXPUCurrentDeviceId();
+    if (prev_id != dev_id) {
+      prev_id_ = prev_id;
+      SetXPUDeviceId(dev_id);
+    }
+  }
+
+  inline ~XPUDeviceGuard() {
+    if (prev_id_ != -1) {
+      SetXPUDeviceId(prev_id_);
+    }
+  }
+
+  XPUDeviceGuard(const XPUDeviceGuard &o) = delete;
+  XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete;
+
+ private:
+  int prev_id_{-1};
+};
+
+enum XPUVersion { XPU1, XPU2 };
+XPUVersion get_xpu_version(int dev_id);
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace pten
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index b281f95f36bbd9100987a7d92c03822131ff0200..0c5437ff6d07abf2f5b4536fd314455839807e00 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -9,23 +9,26 @@ else()
   cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
 endif()
 
-cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce convert_utils)
-cc_library(kernel_context SRCS kernel_context.cc DEPS enforce pten_context)
+cc_library(errors SRCS errors.cc)
+set(pten_enforce_deps errors flags)
+cc_library(pten_enforce INTERFACE SRCS enforce.cc DEPS ${pten_enforce_deps})
 
-cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
-cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector)
-cc_library(lod_utils SRCS lod_utils.cc DEPS enforce mixed_vector)
-cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base)
+cc_library(kernel_factory SRCS kernel_factory.cc DEPS pten_enforce convert_utils)
+cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context)
+
+cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS pten_enforce)
+cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector)
+cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector)
+cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils tensor_meta tensor_base)
 
 cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base )
 
 cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
 cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
+cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector pten_enforce ddim)
 
 cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
-cc_library(selected_rows SRCS selected_rows.cc DEPS dense_tensor mixed_vector enforce ddim)
-
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
   add_dependencies(dense_tensor mkldnn)
diff --git a/paddle/pten/core/compat/CMakeLists.txt b/paddle/pten/core/compat/CMakeLists.txt
index 253f60daf1f890caccdeb02908c1b4fb3d6c62da..0c081edb81ccf740ed74c377070f4847650a1ff2 100644
--- a/paddle/pten/core/compat/CMakeLists.txt
+++ b/paddle/pten/core/compat/CMakeLists.txt
@@ -1 +1,2 @@
-cc_library(arg_map_context SRCS arg_map_context.cc DEPS enforce)
+cc_library(arg_map_context SRCS arg_map_context.cc DEPS pten_enforce)
+cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce convert_utils)
diff --git a/paddle/pten/core/compat/arg_map_context.cc b/paddle/pten/core/compat/arg_map_context.cc
index 3914a8a684eda937cf54283f72a04bec67cf64af..73fa0b300cf96cb653129148ea86883a8536ebe0 100644
--- a/paddle/pten/core/compat/arg_map_context.cc
+++ b/paddle/pten/core/compat/arg_map_context.cc
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "paddle/pten/core/compat/arg_map_context.h"
 
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/core/enforce.h"
 
 namespace pten {
-
 std::ostream& operator<<(std::ostream& os, KernelSignature signature) {
   os << "Kernel Signature - name: " << signature.name << "; inputs: "
      << paddle::string::join_strings(std::get<0>(signature.args), ", ")
diff --git a/paddle/pten/core/compat/op_utils.cc b/paddle/pten/core/compat/op_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12c2d74737d5d988c3708b2d47f9cad2a0d78e08
--- /dev/null
+++ b/paddle/pten/core/compat/op_utils.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/compat/op_utils.h"
+
+namespace pten {
+
+DefaultKernelSignatureMap& DefaultKernelSignatureMap::Instance() {
+  static DefaultKernelSignatureMap g_default_kernel_sig_map;
+  return g_default_kernel_sig_map;
+}
+
+OpUtilsMap& OpUtilsMap::Instance() {
+  static OpUtilsMap g_op_utils_map;
+  return g_op_utils_map;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/compat/op_utils.h b/paddle/pten/core/compat/op_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..505ef13891aeea1454c8807cfee59223ce49cbab
--- /dev/null
+++ b/paddle/pten/core/compat/op_utils.h
@@ -0,0 +1,166 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/pten/core/compat/arg_map_context.h"
+#include "paddle/pten/core/infermeta_utils.h"
+#include "paddle/pten/core/kernel_def.h"
+#include "paddle/pten/core/macros.h"
+#include "paddle/utils/flat_hash_map.h"
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+
+class DefaultKernelSignatureMap {
+ public:
+  static DefaultKernelSignatureMap& Instance();
+
+  bool Has(const std::string& op_type) const { return map_.count(op_type) > 0; }
+
+  const KernelSignature& Get(const std::string& op_type) const {
+    auto it = map_.find(op_type);
+    PADDLE_ENFORCE_NE(
+        it,
+        map_.end(),
+        paddle::platform::errors::NotFound(
+            "Operator `%s`'s kernel signature is not registered.", op_type));
+    return it->second;
+  }
+
+  void Insert(std::string op_type, KernelSignature signature) {
+    PADDLE_ENFORCE_NE(
+        Has(op_type),
+        true,
+        paddle::platform::errors::AlreadyExists(
+            "Operator (%s)'s Kernel Signature has been registered.", op_type));
+    map_.insert({std::move(op_type), std::move(signature)});
+  }
+
+ private:
+  DefaultKernelSignatureMap() = default;
+
+  paddle::flat_hash_map<std::string, KernelSignature> map_;
+
+  DISABLE_COPY_AND_ASSIGN(DefaultKernelSignatureMap);
+};
+
+class OpUtilsMap {
+ public:
+  static OpUtilsMap& Instance();
+
+  bool Contains(const std::string& op_type) const {
+    return name_map_.count(op_type) || arg_mapping_fn_map_.count(op_type);
+  }
+
+  void InsertApiName(std::string op_type, std::string api_name) {
+    PADDLE_ENFORCE_EQ(
+        name_map_.count(op_type),
+        0UL,
+        paddle::platform::errors::AlreadyExists(
+            "Operator (%s)'s api name has been registered.", op_type));
+    name_map_.insert({std::move(op_type), std::move(api_name)});
+  }
+
+  void InsertArgumentMappingFn(std::string op_type, ArgumentMappingFn fn) {
+    PADDLE_ENFORCE_EQ(
+        arg_mapping_fn_map_.count(op_type),
+        0UL,
+        paddle::platform::errors::AlreadyExists(
+            "Operator (%s)'s argument mapping function has been registered.",
+            op_type));
+    arg_mapping_fn_map_.insert({std::move(op_type), std::move(fn)});
+  }
+
+  std::string GetApiName(const std::string& op_type) const {
+    auto it = name_map_.find(op_type);
+    if (it == name_map_.end()) {
+      return "deprecated";
+    } else {
+      return it->second;
+    }
+  }
+
+  ArgumentMappingFn GetArgumentMappingFn(const std::string& op_type) const {
+    auto it = arg_mapping_fn_map_.find(op_type);
+    if (it == arg_mapping_fn_map_.end()) {
+      auto func =
+          [op_type](const ArgumentMappingContext& ctx) -> KernelSignature {
+        return DefaultKernelSignatureMap::Instance().Get(op_type);
+      };
+      return func;
+    } else {
+      return it->second;
+    }
+  }
+
+ private:
+  OpUtilsMap() = default;
+
+  paddle::flat_hash_map<std::string, std::string> name_map_;
+  paddle::flat_hash_map<std::string, ArgumentMappingFn> arg_mapping_fn_map_;
+
+  DISABLE_COPY_AND_ASSIGN(OpUtilsMap);
+};
+
+struct ApiNameRegistrar {
+  ApiNameRegistrar(const char* op_type, const char* api_name) {
+    OpUtilsMap::Instance().InsertApiName(op_type, api_name);
+  }
+};
+
+struct ArgumentMappingFnRegistrar {
+  ArgumentMappingFnRegistrar(const char* op_type,
+                             ArgumentMappingFn arg_mapping_fn) {
+    OpUtilsMap::Instance().InsertArgumentMappingFn(op_type,
+                                                   std::move(arg_mapping_fn));
+  }
+};
+
+#define PT_REGISTER_API_NAME(op_type, api_name)                             \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_register_api_name_ns_check_##op_type,                              \
+      "PT_REGISTER_API_NAME must be called in global namespace.");          \
+  static const ::pten::ApiNameRegistrar __registrar_api_name_for_##op_type( \
+      #op_type, #api_name);                                                 \
+  int TouchApiNameSymbol_##op_type() { return 0; }
+
+#define PT_DECLARE_API_NAME(op_type)                                        \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_declare_api_name_ns_check_##op_type,                               \
+      "PT_DECLARE_API_NAME must be called in global namespace.");           \
+  extern int TouchApiNameSymbol_##op_type();                                \
+  UNUSED static int __declare_api_name_symbol_for_##op_type =               \
+      TouchApiNameSymbol_##op_type()
+
+#define PT_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn)                 \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_register_arg_map_fn_ns_check_##op_type,                            \
+      "PT_REGISTER_ARG_MAPPING_FN must be called in global namespace.");    \
+  static const ::pten::ArgumentMappingFnRegistrar                           \
+      __registrar_arg_map_fn_for_##op_type(#op_type, arg_mapping_fn);       \
+  int TouchArgumentMappingFnSymbol_##op_type() { return 0; }
+
+#define PT_DECLARE_ARG_MAPPING_FN(op_type)                                  \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      pt_declare_arg_map_fn_ns_check_##op_type,                             \
+      "PT_DECLARE_ARG_MAPPING_FN must be called in global namespace.");     \
+  extern int TouchArgumentMappingFnSymbol_##op_type();                      \
+  UNUSED static int __declare_arg_map_fn_symbol_for_##op_type =             \
+      TouchArgumentMappingFnSymbol_##op_type()
+
+}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
index b1a5015f010c2002b8e5dbb6fc9eac1269224ad1..15f9f0bda3c25e2b8a4125d1025d8b0a673f2dc5 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -22,14 +22,6 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/convert_utils.h"
 
-namespace paddle {
-namespace framework {
-extern void TensorCopy(const pten::DenseTensor& src,
-                       const paddle::platform::Place& dst_place,
-                       pten::DenseTensor* dst);
-}
-}
-
 namespace pten {
 
 DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta)
@@ -126,6 +118,19 @@
 void DenseTensor::set_meta(DenseTensorMeta&& meta) {
   meta_ = std::move(meta);
 }
 
+void DenseTensor::set_meta(const DenseTensorMeta& meta) {
+  PADDLE_ENFORCE(
+      meta.valid(),
+      paddle::platform::errors::InvalidArgument(
+          "Input meta is invalid, please check the meta attribute."));
+  meta_.dims = meta.dims;
+  meta_.dtype = meta.dtype;
+  meta_.is_scalar = meta.is_scalar;
+  meta_.layout = meta.layout;
+  meta_.lod = meta.lod;
+  meta_.offset = meta.offset;
+}
+
 /* @jim19930609: This interface will be further modified until we finalize the
    design for Allocator - Allocation
    For now, we have to temporarily accommodate two independent use cases:
@@ -167,370 +172,4 @@ DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128);
 
 #undef DATA_MEMBER_FUNC_INSTANTIATION
 
-/* --------------------------- */
-/*   From framework::Tensor    */
-/* --------------------------- */
-DenseTensor::DenseTensor() {
-  inplace_version_counter_ = std::make_shared<TensorInplaceVersion>(0);
-  meta_.dtype = paddle::experimental::DataType::FLOAT32;
-  meta_.offset = 0;
-}
-
-DenseTensor::DenseTensor(paddle::framework::proto::VarType::Type dtype) {
-  inplace_version_counter_ = std::make_shared<TensorInplaceVersion>(0);
-  meta_.dtype = TransToPtenDataType(dtype);
-  meta_.offset = 0;
-}
-
-size_t DenseTensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - meta_.offset;
-}
-
-void DenseTensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(holder_,
-                          paddle::platform::errors::PreconditionNotMet(
-                              "Tensor holds no memory. "
-                              "Call Tensor::mutable_data firstly."));
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOf(dtype()),
-      memory_size(),
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor's dimension is out of bound. "
-          "Tensor's dimension must be equal or less than the size of its "
-          "memory. "
-          "But received Tensor's dimension is %d, memory's size is %d.",
-          numel() * SizeOf(dtype()),
-          memory_size()));
-}
-
-const paddle::platform::Place& DenseTensor::place() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_,
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor not initialized yet when DenseTensor::place() is called."));
-  return holder_->place();
-}
-
-paddle::framework::proto::VarType::Type DenseTensor::type() const {
-  return TransToProtoVarType(meta_.dtype);
-}
-
-paddle::framework::proto::VarType::Type DenseTensor::saved_type() const {
-  return TransToProtoVarType(meta_.dtype);
-}
-
-void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
-  meta_.layout = layout;
-}
-
-void DenseTensor::ResetHolder(const std::shared_ptr& holder) {
-  PADDLE_ENFORCE_EQ(
-      meta_.offset,
-      0,
-      paddle::platform::errors::Fatal(
-          "Only the offset is supported to zero when the holder is reset."));
-
-  if (holder_) {
-    PADDLE_ENFORCE_LE(
-        numel() * SizeOf(dtype()) + meta_.offset,
-        holder->size(),
-        paddle::platform::errors::InvalidArgument(
-            "The size of Holder is not enough to store the Tensor."));
-  }
-  holder_ = holder;
-}
-
-void DenseTensor::ResetHolderWithType(
-    const std::shared_ptr& holder,
-    paddle::framework::proto::VarType::Type type) {
-  set_type(type);
-  ResetHolder(holder);
-}
-
-void DenseTensor::set_type(paddle::framework::proto::VarType::Type type) {
-  meta_.dtype = TransToPtenDataType(type);
-}
-
-void* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                paddle::framework::proto::VarType::Type type,
-                                size_t requested_size) {
-  set_type(type);
-  PADDLE_ENFORCE_GE(
-      numel(),
-      0,
-      paddle::platform::errors::PreconditionNotMet(
-          "The Tensor's element number must be equal or greater than zero. "
-          "The Tensor's shape is [",
-          dims(),
-          "] now"));
-  size_t size = numel() * SizeOf(dtype());
-  if (requested_size && (requested_size > size)) {
-    size = requested_size;
-  }
-
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + meta_.offset) {
-    holder_.reset();
-    holder_ = paddle::memory::AllocShared(place, size);
-    meta_.offset = 0;
-  }
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
-}
-
-void* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                size_t requested_size) {
-  return mutable_data(place, type(), requested_size);
-}
-
-void* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                paddle::framework::proto::VarType::Type type,
-                                const paddle::platform::Stream& stream) {
-  set_type(type);
-  PADDLE_ENFORCE_GE(
-      numel(),
-      0,
-      paddle::platform::errors::PreconditionNotMet(
-          "The Tensor's element number must be equal or greater than zero. "
-          "The Tensor's shape is [",
-          dims(),
-          "] now"));
-  size_t size = numel() * SizeOf(dtype());
-
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + meta_.offset ||
-      !(paddle::platform::is_gpu_place(place) &&
-        paddle::memory::InSameStream(holder_, stream))) {
-    holder_.reset();
-    holder_ = paddle::memory::AllocShared(place, size, stream);
-    meta_.offset = 0;
-  }
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
-}
-
-/* @jim19930609: The following "mutable_data" only supports specific dtypes
-   defined in OpProto. This part needs another clean up once the data type
-   across Fluid and Pten gets unified.
-   */
-template <typename T>
-inline T* DenseTensor::mutable_data(const DDim& dims,
-                                    const paddle::platform::Place& place,
-                                    size_t requested_size) {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-  meta_.dims = dims;
-  return mutable_data<T>(place, requested_size);
-}
-
-template <typename T>
-inline T* DenseTensor::mutable_data(const paddle::platform::Place& place,
-                                    size_t requested_size) {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(
-      place, paddle::framework::DataTypeTrait<T>::DataType(), requested_size));
-}
-
-void DenseTensor::ShareBufferWith(const DenseTensor& tensor) {
-  holder_ = tensor.holder_;
-  meta_.offset = tensor.meta().offset;
-  meta_.dtype = tensor.dtype();
-}
-
-#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \
-  template dtype* DenseTensor::mutable_data(         \
-      const DDim& dims,                              \
-      const paddle::platform::Place& place,          \
-      size_t requested_size);                        \
-  template dtype* DenseTensor::mutable_data(         \
-      const paddle::platform::Place& place, size_t requested_size);
-
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(uint8_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int16_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int32_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64)
-LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128)
-
-#undef LEGACY_DATA_MEMBER_FUNC_INSTANTIATION
-
-/* ------------------------------ */
-/*   From framework::LoDTensor    */
-/* ------------------------------ */
-
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
-                         const DenseTensorMeta& meta)
-    : meta_(meta), holder_(storage->move_data_shared()) {}
-
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
-    : meta_(std::move(meta)), holder_(storage->move_data_shared()) {}
-
-DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; }
-
-void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; }
-
-LoD* DenseTensor::mutable_lod() { return &meta_.lod; }
-
-std::pair<size_t, size_t> DenseTensor::lod_element(size_t level,
-                                                   size_t elem) const {
-  PADDLE_ENFORCE_LT(
-      level,
-      NumLevels(),
-      paddle::platform::errors::InvalidArgument(
-          "The input level of LoD is invalid, it should be less than LoD "
-          "size. The input level is %zu, the LoD size is %zu.",
-          level,
-          NumLevels()));
-
-  PADDLE_ENFORCE_LT(elem,
-                    NumElements(level),
-                    paddle::platform::errors::InvalidArgument(
-                        "The input element of LoD is invalid, it should be "
-                        "less than the number of elements in its level. "
-                        "The input element is %zu, the number of elements in "
-                        "its level is %zu.",
-                        elem,
-                        NumElements(level)));
-
-  return std::make_pair((meta_.lod)[level][elem], (meta_.lod)[level][elem + 1]);
-}
-
-size_t DenseTensor::NumLevels() const { return meta_.lod.size(); }
-
-size_t DenseTensor::NumElements(size_t level) const {
-  PADDLE_ENFORCE_LT(
-      level,
-      NumLevels(),
-      paddle::platform::errors::InvalidArgument(
-          "The input level of LoD is invalid, it should be less than LoD "
-          "size. The input level is %zu, the LoD size is %zu.",
-          level,
-          NumLevels()));
-
-  // the last offset is the end of last element
-  return (meta_.lod)[level].size() - 1;
-}
-
-DenseTensor& DenseTensor::Resize(const DDim& dims) {
-  meta_.dims = dims;
-  return *this;
-}
-
-DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx,
-                    0,
-                    paddle::platform::errors::OutOfRange(
-                        "The start row index must be greater than or equal "
-                        "to 0. But received the start index is %d.",
-                        begin_idx));
-  PADDLE_ENFORCE_LE(end_idx,
-                    meta_.dims[0],
-                    paddle::platform::errors::OutOfRange(
-                        "The end row index is out of bound."));
-  PADDLE_ENFORCE_LT(
-      begin_idx,
-      end_idx,
-      paddle::platform::errors::InvalidArgument(
-          "The start row index must be less than the end row index. "
-          "But received the start index = %d, the end index = %d.",
-          begin_idx,
-          end_idx));
-
-  if (meta_.dims[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / meta_.dims[0];
-    DenseTensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(meta_.layout);
-    dst.meta_.dtype = meta_.dtype;
-    DDim dst_dims = meta_.dims;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype());
-    return dst;
-  }
-}
-
-std::vector<DenseTensor> DenseTensor::Split(int64_t split_size,
-                                            int64_t axis) const {
-  check_memory_size();
-
-  PADDLE_ENFORCE_GE(meta_.dims.size(),
-                    0,
-                    paddle::platform::errors::OutOfRange(
-                        "split expects at least a 1-dimensional tensor"));
-
-  PADDLE_ENFORCE_GE(
-      split_size,
-      0,
-      paddle::platform::errors::OutOfRange(
-          "split expects split_size be non-negative, but got split_size is %d",
-          split_size));
-
-  int64_t numel_size = meta_.dims[axis];
-
-  int64_t num_splits = 1;
-  if (split_size != 0) {
-    num_splits =
-        std::max<int64_t>((numel_size + split_size - 1) / split_size, 1);
-  }
-
-  std::vector<DenseTensor> splits(num_splits);
-  int64_t last_split_size = split_size - (split_size * num_splits - numel_size);
-
-  for (int64_t i = 0; i < num_splits; ++i) {
-    int64_t length = i < num_splits - 1 ? split_size : last_split_size;
-    splits[i] = Slice(i * split_size, i * split_size + length);
-  }
-  return splits;
-}
-
-std::vector<DenseTensor> DenseTensor::Chunk(int64_t chunks,
-                                            int64_t axis) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(meta_.dims.size(),
-                    0,
-                    paddle::platform::errors::OutOfRange(
-                        "split expects at least a 1-dimensional tensor"));
-  PADDLE_ENFORCE_GE(
-      chunks,
-      0,
-      paddle::platform::errors::OutOfRange(
-          "chunks expects to be greater than 0, but got chunks is %d", chunks));
-
-  int64_t numel_size = meta_.dims[axis];
-  int64_t split_size = (numel_size + chunks - 1) / chunks;
-  return Split(split_size, axis);
-}
-
-DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) {
-  src.check_memory_size();
-  // Preserve LoD
-  auto lod = meta_.lod;
-  *this = src;
-  meta_.lod = lod;
-  return *this;
-}
-
-DenseTensor& DenseTensor::ShareInplaceVersionCounterWith(
-    const DenseTensor& src) {
-  PADDLE_ENFORCE_NOT_NULL(
-      inplace_version_counter_,
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor does not hold inplace_version_counter_."));
-
-  inplace_version_counter_ = src.inplace_version_counter_;
-  return *this;
-}
-
 }  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 88c459e6d87eaee4cd52111c42458868698eda43..2823441f97da2a784d6fb175429a0496e50d6aaa 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -33,25 +33,6 @@ namespace pten {
 
 class CompatibleDenseTensorUtils;
 
-/* --------------------------- */
-/*   From framework::Tensor    */
-/* --------------------------- */
-/* Temporarily put TensorInplaceVersion inside DenseTensor.
-   Will move to AutogradMeta as soon as we switch to Eager Dygraph.
-   */
-class TensorInplaceVersion {
- public:
-  explicit TensorInplaceVersion(uint32_t inplace_version = 0)
-      : inplace_version_(inplace_version) {}
-  bool IsUnique() const { return inplace_version_ == 0; }
-  void Bump() { ++inplace_version_; }
-  uint32_t CurrentVersion() const { return inplace_version_; }
-  void SetInplaceVersionToZero() { inplace_version_ = 0; }
-
- private:
-  uint32_t inplace_version_;
-};
-
 /// \brief The Dense tensor store values in a contiguous sequential block
 /// of memory where all values are represented. Tensors or multi-dimensional
 /// arrays are used in math operators.
@@ -90,6 +71,8 @@ class DenseTensor : public TensorBase,
 
   DenseTensor& operator=(DenseTensor&& other);
 
+  DenseTensor();
+
   /// \brief Destroy the tensor object and release exclusive resources.
   virtual ~DenseTensor() = default;
 
@@ -131,6 +114,8 @@ class DenseTensor : public TensorBase,
   /// \param meta The meta information of the tensor.
   void set_meta(DenseTensorMeta&& meta);
 
+  void set_meta(const DenseTensorMeta& meta);
+
   /// \brief Test whether the metadata is valid.
   /// \return Whether the metadata is valid.
   bool valid() const noexcept override { return meta_.valid(); }
@@ -177,181 +162,6 @@ class DenseTensor : public TensorBase,
   DenseTensorMeta meta_;
   std::shared_ptr<paddle::memory::Allocation> holder_;
 
-  /* --------------------------- */
-  /*   From framework::Tensor    */
-  /* --------------------------- */
-  /* The following members & interfaces were copied from framework::Tensor,
-     so as to facilitate the unification of different Tensors
-
-     Will be adjusted/removed/moved in the near future
-   */
- public:
-  /* @jim19930609: The way default constructor handles allocator might change,
-     according to the final design of Allocation - Allocator.
-   */
-  DenseTensor();
-
-  /* @jim19930609: Remove dependency on protobuf after Tensor Unification.
-   */
-  explicit DenseTensor(paddle::framework::proto::VarType::Type dtype);
-
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
-
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
-
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-
-  template <typename T>
-  T* data();
-
-  void* data();
-
-  template <typename T>
-  T* mutable_data(const paddle::platform::Place& place,
-                  size_t requested_size = 0);
-
-  template <typename T>
-  T* mutable_data(const DDim& dims,
-                  const paddle::platform::Place& place,
-                  size_t requested_size = 0);
-
-  void* mutable_data(const paddle::platform::Place& place,
-                     paddle::framework::proto::VarType::Type type,
-                     size_t requested_size = 0);
-
-  void* mutable_data(const paddle::platform::Place& place,
-                     size_t requested_size = 0);
-
-  void* mutable_data(const paddle::platform::Place& place,
-                     paddle::framework::proto::VarType::Type type,
-                     const paddle::platform::Stream& stream);
-
-  /* @jim19930609: Remove dependency on protobuf after Tensor Unification.
-   */
-  paddle::framework::proto::VarType::Type type() const;
-
-  /* @jim19930609: Remove dependency on protobuf after Tensor Unification.
-   */
-  paddle::framework::proto::VarType::Type saved_type() const;
-
-  // memory size returns the holding memory size in byte.
-  size_t memory_size() const;
-
-  void check_memory_size() const;
-
-  void set_layout(const paddle::framework::DataLayout layout);
-
-  void clear() {
-    holder_.reset();
-    meta_.offset = 0;
-  }
-
-  void ShareBufferWith(const DenseTensor& tensor);
-
-  void ShareDataTypeWith(const DenseTensor& tensor) {
-    meta_.dtype = tensor.meta().dtype;
-  }
-
-  bool IsSharedBufferWith(const DenseTensor& src) const {
-    return holder_ && holder_ == src.Holder();
-  }
-
-  const std::shared_ptr<paddle::memory::Allocation>& Holder() const {
-    return holder_;
-  }
-
-  void set_offset(size_t offset) { meta_.offset = offset; }
-  size_t offset() const { return meta_.offset; }
-
-  std::shared_ptr<paddle::memory::Allocation> MoveMemoryHolder() {
-    return std::move(holder_);
-  }
-
-  void ResetHolder(const std::shared_ptr<paddle::memory::Allocation>& holder);
-
-  void ResetHolderWithType(
-      const std::shared_ptr<paddle::memory::Allocation>& holder,
-      paddle::framework::proto::VarType::Type type);
-
-  void set_type(paddle::framework::proto::VarType::Type type);
-
-  TensorInplaceVersion& InplaceVersionCounter() {
-    return *inplace_version_counter_;
-  }
-
-  /*! The internal of two tensors share the same memory block. */
-  DenseTensor& ShareDataWith(const DenseTensor& src);
-
-  /*! The internal of two tensors share the same inplace version counter. */
-  DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src);
-
-  DenseTensor Slice(int64_t begin_idx, int64_t end_idx) const;
-
-  std::vector<DenseTensor> Split(int64_t split_size, int64_t axis) const;
-
-  std::vector<DenseTensor> Chunk(int64_t chunks, int64_t axis) const;
-
- protected:
-  std::shared_ptr<TensorInplaceVersion> inplace_version_counter_;
-
-/* @jim19930609: This is a hack
-   In general, it is badly designed to fuse MKLDNN-specific objects into a
-   generic Tensor.
-   We temporarily leave them here to unblock Tensor Unification progress.
-   In the final state, we should come up with a MKLDNN_Tensor and move the
-   following codes there.
-   */
-#ifdef PADDLE_WITH_MKLDNN
-
- public:
-  inline dnnl::memory::format_tag format() const { return format_; }
-
-  inline void set_format(const dnnl::memory::format_tag format) {
-    format_ = format;
-  }
-
- protected:
-  /**
-   * @brief the detail format of memory block which have layout as kMKLDNN
-   *
-   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
-   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
-   *       this field.
-   */
-
-  dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef;
-#endif
-
-  /* ------------------------------ */
-  /*   From framework::LoDTensor    */
-  /* ------------------------------ */
-  /* The following members & interfaces were copied from framework::Tensor,
-     so as to facilitate the unification of different Tensors
-
-     Will be adjusted/removed/moved in the near future
-   */
- public:
-  explicit DenseTensor(const LoD& lod);
-
-  void set_lod(const LoD& lod);
-
-  LoD* mutable_lod();
-
-  /*
-   * Get the start offset and end offset of an element from LoD.
-   */
-  std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const;
-
-  size_t NumLevels() const;
-
-  size_t NumElements(size_t level = 0) const;
+#include "paddle/pten/core/dense_tensor.inl"
 };
-
 }  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.inl b/paddle/pten/core/dense_tensor.inl
new file mode 100644
index 0000000000000000000000000000000000000000..754baeb73c90c2b494bf774588219c877a2fb8e9
--- /dev/null
+++ b/paddle/pten/core/dense_tensor.inl
@@ -0,0 +1,197 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/* --------------------------- */
+/*   From framework::Tensor    */
+/* --------------------------- */
+/* The following members & interfaces were copied from framework::Tensor,
+   so as to facilitate the unification of different Tensors
+
+   Will be adjusted/removed/moved in the near future
+*/
+public:
+/* Temporarily put InplaceVersion inside DenseTensor.
+Will move to AutogradMeta as soon as we switch to Eager Dygraph.
+*/
+class InplaceVersion {
+public:
+  bool IsUnique() const { return inplace_version_ == 0; }
+  void Bump() { ++inplace_version_; }
+  uint32_t CurrentVersion() const { return inplace_version_; }
+  void SetInplaceVersionToZero() { inplace_version_ = 0; }
+
+private:
+  uint32_t inplace_version_{0};
+};
+
+/* @jim19930609: Remove dependency on protobuf after Tensor Unification.
+*/
+explicit DenseTensor(paddle::framework::proto::VarType::Type dtype);
+
+/// \brief Use existing storage space to create dense tensor. This interface
+/// can be used to deliberately create an uninitialized dense tensor.
+/// \param storage The existing storage.
+/// \param meta The meta data of dense tensor.
+DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
+
+/// \brief Use existing storage space to create dense tensor. This interface
+/// can be used to deliberately create an uninitialized dense tensor.
+/// \param storage The existing storage.
+/// \param meta The meta data of dense tensor.
+DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
+
+inline bool IsInitialized() const { return holder_ != nullptr; }
+
+template <typename T>
+T* data();
+
+void* data();
+
+template <typename T>
+T* mutable_data(const paddle::platform::Place& place,
+                size_t requested_size = 0);
+
+template <typename T>
+T* mutable_data(const DDim& dims,
+                const paddle::platform::Place& place,
+                size_t requested_size = 0);
+
+void* mutable_data(const paddle::platform::Place& place,
+                   paddle::framework::proto::VarType::Type type,
+                   size_t requested_size = 0);
+
+void* mutable_data(const paddle::platform::Place& place,
+                   size_t requested_size = 0);
+
+void* mutable_data(const paddle::platform::Place& place,
+                   paddle::framework::proto::VarType::Type type,
+                   const paddle::platform::Stream& stream);
+
+/* @jim19930609: Remove dependency on protobuf after Tensor Unification.
+*/
+paddle::framework::proto::VarType::Type type() const;
+
+/* @jim19930609: Remove dependency on protobuf after Tensor Unification.
+*/
+paddle::framework::proto::VarType::Type saved_type() const;
+
+// memory size returns the holding memory size in byte.
+size_t memory_size() const;
+
+void check_memory_size() const;
+
+void set_layout(const paddle::framework::DataLayout layout);
+
+void clear() {
+  holder_.reset();
+  meta_.offset = 0;
+}
+
+void ShareBufferWith(const DenseTensor& tensor);
+
+void ShareDataTypeWith(const DenseTensor& tensor) {
+  meta_.dtype = tensor.meta().dtype;
+}
+
+bool IsSharedBufferWith(const DenseTensor& src) const {
+  return holder_ && holder_ == src.Holder();
+}
+
+const std::shared_ptr<paddle::memory::Allocation>& Holder() const {
+  return holder_;
+}
+
+void set_offset(size_t offset) { meta_.offset = offset; }
+size_t offset() const { return meta_.offset; }
+
+std::shared_ptr<paddle::memory::Allocation> MoveMemoryHolder() {
+  return std::move(holder_);
+}
+
+void ResetHolder(const std::shared_ptr<paddle::memory::Allocation>& holder);
+
+void ResetHolderWithType(
+    const std::shared_ptr<paddle::memory::Allocation>& holder,
+    paddle::framework::proto::VarType::Type type);
+
+void set_type(paddle::framework::proto::VarType::Type type);
+
+InplaceVersion& InplaceVersionCounter() {
+  return *inplace_version_counter_;
+}
+
+/*! The internal of two tensors share the same memory block. */
+DenseTensor& ShareDataWith(const DenseTensor& src);
+
+/*! The internal of two tensors share the same inplace version counter. */
+DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src);
+
+DenseTensor Slice(int64_t begin_idx, int64_t end_idx) const;
+
+std::vector<DenseTensor> Split(int64_t split_size, int64_t axis) const;
+
+std::vector<DenseTensor> Chunk(int64_t chunks, int64_t axis) const;
+
+protected:
+std::shared_ptr<InplaceVersion> inplace_version_counter_{
+    std::make_shared<InplaceVersion>()};
+
+/* @jim19930609: This is a hack
+In general, it is badly designed to fuse MKLDNN-specific objects into a
+generic Tensor.
+We temporarily leave them here to unblock Tensor Unification progress.
+In the final state, we should come up with a MKLDNN_Tensor and move the
+following codes there.
+*/
+#ifdef PADDLE_WITH_MKLDNN
+
+public:
+inline dnnl::memory::format_tag format() const { return format_; }
+
+inline void set_format(const dnnl::memory::format_tag format) {
+  format_ = format;
+}
+
+protected:
+/**
+ * @brief the detail format of memory block which have layout as kMKLDNN
+ *
+ * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+ *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+ *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+ *       this field.
+ */
+
+dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef;
+#endif
+
+/* ------------------------------ */
+/*   From framework::LoDTensor    */
+/* ------------------------------ */
+/* The following members & interfaces were copied from framework::Tensor,
+   so as to facilitate the unification of different Tensors
+
+   Will be adjusted/removed/moved in the near future
+*/
+public:
+explicit DenseTensor(const LoD& lod);
+
+void set_lod(const LoD& lod);
+
+LoD* mutable_lod();
+
+/*
+* Get the start offset and end offset of an element from LoD.
+*/
+std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const;
+
+size_t NumLevels() const;
+
+size_t NumElements(size_t level = 0) const;
diff --git a/paddle/pten/core/dense_tensor_impl.cc b/paddle/pten/core/dense_tensor_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f825d3619b92bcfd8d66ea47d9f176630ccbb525
--- /dev/null
+++ b/paddle/pten/core/dense_tensor_impl.cc
@@ -0,0 +1,394 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/common/complex.h"
+#include "paddle/pten/common/float16.h"
+
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pten {
+/* --------------------------- */
+/*   From framework::Tensor    */
+/* --------------------------- */
+DenseTensor::DenseTensor() {
+  meta_.dtype = paddle::experimental::DataType::FLOAT32;
+  meta_.offset = 0;
+}
+
+DenseTensor::DenseTensor(paddle::framework::proto::VarType::Type dtype) {
+  meta_.dtype = TransToPtenDataType(dtype);
+  meta_.offset = 0;
+}
+
+size_t DenseTensor::memory_size() const {
0UL : holder_->size() - meta_.offset;
+}
+
+void DenseTensor::check_memory_size() const {
+ PADDLE_ENFORCE_NOT_NULL(holder_,
+ paddle::platform::errors::PreconditionNotMet(
+ "Tensor holds no memory. "
+ "Call Tensor::mutable_data first."));
+ PADDLE_ENFORCE_LE(
+ numel() * SizeOf(dtype()),
+ memory_size(),
+ paddle::platform::errors::PreconditionNotMet(
+ "Tensor's dimension is out of bound. "
+ "Tensor's dimension must be less than or equal to the size of its "
+ "memory. "
+ "But received Tensor's dimension is %d, memory's size is %d.",
+ numel() * SizeOf(dtype()),
+ memory_size()));
+}
+
+const paddle::platform::Place& DenseTensor::place() const {
+ PADDLE_ENFORCE_NOT_NULL(
+ holder_,
+ paddle::platform::errors::PreconditionNotMet(
+ "Tensor not initialized yet when DenseTensor::place() is called."));
+ return holder_->place();
+}
+
+paddle::framework::proto::VarType::Type DenseTensor::type() const {
+ return TransToProtoVarType(meta_.dtype);
+}
+
+paddle::framework::proto::VarType::Type DenseTensor::saved_type() const {
+ return TransToProtoVarType(meta_.dtype);
+}
+
+void DenseTensor::set_layout(const paddle::framework::DataLayout layout) {
+ meta_.layout = layout;
+}
+
+void DenseTensor::ResetHolder(
+ const std::shared_ptr<paddle::memory::Allocation>& holder) {
+ PADDLE_ENFORCE_EQ(
+ meta_.offset,
+ 0,
+ paddle::platform::errors::Fatal(
+ "Only an offset of zero is supported when the holder is reset."));
+
+ if (holder_) {
+ // TODO(zyfncg): The static_cast<> in this check will be reverted once
+ // SetAllocationForOutputTenosr is deleted.
+ // Currently numel() may return -1, which casts to a very large number
+ // when compared with a value of unsigned long type, making the check
+ // fail; the cast below is a temporary workaround for this problem.
+ PADDLE_ENFORCE_LE(
+ numel() * static_cast<int64_t>(SizeOf(dtype())),
+ static_cast<int64_t>(holder->size()),
+ paddle::platform::errors::InvalidArgument(
+ "The size of Holder is not enough to store the Tensor."));
+ }
+ holder_ = holder;
+}
+
+void DenseTensor::ResetHolderWithType(
+ const std::shared_ptr<paddle::memory::Allocation>& holder,
+ paddle::framework::proto::VarType::Type type) {
+ set_type(type);
+ ResetHolder(holder);
+}
+
+void DenseTensor::set_type(paddle::framework::proto::VarType::Type type) {
+ meta_.dtype = TransToPtenDataType(type);
+}
+
+void* DenseTensor::mutable_data(const paddle::platform::Place& place,
+ paddle::framework::proto::VarType::Type type,
+ size_t requested_size) {
+ set_type(type);
+ PADDLE_ENFORCE_GE(
+ numel(),
+ 0,
+ paddle::platform::errors::PreconditionNotMet(
+ "The Tensor's element number must be equal or greater than zero. 
" + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + if (requested_size && (requested_size > size)) { + size = requested_size; + } + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + meta_.offset) { + holder_.reset(); + holder_ = paddle::memory::AllocShared(place, size); + meta_.offset = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + meta_.offset); +} + +void* DenseTensor::mutable_data(const paddle::platform::Place& place, + size_t requested_size) { + return mutable_data(place, type(), requested_size); +} + +void* DenseTensor::mutable_data(const paddle::platform::Place& place, + paddle::framework::proto::VarType::Type type, + const paddle::platform::Stream& stream) { + set_type(type); + PADDLE_ENFORCE_GE( + numel(), + 0, + paddle::platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + meta_.offset || + !(paddle::platform::is_gpu_place(place) && + paddle::memory::InSameStream(holder_, stream))) { + holder_.reset(); + holder_ = paddle::memory::AllocShared(place, size, stream); + meta_.offset = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + meta_.offset); +} + +/* @jim19930609: The following "mutable_data" only supports specific dtypes + defined in OpProto. This part need another clean up once the data type across + Fluid + and Pten get unified. + */ +template +inline T* DenseTensor::mutable_data(const DDim& dims, + const paddle::platform::Place& place, + size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + meta_.dims = dims; + return mutable_data(place, requested_size); +} + +template +inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, + size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data( + place, paddle::framework::DataTypeTrait::DataType(), requested_size)); +} + +void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + holder_ = tensor.holder_; + meta_.offset = tensor.meta().offset; + meta_.dtype = tensor.dtype(); +} + +#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DenseTensor::mutable_data( \ + const DDim& dims, \ + const paddle::platform::Place& place, \ + size_t requested_size); \ + template dtype* DenseTensor::mutable_data( \ + const paddle::platform::Place& place, size_t requested_size); + +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(uint8_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int16_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int32_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) + +#undef LEGACY_DATA_MEMBER_FUNC_INSTANTIATION + +/* ------------------------------ */ +/* From 
framework::LoDTensor */
+/* ------------------------------ */
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
+ const DenseTensorMeta& meta)
+ : meta_(meta), holder_(storage->move_data_shared()) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
+ : meta_(std::move(meta)), holder_(storage->move_data_shared()) {}
+
+DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; }
+
+void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; }
+
+LoD* DenseTensor::mutable_lod() { return &meta_.lod; }
+
+std::pair<size_t, size_t> DenseTensor::lod_element(size_t level,
+ size_t elem) const {
+ PADDLE_ENFORCE_LT(
+ level,
+ NumLevels(),
+ paddle::platform::errors::InvalidArgument(
+ "The input level of LoD is invalid, it should be less than LoD "
+ "size. The input level is %zu, the LoD size is %zu.",
+ level,
+ NumLevels()));
+
+ PADDLE_ENFORCE_LT(elem,
+ NumElements(level),
+ paddle::platform::errors::InvalidArgument(
+ "The input element of LoD is invalid, it should be "
+ "less than the number of elements in its level. "
+ "The input element is %zu, the number of elements in "
+ "its level is %zu.",
+ elem,
+ NumElements(level)));
+
+ return std::make_pair((meta_.lod)[level][elem], (meta_.lod)[level][elem + 1]);
+}
+
+size_t DenseTensor::NumLevels() const { return meta_.lod.size(); }
+
+size_t DenseTensor::NumElements(size_t level) const {
+ PADDLE_ENFORCE_LT(
+ level,
+ NumLevels(),
+ paddle::platform::errors::InvalidArgument(
+ "The input level of LoD is invalid, it should be less than LoD "
+ "size. The input level is %zu, the LoD size is %zu.",
+ level,
+ NumLevels()));
+
+ // the last offset is the end of last element
+ return (meta_.lod)[level].size() - 1;
+}
+
+DenseTensor& DenseTensor::Resize(const DDim& dims) {
+ meta_.dims = dims;
+ return *this;
+}
+
+DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const {
+ check_memory_size();
+ PADDLE_ENFORCE_GE(begin_idx,
+ 0,
+ paddle::platform::errors::OutOfRange(
+ "The start row index must be greater than or equal to 0. "
+ "But received the start index is %d.",
+ begin_idx));
+ PADDLE_ENFORCE_LE(end_idx,
+ meta_.dims[0],
+ paddle::platform::errors::OutOfRange(
+ "The end row index is out of bound."));
+ PADDLE_ENFORCE_LT(
+ begin_idx,
+ end_idx,
+ paddle::platform::errors::InvalidArgument(
+ "The start row index must be less than the end row index. "
+ "But received the start index = %d, the end index = %d.", + begin_idx, + end_idx)); + + if (meta_.dims[0] == 1) { + return *this; + } else { + size_t base = numel() / meta_.dims[0]; + DenseTensor dst; + dst.holder_ = holder_; + dst.set_layout(meta_.layout); + dst.meta_.dtype = meta_.dtype; + DDim dst_dims = meta_.dims; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); + return dst; + } +} + +std::vector DenseTensor::Split(int64_t split_size, + int64_t axis) const { + check_memory_size(); + + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + + PADDLE_ENFORCE_GE( + split_size, + 0, + paddle::platform::errors::OutOfRange( + "split expects split_size be non-negative, but got split_size is %d", + split_size)); + + int64_t numel_size = meta_.dims[axis]; + + int64_t num_splits = 1; + if (split_size != 0) { + num_splits = + std::max((numel_size + split_size - 1) / split_size, 1); + } + + std::vector splits(num_splits); + int64_t last_split_size = split_size - (split_size * num_splits - numel_size); + + for (int64_t i = 0; i < num_splits; ++i) { + int64_t length = i < num_splits - 1 ? split_size : last_split_size; + splits[i] = Slice(i * split_size, i * split_size + length); + } + return splits; +} + +std::vector DenseTensor::Chunk(int64_t chunks, + int64_t axis) const { + check_memory_size(); + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( + chunks, + 0, + paddle::platform::errors::OutOfRange( + "chunks expects to be greater than 0, but got chunks is %d", chunks)); + + int64_t numel_size = meta_.dims[axis]; + int64_t split_size = (numel_size + chunks - 1) / chunks; + return Split(split_size, axis); +} + +DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { + src.check_memory_size(); + // Preserve LoD + auto lod = meta_.lod; + *this = src; + meta_.lod = lod; + return *this; +} + +DenseTensor& DenseTensor::ShareInplaceVersionCounterWith( + const DenseTensor& src) { + PADDLE_ENFORCE_NOT_NULL( + inplace_version_counter_, + paddle::platform::errors::PreconditionNotMet( + "Tensor does not hold inplace_version_counter_.")); + + inplace_version_counter_ = src.inplace_version_counter_; + return *this; +} +} // namespace pten diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 7b2c4a2cf170f18fefb4df3e3a1dca23230f9ae8..7566b351bf63401acba3bad247b10bd7bb3c9cf1 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -13,28 +13,45 @@ // limitations under the License. 
#include "paddle/pten/core/device_context.h" +#include "paddle/pten/api/ext/exception.h" namespace pten { struct DeviceContext::Impl { - Allocator* allocator_{nullptr}; - Impl() = default; ~Impl() = default; - void SetAllocator(Allocator* allocator) { allocator_ = allocator; } + void SetDeviceAllocator(Allocator* allocator) { + device_allocator_ = allocator; + } + + void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; } + + const Allocator& GetDeviceAllocator() const { + PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr."); + return *device_allocator_; + } - const Allocator& GetAllocator() const { return *allocator_; } + const Allocator& GetHostAllocator() const { + PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr."); + return *host_allocator_; + } // TODO(Wilber): Add impl. It seems that tensorbase not have interface to // communicate with allocator. - void Alloc(TensorBase* tensor) {} + void HostAlloc(TensorBase* tensor) {} + void DeviceAlloc(TensorBase* tensor) {} + + Allocator* device_allocator_{nullptr}; + Allocator* host_allocator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { - impl_->SetAllocator(const_cast(&other.GetAllocator())); + impl_->SetDeviceAllocator( + const_cast(&other.GetDeviceAllocator())); + impl_->SetHostAllocator(const_cast(&other.GetHostAllocator())); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -43,14 +60,26 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetAllocator(Allocator* allocator) { - impl_->SetAllocator(allocator); +void DeviceContext::SetHostAllocator(Allocator* allocator) { + impl_->SetHostAllocator(allocator); +} + +void DeviceContext::SetDeviceAllocator(Allocator* allocator) { + impl_->SetDeviceAllocator(allocator); +} + +const Allocator& DeviceContext::GetHostAllocator() const { + return impl_->GetHostAllocator(); } -const Allocator& DeviceContext::GetAllocator() const { - return impl_->GetAllocator(); +const Allocator& DeviceContext::GetDeviceAllocator() const { + return impl_->GetDeviceAllocator(); } -void DeviceContext::Alloc(TensorBase* tensor) { impl_->Alloc(tensor); } +void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); } + +void DeviceContext::DeviceAlloc(TensorBase* tensor) { + impl_->DeviceAlloc(tensor); +} } // namespace pten diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index 1ee2e21494bf544c130ede20ea84c11ae94ca812..c658a24c3527d50efacc9b2b768ac8f07c07b338 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -57,19 +57,38 @@ class DeviceContext { * * @param allocator */ - void SetAllocator(Allocator*); + void SetDeviceAllocator(Allocator*); /** - * @brief Get the const Allocator object. + * @brief Get the const deveice-releated Allocator object. * * @return Allocator */ - const Allocator& GetAllocator() const; + const Allocator& GetDeviceAllocator() const; /** - * @brief Allocate memory for tensor. + * @brief Allocate device memory for tensor. */ - void Alloc(pten::TensorBase*); + void DeviceAlloc(pten::TensorBase*); + + /** + * @brief Set the host Allocator object. + * + * @param allocator + */ + void SetHostAllocator(Allocator*); + + /** + * @brief Get the const host Allocator object. 
+ * + * @return Allocator + */ + const Allocator& GetHostAllocator() const; + + /** + * @brief Allocate host memory for tensor. + */ + void HostAlloc(pten::TensorBase*); // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later. diff --git a/paddle/pten/core/enforce.cc b/paddle/pten/core/enforce.cc new file mode 100644 index 0000000000000000000000000000000000000000..ce23565a8874f1afae8aa1f4feb2f217da5f8ed8 --- /dev/null +++ b/paddle/pten/core/enforce.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/core/enforce.h" diff --git a/paddle/pten/core/enforce.h b/paddle/pten/core/enforce.h new file mode 100644 index 0000000000000000000000000000000000000000..97433f1a6d5fc3d209528d8c419e9737e85cd4ad --- /dev/null +++ b/paddle/pten/core/enforce.h @@ -0,0 +1,558 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef __GNUC__ +#include // for __cxa_demangle +#endif // __GNUC__ + +#if !defined(_WIN32) +#include // dladdr +#include // sleep, usleep +#else // _WIN32 +#ifndef NOMINMAX +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif +#include // GetModuleFileName, Sleep +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) +#include +#endif + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/pten/core/errors.h" +#include "paddle/utils/string/printf.h" +#include "paddle/utils/string/to_string.h" + +// Note: these headers for simplify demangle type string +#include "paddle/pten/core/type_defs.h" + +namespace pten { +class ErrorSummary; +} // namespace pten + +DECLARE_int32(call_stack_level); +namespace pten { +namespace enforce { +/** HELPER MACROS AND FUNCTIONS **/ + +#ifndef PADDLE_MAY_THROW +#define PADDLE_MAY_THROW noexcept(false) +#endif + +// Because most enforce conditions would evaluate to true, we can use +// __builtin_expect to instruct the C++ compiler to generate code that +// always forces branch prediction of true. +// This generates faster binary code. __builtin_expect is since C++11. +// For more details, please check https://stackoverflow.com/a/43870188/724872. 
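To make the __builtin_expect hint described above concrete, here is a small usage sketch of the UNLIKELY macro defined next (illustrative only; checked_div is a hypothetical function, not part of this diff):

    // The zero-divisor branch is marked unlikely, so the compiler lays the
    // division out as the fall-through (hot) path.
    int checked_div(int a, int b) {
      if (UNLIKELY(b == 0)) {
        return 0;  // cold path, rarely taken
      }
      return a / b;  // hot path
    }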
+#if !defined(_WIN32) +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. +#define UNLIKELY(condition) (condition) +#endif + +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) +#else +// there is no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition) +#endif + +#if defined _WIN32 && defined PADDLE_ON_INFERENCE && defined PADDLE_NO_PYTHON +#define HANDLE_THE_ERROR try { +#define END_HANDLE_THE_ERROR \ + } \ + catch (const std::exception& e) { \ + std::cout << e.what() << std::endl; \ + throw; \ + } +#else +#define HANDLE_THE_ERROR +#define END_HANDLE_THE_ERROR +#endif + +#ifdef __GNUC__ +inline std::string demangle(std::string name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + std::unique_ptr res{ + abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; + return (status == 0) ? res.get() : name; +} +#else +inline std::string demangle(std::string name) { return name; } +#endif + +namespace details { +template +inline constexpr bool IsArithmetic() { + return std::is_arithmetic::value; +} + +template +struct TypeConverterImpl { + using Type1 = typename std::common_type::type; + using Type2 = Type1; +}; + +template +struct TypeConverterImpl { + using Type1 = T1; + using Type2 = T2; +}; + +template +struct TypeConverter { + static constexpr bool kIsArithmetic = + IsArithmetic() && IsArithmetic(); + using Type1 = typename TypeConverterImpl::Type1; + using Type2 = typename TypeConverterImpl::Type2; +}; + +template +using CommonType1 = typename std::add_lvalue_reference< + typename std::add_const::Type1>::type>::type; + +template +using CommonType2 = typename std::add_lvalue_reference< + typename std::add_const::Type2>::type>::type; + +// Here, we use SFINAE to check whether T can be converted to std::string +template +struct CanToString { + private: + using YesType = uint8_t; + using NoType = uint16_t; + + template + static YesType Check(decltype(std::cout << std::declval())) { + return 0; + } + + template + static NoType Check(...) 
{
+ return 0;
+ }
+
+ public:
+ static constexpr bool kValue =
+ std::is_same<YesType, decltype(Check<T>(std::cout))>::value;
+};
+
+template <bool kStatus>
+struct BinaryCompareMessageConverter {
+ template <typename T>
+ static std::string Convert(const char* expression, const T& value) {
+ return expression + std::string(":") + paddle::string::to_string(value);
+ }
+};
+
+template <>
+struct BinaryCompareMessageConverter<false> {
+ template <typename T>
+ static const char* Convert(const char* expression, const T& value) {
+ return expression;
+ }
+};
+} // namespace details
+
+template <typename T>
+inline std::string ReplaceComplexTypeStr(std::string str,
+ const std::string& type_name) {
+ auto demangle_type_str = demangle(typeid(T).name());
+ size_t start_pos = 0;
+ while ((start_pos = str.find(demangle_type_str, start_pos)) !=
+ std::string::npos) {
+ str.replace(start_pos, demangle_type_str.length(), type_name);
+ start_pos += type_name.length();
+ }
+ return str;
+}
+
+#define __REPLACE_COMPLEX_TYPE_STR__(__TYPENAME, __STR) \
+ do { \
+ __STR = \
+ pten::enforce::ReplaceComplexTypeStr<__TYPENAME>(__STR, #__TYPENAME); \
+ } while (0)
+
+inline std::string SimplifyDemangleStr(std::string str) {
+ // the order is important: the more complex types have to be replaced first
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::AttributeMap, str);
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::Attribute, str);
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVariableWrapperMap, str);
+ __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVarBaseMap, str);
+ __REPLACE_COMPLEX_TYPE_STR__(std::string, str);
+ return str;
+}
+
+inline std::string GetCurrentTraceBackString(bool for_signal = false) {
+ std::ostringstream sout;
+
+ if (!for_signal) {
+ sout << "\n\n--------------------------------------\n";
+ sout << "C++ Traceback (most recent call last):";
+ sout << "\n--------------------------------------\n";
+ }
+#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL)
+ static constexpr int TRACE_STACK_LIMIT = 100;
+
+ void* call_stack[TRACE_STACK_LIMIT];
+ auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+ auto symbols = backtrace_symbols(call_stack, size);
+ Dl_info info;
+ int idx = 0;
+ // `for_signal` is used to drop the stack frames introduced by capturing
+ // the stack trace itself after a signal error occurred; those frames are
+ // not related to the signal error, so remove them to avoid misleading
+ // users and developers
+ int end_idx = for_signal ?
2 : 0; + for (int i = size - 1; i >= end_idx; --i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + std::string path(info.dli_fname); + // C++ traceback info are from core.so + if (path.substr(path.length() - 3).compare(".so") == 0) { + sout << paddle::string::Sprintf( + "%-3d %s\n", idx++, SimplifyDemangleStr(demangled)); + } + } + } + free(symbols); +#else + sout << "Not support stack backtrace yet.\n"; +#endif + return sout.str(); +} + +template +inline std::string GetErrorSumaryString(StrType&& what, + const char* file, + int line) { + std::ostringstream sout; + if (FLAGS_call_stack_level > 1) { + sout << "\n----------------------\nError Message " + "Summary:\n----------------------\n"; + } + sout << paddle::string::Sprintf( + "%s (at %s:%d)", std::forward(what), file, line) + << std::endl; + return sout.str(); +} + +template +inline std::string GetTraceBackString(StrType&& what, + const char* file, + int line) { + if (FLAGS_call_stack_level > 1) { + // FLAGS_call_stack_level>1 means showing c++ call stack + return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line); + } else { + return GetErrorSumaryString(what, file, line); + } +} + +inline std::string SimplifyErrorTypeFormat(const std::string& str) { + std::ostringstream sout; + size_t type_end_pos = str.find(":", 0); + if (type_end_pos == std::string::npos) { + sout << str; + } else { + // Remove "Error:", add "()"" + sout << "(" << str.substr(0, type_end_pos - 5) << ")" + << str.substr(type_end_pos + 1); + } + return sout.str(); +} + +inline bool is_error(bool stat) { return !stat; } + +// Note: This Macro can only be used within enforce.h +#define __THROW_ERROR_INTERNAL__(__ERROR_SUMMARY) \ + do { \ + HANDLE_THE_ERROR \ + throw ::pten::enforce::EnforceNotMet(__ERROR_SUMMARY, __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } while (0) + +/** ENFORCE EXCEPTION AND MACROS **/ + +struct EnforceNotMet : public std::exception { + public: + EnforceNotMet(std::exception_ptr e, const char* file, int line) { + try { + std::rethrow_exception(e); + } catch (EnforceNotMet& e) { + code_ = e.code(); + err_str_ = GetTraceBackString(e.what(), file, line); + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } catch (std::exception& e) { + err_str_ = GetTraceBackString(e.what(), file, line); + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } + } + + EnforceNotMet(const std::string& str, const char* file, int line) + : err_str_(GetTraceBackString(str, file, line)) { + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } + + EnforceNotMet(const pten::ErrorSummary& error, const char* file, int line) + : code_(error.code()), + err_str_(GetTraceBackString(error.to_string(), file, line)) { + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } + + const char* what() const noexcept override { + if (FLAGS_call_stack_level > 1) { + return err_str_.c_str(); + } else { + return simple_err_str_.c_str(); + } + } + + pten::ErrorCode code() const { return code_; } + + const std::string& error_str() const { return err_str_; } + + const std::string& simple_error_str() const { return simple_err_str_; } + + void set_error_str(std::string str) { + if (FLAGS_call_stack_level > 1) { + err_str_ = str; + } else { + simple_err_str_ = str; + } + } + + private: + // Used to determine the final type of exception thrown + pten::ErrorCode code_ = pten::ErrorCode::LEGACY; + // Complete error message + // e.g. 
InvalidArgumentError: ***
+ std::string err_str_;
+ // Simple error message used when no C++ stack and no Python compile stack are shown
+ // e.g. (InvalidArgument) ***
+ std::string simple_err_str_;
+};
+
+#define PADDLE_THROW(...) \
+ do { \
+ HANDLE_THE_ERROR \
+ throw ::pten::enforce::EnforceNotMet( \
+ ::pten::ErrorSummary(__VA_ARGS__), __FILE__, __LINE__); \
+ END_HANDLE_THE_ERROR \
+ } while (0)
+
+#if defined(__CUDA_ARCH__)
+// For cuda, the assertions can affect performance and it is therefore
+// recommended to disable them in production code
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
+#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \
+ do { \
+ if (!(_IS_NOT_ERROR)) { \
+ printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \
+ __FILE__, \
+ __LINE__, \
+ #_IS_NOT_ERROR, \
+ ##__VA_ARGS__); \
+ asm("trap;"); \
+ } \
+ } while (0)
+#elif defined(__HIPCC__)
+#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \
+ do { \
+ if (!(_IS_NOT_ERROR)) { \
+ printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \
+ __FILE__, \
+ __LINE__, \
+ #_IS_NOT_ERROR, \
+ ##__VA_ARGS__); \
+ abort(); \
+ } \
+ } while (0)
+#else
+#define PADDLE_ENFORCE(COND, ...) \
+ do { \
+ auto __cond__ = (COND); \
+ if (UNLIKELY(::pten::is_error(__cond__))) { \
+ __THROW_ERROR_INTERNAL__(pten::ErrorSummary(__VA_ARGS__)); \
+ } \
+ } while (0)
+#endif
+
+/*
+ * Some enforce helpers here, usage:
+ * int a = 1;
+ * int b = 2;
+ * PADDLE_ENFORCE_EQ(a, b);
+ *
+ * will raise an exception with a message described as follows:
+ * "Expected input a == b, but received a(1) != b(2)."
+ * with detailed stack information.
+ *
+ * extra messages are also supported, for example:
+ * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
+ */
+
+#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
+ do { \
+ if (UNLIKELY(nullptr == (__VAL))) { \
+ auto __summary__ = pten::ErrorSummary(__VA_ARGS__); \
+ auto __message__ = ::paddle::string::Sprintf( \
+ "%s\n [Hint: " #__VAL " should not be null.]", \
+ __summary__.error_message()); \
+ __THROW_ERROR_INTERNAL__( \
+ pten::ErrorSummary(__summary__.code(), __message__)); \
+ } \
+ } while (0)
+
+#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \
+ do { \
+ auto __val1 = (__VAL1); \
+ auto __val2 = (__VAL2); \
+ using __TYPE1__ = decltype(__val1); \
+ using __TYPE2__ = decltype(__val2); \
+ using __COMMON_TYPE1__ = \
+ ::pten::details::CommonType1<__TYPE1__, __TYPE2__>; \
+ using __COMMON_TYPE2__ = \
+ ::pten::details::CommonType2<__TYPE1__, __TYPE2__>; \
+ bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
+ static_cast<__COMMON_TYPE2__>(__val2)); \
+ if (UNLIKELY(!__is_not_error)) { \
+ auto __summary__ = pten::ErrorSummary(__VA_ARGS__); \
+ constexpr bool __kCanToString__ = \
+ ::pten::details::CanToString<__TYPE1__>::kValue && \
+ ::pten::details::CanToString<__TYPE2__>::kValue; \
+ auto __message__ = ::paddle::string::Sprintf( \
+ "%s\n [Hint: Expected %s " #__CMP \
+ " %s, but received %s " #__INV_CMP " %s.]", \
+ __summary__.error_message(), \
+ #__VAL1, \
+ #__VAL2, \
+ ::pten::details::BinaryCompareMessageConverter< \
+ __kCanToString__>::Convert(#__VAL1, __val1), \
+ ::pten::details::BinaryCompareMessageConverter< \
+ __kCanToString__>::Convert(#__VAL2, __val2)); \
+ __THROW_ERROR_INTERNAL__( \
+ pten::ErrorSummary(__summary__.code(), __message__)); \
+ } \
+ } while (0)
+
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...)
\
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
+ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
+
+/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/
+
+/*
+ * Summary: This macro is used to get Variable or internal type
+ * data (such as LoDTensor or SelectedRows) of the Input and
+ * Output in op, generally used when calling scope.FindVar(Input/
+ * Output("Name")) or ctx.Input<LoDTensor>().
+ * First, this macro checks whether the obtained pointer is null,
+ * and then returns the data if it is not null.
+ *
+ * Note: This macro is only suitable for specific scenarios and
+ * is not intended to be widely used. If it cannot meet the
+ * requirements, please use the other PADDLE_ENFORCE** check macros.
+ *
+ * Parameters:
+ * __PTR: pointer
+ * __ROLE: (string), Input or Output
+ * __NAME: (string), Input or Output name
+ * __OP_TYPE: (string), the op type
+ *
+ * Return: The data pointed to by the pointer.
+ *
+ * Examples:
+ * GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
+ */
+#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
+ (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
+ auto* __ptr = (__PTR); \
+ if (UNLIKELY(nullptr == __ptr)) { \
+ auto __summary__ = pten::errors::NotFound( \
+ "Unable to get %s data of %s %s in operator %s. " \
+ "Possible reasons are:\n" \
+ " 1. The %s is not the %s of operator %s;\n" \
+ " 2. The %s has no corresponding variable passed in;\n" \
+ " 3. The %s corresponding variable is not initialized.", \
+ pten::demangle( \
+ typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
+ .name()), \
+ __ROLE, \
+ __NAME, \
+ __OP_TYPE, \
+ __NAME, \
+ __ROLE, \
+ __OP_TYPE, \
+ __NAME, \
+ __NAME); \
+ auto __message__ = ::paddle::string::Sprintf( \
+ "%s\n [Hint: pointer " #__PTR " should not be null.]", \
+ __summary__.error_message()); \
+ __THROW_ERROR_INTERNAL__( \
+ pten::ErrorSummary(__summary__.code(), __message__)); \
+ } \
+ return *__ptr; \
+ })())
+
+/*
+ * Summary: This macro is used to check whether op has specified
+ * Input or Output Variables. Because op's Input and Output
+ * checks are written similarly, this macro abstracts them.
+ * + * Parameters: + *     __EXPR: (bool), the bool expression + * __ROLE: (string), Input or Output + * __NAME: (string), Input or Output name + * __OP_TYPE: (string), the op type + * + * Examples: + * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); + */ +#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ + do { \ + PADDLE_ENFORCE_EQ( \ + __EXPR, \ + true, \ + pten::errors::NotFound( \ + "No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \ + } while (0) + +} // namespace enforce +using namespace enforce; // NOLINT +} // namespace pten diff --git a/paddle/fluid/platform/errors.cc b/paddle/pten/core/errors.cc similarity index 63% rename from paddle/fluid/platform/errors.cc rename to paddle/pten/core/errors.cc index 94a182f96567889c3093a2eca0d7ac013599c471..c567cfe66465cc90a313b88d4627dceafa627798 100644 --- a/paddle/fluid/platform/errors.cc +++ b/paddle/pten/core/errors.cc @@ -12,54 +12,50 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/errors.h" +#include "paddle/pten/core/errors.h" #include -namespace paddle { -namespace platform { - -typedef ::paddle::platform::error::Code Code; - -std::string error_name(Code code) { +namespace pten { +std::string error_name(ErrorCode code) { switch (code) { - case paddle::platform::error::LEGACY: + case ErrorCode::LEGACY: return "Error"; break; - case paddle::platform::error::INVALID_ARGUMENT: + case ErrorCode::INVALID_ARGUMENT: return "InvalidArgumentError"; break; - case paddle::platform::error::NOT_FOUND: + case ErrorCode::NOT_FOUND: return "NotFoundError"; break; - case paddle::platform::error::OUT_OF_RANGE: + case ErrorCode::OUT_OF_RANGE: return "OutOfRangeError"; break; - case paddle::platform::error::ALREADY_EXISTS: + case ErrorCode::ALREADY_EXISTS: return "AlreadyExistsError"; break; - case paddle::platform::error::RESOURCE_EXHAUSTED: + case ErrorCode::RESOURCE_EXHAUSTED: return "ResourceExhaustedError"; break; - case paddle::platform::error::PRECONDITION_NOT_MET: + case ErrorCode::PRECONDITION_NOT_MET: return "PreconditionNotMetError"; break; - case paddle::platform::error::PERMISSION_DENIED: + case ErrorCode::PERMISSION_DENIED: return "PermissionDeniedError"; break; - case paddle::platform::error::EXECUTION_TIMEOUT: + case ErrorCode::EXECUTION_TIMEOUT: return "ExecutionTimeoutError"; break; - case paddle::platform::error::UNIMPLEMENTED: + case ErrorCode::UNIMPLEMENTED: return "UnimplementedError"; break; - case paddle::platform::error::UNAVAILABLE: + case ErrorCode::UNAVAILABLE: return "UnavailableError"; break; - case paddle::platform::error::FATAL: + case ErrorCode::FATAL: return "FatalError"; break; - case paddle::platform::error::EXTERNAL: + case ErrorCode::EXTERNAL: return "ExternalError"; break; default: @@ -74,6 +70,4 @@ std::string ErrorSummary::to_string() const { result += error_message(); return result; } - -} // namespace platform -} // namespace paddle +} // namespace pten diff --git a/paddle/pten/core/errors.h b/paddle/pten/core/errors.h new file mode 100644 index 0000000000000000000000000000000000000000..56bbeef644f9e1dd1087a3923d302180975b7573 --- /dev/null +++ b/paddle/pten/core/errors.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/utils/string/printf.h"
+
+namespace pten {
+enum ErrorCode {
+ // Legacy error.
+ // Error type string: "Error"
+ LEGACY = 0,
+
+ // Client specified an invalid argument.
+ // Error type string: "InvalidArgumentError"
+ INVALID_ARGUMENT = 1,
+
+ // Some requested entity (e.g., file or directory) was not found.
+ // Error type string: "NotFoundError"
+ NOT_FOUND = 2,
+
+ // Operation tried to iterate past the valid input range. E.g., seeking or
+ // reading past end of file.
+ // Error type string: "OutOfRangeError"
+ OUT_OF_RANGE = 3,
+
+ // Some entity that we attempted to create (e.g., file or directory)
+ // already exists.
+ // Error type string: "AlreadyExistsError"
+ ALREADY_EXISTS = 4,
+
+ // Some resource has been exhausted, perhaps a per-user quota, or
+ // perhaps the entire file system is out of space.
+ // Error type string: "ResourceExhaustedError"
+ RESOURCE_EXHAUSTED = 5,
+
+ // Operation was rejected because the system is not in a state
+ // required for the operation's execution.
+ // Error type string: "PreconditionNotMetError"
+ PRECONDITION_NOT_MET = 6,
+
+ // The caller does not have permission to execute the specified
+ // operation.
+ // Error type string: "PermissionDeniedError"
+ PERMISSION_DENIED = 7,
+
+ // Deadline expired before operation could complete.
+ // Error type string: "ExecutionTimeoutError"
+ EXECUTION_TIMEOUT = 8,
+
+ // Operation is not implemented or not supported/enabled in this service.
+ // Error type string: "UnimplementedError"
+ UNIMPLEMENTED = 9,
+
+ // The service is currently unavailable. This is most likely a
+ // transient condition and may be corrected by retrying with
+ // a backoff.
+ // Error type string: "UnavailableError"
+ UNAVAILABLE = 10,
+
+ // Fatal errors. This means some invariant expected by the underlying
+ // system has been broken. If you see one of these errors,
+ // something is very broken.
+ // Error type string: "FatalError"
+ FATAL = 11,
+
+ // Third-party library error.
+ // Error type string: "ExternalError"
+ EXTERNAL = 12,
+};
+
+class ErrorSummary {
+ public:
+ // Note(chenweihang): Final deprecated constructor.
+ // This constructor is kept to stay compatible with
+ // the existing untyped PADDLE_ENFORCE_* and
+ // PADDLE_ENFORCE macros.
+ // Note(chenweihang): Windows openblas needs this
+ // constructor for compiling PADDLE_ENFORCE in *.cu;
+ // this is a bug, but because of it we can't remove
+ // this constructor now.
+ template <typename... Args>
+ explicit ErrorSummary(Args... args) {
+ code_ = pten::ErrorCode::LEGACY;
+ msg_ = paddle::string::Sprintf(args...);
+ }
+
+ // Note(chenweihang): The only recommended constructor.
+ // PADDLE_ENFORCE without a type or without an error
+ // message is no longer supported.
+ explicit ErrorSummary(ErrorCode code, std::string msg)
+ : code_(code), msg_(msg) {}
+
+ ErrorCode code() const { return code_; }
+
+ const std::string& error_message() const { return msg_; }
+
+ std::string to_string() const;
+
+ private:
+ ErrorCode code_;
+ std::string msg_;
+};
+
+namespace errors {
+
+#define REGISTER_ERROR(FUNC, CONST, ...)
\ + template \ + ::pten::ErrorSummary FUNC(Args... args) { \ + return ::pten::ErrorSummary(::pten::CONST, \ + ::paddle::string::Sprintf(args...)); \ + } + +REGISTER_ERROR(InvalidArgument, ErrorCode::INVALID_ARGUMENT) +REGISTER_ERROR(NotFound, ErrorCode::NOT_FOUND) +REGISTER_ERROR(OutOfRange, ErrorCode::OUT_OF_RANGE) +REGISTER_ERROR(AlreadyExists, ErrorCode::ALREADY_EXISTS) +REGISTER_ERROR(ResourceExhausted, ErrorCode::RESOURCE_EXHAUSTED) +REGISTER_ERROR(PreconditionNotMet, ErrorCode::PRECONDITION_NOT_MET) +REGISTER_ERROR(PermissionDenied, ErrorCode::PERMISSION_DENIED) +REGISTER_ERROR(ExecutionTimeout, ErrorCode::EXECUTION_TIMEOUT) +REGISTER_ERROR(Unimplemented, ErrorCode::UNIMPLEMENTED) +REGISTER_ERROR(Unavailable, ErrorCode::UNAVAILABLE) +REGISTER_ERROR(Fatal, ErrorCode::FATAL) +REGISTER_ERROR(External, ErrorCode::EXTERNAL) + +#undef REGISTER_ERROR + +} // namespace errors +} // namespace pten diff --git a/paddle/pten/core/infermeta_utils.h b/paddle/pten/core/infermeta_utils.h index c6812dee92b6a77534faa6a8853e322e285d2c6d..bfc9d29e63709f7ad6eff498953027003c677edf 100644 --- a/paddle/pten/core/infermeta_utils.h +++ b/paddle/pten/core/infermeta_utils.h @@ -151,7 +151,7 @@ struct InferMetaFnImpl { struct InferMetaFnCallHelper { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { - const MetaConfig& arg = ctx->GetMetaConfig(); + MetaConfig arg = ctx->GetMetaConfig(); InferMetaFnCallHelper::template Call( ctx, pargs..., arg); } diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 8e089970f9139e2ec2fb2d84644e9982018bbbd1..e473861dcf09c88de3936ca8849aedc6dac744d6 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -21,6 +21,7 @@ namespace pten { // the key is sorted by key's alphabet const std::unordered_map kernel_alias_name_map = { {"elementwise_add", "add_raw"}, + {"elementwise_add_grad", "add_grad"}, {"elementwise_div", "divide_raw"}, {"elementwise_mul", "muliply_raw"}, {"elementwise_sub", "subtract_raw"}, diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h index 5dd2bf367b3b83fbef585239af6a11c552821398..def1019e204cd85da56f1a45e162f3e9c4251af3 100644 --- a/paddle/pten/core/kernel_context.h +++ b/paddle/pten/core/kernel_context.h @@ -24,7 +24,7 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { @@ -123,7 +123,7 @@ class KernelContext { try { return paddle::any_cast(attrs_.at(idx)); } catch (paddle::bad_any_cast&) { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(pten::errors::InvalidArgument( "Attribute cast error in Op Kernel Context.")); } } diff --git a/paddle/pten/core/kernel_factory.cc b/paddle/pten/core/kernel_factory.cc index f10b58506f728ed39b62ec6c6efad621ab8ce926..06049b237d57946d89dd3793211a2f6af85f610f 100644 --- a/paddle/pten/core/kernel_factory.cc +++ b/paddle/pten/core/kernel_factory.cc @@ -15,7 +15,7 @@ #include "paddle/pten/core/kernel_factory.h" // See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { @@ -64,8 +64,8 @@ const Kernel& KernelFactory::SelectKernelOrThrowError( auto iter = kernels_.find(kernel_name); PADDLE_ENFORCE_NE(iter, kernels_.end(), - paddle::platform::errors::NotFound( - "The kernel `%s` is not registered.", kernel_name)); + pten::errors::NotFound("The kernel `%s` is not registered.", + kernel_name)); auto kernel_iter = iter->second.find(kernel_key); // TODO(chenweihang): polish refind impl here @@ -78,7 +78,7 @@ const Kernel& KernelFactory::SelectKernelOrThrowError( PADDLE_ENFORCE_NE( kernel_iter, iter->second.end(), - paddle::platform::errors::NotFound( + pten::errors::NotFound( "The kernel with key %s of kernel `%s` is not registered.", kernel_key, kernel_name)); diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h index bd26d86a34a0942da61f08c040fdd6a0ec47a2cf..8a100451cd4a8f99064e4a5b129e2c50413befa4 100644 --- a/paddle/pten/core/kernel_factory.h +++ b/paddle/pten/core/kernel_factory.h @@ -27,7 +27,7 @@ #include "paddle/pten/core/kernel_def.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index e1160ea6b7d5dfa0bcb086247e3cecc99a3fdb78..800c01f6916821e75a5335de7b1efd91cd6ea9f8 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -26,7 +26,7 @@ #include "paddle/pten/core/kernel_utils.h" #include "paddle/pten/core/macros.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index 60201151c62a23130878d93cc0992f9b6e79c02e..85fe2f22836e61bf7348fa0bbe36c9efb2b02331 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -22,7 +22,7 @@ #include "paddle/pten/core/kernel_def.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { diff --git a/paddle/pten/core/meta_tensor.cc b/paddle/pten/core/meta_tensor.cc index f52d771b73bb90312a1080fea80aa476bcd90d95..a8229b568a617160ba4d1870f9c6954fb0697de6 100644 --- a/paddle/pten/core/meta_tensor.cc +++ b/paddle/pten/core/meta_tensor.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/pten/core/meta_tensor.h b/paddle/pten/core/meta_tensor.h index 442ff4137de4267e863c169df3dceb4deca2757a..1435e1c3912d0cc661beb839c354171272fbfac5 100644 --- a/paddle/pten/core/meta_tensor.h +++ b/paddle/pten/core/meta_tensor.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/paddle/pten/core/selected_rows.cc b/paddle/pten/core/selected_rows.cc index 6f64602bdcf4d9f70d57a76677a1796b373808ac..1dfcfa49347b50d305c2b37ccc4379eedb08a107 100644 --- a/paddle/pten/core/selected_rows.cc +++ b/paddle/pten/core/selected_rows.cc @@ -13,9 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/pten/core/selected_rows.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/data_type.h" +#include "paddle/pten/core/utils/data_type.h" namespace pten { @@ -191,16 +189,16 @@ void SelectedRows::Get(const pten::DenseTensor& ids, int64_t index = AutoGrownIndex(id, auto_grown, is_test); if (index < 0) { VLOG(5) << "id " << id << " not in the table, return 0"; - paddle::framework::VisitDataType( - value_->type(), + pten::VisitDataType( + value_->dtype(), TensorFillVisitor(value, i * value_width, value_width, 0.0)); } else { - paddle::framework::VisitDataType(value_->type(), - TensorCopyVisitor(value, - i * value_width, - *value_.get(), - index * value_width, - value_width)); + pten::VisitDataType(value_->dtype(), + TensorCopyVisitor(value, + i * value_width, + *value_.get(), + index * value_width, + value_width)); } } } diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h index f5be0a906dbdbb5339f995430a95a4be106a4a62..e12f59d02f2ba21054700248404640730614b277 100644 --- a/paddle/pten/core/selected_rows.h +++ b/paddle/pten/core/selected_rows.h @@ -24,15 +24,16 @@ limitations under the License. */ #include "paddle/pten/common/place.h" #include "paddle/pten/core/ddim.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/utils/rw_lock.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/enforce.h" namespace pten { -class SelectedRows { +class SelectedRows : public TensorBase, + public TypeInfoTraits { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. * A sparse table is a key-value structure that the key is an `int64_t`, @@ -51,21 +52,19 @@ class SelectedRows { public: SelectedRows(const std::vector& rows, const int64_t& height) : rows_(rows), height_(height) { - value_.reset(new pten::DenseTensor()); + value_.reset(new DenseTensor()); rwlock_.reset(new RWLock); } SelectedRows() { height_ = 0; - value_.reset(new pten::DenseTensor()); + value_.reset(new DenseTensor()); rwlock_.reset(new RWLock); } - const pten::Place& place() const { return value_->place(); } + const DenseTensor& value() const { return *value_; } - const pten::DenseTensor& value() const { return *value_; } - - pten::DenseTensor* mutable_value() { return value_.get(); } + DenseTensor* mutable_value() { return value_.get(); } int64_t height() const { return height_; } @@ -109,8 +108,8 @@ class SelectedRows { * @return a list of pair which contains the non-exists key and the index in * the value */ - void Get(const pten::DenseTensor& ids, - pten::DenseTensor* value, + void Get(const DenseTensor& ids, + DenseTensor* value, bool auto_grown = false, bool is_test = false); @@ -149,6 +148,41 @@ class SelectedRows { return pten::framework::make_ddim(dims); } + /// \brief Returns the name of the class for type traits. + /// \return The name of the class. + static const char* name() { return "SelectedRows"; } + + /// \brief Returns the number of elements contained in tensor. 
+ /// \return The number of elements contained in tensor. + int64_t numel() const override { return value_->numel(); }; + + /// \brief Returns the dims of the tensor. + /// \return The dims of the tensor. + const DDim& dims() const noexcept override { + return value_->dims(); + // return paddle::framework::make_ddim(dims); + } + + /// \brief Returns the data type of the tensor. + /// \return The data type of the tensor. + DataType dtype() const noexcept override { return value_->dtype(); } + + /// \brief Returns the data layout of the tensor. + /// \return The data layout of the tensor. + DataLayout layout() const noexcept override { return value_->layout(); } + + /// \brief Returns the data place of the tensor. + /// \return The data place of the tensor. + const Place& place() const override { return value_->place(); }; + + /// \brief Test whether the metadata is valid. + /// \return Whether the metadata is valid. + bool valid() const noexcept override { return value_->valid(); } + + /// \brief Test whether the storage is allocated. + /// return Whether the storage is allocated. + bool initialized() const override { return value_->initialized(); } + private: // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. // SelectedRows are simply concated when adding together. Until a @@ -156,7 +190,7 @@ class SelectedRows { paddle::framework::Vector rows_; std::unordered_map id_to_index_; // should not be used when rows_ has duplicate member - std::unique_ptr value_{nullptr}; + std::unique_ptr value_{nullptr}; int64_t height_; // height indicates the underline tensor's height std::unique_ptr rwlock_{nullptr}; }; diff --git a/paddle/pten/core/tensor_status.h b/paddle/pten/core/tensor_status.h deleted file mode 100644 index e426a27eabb882adf447d610c957173a46903c49..0000000000000000000000000000000000000000 --- a/paddle/pten/core/tensor_status.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/common/backend.h" -#include "paddle/pten/common/data_type.h" -#include "paddle/pten/common/layout.h" -namespace pten { -class TensorInplaceVersion { - public: - explicit TensorInplaceVersion(uint32_t inplace_version = 0) - : inplace_version_(inplace_version) {} - bool IsUnique() const { return inplace_version_ == 0; } - void Bump() { ++inplace_version_; } - uint32_t CurrentVersion() const { return inplace_version_; } - - private: - uint32_t inplace_version_; -}; - -/** - * The Status data member of DenseTensor. - * - * Here the `static` represents information describing the status of Tensor, - * such as version counter, or other bool status members. - * - * Note: TensorStatus is a struct, the members are named like - * ordinary nonmember variables, such as `type` instead of `type_`. - * And we direct access its members, in addition to constructor, destructor - * and functions for setting data members, can not provide other functions. 
- * - * Note: polish impl later - */ -struct TensorStatus { - TensorStatus() = default; - TensorStatus(const TensorStatus&) = default; - TensorStatus(TensorStatus&&) = default; - - TensorStatus& operator=(const TensorStatus&) = delete; - TensorStatus& operator=(TensorStatus&&) = delete; - - TensorInplaceVersion inplace_version_counter{0}; - - /** - * For Scalar Tensor design - */ - bool is_scalar{false}; -}; - -} // namespace pten diff --git a/paddle/pten/core/type_defs.h b/paddle/pten/core/type_defs.h new file mode 100644 index 0000000000000000000000000000000000000000..13e7bb51c2e1bada6957108faace579b3cb76ecc --- /dev/null +++ b/paddle/pten/core/type_defs.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace paddle { +namespace framework { +// The order should be as same as framework.proto +// NOTE(xiongkun): we extract from framework/typedef.h to ensure we can transfer +// enforce.h +class BlockDesc; +using Attribute = boost::variant, + std::vector, + std::vector, + bool, + std::vector, + BlockDesc*, + int64_t, + std::vector, + std::vector, + std::vector>; +using AttributeMap = std::unordered_map; +} // namespace framework + +namespace imperative { + +class VariableWrapper; +class SavedVariableWrapperList; +class VarBase; +class OpBase; +class GradOpNode; +class Tracer; + +using WeakNameVarBaseMap = + std::map>>; + +namespace details { +template +struct NameVarMapTrait {}; + +template <> +struct NameVarMapTrait { + using Type = std::map>>; +}; + +template <> +struct NameVarMapTrait { + using Type = std::map; +}; +} // namespace details + +template +using NameVarMap = typename details::NameVarMapTrait::Type; + +using NameVarBaseMap = NameVarMap; +using NameVariableWrapperMap = NameVarMap; + +using VariableWrapperList = std::vector>; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/pten/core/utils/data_type.h b/paddle/pten/core/utils/data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..ee223afb3b03c0e2b770097e4313ce31c45927ea --- /dev/null +++ b/paddle/pten/core/utils/data_type.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/kernels/funcs/eigen/extensions.h" + +namespace pten { + +#define _PtenForEachDataTypeHelper_(callback, cpp_type, data_type) \ + callback(cpp_type, data_type); + +#define _PtenForEachDataType_(callback) \ + _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::float16, DataType::FLOAT16); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::bfloat16, DataType::BFLOAT16); \ + _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ + _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ + _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ + _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ + _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ + _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::complex<float>, DataType::COMPLEX64); \ + _PtenForEachDataTypeHelper_( \ + callback, ::paddle::platform::complex<double>, DataType::COMPLEX128); + +template <typename Visitor> +inline void VisitDataType(pten::DataType type, Visitor visitor) { +#define PtenVisitDataTypeCallback(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply<cpp_type>(); \ + return; \ + } \ + } while (0) + + _PtenForEachDataType_(PtenVisitDataTypeCallback); +#undef PtenVisitDataTypeCallback + PADDLE_THROW(pten::errors::Unimplemented( + "Unsupported pten::DataType(%d) as data type.", + static_cast<int>(type))); +} +} // namespace pten diff --git a/paddle/pten/core/utils/intrusive_ptr.h b/paddle/pten/core/utils/intrusive_ptr.h index ed9a21e7f3a8a6c169a4f83572cfc9be6ff3a8d6..40f1dba4f64ad7688753042cb28ed2115d73f8aa 100644 --- a/paddle/pten/core/utils/intrusive_ptr.h +++ b/paddle/pten/core/utils/intrusive_ptr.h @@ -16,7 +16,7 @@ limitations under the License.
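A brief usage sketch of the VisitDataType dispatcher defined above: the macro table expands to one if-branch per (cpp_type, data_type) pair and invokes visitor.template apply<cpp_type>(). Any struct with a templated apply<T>() member works; the CountBytesVisitor name below is hypothetical, a minimal sketch assuming only the header introduced in this patch.

#include <cstddef>
#include <iostream>

#include "paddle/pten/core/utils/data_type.h"

// Hypothetical visitor: maps the runtime pten::DataType to a static C++ type
// and records sizeof() of that type.
struct CountBytesVisitor {
  std::size_t* bytes;
  template <typename T>
  void apply() const {
    *bytes = sizeof(T);
  }
};

int main() {
  std::size_t bytes = 0;
  pten::VisitDataType(pten::DataType::FLOAT32, CountBytesVisitor{&bytes});
  std::cout << bytes << std::endl;  // prints 4
  return 0;
}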
*/ #include #include "glog/logging.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/enforce.h" namespace pten { @@ -62,7 +62,7 @@ class intrusive_ptr { T& operator*() const { PADDLE_ENFORCE_NOT_NULL( px, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The pointer must be non-null before the dereference operation.")); return *px; } @@ -70,7 +70,7 @@ class intrusive_ptr { T* operator->() const { PADDLE_ENFORCE_NOT_NULL( px, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The pointer must be non-null before the dereference operation.")); return px; } diff --git a/paddle/pten/infermeta/CMakeLists.txt b/paddle/pten/infermeta/CMakeLists.txt index 8e50d9d2c90d435eddd75f110ca7de38e11c9044..2216d38708b0b4746e55481ca63299b96b496eb6 100644 --- a/paddle/pten/infermeta/CMakeLists.txt +++ b/paddle/pten/infermeta/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils) +cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils infermeta_utils) cc_library(backward_infermeta SRCS backward.cc DEPS convert_utils) diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index 083fb0fca21881bcfbf078d31fb23687d07864f2..cb605db78d962e2deff1295686c5e95945f02531 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -131,8 +131,13 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, } DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - int axis) { + const DenseTensorMeta& y_meta) { + return ElementwiseRawInferMeta(x_meta, y_meta, -1); +} + +DenseTensorMeta ElementwiseRawInferMeta(const DenseTensorMeta& x_meta, + const DenseTensorMeta& y_meta, + int axis) { DenseTensorMeta return_meta(x_meta.dtype, x_meta.dims, x_meta.layout); if (x_meta.dims != y_meta.dims) { auto x_dims = x_meta.dims; diff --git a/paddle/pten/infermeta/binary.h b/paddle/pten/infermeta/binary.h index c86fc12a20abef6db422b93c1aa258e008688e0c..658211e48ac0a44c57d83ce63154e481a90ce69c 100644 --- a/paddle/pten/infermeta/binary.h +++ b/paddle/pten/infermeta/binary.h @@ -42,6 +42,10 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, bool trans_y); DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, - const DenseTensorMeta& y_meta, - int axis); + const DenseTensorMeta& y_meta); + +DenseTensorMeta ElementwiseRawInferMeta(const DenseTensorMeta& x_meta, + const DenseTensorMeta& y_meta, + int axis); + } // namespace pten diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc index 27e1dc9511df231ba3c81f9a1ece7dbaafdb2450..fec50d528dfc42f357c006ef895549465a02f3e7 100644 --- a/paddle/pten/infermeta/unary.cc +++ b/paddle/pten/infermeta/unary.cc @@ -12,12 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// See Note [ Why still include the fluid headers? 
] #include "paddle/pten/infermeta/unary.h" + #include + +#include "paddle/pten/core/infermeta_utils.h" + namespace pten { +void UnchangedInferMetaNew(MetaConfig config, + const MetaTensor& x, + MetaTensor* out) { + out->set_dims(x.dims()); + out->share_lod(x); +} + DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta) { return x_meta; } @@ -232,6 +241,16 @@ DenseTensorMeta ReshapeInferMeta(const DenseTensorMeta& x_meta, return InferMetaFromVecValue(x_meta, shape.GetData()); } +/* Why not use ReduceInferMeta directly? + Because we need to make the InferMeta function's args follow the design of + api.yaml. +*/ +DenseTensorMeta SumInferMeta(const DenseTensorMeta& x_meta, + const std::vector<int64_t>& axis, + DataType dtype, + bool keep_dim) { + return ReduceInferMeta(x_meta, axis, keep_dim, dtype); +} + DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, const std::vector<int64_t>& axis, bool keep_dim, diff --git a/paddle/pten/infermeta/unary.h b/paddle/pten/infermeta/unary.h index ae42cbd5dd2c6d764bd10660834f24aa002baeab..670c70de84ccfdd01288ad5ad02b0b0ce5226c24 100644 --- a/paddle/pten/infermeta/unary.h +++ b/paddle/pten/infermeta/unary.h @@ -16,23 +16,27 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/pten/common/scalar_array.h" +#include "paddle/pten/core/infermeta_utils.h" +#include "paddle/pten/core/meta_tensor.h" #include "paddle/pten/core/tensor_meta.h" namespace pten { +class MetaConfig; + // Common InferMeta functions for unary operators; the format is like: // -// 1. DenseTensorMeta [OpName]InferMeta(const DenseTensorMeta& x_meta, ...) -// {} -// 2. std::pair [OpName]InferMeta(const -// DenseTensorMeta& -// x_meta, ...) {} -// 3. std::tuple -// [OpName]InferMeta(const -// DenseTensorMeta& x_meta, ...) -// NOTE: The name "InferMeta" may be not appropriate. "InferMeta" may be good. -// Because functions in this file -// not only can infer shape, but alse need infer lod or other useful data. +// void [OpName]InferMeta(const MetaTensor& x, ..., MetaTensor* out) {} +// +// NOTE: The name "InferShape" may not be appropriate; "InferMeta" may be +// better, because the functions in this file not only infer shape but also +// need to infer the lod or other useful metadata.
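To make the new format concrete, here is a hypothetical InferMeta function in the void [OpName]InferMeta(..., MetaTensor* out) style described above. The function name is illustrative; only dims()/set_dims() on MetaTensor are assumed (matching UnchangedInferMetaNew), and make_ddim is assumed to be reachable under paddle::framework as elsewhere in this patch.

// Hypothetical example: infer the output meta of an op that flattens x to
// rank 1. Only metadata flows through; no allocation happens here.
void FlattenTo1DInferMeta(MetaConfig config,
                          const MetaTensor& x,
                          MetaTensor* out) {
  int64_t numel = 1;
  for (int i = 0; i < x.dims().size(); ++i) {
    numel *= x.dims()[i];
  }
  out->set_dims(paddle::framework::make_ddim({numel}));
}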
+ +// TODO(chenweihang): update all InferMeta function formats in the next PR; +// for now, UnchangedInferMetaNew is added to test the new format. +void UnchangedInferMetaNew(MetaConfig config, + const MetaTensor& x, + MetaTensor* out); DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta); @@ -58,4 +62,9 @@ DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, const std::vector<int64_t>& axis, bool keep_dim, DataType dtype = DataType::UNDEFINED); + +DenseTensorMeta SumInferMeta(const DenseTensorMeta& x_meta, + const std::vector<int64_t>& axis, + DataType dtype, + bool keep_dim); } // namespace pten diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index 999f72a7e6b657613067cd9311a774b6d3a69b8d..615b80be592a081c044e458e46b52a3cb866c369 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -9,7 +9,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PTEN_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 6bfde977ce51789d1d62736338f1098a8d4783a7..179a1881189222e18f2dde14c35c14caadc831f4 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -706,4 +706,94 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, } } +// NOTE(dzhwinter): Only used in elementwise_add and elementwise_sub. +// The explicit gradient can cut X, Y and Out off from the gradient op. +// In elementwise_add and elementwise_sub, we use dout as a fake X, Y and Out +// to reuse the elementwise code.
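Spelling out the reasoning behind that NOTE: for elementwise add and sub the local derivatives are constants, so the backward pass never needs the forward values, and dout can be passed wherever X, Y or Out is expected:

\[
z = x + y \;\Rightarrow\; \frac{\partial L}{\partial x} = \frac{\partial L}{\partial z}, \quad \frac{\partial L}{\partial y} = \frac{\partial L}{\partial z}; \qquad z = x - y \;\Rightarrow\; \frac{\partial L}{\partial y} = -\frac{\partial L}{\partial z}.
\]

When an operand was broadcast in the forward pass, its gradient is additionally summed over the broadcast axes, which is the case the ElemwiseGradComputeWithBroadcast branch below handles.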
+template <typename T, typename DX_OP, typename DY_OP> +void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim& x_dim = x.dims(); + const DDim& y_dim = y.dims(); + if (x.dims() == y.dims()) { + pten::funcs::ElemwiseGradComputeNoBroadcast<CPUContext, T, DX_OP, DY_OP>( + dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } else { + ElemwiseGradComputeWithBroadcast<T, DX_OP, DY_OP>(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } +} + +// Add Grad + +template <typename T> +struct IdentityGrad { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template <typename T> +typename std::enable_if<std::is_floating_point<T>::value>::type +elementwise_add_grad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + auto blas = paddle::operators::math::GetBlas<CPUContext, T>(ctx); + if (dx) { + blas.VCOPY( + dout.numel(), dout.data<T>(), dx->mutable_data<T>(ctx.GetPlace())); + } + + if (dy) { + blas.VCOPY( + dout.numel(), dout.data<T>(), dy->mutable_data<T>(ctx.GetPlace())); + } +} + +template <typename T> +typename std::enable_if<!std::is_floating_point<T>::value>::type +elementwise_add_grad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute<T, IdentityGrad<T>, IdentityGrad<T>>( + ctx, x, y, out, dout, axis, dx, dy, IdentityGrad<T>(), IdentityGrad<T>()); +} + } // namespace pten diff --git a/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a940c2be15c00e129611a6da7e5d5a6d1545a27 --- /dev/null +++ b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
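The two elementwise_add_grad overloads above dispatch on std::is_floating_point: for float and double the gradient of add is a plain copy of dout (BLAS VCOPY), while every other type falls back to the generic functor path. A self-contained sketch of the same enable_if pattern, with hypothetical names:

#include <iostream>
#include <type_traits>

// Fast path: chosen when T is float or double, mirroring the VCOPY overload.
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value>::type AddGradPath() {
  std::cout << "floating point: copy dout via BLAS VCOPY\n";
}

// Generic path: chosen for all other types (int, complex, ...), mirroring the
// ElemwiseExplicitGradCompute overload.
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value>::type AddGradPath() {
  std::cout << "other type: generic ElemwiseExplicitGradCompute path\n";
}

int main() {
  AddGradPath<double>();  // prints the fast-path message
  AddGradPath<int>();     // prints the generic-path message
  return 0;
}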
+ +#include "paddle/pten/kernels/elementwise_grad_kernel.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/cpu/elementwise.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace pten { + +template +void AddGradFunc(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + } else { + ElemwiseExplicitGradCompute, IdentityGrad>( + dev_ctx, + x, + y, + out, + dout, + axis, + dx, + dy, + IdentityGrad(), + IdentityGrad()); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + pten::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + pten::AddDoubleGradImpl( + dev_ctx, + y, + ddx, + ddy, + dout, + axis, + ddout, + ElementwiseCompute, T>, + ElementwiseCompute, T>); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + pten::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace pten + +PT_REGISTER_KERNEL(add_grad, + CPU, + ALL_LAYOUT, + pten::AddGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_double_grad, + CPU, + ALL_LAYOUT, + pten::AddDoubleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_triple_grad, + CPU, + ALL_LAYOUT, + pten::AddTripleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/elementwise_grad_kernel.h b/paddle/pten/kernels/elementwise_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..067eebc9e15b95c870a7fac4c03bb52d79fed511 --- /dev/null +++ b/paddle/pten/kernels/elementwise_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace pten { + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout); + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 1c18e9f7998adc777c1f267ecf66ba1ad673112b..9ea27fd9c5b8d5f9b9a4d6fb0d6cb608d13f5984 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -14,18 +14,20 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/empty_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/kernels/primitive/kernel_primitives.h" -namespace kps = paddle::operators::kernel_primitives; +namespace kps = pten::kps; #endif @@ -360,6 +362,43 @@ inline void get_mid_dims(const DDim &x_dims, } } +// for broadcast backwards +static inline std::vector GetReduceDim(const paddle::framework::DDim &in, + const paddle::framework::DDim &out, + int axis) { + axis = + (axis == -1 ? std::abs(static_cast(out.size() - in.size())) : axis); + std::vector dims; + for (int i = 0; i < axis; ++i) { + dims.push_back(i); + } + for (int i = 0; i < in.size(); ++i) { + if (out[i + axis] != in[i]) { + dims.push_back(i + axis); + } + } + for (int i = axis + in.size(); i < out.size(); ++i) { + dims.push_back(i); + } + return dims; +} + +template +static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor *ddx, + DenseTensor *ddx_safe) { + if (ddx) { + *ddx_safe = *ddx; + } else { + auto meta = pten::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); + *ddx_safe = pten::Empty(dev_ctx, std::move(meta)); + ddx_safe->mutable_data(dev_ctx.GetPlace()); + paddle::operators::math::SetConstant set_zero; + set_zero(dev_ctx, ddx_safe, static_cast(0)); + } +} + template mutable_data(dev_ctx.GetPlace())}); } +inline void ElementwiseGradPreProcess(const DenseTensor &dout, + DenseTensor *dx) { + if (dx != nullptr) { + dx->set_lod(dout.lod()); + } +} + #if defined(__NVCC__) || defined(__HIPCC__) template diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h index 6d139d68530befe57bc0094eb3d5537cf00e660b..0b279d5325905885d69943629523f130b2411aff 100644 --- a/paddle/pten/kernels/funcs/elementwise_functor.h +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/common/float16.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/hostdevice.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index f4d8e442fcdebfe76cfa89df82d3132a7a65fae4..9a3ae7f12dfcd62a1a18154971fa99ab72c5561d 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -14,9 +14,11 @@ limitations under the License. */ #pragma once +#include "paddle/pten/kernels/copy_kernel.h" #include "paddle/pten/kernels/funcs/common_shape.h" #include "paddle/pten/kernels/funcs/cuda_kernel_config.h" #include "paddle/pten/kernels/funcs/elementwise_base.h" +#include "paddle/pten/kernels/gpu/reduce.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; @@ -578,6 +580,20 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, } } +template +void ElementwiseCompute(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + std::vector ins = {&x, &y}; + std::vector outs = {z}; + z->mutable_data(dev_ctx.GetPlace()); + pten::LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, axis, func); +} + // BACKWARD CODE // Suppose only has contiguous dims @@ -1938,4 +1954,130 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, } } +template +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + int loop = size / vec_size; + int remainder = size % vec_size; + const float4 *dout_vec = reinterpret_cast(dout); + float4 *dx_vec = reinterpret_cast(dx); + float4 *dy_vec = reinterpret_cast(dy); + float4 tmp_loop; + + for (int i = tid; i < loop; i += stride) { + tmp_loop = dout_vec[i]; + dx_vec[i] = tmp_loop; + dy_vec[i] = tmp_loop; + } + + if (tid == loop && remainder != 0) { + T tmp_rem; + while (remainder) { + int idx = size - remainder; + remainder--; + tmp_rem = dout[idx]; + dx[idx] = tmp_rem; + dy[idx] = tmp_rem; + } + } +} + +template +void default_elementwise_add_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + pten::Copy(ctx, dout, false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. 
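// Worked example (values are illustrative) of the funcs::GetReduceDim call
// used just below: with x.dims() = [3, 1], out.dims() = [2, 3, 4] and
// axis = -1, the leading offset becomes |3 - 2| = 1, so dim 0 is reduced
// because x lacks it, and dim 2 is reduced because x's size-1 dim was
// broadcast from 1 to 4. The result is reduce_dims = {0, 2}, i.e. the
// gradient is dx = sum(dout, dims = {0, 2}).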
+ if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceFunctorImpl>( + dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + pten::Copy(ctx, dout, false, dy); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceFunctorImpl>( + dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + } + } +} + +template +void elementwise_add_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + auto *dout_data = dout.data(); + if (dx_data == dout_data && dy_data != dout_data) { + VLOG(4) << "Special case when dx_data is the same as dout_data, " + "only need copy dout to dy"; + pten::Copy(ctx, dout, false, dy); + } else if (dx_data != dout_data && dy_data == dout_data) { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "only need copy dout to dx"; + pten::Copy(ctx, dout, false, dx); + } else if (dx_data != dout_data && dy_data != dout_data) { + auto size = x.numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / + PREDEFINED_BLOCK_SIZE, + 1); + SimpleElemwiseAddGradCUDAKernel< + T><<>>( + dout.data(), + size, + vec_size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); + } else { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "and dx_data is the same as dout_data, do not need " + "any operator"; + } +} + } // namespace pten diff --git a/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..76af94f42fde2a10033169015e2acb7fed3c46a7 --- /dev/null +++ b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
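To make the vectorized launch arithmetic in elementwise_add_grad above concrete, a small host-side sketch; it assumes sizeof(float4) == 16 and PREDEFINED_BLOCK_SIZE == 512 as used by the kernel, and the numbers are illustrative:

#include <algorithm>
#include <cstdio>

int main() {
  const long long size = 10000;  // dout.numel()
  // vec_size = max(sizeof(float4) / sizeof(T), 1): four floats per float4.
  const int vec_size = std::max(static_cast<int>(16 / sizeof(float)), 1);
  const int block = 512;  // PREDEFINED_BLOCK_SIZE
  // Each thread handles vec_size elements; round up twice for the grid.
  const long long grid =
      ((size + vec_size - 1) / vec_size + block - 1) / block;
  std::printf("loop=%lld remainder=%lld grid=%lld\n",
              size / vec_size, size % vec_size, grid);
  // Prints: loop=2500 remainder=0 grid=5
  return 0;
}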
+ +#include "paddle/pten/kernels/elementwise_grad_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace pten { + +template +void AddGradFunc(const GPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + } else { + default_elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy, axis); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + pten::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + pten::AddDoubleGradImpl( + dev_ctx, + y, + ddx, + ddy, + dout, + axis, + ddout, + ElementwiseCompute, T>, + ElementwiseCompute, T>); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + pten::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace pten + +PT_REGISTER_KERNEL(add_grad, + GPU, + ALL_LAYOUT, + pten::AddGradKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_double_grad, + GPU, + ALL_LAYOUT, + pten::AddDoubleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(add_triple_grad, + GPU, + ALL_LAYOUT, + pten::AddTripleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index d06dc1c43f6d41bc988283b2cfa9be072f4a69c8..996d85d3f42a7996e481a4887a0d0f4fa2587893 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -27,10 +27,10 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/enforce.h" #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" #include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/kernel_registry.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index 26f17bc00507e8bc401a50942ce951710b120d64..d864c76ea197408e4d035c816a32d5bb5ccb71c1 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -34,13 +34,13 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/core/array.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/kernels/primitive/kernel_primitives.h" #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" @@ -51,7 +51,7 @@ namespace cub = hipcub; #define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_VEC_SIZE 4 -namespace kps = paddle::operators::kernel_primitives; +namespace kps = pten::kps; namespace pten { namespace kernels { @@ -94,7 +94,7 @@ static inline void CheckReduceRank(int reduce_rank, int rank) { if (rank % 2 == 0) { PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "ReduceOp: invalid reduce rank. When rank = %d, " "reduce_rank must be %d, but got %d.", rank, @@ -106,7 +106,7 @@ static inline void CheckReduceRank(int reduce_rank, int rank) { PADDLE_ENFORCE_EQ( reduce_rank == lower_rank || reduce_rank == upper_rank, true, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " "must be %d or %d, but got %d.", rank, @@ -122,7 +122,7 @@ static inline pten::framework::Array VectorToArray( const VectorLikeType& vec) { PADDLE_ENFORCE_LE(vec.size(), ElementCount, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "Cub reduce Array: size not match. Received " "vec.size() %d > ElementCount %d.", vec.size(), @@ -149,7 +149,7 @@ static inline std::vector GetReduceDim(const std::vector& dims, for (auto e : dims) { PADDLE_ENFORCE_LT(e, dim_size, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "ReduceOp: invalid axis, when x_dims is %d, " "axis[i] should less than x_dims, but got %d.", dim_size, @@ -1057,7 +1057,7 @@ static int reduce_num, const paddle::platform::Place& place, gpuStream_t stream) { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(pten::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } diff --git a/paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..a74c9c0b6be10449a9f589d6437d828d65283096 --- /dev/null +++ b/paddle/pten/kernels/impl/elementwise_grad_kernel_impl.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" + +namespace pten { + +template +void AddGradImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad, + GradFunc grad_func) { + pten::funcs::ElementwiseGradPreProcess(out_grad, x_grad); + auto* out = &out_grad; + // Special case when y_grad is not needed and x_grad doesn't reduce + if (x_grad != nullptr && y_grad == nullptr && + x_grad->dims() == out_grad.dims()) { + VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't " + "reduce"; + pten::Copy(dev_ctx, out_grad, false, x_grad); + } else if (x_grad == nullptr && y_grad != nullptr && + y_grad->dims() == out_grad.dims()) { + VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't " + "reduce"; + pten::Copy(dev_ctx, out_grad, false, y_grad); + } else { + grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis); + } +} + +template +void AddDoubleGradImpl(const Context& dev_ctx, + const DenseTensor& y, + const paddle::optional& ddx, + const paddle::optional& ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout, + GradFunc grad_func, + GradInverseFunc grad_inverse_func) { + // ddOut = ddx + ddy + if (ddout) { + DenseTensor ddx_safe, ddy_safe; + funcs::GetDoubleGradSafeTensor( + dev_ctx, dout, ddx.get_ptr(), &ddx_safe); + funcs::GetDoubleGradSafeTensor( + dev_ctx, y, ddy.get_ptr(), &ddy_safe); + + ddout->mutable_data(dev_ctx.GetPlace()); + auto ddx_dims = ddx_safe.dims(); + auto ddy_dims = ddy_safe.dims(); + if (ddx_dims.size() >= ddy_dims.size()) { + grad_func( + dev_ctx, ddx_safe, ddy_safe, axis, funcs::AddFunctor(), ddout); + } else { + grad_inverse_func(dev_ctx, + ddx_safe, + ddy_safe, + axis, + funcs::InverseAddFunctor(), + ddout); + } + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/math_kernel.cc b/paddle/pten/kernels/math_kernel.cc index 423282ab97ca44966e1e8722aafc6c6703a9094c..29a2b48fa7c4f12558c47dc1d6d87c758f0c492e 100644 --- a/paddle/pten/kernels/math_kernel.cc +++ b/paddle/pten/kernels/math_kernel.cc @@ -33,8 +33,8 @@ template void SumKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& dims, - bool keep_dim, DataType out_dtype, + bool keep_dim, DenseTensor* out) { bool reduce_all = false; SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index 95379baaf350434d48fff4ce2a7b1988f7f041d5..afef58669312ca3d051b161caa39c1ca5d26bf9b 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -50,8 +50,8 @@ template void SumKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& dims, - bool keep_dim, DataType out_dtype, + bool keep_dim, DenseTensor* out); template @@ -110,7 +110,7 @@ template 
DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); AddKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -120,7 +120,7 @@ template DenseTensor Subtract(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); SubtractKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -130,7 +130,7 @@ template DenseTensor Divide(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); DivideKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -140,7 +140,7 @@ template DenseTensor Multiply(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto out_meta = ElementwiseRawInferMeta(x.meta(), y.meta(), -1); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); MultiplyKernel(dev_ctx, x, y, &dense_out); return dense_out; @@ -163,10 +163,10 @@ DenseTensor Sum(const Context& dev_ctx, const std::vector& axis, DataType dtype, bool keep_dim) { - auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype); + auto out_meta = SumInferMeta(x.meta(), axis, dtype, keep_dim); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); - SumKernel(dev_ctx, x, axis, keep_dim, dtype, &dense_out); + SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/pten/kernels/primitive/compute_primitives.h similarity index 87% rename from paddle/fluid/operators/kernel_primitives/compute_primitives.h rename to paddle/pten/kernels/primitive/compute_primitives.h index 2320b9e0b2fbf47610365155558b869bd5d77b38..ac812c9c9f3eb3d8d97ef595ca3d1bdff3177e41 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/pten/kernels/primitive/compute_primitives.h @@ -22,11 +22,10 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/float16.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { #ifdef __HIPCC__ @@ -48,7 +47,7 @@ class MPTypeTrait { }; template <> -class MPTypeTrait { +class MPTypeTrait { public: using Type = float; }; @@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { * in: The register pointer of in, the size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, +__device__ __forceinline__ void ElementwiseUnary(OutT* out, + const InT* in, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; idx++) { @@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). 
*/ -template -__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, +__device__ __forceinline__ void ElementwiseBinary(OutT* out, + const InT* in1, const InT* in2, OpFunc compute) { #pragma unroll @@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in3: The register pointer of third input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, - const InT* in2, - const InT* in3, - OpFunc compute) { +__device__ __forceinline__ void ElementwiseTernary( + OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); @@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, * ins: A pointers of array consisting of multiple inputs. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], +__device__ __forceinline__ void ElementwiseAny(OutT* out, + InT (*ins)[NX * NY], OpFunc compute) { InT args[Arity]; #pragma unroll @@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, - const InT* in2, OpFunc compute) { +__device__ __forceinline__ void CycleBinary(OutT* out, + const InT* in1, + const InT* in2, + OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX; idx++) { #pragma unroll @@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * reducer: Compute function which was declared like ReduceFunctor(). * reduce_last_dim: if the last dim gets involved in reduction. */ -template -__device__ __forceinline__ void Reduce(T* out, const T* in, +__device__ __forceinline__ void Reduce(T* out, + const T* in, ReduceFunctor reducer, bool reduce_last_dim) { int block_index = blockDim.y; @@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h b/paddle/pten/kernels/primitive/compute_primitives_xpu2.h similarity index 85% rename from paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h rename to paddle/pten/kernels/primitive/compute_primitives_xpu2.h index 32355915809161ae1a4dcc275eba8a28966fb92e..d7282c089fc9cc332abc132941188c7804e68f80 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h +++ b/paddle/pten/kernels/primitive/compute_primitives_xpu2.h @@ -13,13 +13,13 @@ // limitations under the License. 
#pragma once +#include "paddle/pten/common/float16.h" #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { // kGlobalMode: block reduce, each block gets an output; @@ -33,7 +33,7 @@ class MPTypeTrait { }; template <> -class MPTypeTrait { +class MPTypeTrait { public: using Type = float; }; @@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) { * in: The register pointer of in, the size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, +__device__ __forceinline__ void ElementwiseUnary(OutT* out, + const InT* in, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; idx++) { @@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, +__device__ __forceinline__ void ElementwiseBinary(OutT* out, + const InT* in1, const InT* in2, OpFunc compute) { #pragma unroll @@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in3: The register pointer of third input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, - const InT* in2, - const InT* in3, - OpFunc compute) { +__device__ __forceinline__ void ElementwiseTernary( + OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); @@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, * ins: A pointers of array consisting of multiple inputs. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], +__device__ __forceinline__ void ElementwiseAny(OutT* out, + InT (*ins)[NX * NY], OpFunc compute) { __local__ InT args[Arity]; #pragma unroll @@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, - const InT* in2, OpFunc compute) { +__device__ __forceinline__ void CycleBinary(OutT* out, + const InT* in1, + const InT* in2, + OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX; idx++) { #pragma unroll @@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * reducer: Compute function which was declared like ReduceFunctor(). * reduce_last_dim: if the last dim gets involved in reduction. 
*/ -template -__device__ __forceinline__ void Reduce(T* out, const T* in, +__device__ __forceinline__ void Reduce(T* out, + const T* in, ReduceFunctor reducer, bool reduce_last_dim) { if (Mode == kGlobalMode) { @@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/pten/kernels/primitive/datamover_primitives.h similarity index 87% rename from paddle/fluid/operators/kernel_primitives/datamover_primitives.h rename to paddle/pten/kernels/primitive/datamover_primitives.h index 45697073cbf85b436a4db33b0a2d49d8b805fd63..2a8006f3ecbc427c3e0cf36a08457c2ecd5f84df 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/pten/kernels/primitive/datamover_primitives.h @@ -22,9 +22,8 @@ #endif #include "paddle/pten/core/ddim.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { #define INT_BITS 32 @@ -103,11 +102,12 @@ struct BroadcastConfig { strides_in.resize(dim_size, 1); for (int i = 0; i < dim_size; ++i) { strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i]; - strides_in[i] = - (i != 0 && strides_in[i] != 0) - ? std::accumulate(in_dims.begin(), in_dims.begin() + i, 1, - std::multiplies()) - : strides_in[i]; + strides_in[i] = (i != 0 && strides_in[i] != 0) + ? std::accumulate(in_dims.begin(), + in_dims.begin() + i, + 1, + std::multiplies()) + : strides_in[i]; } memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); @@ -144,11 +144,18 @@ struct BroadcastConfig { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __forceinline__ void ReadData(Ty* dst, + const Tx* __restrict__ src, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny) { int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; @@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { * size: The current block needs to load size data continuously. */ template -__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, +__device__ __forceinline__ void ReadData(T* dst, + const T* __restrict__ src, int num) { if (IsBoundary) { // blockDim.x * NX > num int thread_offset = threadIdx.x * NX; @@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template __device__ __forceinline__ void ReadDataBc( - T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output, int stride_nx, + T* dst, + const T* __restrict__ src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output, + int stride_nx, int stride_ny) { uint32_t thread_offset = block_offset + threadIdx.x; uint32_t index_src = 0; @@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc( * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. 
*/ -template -__device__ __forceinline__ void ReadDataReduce( - Ty* dst, const Tx* __restrict__ src, int block_offset, - const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, - int stride_ny, Functor func, bool reduce_last_dim) { +template +__device__ __forceinline__ void ReadDataReduce(Ty* dst, + const Tx* __restrict__ src, + int block_offset, + const IndexCal& index_cal, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny, + Functor func, + bool reduce_last_dim) { int thread_offset = 0; int left_idx = 0; if (reduce_last_dim) { @@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce( * size: The current block needs to load size elements continuously. */ template -__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, int num) { if (IsBoundary) { int thread_offset = threadIdx.x * NX; @@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __forceinline__ void WriteData(Ty* dst, + const Tx* __restrict__ src, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny) { int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; @@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. */ -template __device__ __forceinline__ void ReadDataBc( - T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output) { + T* dst, + const T* __restrict__ src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output) { uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t index_src = 0; @@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc( } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/pten/kernels/primitive/datamover_primitives_xpu2.h similarity index 90% rename from paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h rename to paddle/pten/kernels/primitive/datamover_primitives_xpu2.h index 333899535894e0939086817c9fd6caad992f807f..d6586368c804126f896ba476cc1679d54a4c6eb8 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h +++ b/paddle/pten/kernels/primitive/datamover_primitives_xpu2.h @@ -17,9 +17,8 @@ #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { template @@ -105,10 +104,17 @@ struct BroadcastConfig { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. 
*/ -template -__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src, - int size_nx, int size_ny, int stride_nx, +__device__ __inline__ void ReadData(Ty* dst, + const Tx _global_ptr_* src, + int size_nx, + int size_ny, + int stride_nx, int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; @@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) { * size: The current block needs to load size data continuously. */ template -__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadData(T* dst, + const T _global_ptr_* src, int num) { int thread_offset = core_id() * NX; __local__ T in_temp[1]; @@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadDataBc(T* dst, + const T _global_ptr_* src, uint32_t block_offset, details::BroadcastConfig config, - int total_num_output, int stride_nx, + int total_num_output, + int stride_nx, int stride_ny) { uint32_t thread_offset = block_offset + core_id(); uint32_t index_src = 0; @@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. */ -template -__device__ __inline__ void ReadDataReduce(T* dst, const T _global_ptr_* src, +template +__device__ __inline__ void ReadDataReduce(T* dst, + const T _global_ptr_* src, int block_offset, const IndexCal& index_cal, - int size_nx, int size_ny, - int stride_nx, int stride_ny, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny, bool reduce_last_dim) { __local__ Tx in_temp[1]; int thread_offset = 0; @@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, - int size_nx, int size_ny, int stride_nx, +__device__ __inline__ void WriteData(Ty _global_ptr_* dst, + const Tx* src, + int size_nx, + int size_ny, + int stride_nx, int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; @@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, } } in_temp[0] = static_cast(src[idx + idy * NX]); - LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny, + LM2GM(in_temp, + dst + thread_offset + idx * stride_nx + idy * stride_ny, sizeof(Ty)); } } @@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. 
*/ -template -__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadDataBc(T* dst, + const T _global_ptr_* src, uint32_t block_offset, details::BroadcastConfig config, int total_num_output) { @@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/pten/kernels/primitive/functor_primitives.h b/paddle/pten/kernels/primitive/functor_primitives.h new file mode 100644 index 0000000000000000000000000000000000000000..8d62d622701342e058a57ff31d12410e78eb1306 --- /dev/null +++ b/paddle/pten/kernels/primitive/functor_primitives.h @@ -0,0 +1,255 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/enforce.h" +#include "paddle/pten/kernels/funcs/eigen/extensions.h" + +namespace pten { +namespace kps { +namespace details { + +static __device__ __forceinline__ pten::dtype::float16 Exp( + pten::dtype::float16 x) { + return ::Eigen::numext::exp(x); +} + +static __device__ __forceinline__ float Exp(float x) { return expf(x); } + +static __device__ __forceinline__ double Exp(double x) { return exp(x); } + +static __device__ __forceinline__ pten::dtype::float16 Log( + pten::dtype::float16 x) { + return ::Eigen::numext::log(x); +} + +static __device__ __forceinline__ float Log(float x) { return logf(x); } + +static __device__ __forceinline__ double Log(double x) { return log(x); } + +} // namespace details + +/******************************** Unary Functor *******************************/ + +/** + * @brief Default unary exp functor + */ +template +struct ExpFunctor { + HOSTDEVICE inline ExpFunctor() {} + + HOSTDEVICE explicit inline ExpFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx x) const { + return static_cast(details::Exp(x)); + } +}; + +/** + * @brief Default unary identity functor + */ +template +struct IdentityFunctor { + HOSTDEVICE inline IdentityFunctor() {} + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx x) const { + return static_cast(x); + } +}; + +/** + * @brief Default unary div functor. 
Divide by a constant
+ */
+template <typename Tx, typename Ty = Tx>
+struct DivideFunctor {
+ private:
+  using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
+
+ public:
+  HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
+
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
+  }
+
+ private:
+  MPType n_inv;
+};
+
+/**
+ * @brief Default unary inverse functor (returns the negation, -x)
+ */
+template <typename Tx, typename Ty = Tx>
+struct InverseFunctor {
+  HOSTDEVICE inline InverseFunctor() {}
+
+  HOSTDEVICE explicit inline InverseFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(-x);
+  }
+};
+
+/**
+ * @brief Default unary square functor
+ */
+template <typename Tx, typename Ty = Tx>
+struct SquareFunctor {
+  HOSTDEVICE inline SquareFunctor() {}
+
+  HOSTDEVICE explicit inline SquareFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(x) * static_cast<Ty>(x);
+  }
+};
+
+/****************************** Binary Functor ********************************/
+
+/**
+ * @brief Default binary min functor
+ */
+template <typename T>
+struct MinFunctor {
+  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return (b < a) ? b : a;
+  }
+};
+
+/**
+ * @brief Default binary max functor
+ */
+template <typename T>
+struct MaxFunctor {
+  inline T initial() {
+    return static_cast<T>(std::numeric_limits<T>::lowest());
+  }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return (b > a) ? b : a;
+  }
+};
+
+/**
+ * @brief Default binary add functor
+ */
+template <typename T>
+struct AddFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b + a;
+  }
+};
+
+/**
+ * @brief Default binary mul functor
+ */
+template <typename T>
+struct MulFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b * a;
+  }
+};
+
+/**
+ * @brief Default binary logic or functor
+ */
+template <typename T>
+struct LogicalOrFunctor {
+  inline T initial() { return static_cast<T>(false); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b || a;
+  }
+};
+
+/**
+ * @brief Default binary logic and functor
+ */
+template <typename T>
+struct LogicalAndFunctor {
+  inline T initial() { return static_cast<T>(true); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b && a;
+  }
+};
+
+/**
+ * @brief Default binary sub functor
+ */
+template <typename T>
+struct SubFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
+};
+
+/**
+ * @brief Default binary div functor
+ */
+template <typename T, typename Enable = void>
+struct DivFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
+};
+
+template <typename T>
+struct DivFunctor<T,
+                  typename std::enable_if<std::is_integral<T>::value>::type> {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    // For int32/int64, need to check whether the division is zero.
+    PADDLE_ENFORCE_NE(b,
+                      0,
+                      pten::errors::InvalidArgument(
+                          "Integer division by zero encountered "
+                          "in (floor) divide. 
Please check the input value.")); + return a / b; + } +}; + +/** + * @brief Default binary floor divide functor + */ +template +struct FloorDivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { + PADDLE_ENFORCE_NE(b, + 0, + pten::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. Please check the input value.")); + return static_cast(std::trunc(a / b)); + } +}; + +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/helper_primitives.h b/paddle/pten/kernels/primitive/helper_primitives.h similarity index 73% rename from paddle/fluid/operators/kernel_primitives/helper_primitives.h rename to paddle/pten/kernels/primitive/helper_primitives.h index 48ac1509d1f6e8cd3c6ecf06ac0f3445dac39a51..26d431d46abae651e854820b0c7b43afadf148b6 100644 --- a/paddle/fluid/operators/kernel_primitives/helper_primitives.h +++ b/paddle/pten/kernels/primitive/helper_primitives.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,9 +14,8 @@ #pragma once -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { #ifdef PADDLE_WITH_XPU2 struct dim3 { @@ -43,8 +42,12 @@ struct DimConfig { int rem_y; int rem_z; - HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z, - int size_x, int size_y, int size_z) { + HOSTDEVICE explicit inline DimConfig(int split_x, + int split_y, + int split_z, + int size_x, + int size_y, + int size_z) { split_num_x = split_x; split_num_y = split_y; split_num_z = split_z; @@ -60,6 +63,5 @@ struct DimConfig { } }; -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/pten/kernels/primitive/kernel_primitives.h b/paddle/pten/kernels/primitive/kernel_primitives.h new file mode 100644 index 0000000000000000000000000000000000000000..6067fa59d57ba6f400500805bff7aea80f17926d --- /dev/null +++ b/paddle/pten/kernels/primitive/kernel_primitives.h @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
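The binary functors in functor_primitives.h above all follow one contract: initial() supplies the reduction's identity element and operator() folds two values. The sketch below illustrates that contract on the host side; since the real operator()s are __device__-qualified, HostAddFunctor, HostMaxFunctor, and SequentialReduce are standalone stand-ins written for this note, not part of the patch.

#include <iostream>
#include <limits>
#include <vector>

// Host-side stand-ins for kps::AddFunctor / kps::MaxFunctor: each pairs
// the reduction's identity (initial) with a binary fold (operator()).
template <typename T>
struct HostAddFunctor {
  T initial() const { return static_cast<T>(0); }
  T operator()(const T a, const T b) const { return b + a; }
};

template <typename T>
struct HostMaxFunctor {
  T initial() const { return std::numeric_limits<T>::lowest(); }
  T operator()(const T a, const T b) const { return (b > a) ? b : a; }
};

// Folds a buffer with any functor exposing the initial()/operator() pair.
template <typename T, typename Functor>
T SequentialReduce(const std::vector<T>& data, Functor reducer) {
  T acc = reducer.initial();
  for (const T& v : data) acc = reducer(acc, v);
  return acc;
}

int main() {
  const std::vector<float> xs = {1.5f, -2.0f, 3.25f};
  std::cout << SequentialReduce(xs, HostAddFunctor<float>()) << "\n";  // 2.75
  std::cout << SequentialReduce(xs, HostMaxFunctor<float>()) << "\n";  // 3.25
  return 0;
}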
+ +#pragma once +#include "paddle/pten/kernels/primitive/helper_primitives.h" +#ifdef PADDLE_WITH_XPU2 +#include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h" +#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h" +#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h" + +#define KPStream XPUStream +#define KPDevice pten::XPUContext +#define _ptr_ _global_ptr_ +#define __forceinline__ __inline__ +#define __restrict__ + +#define THREAD_ID_X core_id() +#define THREAD_ID_Y 0 +#define THREAD_ID_Z 0 + +#define BLOCK_NUM_X core_num() +#define BLOCK_NUM_Y 0 +#define BLOCK_NUM_Z 0 + +#define BLOCK_ID_X cluster_id() +#define BLOCK_ID_Y 0 +#define BLOCK_ID_Z 0 + +#define GRID_NUM_X cluster_num() +#define GRID_NUM_Y 0 +#define GRID_NUM_Z 0 +#else +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/primitive/compute_primitives.h" +#include "paddle/pten/kernels/primitive/datamover_primitives.h" +#include "paddle/pten/kernels/primitive/functor_primitives.h" + +#define KPStream gpuStream_t +#define KPDevice pten::GPUContext +#define _ptr_ + +#define THREAD_ID_X threadIdx.x +#define THREAD_ID_Y threadIdx.y +#define THREAD_ID_Z threadIdx.z + +#define BLOCK_NUM_X blockDim.x +#define BLOCK_NUM_Y blockDim.y +#define BLOCK_NUM_Z blockDim.z + +#define BLOCK_ID_X blockIdx.x +#define BLOCK_ID_Y blockIdx.y +#define BLOCK_ID_Z blockIdx.z + +#define GRID_NUM_X gridDim.x +#define GRID_NUM_Y gridDim.y +#define GRID_NUM_Z gridDim.z +#endif diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index 9bfad22374c9f0c840634a16bfff45849e8ef60a..4b706e9e685b47af20dd23ab0855db6116623c46 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -31,9 +31,8 @@ void ReshapeKernel(const Context& dev_ctx, out->ResizeAndAllocate(out_meta.dims); return; } - - out->Resize(x.dims()); - out->mutable_data(x.place()); + out->set_meta(out_meta); + out->mutable_data(dev_ctx.GetPlace()); pten::Copy(dev_ctx, x, false, out); out->Resize(out_meta.dims); out->ResetLoD(x.lod()); diff --git a/paddle/pten/kernels/xpu/scale_kernel.cc b/paddle/pten/kernels/xpu/scale_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..116cd63f876207b39bc9b523b9f9e70876cc1b98 --- /dev/null +++ b/paddle/pten/kernels/xpu/scale_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
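An aside on the macro layer that kernel_primitives.h above establishes: kernel bodies are written only against THREAD_ID_X, BLOCK_NUM_X, and friends, and the header maps that vocabulary onto CUDA builtins or XPU's core_id()/core_num(). The standalone host-side sketch below shows the resulting coding pattern; the macro values are stand-ins chosen for illustration, not the real definitions.

#include <cstdio>

// Stand-ins for the backend-mapped macros; on GPU these would expand to
// threadIdx.x / blockDim.x, on XPU2 to core_id() / core_num().
#define THREAD_ID_X 0
#define BLOCK_NUM_X 4

// A strided loop written purely against the macro vocabulary, which is
// what lets one kernel body serve both backends.
void StridedFill(float* dst, float value, int n) {
  for (int i = THREAD_ID_X; i < n; i += BLOCK_NUM_X) {
    dst[i] = value;
  }
}

int main() {
  float buf[8] = {0};
  StridedFill(buf, 1.0f, 8);  // "thread" 0 touches indices 0 and 4
  for (float v : buf) std::printf("%g ", v);  // 1 0 0 0 1 0 0 0
  std::printf("\n");
  return 0;
}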
+ +#include "paddle/pten/kernels/scale_kernel.h" + +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace pten { + +template +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { + out->mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_EQ(x.dims(), + out->dims(), + paddle::platform::errors::InvalidArgument( + "In and out should have the same dim," + " expected %s, but got %s.", + x.dims().to_str().c_str(), + out->dims().to_str().c_str())); + using XPUType = typename XPUTypeTrait::Type; + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + x.numel(), + bias_after_scale, + scale.to(), + bias); + PADDLE_ENFORCE_EQ( + r, + XPU_SUCCESS, + paddle::platform::errors::External( + "XPU scale kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); +} + +} // namespace pten + +PT_REGISTER_KERNEL(scale, + XPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + pten::dtype::float16, + int64_t) {} diff --git a/paddle/pten/ops/CMakeLists.txt b/paddle/pten/ops/CMakeLists.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..910b62766ebf6e3dac9d9218b60622b6352a5e44 100644 --- a/paddle/pten/ops/CMakeLists.txt +++ b/paddle/pten/ops/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(compat) diff --git a/paddle/pten/ops/compat/CMakeLists.txt b/paddle/pten/ops/compat/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd214087e1a68a624fc198a6d86596b4ff8e4ff3 --- /dev/null +++ b/paddle/pten/ops/compat/CMakeLists.txt @@ -0,0 +1,11 @@ +set(op_utils_header ${PADDLE_BINARY_DIR}/paddle/pten/ops/compat/signatures.h.tmp CACHE INTERNAL "op_args_fns.cc file") +set(op_utils_header_final ${PADDLE_BINARY_DIR}/paddle/pten/ops/compat/signatures.h) +file(WRITE ${op_utils_header} "// Generated by the paddle/pten/ops/compat/CMakeLists.txt. DO NOT EDIT!\n\n") +file(APPEND ${op_utils_header} "#include \"paddle/pten/core/compat/op_utils.h\"\n\n") + +# Automatically generate the registration code of all arg map functions +# and compile the corresponding target to avoid frequent code conflicts +# when writing to same file +register_op_utils(op_compat_infos DEPS op_utils) + +copy_if_different(${op_utils_header} ${op_utils_header_final}) diff --git a/paddle/pten/ops/compat/scale_args_fn.h b/paddle/pten/ops/compat/scale_sig.cc similarity index 72% rename from paddle/pten/ops/compat/scale_args_fn.h rename to paddle/pten/ops/compat/scale_sig.cc index 91f0db389d9d5094e6f6d3cf978c4c35590d1d2e..5ce159a5d84c9faba760cd7b8605f2bd0734c53f 100644 --- a/paddle/pten/ops/compat/scale_args_fn.h +++ b/paddle/pten/ops/compat/scale_sig.cc @@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
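For reference, the bias_after_scale flag that ScaleKernel above forwards to xpu::scale only selects where the bias enters the computation. A scalar host-side sketch of the same arithmetic (illustrative only; the kernel delegates the real work to xpu::scale):

#include <cassert>

// Scalar reference for ScaleKernel's two bias placements.
float ScaleRef(float x, float scale, float bias, bool bias_after_scale) {
  // bias_after_scale == true:  out = x * scale + bias
  // bias_after_scale == false: out = (x + bias) * scale
  return bias_after_scale ? x * scale + bias : (x + bias) * scale;
}

int main() {
  assert(ScaleRef(2.0f, 3.0f, 1.0f, true) == 7.0f);   // 2*3 + 1
  assert(ScaleRef(2.0f, 3.0f, 1.0f, false) == 9.0f);  // (2+1)*3
  return 0;
}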
*/ -#pragma once - -#include "paddle/pten/core/compat/arg_map_context.h" +#include "paddle/pten/core/compat/op_utils.h" namespace pten { @@ -22,15 +20,18 @@ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { std::string scale_attr; if (ctx.HasInput("ScaleTensor")) { - scale_attr = "ScaleTensor"; + return KernelSignature( + "scale", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"}); } else { - scale_attr = "scale"; + return KernelSignature( + "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); } - return KernelSignature( - "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"}); } // TODO(chenweihang): support other cases after selected rows added return KernelSignature("scale.unregistered", {}, {}, {}); } } // namespace pten + +// op_type, api_name, arg_mapping_fn +PT_REGISTER_ARG_MAPPING_FN(scale, pten::ScaleOpArgumentMapping); diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 43e1480e2c41e3e0a5cc2a57597a83d306e709ed..27a0173ef6f1fc8654fdbe4ef7b585f3ec3d7651 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -3,6 +3,7 @@ cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) cc_test(test_convert_utils SRCS test_convert_utils.cc DEPS convert_utils) cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory scale_kernel) +cc_test(test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) cc_test(test_pten_device_context SRCS test_device_context.cc DEPS pten_context cpu_context) cc_test(test_ddim SRCS test_ddim.cc DEPS ddim) diff --git a/paddle/pten/tests/core/test_op_utils.cc b/paddle/pten/tests/core/test_op_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c4a418685775a01af0076d26b4878d3eb91462e --- /dev/null +++ b/paddle/pten/tests/core/test_op_utils.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
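PT_REGISTER_ARG_MAPPING_FN above records ScaleOpArgumentMapping under the op name "scale", and the new test fetches it back through pten::OpUtilsMap::Instance().GetArgumentMappingFn("scale"). The real registry lives in paddle/pten/core/compat/op_utils.h and is not shown in this patch; the standalone mimic below only illustrates the singleton name-to-function lookup pattern, and every name in it is hypothetical.

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for what an arg mapping fn produces.
using ArgMappingFn = std::function<std::string()>;

// Singleton name -> function map, mimicking OpUtilsMap's lookup shape.
class MiniOpUtilsMap {
 public:
  static MiniOpUtilsMap& Instance() {
    static MiniOpUtilsMap instance;
    return instance;
  }
  void Insert(const std::string& op, ArgMappingFn fn) {
    map_[op] = std::move(fn);
  }
  ArgMappingFn Get(const std::string& op) const {
    auto it = map_.find(op);
    return it == map_.end() ? nullptr : it->second;
  }

 private:
  std::unordered_map<std::string, ArgMappingFn> map_;
};

int main() {
  // Registration, normally done by a PT_REGISTER_ARG_MAPPING_FN-style macro.
  MiniOpUtilsMap::Instance().Insert("scale", [] {
    return std::string("scale(X, scale, bias, bias_after_scale) -> Out");
  });
  // Lookup by op name, which is what the new test asserts is non-null.
  ArgMappingFn fn = MiniOpUtilsMap::Instance().Get("scale");
  std::cout << (fn ? fn() : "unregistered") << "\n";
  return 0;
}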
*/ + +#include + +#include "gtest/gtest.h" +#include "paddle/pten/core/compat/op_utils.h" +#include "paddle/pten/ops/compat/signatures.h" + +namespace pten { +namespace tests { + +TEST(OpUtilsMap, ArgMappingFnExists) { + std::cout << "enter ArgMappingFnExists"; + auto scale_arg_mapping_fn = + pten::OpUtilsMap::Instance().GetArgumentMappingFn("scale"); + EXPECT_NE(scale_arg_mapping_fn, nullptr); +} + +} // namespace tests +} // namespace pten diff --git a/paddle/pten/tests/core/test_selected_rows.cc b/paddle/pten/tests/core/test_selected_rows.cc index 81c7ff4a838a702a1c32df2fb4a7f082b1b39f3b..c6e52ff64eab90c71ffc698c04e6e0b58cd1f6d4 100644 --- a/paddle/pten/tests/core/test_selected_rows.cc +++ b/paddle/pten/tests/core/test_selected_rows.cc @@ -40,7 +40,7 @@ class SelectedRowsTester : public ::testing::Test { protected: pten::CPUPlace place_; - std::unique_ptr selected_rows_{nullptr}; + std::unique_ptr selected_rows_{nullptr}; }; TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b4803e35385403badc27f416b2c7112411fd8c7 --- /dev/null +++ b/paddle/utils/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(string) diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/utils/string/CMakeLists.txt similarity index 69% rename from paddle/fluid/string/CMakeLists.txt rename to paddle/utils/string/CMakeLists.txt index 9667e18bc6a1e34fee6e039a710dd1bd8b24481e..db3cb542ba3748e3bc936394a136e2ad8aaf327e 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/utils/string/CMakeLists.txt @@ -1,8 +1,8 @@ cc_library(stringpiece SRCS piece.cc DEPS flags) cc_library(pretty_log SRCS pretty_log.cc DEPS flags) cc_library(string_helper SRCS string_helper.cc DEPS flags) -cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) -cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) +cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece gflags) +cc_test(stringprintf_test SRCS printf_test.cc DEPS gflags) cc_test(to_string_test SRCS to_string_test.cc) cc_test(split_test SRCS split_test.cc) cc_test(string_helper_test SRCS string_helper_test.cc DEPS string_helper) diff --git a/paddle/fluid/string/piece.cc b/paddle/utils/string/piece.cc similarity index 94% rename from paddle/fluid/string/piece.cc rename to paddle/utils/string/piece.cc index 971ee3ddb5ff0347e3f365c3eeb9fe9fea96e573..305ac85a5320ead7cfd784863927ad2d0913a07d 100644 --- a/paddle/fluid/string/piece.cc +++ b/paddle/utils/string/piece.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/piece.h" +#include "paddle/utils/string/piece.h" #include #include @@ -76,9 +76,11 @@ bool HasPrefix(Piece s, Piece x) { } bool HasSuffix(Piece s, Piece x) { - return !x.len() ? true : ((s.len() >= x.len()) && - (memcmp(s.data() + (s.len() - x.len()), x.data(), - x.len()) == 0)); + return !x.len() + ? true + : ((s.len() >= x.len()) && + (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == + 0)); } Piece SkipPrefix(Piece s, size_t n) { diff --git a/paddle/utils/string/piece.h b/paddle/utils/string/piece.h new file mode 100644 index 0000000000000000000000000000000000000000..8dda484eaac4d62b758e57ac5e81bfe68a5c60d4 --- /dev/null +++ b/paddle/utils/string/piece.h @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+namespace paddle {
+namespace string {
+
+// Piece points into a std::string object but doesn't own the
+// string. It is for efficient access to strings. Like Go's string
+// type. Note that Piece doesn't mutate the underlying string,
+// so it is thread-safe given that the underlying string doesn't
+// change. Because Piece holds only a couple of data members and
+// doesn't own/manage the string, it is cheap to construct Pieces
+// and pass them around.
+class Piece {
+ public:
+  static const size_t npos = static_cast<size_t>(-1);
+
+  // We provide non-explicit singleton constructors so users can
+  // pass in a "const char*" or a "string" wherever a "Piece"
+  // is expected. These constructors ensure that if data_ is NULL,
+  // size_ is 0.
+  Piece();
+  Piece(const char* d, size_t n);
+  Piece(const char* d);         // NOLINT: accept C string into Piece.
+  Piece(const std::string& s);  // NOLINT: accept C++ string into Piece.
+
+  const char* data() const { return data_; }
+  size_t len() const { return size_; }
+
+  char operator[](size_t n) const;
+
+  // Piece doesn't own the string, so both iterator and const
+  // iterator are const char* indeed.
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  iterator begin() const { return data_; }
+  iterator end() const { return data_ + size_; }
+
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+
+ private:
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+int Compare(Piece a, Piece b);
+
+bool operator==(Piece x, Piece y);
+bool operator!=(Piece x, Piece y);
+bool operator<(Piece x, Piece y);
+bool operator>(Piece x, Piece y);
+bool operator<=(Piece x, Piece y);
+bool operator>=(Piece x, Piece y);
+
+bool HasPrefix(Piece s, Piece prefix);
+bool HasSuffix(Piece s, Piece suffix);
+
+Piece SkipPrefix(Piece s, size_t n);
+Piece SkipSuffix(Piece s, size_t n);
+
+// Skip the prefix (or suffix) if it matches with the string.
+Piece TrimPrefix(Piece s, Piece prefix);
+Piece TrimSuffix(Piece s, Piece suffix);
+
+// Returns whether s contains sub. Every s, except an empty s,
+// contains an empty sub.
+bool Contains(Piece s, Piece sub);
+
+// Return the first occurrence of sub in s, or npos. If both s and
+// sub are empty, it returns npos; otherwise, if only sub is empty, it
+// returns 0.
+size_t Index(Piece s, Piece sub);
+
+// Return the first occurrence of c in s[pos:end], or npos.
+size_t Find(Piece s, char c, size_t pos);
+
+// Search range is [0..pos] inclusive. If pos == npos, search everything.
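A brief usage sketch for the Piece API declared in this header, assuming the stringpiece library is linked; illustrative only, not part of the header.

#include <iostream>

#include "paddle/utils/string/piece.h"

int main() {
  // Construction is cheap: Piece only stores a pointer and a length.
  paddle::string::Piece path("paddle/utils/string");
  if (paddle::string::HasPrefix(path, "paddle/")) {
    paddle::string::Piece rest = paddle::string::TrimPrefix(path, "paddle/");
    std::cout << rest.ToString() << "\n";  // utils/string
  }
  return 0;
}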
+size_t RFind(Piece s, char c, size_t pos);
+
+Piece SubStr(Piece s, size_t pos, size_t n);
+
+// allow Piece to be logged
+std::ostream& operator<<(std::ostream& o, Piece piece);
+
+} // namespace string
+} // namespace paddle
diff --git a/paddle/fluid/string/piece_test.cc b/paddle/utils/string/piece_test.cc
similarity index 99%
rename from paddle/fluid/string/piece_test.cc
rename to paddle/utils/string/piece_test.cc
index 544b5985ed21432488200768a28a3bae69f00a7f..27b189e251f8ad368895419ff9cf854a6d929893 100644
--- a/paddle/fluid/string/piece_test.cc
+++ b/paddle/utils/string/piece_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/string/piece.h"
+#include "paddle/utils/string/piece.h"
 
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/string/pretty_log.cc b/paddle/utils/string/pretty_log.cc
similarity index 94%
rename from paddle/fluid/string/pretty_log.cc
rename to paddle/utils/string/pretty_log.cc
index c0715e644fb3302bde53564be3bf63e4e3f4657c..b014c6de20d855c16432aa8b1e898b5d12ae3a3d 100644
--- a/paddle/fluid/string/pretty_log.cc
+++ b/paddle/utils/string/pretty_log.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/string/pretty_log.h"
+#include "paddle/utils/string/pretty_log.h"
 
 #include "gflags/gflags.h"
 
 DEFINE_bool(color, true, "Whether to turn on pretty log");
diff --git a/paddle/utils/string/pretty_log.h b/paddle/utils/string/pretty_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a8038f3a8bef073c53a1b59c6fc2ed565913c85
--- /dev/null
+++ b/paddle/utils/string/pretty_log.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <utility>
+#include "gflags/gflags.h"
+
+#include "paddle/utils/string/printf.h"
+
+DECLARE_bool(color);
+
+namespace paddle {
+
+namespace string {
+
+inline std::string black() { return FLAGS_color ? "\e[30m" : ""; }
+inline std::string red() { return FLAGS_color ? "\e[31m" : ""; }
+inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; }
+inline std::string green() { return FLAGS_color ? "\e[32m" : ""; }
+inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; }
+inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; }
+inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; }
+inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; }
+inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; }
+inline std::string white() { return FLAGS_color ? "\e[37m" : ""; }
+inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; }
+inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; }
+inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; }
+inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; }
+inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; }
+inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; }
+
+using TextBlock = std::pair<std::string, std::string>;
+
+struct Style {
+  static std::string info() { return black(); }
+  static std::string warn() { return b_red(); }
+  static std::string suc() { return green(); }
+  static std::string H1() { return bold() + purple(); }
+  static std::string H2() { return green(); }
+  static std::string H3() { return green(); }
+  static std::string detail() { return light_gray(); }
+};
+
+template <typename... Args>
+static void PrettyLogEndl(const std::string &style,
+                          const char *fmt,
+                          const Args &... args) {
+  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
+}
+template <typename... Args>
+static void PrettyLog(const std::string &style,
+                      const char *fmt,
+                      const Args &... args) {
+  std::cerr << style << Sprintf(fmt, args...) << reset();
+}
+
+template <typename... Args>
+static void PrettyLogInfo(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::info(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogDetail(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::detail(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogH1(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H1(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogH2(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H2(), fmt, args...);
+}
+
+} // namespace string
+} // namespace paddle
diff --git a/paddle/utils/string/printf.h b/paddle/utils/string/printf.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4576c6bc4aa543a25604638c2047eeaeb179a74
--- /dev/null
+++ b/paddle/utils/string/printf.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Compared with std::stringstream, there are two primary purposes of
+// string::Printf:
+//
+// 1. Type-safe printing, with why and how explained in
+//    http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999.
+//    Implementation includes
+//
+//    https://github.com/c42f/tinyformat
+//    boost::format
+//    std::stringstream
+//
+//    std::stringstream is not convenient enough in many cases. For example:
+//
+//       std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
+//
+//    boost::format is the most convenient one. We can have
+//
+//       std::cout << format("%2% %1%") % 36 % 77;
+//
+//    or
+//
+//       format fmter("%2% %1%");
+//       fmter % 36; fmter % 77;
+//       std::cout << fmter.c_str();
+//
+//    But the overloading of % might be overkill and it would be
+//    more efficient if it can write to std::cout directly.
+//
+//    tinyformat has an interface compatible with the C-printf style,
+//    and it can write to a stream or return a std::string:
+//
+//       std::cout << tfm::printf(
+//           "%s, %s %d, %.2d:%.2d\n",
+//           weekday, month, day, hour, min);
+//
+//    or
+//
+//       tfm::format(std::cout,
+//                   "%s, %s %d, %.2d:%.2d\n",
+//                   weekday, month, day, hour, min);
+//
+// 2. High-performance -- most printed strings are not too long and
+//    don't need dynamic memory allocation. Many StringPrintf
+//    implementations don't enforce type safety, but are
+//    high-performance, including
+//
+//    https://developers.google.com/optimization/reference/base/stringprintf/
+//    https://github.com/adobe/chromium/blob/master/base/stringprintf.h
+//    https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h
+//
+// According to
+// https://github.com/c42f/tinyformat#compile-time-and-code-bloat,
+// boost::format runs too slow and results in large executable binary
+// files. So here we port tinyformat.
+
+#pragma once
+
+#include <cstdio>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "paddle/utils/string/tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
+
+namespace paddle {
+namespace string {
+
+template <typename... Args>
+void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
+  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
+}
+
+inline std::string Sprintf() { return ""; }
+
+template <typename... Args>
+std::string Sprintf(const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, "%s", args...);
+  return oss.str();
+}
+
+template <typename... Args>
+std::string Sprintf(const char* fmt, const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, fmt, args...);
+  return oss.str();
+}
+
+template <typename... Args>
+void Printf(const char* fmt, const Args&... args) {
+  Fprintf(std::cout, fmt, args...);
+}
+
+inline std::string HumanReadableSize(double f_size) {
+  size_t i = 0;
+  double orig = f_size;
+  const std::vector<std::string> units(
+      {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
+  while (f_size >= 1024) {
+    f_size /= 1024;
+    i++;
+  }
+  if (i >= units.size()) {
+    return Sprintf("%fB", orig);
+  }
+  return Sprintf("%f%s", f_size, units[i]);
+}
+
+} // namespace string
+} // namespace paddle
diff --git a/paddle/fluid/string/printf_test.cc b/paddle/utils/string/printf_test.cc
similarity index 84%
rename from paddle/fluid/string/printf_test.cc
rename to paddle/utils/string/printf_test.cc
index 544b12ef3a877a6e84c136433799301edaa4abdf..9da7bfedb72c6e3aca38921dd751824d205f7084 100644
--- a/paddle/fluid/string/printf_test.cc
+++ b/paddle/utils/string/printf_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/string/printf.h"
+#include "paddle/utils/string/printf.h"
 
 #include <string>
 
@@ -25,7 +25,7 @@ TEST(StringPrintf, StringPrintf) {
   int hour = 14;
   int min = 44;
   EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
-            paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
-                                    hour, min));
+            paddle::string::Sprintf(
+                "%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min));
   EXPECT_EQ(std::string(""), paddle::string::Sprintf());
 }
diff --git a/paddle/utils/string/split.h b/paddle/utils/string/split.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccb96b8a9cb68f03acbca592a2149ba5001f34d2
--- /dev/null
+++ b/paddle/utils/string/split.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace string { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/split_test.cc b/paddle/utils/string/split_test.cc similarity index 95% rename from paddle/fluid/string/split_test.cc rename to paddle/utils/string/split_test.cc index c85dc1eed40dbe25d922c0f4810a747d1bd2d60f..dcb69955c86580f6d21eea1783041768073a7c29 100644 --- a/paddle/fluid/string/split_test.cc +++ b/paddle/utils/string/split_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/split.h" +#include "paddle/utils/string/split.h" #include diff --git a/paddle/fluid/string/string_helper.cc b/paddle/utils/string/string_helper.cc similarity index 95% rename from paddle/fluid/string/string_helper.cc rename to paddle/utils/string/string_helper.cc index db9ee7592fc84237e760d94a3ebb3eff328a8309..37b9e9ce4e513cd160ddf9f67889247741663731 100644 --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/utils/string/string_helper.cc @@ -12,15 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/string/string_helper.h" #include #include #include #include -#include "glog/logging.h" - namespace paddle { namespace string { @@ -75,7 +73,9 @@ char* LineFileReader::getdelim(FILE* f, char delim) { return _buffer; } else { _length = 0; - CHECK(feof(f)); + int code = feof(f); + (void)code; + assert(code); return NULL; } #else diff --git a/paddle/utils/string/string_helper.h b/paddle/utils/string/string_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..a02b313ef0eba61682188f65d3d6a03d432dc7fb --- /dev/null +++ b/paddle/utils/string/string_helper.h @@ -0,0 +1,236 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace string { + +inline size_t count_spaces(const char* s) { + size_t count = 0; + + while (*s != 0 && isspace(*s++)) { + count++; + } + + return count; +} + +inline size_t count_nonspaces(const char* s) { + size_t count = 0; + + while (*s != 0 && !isspace(*s++)) { + count++; + } + + return count; +} + +template +void format_string_append(std::string& str, // NOLINT + const char* fmt, // NOLINT + ARGS&&... 
args) {
+  int len = snprintf(NULL, 0, fmt, args...);
+  assert(len >= 0);
+  size_t oldlen = str.length();
+  str.resize(oldlen + len + 1);
+  int new_len =
+      snprintf(&str[oldlen], (size_t)len + 1, fmt, args...);  // NOLINT
+  (void)new_len;
+  assert(new_len == len);
+  str.resize(oldlen + len);
+}
+
+template <class... ARGS>
+void format_string_append(std::string& str,        // NOLINT
+                          const std::string& fmt,  // NOLINT
+                          ARGS&&... args) {
+  format_string_append(str, fmt.c_str(), args...);
+}
+
+template <class... ARGS>
+std::string format_string(const char* fmt, ARGS&&... args) {
+  std::string str;
+  format_string_append(str, fmt, args...);
+  return str;
+}
+
+template <class... ARGS>
+std::string format_string(const std::string& fmt, ARGS&&... args) {
+  return format_string(fmt.c_str(), args...);
+}
+
+// remove leading and trailing spaces
+std::string trim_spaces(const std::string& str);
+
+// erase all spaces in str
+std::string erase_spaces(const std::string& str);
+
+inline int str_to_float(const char* str, float* v) {
+  const char* head = str;
+  char* cursor = NULL;
+  int index = 0;
+  while (*(head += count_spaces(head)) != 0) {
+    v[index++] = std::strtof(head, &cursor);
+    if (head == cursor) {
+      break;
+    }
+    head = cursor;
+  }
+  return index;
+}
+
+// checks whether the test string is a suffix of the input string.
+bool ends_with(std::string const& input, std::string const& test);
+
+// split string by delim
+template <class T>
+std::vector<T> split_string(const std::string& str, const std::string& delim) {
+  size_t pre_pos = 0;
+  size_t pos = 0;
+  std::string tmp_str;
+  std::vector<T> res_list;
+  res_list.clear();
+  if (str.empty()) {
+    return res_list;
+  }
+  while ((pos = str.find(delim, pre_pos)) != std::string::npos) {
+    tmp_str.assign(str, pre_pos, pos - pre_pos);
+    res_list.push_back(tmp_str);
+    pre_pos = pos + 1;
+  }
+  tmp_str.assign(str, pre_pos, str.length() - pre_pos);
+  if (!tmp_str.empty()) {
+    res_list.push_back(tmp_str);
+  }
+  return res_list;
+}
+
+// split string by spaces. Leading and trailing spaces are ignored. Consecutive
+// spaces are treated as one delim.
+template <class T>
+std::vector<T> split_string(const std::string& str) {
+  std::vector<T> list;
+  const char* p;
+  int pre_pos = 0;
+  int pos = 0;
+  std::string tmp_str;
+  if (str.empty()) {
+    return list;
+  }
+  for (p = str.c_str(); *p != 0;) {
+    if (!isspace(*p)) {
+      pos = pre_pos;
+      p++;
+
+      while (*p != 0 && !isspace(*p)) {
+        pos++;
+        p++;
+      }
+      tmp_str.assign(str, pre_pos, pos - pre_pos + 1);
+      list.push_back(tmp_str);
+      pre_pos = pos + 1;
+    } else {
+      pre_pos++;
+      p++;
+    }
+  }
+  return list;
+}
+
+template <class Container>
+std::string join_strings(const Container& strs, char delim) {
+  std::string str;
+
+  size_t i = 0;
+  for (auto& elem : strs) {
+    if (i > 0) {
+      str += delim;
+    }
+
+    std::stringstream ss;
+    ss << elem;
+    str += ss.str();
+    ++i;
+  }
+
+  return str;
+}
+
+template <class Container>
+std::string join_strings(const Container& strs, const std::string& delim) {
+  std::string str;
+
+  size_t i = 0;
+  for (auto& elem : strs) {
+    if (i > 0) {
+      str += delim;
+    }
+
+    std::stringstream ss;
+    ss << elem;
+    str += ss.str();
+    ++i;
+  }
+
+  return str;
+}
+
+template <class Container, class DelimT, class ConvertFunc>
+std::string join_strings(const Container& strs,
+                         DelimT&& delim,
+                         ConvertFunc&& func) {
+  std::stringstream ss;
+  size_t i = 0;
+  for (const auto& elem : strs) {
+    if (i > 0) {
+      ss << delim;
+    }
+    ss << func(elem);
+    ++i;
+  }
+
+  return ss.str();
+}
+
+// A helper class for reading lines from file. A line buffer is maintained. It
+// doesn't need to know the maximum possible length of a line.
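format_string_append above relies on the classic two-pass snprintf idiom: the first call with a null buffer only measures the required length, the second call writes. The same idiom standalone, as a sketch that, like the helper, trusts the caller's format string:

#include <cassert>
#include <cstdio>
#include <string>

// Pass 1 measures, pass 2 writes; snprintf is allowed to place the
// trailing NUL in the std::string's guaranteed terminator slot.
std::string FormatTwoPass(const char* fmt, int value) {
  int len = std::snprintf(nullptr, 0, fmt, value);  // measure only
  assert(len >= 0);
  std::string out(static_cast<size_t>(len), '\0');
  std::snprintf(&out[0], static_cast<size_t>(len) + 1, fmt, value);
  return out;
}

int main() {
  assert(FormatTwoPass("id=%04d", 7) == "id=0007");
  return 0;
}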
+ +class LineFileReader { + public: + LineFileReader() {} + LineFileReader(LineFileReader&&) = delete; + LineFileReader(const LineFileReader&) = delete; + ~LineFileReader() { ::free(_buffer); } + char* getline(FILE* f) { return this->getdelim(f, '\n'); } + char* getdelim(FILE* f, char delim); + char* get() { return _buffer; } + size_t length() { return _length; } + + private: + char* _buffer = NULL; + size_t _buf_size = 0; + size_t _length = 0; +}; +} // end namespace string +} // end namespace paddle diff --git a/paddle/fluid/string/string_helper_test.cc b/paddle/utils/string/string_helper_test.cc similarity index 97% rename from paddle/fluid/string/string_helper_test.cc rename to paddle/utils/string/string_helper_test.cc index 67456e16a93b67f39d86c5751e35148be7020f61..e0789e9a545dd66abfde8799671bb6887db91287 100644 --- a/paddle/fluid/string/string_helper_test.cc +++ b/paddle/utils/string/string_helper_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/string/string_helper.h" #include diff --git a/paddle/fluid/string/tinyformat/tinyformat.h b/paddle/utils/string/tinyformat/tinyformat.h similarity index 91% rename from paddle/fluid/string/tinyformat/tinyformat.h rename to paddle/utils/string/tinyformat/tinyformat.h index 7498c6a46e38af98e8356f9f87a0cfb6b163bddf..28a444f87c48fdde7d41aa257fe0e91538c9b7a7 100644 --- a/paddle/fluid/string/tinyformat/tinyformat.h +++ b/paddle/utils/string/tinyformat/tinyformat.h @@ -170,7 +170,8 @@ struct is_convertible { // Format the value by casting to type fmtT. This default implementation // should never be called. -template ::value> struct formatValueAsType { static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); } @@ -240,8 +241,11 @@ TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char) /// operator<< to format the type T, with special cases for the %c and %p /// conversions. template -inline void formatValue(std::ostream &out, const char * /*fmtBegin*/, - const char *fmtEnd, int ntrunc, const T &value) { +inline void formatValue(std::ostream &out, + const char * /*fmtBegin*/, + const char *fmtEnd, + int ntrunc, + const T &value) { // The mess here is to support the %c and %p conversions: if these // conversions are active we try to convert the type to a char or const // void* respectively and format that instead of the value itself. For the @@ -250,35 +254,39 @@ inline void formatValue(std::ostream &out, const char * /*fmtBegin*/, const bool canConvertToChar = detail::is_convertible::value; const bool canConvertToVoidPtr = detail::is_convertible::value; - if (canConvertToChar && *(fmtEnd - 1) == 'c') + if (canConvertToChar && *(fmtEnd - 1) == 'c') { detail::formatValueAsType::invoke(out, value); - else if (canConvertToVoidPtr && *(fmtEnd - 1) == 'p') + } else if (canConvertToVoidPtr && *(fmtEnd - 1) == 'p') { detail::formatValueAsType::invoke(out, value); - else if (ntrunc >= 0) { + } else if (ntrunc >= 0) { // Take care not to overread C strings in truncating conversions like // "%.4s" where at most 4 characters may be read. 
detail::formatTruncated(out, value, ntrunc); - } else + } else { out << value; + } } // Overloaded version for char types to support printing as an integer -#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \ - inline void formatValue(std::ostream &out, const char * /*fmtBegin*/, \ - const char *fmtEnd, int /**/, charType value) { \ - switch (*(fmtEnd - 1)) { \ - case 'u': \ - case 'd': \ - case 'i': \ - case 'o': \ - case 'X': \ - case 'x': \ - out << static_cast(value); \ - break; \ - default: \ - out << value; \ - break; \ - } \ +#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \ + inline void formatValue(std::ostream &out, \ + const char * /*fmtBegin*/, \ + const char *fmtEnd, \ + int /**/, \ + charType value) { \ + switch (*(fmtEnd - 1)) { \ + case 'u': \ + case 'd': \ + case 'i': \ + case 'o': \ + case 'X': \ + case 'x': \ + out << static_cast(value); \ + break; \ + default: \ + out << value; \ + break; \ + } \ } // per 3.9.1: char, signed char and unsigned char are all distinct types TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char) @@ -466,7 +474,7 @@ cog.outl('#define TINYFORMAT_FOREACH_ARGNUM(m) \\\n ' + #define TINYFORMAT_FOREACH_ARGNUM(m) \ m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) m(10) m(11) m(12) m(13) m(14) \ m(15) m(16) -//[[[end]]] +// [[[end]]] namespace detail { @@ -476,15 +484,17 @@ namespace detail { // whereas a naive implementation based on inheritance does not. class FormatArg { public: - FormatArg() {} + FormatArg() {} // NOLINT template - FormatArg(const T &value) + FormatArg(const T &value) // NOLINT : m_value(static_cast(&value)), m_formatImpl(&formatImpl), m_toIntImpl(&toIntImpl) {} - void format(std::ostream &out, const char *fmtBegin, const char *fmtEnd, + void format(std::ostream &out, + const char *fmtBegin, + const char *fmtEnd, int ntrunc) const { m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value); } @@ -493,8 +503,11 @@ class FormatArg { private: template - static void formatImpl(std::ostream &out, const char *fmtBegin, - const char *fmtEnd, int ntrunc, const void *value) { + static void formatImpl(std::ostream &out, + const char *fmtBegin, + const char *fmtEnd, + int ntrunc, + const void *value) { formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast(value)); } @@ -504,14 +517,17 @@ class FormatArg { } const void *m_value; - void (*m_formatImpl)(std::ostream &out, const char *fmtBegin, - const char *fmtEnd, int ntrunc, const void *value); + void (*m_formatImpl)(std::ostream &out, + const char *fmtBegin, + const char *fmtEnd, + int ntrunc, + const void *value); int (*m_toIntImpl)(const void *value); }; // Parse and return an integer from the string c, as atoi() // On return, c is set to one past the end of the integer. -inline int parseIntAndAdvance(const char *&c) { +inline int parseIntAndAdvance(const char *&c) { // NOLINT int i = 0; for (; *c >= '0' && *c <= '9'; ++c) i = 10 * i + (*c - '0'); return i; @@ -553,11 +569,13 @@ inline const char *printFormatStringLiteral(std::ostream &out, // and ntrunc (for truncating conversions). argIndex is incremented if // necessary to pull out variable width and precision . The function returns a // pointer to the character after the end of the current format spec. 
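As the comment above describes, streamStateFromFormat translates a printf conversion spec into iostream state. For intuition, the hand-written iostream equivalent of printf("%08.3f", ...) looks like this (a standalone illustration, not tinyformat code):

#include <iomanip>
#include <iostream>

int main() {
  // std::printf("%08.3f\n", 3.14159) prints 0003.142; the equivalent
  // stream state that streamStateFromFormat would install:
  std::cout << std::setw(8) << std::setprecision(3) << std::setfill('0')
            << std::internal << std::fixed << 3.14159 << "\n";
  return 0;
}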
-inline const char *streamStateFromFormat(std::ostream &out, - bool &spacePadPositive, int &ntrunc, +inline const char *streamStateFromFormat(std::ostream &out, // NOLINT + bool &spacePadPositive, // NOLINT + int &ntrunc, // NOLINT const char *fmtStart, const detail::FormatArg *formatters, - int &argIndex, int numFormatters) { + int &argIndex, // NOLINT + int numFormatters) { if (*fmtStart != '%') { TINYFORMAT_ERROR( "tinyformat: Not enough conversion specifiers in format string"); @@ -733,8 +751,10 @@ inline const char *streamStateFromFormat(std::ostream &out, } //------------------------------------------------------------------------------ -inline void formatImpl(std::ostream &out, const char *fmt, - const detail::FormatArg *formatters, int numFormatters) { +inline void formatImpl(std::ostream &out, + const char *fmt, + const detail::FormatArg *formatters, + int numFormatters) { // Saved stream state std::streamsize origWidth = out.width(); std::streamsize origPrecision = out.precision(); @@ -746,9 +766,13 @@ inline void formatImpl(std::ostream &out, const char *fmt, fmt = printFormatStringLiteral(out, fmt); bool spacePadPositive = false; int ntrunc = -1; - const char *fmtEnd = - streamStateFromFormat(out, spacePadPositive, ntrunc, fmt, formatters, - argIndex, numFormatters); + const char *fmtEnd = streamStateFromFormat(out, + spacePadPositive, + ntrunc, + fmt, + formatters, + argIndex, + numFormatters); if (argIndex >= numFormatters) { // Check args remain after reading any variable width/precision TINYFORMAT_ERROR("tinyformat: Not enough format arguments"); @@ -756,9 +780,9 @@ inline void formatImpl(std::ostream &out, const char *fmt, } const FormatArg &arg = formatters[argIndex]; // Format the arg into the stream. - if (!spacePadPositive) + if (!spacePadPositive) { arg.format(out, fmt, fmtEnd, ntrunc); - else { + } else { // The following is a special case with no direct correspondence // between stream formatting and the printf() behaviour. Simulate // it crudely by formatting into a temporary string stream and @@ -801,7 +825,8 @@ class FormatList { FormatList(detail::FormatArg *formatters, int N) : m_formatters(formatters), m_N(N) {} - friend void vformat(std::ostream &out, const char *fmt, + friend void vformat(std::ostream &out, + const char *fmt, const FormatList &list); private: @@ -819,7 +844,7 @@ template class FormatListN : public FormatList { public: template - FormatListN(const Args &... args) + FormatListN(const Args &... args) // NOLINT : FormatList(&m_formatterStore[0], N), m_formatterStore{FormatArg(args)...} { static_assert(sizeof...(args) == N, "Number of args must be N"); @@ -850,7 +875,7 @@ class FormatListN<0> : public FormatList { template detail::FormatListN makeFormatList(const Args &... args) { return detail::FormatListN(args...); -} +} // NOLINT /// Format list of arguments to the stream according to the given format string. /// diff --git a/paddle/utils/string/to_string.h b/paddle/utils/string/to_string.h new file mode 100644 index 0000000000000000000000000000000000000000..7b3332861e0fa3edbbb8915e3e3f068fed3b412f --- /dev/null +++ b/paddle/utils/string/to_string.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include + +namespace paddle { +namespace string { +inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) { + s << t.name(); + return s; +} + +template ::value, int>::type = 0> +inline std::string to_string(T v) { + std::ostringstream sout; + sout << v; + return sout.str(); +} + +template ::value, int>::type = 0> +inline std::string to_string(T v) { + return std::to_string(static_cast(v)); +} + +template <> +inline std::string to_string(std::type_index t) { + return t.name(); +} + +// Faster std::string/const char* type +template <> +inline std::string to_string(std::string v) { + return v; +} + +template <> +inline std::string to_string(const char* v) { + return std::string(v); +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/to_string_test.cc b/paddle/utils/string/to_string_test.cc similarity index 96% rename from paddle/fluid/string/to_string_test.cc rename to paddle/utils/string/to_string_test.cc index 1d9c0e5e0c2b6e7f44c1622d2828b21b0a4380ee..778ba8bb113a29a5831ecd751bb294d52ac7e0eb 100644 --- a/paddle/fluid/string/to_string_test.cc +++ b/paddle/utils/string/to_string_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/string/to_string.h" +#include "paddle/utils/string/to_string.h" #include constexpr char kOutputString[] = "User Defined Output"; diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 2e5adfa5dfbb14749dd614340768bb064d6dbaf1..b9cceafebaac4d86f34362e051934efe244e52ab 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -14,6 +14,7 @@ import contextlib import paddle +from paddle.static import gradients from ..fluid import framework from ..fluid.dygraph import grad from ..tensor.creation import assign @@ -904,3 +905,122 @@ def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): vhp = grad_fn(jac, xs, v) outputs, vhp = return_fn(outputs), return_fn(vhp) return outputs, vhp + + +class Jacobian(object): + r""" + Object that represents the Jacobian matrix of a muli-input multi-output + function. + + The Jacobian values are lazily evaluated if accessed through indices. + In contrast, slicing access would trigger evaluating the full matrix + if it's not already computed. + + Examples: + .. 
code-block:: python

+            import paddle
+            import numpy as np
+
+            def func(xs):
+                x, y = xs
+                return paddle.matmul(x, y)
+
+            main = paddle.static.Program()
+            startup = paddle.static.Program()
+            with paddle.static.program_guard(main, startup):
+                x = paddle.static.data(name='x', shape=[2, 2], dtype='float32')
+                JJ = paddle.autograd.functional.Jacobian(func, [x, x])
+                nrow, ncol = JJ.shape()
+                full_jacobian = JJ[:]
+            place = paddle.CUDAPlace(0)
+            exe = paddle.static.Executor(place)
+            exe.run(startup)
+
+            feeds = {'x': np.array([[2., 2.], [2., 1.]]).astype('float32')}
+            jacobian = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0]
+            print(jacobian)
+            # [[4. 2. 2. 0. 4. 2. 2. 0.]
+            #  [2. 3. 0. 2. 2. 3. 0. 2.]
+            #  [2. 0. 3. 2. 2. 0. 3. 2.]
+            #  [0. 2. 2. 2. 0. 2. 2. 2.]]
+    """
+
+    def __init__(self, func, inputs, batch=False):
+        r"""Construct a Jacobian matrix.
+
+        Parameters:
+            func (Callable): a Python function that takes as input a Tensor
+                or a Tensor list and outputs a Tensor or a Tensor list.
+            inputs (Tensor|list[Tensor]): a Tensor or a list of Tensors as
+                `func`'s input.
+            batch (bool): if True the 0'th axis is considered the batch
+                dimension, both on input and output.
+        """
+
+        def enable_grads(inputs):
+            if isinstance(inputs, (list, tuple)):
+                for x in inputs:
+                    x.stop_gradient = False
+            else:
+                assert isinstance(inputs, paddle.fluid.framework.Variable), (
+                    f"Expecting {inputs} to be paddle.fluid.framework.Variable,"
+                    f" however it's found to be a(n) {type(inputs)}.")
+                inputs.stop_gradient = False
+            return inputs
+
+        self.batch = batch
+        self.xs = enable_grads(inputs)
+        ys = func(inputs)
+        if not isinstance(ys, list):
+            ys = [ys]
+        self.y = self.flatten_all(ys)
+        self.ydim = self.y.shape[-1]
+        self.xdim = self.flatten_all(inputs).shape[-1]
+        self.bdim = self.y.shape[0]
+        self.jacobian = {}
+
+    def flatten(self, x):
+        to = [x.shape[0], -1] if self.batch else [-1]
+        return x.reshape(to)
+
+    def flatten_all(self, xs):
+        return paddle.concat([self.flatten(x) for x in xs], axis=-1)
+
+    def shape(self):
+        return (self.ydim, self.xdim)
+
+    def __getitem__(self, tup):
+        if hasattr(tup, '__iter__'):
+            i, j = tup
+        else:
+            i, j = tup, None
+
+        if isinstance(i, slice):
+            slicing = True
+        else:
+            slicing = False
+
+        if slicing:
+            if 'full' not in self.jacobian:
+                rows = [
+                    self.flatten_all(gradients(self.y[..., i], self.xs))
+                    for i in range(self.ydim)
+                ]
+                self.jacobian['full'] = paddle.stack(rows)
+            return self.jacobian['full'][i]
+
+        assert 0 <= i < self.ydim, f"Jacobian index i={i} is not valid."
+        assert (j is None) or (
+            0 <= j < self.xdim), f"Jacobian index j={j} is not valid."
+ if 'full' in self.jacobian: + JJ = self.jacobian['full'] + else: + JJ = self.jacobian + if i not in self.jacobian: + self.jacobian[i] = self.flatten_all( + gradients(self.y[..., i], self.xs)) + + if j is None: + return JJ[i] + else: + return JJ[i][..., j] diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 6e6d2a672fd18631c4f0ac7073eaada488b37967..da0f2ebcba89ef1ffddf1870eeba75ca07c4a6bb 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -279,7 +279,7 @@ def _is_overlapped(shape_x, shape_y): return overlapped -def _need_reshard(dist_tensor, dist_op): +def _need_reshard(dist_tensor, dist_op, op_input=True): """Judge the tensor whether needs to be resharded.""" is_reshard = False tensor_dist_attr = dist_tensor.dist_attr @@ -289,13 +289,31 @@ def _need_reshard(dist_tensor, dist_op): op_dist_attr = dist_op.dist_attr op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name) op_process_mesh = op_dist_attr.process_mesh - if all( - map(lambda x: x is not None, [ - tensor_dims_mapping, tensor_process_mesh, op_input_dims_mapping, - op_process_mesh - ])): - if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh: - is_reshard = True + if op_input: + op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name) + op_process_mesh = op_dist_attr.process_mesh + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, + op_input_dims_mapping, op_process_mesh + ])): + if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh: + is_reshard = True + else: + op_output_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_name) + op_process_mesh = op_dist_attr.process_mesh + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, + op_output_dims_mapping, op_process_mesh + ])): + if tensor_process_mesh != op_process_mesh: + is_reshard = True + if tensor_dims_mapping != op_output_dims_mapping: + raise ValueError( + "It is not supported that tensor dims mapping is different from op output dims mapping." + ) return is_reshard @@ -948,12 +966,13 @@ def remove_no_need_in_startup(auto_parallel_main_prog, def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, dist_context): """ - Reshard tensor in the program according to its dist attr and corresponding op dist attr. + Reshard tensor in the program according to its distributed attribute and corresponding op distributed attribute. Args: auto_parallel_main_prog (Program): An auto parallel main program. auto_parallel_startup_prog (Program): An auto parallel startup program. rank_id (int): The process id. + dist_context (DistributedContext): The distributed context of this rank. 
""" assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_main_prog should be Program, " \ "but got {}.".format(type(auto_parallel_main_prog)) @@ -1001,6 +1020,34 @@ def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, else: idx += 1 + # insert send and recv op if output process mesh is different from tensor process mesh + idx = 0 + skip_ops = ["create_py_reader", "create_double_buffer_reader", "read"] + while idx < len(block.ops): + pre_op_count = len(block.ops) + op = block.ops[idx] + dist_op = dist_context.get_dist_op_for_program(op) + if dist_op is not None and op.type not in skip_ops: + for var_name in op.output_arg_names: + var = block.vars[var_name] + dist_tensor = dist_context.get_dist_tensor_for_program(var) + if dist_tensor is not None and _need_reshard(dist_tensor, + dist_op, False): + for index, item in enumerate( + dist_op.dist_attr.process_mesh.processes): + recv_rank = dist_tensor.dist_attr.process_mesh.processes[ + index] + if rank_id == item: + _insert_send_op(block, idx + 1, var, recv_rank) + if rank_id == recv_rank: + _insert_recv_op(block, idx + 1, var, item) + cur_op_count = len(block.ops) + idx_offset = idx_offset + cur_op_count - pre_op_count + pre_op_count = cur_op_count + idx = idx + idx_offset + 1 + else: + idx += 1 + # remove no need vars and ops in the main program remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index b08753066ca3614c02252aff1b72fbdcfceb5698..f0cf6573139d90620436ba943b1af5be5bd5cb15 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -224,6 +224,14 @@ class Gloo(object): self._worker_comm = gloo # TODO (sandyhouse): initialize gloo for server and all + # the closing of kv server may cause gloo init failure + # since it depend on the full mesh connection + # e.g. 
0 connected with 1,2,3 while 2-3 not connected yet + # TODO(kuizhiqing) + if start_http_server: + http_server_d["running"] = False + http_server.join() + def _get_rank_nodes(self, role): nodes = 0 rank = -1 diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index e7108b3f4f3432df04556b4cf78726a63cc8b076..50bf8a2f9c7c58b3390d2881cb5d6e8510e78ae8 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -49,8 +49,6 @@ class HybridParallelClipGrad: @imperative_base.no_grad def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_dist_fp16 = [] sum_square_dist_fp32 = [] sum_square_not_dist_fp16 = [] @@ -153,15 +151,14 @@ class HybridParallelClipGrad: if g is None: continue if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) continue if p.dtype == paddle.float16: - new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + g.scale_(clip_var_fp16) else: - new_grad = layers.elementwise_mul(x=g, y=clip_var) - params_and_grads.append((p, new_grad)) + g.scale_(clip_var) + p._reset_grad_inplace_version(True) - return params_and_grads + return params_grads def __getattr__(self, item): return getattr(self._clip, item) @@ -201,6 +198,12 @@ class HybridParallelOptimizer: else: self._inner_opt._grad_clip = HybridParallelClipGrad( self._inner_opt._grad_clip, hcg) + if self._inner_opt._parameter_list and isinstance( + self._inner_opt._parameter_list[0], dict): + for item in self._inner_opt._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) @imperative_base.no_grad @framework.dygraph_only diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index a2797adff251aea3535f86e5c423463d748c37b3..fc5b93c6e25499a0ae50c19cacae4a9395520fe9 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -109,6 +109,13 @@ class ShardingOptimizerStage2(Optimizer): self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, paddle.get_device(), self.group) + if self._optim._parameter_list and isinstance( + self._optim._parameter_list[0], dict): + for item in self._optim._param_groups: + if "grad_clip" in item.keys(): + item["grad_clip"] = ShardingClipGrad( + self._optim._grad_clip, + paddle.get_device(), self.group) if offload: assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 41c6f92230ab3e0e8de9aec0abdf920fad1ef232..9d7bd937411882541d9cb1311c241d3e84316c90 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -393,6 +393,7 @@ class ShardingStage3(nn.Layer): else: param.bw_storage.scale_(scale=self._world_size_scaling) param.fw_storage = 
_VarBaseWrapper(param) + assert param.fw_storage.grad is None param.fw_storage._copy_gradient_from(param.bw_storage) update_list.append(param) return update_list @@ -495,10 +496,9 @@ class ShardingStage3(nn.Layer): def _redefine_opt_step(self): params_slice_func = self._update_params_slice opt_step = self._optim.step - update_scaler = self._optim.update_scaler def _opt_step(self): - if not update_scaler: + if not self.update_scaler: params_slice_func() if self.offload: with device_guard(device="cpu"): diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 5f696195c1abcd4921b4358b8971fdbc982609da..9c30ff5a45075ae423d6a46ef328e3b6523fbd5b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -57,8 +57,6 @@ class ShardingClipGrad: @imperative_base.no_grad def _dygraph_clip(self, params_grads): - params_and_grads = [] - sum_square_fp16 = [] sum_square_fp32 = [] @@ -114,15 +112,14 @@ class ShardingClipGrad: if g is None: continue if getattr(p, 'need_clip', True) is False: - params_and_grads.append((p, g)) continue if p.dtype == paddle.float16: - new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + g.scale_(clip_var_fp16) else: - new_grad = layers.elementwise_mul(x=g, y=clip_var) - params_and_grads.append((p, new_grad)) + g.scale_(clip_var) + p._reset_grad_inplace_version(True) - return params_and_grads + return params_grads def __getattr__(self, item): return getattr(self._clip, item) diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py index fe6ef74bd85c8fdbbb02d9c15f0392acc5606786..6b8ea30f3ba17711935f6c58fc44a9dd05b7b4ea 100644 --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -26,3 +26,16 @@ class FuseElementwiseAddActPass(CPPPassWrapper): def _type(self): return PassType.FUSION_OPT + + +@register_pass("fuse_bn_act") +class FuseBatchNormActPass(CPPPassWrapper): + def __init__(self): + super(FuseBatchNormActPass, self).__init__() + + @property + def cpp_name(self): + return "fuse_bn_act_pass" + + def _type(self): + return PassType.FUSION_OPT diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 5ee7b04248e4527357060839a769f7a2c726d744..84d3f5547feb4b5c4fa6d3d9b88a57b9a1e52344 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1763,7 +1763,10 @@ class Variable(object): Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() cur_program = fluid.Program() cur_block = cur_program.current_block() new_variable = cur_block.create_var(name="X", @@ -1773,7 +1776,8 @@ class Variable(object): """ if self.type == core.VarDesc.VarType.SELECTED_ROWS: raise Exception("SelectedRows DO NOT supprt lod") - + if self.type == core.VarDesc.VarType.STRINGS: + return None return self.desc.lod_level() @property diff --git a/python/paddle/fluid/tests/custom_op/attr_test_op.cc b/python/paddle/fluid/tests/custom_op/attr_test_op.cc index 1c79d9a26aee3409fc2a32b755abcd45f4ca06c3..14cb0aa7c716d8449c672231f5399027275f8c5d 100644 --- a/python/paddle/fluid/tests/custom_op/attr_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/attr_test_op.cc @@ -137,9 +137,7 @@ std::vector AttrTestForward( PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); // Check attrs value @@ -177,13 +175,12 @@ std::vector AttrTestBackward( const std::vector& str_vec_attr) { auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); - PD_DISPATCH_FLOATING_TYPES( - grad_out.type(), "assign_cpu_kernel", ([&] { - assign_cpu_kernel( - grad_out.data(), - grad_x.mutable_data(paddle::PlaceType::kCPU), - grad_out.size()); - })); + PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + grad_out.data(), + grad_x.mutable_data(), + grad_out.size()); + })); CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr); @@ -206,9 +203,7 @@ std::vector ConstAttrTestForward( PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); // Check attrs value @@ -246,13 +241,12 @@ std::vector ConstAttrTestBackward( const std::vector& str_vec_attr) { auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); - PD_DISPATCH_FLOATING_TYPES( - grad_out.type(), "assign_cpu_kernel", ([&] { - assign_cpu_kernel( - grad_out.data(), - grad_x.mutable_data(paddle::PlaceType::kCPU), - grad_out.size()); - })); + PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + grad_out.data(), + grad_x.mutable_data(), + grad_out.size()); + })); CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr); diff --git a/python/paddle/fluid/tests/custom_op/concat_and_split.h b/python/paddle/fluid/tests/custom_op/concat_and_split.h index cbec4653a207d9b92da48d0cad79288159329a6a..9f24cc43699773fc531ccd68d4219ebcdfdab8eb 100644 --- a/python/paddle/fluid/tests/custom_op/concat_and_split.h +++ b/python/paddle/fluid/tests/custom_op/concat_and_split.h @@ -47,7 +47,7 @@ void ConcatCpuKernel(const std::vector& ins, int64_t out_cols = 0; auto ins_cols = GetCols(ins, out_rows, &out_cols); - auto* out_data = out->mutable_data(paddle::PlaceType::kCPU); + auto* out_data = out->mutable_data(); int64_t col_idx = 0; for (size_t i = 0; i < num; ++i) { int64_t col_len = ins_cols[i]; @@ -76,9 +76,7 @@ void SplitCpuKernel(const paddle::Tensor& in, int64_t col_idx = 0; for (size_t j = 0; j < num; ++j) { int64_t col_len = out_cols[j]; - auto* out_data = - outs->at(j).mutable_data(paddle::PlaceType::kCPU) + - i * col_len; + auto* out_data = outs->at(j).mutable_data() + i * col_len; std::memcpy(out_data, in_data + col_idx, sizeof(data_t) * col_len); 
col_idx += col_len; } diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc index ae60799d239467ff8637f2e494315c2ac8c08744..b9c10f479e0a39eb8e33ffceb30e8eb9cc8efa9e 100644 --- a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc @@ -76,9 +76,7 @@ std::vector ConjFunction(const paddle::Tensor& x) { PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "ConjCPUKernel", ([&] { ConjCPUKernel( - x.data(), - x.size(), - out.mutable_data(paddle::PlaceType::kCPU)); + x.data(), x.size(), out.mutable_data()); })); return {out}; diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc index d5f161fc5b775d92627bfcd0b0f4b0fa347d02be..0f7d323b5451efba5a503d9039a03531e1773efb 100644 --- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc @@ -32,9 +32,7 @@ std::vector DispatchTestInterger(const paddle::Tensor& x) { PD_DISPATCH_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -52,9 +50,7 @@ std::vector DispatchTestFloatAndInteger( PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -71,9 +67,7 @@ std::vector DispatchTestComplex(const paddle::Tensor& x) { PD_DISPATCH_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -91,9 +85,7 @@ std::vector DispatchTestFloatAndComplex( PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -111,9 +103,7 @@ std::vector DispatchTestFloatAndIntegerAndComplex( PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; @@ -130,9 +120,7 @@ std::vector DispatchTestFloatAndHalf(const paddle::Tensor& x) { PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( - x.data(), - out.mutable_data(paddle::PlaceType::kCPU), - x.size()); + x.data(), out.mutable_data(), x.size()); })); return {out}; diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2ac5e9404c1ba52adcd4aaa86485c11a5ec881b4..2e35277d70cd62ab0a7a931cb32e2d9ead99ed73 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -156,7 +156,8 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_origin_scheduler) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node) - LIST(REMOVE_ITEM TEST_OPS test_dist_model_tensor) + LIST(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_run) + LIST(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_tensor) endif() # Temporally 
disable test_deprecated_decorator diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py b/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py new file mode 100644 index 0000000000000000000000000000000000000000..28fc6932b07310c4b2fced4d6e6122260e37fb2d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_jacobian_static.py @@ -0,0 +1,346 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from utils import _compute_numerical_jacobian, _compute_numerical_batch_jacobian + + +def approx_jacobian(f, xs, dtype, eps=1e-5, batch=False): + r"""Computes an approximate Jacobian matrix of a multi-valued function + using finite differences. + + The function input is required to be an np array or a list of list of np + arrays. + """ + + def flatten(x): + if len(x.shape) > 0: + to = [x.shape[0], -1] if batch else [-1] + return x.reshape(to) + else: + return x + + def flatten_all(xs): + if isinstance(xs, list): + flattened = np.concatenate([flatten(x) for x in xs], axis=-1) + else: + flattened = flatten(xs) + return flattened + + def x_like(x, orig_x): + return x.reshape(orig_x.shape) + + def _f(x): + if multi_inps: + _xs = np.split(x, splits, axis=-1) + _xs = [x_like(_x, _o) for _x, _o in zip(_xs, xs)] + outs = f(_xs) + else: + outs = f(x) + return flatten_all(outs) + + multi_inps = False if isinstance(xs, np.ndarray) else True + x = flatten_all(xs) + xdim = x.shape[-1] + splits = [] + + if multi_inps: + split = 0 + for inp in xs: + split += flatten(inp).shape[-1] + splits.append(split) + + ds = eps * np.eye(xdim, dtype=dtype) + + fprimes_by_x = [(0.5 / eps) * (_f(x + d) - _f(x - d)) for d in ds] + fprimes_by_y = np.stack(fprimes_by_x, axis=-1) + return np.transpose(fprimes_by_y, [1, 0, 2]) if batch else fprimes_by_y + + +class TestJacobianFloat32(unittest.TestCase): + @classmethod + def setUpClass(self): + paddle.enable_static() + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + self.np_dtype = np.float32 + self.A = np.array([[1., 2.]]).astype('float32') + self.B = np.array([[1., 2.], [2., 1.]]).astype('float32') + self.C = np.array([[2., 2.], [2., 1.]]).astype('float32') + self.D = np.array( + [[[2., 2.], [2., 1.]], [[1., 2.], [2., 1.]]]).astype('float32') + self.E = np.array( + [[[3., 4.], [2., 3.]], [[2., 1.], [1., 3.]]]).astype('float32') + self.eps = 1e-4 + self.rtol = 1e-2 + self.atol = 1e-2 + + def run_test(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ 
= paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + full_jacobian = JJ[:] + exe = fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_jacobians = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0] + np_jacobians = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + self.assertTrue( + np.allclose(pd_jacobians, np_jacobians, self.rtol, self.atol)) + + def test_square(self): + def pd_f(x): + return paddle.multiply(x, x) + + def np_f(x): + return np.multiply(x, x) + + self.run_test(pd_f, np_f, self.A, np.dtype('float32')) + + def test_mul(self): + def pd_f(xs): + x, y = xs + return paddle.multiply(x, y) + + def np_f(xs): + x, y = xs + return np.multiply(x, y) + + self.run_test(pd_f, np_f, [self.B, self.C], np.dtype('float32')) + + def test_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test(pd_f, np_f, [self.B, self.C], np.dtype('float32')) + + def test_batch_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test( + pd_f, np_f, [self.D, self.E], np.dtype('float32'), batch=True) + + +class TestJacobianFloat64(unittest.TestCase): + @classmethod + def setUpClass(self): + paddle.enable_static() + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + self.np_dtype = np.float32 + self.A = np.array([[1., 2.]]).astype('float64') + self.B = np.array([[1., 2.], [2., 1.]]).astype('float64') + self.C = np.array([[2., 2.], [2., 1.]]).astype('float64') + self.D = np.array( + [[[2., 2.], [2., 1.]], [[1., 2.], [2., 1.]]]).astype('float64') + self.E = np.array( + [[[3., 4.], [2., 3.]], [[2., 1.], [1., 3.]]]).astype('float64') + self.eps = 1e-7 + self.rtol = 1e-6 + self.atol = 1e-6 + + def run_test_by_fullmatrix(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ = paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + full_jacobian = JJ[:] + exe = fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_jacobians = exe.run(main, feed=feeds, fetch_list=[full_jacobian])[0] + np_jacobians = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + self.assertTrue( + np.allclose(pd_jacobians, np_jacobians, self.rtol, self.atol)) + + def run_test_by_rows(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ = paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + rows = [JJ[i] for i in range(nrow)] + exe = 
fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_jac = exe.run(main, feed=feeds, fetch_list=[rows]) + np_jac = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + for i in range(nrow): + self.assertTrue( + np.allclose(pd_jac[i], np_jac[i], self.rtol, self.atol)) + + def run_test_by_entries(self, pd_f, np_f, inps, dtype, batch=False): + def make_tensors(inps): + if isinstance(inps, list): + xs = [ + paddle.static.data( + f'x{i}', inp.shape, dtype=inp.dtype) + for i, inp in enumerate(inps) + ] + else: + xs = paddle.static.data( + name='x', shape=inps.shape, dtype=inps.dtype) + return xs + + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + xs = make_tensors(inps) + JJ = paddle.autograd.functional.Jacobian(pd_f, xs, batch=batch) + nrow, ncol = JJ.shape() + entries = [JJ[i, j] for i in range(nrow) for j in range(ncol)] + exe = fluid.Executor(self.place) + exe.run(startup) + if isinstance(inps, list): + feeds = {f'x{i}': x for i, x in enumerate(inps)} + else: + feeds = {'x': inps} + pd_entries = exe.run(main, feed=feeds, fetch_list=[entries]) + np_jac = approx_jacobian(np_f, inps, dtype, self.eps, batch=batch) + np_entries = [ + np_jac[i, ..., j] for i in range(nrow) for j in range(ncol) + ] + for pd_entry, np_entry in zip(pd_entries, np_entries): + self.assertTrue( + np.allclose(pd_entry, np_entry, self.rtol, self.atol)) + + def test_square(self): + def pd_f(x): + return paddle.multiply(x, x) + + def np_f(x): + return np.multiply(x, x) + + self.run_test_by_fullmatrix(pd_f, np_f, self.A, np.dtype('float64')) + self.run_test_by_rows(pd_f, np_f, self.A, np.dtype('float64')) + self.run_test_by_entries(pd_f, np_f, self.A, np.dtype('float64')) + + def test_mul(self): + def pd_f(xs): + x, y = xs + return paddle.multiply(x, y) + + def np_f(xs): + x, y = xs + return np.multiply(x, y) + + self.run_test_by_fullmatrix(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + self.run_test_by_rows(pd_f, np_f, [self.B, self.C], np.dtype('float64')) + self.run_test_by_entries(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + + def test_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test_by_fullmatrix(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + self.run_test_by_rows(pd_f, np_f, [self.B, self.C], np.dtype('float64')) + self.run_test_by_entries(pd_f, np_f, [self.B, self.C], + np.dtype('float64')) + + def test_batch_matmul(self): + def pd_f(xs): + x, y = xs + return paddle.matmul(x, y) + + def np_f(xs): + x, y = xs + return np.matmul(x, y) + + self.run_test_by_fullmatrix( + pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) + self.run_test_by_rows( + pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) + self.run_test_by_entries( + pd_f, np_f, [self.D, self.E], np.dtype('float64'), batch=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index 48a9f7204aa8d26090d1b4e9a059cad7f382612f..b3ba7c80b32265912c746db4fed76773e127255f 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -4,6 +4,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP 
${TEST_OPS})
   py_test_modules(${TEST_OP} MODULES ${TEST_OP})
   list(APPEND DIST_TEST_OPS ${TEST_OP})
-  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 90)
+  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120)
   set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
 endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7147724fbc5c35d27a6172576539fb24d41ca5a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import paddle
+import paddle.distributed.fleet as fleet
+import numpy as np
+import paddle.nn as nn
+from paddle.distributed.passes import new_pass, PassManager
+import unittest
+from dist_pass_test_base import DistPassTestBase
+
+
+class BatchNormActNet(nn.Layer):
+    def __init__(self):
+        super(BatchNormActNet, self).__init__()
+
+        self.conv1 = nn.Conv2D(3, 8, (3, 3), data_format="NHWC")
+        self.bn1 = nn.BatchNorm2D(8, data_format="NHWC")
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = paddle.flatten(out, 1)
+        return out
+
+
+class TestFuseBatchNormActPass(DistPassTestBase):
+    def init(self):
+        self.atol = 1e-4
+        self.rtol = 1e-4
+
+    def get_model(self, place, batch_size=32, image_shape=[224, 224, 3]):
+        image = paddle.static.data(
+            shape=[batch_size] + image_shape, dtype='float32', name='image')
+
+        model = BatchNormActNet()
+        pred_out = model(image)
+        loss = paddle.mean(pred_out)
+        optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
+
+        dist_strategy = fleet.DistributedStrategy()
+        dist_strategy.fuse_all_reduce_ops = False
+        dist_strategy.without_graph_optimization = True
+        dist_strategy.amp = True
+        dist_strategy.amp_configs = {
+            "init_loss_scaling": 32768,
+            "use_dynamic_loss_scaling": True,
+        }
+        fleet.init(is_collective=True, strategy=dist_strategy)
+        optimizer = fleet.distributed_optimizer(optimizer)
+        optimizer.minimize(loss)
+
+        rank = paddle.distributed.get_rank()
+
+        def reader():
+            seed = int(os.environ.get("SEED", 0))
+            np.random.seed(seed + rank)
+            for _ in range(10):
+                image_np = np.random.random(size=image.shape).astype('float32')
+                yield image_np,
+
+        main_program = paddle.static.default_main_program()
+        startup_program = paddle.static.default_startup_program()
+        return main_program, startup_program, [image], [loss], reader
+
+    def apply_passes(self, main_prog, startup_prog):
+        pass_manager = PassManager([new_pass("fuse_bn_act")])
+        pass_manager.apply([main_prog], [startup_prog])
+        print(pass_manager.names)
+
+        op_type = []
+        for op in main_prog.global_block().ops:
+            op_type.append(op.type)
+        self.assertTrue("fused_batch_norm_act" in op_type)
+        self.assertTrue("fused_batch_norm_act_grad" in op_type)
+
+    def test_fuse_bn_act(self):
+        self.check_main()
+
+
+if __name__ == "__main__":
+    unittest.main()
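The `fuse_bn_act` pass reaches Python through the `CPPPassWrapper` shim added to `cpp_pass.py` earlier in this change, so exposing a further C++ fusion pass is a three-member subclass. A sketch of that pattern; `fuse_relu_depthwise_conv_pass` is only an illustrative pass name, and the `register_pass`/`PassType` import paths are assumed to match what `cpp_pass.py` itself uses:

```python
# Sketch: exposing another existing C++ IR pass through the same wrapper.
# The pass name is an illustrative stand-in; import paths for
# register_pass/PassType are assumed to mirror cpp_pass.py.
from paddle.distributed.passes.pass_base import PassType, register_pass
from paddle.distributed.passes.cpp_pass import CPPPassWrapper


@register_pass("fuse_relu_depthwise_conv")
class FuseReluDepthwiseConvPass(CPPPassWrapper):
    def __init__(self):
        super(FuseReluDepthwiseConvPass, self).__init__()

    @property
    def cpp_name(self):
        # Name of the C++ pass to run, as registered on the C++ side.
        return "fuse_relu_depthwise_conv_pass"

    def _type(self):
        return PassType.FUSION_OPT
```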
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
index 9206d744990008496e7af43d67e000f9d00f6dab..80acf7217e76fb996e6b76aa519307c44952636e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py
@@ -159,10 +159,13 @@ def test_dp_stage2():
     mlp2 = MLP()
     mlp3 = MLP()
     mlp4 = MLP()
+    mlp5 = MLP()
     mlp1.set_state_dict(state_dict)
     mlp2.set_state_dict(state_dict)
     mlp3.set_state_dict(state_dict)
     mlp4.set_state_dict(state_dict)
+    mlp5.set_state_dict(state_dict)
+
     dp_params = train_mlp(
         mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False)
     stage2_params = train_mlp(
@@ -181,6 +184,11 @@ def test_dp_stage2():
         rtol=1e-5,
         atol=1e-5)
 
+    stage2_params = train_mlp(
+        mlp5, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+    for i in range(len(dp_params)):
+        np.testing.assert_allclose(
+            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)
     return
diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
index f7e426377382bb089d9a4c4f968759f38c40e647..84ffe9094d8126ac75f864022659cbf2e101ad65 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
@@ -49,7 +49,7 @@ def train_mlp(model, offload=False):
     optimizer = ShardingOptimizerStage2(
         params=model.parameters(), optim=optimizer, offload=offload)
     model = ShardingStage2(
-        model, optimizer, buffer_max_size=2**21, accumulate_grads=True)
+        model, optimizer, buffer_max_size=2**21, accumulate_grads=False)
 
     train_reader = paddle.batch(
         reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
@@ -98,12 +98,11 @@ def test_sharding_stage2_offload():
     mlp_offload_params = train_mlp(mlp_offload, offload=True)
 
     for i in range(len(mlp_params)):
-        for j in range(len(mlp_offload_params)):
-            if mlp_params[i].name == mlp_offload_params[j].name:
-                np.testing.assert_allclose(
-                    mlp_params[i].numpy(),
-                    mlp_offload_params[j].numpy(),
-                    rtol=1e-6)
+        np.testing.assert_allclose(
+            mlp_params[i].numpy(),
+            mlp_offload_params[i].numpy(),
+            rtol=5e-3,
+            atol=5e-3)
     return
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
index de980f3c3f787e4e55a9ac06b92609d0cbbfb9c6..430c6e0884822dc9d38f593b4cee26f96ed18b3b 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py
@@ -31,5 +31,19 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer
 
 
+class TestPPClipGradParamGroup(TestDistPPTraning):
+    def build_optimizer(self, model):
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(
+            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        optimizer = paddle.optimizer.Momentum(
+            learning_rate=scheduler,
+            grad_clip=grad_clip,
+            parameters=[{
+                "params": model.parameters()
+            }])
+        return scheduler, optimizer
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py
b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py index bedf0a38549b8ebd23563389e05dff6f26967933..ddda666db2c0cb043624ff7249d0ea08c455c0a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py +++ b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py @@ -856,16 +856,15 @@ if __name__ == "__main__": paddle.static.load(main_prog, "model/ernie") if args.run_on_ipu: - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.num_ipus = args.num_ipus - ipu_strategy.enable_manual_shard = args.num_ipus > 1 - ipu_strategy.enable_pipelining = args.enable_pipelining - if args.enable_pipelining: - if args.is_training: - ipu_strategy.batches_per_step = args.num_ipus + 1 - else: - ipu_strategy.batches_per_step = args.num_ipus - ipu_strategy.is_training = args.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + num_ipus=args.num_ipus, + is_training=args.is_training, + enable_manual_shard=args.num_ipus > 1) + ipu_strategy.SetPipeliningConfig( + enable_pipelining=args.enable_pipelining, + batches_per_step=args.num_ipus + 1) + ipu_compiler = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy) program = ipu_compiler.compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py index 0f726accfa83c66784ef51bddd660c20e7968ffd..58a88c113fc0b6b82c1c58d50a1b0824cb530632 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py @@ -72,8 +72,8 @@ class TestRelu(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IpuCompiler( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index c4d8b3ee89f439091e065dbfe5d3277c3e16b64b..a23cacf47636b434b3211ca377e8e6a5e79fa64b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -81,10 +81,12 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training - # enable avg shard pass - ipu_strategy.need_avg_shard = True + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + num_ipus=2, + is_training=self.is_training, + enable_manual_shard=True, + need_avg_shard=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index ee81354c44620e3807e3c191be7fa62c0a9473c4..87f783dbd1c1aef2f5bcc40b407dff9f4bbe0916 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -79,8 +79,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) 
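The IPU hunks above and below all repeat one mechanical migration: the free-form attributes on `compiler.get_ipu_strategy()` become grouped setters on `paddle.static.IpuStrategy`. Collected in one place, a sketch of the new surface as these tests use it; an IPU-enabled build is assumed, and `batches_per_step=3` is just an illustrative value:

```python
# Sketch of the new grouped-setter API, assuming an IPU-enabled build.
import paddle

paddle.enable_static()

ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.SetGraphConfig(
    num_ipus=2,                 # was: ipu_strategy.num_ipus = 2
    is_training=False,          # was: ipu_strategy.is_training = False
    enable_manual_shard=True,   # was: ipu_strategy.enable_manual_shard = True
    need_avg_shard=True)        # was: ipu_strategy.need_avg_shard = True
ipu_strategy.SetPipeliningConfig(
    enable_pipelining=True,     # was: ipu_strategy.enable_pipelining = True
    batches_per_step=3)         # was: ipu_strategy.batches_per_step = 3
ipu_strategy.SetHalfConfig(
    enable_fp16=True)           # was: ipu_strategy.enable_fp16 = True
```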
program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 19026f5e05989c97b655da020a94a15f7b4fe0fe..6e58f809046000bb7a41b9875f9ebf945b86fd07 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -81,8 +81,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index 2b59f2bb729e4f5e12eec0c1f72dec2ff292536f..094b19ce99da9c73c188f000dc7080504bc9ff3e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -83,8 +83,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index fb237c30d49cdc7345275558de3350c69fcd480b..f28733de6b1a12a8aac362e30c8478a145520506 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -81,8 +81,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 9d49b054370b7a9bad1f85bd2a57dfcf840d8b25..3987c6cd5b386ae22a2fcac1a985e5e915a3e5ae 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -93,8 +93,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py index 1cd2037f2283e012918303b051396b9b0abb1977..8b1560edfd81de65f495a1bd0609bfa09c5f7810 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py @@ -83,8 +83,8 @@ 
class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index 4aa202ace2fc278c9c0835d7910448e9d0871cce..07b06d77c90ffb41d3a221c94d52bdc2c7d1a3a5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -75,8 +75,8 @@ class TestMul(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index b2548a17634d721b882509ea5bd27bc459120f38..c319894bfae250789bf9931343ac29106629a148 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -84,8 +84,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 018c9a7876f886093c9e8345f073ceaa08eb237a..5b7ea61568ecd5772d574dd3cc63fac74535c903 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -142,8 +142,8 @@ class TestCase1(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index 6a43fb46eea496ca42f2863cdbf4b20ace1677fe..c62e0c08f9c79c2ee1217b2b74df84c26cf30f1f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -78,8 +78,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy 
= compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index af220a34ddb02652edf690c700309830b5914733..d5be8ae0cf77526a6aefa1fd060a510689c43cc0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -83,8 +83,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index 6c9d3f29adf3c0674045e653572e892141b699c7..ca8c0935d782cc275838871e2c73a3eba9454e5b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -81,8 +81,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index 1afc0cb9ed330d95de68e29743980e9920922beb..eb644c2c6670f5beedbb3ad0b1868bc8a0f9434e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -101,8 +101,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index b50ed7bdbab52fd7d49b1bd202406930da109ced..ee9cd875cf29884da58dc7f0488e7f9bfc50d4e5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -97,8 +97,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py index 
6b549b306f0d379d46ed3597c116fbcbfabccbb4..9b485d7794db2cbb538317b0d664d6ece9799a83 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py
@@ -62,9 +62,14 @@ class TestFunc(unittest.TestCase):
         if run_ipu:
             feed_list = [image.name]
             fetch_list = [out.name]
-            ipu_strategy = compiler.get_ipu_strategy()
-            ipu_strategy.is_training = False
-            ipu_strategy.batches_per_step = bps
+            ipu_strategy = paddle.static.IpuStrategy()
+            ipu_strategy.SetGraphConfig(
+                num_ipus=2,
+                is_training=False,
+                enable_manual_shard=True,
+                need_avg_shard=True)
+            ipu_strategy.SetPipeliningConfig(
+                enable_pipelining=True, batches_per_step=bps)
             program = compiler.IPUCompiledProgram(
                 main_prog,
                 ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
index d135e5a586e7dffd667adc88179a8df830deb245..aa6c05dc59a87f844c19912be484a4b007f0adfc 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py
@@ -83,9 +83,9 @@ class TestBase(IPUOpTest):
         feed = self.feed_ipu if run_ipu else self.feed_cpu
         if run_ipu:
             feed_list = self.feed_list
-            ipu_strategy = compiler.get_ipu_strategy()
-            ipu_strategy.is_training = False
-            ipu_strategy.enable_fp16 = True
+            ipu_strategy = paddle.static.IpuStrategy()
+            ipu_strategy.SetGraphConfig(is_training=False)
+            ipu_strategy.SetHalfConfig(enable_fp16=True)
             program = compiler.IPUCompiledProgram(
                 main_prog,
                 ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
index f8ab3f81e9d3d63c2fe460dd359e3a2a54e02b7d..0a331d804545d49eeaffdf0c8054db89041f2c29 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py
@@ -94,8 +94,9 @@ class TestBase(IPUOpTest):
         exe = paddle.static.Executor(place)
         exe.run(startup_prog)
 
-        ipu_strategy = compiler.get_ipu_strategy()
-        ipu_strategy.is_training = self.attrs['is_training']
+        ipu_strategy = paddle.static.IpuStrategy()
+        ipu_strategy.SetGraphConfig(
+            is_training=self.attrs['is_training'])
         program = compiler.IPUCompiledProgram(
             main_prog, ipu_strategy=ipu_strategy).compile(
                 self.feed_list, fetch_list)
@@ -123,8 +124,8 @@ class TestBase(IPUOpTest):
             if run_ipu:
                 feed_list = feed_target_names
                 fetch_list = [fetch_targets[0].name]
-                ipu_strategy = compiler.get_ipu_strategy()
-                ipu_strategy.is_training = False
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.SetGraphConfig(is_training=False)
                 program = compiler.IPUCompiledProgram(
                     inference_program,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
index 00fc0dd6633aed5b3dbc08a5170f476ba6d160ef..e1ed7603ed6272ba91cf485d91f512f07b72a258 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py
@@ -59,11 +59,9 @@ class TestCastNet(unittest.TestCase):
         if run_ipu:
             feed_list = [image.name]
             fetch_list = [loss.name]
-            ipu_strategy = compiler.get_ipu_strategy()
-            ipu_strategy.num_ipus
= 2 - ipu_strategy.is_training = False - ipu_strategy.enable_manual_shard = True - ipu_strategy.enable_pipelining = False + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + num_ipus=2, is_training=False, enable_manual_shard=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py index 741ca8784bb602c5d8ab855b11d2478b3606c130..afeec9ee1b6fa75961aa76bd2f2c2f6701e200b5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py @@ -29,7 +29,7 @@ SEED = 2021 "core is not compiled with IPU") class TestConvNet(unittest.TestCase): def test_training(self): - ipu_strategy = compiler.get_ipu_strategy() + ipu_strategy = paddle.static.IpuStrategy() assert ipu_strategy.num_ipus == 1, "Default num_ipus must be 1" assert ipu_strategy.is_training == True, "Default is_training is True" @@ -38,17 +38,16 @@ class TestConvNet(unittest.TestCase): assert ipu_strategy.enable_manual_shard == False, \ "Default enable_manual_shard is False" - ipu_strategy.num_ipus = 2 + ipu_strategy.SetGraphConfig( + num_ipus=2, is_training=False, enable_manual_shard=True) + ipu_strategy.SetPipeliningConfig(enable_pipelining=True) assert ipu_strategy.num_ipus == 2, "Set num_ipus Failed" - ipu_strategy.is_training = False assert ipu_strategy.is_training == False, "Set is_training Failed" - ipu_strategy.enable_pipelining = True assert ipu_strategy.enable_pipelining == True, \ "Set enable_pipelining Failed" - ipu_strategy.enable_manual_shard = True assert ipu_strategy.enable_manual_shard == True, \ "Set enable_manual_shard Failed" diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index 043bc8ad36296228624c4b8c1e9f34b8e872f962..196f94b68f94a08f1b08871c805a9d7ceee14ffa 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -104,8 +104,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py index 6f85c4f381e5d74a0e09acb00587fe8c275f09c5..dc3cab6ac5e114f7083937687c1dfedf0ebd1c44 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py @@ -79,8 +79,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 
2443541c79991d23f20f93b91eb6773b7e9419df..31b0c99603c3f707328eef5a3713bcd9b882b948 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -96,8 +96,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py index 0aac2344b3cf2811c990ae90288c966158e53673..38b91785aeec8c061a5d4f6203363645569c2824 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py @@ -70,8 +70,8 @@ class TestConvNet(unittest.TestCase): if run_ipu: feed_list = [image.name] fetch_list = [loss.name] - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = True + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index 7133a76c607cbfaa34882106b459e71b9df48edd..c6702b92ab969ec7b1b97c44bc9245c6df41d83a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -87,8 +87,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index 87c67fb72c3df8a1b5f171465f30146f1f8fbab8..f04d712755deadd44158431569430909f19a093e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 678276eba317862b3e9385badddf04c9e57c2e61..78a2589d9aca59ec72d22aa6fa35f9f934f94caf 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -86,8 +86,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = 
compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index cb9967831feee363b75a3a1fccc7ec37754dffdd..e81591ad68368033bc9b5223753203ae3126c005 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -87,8 +87,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index fffac1218576bf747548be6016e2bdb340e82b30..a7c45c6686f10e3c13462d49ab8e34fe72c4f03a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -87,8 +87,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 8ede44d7f92297745bd4c621adcf0d3846afff59..5059de7ba77b1baf65c511da11735a73a87aa1e8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -78,8 +78,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -141,8 +141,8 @@ class TestCase1(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index f8c96684afe8f3c8222fcc4ec6f9c179a73158fd..ac8ad08e8b28c00555c5c78f5b8a834b0024acc7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -71,8 +71,8 @@ class TestMean(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index a6e0552691b857c6f53bf8dc327c0e874bab8708..f312b7b69ad79b721fd768f6762a48ca68793d6d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -82,8 +82,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 2cc0d1770be13cea265d521c20dc4dabe186dbd5..5163838bc0cd633f69e2e446294250686e4fe04f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load.py index 5d9ff4eb886c02a551bdc6ef9ffc2c2e653b999a..24bb8e111842cb93ed35cdc796868c5a911ee36f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load.py @@ -93,8 +93,9 @@ class TestBase(IPUOpTest): if not save_otherwise_load: paddle.static.load(main_prog, "model/model") - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.attrs['is_training'] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + is_training=self.attrs['is_training']) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile( self.feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 94758155b35a82d3971f6f25192eb5516276e7ed..6ad2a89a738b7090ce082a637c3d531cf4566f9a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -82,8 +82,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -175,8 +175,8 @@ class TestCase4(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 335e90b4607b2fe84b2c37b2a1bae75fca2a0044..93945b98ef0a26b35b28e1cf9d2bb5e88d21f308 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -81,10 +81,9 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training - # set batch size - ipu_strategy.batch_size = 2 + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig( + batch_size=2, is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py b/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py index ee469c5fc1de992dc24626116c1d9ed06410690f..df0e2a040bd3e55aac20e383c7aa2ff50c567de4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py @@ -59,8 +59,8 @@ class TestSGD(unittest.TestCase): if run_ipu: feed_list = [image.name] fetch_list = [loss.name] - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = True + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=True) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 7261f26c0ec108a50514e0361d28f5de274f1176..3bdfeabce6592cd25067adfdabe8b7c74a6848c7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -80,8 +80,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -159,8 +159,8 @@ class TestCase2(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index 36cb529c231a0fe9c875fa7f2d4827c63d5ca447..a4a4b83baf35e558ff1f8b0982ffc44287919dfc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -77,8 +77,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index 672e6ede0ede10c490edbb7c41fafb05fdae235d..ccd2796590838faa8980ef7d214f73a944c9220d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -77,11 +77,11 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index 360e81f2862cddce6d54a4d2fb6f5cda631d7ae7..3d5de11b5e213e3fe68561d60c1c0dbcb6fbcbf1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -88,8 +88,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 6bba02942a713daf5be6faa0344e355e2f211f8d..003350cd7a01e284d28ff8904a38ab3755e07642 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -83,8 +83,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) @@ -149,11 +149,11 @@ class TestCase1(TestBase): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index 4cbbc9e478d2e8b3719c01562e57780cede1049b..9915a7a1fd89f91f71814ebbe577abcf9327cb37 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -102,8 +102,8 @@ class TestTopKOp(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog,
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 715f81b684e35b1aad955883f1198696e42d4c04..77d2f4131014965ef6cfba9cdf5efffa34c2dbc6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -78,8 +78,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index 9d8cab6a697081d86635fd25eeb4c1c0e9f82fce..75ed5a07315c775e4ea3e105a1fa4d6731666c70 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -76,8 +76,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py index 801794804da0a5e3df210decbfd20bbca6f450c7..fabad936decb975214f0416000d77c76a2f7ddfb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py @@ -91,8 +91,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list - ipu_strategy = compiler.get_ipu_strategy() - ipu_strategy.is_training = self.is_training + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.SetGraphConfig(is_training=self.is_training) program = compiler.IPUCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 1c5b640fe4b0bd3a2e401541ee945e64c131dd32..505060e31a0a27be64e7755cbb25844ebefe66df 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -36,7 +36,7 @@ class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): for shape in [[32], [batch, 32], [batch, 32, 32], [batch, 32, 16, 32]]: for op_type in ["elementwise_add", "elementwise_mul"]: - for axis in [len(shape) - 1, -1]: + for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) dics = [{"axis": axis}] ops_config = [{ @@ -129,33 +129,7 @@ class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): True), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.dims == 2 and len(self.dynamic_shape.max_input_shape) == 0: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape are not equal between 
gpu and tensorrt when input dim is 2." ) - - def teller2(program_config, predictor_config): - if self.dims == 3: - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and tensorrt when input dim is 3.") - - def teller3(program_config, predictor_config): - if self.dims == 4: - return True - return False - - self.add_skip_case( - teller3, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and tensorrt when input dim is 4.") + pass def test(self): self.add_skip_trt_case() @@ -287,15 +261,7 @@ class TrtConvertElementwiseTest_two_input_without_broadcast( yield self.create_inference_config(), (1, 3), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.dims == 2: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape are not equal between gpu and tensorrt when input dim is 2." - ) + pass def test(self): self.add_skip_trt_case() @@ -418,15 +384,7 @@ class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest): yield self.create_inference_config(), (1, 3), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len(self.shape1) == 2: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape are not equal between gpu and tensorrt when input dim is 2." - ) + pass def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py index ba648042dabf755326e709a46218caa9d857b506..2e1e04870b926b05e7191b335aa6403a6380a68d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py @@ -36,26 +36,32 @@ class TrtConvertReduceMeanTest(TrtLayerAutoScanTest): return False if len(attrs[0]["dim"]) == 0: return False - ## skip not use - if attrs[0]["out_dtype"] != -1: - return False + + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + if attrs[0]['out_dtype'] == 2: + return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): - return np.random.random([1, 3, 64, 64]).astype(np.float32) + def generate_input1(dtype, attrs: List[Dict[str, Any]]): + if dtype == -1 or dtype == 5: + return np.random.random([1, 3, 64, 64]).astype(np.float32) + elif dtype == 2: + return np.random.random([1, 3, 64, 64]).astype(np.int32) - for keep_dim in [False, True]: + for keep_dim in [True, False]: for dim in [[], [1], [0], [0, 1], [1, 2, 3], [-2, 0, 3], [-3], [-4, 1], [3, 4, 5]]: - for reduce_all in [False, True]: - for out_dtype in [-1, 0, 1]: + for reduce_all in [True, False]: + for out_dtype in [-1, 2, 5]: dics = [{ "keep_dim": keep_dim, "dim": dim, "reduce_all": reduce_all, - "out_dtype": out_dtype + "out_dtype": out_dtype, + "in_dtype": out_dtype, }, {}] ops_config = [{ @@ -75,7 +81,7 @@ class TrtConvertReduceMeanTest(TrtLayerAutoScanTest): weights={}, inputs={ "input_data": TensorConfig(data_gen=partial( - generate_input1, dics)) + generate_input1, out_dtype, dics)) }, outputs=["reduce_output_data"]) @@ -134,16 +140,6 @@ class TrtConvertReduceMeanTest(TrtLayerAutoScanTest): pass def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - 
if program_config.ops[0].attrs['out_dtype'] != -1: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "NOT Implemented: we will add out_dtype not equal to -1 in the future" - ) - pass def test(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py index ba0f61a2768988505856a0cdf2d481e182384110..2a7e673d4203a9f73b543e32a15b3a2479f5b14e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py @@ -37,26 +37,27 @@ class TrtConvertReduceSumTest(TrtLayerAutoScanTest): return False if len(attrs[0]["dim"]) == 0: return False - ## skip not use - if attrs[0]["out_dtype"] != -1: - return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): - return np.random.random([1, 3, 64, 64]).astype(np.float32) + def generate_input1(dtype, attrs: List[Dict[str, Any]]): + if dtype == -1 or dtype == 5: + return np.random.random([1, 3, 64, 64]).astype(np.float32) + elif dtype == 2: + return np.random.random([1, 3, 64, 64]).astype(np.int32) - for keep_dim in [False, True]: + for keep_dim in [True, False]: for dim in [[], [1], [0], [0, 1], [1, 2, 3], [-2, 0, 3], [-3], [-4, 1], [3, 4, 5]]: - for reduce_all in [False, True]: - for out_dtype in [-1, 0, 1]: + for reduce_all in [True, False]: + for out_dtype in [-1, 2, 5]: dics = [{ "keep_dim": keep_dim, "dim": dim, "reduce_all": reduce_all, - "out_dtype": out_dtype + "out_dtype": out_dtype, + "in_dtype": out_dtype, }, {}] ops_config = [{ @@ -76,7 +77,7 @@ class TrtConvertReduceSumTest(TrtLayerAutoScanTest): weights={}, inputs={ "input_data": TensorConfig(data_gen=partial( - generate_input1, dics)) + generate_input1, out_dtype, dics)) }, outputs=["reduce_output_data"]) @@ -134,16 +135,6 @@ class TrtConvertReduceSumTest(TrtLayerAutoScanTest): pass def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['out_dtype'] != -1: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "NOT Implemented: we will add out_dtype not equal to -1 in the future" - ) - pass def test(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..2150e06381fac37e527d9d593f8752eb38ba1596 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py @@ -0,0 +1,702 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
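The reference helpers in the new MLU test below reduce batch_norm inference to elementwise NumPy arithmetic. As a reader aid, a minimal, self-contained sketch of that formula for NHWC layout (function and variable names here are illustrative, not part of the patch):

import numpy as np

def batch_norm_infer_nhwc(x, scale, offset, mean, var, epsilon=1e-5):
    # Normalize with the running statistics, then apply the affine transform.
    # NumPy broadcasting over the trailing channel axis handles NHWC directly.
    normalized = (x - mean) / np.sqrt(var + epsilon)
    return normalized * scale + offset

c = 3
x = np.random.random_sample((2, 4, 4, c)).astype(np.float32) - 0.5
y = batch_norm_infer_nhwc(x, np.ones(c, np.float32), np.zeros(c, np.float32),
                          np.zeros(c, np.float32), np.ones(c, np.float32))
assert y.shape == x.shape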
+ +from __future__ import print_function + +import os +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import sys +sys.path.append('..') +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +from paddle.fluid import Program, program_guard + +_set_use_system_allocator(True) + + +def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + + if data_format == "NCHW": + n, c, h, w = x.shape + mean_tile = np.reshape(mean, (1, c, 1, 1)) + mean_tile = np.tile(mean_tile, (n, 1, h, w)) + var_tile = np.reshape(var, (1, c, 1, 1)) + var_tile = np.tile(var_tile, (n, 1, h, w)) + normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon) + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + offset_tile = np.reshape(offset, (1, c, 1, 1)) + offset_tile = np.tile(offset_tile, (n, 1, h, w)) + y = normalized * scale_tile + offset_tile + elif data_format == "NHWC": + normalized = (x - mean) / np.sqrt(var + epsilon) + y = normalized * scale + offset + else: + raise ValueError("Unknown data order.") + + if len(x_shape) == 2 or len(x_shape) == 3: + y = np.reshape(y, x_shape) + return y + + +def _cal_mean_variance(x, epsilon, data_format): + assert data_format in ['NCHW', 'NHWC'] + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + x_square = x * x + axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2) + C = x.shape[1] if data_format == 'NCHW' else x.shape[-1] + x_square_sum = np.sum(x_square, axis) + x_sum = np.sum(x, axis=axis) + element_count = np.size(x) / C + mean = x_sum / element_count + var = x_square_sum / element_count - mean * mean + return mean, var + + +def _reference_training(x, scale, offset, epsilon, data_format): + x_shape = x.shape + + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + + if data_format == "NCHW": + n, c, h, w = x.shape + x_square = x * x + x_square_sum = np.sum(x_square, (0, 2, 3)) + x_sum = np.sum(x, axis=(0, 2, 3)) + element_count = np.size(x) / int(np.shape(x)[1]) + mean = x_sum / element_count + var = x_square_sum / element_count - mean * mean + mean_tile = np.reshape(mean, (1, c, 1, 1)) + mean_tile = np.tile(mean_tile, (n, 1, h, w)) + var_tile = np.reshape(var, (1, c, 1, 1)) + var_tile = np.tile(var_tile, (n, 1, h, w)) + normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon) + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + offset_tile = np.reshape(offset, (1, c, 1, 1)) + offset_tile = np.tile(offset_tile, (n, 1, h, w)) + y = normalized * scale_tile + offset_tile + elif data_format == "NHWC": + x_square = x * x + x_square_sum = np.sum(x_square, (0, 1,
2)) + x_sum = np.sum(x, axis=(0, 1, 2)) + element_count = np.size(x) / int(np.shape(x)[-1]) + mean = x_sum / element_count + var = x_square_sum / element_count - mean * mean + normalized = (x - mean) / np.sqrt(var + epsilon) + y = normalized * scale + offset + else: + raise ValueError("Unknown data order.") + + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + return y, mean, var + + +def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): + # Use the following formulas to calculate gradients: + # grad_scale = + # sum(grad_y * (x - mean)) * rsqrt(var + epsilon) + # + # grad_offset = sum(grad_y) + # + # x_grad = + # 1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) - + # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) + + # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation + if data_format != "NCHW" and data_format != "NHWC": + raise ValueError("Unknown data order.") + + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], 1, x_shape[2])) + + if data_format == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) + + x_grad = scale * (y_grad - np.mean( + y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean( + y_grad * (x - mean), axis=(0, 1, 2)) / + (var + epsilon)) / np.sqrt(var + epsilon) + grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon), + axis=(0, 1, 2)) + grad_offset = np.sum(y_grad, axis=(0, 1, 2)) + + # transfer back to N, C, H, W + if data_format == "NCHW": + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) + x = np.transpose(x, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + + return x_grad, grad_scale, grad_offset + + +def create_or_get_tensor(scope, var_name, var, place): + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) + tensor.set_recursive_sequence_lengths([]) + tensor.set(var, place) + return tensor + + +def set_output_grad(scope, outputs, place, feed_dict=None): + def __set_tensor__(name, data=None): + out_tensor = scope.find_var(name).get_tensor() + grad_tensor = scope.var(grad_var_name(name)).get_tensor() + out_dtype = out_tensor.dtype() + if data is None: + if out_dtype == core.VarDesc.VarType.FP64: + data = np.ones(out_tensor.shape(), dtype=np.float64) + elif out_dtype == core.VarDesc.VarType.FP32: + data = np.ones(out_tensor.shape(), dtype=np.float32) + else: + raise ValueError("Not supported data type " + str(out_dtype)) + grad_tensor.set(data, place) + + for output in outputs: + data = None + if feed_dict is not None and output in feed_dict: + data = feed_dict[output] + __set_tensor__(output, data) + + +class TestBatchNormOpInference(unittest.TestCase): + def setUp(self): + self.dtype = np.float32 + self.fuse_with_relu = False + self.init_kernel_type() + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_with_place(self, place, data_layout, dtype, shape): + epsilon = 0.00001 + if len(shape) == 2: + x_shape = shape + c = x_shape[1] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_layout == "NHWC": + x_shape = [n, h, w, c] + elif data_layout ==
"NCHW": + x_shape = [n, c, h, w] + else: + raise ValueError("Unknown data layout.") + scale_shape = [c] + + x_val = np.random.random_sample(x_shape).astype(dtype) + # generate some negative values to test case with relu fused + x_val = x_val - 0.5 + scale_val = np.random.random_sample(scale_shape).astype(np.float32) + bias_val = np.random.random_sample(scale_shape).astype(np.float32) + + mean = np.zeros(scale_shape).astype(np.float32) + variance = np.ones(scale_shape).astype(np.float32) + + y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance, + epsilon, data_layout).astype(dtype) + if self.fuse_with_relu: + y_out = np.maximum(y_out, 0) + + scope = core.Scope() + + # create input + x_tensor = create_or_get_tensor(scope, "x_val", + OpTest.np_dtype_to_fluid_dtype(x_val), + place) + scale_tensor = create_or_get_tensor( + scope, "scale_val", + OpTest.np_dtype_to_fluid_dtype(scale_val), place) + bias_tensor = create_or_get_tensor( + scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place) + mean_tensor = create_or_get_tensor(scope, "mean", + OpTest.np_dtype_to_fluid_dtype(mean), + place) + variance_tensor = create_or_get_tensor( + scope, "variance", OpTest.np_dtype_to_fluid_dtype(variance), place) + + # create output + y_tensor = create_or_get_tensor(scope, "y_out", None, place) + saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None, + place) + saved_variance_tensor = create_or_get_tensor(scope, "saved_variance", + None, place) + mean_out_tensor = mean_tensor + variance_out_tensor = variance_tensor + + batch_norm_op = Operator( + "batch_norm", + # inputs + X="x_val", + Scale="scale_val", + Bias="bias_val", + Mean="mean", + Variance="variance", + # outputs + Y="y_out", + MeanOut="mean", + VarianceOut="variance", + SavedMean="saved_mean", + SavedVariance="saved_variance", + # attrs + is_test=True, + data_layout=data_layout, + use_mkldnn=False, + fuse_with_relu=self.fuse_with_relu, + epsilon=epsilon) + + batch_norm_op.run(scope, place) + + # check inference result + self.__assert_close( + y_tensor, + y_out, + "inference output are different at " + str(place) + ", " + + data_layout + ", " + str(np.dtype(dtype)) + + str(np.array(y_tensor)) + str(y_out), + atol=1e-3) + + def test_check_output(self): + places = [core.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(core.MLUPlace(0)) + + for place in places: + for data_format in ["NCHW", "NHWC"]: + self.check_with_place(place, data_format, self.dtype, + [2, 3, 4, 5]) + self.check_with_place(place, data_format, self.dtype, [2, 3]) + + def init_kernel_type(self): + pass + + +class TestFP16BatchNormOpInference(TestBatchNormOpInference): + def setUp(self): + self.dtype = np.float16 + self.fuse_with_relu = False + self.init_kernel_type() + + def test_check_output(self): + places = [] + if core.is_compiled_with_mlu(): + places.append(core.MLUPlace(0)) + + for place in places: + for data_format in ["NCHW", "NHWC"]: + self.check_with_place(place, data_format, self.dtype, + [2, 3, 4, 5]) + self.check_with_place(place, data_format, self.dtype, [2, 3]) + + +class TestBatchNormOpTraining(unittest.TestCase): + def setUp(self): + self.fuse_with_relu = False + self.data_formats = ["NCHW", "NHWC"] + self.momentum = 0.9 + self.use_momentum_variable = False + self.epsilon = 0.00001 + self.init_kernel_type() + self.init_test_case() + + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 
'scale@GRAD', 'bias@GRAD' + ] + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, + epsilon, momentum, shape, data_layout): + # run forward + y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon, + data_layout) + mean_out = saved_mean * (1. - momentum) + momentum * mean + variance_out = var_ref * (1. - momentum) + momentum * variance + saved_variance = 1. / np.sqrt(var_ref + epsilon) + # run backward + x_grad, scale_grad, bias_grad = _reference_grad( + x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout) + + return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad + + def set_mean_variance(self, scale_shape, x, data_layout): + mean, variance = _cal_mean_variance(x, self.epsilon, data_layout) + mean_pre = np.zeros(scale_shape).astype(np.float32) + variance_pre = np.ones(scale_shape).astype(np.float32) + # computing global mean/variance for one step + if self.use_global_stats: + mom = self.momentum + mean = mean * (1. - mom) + mom * mean_pre + variance = variance * (1. - mom) + mom * variance_pre + return mean, variance + + def test_forward_backward(self): + def test_with_place(place, data_layout, shape): + # attr + epsilon = self.epsilon + momentum = self.momentum + if data_layout == "NCHW": + n, c, h, w = shape[0], shape[1], shape[2], shape[3] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + scale_shape = [c] + + np.random.seed(123) + x = np.random.random_sample(shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) + mean, variance = self.set_mean_variance(scale_shape, x, data_layout) + y_grad = np.random.random_sample(shape).astype(np.float32) + momentum_var = np.array([momentum]).astype(np.float32) + + y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( + x, y_grad, scale, bias, mean, variance, epsilon, momentum, + shape, data_layout) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + var_dict['x@GRAD'] = x_grad + var_dict['scale@GRAD'] = scale_grad + var_dict['bias@GRAD'] = bias_grad + + var_names = [ + 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean', + 'saved_variance', 'momentum_var' + ] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + inputs = { + "X": block.var('x'), + "Scale": block.var('scale'), + "Bias": block.var('bias'), + "Mean": block.var('mean'), + "Variance": block.var('variance') + } + attrs = { + "epsilon": epsilon, + "is_test": False, + "data_layout": data_layout, + "use_mkldnn": False, + "fuse_with_relu": self.fuse_with_relu, + "use_global_stats": self.use_global_stats + } + if self.use_momentum_variable: + inputs['MomentumTensor'] = block.var('momentum_var') + else: + attrs['momentum'] = momentum + + outputs = { + "Y": block.var('y'), + "MeanOut": block.var('mean'), # share memory + "VarianceOut": block.var('variance'), # share memory + "SavedMean": block.var('saved_mean'), + "SavedVariance": block.var('saved_variance') + } + block.create_var(name="reserve_space", dtype='float32') + outputs["ReserveSpace"] = block.var('reserve_space') + bn_op = 
block.append_op( + type="batch_norm", + inputs=inputs, + outputs=outputs, + attrs=attrs) + block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + bn_op.desc, self.no_grad_set, []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + program._sync_with_cpp() + + exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in [ + 'x', 'scale', 'bias', 'mean', 'variance', + 'y@GRAD', 'momentum_var' + ] + }, + fetch_list=self.fetch_list) + + for id, name in enumerate(self.fetch_list): + if name == 'variance': + self.__assert_close( + var_dict[name], out[id], name, atol=1e-3) + continue + self.__assert_close(var_dict[name], out[id], name) + print("op test forward passed: ", str(place), data_layout) + + places = [core.CPUPlace()] + + if core.is_compiled_with_mlu(): + places.append(core.MLUPlace(0)) + + for place in places: + for data_format in self.data_formats: + test_with_place(place, data_format, [2, 3, 4, 5]) + + def init_kernel_type(self): + pass + + +class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set(['scale@GRAD', 'bias@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD'] + + +class TestBatchNormOpTrainingCase2(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ] + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = "1" + + +class TestBatchNormOpTrainingCase3(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set(['x@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'scale@GRAD', 'bias@GRAD'] + + +class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining): + def init_test_case(self): + self.use_momentum_variable = True + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ] + + +class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD' + ] + + def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): + if data_format == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) + + x_grad = scale * y_grad / np.sqrt(var + epsilon) + grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon), + axis=(0, 1, 2)) + grad_offset = np.sum(y_grad, axis=(0, 1, 2)) + + # transfer back to N, C, H, W + if data_format == "NCHW": + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) + x = np.transpose(x, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + + return x_grad, grad_scale, grad_offset + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, 
+ epsilon, momentum, shape, data_layout): + if data_layout != "NCHW" and data_layout != "NHWC": + raise ValueError("Unknown data order.") + + if data_layout == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + + # run normalization + normalized = (x - mean) / np.sqrt(variance + epsilon) + y = normalized * scale + bias + + # transfer back to N, C, H, W + if data_layout == "NCHW": + x = np.transpose(x, (0, 3, 1, 2)) + y = np.transpose(y, (0, 3, 1, 2)) + + mean_out = mean + variance_out = variance + saved_variance = 1. / np.sqrt(variance + epsilon) + # run backward + x_grad, scale_grad, bias_grad = self.reference_grad( + x, y_grad, scale, mean, variance, epsilon, data_layout) + + return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad + + +class TestBatchNormOpFreezeStatsAndScaleBiasTraining( + TestBatchNormOpFreezeStatsTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set(['scale@GRAD', 'bias@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD'] + + +class TestBatchNormOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of batch_norm must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.batch_norm, x1) + + # the input dtype of batch_norm must be float16 or float32 or float64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, fluid.layers.batch_norm, x2) + + +class TestDygraphBatchNormAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + batch_norm = fluid.dygraph.BatchNorm(10) + # the input of BatchNorm must be Variable. 
+ x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, batch_norm, x1) + + # the input dtype of BatchNorm must be float16 or float32 or float64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, batch_norm, x2) + + +class TestDygraphBatchNormTrainableStats(unittest.TestCase): + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute(x, False, False) + y2 = compute(x, True, True) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute(x, False, False) + y2 = compute(x, True, True) + self.assertTrue(np.allclose(y1, y2)) + + +class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): + def test_reservespace(self): + with program_guard(Program(), Program()): + paddle.enable_static() + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + x = fluid.data(name='x', shape=x.shape, dtype=x.dtype) + # Set this FLAG, the BatchNorm API will pass "reserve_space" argument into batch_norm op. + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' + batch_norm = fluid.dygraph.BatchNorm(7, data_layout="NHWC") + hidden1 = batch_norm(x) + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '0' + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..f608344f6e0363864a76a23f8d8c10dace130149 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py @@ -0,0 +1,295 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
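The v2 test file below exercises the paddle.nn.BatchNorm1D/2D/3D layers, largely by checking that a channel-last layer matches a channel-first layer applied to transposed input. A hedged sketch of that equivalence, runnable on CPU with a recent Paddle build (shapes and tolerance here are illustrative):

import numpy as np
import paddle

x = paddle.randn([2, 6, 6, 4])                       # NHWC input
net1 = paddle.nn.BatchNorm2D(4, data_format="NHWC")
net2 = paddle.nn.BatchNorm2D(4)                      # default NCHW
net2.weight = net1.weight
net2.bias = net1.bias

y1 = net1(x)
# Transpose to NCHW, normalize, then transpose the result back to NHWC.
y2 = paddle.transpose(net2(paddle.transpose(x, [0, 3, 1, 2])), [0, 2, 3, 1])
assert np.allclose(y1.numpy(), y2.numpy(), atol=1e-5)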
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import sys +sys.path.append("..") +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestBatchNorm(unittest.TestCase): + def test_name(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + with fluid.dygraph.guard(p): + batch_norm1d = paddle.nn.BatchNorm1D(1, name="test") + + def test_error(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + #paddle.disable_static() + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + + def error1d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1D(1, data_format='NCDHW') + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d_dataformat(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2D(1, data_format='NCDHW') + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3D(1, data_format='NCL') + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + + def error1d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1D(1) + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2D(1) + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3D(1) + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + + with fluid.dygraph.guard(p): + self.assertRaises(ValueError, error1d) + self.assertRaises(ValueError, error2d) + self.assertRaises(ValueError, error3d) + self.assertRaises(ValueError, error1d_dataformat) + self.assertRaises(ValueError, error2d_dataformat) + self.assertRaises(ValueError, error3d_dataformat) + + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2D(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v3(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0), + trainable=False), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.0), + trainable=False), + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v4(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2D( + shape[1], 
weight_attr=False, bias_attr=False) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + y3 = compute_v3(x, False, False) + y4 = compute_v4(x) + self.assertTrue(np.allclose(y1, y2)) + self.assertTrue(np.allclose(y3, y4)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + places.append(fluid.MLUPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + bn = paddle.nn.BatchNorm2D(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +class TestBatchNormChannelLast(unittest.TestCase): + def setUp(self): + self.original_dtype = paddle.get_default_dtype() + paddle.set_default_dtype("float32") + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + self.places.append(fluid.MLUPlace(0)) + + def tearDown(self): + paddle.set_default_dtype(self.original_dtype) + + def test_1d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 4]) + net1 = paddle.nn.BatchNorm1D(4, data_format="NLC") + net2 = paddle.nn.BatchNorm1D(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 2, 1]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 1]) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-07), True) + + def test_2d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 4]) + net1 = paddle.nn.BatchNorm2D(4, data_format="NHWC") + net2 = paddle.nn.BatchNorm2D(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 3, 1, 2]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 3, 1]) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-07), True) + + def test_3d(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 6, 4]) + net1 = paddle.nn.BatchNorm3D(4, data_format="NDHWC") + net2 = paddle.nn.BatchNorm3D(4) + net2.weight = net1.weight + net2.bias = net1.bias + y1 = net1(x) + channel_first_x = paddle.transpose(x, [0, 4, 1, 2, 3]) + y2 = net2(channel_first_x) + y2 = paddle.transpose(y2, [0, 2, 3, 4, 1]) + self.assertEqual( + np.allclose( + y1.numpy(), y2.numpy(), atol=1e-07), True) + + +class TestBatchNormUseGlobalStats(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_mlu(): + self.places.append(fluid.MLUPlace(0))
+ self.init_test() + + ### train mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = False + + def test_global_stats(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 4]) + net1 = paddle.fluid.dygraph.BatchNorm( + 6, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0)), + use_global_stats=self.use_global_stats, + trainable_statistics=self.trainable_statistics) + net2 = paddle.nn.BatchNorm2D( + 6, use_global_stats=self.use_global_stats) + net2.weight = net1.weight + net2.bias = net1.bias + if self.trainable_statistics == True: + net1.training = False + net2.training = False + y1 = net1(x) + y2 = net2(x) + self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + + +class TestBatchNormUseGlobalStatsCase1(TestBatchNormUseGlobalStats): + ### test mode + def init_test(self): + self.use_global_stats = False + self.trainable_statistics = True + + +class TestBatchNormUseGlobalStatsCase2(TestBatchNormUseGlobalStats): + ### train mode + def init_test(self): + self.use_global_stats = False + self.trainable_statistics = False + + +class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats): + ### test mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = True + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..3bfa96b70011238b48f55d628ad17794b84ff5de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_concat_op_mlu.py @@ -0,0 +1,223 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
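The concat tests below build their expected outputs with np.concatenate after normalizing a possibly negative axis. A small NumPy sketch of that normalization (function and variable names are illustrative):

import numpy as np

def concat_ref(inputs, axis):
    # Map a negative axis onto its non-negative equivalent (clamped at 0),
    # exactly as the tests' setUp does before calling np.concatenate.
    actual_axis = axis + len(inputs[0].shape) if axis < 0 else axis
    actual_axis = actual_axis if actual_axis > 0 else 0
    return np.concatenate(inputs, axis=actual_axis)

xs = [np.random.random((5, i + 1, 4)).astype(np.float32) for i in range(3)]
assert concat_ref(xs, -2).shape == (5, 6, 4)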
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestConcatOp(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "concat" + self.place = paddle.device.MLUPlace(0) + self.init_dtype() + self.init_test_data() + + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis} + if self.axis < 0: + self.actual_axis = self.axis + len(self.x0.shape) + self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + else: + self.actual_axis = self.axis + + self.outputs = { + 'Out': np.concatenate( + (self.x0, self.x1, self.x2), axis=self.actual_axis) + } + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['x0', 'x2'], 'Out') + self.check_grad_with_place(self.place, ['x1'], 'Out') + self.check_grad_with_place(self.place, ['x2'], 'Out') + + def init_test_data(self): + self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) + self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) + self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) + self.axis = 0 + + +class TestConcatOp2(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.axis = 1 + + +@skip_check_grad_ci( + reason="The function 'check_grad' for large inputs is too slow.") +class TestConcatOp3(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype) + self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.x2 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.axis = 1 + + def test_check_grad(self): + pass + + +@skip_check_grad_ci( + reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015." 
+)
+class TestConcatOp4(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype)
+        self.x2 = np.random.random((0, 3, 4, 5)).astype(self.dtype)
+        self.axis = 0
+
+    def test_check_grad(self):
+        pass
+
+
+class TestConcatOp5(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype)
+        self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype)
+        self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype)
+        self.axis = -3
+
+
+#----------------Concat Fp16----------------
+def create_test_fp16(parent):
+    class TestConcatFp16(parent):
+        def init_dtype(self):
+            self.dtype = np.float16
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestConcatFp16.__name__ = cls_name
+    globals()[cls_name] = TestConcatFp16
+
+
+create_test_fp16(TestConcatOp)
+create_test_fp16(TestConcatOp2)
+create_test_fp16(TestConcatOp3)
+create_test_fp16(TestConcatOp4)
+create_test_fp16(TestConcatOp5)
+
+
+#----------------Concat Int64----------------
+def create_test_int64(parent):
+    class TestConcatInt64(parent):
+        def init_dtype(self):
+            self.dtype = np.int64
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Int64")
+    TestConcatInt64.__name__ = cls_name
+    globals()[cls_name] = TestConcatInt64
+
+
+create_test_int64(TestConcatOp)
+create_test_int64(TestConcatOp2)
+create_test_int64(TestConcatOp3)
+create_test_int64(TestConcatOp4)
+create_test_int64(TestConcatOp5)
+
+
+#----------------Concat Int32----------------
+def create_test_int32(parent):
+    class TestConcatInt32(parent):
+        def init_dtype(self):
+            self.dtype = np.int32
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Int32")
+    TestConcatInt32.__name__ = cls_name
+    globals()[cls_name] = TestConcatInt32
+
+
+create_test_int32(TestConcatOp)
+create_test_int32(TestConcatOp2)
+create_test_int32(TestConcatOp3)
+create_test_int32(TestConcatOp4)
+create_test_int32(TestConcatOp5)
+
+
+#----------------Concat AxisTensor----------------
+def create_test_AxisTensor(parent):
+    class TestConcatAxisTensor(parent):
+        def setUp(self):
+            self.op_type = "concat"
+            # note: init_dtype() sets self.dtype in place and returns None,
+            # so its result must not be assigned back to self.dtype
+            self.init_dtype()
+            self.init_test_data()
+
+            self.inputs = {
+                'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)],
+                'AxisTensor': np.array([self.axis]).astype("int32")
+            }
+            self.attrs = {}
+
+            if self.axis < 0:
+                self.actual_axis = self.axis + len(self.x0.shape)
+                self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0
+            else:
+                self.actual_axis = self.axis
+
+            self.outputs = {
+                'Out': np.concatenate(
+                    (self.x0, self.x1, self.x2), axis=self.actual_axis)
+            }
+
+            self.place = paddle.device.MLUPlace(0)
+            self.__class__.use_mlu = True
+
+        def init_test_data(self):
+            self.x0 = np.random.random((1, 4, 50)).astype(self.dtype)
+            self.x1 = np.random.random((2, 4, 50)).astype(self.dtype)
+            self.x2 = np.random.random((3, 4, 50)).astype(self.dtype)
+            self.axis = 0
+
+        def init_dtype(self):
+            self.dtype = np.float32
+
+    cls_name = "{0}_{1}".format(parent.__name__, "AxisTensor")
+    TestConcatAxisTensor.__name__ = cls_name
+    globals()[cls_name] = TestConcatAxisTensor
+
+
+create_test_AxisTensor(TestConcatOp)
+create_test_AxisTensor(TestConcatOp2)
+create_test_AxisTensor(TestConcatOp3)
+create_test_AxisTensor(TestConcatOp4)
+create_test_AxisTensor(TestConcatOp5)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..b09d892554bab6dc2951a72d72773935e5f60ddb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_mlu.py
@@ -0,0 +1,555 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest
+
+from test_conv2d_op import conv2d_forward_naive
+
+paddle.enable_static()
+
+
+def create_test_channel_last_class(parent):
+    class TestChannelLastCase(parent):
+        def init_data_format(self):
+            self.data_format = "NHWC"
+
+        def init_test_case_2(self):
+            N, C, H, W = self.input_size
+            self.input_size = [N, H, W, C]
+
+    cls_name = "{0}_{1}".format(parent.__name__, "ChannelLast")
+    TestChannelLastCase.__name__ = cls_name
+    globals()[cls_name] = TestChannelLastCase
+
+
+def create_test_padding_SAME_class(parent):
+    class TestPaddingSAMECase(parent):
+        def init_paddings(self):
+            self.pad = [0, 0]
+            self.padding_algorithm = "SAME"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp")
+    TestPaddingSAMECase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingSAMECase
+
+
+def create_test_padding_VALID_class(parent):
+    class TestPaddingVALIDCase(parent):
+        def init_paddings(self):
+            self.pad = [1, 1]
+            self.padding_algorithm = "VALID"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp")
+    TestPaddingVALIDCase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingVALIDCase
+
+
+def create_test_fp16_class(parent):
+    class TestFp16Case(parent):
+        def init_dtype(self):
+            self.dtype = np.float16
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestFp16Case
+
+
+class TestConv2DOp(OpTest):
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def init_data_format(self):
+        self.data_format = "NCHW"
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "conv2d"
+        self.init_data_format()
+        self.init_dtype()
+        self.init_group()
+        self.init_dilation()
+        self.init_test_case()
+
+        conv2d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilation': self.dilations
+        }
+
+        input = np.random.random(self.input_size).astype(self.dtype)
+        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
+
+        output, _, _, _, _ = conv2d_forward_naive(
+            input,
+            filter,
+            self.groups,
+            conv2d_param,
+            data_format=self.data_format)
+        output = output.astype(self.dtype)
+
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+        }
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations,
+            'data_format': self.data_format,
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-2)
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, {'Input', 'Filter'},
+            'Output',
+            max_relative_error=0.03,
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_filter(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Filter']),
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_input(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Filter'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Input']),
+            numeric_place=paddle.CPUPlace())
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+
+class TestWithPad(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithStride(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithGroup(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [18, f_c, 3, 3]
+
+
+class TestWith1x1(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        # FIXME: Supporting group = 3 in this case.
+        # NOTE(wangran16): There is an unknown error (acl error code is : 507015)
+        # when group = 3, which needs to be fixed.
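+        # (editorial note: the acl error code in the NOTE above appears to be
+        # inherited from the NPU port of this test; a failing CNNL call on
+        # MLU would surface as a CNNL status instead.)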
+        self.groups = 1
+
+
+class TestWithDepthWise5x5(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 4, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [8, f_c, 5, 5]
+
+    def init_group(self):
+        self.groups = 4
+
+
+class TestWithDepthWise7x7(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 8, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [16, f_c, 7, 7]
+
+    def init_group(self):
+        self.groups = 8
+
+
+class TestWithDilation(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [12, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [2, 2]
+
+    # TODO(MLU): Depthwise operation does not support dilation yet;
+    # it will throw an error of CNNL_STATUS_NOT_SUPPORTED.
+    # def init_group(self):
+    #     self.groups = 3
+
+
+class TestWithInput1x1Filter1x1(TestConv2DOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [100, 1, 1, 1]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+
+class TestConv2DOp_v2(OpTest):
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "conv2d"
+        self.dtype = np.float32
+        self.init_kernel_type()
+        self.init_group()
+        self.init_dilation()
+        self.init_data_format()
+        self.init_test_case()
+        self.init_paddings()
+        self.init_test_case_2()
+
+        conv2d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilation': self.dilations
+        }
+
+        input = np.random.random(self.input_size).astype(self.dtype)
+        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
+        output, _, _, _, _ = conv2d_forward_naive(
+            input, filter, self.groups, conv2d_param, self.padding_algorithm,
+            self.data_format)
+        output = output.astype(self.dtype)
+
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+        }
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'padding_algorithm': self.padding_algorithm,
+            'groups': self.groups,
+            'dilations': self.dilations,
+            'data_format': self.data_format,
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-2)
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, {'Input', 'Filter'},
+            'Output',
+            max_relative_error=0.02,
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_filter(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Input'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Filter']),
+            numeric_place=paddle.CPUPlace())
+
+    def test_check_grad_no_input(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad_with_place(
+            self.place, ['Filter'],
+            'Output',
+            no_grad_set=set(['Input']),
+            numeric_place=paddle.CPUPlace())
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 4, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_kernel_type(self):
+        pass
+
+    def init_paddings(self):
+        self.pad = [0, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+    def init_data_format(self):
+        self.data_format = "NCHW"
+
+    def init_test_case_2(self):
+        pass
+
+
+class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
+    def init_paddings(self):
+        self.pad = [0, 0, 1, 2]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithPad_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_paddings(self):
+        self.pad = [2, 1, 3, 2]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithStride_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_paddings(self):
+        self.pad = [2, 1, 3, 2]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithGroup_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [24, f_c, 4, 3]
+
+
+class TestWith1x1_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_paddings(self):
+        self.pad = [2, 2, 4, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDepthWise3x3_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [3, 4, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [16, f_c, 3, 3]
+
+    # TODO(MLU): Depthwise operation does not support dilation yet;
+    # it will throw an error of CNNL_STATUS_NOT_SUPPORTED.
+    # def init_dilation(self):
+    #     self.dilations = [2, 2]
+
+    def init_group(self):
+        self.groups = 4
+
+    def init_paddings(self):
+        self.pad = [1, 3, 2, 1]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDepthWise5x5_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 4, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [8, f_c, 5, 5]
+
+    def init_group(self):
+        self.groups = 4
+
+    def init_paddings(self):
+        self.pad = [0, 1, 1, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDepthWise7x7_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [2, 2]
+        self.input_size = [2, 8, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [16, f_c, 7, 7]
+
+    def init_group(self):
+        self.groups = 8
+
+    def init_paddings(self):
+        self.pad = [1, 3, 4, 1]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithDilation_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [24, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [2, 2]
+
+    # TODO(MLU): Depthwise operation does not support dilation yet;
+    # it will throw an error of CNNL_STATUS_NOT_SUPPORTED.
+    # def init_group(self):
+    #     self.groups = 3
+
+    def init_paddings(self):
+        self.pad = [0, 1, 3, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DOp_v2):
+    def init_test_case(self):
+        self.stride = [1, 1]
+        self.input_size = [100, 1, 1, 1]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [120, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_paddings(self):
+        self.pad = [0, 3, 4, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+
+create_test_padding_SAME_class(TestConv2DOp_AsyPadding)
+create_test_padding_SAME_class(TestWithPad_AsyPadding)
+create_test_padding_SAME_class(TestWithStride_AsyPadding)
+create_test_padding_SAME_class(TestWithGroup_AsyPadding)
+create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+create_test_padding_VALID_class(TestConv2DOp_AsyPadding)
+create_test_padding_VALID_class(TestWithPad_AsyPadding)
+create_test_padding_VALID_class(TestWithStride_AsyPadding)
+create_test_padding_VALID_class(TestWithGroup_AsyPadding)
+create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+create_test_channel_last_class(TestConv2DOp_AsyPadding)
+create_test_channel_last_class(TestWithPad_AsyPadding)
+create_test_channel_last_class(TestWithGroup_AsyPadding)
+create_test_channel_last_class(TestWith1x1_AsyPadding)
+create_test_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+create_test_fp16_class(TestConv2DOp_AsyPadding)
+create_test_fp16_class(TestWithPad_AsyPadding)
+create_test_fp16_class(TestWithStride_AsyPadding)
+create_test_fp16_class(TestWithGroup_AsyPadding)
+create_test_fp16_class(TestWithInput1x1Filter1x1_AsyPadding)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..6610127d382bd3a715b64ad359c500fefc595936
--- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py @@ -0,0 +1,453 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest, convert_float_to_uint16 + +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import numpy as np +from paddle.fluid import compiler, Program, program_guard + + +# Situation 1: Attr(shape) is a list(without tensor) +class TestFillConstantOp1(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'value': 3.8} + self.outputs = {'Out': np.full((123, 92), 3.8)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp2(OpTest): + def setUp(self): + '''Test fill_constant op with default value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92]} + self.outputs = {'Out': np.full((123, 92), 0.0)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp3(OpTest): + def setUp(self): + '''Test fill_constant op with specified int64 value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'value': 10000000000} + self.outputs = {'Out': np.full((123, 92), 10000000000)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp4(OpTest): + def setUp(self): + '''Test fill_constant op with specified int value + ''' + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'value': 3} + self.outputs = {'Out': np.full((123, 92), 3)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOpWithSelectedRows(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + # create Out Variable + out = scope.var('Out').get_selected_rows() + + # create and run fill_constant_op operator + fill_constant_op = Operator( + "fill_constant", shape=[123, 92], value=3.8, Out='Out') + fill_constant_op.run(scope, place) + + # get result from Out + result_array = np.array(out.get_tensor()) + full_array = np.full((123, 92), 3.8, 'float32') + + self.assertTrue(np.array_equal(result_array, full_array)) + + def test_fill_constant_with_selected_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + 
self.check_with_place(place) + + +# Situation 2: Attr(shape) is a list(with tensor) +class TestFillConstantOp1_ShapeTensorList(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape, 'value': self.value} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + self.value = 3.8 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp2_ShapeTensorList(OpTest): + def setUp(self): + '''Test fill_constant op with default value + ''' + self.op_type = "fill_constant" + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape} + self.outputs = {'Out': np.full(self.shape, 0.0)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, -1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantOp3_ShapeTensorList(TestFillConstantOp1_ShapeTensorList): + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.value = 10000000000 + + +class TestFillConstantOp4_ShapeTensorList(TestFillConstantOp1_ShapeTensorList): + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.value = 3 + + +# Situation 3: shape is a tensor +class TestFillConstantOp1_ShapeTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.attrs = {'value': self.value} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.value = 3.8 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Situation 4: value is a tensor +class TestFillConstantOp1_ValueTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = { + "ShapeTensor": np.array(self.shape).astype("int32"), + 'ValueTensor': np.array([self.value]).astype("float32") + } + self.attrs = {'value': self.value + 1.0} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + #self.shape = [123, 92] + self.shape = [2, 2] + self.value = 3.8 + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Situation 5: value is a tensor +class TestFillConstantOp2_ValueTensor(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.init_data() + + self.inputs = { + "ShapeTensor": np.array(self.shape).astype("int32"), + 'ValueTensor': 
np.array([self.value]).astype("int32") + } + self.attrs = {'value': self.value, 'dtype': 2} + self.outputs = {'Out': np.full(self.shape, self.value)} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.shape = [123, 92] + self.value = 3 + self.dtype = np.int32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Test python API +class TestFillConstantAPI(unittest.TestCase): + def test_api(self): + + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2) + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2) + + shape_tensor_int32 = fluid.data( + name="shape_tensor_int32", shape=[2], dtype="int32") + shape_tensor_int64 = fluid.data( + name="shape_tensor_int64", shape=[2], dtype="int64") + + out_1 = fluid.layers.fill_constant( + shape=[1, 2], dtype="float32", value=1.1) + + out_2 = fluid.layers.fill_constant( + shape=[1, positive_2_int32], dtype="float32", value=1.1) + + out_3 = fluid.layers.fill_constant( + shape=[1, positive_2_int64], dtype="float32", value=1.1) + + out_4 = fluid.layers.fill_constant( + shape=shape_tensor_int32, dtype="float32", value=1.1) + + out_5 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype="float32", value=1.1) + + out_6 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype=np.float32, value=1.1) + + val1 = fluid.layers.fill_constant( + shape=[1], dtype=np.float32, value=1.1) + val2 = fluid.layers.fill_constant( + shape=[1], dtype=np.float64, value=1.1) + out_7 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype=np.float32, value=val1) + + out_8 = fluid.layers.fill_constant( + shape=shape_tensor_int64, dtype=np.float32, value=val2) + + exe = fluid.Executor(place=fluid.CPUPlace()) + res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run( + fluid.default_main_program(), + feed={ + "shape_tensor_int32": np.array([1, 2]).astype("int32"), + "shape_tensor_int64": np.array([1, 2]).astype("int64"), + }, + fetch_list=[ + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8 + ]) + + assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_3, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_4, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_5, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32")) + assert np.array_equal(res_8, np.full([1, 2], 1.1, dtype="float32")) + + +class TestFillConstantImperative(unittest.TestCase): + def test_api(self): + with fluid.dygraph.guard(): + data1 = np.array([1, 2]).astype('int32') + data2 = np.array([1.1]).astype('float32') + data3 = np.array([88]).astype('int32') + shape = fluid.dygraph.to_variable(data1) + val = fluid.dygraph.to_variable(data2) + value = fluid.dygraph.to_variable(data3) + res1 = fluid.layers.fill_constant( + shape=[1, 2], dtype='float32', value=1.1) + res2 = fluid.layers.fill_constant( + shape=shape, dtype='float32', value=1.1) + res3 = fluid.layers.fill_constant( + shape=shape, dtype='float32', value=val) + res4 = fluid.layers.fill_constant( + shape=shape, dtype='int32', value=value) + assert np.array_equal( + res1.numpy(), np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + res2.numpy(), np.full( + [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + res3.numpy(), np.full( + [1, 2], 
1.1, dtype="float32")) + assert np.array_equal( + res4.numpy(), np.full( + [1, 2], 88, dtype="int32")) + + def test_nan(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.nan) + self.assertTrue(np.isnan(res.numpy().item(0))) + + def test_inf(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.inf) + self.assertTrue(np.isinf(res.numpy().item(0))) + + def test_ninf(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.NINF) + self.assertTrue(np.isinf(res.numpy().item(0))) + self.assertEqual(np.NINF, res.numpy().item(0)) + + +class TestFillConstantOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + #for ci coverage + x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16") + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1], + value=5, + dtype='uint4') + + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1.1], + value=5, + dtype='float32', + out=x1) + + # The argument dtype of fill_constant_op must be one of bool, float16, + #float32, float64, uint8, int16, int32 or int64 + x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32") + + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1], + value=5, + dtype='float64', + out=x2) + + x3 = np.random.randn(100, 100).astype('int32') + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[100, 100], + value=5, + dtype='float64', + out=x3) + + # The argument shape's type of fill_constant_op must be list, tuple or Variable. + def test_shape_type(): + fluid.layers.fill_constant(shape=1, dtype="float32", value=1) + + self.assertRaises(TypeError, test_shape_type) + + # The argument shape's size of fill_constant_op must not be 0. + def test_shape_size(): + fluid.layers.fill_constant(shape=[], dtype="float32", value=1) + + self.assertRaises(AssertionError, test_shape_size) + + # The shape dtype of fill_constant_op must be int32 or int64. + def test_shape_tensor_dtype(): + shape = fluid.data( + name="shape_tensor", shape=[2], dtype="float32") + fluid.layers.fill_constant( + shape=shape, dtype="float32", value=1) + + self.assertRaises(TypeError, test_shape_tensor_dtype) + + def test_shape_tensor_list_dtype(): + shape = fluid.data( + name="shape_tensor_list", shape=[1], dtype="bool") + fluid.layers.fill_constant( + shape=[shape, 2], dtype="float32", value=1) + + self.assertRaises(TypeError, test_shape_tensor_list_dtype) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..b8363545d228892c4c7209499caf13aec4b4805b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py @@ -0,0 +1,234 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +paddle.enable_static() +SEED = 2021 + + +class TestCase1(OpTest): + def setUp(self): + self.set_mlu() + self.set_example() + self.op_type = "split" + self.place = paddle.device.MLUPlace(0) + ipt = self.x.astype(self.dtype) + axis = self.axis if isinstance(self.axis, int) else int(self.axis[0]) + tmp_outs = np.split( + ipt, axis=axis, indices_or_sections=self.num_or_sections) + tmp_outs = [o.astype(self.dtype) for o in tmp_outs] + self.outputs = {'Out': []} + self.outs = [] + for i, o in enumerate(tmp_outs): + self.outputs["Out"].append((str(i), o)) + self.outs.append(str(i)) + + self.attrs = {"axis": self.axis, "num": self.num_or_sections} + self.inputs = {} + self.inputs.update({'X': ipt.astype(self.dtype)}) + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.op_type = "split" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_example(self): + self.dtype = "float32" + self.x = np.random.random((2, 4, 6)) + self.axis = 1 + self.num_or_sections = 2 + + +class TestCase2(TestCase1): + def set_example(self): + self.dtype = "float32" + self.x = np.random.random((20, 4, 50)) + self.axis = 0 + self.num_or_sections = 4 + + +class TestCase4(TestCase1): + def set_example(self): + self.dtype = "float16" + self.x = np.random.random((4, 50, 20)) + self.axis = 2 + self.num_or_sections = 4 + + +# Test Sections +class TestCase5(TestCase1): + def set_example(self): + super().set_example() + self.x = np.random.random((2, 10, 4)) + self.axis = 1 + self.num_or_sections = [2, 4, 8] + + def setUp(self): + super().setUp() + self.attrs.update({"sections": [2, 2, 4, 2], "num": 0}) + + +class API_TestSplit(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data = fluid.layers.data('data', shape=[-1, 10], dtype='float32') + x0, x1 = paddle.split(data, num_or_sections=(3, 7), axis=1) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + input1 = np.random.random([1, 10]).astype('float32') + r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) + ex_x0, ex_x1 = np.split(input1, (3, ), axis=1) + self.assertTrue(np.allclose(ex_x0, r0)) + self.assertTrue(np.allclose(ex_x1, r1)) + + +class API_TestSplit2(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data = fluid.layers.data('data', shape=[-1, 10], dtype='float32') + x0, x1 = paddle.split(data, num_or_sections=2, axis=1) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + input1 = np.random.random([1, 10]).astype('float32') + r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) + ex_x0, ex_x1 = np.split(input1, 2, axis=1) + self.assertTrue(np.allclose(ex_x0, r0)) + self.assertTrue(np.allclose(ex_x1, r1)) + + +class API_TestDygraphSplit(unittest.TestCase): + def test_out1(self): + with fluid.dygraph.guard(paddle.MLUPlace(0)): + input_1 = np.random.random([4, 6, 6]).astype("int32") + # input is a variable which shape is [4, 6, 6] + input = fluid.dygraph.to_variable(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1) + 
self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + def test_out2(self): + with fluid.dygraph.guard(paddle.MLUPlace(0)): + input_1 = np.random.random([4, 6, 6]).astype("int32") + # input is a variable which shape is [4, 6, 6] + input = fluid.dygraph.to_variable(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=[1, 2, 3], axis=1) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, (1, 3), axis=1) + self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + +# attr(axis) is Tensor +class TestSplitOp_AxisTensor(OpTest): + def setUp(self): + self._set_op_type() + self.dtype = self.get_dtype() + self.init_data() + self.inputs = { + 'X': self.x, + 'AxisTensor': np.array([self.axis]).astype("int32") + } + self.attrs = {'sections': self.sections, 'num': self.num} + + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = {'Out': [('out%d' % i, out[i]) \ + for i in range(len(out))]} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.x = np.random.random((4, 5, 6)).astype(self.dtype) + self.axis = 2 + self.sections = [] + self.num = 3 + self.indices_or_sections = 3 + + def get_dtype(self): + return "float" + + def _set_op_type(self): + self.op_type = "split" + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSplitOp_SectionsTensor(OpTest): + def setUp(self): + self._set_op_type() + self.dtype = self.get_dtype() + self.init_data() + self.inputs = {'X': self.x} + + sections_tensor = [] + for index, ele in enumerate(self.sections): + sections_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs['SectionsTensorList'] = sections_tensor + + self.attrs = { + 'axis': self.axis, + 'sections': self.sections_infer, + 'num': self.num + } + + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = {'Out': [('out%d' % i, out[i]) \ + for i in range(len(out))]} + + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def init_data(self): + self.x = np.random.random((4, 5, 6)).astype(self.dtype) + self.axis = 1 + self.sections = [2, 1, 2] + self.sections_infer = [-1, -1, -1] + self.num = 0 + self.indices_or_sections = [2, 3] + + def get_dtype(self): + return "float" + + def _set_op_type(self): + self.op_type = "split" + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py new file mode 100755 index 0000000000000000000000000000000000000000..e9db14de46ab58ebc300cc282f150244d02e0b48 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sum_op_mlu.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +paddle.enable_static() +SEED = 2021 + + +class TestSum1(OpTest): + def setUp(self): + self.set_mlu() + self.init_dtype() + self.op_type = "sum" + self.place = paddle.MLUPlace(0) + + x0 = np.random.random((3, 40)).astype(self.dtype) + x1 = np.random.random((3, 40)).astype(self.dtype) + x2 = np.random.random((3, 40)).astype(self.dtype) + self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2)]} + y = x0 + x1 + x2 + self.outputs = {'Out': y} + + self.attrs = {'use_mkldnn': False} + + def init_dtype(self): + self.dtype = np.float32 + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSum2(OpTest): + def setUp(self): + self.set_mlu() + self.init_dtype() + self.op_type = "sum" + self.place = paddle.MLUPlace(0) + + x0 = np.random.random((3, 3)).astype(self.dtype) + x1 = np.random.random((3, 3)).astype(self.dtype) + x2 = np.random.random((3, 3)).astype(self.dtype) + x3 = np.random.random((3, 3)).astype(self.dtype) + self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]} + # There will be a problem if just using `y=x0+x1+x2+x3` to calculate the + # summation result as the reference standard result. The reason is that + # numpy's fp16 data has precision loss when doing `add` operation. + # For example, the results of `x0+x1+x2+x3` is different from that of + # `x3+x2+x1+x0` if the dtype is fp16. + # Therefore, converting the input to fp32 for calculation. 
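+        # (editorial illustration, values chosen by the editor: at magnitude
+        # 1024 an fp16 ulp is 1.0, so
+        #   np.float16(1024) + np.float16(0.4) + np.float16(0.4) -> 1024.0
+        #   np.float16(0.4) + np.float16(0.4) + np.float16(1024) -> 1025.0
+        # which is why the reference sum is accumulated in fp32 below.)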
+ y = (x0.astype(np.float32) + x1.astype(np.float32) + + x2.astype(np.float32) + x3.astype(np.float32)).astype(self.dtype) + self.outputs = {'Out': y} + + self.attrs = {'use_mkldnn': False} + + def init_dtype(self): + self.dtype = np.float16 + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSum3(OpTest): + def setUp(self): + self.set_mlu() + self.init_dtype() + self.op_type = "sum" + self.place = paddle.MLUPlace(0) + + x0 = np.random.random((3, 3)).astype(self.dtype) + + self.inputs = {'X': [("x0", x0)]} + y = x0 + self.outputs = {'Out': y} + + self.attrs = {'use_mkldnn': False} + + def init_dtype(self): + self.dtype = np.float16 + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index b234e25823f4b370b9a4150ee3f8b7d635468952..a93abd3c1277681234209c27f54f0d019bf4e9df 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -27,7 +27,7 @@ from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER from paddle.distributed.auto_parallel.process_group import _g_process_group_map from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -143,7 +143,11 @@ def mlp_forward(train_program, start_program): return loss, train_program, start_program -def get_dist_prog(train_program, startup_program, dist_context, rank_id): +def get_dist_prog(train_program, + startup_program, + dist_context, + rank_id, + change_process_mesh=False): loss, train_program, startup_program = mlp_forward(train_program, startup_program) @@ -157,6 +161,12 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): complete_train_program = completer.complete_forward_annotation( train_program) + if change_process_mesh: + global PP_MESH_1 + dist_context.get_tensor_dist_attr_for_program( + train_program.global_block().vars[ + "gelu_0.tmp_0"]).process_mesh = PP_MESH_1 + params_grads = parallelizer._generate_backward( complete_train_program, startup_program, @@ -308,6 +318,25 @@ class TestMLPReshard(unittest.TestCase): # parameter initialization of every rank should be different in the pipeline scene self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + def test_mlp_pp_diff_process_mesh(self): + HAS_SENT.clear() + HAS_RECV.clear() + HAS_ALLGATHER.clear() + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 1 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id, True) + for key in list(_g_process_group_map.keys()): + del _g_process_group_map[key] + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + print_program_with_dist_attr(dist_main_prog, dist_context) + + # check send and recv result + 
self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" diff --git a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py index 79de17fdb66412b32164574acb6e3d9446e0f29b..b67dbd0ba622d9f5dda96fd448d08ea71eb999fa 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_trace_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_trace_op.py @@ -41,9 +41,6 @@ class TestEagerTraceOp(unittest.TestCase): paddle.fluid.framework._dygraph_tracer().trace_op( 'instance_norm', {'Scale': [scale], 'X': [x]}, {'Y': [x]}, {}) - paddle.fluid.framework._dygraph_tracer().trace_op( - 'coalesce_tensor', {'Input': [x]}, {'Output': [x]}, - {'dtype': int(core.VarDesc.VarType.FP32)}) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py new file mode 100644 index 0000000000000000000000000000000000000000..544fe4dd43e6b93a2a34215744fe0bdf506ad655 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
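+
+# (editorial note: the test below pushes the same saved program through two
+# paths, core.DistModel fed with core.DistModelTensor inputs and the plain
+# paddle.static.load_inference_model executor path, and accepts the run only
+# if the two fetched losses agree under np.allclose.)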
+ +import unittest +import paddle +import numpy as np +import os +from paddle.fluid import core + +paddle.enable_static() + + +class TestDistModelRun(unittest.TestCase): + def test_dist_model_run(self): + # step 0: declare folder to save the model and params + folder = './dist_model_run_test/' + file = 'inf' + path_prefix = folder + file + + # step 1: saving the inference model and params + x = paddle.static.data(name='x', shape=[28, 28], dtype='float32') + y = paddle.static.data(name='y', shape=[28, 1], dtype='int64') + predict = paddle.static.nn.fc(x, 10, activation='softmax') + loss = paddle.nn.functional.cross_entropy(predict, y) + avg_loss = paddle.tensor.stat.mean(loss) + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(paddle.static.default_startup_program()) + x_data = np.random.randn(28, 28).astype('float32') + y_data = np.random.randint(0, 9, size=[28, 1]).astype('int64') + exe.run(paddle.static.default_main_program(), + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_loss]) + paddle.static.save_inference_model(path_prefix, [x, y], [avg_loss], exe) + print('save model to', path_prefix) + + # step 2: prepare fake data for the inference + x_tensor = np.random.randn(28, 28).astype('float32') + y_tensor = np.random.randint(0, 9, size=[28, 1]).astype('int64') + + # step 3: init the dist model to inference with fake data + config = core.DistModelConfig() + config.model_dir = path_prefix + config.place = 'GPU' + dist = core.DistModel(config) + dist.init() + dist_x = core.DistModelTensor(x_tensor, 'x') + dist_y = core.DistModelTensor(y_tensor, 'y') + input_data = [dist_x, dist_y] + output_rst = dist.run(input_data) + dist_model_rst = output_rst[0].as_ndarray().ravel().tolist() + print("dist model rst:", dist_model_rst) + + # step 4: use framework's api to inference with fake data + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.load_inference_model(path_prefix, exe)) + results = exe.run(inference_program, + feed={'x': x_tensor, + 'y': y_tensor}, + fetch_list=fetch_targets) + load_inference_model_rst = results[0] + print("load inference model api rst:", load_inference_model_rst) + + # step 5: compare two results + self.assertTrue(np.allclose(dist_model_rst, load_inference_model_rst)) + + # step 6: clean up the env, delete the saved model and params + os.remove(path_prefix + '.pdiparams') + os.remove(path_prefix + '.pdmodel') + os.rmdir(folder) + print('cleaned up the env') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py similarity index 97% rename from python/paddle/fluid/tests/unittests/test_dist_model_tensor.py rename to python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py index da25550c4f47ed1bf5f694618afce722989139ca..a74b4f0d224ef6c165cfadc785f1de9c50d8de4a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_tensor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
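(Editor's aside, not part of the patch: the calc_gradient changes in
test_norm_all.py further below implement the analytic p-norm gradient

    d||x||_p / dx_i = ||x||_p^(1 - p) * |x_i|^(p - 1) * sign(x_i),

exactly the numpy expression the diff adds. A minimal self-contained sketch,
with editor-chosen names and a central-difference check, assuming only numpy:

    import numpy as np

    def pnorm_grad(x, p):
        # analytic gradient of ||x||_p, valid for p not in {0, +inf, -inf}
        norm = np.sum(np.abs(x) ** p) ** (1.0 / p)
        return norm ** (1 - p) * np.abs(x) ** (p - 1) * np.sign(x)

    x = np.array([1.0, -2.0, 3.0])
    p, eps = 3.0, 1e-6
    numeric = np.array([
        (np.linalg.norm(x + eps * np.eye(3)[i], ord=p) -
         np.linalg.norm(x - eps * np.eye(3)[i], ord=p)) / (2 * eps)
        for i in range(3)
    ])
    assert np.allclose(pnorm_grad(x, p), numeric, atol=1e-5)

OpTest feeds a mean-style output gradient, so calc_gradient additionally
divides by the number of output elements: numel / x.shape[axis] for an axis
reduction, or 1 when asvector flattens the input to a single norm.)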
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index d2d931f148078d124a25ddbb888b3e9cb5911211..7dd310d2b88a90e09ba5ceedb541da4be263e559 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -278,6 +278,8 @@ class TestLayerNormOp(unittest.TestCase): has_scale=False, has_bias=False, y_grad_scale=0.1) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True) class TestLayerNormAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index 352089e1fb75fa4c3423d29012fd85c3d611c81b..b20305b78efe2dfe73e069e13f0d0eca3bb84057 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -19,11 +19,12 @@ import numpy as np from op_test import OpTest import paddle import paddle.fluid as fluid +import paddle.fluid.core as core -def p_norm(x, axis, porder, keepdims=False): +def p_norm(x, axis, porder, keepdims=False, reduce_all=False): r = [] - if axis is None: + if axis is None or reduce_all: x = x.flatten() if porder == np.inf: r = np.amax(np.abs(x), keepdims=keepdims) @@ -53,8 +54,8 @@ def p_norm(x, axis, porder, keepdims=False): else: if isinstance(axis, list): axis = tuple(axis) - r = np.linalg.norm( - x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) return r @@ -111,13 +112,14 @@ class TestPnormOp(OpTest): self.op_type = "p_norm" self.init_test_case() x = (np.random.random(self.shape) + 0.5).astype(self.dtype) - norm = p_norm(x, self.axis, self.porder, self.keepdim) + norm = p_norm(x, self.axis, self.porder, self.keepdim, self.asvector) self.inputs = {'X': x} self.attrs = { 'epsilon': self.epsilon, 'axis': self.axis, 'keepdim': self.keepdim, - 'porder': float(self.porder) + 'porder': float(self.porder), + 'asvector': self.asvector } self.outputs = {'Out': norm} self.gradient = self.calc_gradient() @@ -135,34 +137,42 @@ class TestPnormOp(OpTest): self.porder = 2.0 self.keepdim = False self.dtype = "float64" + self.asvector = False def calc_gradient(self): self.attrs = { 'epsilon': self.epsilon, 'axis': self.axis, 'keepdim': self.keepdim, - 'porder': float(self.porder) + 'porder': float(self.porder), + 'asvector': self.asvector } x = self.inputs["X"] porder = self.attrs["porder"] axis = self.attrs["axis"] + asvector = self.attrs["asvector"] + x_dtype = x.dtype + x = x.astype(np.float32) if x.dtype == np.float16 else x if porder == 0: grad = np.zeros(x.shape).astype(x.dtype) elif porder in [float("inf"), float("-inf")]: - norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) x_abs = np.abs(x) grad = np.sign(x) grad[x_abs != norm] = 0.0 else: - norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) grad = np.power(norm, 1 - porder) * np.power( np.abs(x), porder - 1) * np.sign(x) numel = 1 for s in x.shape: numel *= s - numel /= x.shape[axis] - return [grad.astype(x.dtype) * 1 / numel] + divisor = numel if asvector else x.shape[axis] + numel /= divisor + return [grad.astype(x_dtype) * 1 / numel] class TestPnormOp2(TestPnormOp): @@ -173,6 +183,7 @@ class 
         self.porder = 2.0
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
@@ -186,6 +197,7 @@ class TestPnormOp3(TestPnormOp):
         self.porder = np.inf
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@@ -199,6 +211,7 @@ class TestPnormOp4(TestPnormOp):
         self.porder = -np.inf
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@@ -212,11 +225,63 @@ class TestPnormOp5(TestPnormOp):
         self.porder = 0
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
 
 
+class TestPnormOp6(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = -1
+        self.epsilon = 1e-12
+        self.porder = 2
+        self.keepdim = False
+        self.dtype = "float32"
+        self.asvector = True
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestPnormOpFP16(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = 1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.dtype = "float16"
+        self.asvector = False
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['X'], 'Out', user_defined_grads=self.gradient)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestPnormOpFP161(TestPnormOpFP16):
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = -1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.dtype = "float16"
+        self.asvector = True
+
+
 def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
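To make the new `asvector` attribute concrete: in the reference `p_norm` above it takes the `reduce_all` path, flattening the whole tensor before reducing instead of reducing along a single axis. A small numpy illustration with made-up data:

    import numpy as np

    x = np.arange(24, dtype=np.float64).reshape(2, 3, 4)
    # asvector=False: one norm per slice along the reduced axis
    per_axis = np.linalg.norm(x, ord=2, axis=1)    # shape (2, 4)
    # asvector=True: flatten first, a single norm for the whole tensor
    whole = np.linalg.norm(x.ravel(), ord=2)       # scalar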
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index 5ba54daa0d4cbc49d4693090a853347f2e4355ab..a3bfe3864a2493fdcf100a1a86648a159701ec11 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -63,6 +63,12 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError,
                           lambda: b.create_var(name="fc.w", shape=(24, 100)))
 
+        w = b.create_var(
+            dtype=paddle.fluid.core.VarDesc.VarType.STRINGS,
+            shape=[1],
+            name="str_var")
+        self.assertEqual(None, w.lod_level)
+
     def test_element_size(self):
         with fluid.program_guard(Program(), Program()):
             x = paddle.static.data(name='x1', shape=[2], dtype='bool')
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
index cb54d12488d542e515b01c5a04407884eac41152..a1eb0af2bc978dd46b9f25b81f972669d7b93d94 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
@@ -52,8 +52,9 @@ class XPUTestArgsortOp1(XPUOpTestWrapper):
         classes = []
         for descending in [True, False]:
             for axis in [0, 1, 2, -1, -2]:
-                class_name = 'XPUTestArgsortOp_axis_' + str(axis)
-                attr_dict = {'init_axis': axis, 'descending': descending}
+                class_name = 'XPUTestArgsortOp_axis_' + str(axis) + '_' + str(
+                    descending)
+                attr_dict = {'init_axis': axis, 'init_descending': descending}
                 classes.append([class_name, attr_dict])
         return base_class, classes
 
@@ -64,8 +65,9 @@ class XPUTestArgsortOp1(XPUOpTestWrapper):
             self.place = paddle.XPUPlace(0)
             self.dtype = self.in_type
             self.input_shape = (2, 2, 2, 3, 3)
-            self.axis = -1
-            self.descending = False
+            self.axis = -1 if not hasattr(self, 'init_axis') else self.init_axis
+            self.descending = False if not hasattr(
+                self, 'init_descending') else self.init_descending
 
             if self.in_type == 'float32':
                 self.x = np.random.random(self.input_shape).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
index 4ceacd52092341347ce5633c5b439ad49e7ca8de..9cb31d4270552d23435a6bebc71ae6ef208b204d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
@@ -73,6 +73,39 @@ class TestSigmoidCrossEntropyWithLogitsOp1(XPUOpTest):
         self.dtype = np.float32
 
 
+class TestSigmoidCrossEntropyWithLogitsOp2(
+        TestSigmoidCrossEntropyWithLogitsOp1):
+    """Test sigmoid_cross_entropy_with_logits_op with ignore_index
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        self.set_xpu()
+        self.init_dtype()
+
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype(self.dtype)),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype(self.dtype)
+        }
+        self.attrs = {'ignore_index': ignore_index, }
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        self.outputs = {'Out': out}
+
+
 class TestSigmoidCrossEntropyWithLogitsOp3(
         TestSigmoidCrossEntropyWithLogitsOp1):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
@@ -102,6 +135,42 @@ class TestSigmoidCrossEntropyWithLogitsOp3(
         self.outputs = {'Out': -term1 - term2}
 
 
+class TestSigmoidCrossEntropyWithLogitsOp4(
+        TestSigmoidCrossEntropyWithLogitsOp1):
+    """Test sigmoid_cross_entropy_with_logits_op with ignore_index and normalize
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        self.set_xpu()
+        self.init_dtype()
+
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype(self.dtype)),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype(self.dtype)
+        }
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        if self.attrs['normalize']:
+            out = out / float(
+                np.where(self.inputs['Label'] != ignore_index)[0].size)
+        self.outputs = {'Out': out}
+
+
 class TestSigmoidCrossEntropyWithLogitsOp5(
         TestSigmoidCrossEntropyWithLogitsOp1):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
@@ -131,6 +200,42 @@ class TestSigmoidCrossEntropyWithLogitsOp5(
         self.outputs = {'Out': -term1 - term2}
 
 
+class TestSigmoidCrossEntropyWithLogitsNorm(
+        TestSigmoidCrossEntropyWithLogitsOp1):
+    """Test sigmoid_cross_entropy_with_logits_op with multi-dimensional batch and normalize
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        self.set_xpu()
+        self.init_dtype()
+
+        batch_size = [10, 10]
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
+                .astype(self.dtype)),
+            'Label': np.random.randint(-1, 2, tuple(batch_size + [num_classes]))
+            .astype(self.dtype)
+        }
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        if self.attrs['normalize']:
+            out = out / float(
+                np.where(self.inputs['Label'] != ignore_index)[0].size)
+        self.outputs = {'Out': out}
+
+
 class TestSigmoidCrossEntropyWithLogitsOp6(
         TestSigmoidCrossEntropyWithLogitsOp1):
     """Test sigmoid_cross_entropy_with_logit_op with binary label
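Each of the new XPU cases above re-derives the same numpy reference loss inline. As a sketch only (the helper name and factoring are illustrative, not part of the patch), the shared computation is:

    import numpy as np
    from scipy.special import expit

    def reference_sigmoid_ce(x, label, ignore_index, normalize=False):
        # Label * -log(sigmoid(X)) + (1 - Label) * -log(1 - sigmoid(X))
        p = expit(x)
        out = -label * np.log(p) - (1 - label) * np.log(1 - p)
        # entries marked with ignore_index contribute zero loss
        out[label == ignore_index] = 0
        if normalize:
            # divide by the count of non-ignored entries, as the tests do
            out = out / float((label != ignore_index).sum())
        return out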
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index f37b45eef1b80211cbb749c20b489af43cdafdee..e5ccd6b04054a27c6df9963735dc63c7db9c7c56 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -3,7 +3,7 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : add
 
@@ -40,7 +40,7 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : divide
 
@@ -135,7 +135,7 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : multiply
 
@@ -166,19 +166,19 @@
   output : Tensor
   infer_meta :
     func : ElementwiseInferMeta
-    param : [x, y, -1]
+    param : [x, y]
   kernel :
     func : subtract
 
 - api : sum
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
   output : Tensor
-  infer_meta :
-    func : ReduceInferMeta
-    param: [x, axis, keep_dim, dtype]
-  kernel :
+  infer_meta :
+    func : SumInferMeta
+    param: [x, axis, dtype, keep_dim]
+  kernel :
     func : sum
-    param : [x, axis, keep_dim, dtype]
+    param : [x, axis, dtype, keep_dim]
   data_type : x
 
 - api : zeros_like
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 6bb02ab9d40dbe28b01bf669417a8d521c6458da..09182768f242760bc0b6c74cc37a4e3a0a0fb60e 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -31,7 +31,12 @@ class API:
         # names : [], list of attribute names
         # attr_info : { attr_name : (type, default_values)}
         self.args = gen_utils.parse_args(self.api, api_item_yaml['args'])
-        self.output = api_item_yaml['output']
+        self.out_type_list, _ = gen_utils.parse_output(self.api,
+                                                       api_item_yaml['output'])
+        self.return_type = self.out_type_list[0] if len(
+            self.out_type_list) == 1 else "std::tuple<" + ",".join(
+                self.out_type_list) + ">"
+
         self.is_base_api = True
         if 'invoke' in api_item_yaml:
             self.is_base_api = False
@@ -54,18 +59,44 @@ class API:
 
     def gene_api_declaration(self):
         return f"""
-PADDLE_API {self.output} {self.api}({self.args['args_declare']});
+PADDLE_API {self.return_type} {self.api}({self.args['args_declare']});
 """
 
+    def gene_output(self, output_type_list):
+        kernel_output = ""
+        output_create = ""
+
+        if len(output_type_list) == 1:
+            kernel_output = 'dense_out'
+            output_create = f"""
+  {self.return_type} out;
+  auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);"""
+
+        elif len(output_type_list) > 1:
+            output_create = f"""
+  {self.return_type} out;"""
+
+            for i in range(len(output_type_list)):
+                kernel_output = kernel_output + f'dense_out_{i}, '
+                output_create = output_create + f"""
+  auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, &std::get<{i}>(out));"""
+
+            kernel_output = kernel_output[:-2]
+        else:
+            raise ValueError(
+                "{} : Output error: the output should not be empty.".format(
+                    self.api))
+
+        return kernel_output, output_create
+
     def gene_api_code(self):
         if self.is_base_api:
             input_tensors, kernel_args = gen_utils.get_kernel_args(
                 self.args['inputs']['names'], self.args['attrs'],
                 self.kernel['param'])
-            out_type, _ = gen_utils.parse_output(self.api, self.output)
-            outputs_args, output_create = gen_utils.gene_output(out_type)
+            outputs_args, output_create = self.gene_output(self.out_type_list)
             return f"""
-PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
+PADDLE_API {self.return_type} {self.api}({self.args["args_define"]}) {{
   {gen_utils.gene_kernel_select(self.api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
 
   auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
@@ -82,7 +113,7 @@ PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
 
         else:
             return f"""
-PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
+PADDLE_API {self.return_type} {self.api}({self.args["args_define"]}) {{
   return {self.invoke};
 }}
 """
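The single- versus multi-output handling above reduces to a small string assembly over the parsed output types. A standalone sketch (the function name is illustrative) of what `API.__init__` now computes as `return_type`:

    def assemble_return_type(out_type_list):
        # one output -> the bare C++ type; several -> a std::tuple of them
        if len(out_type_list) == 1:
            return out_type_list[0]
        return "std::tuple<" + ",".join(out_type_list) + ">"

    assert assemble_return_type(["Tensor"]) == "Tensor"
    assert assemble_return_type(["Tensor", "Tensor"]) == "std::tuple<Tensor,Tensor>"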
diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py
index 0cb14327f6e09092bbce0229ae26f1b456238802..d55759b51c2e79be59f5881b0546338334a54342 100644
--- a/python/paddle/utils/code_gen/backward_api_gen.py
+++ b/python/paddle/utils/code_gen/backward_api_gen.py
@@ -23,9 +23,11 @@ import gen_utils
 class BackwardAPI:
     def __init__(self, backward_item_yaml):
         self.backward_api = backward_item_yaml['backward_api']
-        self.args, self.output_type, self.return_comment = self.parse_and_check_args(
+        self.args, self.output_type_list, self.return_comment = self.parse_and_check_args(
             backward_item_yaml['forward'], backward_item_yaml['args'],
             backward_item_yaml['output'])
+        self.return_type = self.output_type_list[0] if len(
+            self.output_type_list) == 1 else "std::vector<std::vector<Tensor>>"
 
         self.is_base_api = True
         if 'invoke' in backward_item_yaml:
@@ -81,36 +83,65 @@ class BackwardAPI:
             Please check the args of {self.backward_api} in yaml."
 
         # check the output of backward
-        output_type, return_comment = gen_utils.parse_output(self.backward_api,
-                                                             output_config)
-        assert output_type.count('Tensor') <= len(fw_inputs['names']), \
+        out_type_list, return_comment = gen_utils.parse_output(
+            self.backward_api, output_config)
+        assert len(out_type_list) <= len(fw_inputs['names']), \
             f"{self.backward_api} : Output error: The number of ouputs should be less then the number of inputs of forward api. \
              Please check the output of {self.backward_api} in yaml."
 
-        return bw_args, output_type, return_comment
+        return bw_args, out_type_list, return_comment
 
     def gene_api_declaration(self):
         if self.return_comment:
             return f"""
 // {self.return_comment}
-{self.output_type} {self.backward_api}({self.args['args_declare']});
+{self.return_type} {self.backward_api}({self.args['args_declare']});
 """
 
         else:
             return f"""
-{self.output_type} {self.backward_api}({self.args['args_declare']});
+{self.return_type} {self.backward_api}({self.args['args_declare']});
 """
 
+    def gene_output(self, output_type_list):
+        kernel_output = ""
+        output_create = ""
+
+        if len(output_type_list) == 1:
+            return_type = output_type_list[0]
+            kernel_output = 'dense_out'
+            output_create = f"""
+  {self.return_type} out;
+  auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);"""
+
+        elif len(output_type_list) > 1:
+            output_create = f"""
+  {self.return_type} out;"""
+
+            for i, out_type_item in enumerate(output_type_list):
+                kernel_output = kernel_output + f'dense_out_{i}, '
+                get_out_code = f'&out[{i}][0]' if out_type_item == 'Tensor' else f'&out[{i}]'
+                output_create = output_create + f"""
+  auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, {get_out_code});"""
+
+            kernel_output = kernel_output[:-2]
+        else:
+            raise ValueError(
+                "{} : Output error: the output should not be empty.".format(
+                    self.backward_api))
+
+        return kernel_output, output_create
+
     def gene_api_code(self):
         if self.is_base_api:
             input_tensors, kernel_args = gen_utils.get_kernel_args(
                 self.args['inputs']['names'], self.args['attrs'],
                 self.kernel['param'])
-            outputs_args, output_create = gen_utils.gene_output(
-                self.output_type)
+            outputs_args, output_create = self.gene_output(
+                self.output_type_list)
             return f"""
 // {self.return_comment}
-{self.output_type} {self.backward_api}({self.args["args_define"]}) {{
+{self.return_type} {self.backward_api}({self.args["args_define"]}) {{
   {gen_utils.gene_kernel_select(self.backward_api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
 
   auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
@@ -143,7 +174,7 @@ class BackwardAPI:
             params_code = self.args["args_define"]
         return f"""
 // {self.return_comment}
-{self.output_type} {self.backward_api}({params_code}) {{
+{self.return_type} {self.backward_api}({params_code}) {{
   return {invoke_code};
 }}
 """
diff --git a/python/paddle/utils/code_gen/gen_utils.py b/python/paddle/utils/code_gen/gen_utils.py
index 9d368c292b7cfefb0121aba9f0c0fcdc7b0a4caf..bdc29420558e910417e3b3deb7781e6b2d836766 100644
--- a/python/paddle/utils/code_gen/gen_utils.py
+++ b/python/paddle/utils/code_gen/gen_utils.py
@@ -124,7 +124,7 @@ def parse_output(api_name, output_config):
 
     if len(temp_list) == 1:
         out_type, out_name = parse_output_item(temp_list[0])
-        return out_type, out_name
+        return [out_type], out_name
     else:
         out_type_list = []
         out_name_list = []
@@ -133,8 +133,7 @@ def parse_output(api_name, output_config):
             out_type_list.append(out_type)
             out_name_list.append(out_name)
 
-        return "std::tuple<" + ",".join(out_type_list) + ">", ", ".join(
-            out_name_list)
",".join(out_type_list) + ">", ", ".join( - out_name_list) + return out_type_list, ", ".join(out_name_list) def gene_kernel_select(api, input_names, attrs, kernel) -> str: @@ -241,7 +240,7 @@ def gene_kernel_select(api, input_names, attrs, kernel) -> str: if len(input_names) > 0: kernel_select_code = kernel_select_code + f""" - if (kernel_backend == Backend::UNDEFINED + if (kernel_backend == Backend::UNDEFINED || kernel_layout == DataLayout::UNDEFINED || kernel_data_type == DataType::UNDEFINED ) {{ auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args}); @@ -315,24 +314,3 @@ def get_kernel_args(input_names, attrs, kernel_param): else: kernel_args = kernel_args + str(param) + ", " return input_tensor_code, kernel_args[:-2] - - -def gene_output(output_type): - kernel_output = "" - output_create = f""" - {output_type} out;""" - - if output_type == 'Tensor' or output_type == 'std::vector': - kernel_output = 'dense_out' - output_create = output_create + """ - auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);""" - elif re.match(r'std::tuple<.*>$', output_type): - out_num = output_type.count('Tensor') - for i in range(out_num): - kernel_output = kernel_output + f'dense_out_{i}, ' - output_create = output_create + f""" - auto dense_out_{i} = SetKernelOutput(std::get<{i}>(out_meta), kernel_backend, &std::get<{i}>(out));""" - - kernel_output = kernel_output[:-2] - - return kernel_output, output_create diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index f085eac1e358dd90e6377e015e360fc86ab6ca5c..853a98a62b504d94617127bd35212d2412719e1c 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -449,8 +449,8 @@ def _get_include_dirs_when_compiling(compile_dir): include_dirs_file = 'includes.txt' path = os.path.abspath(compile_dir) include_dirs_file = os.path.join(path, include_dirs_file) - if not os.path.isfile(include_dirs_file): - return [] + assert os.path.isfile(include_dirs_file), "File {} does not exist".format( + include_dirs_file) with open(include_dirs_file, 'r') as f: include_dirs = [line.strip() for line in f.readlines() if line.strip()] diff --git a/python/setup.py.in b/python/setup.py.in index aee4e149b0afeeb4c33d617b7db2a19bf2c36a91..e8cc2914521f323fd50e491dab1f8e7eb1421a3b 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -519,6 +519,10 @@ if '${WITH_XPU_BKCL}' == 'ON': shutil.copy('${XPU_BKCL_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}'] +if '${WITH_IPU}' == 'ON': + shutil.copy('${PADDLE_IPU_LIB}', libs_path) + package_data['paddle.libs'] += ['libpaddle_ipu' + ext_name] + # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path+'/__init__.py'): os.remove(libs_path+'/__init__.py')