diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 2e335579f32df4f146c8d88e05e684a9a8105e20..e66459fa3a1508fe4a3687f07bbe18f2a5421296 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH}) return() ENDIF() +INCLUDE(GNUInstallDirs) + INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") @@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) ExternalProject_Add( ${NGRAPH_PROJECT} @@ -63,18 +69,6 @@ ExternalProject_Add( CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib ) -if(UNIX AND NOT APPLE) - include(GNUInstallDirs) - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) -else() - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) -endif() -MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") - -SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) -SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) -SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) - # Workaround for nGraph expecting mklml to be in mkldnn install directory. ExternalProject_Add_Step( ${NGRAPH_PROJECT} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 0b95a780721b0771d55c4dbb2ddce33418612018..c679d8507d8a9d3bce48b7f38491dadd9f2fb7f6 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -129,6 +129,15 @@ if (WITH_MKLDNN) ) endif () +if (WITH_NGRAPH) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph") + copy(ngraph_lib + SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR} + DSTS ${dst_dir} ${dst_dir} + DEPS ngraph + ) +endif () + if (NOT WIN32) if (NOT MOBILE_INFERENCE AND NOT RPI) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 6b5ed1024449bdd5e73d69b3d6531e1d4e86126f..4f286049d40af988e83b0d0bc24a7b61ee90444f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -175,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -187,6 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 52946c7f11f90490b1af1347f20db236a8fe24af..c701a2ad63048f69e8443fee7434928698aa20cc 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -34,6 +34,7 @@ add_subdirectory(ir) add_subdirectory(details) # ddim lib proto_library(framework_proto SRCS framework.proto) +proto_library(async_executor_proto SRCS data_feed.proto) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) @@ -126,8 +127,9 @@ cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto) + if(NOT WIN32) +cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) endif(NOT WIN32) @@ -135,7 +137,7 @@ endif(NOT WIN32) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -py_proto_compile(framework_py_proto SRCS framework.proto) +py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) @@ -157,18 +159,19 @@ endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) -cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) endif(NOT WIN32) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() @@ -176,8 +179,11 @@ endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy - fast_threaded_ssa_graph_executor) + fast_threaded_ssa_graph_executor variable_helper) + +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..afb2dd2f064384da39904f6aceead4fa915a80f2 --- /dev/null +++ b/paddle/fluid/framework/async_executor.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/async_executor.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace framework { +AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place) + : root_scope_(scope), place_(place) {} + +void AsyncExecutor::CreateThreads( + ExecutorThreadWorker* worker, const ProgramDesc& main_program, + const std::shared_ptr& reader, + const std::vector& fetch_var_names, Scope* root_scope, + const int thread_index, const bool debug) { + worker->SetThreadId(thread_index); + worker->SetDebug(debug); + worker->SetRootScope(root_scope); + worker->CreateThreadResource(main_program, place_); + worker->SetDataFeed(reader); + worker->SetFetchVarNames(fetch_var_names); + worker->BindingDataFeedMemory(); +} + +void PrepareReaders(std::vector>& readers, // NOLINT + const int thread_num, const DataFeedDesc& data_feed_desc, + const std::vector& filelist) { + readers.resize(thread_num); + for (size_t i = 0; i < readers.size(); ++i) { + readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name()); + readers[i]->Init(data_feed_desc); // set batch_size and queue_size here + } + readers[0]->SetFileList(filelist); +} + +void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_var_names, + const bool debug) { + std::vector threads; + + auto& block = main_program.Block(0); + for (auto var_name : fetch_var_names) { + auto var_desc = block.FindVar(var_name); + auto shapes = var_desc->GetShape(); + PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, + "var %s: Fetched var has wrong shape, " + "only variables with the last dimension size 1 supported", + var_name); + } + + DataFeedDesc data_feed_desc; + google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, + &data_feed_desc); + + int actual_thread_num = thread_num; + int file_cnt = filelist.size(); + PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); + + if (actual_thread_num > file_cnt) { + VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt + << ". Changing thread_num = " << file_cnt; + actual_thread_num = file_cnt; + } + + /* + readerDesc: protobuf description for reader initlization + argument: class_name, batch_size, use_slot, queue_size, buffer_size, + padding_index + + reader: + 1) each thread has a reader, reader will read input data and + put it into input queue + 2) each reader has a Next() iterface, that can fetch an instance + from the input queue + */ + // todo: should be factory method for creating datafeed + std::vector> readers; + PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); + + std::vector> workers; + workers.resize(actual_thread_num); + for (auto& worker : workers) { + worker.reset(new ExecutorThreadWorker); + } + + // prepare thread resource here + for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + CreateThreads(workers[thidx].get(), main_program, readers[thidx], + fetch_var_names, root_scope_, thidx, debug); + } + + // start executing ops in multiple threads + for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } + + for (auto& th : threads) { + th.join(); + } + + root_scope_->DropKids(); + + return; +} + +} // einit_modelnd namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..f4d2a79ac592e02f49ec0b988c824dc98883fbf6 --- /dev/null +++ b/paddle/fluid/framework/async_executor.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +class AsyncExecutor { + public: + AsyncExecutor(Scope* scope, const platform::Place& place); + virtual ~AsyncExecutor() {} + void RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_names, + const bool debug = false); + + private: + void CreateThreads(ExecutorThreadWorker* worker, + const ProgramDesc& main_program, + const std::shared_ptr& reader, + const std::vector& fetch_var_names, + Scope* root_scope, const int thread_index, + const bool debug); + + public: + Scope* root_scope_; + platform::Place place_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc new file mode 100644 index 0000000000000000000000000000000000000000..291d8ffc3c3334c2836e1651a8997984bba084e1 --- /dev/null +++ b/paddle/fluid/framework/data_feed.cc @@ -0,0 +1,386 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace framework { + +std::vector DataFeed::filelist_; +size_t DataFeed::file_idx_; +std::mutex DataFeed::mutex_for_pick_file_; +bool DataFeed::finish_set_filelist_; + +void DataFeed::AddFeedVar(Variable* var, const std::string& name) { + CheckInit(); + for (size_t i = 0; i < use_slots_.size(); ++i) { + if (name == use_slots_[i]) { + if (use_slots_is_dense_[i]) { + feed_vec_[i] = MixTensor(var->GetMutable()); + } else { + feed_vec_[i] = MixTensor(var->GetMutable()); + } + } + } +} + +bool DataFeed::SetFileList(const std::vector& files) { + std::unique_lock lock(mutex_for_pick_file_); + CheckInit(); + if (finish_set_filelist_) { + VLOG(3) << "info: you have set the filelist."; + return false; + } + PADDLE_ENFORCE(files.size(), "You have set an empty filelist."); + filelist_.assign(files.begin(), files.end()); + file_idx_ = 0; + + finish_set_filelist_ = true; + return true; +} + +void DataFeed::SetBatchSize(int batch_size) { + PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size); + default_batch_size_ = batch_size; +} + +bool DataFeed::PickOneFile(std::string* filename) { + std::unique_lock lock(mutex_for_pick_file_); + if (file_idx_ == filelist_.size()) { + return false; + } + *filename = filelist_[file_idx_++]; + return true; +} + +void DataFeed::CheckInit() { + PADDLE_ENFORCE(finish_init_, "Initialization did not succeed."); +} + +void DataFeed::CheckSetFileList() { + PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed."); +} + +void DataFeed::CheckStart() { + PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet."); +} + +template +void PrivateQueueDataFeed::SetQueueSize(int queue_size) { + PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); + queue_size_ = queue_size; + queue_ = std::unique_ptr>( + new paddle::operators::reader::BlockingQueue(queue_size_)); +} + +template +bool PrivateQueueDataFeed::Start() { + CheckSetFileList(); + read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this); + read_thread_.detach(); + + finish_start_ = true; + return true; +} + +template +void PrivateQueueDataFeed::ReadThread() { + std::string filename; + while (PickOneFile(&filename)) { + file_.open(filename.c_str()); // is_text_feed + PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str()); + T instance; + while (ParseOneInstance(&instance)) { + queue_->Send(instance); + } + file_.close(); + } + queue_->Close(); +} + +template +int PrivateQueueDataFeed::Next() { + CheckStart(); + int index = 0; + T instance; + T ins_vec; + while (index < default_batch_size_) { + if (!queue_->Receive(&instance)) { + break; + } + AddInstanceToInsVec(&ins_vec, instance, index++); + } + batch_size_ = index; + if (batch_size_ != 0) { + PutToFeedVec(ins_vec); + } + return batch_size_; +} + +#ifdef _WIN32 +template class PrivateQueueDataFeed>; +#endif + +void MultiSlotDataFeed::Init( + const paddle::framework::DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + "Multi_slot_desc has not been set."); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + SetQueueSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + all_slots_.resize(all_slot_num); + all_slots_type_.resize(all_slot_num); + use_slots_index_.resize(all_slot_num); + use_slots_.clear(); + use_slots_is_dense_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + all_slots_type_[i] = slot.type(); + use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + if (slot.is_used()) { + use_slots_.push_back(all_slots_[i]); + use_slots_is_dense_.push_back(slot.is_dense()); + } + } + feed_vec_.resize(use_slots_.size()); + finish_init_ = true; +} + +bool MultiSlotDataFeed::CheckFile(const char* filename) { + CheckInit(); // get info of slots + std::ifstream fin(filename); + if (!fin.good()) { + VLOG(1) << "error: open file<" << filename << "> fail"; + return false; + } + std::string line; + int instance_cout = 0; + std::string all_slots_alias = ""; + for (const auto& alias : all_slots_) { + all_slots_alias += alias + " "; + } + std::string use_slots_alias = ""; + for (const auto& alias : use_slots_) { + use_slots_alias += alias + " "; + } + VLOG(3) << "total slots num: " << all_slots_.size(); + VLOG(3) << "total slots alias: " << all_slots_alias; + VLOG(3) << "used slots num: " << use_slots_.size(); + VLOG(3) << "used slots alias: " << use_slots_alias; + while (getline(fin, line)) { + ++instance_cout; + const char* str = line.c_str(); + char* endptr = const_cast(str); + int len = line.length(); + for (size_t i = 0; i < all_slots_.size(); ++i) { + int num = strtol(endptr, &endptr, 10); + if (num < 0) { + VLOG(0) << "error: the number of ids is a negative number: " << num; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (num == 0) { + VLOG(0) + << "error: the number of ids can not be zero, you need " + "padding it in data generator; or if there is something wrong" + " with the data, please check if the data contains unresolvable " + "characters."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (errno == ERANGE || num > INT_MAX) { + VLOG(0) << "error: the number of ids greater than INT_MAX"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (all_slots_type_[i] == "float") { + for (int i = 0; i < num; ++i) { + strtof(endptr, &endptr); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for float"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else if (all_slots_type_[i] == "uint64") { + for (int i = 0; i < num; ++i) { + strtoull(endptr, &endptr, 10); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for uint64_t"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else { + VLOG(0) << "error: this type<" << all_slots_type_[i] + << "> is not supported"; + return false; + } + } + // It may be added '\t' character to the end of the output of reduce + // task when processes data by Hadoop(when the output of the reduce + // task of Hadoop has only one field, it will add a '\t' at the end + // of the line by default, and you can use this option to avoid it: + // `-D mapred.textoutputformat.ignoreseparator=true`), which does + // not affect the correctness of the data. Therefore, it should be + // judged that the data is not normal when the end of each line of + // data contains characters which are not spaces. + while (endptr - str != len) { + if (!isspace(*(endptr++))) { + VLOG(0) + << "error: there is some extra characters at the end of the line."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } + VLOG(3) << "instances cout: " << instance_cout; + VLOG(3) << "The file format is correct"; + return true; +} + +bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { + std::string line; + if (getline(file_, line)) { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + pos = line.find_first_of(' ', pos + 1); + } + } + } + } else { + return false; + } + return true; +} + +void MultiSlotDataFeed::AddInstanceToInsVec( + std::vector* ins_vec, + const std::vector& instance, int index) { + if (index == 0) { + ins_vec->resize(instance.size()); + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].Init(instance[i].GetType()); + (*ins_vec)[i].InitOffset(); + } + } + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].AddIns(instance[i]); + } +} + +void MultiSlotDataFeed::PutToFeedVec( + const std::vector& ins_vec) { + for (size_t i = 0; i < use_slots_.size(); ++i) { + const auto& type = ins_vec[i].GetType(); + const auto& offset = ins_vec[i].GetOffset(); + int total_instance = static_cast(offset.back()); + if (type[0] == 'f') { // float + const auto& feasign = ins_vec[i].GetFloatData(); + if (feed_vec_[i].IsDense()) { + int size_in_each_batch = total_instance / batch_size_; + float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( + {batch_size_, size_in_each_batch}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + } else { + float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + LoD data_lod{offset}; + feed_vec_[i].GetLoDTensor()->set_lod(data_lod); + } + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + const auto& feasign = ins_vec[i].GetUint64Data(); + if (feed_vec_[i].IsDense()) { + int size_in_each_batch = total_instance / batch_size_; + int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( + {batch_size_, size_in_each_batch}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } else { + int64_t* tensor_ptr = + feed_vec_[i].GetLoDTensor()->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + LoD data_lod{offset}; + feed_vec_[i].GetLoDTensor()->set_lod(data_lod); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h new file mode 100644 index 0000000000000000000000000000000000000000..a7f8d1d31752af200145bc7934e7880910338e9d --- /dev/null +++ b/paddle/fluid/framework/data_feed.h @@ -0,0 +1,269 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" + +namespace paddle { +namespace framework { + +// Pack Tensor type and LoDTensor type into MixTensor type, in order +// to record either Tensor or LoDTensor information at the same time. +class MixTensor { + public: + MixTensor() {} + explicit MixTensor(LoDTensor* lodtensor) { + is_dense_ = false; + lodtensor_ = lodtensor; + } + explicit MixTensor(Tensor* tensor) { + is_dense_ = true; + tensor_ = tensor; + } + bool IsDense() { return is_dense_; } + LoDTensor* GetLoDTensor() { + PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr."); + return lodtensor_; + } + Tensor* GetTensor() { + PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr."); + return tensor_; + } + + private: + bool is_dense_; + LoDTensor* lodtensor_; + Tensor* tensor_; +}; + +// DataFeed is the base virtual class for all ohther DataFeeds. +// It is used to read files and parse the data for subsequent trainer. +// Example: +// DataFeed* reader = +// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name); +// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object +// reader->SetFileList(filelist); +// const std::vector & use_slot_alias = +// reader->GetUseSlotAlias(); +// for (auto name: use_slot_alias){ // for binding memory +// reader->AddFeedVar(scope->Var(name), name); +// } +// reader->Start(); +// while (reader->Next()) { +// // trainer do something +// } +class DataFeed { + public: + DataFeed() {} + virtual ~DataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; + virtual bool CheckFile(const char* filename) { + PADDLE_THROW("This function(CheckFile) is not implemented."); + } + // Set filelist for DataFeed. + // Pay attention that it must init all readers before call this function. + // Otherwise, Init() function will init finish_set_filelist_ flag. + virtual bool SetFileList(const std::vector& files); + virtual bool Start() = 0; + // The trainer calls the Next() function, and the DataFeed will load a new + // batch to the feed_vec. The return value of this function is the batch + // size of the current batch. + virtual int Next() = 0; + // Get all slots' alias which defined in protofile + virtual const std::vector& GetAllSlotAlias() { + return all_slots_; + } + // Get used slots' alias which defined in protofile + virtual const std::vector& GetUseSlotAlias() { + return use_slots_; + } + // This function is used for binding feed_vec memory + virtual void AddFeedVar(Variable* var, const std::string& name); + + protected: + // The following three functions are used to check if it is executed in this + // order: + // Init() -> SetFileList() -> Start() -> Next() + virtual void CheckInit(); + virtual void CheckSetFileList(); + virtual void CheckStart(); + virtual void SetBatchSize( + int batch); // batch size will be set in Init() function + // This function is used to pick one file from the global filelist(thread + // safe). + virtual bool PickOneFile(std::string* filename); + + static std::vector filelist_; + static size_t file_idx_; + static std::mutex mutex_for_pick_file_; + + // the alias of used slots, and its order is determined by + // data_feed_desc(proto object) + std::vector use_slots_; + std::vector use_slots_is_dense_; + + // the alias of all slots, and its order is determined by data_feed_desc(proto + // object) + std::vector all_slots_; + std::vector all_slots_type_; + std::vector + use_slots_index_; // -1: not used; >=0: the index of use_slots_ + + // The data read by DataFeed will be stored here + std::vector feed_vec_; + + // the batch size defined by user + int default_batch_size_; + // current batch size + int batch_size_; + + bool finish_init_; + static bool finish_set_filelist_; + bool finish_start_; +}; + +// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. +// It use a read-thread to read file and parse data to a private-queue +// (thread level), and get data from this queue when trainer call Next(). +template +class PrivateQueueDataFeed : public DataFeed { + public: + PrivateQueueDataFeed() {} + virtual ~PrivateQueueDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; + virtual bool Start(); + virtual int Next(); + + protected: + // The thread implementation function for reading file and parse. + virtual void ReadThread(); + // This function is used to set private-queue size, and the most + // efficient when the queue size is close to the batch size. + virtual void SetQueueSize(int queue_size); + // The reading and parsing method called in the ReadThread. + virtual bool ParseOneInstance(T* instance) = 0; + // This function is used to put instance to vec_ins + virtual void AddInstanceToInsVec(T* vec_ins, const T& instance, + int index) = 0; + // This function is used to put ins_vec to feed_vec + virtual void PutToFeedVec(const T& ins_vec) = 0; + + // The thread for read files + std::thread read_thread_; + // using ifstream one line and one line parse is faster + // than using fread one buffer and one buffer parse. + // for a 601M real data: + // ifstream one line and one line parse: 6034 ms + // fread one buffer and one buffer parse: 7097 ms + std::ifstream file_; + size_t queue_size_; + // The queue for store parsed data + std::unique_ptr> queue_; +}; + +// This class define the data type of instance(ins_vec) in MultiSlotDataFeed +class MultiSlotType { + public: + MultiSlotType() {} + ~MultiSlotType() {} + void Init(const std::string& type) { + CheckType(type); + if (type_[0] == 'f') { + float_feasign_.clear(); + } else if (type_[0] == 'u') { + uint64_feasign_.clear(); + } + type_ = type; + } + void InitOffset() { + offset_.resize(1); + // LoDTensor' lod is counted from 0, the size of lod + // is one size larger than the size of data. + offset_[0] = 0; + } + const std::vector& GetOffset() const { return offset_; } + void AddValue(const float v) { + CheckFloat(); + float_feasign_.push_back(v); + } + void AddValue(const uint64_t v) { + CheckUint64(); + uint64_feasign_.push_back(v); + } + void AddIns(const MultiSlotType& ins) { + if (ins.GetType()[0] == 'f') { // float + CheckFloat(); + auto& vec = ins.GetFloatData(); + offset_.push_back(offset_.back() + vec.size()); + float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end()); + } else if (ins.GetType()[0] == 'u') { // uint64 + CheckUint64(); + auto& vec = ins.GetUint64Data(); + offset_.push_back(offset_.back() + vec.size()); + uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end()); + } + } + const std::vector& GetFloatData() const { return float_feasign_; } + const std::vector& GetUint64Data() const { return uint64_feasign_; } + const std::string& GetType() const { return type_; } + + private: + void CheckType(const std::string& type) const { + PADDLE_ENFORCE((type == "uint64") || (type == "float"), + "There is no this type<%s>.", type); + } + void CheckFloat() const { + PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_); + } + void CheckUint64() const { + PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_); + } + std::vector float_feasign_; + std::vector uint64_feasign_; + std::string type_; + std::vector offset_; +}; + +// This DataFeed is used to feed multi-slot type data. +// The format of multi-slot type data: +// [n feasign_0 feasign_1 ... feasign_n]* +class MultiSlotDataFeed + : public PrivateQueueDataFeed> { + public: + MultiSlotDataFeed() {} + virtual ~MultiSlotDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc); + virtual bool CheckFile(const char* filename); + + protected: + virtual void AddInstanceToInsVec(std::vector* vec_ins, + const std::vector& instance, + int index); + virtual bool ParseOneInstance(std::vector* instance); + virtual void PutToFeedVec(const std::vector& ins_vec); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto new file mode 100644 index 0000000000000000000000000000000000000000..489fec08d86ccf61ece29bbba6d0204f25530b0f --- /dev/null +++ b/paddle/fluid/framework/data_feed.proto @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +syntax = "proto2"; +package paddle.framework; + +message Slot { + required string name = 1; + required string type = 2; + optional bool is_dense = 3 [ default = false ]; + optional bool is_used = 4 [ default = false ]; +} + +message MultiSlotDesc { repeated Slot slots = 1; } + +message DataFeedDesc { + optional string name = 1; + optional int32 batch_size = 2 [ default = 32 ]; + optional MultiSlotDesc multi_slot_desc = 3; +} diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..72148b9f7d343e19d60bb2be44d8270ad78d1412 --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_feed_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +typedef std::shared_ptr (*Createdata_feedFunction)(); +typedef std::unordered_map data_feedMap; +data_feedMap g_data_feed_map; + +#define REGISTER_DATAFEED_CLASS(data_feed_class) \ + namespace { \ + std::shared_ptr Creator_##data_feed_class() { \ + return std::shared_ptr(new data_feed_class); \ + } \ + class __Registerer_##data_feed_class { \ + public: \ + __Registerer_##data_feed_class() { \ + g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \ + } \ + }; \ + __Registerer_##data_feed_class g_registerer_##data_feed_class; \ + } // namespace + +std::string DataFeedFactory::DataFeedTypeList() { + std::string data_feed_types; + for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end(); + ++iter) { + if (iter != g_data_feed_map.begin()) { + data_feed_types += ", "; + } + data_feed_types += iter->first; + } + return data_feed_types; +} + +std::shared_ptr DataFeedFactory::CreateDataFeed( + std::string data_feed_class) { + if (g_data_feed_map.count(data_feed_class) < 1) { + exit(-1); + } + return g_data_feed_map[data_feed_class](); +} + +REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..13678edb0b8d084a0b3016d93f6e1bc32ce0169a --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +class DataFeedFactory { + public: + static std::string DataFeedTypeList(); + static std::shared_ptr CreateDataFeed(std::string data_feed_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3974f8dbadf332801a822618d77f140db440b29d --- /dev/null +++ b/paddle/fluid/framework/data_feed_test.cc @@ -0,0 +1,337 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_feed.h" +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +paddle::framework::DataFeedDesc load_datafeed_param_from_file( + const char* filename) { + paddle::framework::DataFeedDesc data_feed_desc; + int file_descriptor = open(filename, O_RDONLY); + PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename); + google::protobuf::io::FileInputStream fileInput(file_descriptor); + google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc); + close(file_descriptor); + return data_feed_desc; +} + +const std::vector load_filelist_from_file(const char* filename) { + std::vector filelist; + std::ifstream fin(filename); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename); + std::string line; + while (getline(fin, line)) { + filelist.push_back(line); + } + fin.close(); + return filelist; +} + +void GenerateFileForTest(const char* protofile, const char* filelist) { + std::ofstream w_protofile(protofile); + w_protofile << "name: \"MultiSlotDataFeed\"\n" + "batch_size: 2\n" + "multi_slot_desc {\n" + " slots {\n" + " name: \"uint64_sparse_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_sparse_slot\"\n" + " type: \"float\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"uint64_dense_slot\"\n" + " type: \"uint64\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_dense_slot\"\n" + " type: \"float\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"not_used_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: false\n" + " }\n" + "}"; + w_protofile.close(); + std::ofstream w_filelist(filelist); + int total_file = 4; + for (int i = 0; i < total_file; ++i) { + std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i); + w_filelist << filename; + if (i + 1 != total_file) { + w_filelist << std::endl; + } + std::ofstream w_datafile(filename.c_str()); + w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n" + "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n" + "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n"; + w_datafile.close(); + } + w_filelist.close(); +} + +class MultiTypeSet { + public: + MultiTypeSet() { + uint64_set_.clear(); + float_set_.clear(); + } + ~MultiTypeSet() {} + void AddValue(uint64_t v) { uint64_set_.insert(v); } + void AddValue(float v) { float_set_.insert(v); } + const std::set& GetUint64Set() const { return uint64_set_; } + const std::set& GetFloatSet() const { return float_set_; } + + private: + std::set uint64_set_; + std::set float_set_; +}; + +void GetElemSetFromReader(std::vector* reader_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist, + const int thread_num) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + reader_elem_set->resize(used_slot_num); + std::vector threads; + std::vector> readers; + readers.resize(thread_num); + for (int i = 0; i < thread_num; ++i) { + readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed( + data_feed_desc.name()); + readers[i]->Init(data_feed_desc); + } + readers[0]->SetFileList(filelist); + std::mutex mu; + for (int idx = 0; idx < thread_num; ++idx) { + threads.emplace_back(std::thread([&, idx] { + std::unique_ptr scope( + new paddle::framework::Scope()); + const auto& multi_slot_desc = data_feed_desc.multi_slot_desc(); + std::map + lodtensor_targets; + std::map tensor_targets; + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.is_used()) { + const auto& name = slot.name(); + readers[idx]->AddFeedVar(scope->Var(name), name); + if (slot.is_dense()) { + tensor_targets[name] = + &scope->FindVar(name)->Get(); + } else { + lodtensor_targets[name] = + &scope->FindVar(name)->Get(); + } + } + } + readers[idx]->Start(); + while (readers[idx]->Next()) { + int index = 0; + for (int k = 0; k < multi_slot_desc.slots_size(); ++k) { + const auto& slot = multi_slot_desc.slots(k); + if (!slot.is_used()) { + continue; + } + if (slot.is_dense()) { // dense branch + const paddle::framework::Tensor* tens = tensor_targets[slot.name()]; + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue( + (uint64_t)data[i * dim + j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[i * dim + j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } else { // sparse branch + const paddle::framework::LoDTensor* tens = + lodtensor_targets[slot.name()]; + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue((uint64_t)data[j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } // end sparse branch + ++index; + } // end slots loop + } // end while Next() + })); // end anonymous function + } + for (auto& th : threads) { + th.join(); + } +} + +void CheckIsUnorderedSame(const std::vector& s1, + const std::vector& s2) { + EXPECT_EQ(s1.size(), s2.size()); + for (size_t i = 0; i < s1.size(); ++i) { + // check for uint64 + const std::set& uint64_s1 = s1[i].GetUint64Set(); + const std::set& uint64_s2 = s2[i].GetUint64Set(); + EXPECT_EQ(uint64_s1.size(), uint64_s2.size()); + auto uint64_it1 = uint64_s1.begin(); + auto uint64_it2 = uint64_s2.begin(); + while (uint64_it1 != uint64_s1.end()) { + EXPECT_EQ(*uint64_it1, *uint64_it2); + ++uint64_it1; + ++uint64_it2; + } + // check for float + const std::set& float_s1 = s1[i].GetFloatSet(); + const std::set& float_s2 = s2[i].GetFloatSet(); + EXPECT_EQ(float_s1.size(), float_s2.size()); + auto float_it1 = float_s1.begin(); + auto float_it2 = float_s2.begin(); + while (float_it1 != float_s1.end()) { + EXPECT_EQ(*float_it1, *float_it2); + ++float_it1; + ++float_it2; + } + } +} + +void GetElemSetFromFile(std::vector* file_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + file_elem_set->resize(used_slot_num); + for (const auto& file : filelist) { + std::ifstream fin(file.c_str()); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str()); + while (1) { + bool end_flag = false; + int index = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + int num; + if (fin >> num) { + auto slot = data_feed_desc.multi_slot_desc().slots(i); + auto type = slot.type(); + if (type == "uint64") { + while (num--) { + uint64_t feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else if (type == "float") { + while (num--) { + float feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + if (slot.is_used()) { + ++index; + } + } else { + end_flag = true; + break; + } + } + if (end_flag) { + break; + } + } + fin.close(); + } +} + +TEST(DataFeed, MultiSlotUnitTest) { + const char* protofile = "data_feed_desc.prototxt"; + const char* filelist_name = "filelist.txt"; + GenerateFileForTest(protofile, filelist_name); + const std::vector filelist = + load_filelist_from_file(filelist_name); + paddle::framework::DataFeedDesc data_feed_desc = + load_datafeed_param_from_file(protofile); + std::vector reader_elem_set; + std::vector file_elem_set; + GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4); + GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist); + CheckIsUnorderedSame(reader_elem_set, file_elem_set); +} diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index eea7e712f8f6e187cdceedce77cc76d1d4ca2101..1ce18c3d6b26c541beed668a113b7a4de7f0e79e 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -32,7 +32,9 @@ enum OpInfoFillType { kOpProtoAndCheckerMaker = 1, kGradOpDescMaker = 2, kVarTypeInference = 3, - kShapeInference = 4 + kShapeInference = 4, + kEstimateFlops = 5, + kUnknown = -1 }; template @@ -48,8 +50,10 @@ struct OpInfoFillTypeID { ? kVarTypeInference : (std::is_base_of::value ? kShapeInference - : static_cast( - -1))))); + : (std::is_base_of::value + ? kEstimateFlops + : kUnknown))))); } }; @@ -139,6 +143,16 @@ struct OpInfoFiller { } }; +template +struct OpInfoFiller { + void operator()(const char* op_tpe, OpInfo* info) const { + info->estimate_flops_ = [](InferShapeContext* ctx) { + T estimate_flops; + return estimate_flops(ctx); + }; + } +}; + } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index e5b1eaa7318aecde1dbf89de8fe242a3008db97c..499246a9856bb3ba67a155c6f00c3ad06af50edf 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -16,7 +16,7 @@ #include #include #include -#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/details/reference_count_op_handle.h" diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 96132a2c18233ca10d7bad4e26dfabadd39d84db..73cec21e20f2fd26e144872f1f7b5bb7065adb74 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -114,36 +115,6 @@ void Executor::Close() { #endif } -void InitializeVariable(Variable* var, proto::VarType::Type var_type) { - if (var_type == proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else if (var_type == proto::VarType::FEED_MINIBATCH) { - var->GetMutable(); - } else if (var_type == proto::VarType::FETCH_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); - } else if (var_type == proto::VarType::LOD_RANK_TABLE) { - var->GetMutable(); - } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { - var->GetMutable(); - } else if (var_type == proto::VarType::PLACE_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::READER) { - var->GetMutable(); - } else if (var_type == proto::VarType::RAW) { - // GetMutable will be called in operator - } else { - PADDLE_THROW( - "Variable type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", - var_type); - } -} - void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id) { auto& global_block = pdesc.Block(block_id); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2728dbef93042158dffa26d8f56d529..2d47903ffbd8d821b7c31386b225fe5e65ca2720 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -26,7 +26,6 @@ limitations under the License. */ namespace paddle { namespace framework { -extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); template std::unordered_map GetNonPersistableReferenceCount( diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e4001e979fdd0774779fa288402c7847af90637 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" +namespace paddle { +namespace framework { + +void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + op_names_.clear(); + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + op_names_.push_back(op_desc->Type()); + OperatorBase* local_op_ptr = local_op.release(); + ops_.push_back(local_op_ptr); + continue; + } +} + +void ExecutorThreadWorker::CreateThreadResource( + const framework::ProgramDesc& program, + const paddle::platform::Place& place) { + CreateThreadScope(program); + CreateThreadOperators(program); + SetMainProgram(program); + SetPlace(place); +} + +void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) { + auto& block = program.Block(0); + + PADDLE_ENFORCE_NOT_NULL( + root_scope_, "root_scope should be set before creating thread scope"); + + thread_scope_ = &root_scope_->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } else { + auto* ptr = thread_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } +} + +void ExecutorThreadWorker::SetDataFeed( + const std::shared_ptr& datafeed) { + thread_reader_ = datafeed; +} + +void ExecutorThreadWorker::BindingDataFeedMemory() { + const std::vector& input_feed = + thread_reader_->GetUseSlotAlias(); + for (auto name : input_feed) { + thread_reader_->AddFeedVar(thread_scope_->Var(name), name); + } +} + +void ExecutorThreadWorker::SetFetchVarNames( + const std::vector& fetch_var_names) { + fetch_var_names_.clear(); + fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(), + fetch_var_names.end()); +} + +void ExecutorThreadWorker::SetDevice() { +#if defined _WIN32 || defined __APPLE__ + return; +#else + static unsigned concurrency_cap = std::thread::hardware_concurrency(); + int thread_id = this->thread_id_; + + if (thread_id < concurrency_cap) { + unsigned proc = thread_id; + + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(proc, &mask); + + if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } else { + CPU_ZERO(&mask); + if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) || + (CPU_ISSET(proc, &mask) == 0)) { + VLOG(3) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } + } + } else { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } +#endif +} + +template +void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { + auto inspect = lod_tensor.data(); + auto element_num = lod_tensor.numel(); + + std::ostringstream sstream; + sstream << var_name << " (element num " << element_num << "): ["; + sstream << inspect[0]; + for (int j = 1; j < element_num; ++j) { + sstream << " " << inspect[j]; + } + sstream << "]"; + + std::cout << sstream.str() << std::endl; +} + +void print_fetch_var(Scope* scope, std::string var_name) { + const LoDTensor& tensor = scope->FindVar(var_name)->Get(); + + if (std::type_index(tensor.type()) == + std::type_index(typeid(platform::float16))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(double))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int64_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(uint8_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int16_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int8_t))) { + print_lod_tensor(var_name, tensor); + } else { + VLOG(1) << "print_fetch_var: unrecognized data type:" + << tensor.type().name(); + } + + return; +} + +void ExecutorThreadWorker::TrainFiles() { + // todo: configurable + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + for (auto& op : ops_) { + op->Run(*thread_scope_, place_); + } + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; } + +void ExecutorThreadWorker::SetPlace(const platform::Place& place) { + place_ = place; +} + +void ExecutorThreadWorker::SetMainProgram( + const ProgramDesc& main_program_desc) { + main_program_.reset(new ProgramDesc(main_program_desc)); +} + +void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { + root_scope_ = g_scope; +} + +} // einit_modelnd namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h new file mode 100644 index 0000000000000000000000000000000000000000..13ec2442c46459116320236bf98f23c91340f389 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +void CreateTensor(Variable* var, proto::VarType::Type var_type); + +class ExecutorThreadWorker { + public: + ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} + ~ExecutorThreadWorker() {} + + void CreateThreadResource(const framework::ProgramDesc& program, + const paddle::platform::Place& place); + void SetThreadId(int tid); + void SetDebug(const bool debug) { debug_ = debug; } + void SetRootScope(Scope* g_scope); + // set cpu device in this function + // cpu binding is used by default + void SetDevice(); + // since we read data into memory that can not be accessed by program + // we need to bind memory of data with corresponding variables in program + // this function should be called after data feed is set + void BindingDataFeedMemory(); + // set data feed declared in executor + void SetDataFeed(const std::shared_ptr& datafeed); + // A multi-thread training function + void TrainFiles(); + // set fetch variable names from python interface assigned by users + void SetFetchVarNames(const std::vector& fetch_var_names); + + private: + void CreateThreadScope(const framework::ProgramDesc& program); + void CreateThreadOperators(const framework::ProgramDesc& program); + void SetMainProgram(const ProgramDesc& main_program_desc); + void SetPlace(const paddle::platform::Place& place); + + protected: + // thread index + std::shared_ptr thread_reader_; // shared queue, thread buffer + int thread_id_; + // operator name + std::vector op_names_; + // thread level, local operators for forward and backward + std::vector ops_; + // main program for training + std::unique_ptr main_program_; + // execution place + platform::Place place_; + // root scope for model parameters + Scope* root_scope_; + // a thread scope, father scope is global score which is shared + Scope* thread_scope_; + + private: + std::vector fetch_var_names_; + std::vector> fetch_values_; + bool debug_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 292f232ffce48593e1827fe2dfe1b8472360054e..6d8f020918d4e56fa7f125a659f7f8511ca067ca 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (op->HasAttr("is_test")) { + if (n->RuntimeHasAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index 9696441a21661db89146c448742a992d1f7df022..d9a68c7f1dd2a0dca5204719c4ce6cd9d68292a2 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -104,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(op->HasAttr("is_test")); + ASSERT_FALSE(node->RuntimeHasAttr("is_test")); } else { - ASSERT_TRUE(op->HasAttr("is_test")); + ASSERT_TRUE(node->RuntimeHasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 65be69b7f5b5e363d5d0753c45f9ff9e3f329fbe..1cf1315d3d3059261d84d0e8795a75df4deae005 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -22,7 +22,7 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { VLOG(3) << "Aplies MKL-DNN placement strategy."; for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) { + if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { n->Op()->SetAttr("use_mkldnn", true); } } diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 50d9113088903aa7681d6c6af5cc65f846d32787..7a88cb2b681c1aa5e1b75481b1aba66a125a1d9c 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace framework { @@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[]; const char Node::kControlDepVarName[] = "__control_var"; #endif -std::unique_ptr CreateNodeForTest(const std::string& name, +std::unique_ptr CreateNodeForTest(const std::string &name, Node::Type type) { return std::unique_ptr(new Node(name, type)); } + +bool Node::RuntimeHasAttr(const std::string &name) const { + if (Op()->HasAttr(name)) { + return true; + } else { + auto &op_info = OpInfoMap::Instance(); + auto op_type = Op()->Type(); + if (op_info.Has(op_type)) { + auto op_info_ptr = op_info.Get(op_type); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } + } + } + return false; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f19e9aab79098757dae663d030b0fa2b..1044a96430f060b750580ea0b225787ba6ebd2a0 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,6 +108,18 @@ class Node { Name().find(ir::Node::kControlDepVarName) != std::string::npos; } + // RuntimeHasAttr is different with HasAttr now. + // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr, + // thus, if stored program_desc_ are old which don't have an attr, a new + // library which adds the attr already will fail on this function. + // Details: + // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087 + // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above + // problem. + // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding + // RuntimeHasAttr. + bool RuntimeHasAttr(const std::string& name) const; + std::vector inputs; std::vector outputs; diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index e8295639520b5838dce3c9c9e443cc846bd9c1ec..f1642bc0d2b10f97295e80ee201db8f83bfd06ef 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -21,42 +21,11 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { - -// These code can be shared with Executor. -static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { - if (var_type == proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else if (var_type == proto::VarType::FEED_MINIBATCH) { - var->GetMutable(); - } else if (var_type == proto::VarType::FETCH_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); - } else if (var_type == proto::VarType::LOD_RANK_TABLE) { - var->GetMutable(); - } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { - var->GetMutable(); - } else if (var_type == proto::VarType::PLACE_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::READER) { - var->GetMutable(); - } else if (var_type == proto::VarType::RAW) { - // GetMutable will be called in operator - } else { - PADDLE_THROW( - "Variable type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", - var_type); - } -} - void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, int block_id, bool with_feed_fetch_ops) { if (!scope) { diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 8177436d0bd90c3bcf8f91d5c55b66be188b19f9..e22c29037718a60ff7f24404d7749600e2edb80b 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -15,23 +15,105 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include #include +#include #include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" #include "ngraph/ngraph.hpp" namespace paddle { namespace framework { +static std::shared_ptr GetNode( + const std::shared_ptr& op, const std::string prm, + const VariableNameMap& var_map, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = var_map.at(prm); + PADDLE_ENFORCE_EQ(var_names.size(), 1, + "op %s prm %s expects one associated var", op->Type(), prm); + if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { + return (*ngb_node_map)[var_names[0]]; + } else { + return nullptr; + } +} + +static std::shared_ptr GetInputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Inputs(), ngb_node_map); +} + +static std::shared_ptr GetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Outputs(), ngb_node_map); +} + +static void SetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr node, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = op->Outputs().at(prm); + if (var_names.size() == 1) { + (*ngb_node_map)[var_names[0]] = node; + } else if (var_names.size() == 0) { + (*ngb_node_map)[""] = node; + } else { + PADDLE_THROW("prm %s has more than 1 var_names.", prm); + } +} + +static bool HasOutput(const std::shared_ptr& op, + const std::string prm) { + auto& outputs = op->Outputs(); + if (outputs.find(prm) == outputs.end()) return false; + return outputs.at(prm).size() > 0; +} + +template +static void BuildBinaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = GetInputNode(op, "X", ngb_node_map); + auto y = GetInputNode(op, "Y", ngb_node_map); + auto out = std::make_shared(x, y); + SetOutputNode(op, "Out", out, ngb_node_map); +} + +template +static void BuildUnaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = GetInputNode(op, "X", ngb_node_map); + auto out = std::make_shared(input); + SetOutputNode(op, "Out", out, ngb_node_map); +} + std::map&, std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = {}; + NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode}, + {"tanh", BuildUnaryNode}}; -void NgraphBridge::build_graph(const std::shared_ptr& op) { +void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { auto& op_type = op->Type(); - NG_NODE_MAP[op_type](op, ngb_node_map); + NG_NODE_MAP[op_type](op, ngb_node_map_); } } // namespace framework diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h index 55bf0d21f3471013b1fb780e852d813313345f03..9ed6b9510942136a61faa5755fd8fa74286939a8 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -20,16 +20,14 @@ limitations under the License. */ #include #include #include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" - -#include "ngraph/ngraph.hpp" +#include "ngraph/node.hpp" namespace paddle { namespace framework { +class OperatorBase; + class NgraphBridge { public: static std::map< @@ -43,14 +41,14 @@ class NgraphBridge { std::shared_ptr< std::unordered_map>> var_node_map) - : ngb_node_map(var_node_map) {} + : ngb_node_map_(var_node_map) {} - void build_graph(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< std::unordered_map>> - ngb_node_map; + ngb_node_map_; }; } // namespace framework diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index d967b2780c21713a2f9a73a3402964103f44269e..3fea753f0659019395c9b214e52a7912058c501c 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -19,14 +19,29 @@ limitations under the License. */ #include #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type.h" +#include "ngraph/ngraph.hpp" + namespace paddle { namespace framework { +static ngraph::Shape Ddim2Shape(const DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 1 : k; + sp.push_back(k); + } + return sp; +} + static std::map pd2ng_type_map = { {proto::VarType::FP32, ngraph::element::f32}, {proto::VarType::FP64, ngraph::element::f64}, @@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */ PARTIAL_TEST /* Support partial list of ops for test */ } op_state; +// perform graph build through bridge and execute computation class NgraphOperator { public: explicit NgraphOperator(const Scope& scope, const platform::Place& place, @@ -59,13 +75,23 @@ class NgraphOperator { persistables_(persist), fetches_(fetches), post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) {} + ng_op_state_(ng_op_state) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + BuildNgIO(); + + GetNgFunction(); + } void Run(const Scope& scope, const platform::Place& place) const; private: static std::unordered_map> - func_cache; + func_cache_; const Scope& scope_; const platform::Place& place_; std::vector> fused_ops_; @@ -74,6 +100,35 @@ class NgraphOperator { std::unordered_set fetches_; std::unordered_set post_op_inputs_; op_state ng_op_state_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // cache key to check if function is cached + std::shared_ptr GetCacheKey(); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); }; std::vector>::iterator>> @@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals( } size_t size = ops->size(); size_t left = 0; - while (left < size && ops.at(left)->Type() != kFeedOpType) { + while (left < size && ops->at(left)->Type() != kFeedOpType) { ++left; } if (left == size) { @@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals( size_t start = pivot, end = start; while (pivot < right && (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops.at(pivot)->Type()) != + ops->at(pivot)->Type()) != paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { ++pivot; ++end; @@ -136,7 +191,9 @@ FusedOperator::FusedOperator( std::vector>::iterator end, const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { + : OperatorBase(type, inputs, outputs, attrs), + pdesc_(prog), + block_(block_id) { for (std::vector>::iterator it = start; it != end; ++it) { fused_ops_.push_back(std::move(*it)); @@ -152,7 +209,7 @@ FusedOperator::FusedOperator( } if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_complete = true; + is_full_ = true; } Process(); @@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope, } } - if (is_full) { + if (is_full_) { ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; } @@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope, ngraph_op.Run(scope, place); } +std::unordered_map> + NgraphOperator::func_cache_ = {}; + +std::shared_ptr NgraphOperator::backend_ = + ngraph::runtime::Backend::create("CPU"); + +void NgraphOperator::GetNgInputShape(std::shared_ptr op) { + op->RuntimeInferShape(scope_, place_); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphOperator::BuildNgNodes() { + for (auto& var_name : var_out_) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + + paddle::framework::NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphOperator::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphOperator::BuildNgFunction() { + BuildNgNodes(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::op::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +std::shared_ptr NgraphOperator::GetCacheKey() { + auto cache_key = std::make_shared(""); + *cache_key += std::to_string(fused_ops_.size()); + for (auto& op : fused_ops_) { + *cache_key += op->Type(); + } + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + *cache_key += var_name; + *cache_key += var_type_map_.at(var_name).c_type_string(); + for (size_t i = 0; i < shape.size(); ++i) { + *cache_key += std::to_string(shape.at(i)); + } + } + + for (auto& var_name : var_out_) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + for (int i = 0; i < ddim.size(); ++i) { + *cache_key += std::to_string(ddim[i]); + } + } + } + return cache_key; +} + +void NgraphOperator::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string cache_key_val = *GetCacheKey(); + if (func_cache_.find(cache_key_val) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(cache_key_val); + } else { + BuildNgFunction(); + func_cache_[cache_key_val] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphOperator::Run(const Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + if (tensor_pd->type().hash_code() == + typeid(float).hash_code()) { // NOLINT + const float* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(int).hash_code()) { // NOLINT + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) { + const int64_t* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(double).hash_code()) { // NOLINT + const double* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(bool).hash_code()) { // NOLINT + const bool* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::boolean, sp, + const_cast(arr)); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto var_name = var_out_[i]; + auto* var = scope.FindVar(var_name); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(var_name); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", var_name); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); + } + } + + backend_->call(ngraph_function_, t_out, t_in); +} // NgraphOperator::RunImpl } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index 0f655cef1dde624bcf4944b5c096279097e1c8ae..3ca023e11111c5b447b2cabbfb8bb29877297f65 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -17,24 +17,19 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include -#include #include #include #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/variant.h" -#include "ngraph/ngraph.hpp" +#include "ngraph/type/element_type.hpp" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 19e5c2c73eac74dee030a4f7820531800f737e4e..e0bf5ed999f580f217af285bf97d0bc0232f1ded 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -31,6 +31,12 @@ class InferShapeBase { virtual void operator()(InferShapeContext*) const = 0; }; +class EstimateFlopsBase { + public: + virtual ~EstimateFlopsBase() = default; + virtual size_t operator()(InferShapeContext*) const = 0; +}; + struct OpInfo { OpCreator creator_; GradOpMakerFN grad_op_maker_; @@ -38,6 +44,7 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; + EstimateFlopsFN estimate_flops_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8bfdf3891203823826fd5bf919c176011f22213c..c6f3254e9f7cedcf47be8ce8c3eecf4aa1b57add 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name, "Tensor %s contains NAN", name); } +void OperatorWithKernel::RuntimeInferShape(const Scope& scope, + const platform::Place& place) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 5bd68f9ac2e1b30bc6ce3094960bb89842b99e01..0a6a28a5bce01d71cf56f25f5556033db94452c2 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -128,6 +128,8 @@ class OperatorBase { virtual std::vector OutputVars(bool has_intermediate) const; void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } + virtual void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const {} protected: std::string type_; @@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } + void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const override; + protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 2de6233a9e0d320ec9a06d547db3575eb61925c0..cdc5fa6862e3b2a2784151302f15540a0e9db8ff 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -54,5 +54,7 @@ using InferVarTypeFN = using InferShapeFN = std::function; +using EstimateFlopsFN = std::function; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc4525549caeebb06dea766ccb123b5ebc6d5b13 --- /dev/null +++ b/paddle/fluid/framework/variable_helper.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/variable_helper.h" + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +void InitializeVariable(Variable* var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", + var_type); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..0e0c72c3621dce0a6b372f9a9110a63fbc0a1d71 --- /dev/null +++ b/paddle/fluid/framework/variable_helper.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { +void InitializeVariable(Variable *var, proto::VarType::Type var_type); +} +} diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8fb464c0f5443f116815b14324f6cbc966dc6482..ec93729cd2b379dc2ac39b51df6799b74c8529b6 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) if(WITH_MKL) @@ -106,7 +116,7 @@ endif() if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} glog gflags protobuf snappystream snappy z xxhash ${EXTERNAL_LIB}) else() diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e63d57be57a66e8e02f7ef88acd01246302bc53c --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CudnnLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(Weight) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(Cache) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_h"), + "Output(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_c"), + "Output(last_c) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3."); + + ctx->SetOutputDim("Out", ctx->GetInputDim("Input")); + ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH")); + ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC")); + } +}; + +class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor) RNN input tensor, which support variable-time length input " + "sequence." + "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)" + "seq_len is the total time step in this mini-batch (CAN be change in " + "different batch)" + "batch_size is the instance number of this batch" + "input_size is the hidden size of the input." + "input_hidden_size and the hidden_size in the next may not be same"); + AddInput("InitH", + "(Tensor) the initial hidden state of the LSTM" + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("InitC", + "(Tensor) the initial cell state of the LSTm " + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("W", + "(Tensor) the learnable hidden-hidden weights." + " The shape is (N), where N is total weight size of the LSTM. " + " cudnn concatenate all the weight to one Tensor"); + AddInput("Cache", + "The cache of dropout op, a RAW type variable including random " + "number generator states and some descriptors, which is used in " + "cudnn kernel.") + .AsDispensable(); + AddOutput("Out", + "(Tensor) the hidden state of LSTM operator. " + "The shape is ( seq_len x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be ( seq_len x " + "batch_size x hidden_size * 2) "); + AddOutput("last_h", + "(Tensor) the hidden state of the last step. " + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddOutput("last_c", + "(Tensor) the cell state of the last step" + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirect is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size*2)"); + AddAttr("max_len", + "max length of the LSTM op" + "the first dim of the Input can NOT be greater than max_len") + .SetDefault(20); + AddAttr( + "dropout_prob", + "dropout prob of the dropout op" + "the dropout ONLY work between lstm layers, not between time steps" + "There is no dropout work on the Out tensor") + .SetDefault(0.0); + AddAttr("is_bidirec", + "is_bidirec" + "if it is bidirection rnn" + "The will affect the shape of the Out, last_h, and last_c") + .SetDefault(false); + AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); + AddAttr("hidden_size", "hidden size of the LSTM").SetDefault(100); + AddAttr("num_layers", "the total layer number of the LSTM") + .SetDefault(1); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(-1); + AddComment(R"DOC( +CUDNN LSTM implementation + +A four-gate Long Short-Term Memory network with no peephole connections. +In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, +the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + +$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + +$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + +$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + +$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + +$$ h_t = o_t \\odot tanh(c_t) $$ + +- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) +- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). +- sigmoid is the logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- `tanh` is the activation functions. +- $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + +Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +X represensts a matrix multiplication + + +)DOC"); + } +}; + +class CudnnLSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_h"), + "Input(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_c"), + "Input(last_c) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(last_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) { + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + } + }; + + SetOutGradDim("Input"); + SetOutGradDim("W"); + SetOutGradDim("InitH"); + SetOutGradDim("InitC"); + } +}; + +template +class NotImpleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW( + "CPU is not support for this kernel now. Will be add in the future"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); + +REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); +REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e01070c7b8ed4374cf8a61cfde4de940b4ea38b2 --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -0,0 +1,485 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + Tensor reserve_data_; + Tensor workspace_data_; + + Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, + size_t max_len, int batch_size, int input_size, int hidden_size, + int num_layers, float dropout_prob, bool is_bidirec, int seed, + int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = + dropout_state_.mutable_data(ctx.GetPlace()); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(ctx.GetPlace()); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(ctx.GetPlace()); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +template +class CudnnLSTMGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const Tensor *x = ctx.Input("Input"); + const Tensor *init_h = ctx.Input("InitH"); + const Tensor *init_c = ctx.Input("InitC"); + + auto w = ctx.Input("W"); + + Tensor *out = ctx.Output("Out"); + Tensor *last_h = ctx.Output("last_h"); + Tensor *last_c = ctx.Output("last_c"); + + const T *x_data = x->data(); + const T *init_h_data = init_h->data(); + const T *init_c_data = init_c->data(); + + const T *w_data = w->data(); + + T *out_data = out->mutable_data(ctx.GetPlace()); + T *last_h_data = last_h->mutable_data(ctx.GetPlace()); + T *last_c_data = last_c->mutable_data(ctx.GetPlace()); + + size_t max_len = ctx.Attr("max_len"); + float dropout_prob = ctx.Attr("dropout_prob"); + bool is_bidirec = ctx.Attr("is_bidirec"); + int input_size = ctx.Attr("input_size"); + int hidden_size = ctx.Attr("hidden_size"); + int num_layers = ctx.Attr("num_layers"); + bool is_test = ctx.Attr("is_test"); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + if (!cache_var) { + // The RAW type cache variable wouldn't be created and broadcasted on + // multi-devices before the first running. + // use parent scope to make cache persistable + auto *scope = const_cast(ctx.scope().parent()); + auto cache_var_name = ctx.Inputs("Cache")[0]; + cache_var = scope->Var(cache_var_name); + } + CudnnRNNCache *cudnn_rnn_cache = nullptr; + if (cache_var->IsInitialized()) { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + } else { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + std::random_device rnd; + int seed = ctx.Attr("seed"); + if (seed == -1) { + seed = rnd(); + } + + auto input_w_numel = w->numel(); + auto batch_size = x->dims()[1]; + cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, + hidden_size, num_layers, dropout_prob, is_bidirec, + seed, input_w_numel); + } + + auto run_seq_len = x->dims()[0]; + + if (is_test) { + // for inference + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_)); + } else { + // for train + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, + cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } + } +}; + +template +class CudnnLSTMGPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *input = ctx.Input("Input"); + auto *weight = ctx.Input("W"); + auto *init_h = ctx.Input("InitH"); + auto *init_c = ctx.Input("InitC"); + // auto * last_h = ctx.Input("last_h"); + // auto * last_c = ctx.Input("last_c"); + auto *out = ctx.Input("Out"); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *last_h_grad = ctx.Input(framework::GradVarName("last_h")); + auto *last_c_grad = ctx.Input(framework::GradVarName("last_c")); + + // auto* init_h = ctx.Input("init_h"); + // auto* init_c = ctx.Input("init_c"); + + auto *in_grad = ctx.Output(framework::GradVarName("Input")); + auto *weight_grad = ctx.Output(framework::GradVarName("W")); + auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); + auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + PADDLE_ENFORCE(cache_var->IsInitialized()); + CudnnRNNCache *cudnn_rnn_cache = + const_cast(cache_var) + ->GetMutable(); + + auto input_dims = input->dims(); + auto weight_dims = weight->dims(); + auto init_h_dims = init_h->dims(); + auto init_c_dims = init_c->dims(); + in_grad->mutable_data(ctx.GetPlace()); + weight_grad->mutable_data(ctx.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + zero(dev_ctx, weight_grad, static_cast(0.0)); + + T *init_h_grad_data = NULL; + if (init_h_grad == nullptr) { + Tensor init_h_grad_temp; + init_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &init_h_grad_temp, static_cast(0.0)); + + init_h_grad_data = init_h_grad_temp.data(); + } else { + init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, init_h_grad, static_cast(0.0)); + init_h_grad_data = init_h_grad->data(); + } + + T *init_c_grad_data = NULL; + if (init_c_grad == nullptr) { + Tensor init_c_grad_temp; + init_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &init_c_grad_temp, static_cast(0.0)); + + init_c_grad_data = init_c_grad_temp.data(); + } else { + init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, init_c_grad, static_cast(0.0)); + init_c_grad_data = init_c_grad->data(); + } + + const T *last_h_grad_data = NULL; + if (last_h_grad == nullptr) { + Tensor last_h_grad_temp; + last_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &last_h_grad_temp, static_cast(0.0)); + + last_h_grad_data = (const T *)last_h_grad_temp.data(); + } else { + last_h_grad_data = last_h_grad->data(); + } + + const T *last_c_grad_data = NULL; + if (last_c_grad == nullptr) { + Tensor last_c_grad_temp; + last_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &last_c_grad_temp, static_cast(0.0)); + + last_c_grad_data = (const T *)last_c_grad_temp.data(); + } else { + last_c_grad_data = last_c_grad->data(); + } + + const T *out_grad_data = NULL; + if (out_grad == nullptr) { + Tensor out_grad_temp; + out_grad_temp.mutable_data(out->dims(), ctx.GetPlace()); + zero(dev_ctx, &out_grad_temp, static_cast(0.0)); + + out_grad_data = (const T *)out_grad_temp.data(); + } else { + out_grad_data = out_grad->data(); + } + + // zero( dev_ctx, last_h_grad, static_cast(0.0)); + // zero( dev_ctx, last_c_grad, static_cast(0.0)); + + auto out_data = out->data(); + // auto out_grad_data = out_grad->data(); + auto weight_data = weight->data(); + auto init_h_data = init_h->data(); + auto init_c_data = init_c->data(); + auto in_grad_data = in_grad->data(); + + auto work_data = cudnn_rnn_cache->workspace_data_.data(); + auto reserve_data = cudnn_rnn_cache->reserve_data_.data(); + + auto run_seq_len = input_dims[0]; + PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_, + "cudnn running seq_len CAN not greater max_lengh"); + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_, + out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data, + cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, + weight_data, cudnn_rnn_cache->hx_desc_, init_h_data, + cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_, + in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data, + cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data, + cudnn_rnn_cache->workspace_size_, reserve_data, + cudnn_rnn_cache->reserve_size_)); + + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, input->data(), cudnn_rnn_cache->hx_desc_, + init_h->data(), cudnn_rnn_cache->y_desc_, out->data(), + cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_, + weight_grad->data(), cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel); +REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel); diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 0258f8f2384669c8a7466fd3c60f9b55a0fde9fd..9722f8c96e91d2dfbe929dcc11645a40c44afb4e 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include #include #include @@ -20,7 +21,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/string/printf.h" diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 36156a1f6174631dd084c8dc63dc432f5275008e..6a0d6bad512fe7cc15e60ed25028bc3cbbbca2ab 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids, while (idy < K) { int64_t id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); T *out = output + idy * D; const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, int idy = blockIdx.x + threadIdx.y * GridDimX; while (idy < K) { - int id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + int64_t id = ids[idy]; + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h index fb370842d1942c3b3eebecb1fe5e8ffb845cb34b..4ab5cfe53c67eeaa995d7e955eec63a065c5eec5 100644 --- a/paddle/fluid/operators/metrics/auc_op.h +++ b/paddle/fluid/operators/metrics/auc_op.h @@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel { const auto *label_data = label->data(); for (size_t i = 0; i < batch_size; i++) { - uint32_t binIdx = static_cast( - inference_data[i * inference_width + 1] * num_thresholds); + auto predict_data = inference_data[i * inference_width + 1]; + PADDLE_ENFORCE_LE(predict_data, 1, + "The predict data must less or equal 1."); + PADDLE_ENFORCE_GE(predict_data, 0, + "The predict data must gather or equal 0."); + + uint32_t binIdx = static_cast(predict_data * num_thresholds); if (label_data[i]) { (*stat_pos)[binIdx] += 1.0; } else { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 193de05422bb78572c0e5eaf4cd46744c3bcb113..14746fa95159d707be7c10c69a4ffc2211e17a93 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: @@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr("ignore_index", + "(int, default kIgnoreIndex), Specifies a target value that " + "is ignored and" + "does not contribute to the input gradient.") + .SetDefault(kIgnoreIndex); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index faef72866eb491887bbf221d32a8121b21fc3c66..b8731c232753074fa9e76b028485d3598c9a7295 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -15,33 +15,72 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/legacy/utils/Logging.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SigmoidCrossEntropyWithLogitsForward { + HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); + return term1 - term2 + term3; + } + + int ignore_index; +}; + +template +struct SigmoidCrossEntropyWithLogitsBackward { + HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + return simoid_x - label; + } + + int ignore_index; +}; + // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - framework::Tensor *Out = context.Output("Out"); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); + int ignore_index = context.Attr("ignore_index"); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto out = framework::EigenVector::Flatten(*Out); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto out = EigenVector::Flatten(*Out); auto &place = *context.device_context().eigen_device(); - // term1 = max(x, 0) - auto term1 = x.cwiseMax(static_cast(0)); - // term2 = x * labels - auto term2 = x * labels; - // term3 = log(1 + exp(-abs(x))) - auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); - - out.device(place) = term1 - term2 + term3; + out.device(place) = x.binaryExpr( + labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); } }; @@ -50,23 +89,23 @@ template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - const framework::Tensor *dOut = - context.Input(framework::GradVarName("Out")); - framework::Tensor *dX = - context.Output(framework::GradVarName("X")); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); dX->mutable_data(context.GetPlace()); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto dout = framework::EigenVector::Flatten(*dOut); - auto dx = framework::EigenVector::Flatten(*dX); + auto ignore_index = context.Attr("ignore_index"); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto dout = EigenVector::Flatten(*dOut); + auto dx = EigenVector::Flatten(*dX); auto &place = *context.template device_context().eigen_device(); - auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); - dx.device(place) = dout * (sigmoid_x - labels); + auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( + static_cast(ignore_index))); + dx.device(place) = dout * diff; } }; diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 2ce9b31bb81de867ff4ed6ee14afddecd95317b9..2e8fa7c1b8f7f7b8f3154aae691bb100375981dd 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -36,6 +36,15 @@ limitations under the License. */ asm("trap;"); \ } \ } while (0) + +#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \ + TOSTRING(e), m, c); \ + asm("trap;"); \ + } \ + } while (0) #else #include // For cuda, the assertions can affect performance and it is therefore @@ -43,4 +52,5 @@ limitations under the License. */ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion #define PADDLE_ASSERT(e) assert((e)) #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m)) +#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1)) #endif diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index db62377898339def415a13d185f85f34de326d7f..213cd8a9ce094512cea6f6405492ec8feff11516 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnFindConvolutionForwardAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnSetRNNDescriptor); \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); + CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 25d241d9768c16e1da304a78f259d5a626f702fc..d602613fc82223e14f48830a87533880696eb550 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..470e8b050808295d49728bbdb757b6a612df9a01 --- /dev/null +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +// To avoid conflicting definition in gcc-4.8.2 headers and pyconfig.h (2.7.3) +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif +#include +#include + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/async_executor.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/pybind/async_executor_py.h" + +namespace py = pybind11; +namespace pd = paddle::framework; + +namespace paddle { +namespace pybind { +using set_name_func = void (pd::DataFeedDesc::*)(const std::string&); +void BindAsyncExecutor(py::module* m) { + py::class_(*m, "AsyncExecutor") + .def(py::init([](framework::Scope* scope, const platform::Place& place) { + return std::unique_ptr( + new framework::AsyncExecutor(scope, place)); + })) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile); +} // end BindAsyncExecutor +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/async_executor_py.h b/paddle/fluid/pybind/async_executor_py.h new file mode 100644 index 0000000000000000000000000000000000000000..a99d6e04218c9310ede00de7d9bdfc015889bd22 --- /dev/null +++ b/paddle/fluid/pybind/async_executor_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindAsyncExecutor(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1835c064055635a4284fc64f4ca4dd8728f933ca..fc7991d2974c9262e6225de1537025944c1068c1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -42,6 +42,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/protobuf.h" @@ -932,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle. }); BindRecordIOWriter(&m); + BindAsyncExecutor(&m); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f7fefb3e5b767e25373665058d4fd6a298fb3d60..a1ffbf42622bcda44ec038edf20811fb0032891f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -20,6 +20,13 @@ from .framework import * # import all class inside executor into fluid module from . import executor from .executor import * + +from . import data_feed_desc +from .data_feed_desc import * + +from . import async_executor +from .async_executor import * + from . import trainer from . import inferencer @@ -54,7 +61,8 @@ Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ + parallel_executor.__all__ + lod_tensor.__all__ + \ + data_feed_desc.__all__ + async_executor.__all__ + [ 'io', 'initializer', 'layers', diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..2664a7301db3bf471126ff26504e7042f02b7d84 --- /dev/null +++ b/python/paddle/fluid/async_executor.py @@ -0,0 +1,151 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import contextlib +import six +from .framework import Program, default_main_program, Variable +from . import core +from .executor import global_scope, Executor +from paddle.fluid.proto import data_feed_pb2 +from google.protobuf import text_format +from . import io +from .data_feed_desc import DataFeedDesc + +__all__ = ['AsyncExecutor'] + + +class AsyncExecutor(object): + """ + An asynchronous Executor in Python. Through exploiting the power of + multi-core processor and data queueing, AsyncExecutor makes data reading + and cosuming decoupled, each run in multiple threads in parallel. + + Instead of reading data in python side, AsyncExecutor accepts a training + file list, which will be retrieved in C++, then training inputs will be + read, parsed and fed to training network within C++ code. + + AsyncExecutor is in active development and the API might change in the near + future. + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> startup_program = fluid.default_startup_program() + >>> main_program = fluid.default_main_program() + >>> filelist = ["train_data/part-%d" % i for i in range(100)] + >>> thread_num = len(filelist) / 4 + >>> + >>> place = fluid.CPUPlace() + >>> async_executor = fluid.AsyncExecutor(place) + >>> + >>> async_executor.run_startup_program(startup_program) + >>> + >>> epoch = 10 + >>> for i in range(epoch): + >>> async_executor.run(main_program, + >>> data_feed, + >>> filelist, + >>> thread_num, + >>> [acc], + >>> debug=False) + + Args: + place(fluid.CPUPlace|None): indicate the executor run on which device. + Only CPUPlace supported + + Note: + For debugging complicated network in parallel-GPUs, you can test it + on the executor. They has the exactly same arguments, and expected + the same results. + + Note: Only running on CPUPlace supported. + """ + + def __init__(self, place=None): + if place is None: + place = core.CPUPlace() + if not isinstance(place, core.CPUPlace): + raise ValueError("AsyncExecutor only supports CPU device") + + p = core.Place() + p.set_place(place) + + scope = global_scope() + self.executor = core.AsyncExecutor(scope, p) + + def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): + """ + Run program by this AsyncExecutor. Training dataset will be in filelist. + Users can also inspect certain variables by naming them in parameter + :code:`fetch`, like in fluid.Executor. Unlike fluid.Executor, however, + AsyncExecutor doesn't return fetched variables, instead, it will dump + the values of each fetched variable to stdandard output. + + Running the dataset will be on multiple threads, within each a thread + local scope will be created, then all OPs also created in that scope. + Parameters are updated by all the OPs simultaneously. + + Args: + program(Program): the program that need to run, if not provied, + then default_main_program will be used. + data_feed(DataFeedDesc): A DataFeedDesc object + filelist(str): a file containing the training dataset file list + thread_num(int): number of concurrent training threads. See + :code:`Note` for how to set this properly + fetch(str|list): the var name or a list of var names to inspect + debug(bool): When set to True, fetch vars will be printed to + standard output after each minibatch + + Note: + the executor will run all operators in the program but not only + the operators dependent by the fetch_list. + + Note: + Running AsyncExecutor will be on multiple threads, each bound to a + CPU core. To achieve best performance, it's suggested to set thread + num to be equal or slightly less than that of CPU cores. + """ + if program is None: + program = default_main_program() + program_desc = program.desc + + if data_feed is None: + raise ValueError('ValueError: data_feed should be provided') + + if filelist is None: + raise ValueError('ValueError: filelist should be provided') + + if isinstance(filelist, str): + filelist = [filelist] + + if not isinstance(thread_num, int): + raise TypeError('TypeError: thread_num should be a positive number') + + if fetch is not None: + if isinstance(fetch, Variable): + fetch = [fetch] + fetch_var_names = [var.name for var in fetch] + for fetch_var in fetch: + shape = fetch_var.shape + if shape[len(shape) - 1] != 1: + raise AssertionError( + "%s: Fetch variable has wrong shape. Only varibles " + "with the last dimension size 1 supported." % + (fetch_var.name)) + + self.executor.run_from_files(program_desc, + data_feed.desc(), filelist, thread_num, + fetch_var_names, debug) diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ec74d6cfdeb34c1f48c086a3aa30d5100c3efb --- /dev/null +++ b/python/paddle/fluid/data_feed_desc.py @@ -0,0 +1,152 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.proto import data_feed_pb2 +from google.protobuf import text_format + +__all__ = ['DataFeedDesc'] + + +class DataFeedDesc(object): + """ + Datafeed descriptor, describing input training data format. This class is + currently only used for AsyncExecutor (See comments for class AsyncExecutor + for a brief introduction) + + DataFeedDesc shall be initialized from a valid protobuf message from disk: + >>> data_feed = fluid.DataFeedDesc('data.proto') + + See :code:`paddle/fluid/framework/data_feed.proto` for message definition. + A typical message might look like: + + >>> name: "MultiSlotDataFeed" + >>> batch_size: 2 + >>> multi_slot_desc { + >>> slots { + >>> name: "words" + >>> type: "uint64" + >>> is_dense: false + >>> is_used: true + >>> } + >>> slots { + >>> name: "label" + >>> type: "uint64" + >>> is_dense: false + >>> is_used: true + >>> } + >>> } + + However, users usually shouldn't care about the message format; instead, + they are encouragd to use :code:`Data Generator` as a tool to generate a + valid data description, in the process of converting their raw log files to + training files acceptable to AsyncExecutor. + + DataFeedDesc can also be changed during runtime. Once you got familiar with + what each field mean, you can modify it to better suit your need. E.g.: + >>> data_feed.set_batch_size(128) + >>> data_feed.set_dense_slots('wd') # The slot named 'wd' will be dense + >>> data_feed.set_use_slots('wd') # The slot named 'wd' will be used + + Finally, the content can be dumped out for debugging purpose: + >>> print(data_feed.desc()) + + Args: + proto_file(string): Disk file containing a data feed description. + + """ + + def __init__(self, proto_file): + self.proto_desc = data_feed_pb2.DataFeedDesc() + with open(proto_file, 'r') as f: + text_format.Parse(f.read(), self.proto_desc) + if self.proto_desc.name == "MultiSlotDataFeed": + self.__name_to_index = { + slot.name: i + for i, slot in enumerate(self.proto_desc.multi_slot_desc.slots) + } + + def set_batch_size(self, batch_size): + """ + Set batch size. Will be effective during training + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_batch_size(128) + + Args: + batch_size: batch size + + """ + self.proto_desc.batch_size = batch_size + + def set_dense_slots(self, dense_slots_name): + """ + Set if a specific slot will be dense. Will be effective during training. + features for a dense slot will be fed into a Tensor, while those for a + sparse slot will be fed into a LoDTensor + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_dense_slots(['words']) + + Args: + dense_slots_name: a list of slot names which will be set dense + + Note: + Default is sparse for all slots + """ + if self.proto_desc.name != "MultiSlotDataFeed": + raise ValueError( + "Only MultiSlotDataFeed need set_dense_slots, pls check your datafeed.proto" + ) + for name in dense_slots_name: + self.proto_desc.multi_slot_desc.slots[self.__name_to_index[ + name]].is_dense = True + + def set_use_slots(self, use_slots_name): + """ + Set if a specific slot will be used for training. A dataset shall + contain a lot of features, through this function one can select which + ones will be used for a specific model. + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_use_slots(['words']) + + Args: + use_slots_name: a list of slot names which will be used in training + + Note: + Default is not used for all slots + """ + if self.proto_desc.name != "MultiSlotDataFeed": + raise ValueError( + "Only MultiSlotDataFeed need set_use_slots, pls check your datafeed.proto" + ) + for name in use_slots_name: + self.proto_desc.multi_slot_desc.slots[self.__name_to_index[ + name]].is_used = True + + def desc(self): + """ + Returns a protobuf message for this DataFeedDesc + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> print(data_feed.desc()) + + Returns: + A string message + """ + return text_format.MessageToString(self.proto_desc) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 288951cd7cd32155f136125fb817c35dd2ec6444..42c2484b284844a1f1acf53f79296e13da72676a 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -278,6 +278,7 @@ class Executor(object): p = core.Place() p.set_place(place) self.executor = core.Executor(p) + self.program_caches = dict() self._closed = False diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 472daff1eab164c2f12d940aa8fb87788c25d741..ac1c8657759cad33f939c64413e669ba9ae2aad3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,8 +169,11 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', + 'lstm', ] +kIgnoreIndex = -100 + def fc(input, size, @@ -472,6 +475,168 @@ def dynamic_lstm(input, return hidden, cell +def lstm(input, + init_h, + init_c, + max_len, + hidden_size, + num_layers, + dropout_prob=0.0, + is_bidirec=False, + is_test=False, + name=None, + default_initializer=None, + seed=-1): + """ + If Device is GPU, This op will use cudnn LSTM implementation + + A four-gate Long Short-Term Memory network with no peephole connections. + In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, + the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + + $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + + $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + + $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + + $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + + $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + + $$ h_t = o_t \\odot tanh(c_t) $$ + + - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) + - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). + - sigmoid is the logistic sigmoid function. + - $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. + - The $\odot$ is the element-wise product of the vectors. + - `tanh` is the activation functions. + - $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + + Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, + X represensts a matrix multiplication + + + Args: + input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) + init_h(Variable): The initial hidden state of the LSTM + This is a tensor with shape ( num_layers x batch_size x hidden_size) + if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) + init_c(Variable): The initial cell state of the LSTM. + This is a tensor with shape ( num_layers x batch_size x hidden_size ) + if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) + max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len + hidden_size (int): hidden size of the LSTM + num_layers (int): total layers number of the LSTM + dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps + There is NO dropout work on rnn output of the last RNN layers + is_bidirec (bool): If it is bidirectional + is_test (bool): If it is in test phrase + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + default_initializer(Initialize|None): Where use initializer to initialize the Weight + If set None, defaule initializer will be used + seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed + + + Returns: + rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) + if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) + last_h(Tensor): the hidden state of the last step of LSTM + shape is ( num_layers x batch_size x hidden_size ) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + last_c(Tensor): the cell state of the last step of LSTM + shape is ( num_layers x batch_size x hidden_size ) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + + + Examples: + .. code-block:: python + + input = embedding + batch_size = 20 + max_len = 100 + dropout_prob = 0.2 + input_size = 100 + hidden_size = 150 + num_layers = 1 + init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + + rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \ + max_len, dropout_prob, input_size, hidden_size, \ + num_layers) + """ + + helper = LayerHelper('cudnn_lstm', **locals()) + + dtype = input.dtype + input_shape = list(input.shape) + input_size = input_shape[-1] + weight_size = 0 + for i in range(num_layers): + if i == 0: + input_weight_size = (input_size * hidden_size) * 4 + else: + if is_bidirec: + input_weight_size = (hidden_size * 2 * hidden_size) * 4 + else: + input_weight_size = (hidden_size * hidden_size) * 4 + + hidden_weight_size = (hidden_size * hidden_size) * 4 + + if is_bidirec: + weight_size += (input_weight_size + hidden_weight_size) * 2 + weight_size += hidden_size * 8 * 2 + else: + weight_size += input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 + + weight = helper.create_parameter( + attr=helper.param_attr, + shape=[weight_size], + dtype=dtype, + default_initializer=default_initializer) + + out = helper.create_variable_for_type_inference(dtype) + last_h = helper.create_variable_for_type_inference(dtype) + last_c = helper.create_variable_for_type_inference(dtype) + + cache = helper.create_variable( + persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True) + + helper.append_op( + type='cudnn_lstm', + inputs={ + 'Input': input, + 'InitH': init_h, + 'InitC': init_c, + 'W': weight, + 'Cache': cache, + }, + outputs={ + 'Out': out, + 'last_h': last_h, + 'last_c': last_c, + }, + attrs={ + 'max_len': max_len, + 'is_bidirec': is_bidirec, + 'input_size': input_size, + 'hidden_size': hidden_size, + 'num_layers': num_layers, + 'is_test': is_test, + 'dropout_prob': dropout_prob, + 'seed': seed, + }) + return out, last_h, last_c + + def dynamic_lstmp(input, size, proj_size, @@ -1104,7 +1269,7 @@ def dropout(x, return out -def cross_entropy(input, label, soft_label=False, ignore_index=-100): +def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): """ **Cross Entropy Layer** @@ -1151,7 +1316,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): labels. Default: `False`. ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if soft_label is set to False. Default: -100 + if soft_label is set to False. Default: kIgnoreIndex Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. @@ -4250,8 +4415,15 @@ def ctc_greedy_decoder(input, blank, name=None): [0.5, 0.1, 0.3, 0.1]] input.lod = [[4, 4]] + + Computation: - Then: + step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get: + [[0], [2], [1], [0]] + step2: merge repeated tokens and remove blank which is 0. Then we get first output sequence: + [[2], [1]] + + Finally: output.data = [[2], [1], @@ -4259,6 +4431,7 @@ def ctc_greedy_decoder(input, blank, name=None): output.lod = [[2, 1]] + Args: input(Variable): (LoDTensor), the probabilities of @@ -4273,8 +4446,10 @@ def ctc_greedy_decoder(input, blank, name=None): name (str): The name of this layer. It is optional. Returns: - Variable: CTC greedy decode result. If all the sequences in result were - empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1]. + Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. + 'Lp' is the sum if all output sequences' length. If all the sequences + in result were empty, the result LoDTensor will be [-1] with + LoD [[]] and dims [1, 1]. Examples: .. code-block:: python @@ -5012,7 +5187,7 @@ def multiplex(inputs, index): def softmax_with_cross_entropy(logits, label, soft_label=False, - ignore_index=-100, + ignore_index=kIgnoreIndex, numeric_stable_mode=False, return_softmax=False): """ @@ -5070,7 +5245,7 @@ def softmax_with_cross_entropy(logits, labels as soft labels. By default, `soft_label` is set to False. ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if soft_label is set to False. Default: -100 + if soft_label is set to False. Default: kIgnoreIndex numeric_stable_mode (bool): A flag to indicate whether to use a more numerically stable algorithm. Only valid when soft_label is False and GPU is used. @@ -8242,13 +8417,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): @templatedoc() -def sigmoid_cross_entropy_with_logits(x, label, name=None): +def sigmoid_cross_entropy_with_logits(x, + label, + ignore_index=kIgnoreIndex, + name=None): """ ${comment} Args: x(${x_type}): ${x_comment} label(${label_type}): ${label_comment} + ignore_index(&{ignore_index}): ${ignore_index_comment} name(basestring|None): Name of the output. Returns: @@ -8267,7 +8446,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None): type="sigmoid_cross_entropy_with_logits", inputs={"X": x, "Label": label}, - attrs={}, + attrs={"ignore_index": ignore_index}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/demo/async_executor.py b/python/paddle/fluid/tests/demo/async_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..fe8da0aab74bd5fc6219666236a04423a6d60489 --- /dev/null +++ b/python/paddle/fluid/tests/demo/async_executor.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tarfile +import paddle.fluid as fluid +import paddle +from paddle.fluid import core + +URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz' +MD5 = '2a405a31508969b3ab823f42c0f522ca' + + +def bow_net(data, + label, + dict_dim=89528, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + models/fluid/PaddleNLP/text_classification/nets.py + """ + # embedding + emb = fluid.layers.embedding( + input=data, size=[dict_dim, emb_dim], is_sparse=True) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bowh = fluid.layers.tanh(bow) + # fc layer after conv + fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + # probability of each class + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + # cross entropy loss + cost = fluid.layers.cross_entropy(input=prediction, label=label) + # mean loss + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, acc, prediction + + +def train(): + # Download data + with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf: + tarf.extractall(path='./') + tarf.close() + + # Initialize dataset description + dataset = fluid.DataFeedDesc('train_data/data.prototxt') + dataset.set_batch_size(128) # See API doc for how to change other fields + print dataset.desc() # Debug purpose: see what we get + + # define network + # input text data + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost, acc, prediction = bow_net(data, label) + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) + opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) + + # Run startup program + startup_program = fluid.default_startup_program() + place = fluid.CPUPlace() + executor = fluid.Executor(place) + executor.run(startup_program) + + async_executor = fluid.AsyncExecutor(place) + main_program = fluid.default_main_program() + epochs = 10 + filelist = ["train_data/part-%d" % i for i in range(12)] + for i in range(epochs): + thread_num = 4 + async_executor.run( + main_program, # This can be changed during iteration + dataset, # This can be changed during iteration + filelist, # This can be changed during iteration + thread_num, # This can be changed during iteration + [data, acc], # Multiple fetch targets can be specified + debug=False) + fluid.io.save_inference_model('imdb/epoch%d.model' % i, + [data.name, label.name], [acc], executor) + + +if __name__ == "__main__": + train() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 271b9c740fd99554e9a7aa8d476a52cf6385b1d9..76a707efdc0804be0316ab12c347ffed6199529a 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -216,6 +216,15 @@ class OpTest(unittest.TestCase): self.dtype) outputs = append_input_output(block, op_proto, self.outputs, False, self.dtype) + + if hasattr(self, "cache_name_list"): + for name in self.cache_name_list: + inputs[name] = block.create_var( + name=name, + persistable=True, + type=core.VarDesc.VarType.RAW, + stop_gradient=True) + op = block.append_op( type=self.op_type, inputs=inputs, @@ -428,8 +437,17 @@ class OpTest(unittest.TestCase): op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() op_attrs = self.attrs if hasattr(self, "attrs") else dict() - self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs, - op_attrs) + + cache_list = None + if hasattr(self, "cache_name_list"): + cache_list = self.cache_name_list + self.op = create_op( + self.scope, + self.op_type, + op_inputs, + op_outputs, + op_attrs, + cache_list=cache_list) if no_grad_set is None: no_grad_set = set() diff --git a/python/paddle/fluid/tests/unittests/test_async_executor.py b/python/paddle/fluid/tests/unittests/test_async_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..43855b95f9e3096d58ca3e8acfdb25f034bab175 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_async_executor.py @@ -0,0 +1,142 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle +import unittest +import tarfile +import os +import shutil + +proto_str = ('name: "MultiSlotDataFeed"\n' + 'batch_size: 2\n' + 'multi_slot_desc {\n' + ' slots {\n' + ' name: "words"\n' + ' type: "uint64"\n' + ' is_dense: false\n' + ' is_used: true\n' + ' }\n' + ' slots {\n' + ' name: "label"\n' + ' type: "uint64"\n' + ' is_dense: false\n' + ' is_used: true\n' + ' }\n' + '}') + +URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz' +MD5 = '2a405a31508969b3ab823f42c0f522ca' + + +def bow_net(data, + label, + dict_dim=89528, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + models/fluid/PaddleNLP/text_classification/nets.py + """ + # embedding + emb = fluid.layers.embedding( + input=data, size=[dict_dim, emb_dim], is_sparse=True) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bowh = fluid.layers.tanh(bow) + # fc layer after conv + fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + # probability of each class + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + # cross entropy loss + cost = fluid.layers.cross_entropy(input=prediction, label=label) + # mean loss + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, acc, prediction + + +class TestAsyncExecutor(unittest.TestCase): + def setUp(self): + with open('./data.prototxt', 'w+') as f: + f.write(proto_str) + f.close() + + with tarfile.open(paddle.dataset.common.download(URL, "imdb", + MD5)) as tarf: + tarf.extractall(path='./') + tarf.close() + + def test_data_feed_desc(self): + data_feed = fluid.DataFeedDesc('./data.prototxt') + # assertEqueal(data_feed.proto_desc.batch, 2) + # assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2) + self.assertEqual(" ".join(data_feed.desc().split()), + " ".join(proto_str.split())) + + def test_run(self): + # Initialize dataset description + data_feed = fluid.DataFeedDesc('train_data/data.prototxt') + data_feed.set_batch_size( + 128) # See API doc for how to change other fields + + # define network + # input text data + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost, acc, prediction = bow_net(data, label) + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) + opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) + + # Run startup program + startup_program = fluid.default_startup_program() + place = fluid.CPUPlace() + executor = fluid.Executor(place) + executor.run(startup_program) + + main_program = fluid.default_main_program() + async_executor = fluid.AsyncExecutor(place) + + self.assertRaises(TypeError, async_executor.run) + self.assertRaises(TypeError, async_executor.run, main_program) + self.assertRaises(TypeError, async_executor.run, main_program, + data_feed) + + filelist = ['train_data/part-%d' % i for i in range(10)] + self.assertRaises(TypeError, async_executor.run, main_program, + data_feed, filelist) + + thread_num = 4 + self.assertRaises(TypeError, async_executor.run, main_program, + data_feed, filelist, thread_num) + + async_executor.run(main_program, data_feed, filelist, thread_num, [acc]) + fluid.io.save_inference_model("imdb.model", [data.name, label.name], + [acc], executor) + statinfo = os.stat('imdb.model/__model__') + self.assertGreater(statinfo.st_size, 0) + + os.remove('./data.prototxt') + shutil.rmtree('./train_data') + shutil.rmtree('./imdb.model') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 7f477031ddef6b459dcda74a18d0831763f4aeca..be51fb06a37a376f6f410336184c95981ded35dc 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -170,9 +170,10 @@ class TestBook(unittest.TestCase): with program_guard(program): dat = layers.data(name='data', shape=[10], dtype='float32') lbl = layers.data(name='label', shape=[10], dtype='float32') + ignore_index = -1 self.assertIsNotNone( layers.sigmoid_cross_entropy_with_logits( - x=dat, label=lbl)) + x=dat, label=lbl, ignore_index=ignore_index)) print(str(program)) def test_hsigmoid(self): diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0e9e2e8429e51a328e397f9e2a05ab7209c9c1a2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -0,0 +1,192 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle.fluid as fluid + +SIGMOID_THRESHOLD_MIN = -40.0 +SIGMOID_THRESHOLD_MAX = 13.0 +EXP_MAX_INPUT = 40.0 + + +def lstm_naive( + input, + w, ): + seq_len, batch_size, hidden_size = input.shape + + offset = 0 + wi = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wf = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wc = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wo = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + ri = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + rf = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + rc = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + ro = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + + bi_1 = w[offset:offset + hidden_size] + offset += hidden_size + bf_1 = w[offset:offset + hidden_size] + offset += hidden_size + bc_1 = w[offset:offset + hidden_size] + offset += hidden_size + bo_1 = w[offset:offset + hidden_size] + offset += hidden_size + + bi_2 = w[offset:offset + hidden_size] + offset += hidden_size + bf_2 = w[offset:offset + hidden_size] + offset += hidden_size + bc_2 = w[offset:offset + hidden_size] + offset += hidden_size + bo_2 = w[offset:offset + hidden_size] + + def sigmoid(x): + y = np.copy(x) + y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN + y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX + return 1. / (1. + np.exp(-y)) + + def tanh(x): + y = -2. * x + y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT + return (2. / (1. + np.exp(y))) - 1. + + output = [] + pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype) + pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype) + + for i in range(seq_len): + emb_1 = input[i] + + input_gate = sigmoid( + np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2) + forget_gate = sigmoid( + np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2) + output_gate = sigmoid( + np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2) + c_t_temp = tanh( + np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2) + new_c = input_gate * c_t_temp + forget_gate * pre_c + new_h = output_gate * tanh(new_c) + + pre_h = new_h + pre_c = new_c + + output.append(new_h) + + output = np.concatenate(output, -1) + output = output.reshape((batch_size, -1, hidden_size)) + + output = output.transpose((1, 0, 2)) + + return output, pre_h, pre_c + + +class TestCUDNNLstmOp(OpTest): + def setUp(self): + self.op_type = "cudnn_lstm" + self.dtype = np.float32 + + num_steps = 20 + batch_size = 5 + hidden_size = 20 + + input_weight_size = (hidden_size * hidden_size) * 4 + hidden_weight_size = (hidden_size * hidden_size) * 4 + weight_size = input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 + + input = np.random.uniform( + low=-0.1, high=0.1, size=(num_steps, batch_size, + hidden_size)).astype(self.dtype) + flat_w = np.random.uniform( + low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype) + + output, last_hidden, last_cell = lstm_naive(input, flat_w) + + init_h = np.zeros((batch_size, hidden_size), dtype=np.float32) + init_c = np.zeros((batch_size, hidden_size), dtype=np.float32) + scope = core.Scope() + program = fluid.Program() + block = program.global_block() + + cache_temp = block.create_var( + name="Cache", + persistable=True, + type=core.VarDesc.VarType.RAW, + stop_gradient=True) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'W': OpTest.np_dtype_to_fluid_dtype(flat_w), + 'InitH': OpTest.np_dtype_to_fluid_dtype(init_h), + 'InitC': OpTest.np_dtype_to_fluid_dtype(init_c), + } + self.cache_name_list = ['Cache'] + self.attrs = { + 'max_len': num_steps, + 'dropout_prob': 0.0, + 'is_bidirec': False, + 'input_size': hidden_size, + 'hidden_size': hidden_size, + 'num_layers': 1, + } + self.outputs = { + 'Out': output, + "last_h": last_hidden, + 'last_c': last_cell + } + + def test_output_with_place(self): + if self.testcuda(): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + + def test_grad_with_place(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'W', 'InitH', 'InitC']), + ['Out', 'last_h', 'last_c'], + max_relative_error=0.02) + + def testcuda(self): + return core.is_compiled_with_cuda() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 97ff203499c0bf223930c904de46e1abdd902799..41797a241cab9f2b3bc4b492a1c4b6db89ac2948 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): """Test sigmoid_cross_entropy_with_logit_op with probabalistic label """ + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = 64 + num_classes = 20 + ignore_index = -1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.randint(-1, 2, (batch_size, num_classes)) + .astype("float32") + } + self.attrs = {'ignore_index': ignore_index, } + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + out = -term1 - term2 + out[np.where(self.inputs['Label'] == ignore_index)] = 0 + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSigmoidCrossEntropyWithLogitsOp3(OpTest): + """Test sigmoid_cross_entropy_with_logit_op with probabalistic label + """ + def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" batch_size = 64 diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index 34fbb1b549cf5fc5f75bcc0715e5c83665f1d200..dc3b2cb8bc15836a4bf067caa05c3a37a917ecad 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -20,7 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def create_op(scope, op_type, inputs, outputs, attrs): +def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None): kwargs = dict() op_maker = core.op_proto_and_checker_maker @@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs): __create_var__(in_name, sub_in_name) else: __create_var__(in_name, in_name) + if cache_list != None and isinstance(cache_list, list): + for name in cache_list: + kwargs[name] = [] + scope.var(name) + kwargs[name].append(name) for out_name, out_dup in Operator.get_op_outputs(op_type): if out_name in outputs: diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index b9af8348e16c051db64d57a9594aee303d83aef2..a9dddbbcc82e649b6c98db0fd58c62b58435b8db 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase): for idx, i in enumerate(b()): elapsed_time = time.time() - last_time if i == 0: - time.sleep(0.3) + time.sleep(1) else: # read time should be short, meaning already buffered. - self.assertLess(elapsed_time, 0.05) + self.assertLess(elapsed_time, 0.08) last_time = time.time() diff --git a/python/setup.py.in b/python/setup.py.in index 200b96ec54ee5daeb905e155d0b7b57ab7740250..5aee26b63832889272cde09c553b4615efb8872a 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] -if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - if '${WITH_MKLDNN}' == 'ON': +if '${WITH_MKLDNN}' == 'ON': + if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. # TODO(typhoonzero): use install_name_tool to patch mkl libs once # we can support mkl on mac. # @@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" if os.system(command) != 0: raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] - shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) + package_data['paddle.libs']+=['libmkldnn.so.0'] + shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) if '${WITH_NGRAPH}' == 'ON': + # only change rpath in Release mode, + # since in Debug mode, nGraph lib may be too large to be changed? if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) + if os.name != 'nt': + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}" + else: + command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) shutil.copy('${NGRAPH_SHARED_LIB}', libs_path) shutil.copy('${NGRAPH_CPU_LIB}', libs_path) shutil.copy('${NGRAPH_TBB_LIB}', libs_path)