diff --git a/Dockerfile b/Dockerfile index 752fea5951bdc8c2cf79a17c960217c88ae62571..fc5069a6c080ed23317695e6822c4c46b5b5c7f9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -76,7 +76,8 @@ RUN easy_install -U pip && \ pip install sphinx-rtd-theme==0.1.9 recommonmark RUN pip install pre-commit 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python #For docstring checker RUN pip install pylint pytest astroid isort diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 48d3d2db7f63767195756f7f42ee91677b671d02..20dda35c5ccd98f5672d867c26ab97a215483543 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -54,7 +54,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f" + GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst index f292684fb5fe2df06db5239e7f43fdfa1dd2f2bd..0d644777287aea0a572adb6fa40f498f9c147af7 100644 --- a/doc/v2/faq/build_and_install/index_cn.rst +++ b/doc/v2/faq/build_and_install/index_cn.rst @@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包,可以用pip进行安装: 保存并关闭文件。 这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。 + +10. 通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so` +------------------------------------------------------------------------------------------ +出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`, +但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` +拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下, +即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。 + +**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。 \ No newline at end of file diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/contrib/inference/high_level_api.md new file mode 100644 index 0000000000000000000000000000000000000000..563b696143de9cbf67db38048bbd2f7c11b3a66e --- /dev/null +++ b/paddle/contrib/inference/high_level_api.md @@ -0,0 +1,59 @@ +# Inference High-level APIs +This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application. + +The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed. + +## PaddleTensor +We provide the `PaddleTensor` data structure is to give a general tensor interface. + +The definition is + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +The data is stored in a continuous memory `PaddleBuf`, and tensor's data type is specified by a `PaddleDType`. +The `name` field is used to specify the name of input variable, +that is important when there are multiple inputs and need to distiuish which variable to set. + +## engine +The inference APIs has two different underlying implementation, currently there are two valid engines: + +- the native engine, which is consists of the native operators and framework, +- the Anakin engine, which is a Anakin library embeded. + +The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, +but the Anakin engine can only take the Anakin model as input(user need to manully transform the format first) and currently not all Paddle models are supported. + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. +}; +``` + +## PaddlePredictor and how to create one +The main interface is `PaddlePredictor`, there are following methods + +- `bool Run(const std::vector& inputs, std::vector* output_data)` + - take inputs and output `output_data` +- `Clone` to clone a predictor from an existing one, with model parameter shared. + +There is a factory method to help create a predictor, and the user takes the ownership of this object. + +```c++ +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); +``` + +By specifying the engine kind and config, one can get an specific implementation. + +## Reference + +- [paddle_inference_api.h](./paddle_inference_api.h) +- [demos](./demo) diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index bd4530fcf9518cb3bf06179d8f60a1dde38ff7dd..38e3cc21413b9ab715b84f278f00b9df23cb7682 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -109,8 +109,7 @@ class PaddlePredictor { // The common configs for all the predictors. struct Config { - std::string model_dir; // path to the model directory. - bool enable_engine{false}; // Enable to execute (part of) the model on + std::string model_dir; // path to the model directory. }; }; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index d5ca061944f33939cea59a5275e691b1966194fa..1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -73,6 +73,9 @@ void BroadcastOpHandle::RunImpl() { int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; + int type = platform::ToNCCLDataType(in_tensor.type()); + size_t numel = static_cast(in_tensor.numel()); + for (auto out_var_handle : out_var_handles) { Variable *out_var = var_scopes.at(out_var_handle->scope_idx_) ->FindVar(out_var_handle->name_); @@ -87,13 +90,11 @@ void BroadcastOpHandle::RunImpl() { send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { - send_recv_buffer = - VariableVisitor::GetMutableTensor(out_var).mutable_data( - out_var_handle->place_); + send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) + .Resize(in_tensor.dims()) + .mutable_data(out_var_handle->place_); } - int type = platform::ToNCCLDataType(in_tensor.type()); - size_t numel = static_cast(in_tensor.numel()); broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { PADDLE_ENFORCE(platform::dynload::ncclBcast( diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 99dcaf27134e879fa85f57e5a675382442e9edf2..a6fe64fa80d6bf036893d49de56d7274d49a3b30 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -351,7 +351,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); - auto var = new VarHandle(vars.size() - 1, i, og, p); + auto var = new VarHandle(vars.size(), i, og, p); vars.emplace_back(var); op_handle->AddOutput(var); } @@ -447,8 +447,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, op_handle->AddInput(prev_grad.get()); } auto &vars = result->vars_[dst_dev_id][og]; - auto var = - new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]); + auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]); vars.emplace_back(var); op_handle->AddOutput(var); return var; diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index eb1c07630ab665a90d76b810a421cffb0ce673c2..0b6347bf51dc1c347073a0fdcf4ddd91865d846d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -47,7 +47,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #endif std::unique_ptr Build(const ProgramDesc &program) const override; - int GetVarDeviceID(const std::string &varname) const; + int GetVarDeviceID(const std::string &varname) const override; private: void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index f79565fe71c4aef140475c922cbbf5a1e0b7fe03..1f84c3b9e2d7ee9ae51959988fceeb3451b7b3b8 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/op_handle_base.h" +#include namespace paddle { namespace framework { @@ -122,11 +122,16 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event std::function method = callback; - + // NOTE(zcd): device context must be ordered here because RecordEvent + // will use a mutex to ensure the safe of multi-threads. + std::map ordered_ctxes; for (auto &p : dev_ctxes_) { + ordered_ctxes.emplace(p.second, p.first); + } + for (auto &p : ordered_ctxes) { method = [method, p, this]() { - static_cast(p.second)->RecordEvent( - events_.at(boost::get(p.first).device), + static_cast(p.first)->RecordEvent( + events_.at(boost::get(p.second).device), method); }; } diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 68fcc104d48b2b39929ed2198a2dd2eabae10e94..2cf14bd371831ab682166f4256d6966b5ab278c8 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -27,6 +27,7 @@ enum AttrType { BOOLEANS = 7; BLOCK = 8; LONG = 9; + BLOCKS = 10; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -46,6 +47,7 @@ message OpDesc { repeated bool bools = 11; optional int32 block_idx = 12; optional int64 l = 13; + repeated int32 blocks_idx = 14; }; message Var { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index e331c8128f2e8121dbbfe82b74ea35f2d0d399c0..d29d8ce1c561e45980d10c17c984ca2ed3b453f3 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { } std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { - PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); - if (!platform::is_cpu_place(t.place())) { LoDTensor tt; framework::TensorCopy(t, platform::CPUPlace(), &tt); @@ -70,7 +68,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - os << t.data()[i] << " "; + if (t.type().hash_code() == typeid(float).hash_code()) { + os << t.data()[i] << " "; + } else if (t.type().hash_code() == typeid(int64_t).hash_code()) { + os << t.data()[i] << " "; + } else { + PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); + } } return os; diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 6dfe7d2d8c1cce3360d99950240bc6de5a063dab..38d3cd96d65f0a54b0ea87b4c677013f3802adfb 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -26,6 +26,20 @@ namespace paddle { namespace framework { +TEST(LoD, PrintLoDTensor) { + LoDTensor tensor1; + tensor1.mutable_data(platform::CPUPlace()); + tensor1.data()[0] = 0.2; + tensor1.data()[1] = 0.5; + LOG(INFO) << tensor1; + + LoDTensor tensor2; + tensor2.mutable_data(platform::CPUPlace()); + tensor2.data()[0] = 1; + tensor2.data()[1] = 2; + LOG(INFO) << tensor2; +} + TEST(LoD, data) { LoD lod{{0, 1, 2}}; lod.push_back({0, 2, 4, 5}); @@ -37,7 +51,7 @@ TEST(LoD, data) { } } -TEST(LodExpand, test) { +TEST(LoD, ExpandLoD) { LoD lod{{0, 2}}; LoDTensor tensor; tensor.set_lod(lod); diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index f92769192c218eb7cdc2350ff6e4721b45005806..a190199f1cb1361f67f20c755b8e7ef52c284adc 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { need_update_ = true; } +void OpDesc::SetBlocksAttr(const std::string &name, + std::vector blocks) { + this->attrs_[name] = blocks; + need_update_ = true; +} + void OpDesc::SetAttrMap( const std::unordered_map &attr_map) { attrs_ = attr_map; @@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } + void operator()(const std::vector &v) const { + std::vector blocks_idx; + for (auto blk : v) { + blocks_idx.push_back(blk->ID()); + } + VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx()); + } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(int64_t v) const { attr_->set_l(v); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index a02d3e269129596f65a2fb346e76c1af7fbead95..74dd8ec002005dd080424b48b5db1a2574a6974f 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -77,6 +77,8 @@ class OpDesc { void SetBlockAttr(const std::string &name, BlockDesc *block); + void SetBlocksAttr(const std::string &name, std::vector blocks); + Attribute GetAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d478865fa8f24c653a4185cabd05747a5410ceaa..a6788cb6d5d6acb57998fb9b06dfaaf417912dde 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -121,7 +121,7 @@ ParallelExecutor::ParallelExecutor( #endif } - builder_ = std::move(builder_factory.Create()); + builder_ = builder_factory.Create(); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, builder_->Build(main_program))); diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 4879209ece9fdfea91e484a4118c00a2a2a2b4f7..e099e40f121ff13657e563eb608feecbca0551be 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -35,7 +35,8 @@ using VariableNameMap = std::map>; using Attribute = boost::variant, std::vector, std::vector, bool, - std::vector, BlockDesc*, int64_t>; + std::vector, BlockDesc*, int64_t, + std::vector>; using AttributeMap = std::unordered_map; diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 65d63784c1486f3304c053a2bd3c58b8b30eda2f..52f931188dc790682626b14da83d0835cad4f1a6 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include +#include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" @@ -75,6 +76,9 @@ bool GRPCClient::AsyncSendVar(const std::string& ep, var_h.scope = p_scope; var_h.name = var_name_val; var_h.ctx = p_ctx; + var_h.method = "Send"; + + VLOG(3) << var_h.String() << " begin"; // stub context SendProcessor* s = new SendProcessor(ch); @@ -129,6 +133,9 @@ bool GRPCClient::AsyncGetVar(const std::string& ep, var_h.scope = p_scope; var_h.name = var_name_val; var_h.ctx = p_ctx; + var_h.method = "Get"; + + VLOG(3) << var_h.String() << " begin"; // stub context GetProcessor* s = new GetProcessor(ch); @@ -172,6 +179,9 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep, var_h.scope = p_scope; var_h.name = out_var_name_val; var_h.ctx = p_ctx; + var_h.method = "Prefetch"; + + VLOG(3) << var_h.String() << " begin"; // stub context GetProcessor* s = new GetProcessor(ch); @@ -243,10 +253,11 @@ void GRPCClient::Proceed() { GPR_ASSERT(ok); PADDLE_ENFORCE(c); if (c->status_.ok()) { + VLOG(3) << c->var_h_.String() << " process"; c->Process(); } else { - LOG(FATAL) << "var: " << c->var_h_.String() - << " grpc error:" << c->status_.error_message(); + LOG(FATAL) << c->var_h_.String() + << " meets grpc error:" << c->status_.error_message(); } delete c; { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index a6efa7dfd11e87caafa6109391f133b0233d58dd..7875939ff510e7e41a2a11ca965b52eedff3d05c 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -47,14 +47,18 @@ namespace operators { namespace distributed { struct VarHandle { + // RPC endpoint. std::string ep; const platform::DeviceContext* ctx; const framework::Scope* scope; + // Variable name. std::string name; + // RPC method name. + std::string method; std::string String() const { std::ostringstream s; - s << "name:[" << name << "] ep:[" << ep << "]"; + s << method << " name:[" << name << "], ep:[" << ep << "]"; return s.str(); } }; diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index 707f665a29d83d7cdf6e4e80624f2402a7b0a2e7..b9a9b12cecdada570da5af173e394999554e9cb8 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -41,6 +41,19 @@ class RequestBase { virtual ~RequestBase() {} virtual void Process() = 0; + std::string Status2String(const std::string& method) { + std::string status = "Process"; + if (status_ == FINISH) { + status = "Finish"; + } + + std::ostringstream s; + s << method << " name:[" << GetReqName() << "]" + << ", ep:[" << ctx_.peer() << "]" + << " " << status << " using req_id:" << req_id_; + return s.str(); + } + CallStatus Status() const { std::lock_guard l(status_mu_); return status_; @@ -272,7 +285,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, int req_id) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { - VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; + LOG(WARNING) << "shutdown, do not TryToRegisterNewSendOne"; return; } @@ -306,14 +319,14 @@ void AsyncGRPCServer::HandleRequest( bool ok = false; while (true) { - VLOG(3) << "HandleRequest " << rpc_name << " wait next"; + VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!"; break; } int req_id = static_cast(reinterpret_cast(tag)); - VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id + VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id << " get next"; auto& reqs = rpc_reqs_[rpc_name]; @@ -324,22 +337,21 @@ void AsyncGRPCServer::HandleRequest( base = reqs[req_id]; } + VLOG(3) << base->Status2String(rpc_name); + // reference: // https://github.com/tensorflow/tensorflow/issues/5596 // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { LOG(WARNING) << "completion queue:" << rpc_name - << " recv no regular event:argument name[" - << base->GetReqName() << "]"; + << " recv no regular event" + << " context:" << base->Status2String(rpc_name); TryToRegisterNewOne(rpc_name, req_id); delete base; continue; } - VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id - << ", status:" << base->Status(); - switch (base->Status()) { case PROCESS: { base->Process(); diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 619890b1939be8777b89b94e415a3c2d63376658..45832c60bf9172497afabac927ba39a7cbfb9a52 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -76,6 +76,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, if (total_written + size_to_write > length) { size_to_write = length - total_written; } + // This log is useful to see how long a internal block size is of rpc. + VLOG(7) << "copy " << size_to_write << " data to CUDAPlace"; memory::Copy(boost::get(place), reinterpret_cast(p), cpu, data, size_to_write, gpu_dev_ctx.stream()); @@ -103,6 +105,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, } // TODO(gongwb): can we avoid copy? platform::CPUPlace cpu; + // This log is useful to see how long a internal block size is of rpc. + VLOG(7) << "copy " << size_to_write << " data to CPUPlace"; memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); p += size_to_write; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index f840064ecaca4bc38191727da39d07676dc18ee1..d98bf807a9464c1c2294aa0601386a940ddc00f8 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -101,17 +101,16 @@ void ListenAndServOp::RunSyncLoop( framework::Scope *recv_scope, const std::vector &prefetch_block_id_list) const { size_t num_blocks = program->Size(); + auto optimize_blocks = + Attr>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector optimize_block_id_list; - for (int blkid = 1; blkid < num_blocks; ++blkid) { - if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(), - blkid) == prefetch_block_id_list.end()) { - optimize_block_id_list.push_back(blkid); - } + std::vector optimize_blocks_idx; + for (auto blk : optimize_blocks) { + optimize_blocks_idx.push_back(blk->ID()); } - auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list); + auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx); // Insert placeholder for block0 which holds current op itself. optimize_prepared.insert( optimize_prepared.begin(), @@ -134,14 +133,14 @@ void ListenAndServOp::RunSyncLoop( // and this will still work. // The optimize blocks which have the same parent ID would run parallel // TODO(Yancey1989): need to use ParallelExecutor for future - int32_t last_parent_blkid = program->Block(1).Parent(); + int32_t last_parent_blkid = optimize_blocks[0]->Parent(); std::vector parallel_blkids; - parallel_blkids.push_back(1); + parallel_blkids.push_back(optimize_blocks[0]->ID()); double ts = GetTimestamp(); - for (size_t i = 1; i < optimize_block_id_list.size(); ++i) { + for (size_t i = 1; i < optimize_blocks.size(); ++i) { // skip the first optimize block because it is already in the // parallel_blkids. - int blkid = optimize_block_id_list[i]; + int blkid = optimize_blocks[i]->ID(); if (program->Block(blkid).Parent() != last_parent_blkid) { ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); @@ -261,8 +260,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, rpc_service_->RegisterRPC(distributed::kRequestPrefetch, request_prefetch_handler_.get()); - auto *optimize_block = Attr(kOptimizeBlock); - auto *program = optimize_block->Program(); + auto optimize_blocks = + Attr>(kOptimizeBlocks); + PADDLE_ENFORCE(optimize_blocks.size() >= 1, + "optimize blocks should be 1 at least on the pserver side."); + auto *program = optimize_blocks[0]->Program(); framework::Executor executor(dev_place); // prepare for prefetch @@ -339,8 +341,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { "a map from grad name to it's optimize block id") .SetDefault({}); AddAttr("sync_mode", "if works at sync_mode or not").SetDefault(true); - AddAttr(kOptimizeBlock, - "BlockID to run on server side."); + AddAttr>( + kOptimizeBlocks, "Optimize blocks to run on server side.") + .SetDefault({}); AddAttr>(kPrefetchVarNameToBlockId, "prefetch blocks to run on server side.") .SetDefault({}); diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 9aa322ad602d7a72bb90aaa4a67e7f2f7a3a54cd..634c1b4f4b541be9f4950a9ef48f944863486705 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -30,7 +30,7 @@ limitations under the License. */ namespace paddle { namespace operators { -constexpr char kOptimizeBlock[] = "OptimizeBlock"; +constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; void RunServer(std::shared_ptr service); diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index e550552b195b768d68ec64e9c3b5889b56ca719f..aee6180add5708d31f7ce927b37c4524a291fe3c 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -129,7 +129,10 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); + std::vector optimize_blocks; auto *optimize_block = program.AppendBlock(root_block); + optimize_blocks.push_back(optimize_block); + auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensors, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block, @@ -139,7 +142,7 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { attrs.insert({"Fanin", 1}); attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); - attrs.insert({"OptimizeBlock", optimize_block}); + attrs.insert({"optimize_blocks", optimize_blocks}); attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 14b57b11fefb2b726531cb164dbf479f8df26b24..6668e6b9e917eea7ba4a80ac78917b73eb827208 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -27,8 +27,81 @@ using paddle::platform::MKLDNNMemDesc; using mkldnn::memory; // Note: paddle has also "memory" namespace using mkldnn::primitive; using mkldnn::softmax_forward; +using mkldnn::softmax_backward; using mkldnn::prop_kind; using mkldnn::stream; +using platform::to_void_cast; + +class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { + public: + SoftmaxMKLDNNHandler( + std::shared_ptr softmax_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + softmax_pd_(softmax_pd) {} + + SoftmaxMKLDNNHandler( + std::shared_ptr softmax_pd, + std::shared_ptr softmax_bwd_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + softmax_pd_(softmax_pd), + softmax_bwd_pd_(softmax_bwd_pd) { + // If we are in Grad operatgor then update a key with BWD suffix to + // distinguish from FWD memory primitives + key_ += "-BWD"; + } + + std::shared_ptr AcquireSoftmax( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p) { + /*Generate key*/ + auto prim_key = key_ + "@softmax_p"; + + auto softmax_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false), + "Fail to find softmax primitive in device context"); + if (softmax_p == nullptr) { + softmax_p = std::make_shared( + *(softmax_pd_.get()), + *(static_cast(src_memory_p.get())), + *(static_cast(dst_memory_p.get()))); + dev_ctx_.SetBlob(prim_key, softmax_p); + } else { + is_reusing_ = true; + } + + return softmax_p; + } + + std::shared_ptr AcquireSoftmaxBackward( + std::shared_ptr dst_memory_p, + std::shared_ptr diff_dst_memory_p, + std::shared_ptr diff_src_memory_p) { + auto prim_key = key_ + "@softmax_bwd_p"; + auto softmax_bwd_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false), + "Fail to find softmax backward primitive in device context"); + if (softmax_bwd_p == nullptr) { + softmax_bwd_p = std::make_shared( + *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()), + *(diff_src_memory_p.get())); + dev_ctx_.SetBlob(prim_key, softmax_bwd_p); + } else { + is_reusing_ = true; + } + + return softmax_bwd_p; + } + + private: + std::shared_ptr softmax_pd_; + std::shared_ptr softmax_bwd_pd_; +}; template class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { @@ -54,56 +127,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Each MKLDNN operator may have diffrent hashing function - auto gethash = [](memory::dims& operand_dims) { - return std::string(std::to_string(operand_dims[0]) + "-" + - std::to_string(operand_dims[1])); - }; - const std::string key = gethash(softmax_tz); - const std::string key_softmax_p = key + "@softmax_p"; - const std::string key_softmax_src_mem_p = key + "@softmax_src_mem_p"; - const std::string key_softmax_dst_mem_p = key + "@softmax_dst_mem_p"; - - std::shared_ptr softmax_p = dev_ctx.GetBlob(key_softmax_p); - if (softmax_p == nullptr) { - // Currently only NC data format is supported - auto softmax_md = - MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc); - // Normalization is made after innermost dimension eg. C out of NC - auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, - softmax_md, 1 /*dim: C*/); - // create memory primitives - auto softmax_src_memory_p = std::make_shared( - memory::primitive_desc{softmax_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_softmax_src_mem_p, softmax_src_memory_p); - auto softmax_dst_memory_p = std::make_shared( - memory::primitive_desc{softmax_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_softmax_dst_mem_p, softmax_dst_memory_p); - - auto softmax_forward_pd = - std::make_shared(softmax_desc, - mkldnn_engine); - softmax_p = std::make_shared( - *(softmax_forward_pd.get()), - *(static_cast(softmax_src_memory_p.get())), - *(static_cast(softmax_dst_memory_p.get()))); - dev_ctx.SetBlob(key_softmax_p, softmax_p); - } else { - // Primitives already exist - auto src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_softmax_src_mem_p)); - PADDLE_ENFORCE(src_memory_p != nullptr, - "Fail to find softmax src mem_p in device context"); - auto dst_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_softmax_dst_mem_p)); - PADDLE_ENFORCE(dst_memory_p != nullptr, - "Fail to find softmax dst mem_p in device context"); - src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); - dst_memory_p->set_data_handle(output_data); - } + const std::string key = + platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out")); + const std::string key_softmax_pd = key + "@softmax_pd"; + + // Currently only NC data format is supported + auto softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + // Normalization is made after innermost dimension eg. C out of NC + auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, + softmax_md, 1 /*dim: C*/); + auto softmax_pd = std::make_shared( + softmax_desc, mkldnn_engine); + dev_ctx.SetBlob(key_softmax_pd, softmax_pd); + + SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key); + auto softmax_src_memory_p = + handler.AcquireSrcMemory(softmax_md, to_void_cast(input_data)); + auto softmax_dst_memory_p = + handler.AcquireDstMemory(softmax_md, to_void_cast(output_data)); + auto softmax_p = + handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); std::vector pipeline{ *(static_cast(softmax_p.get()))}; @@ -120,6 +164,77 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { } }; +template +class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + const Tensor* output = ctx.Input("Out"); + const T* dst_data = output->data(); + + auto* dout = ctx.template Input(framework::GradVarName("Out")); + const auto* diff_dst_ptr = dout->template data(); + + auto* dx = + ctx.template Output(framework::GradVarName("X")); + T* diff_src_ptr = dx->template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + std::vector src_tz(dst_tz); + PADDLE_ENFORCE(output->dims().size() == 2UL, + "The input of softmax op must be a 2D matrix."); + // MKL-DNN does support softmax over selected axis. Having 2D Tensor, + // we will make normalization after final eg. axis: 1 + PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), + "Softmax input and output dimensions should match"); + // Same memory descriptor to be used for input and output + memory::dims softmax_tz = {src_tz[0], src_tz[1]}; + // Currently only supports NC data format + // retrieve eltwise primitive desc from device context + const std::string key = + platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out")); + const std::string key_softmax_pd = key + "@softmax_pd"; + + auto softmax_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_softmax_pd)); + PADDLE_ENFORCE(softmax_pd != nullptr, + "Fail to find softmax_pd in device context"); + + // TODO(jczaja): Add layouts support when there is a need to do so + // Two dimensional softmax does support NC format + auto data_softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + auto diff_softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + // Normalization is made after innermost dimension eg. C out of NC + auto softmax_bwd_desc = + softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/); + auto softmax_bwd_pd = + std::make_shared( + softmax_bwd_desc, mkldnn_engine, *softmax_pd); + + SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx, + mkldnn_engine, key); + auto dst_memory_p = + handler.AcquireDstMemory(data_softmax_md, to_void_cast(dst_data)); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory( + diff_softmax_md, to_void_cast(diff_dst_ptr)); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory( + diff_softmax_md, to_void_cast(diff_src_ptr)); + + // Get primitve from device context + auto softmax_bwd_p = handler.AcquireSoftmaxBackward( + dst_memory_p, diff_dst_memory_p, diff_src_memory_p); + + std::vector pipeline{*softmax_bwd_p}; + stream(stream::kind::eager).submit(pipeline).wait(); + } +}; } // namespace operators } // namespace paddle @@ -127,3 +242,5 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace, ops::SoftmaxMKLDNNKernel); +REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::SoftmaxMKLDNNGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 847b3cbd1bd416ae1326211c98ba9d145c103298..31a7458f637921c290fc71ac748143867b4aae19 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -145,16 +145,30 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::StringToDataLayout(data_format), library_); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "float16 can only be used on GPU place"); + } + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, + library_); } }; diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 364c4901b297dbd647faae85b01f682a1daace9c..6dd19aaeffef8aa8a7d1997915908af04273d50c 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,11 +1,16 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc) + +# There is no macOS version of NCCL. +if (NOT APPLE) + list(APPEND CUDA_SRCS nccl.cc) +endif() + if (TENSORRT_FOUND) list(APPEND CUDA_SRCS tensorrt.cc) endif() - configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 7b8c29e1e642ec6bb4023afd8c083311b8b31812..a34e4371cccfd1be0d173fa11595e4368eb65b85 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -44,8 +44,10 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" +#ifndef __APPLE__ #include "paddle/fluid/platform/dynload/nccl.h" -#endif +#endif // __APPLE__ +#endif // PADDLE_WITH_CUDA namespace paddle { namespace platform { @@ -174,6 +176,7 @@ inline typename std::enable_if::type throw_on_error( throw std::runtime_error(err + string::Sprintf(args...)); } +#ifndef __APPLE__ template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { @@ -184,7 +187,7 @@ inline typename std::enable_if::type throw_on_error( string::Sprintf(args...)); } } - +#endif // __APPLE__ #endif // PADDLE_WITH_CUDA template diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 2689d5e0787e0164bfb8e539399d8a378964e50a..ed99932546446eb877c9701de15e2d37d29b5f88 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -105,5 +105,137 @@ inline mkldnn::memory::format GetMKLDNNFormat( memory.dst_primitive_desc().desc().data.format); } +class MKLDNNHandler { + public: + MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : dev_ctx_(dev_ctx), + engine_(engine), + key_(base_key), + is_reusing_(false) {} + + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_src_mem_p"); + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); + } + + std::shared_ptr AcquireDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); + } + + std::shared_ptr AcquireDiffSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::primitive_desc mdp, void* ptr, + const std::string& suffix) { + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mdp, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory(const mkldnn::memory::desc& md, + void* ptr, + const std::string& suffix) { + /*Generate key*/ + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory( + mkldnn::memory::primitive_desc& mpd, + mkldnn::memory::primitive_desc& user_mpd, + const std::shared_ptr user_memory_p, + const std::string& suffix, std::vector& pipeline) { + // create reorder primitive if the input format is not the preferred one + auto local_key = key_ + suffix; + auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (target_memory_p == nullptr) { + target_memory_p = user_memory_p; + std::shared_ptr reorder_p; + if (mpd != user_mpd) { + target_memory_p = std::make_shared(mpd); + + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + pipeline.push_back(*reorder_p); + } + dev_ctx_.SetBlob(local_key, target_memory_p); + } else { + // Make reorder if needed + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + pipeline.push_back(*reorder_p); + } + is_reusing_ = true; + } + return target_memory_p; + } + + static std::string GetHash(mkldnn::memory::dims& operand_dims, + const std::string& suffix) { + auto dims2str = [](const mkldnn::memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(operand_dims) + suffix; + }; + + protected: + const MKLDNNDeviceContext& dev_ctx_; + mkldnn::engine engine_; + std::string key_; + bool is_reusing_; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index bcf6d4dd3087060c016e53722cde80704ef2e834..fcd3356d44ee592233c3883d439d0677714900b8 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -268,7 +268,8 @@ void BindOpDesc(pybind11::module *m) { .value("STRINGS", pd::proto::AttrType::STRINGS) .value("BOOL", pd::proto::AttrType::BOOLEAN) .value("BOOLS", pd::proto::AttrType::BOOLEANS) - .value("BLOCK", pd::proto::AttrType::BLOCK); + .value("BLOCK", pd::proto::AttrType::BLOCK) + .value("BLOCKS", pd::proto::AttrType::BLOCKS); pybind11::class_ op_desc(*m, "OpDesc", ""); op_desc @@ -293,6 +294,7 @@ void BindOpDesc(pybind11::module *m) { .def("set_attr", &pd::OpDesc::SetAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) + .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) .def("set_serialized_attr", [](pd::OpDesc &self, const std::string &name, const pybind11::bytes &seriralized) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dc02c6632e2a5265daf0c2f9949bdb94beec4232..5a45e431df993febab676f22da7116d84e441548 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -167,9 +167,6 @@ PYBIND11_PLUGIN(core) { .def("set_lod", [](LoDTensor &self, const std::vector> &lod) { // the input lod is offset-based level-of-detail info - LOG(WARNING) - << "set_lod is deprecated and will be removed by 9.2018, " - "please switch to set_recursive_sequence_lengths."; LoD new_lod; new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); @@ -196,8 +193,6 @@ PYBIND11_PLUGIN(core) { .def("lod", [](LoDTensor &self) -> std::vector> { // output the offset-based lod info - LOG(WARNING) << "lod is deprecated and will be removed by 9.2018, " - "please switch to recursive_sequence_lengths."; LoD lod = self.lod(); std::vector> new_lod; new_lod.reserve(lod.size()); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e8b305326702cf04b752bb2eb413f848daa5ec7b..ff46c5f846999c03f44fd14758b740e46275a001 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -22,7 +22,7 @@ function print_usage() { echo -e "\n${RED}Usage${NONE}: ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" - + echo -e "\n${RED}Options${NONE}: ${BLUE}build${NONE}: run build for x86 platform ${BLUE}build_android${NONE}: run build for android platform @@ -198,7 +198,7 @@ function build_android() { fi ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API - + cat <'. + + Args: + input (Variable): prediction output of the network. + label (Variable): label of the test data set. + chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details. + num_chunk_types (int): the number of chunk type. + excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted. + + Returns: + tuple: tuple containing: precision, recall, f1_score + + Examples: + .. code-block:: python + + exe = fluid.executor(place) + evaluator = fluid.Evaluator.ChunkEvaluator(input, label) + for epoch in PASS_NUM: + evaluator.reset(exe) + for data in batches: + loss = exe.run(fetch_list=[cost]) + distance, instance_error = distance_evaluator.eval(exe) """ def __init__( @@ -130,11 +166,11 @@ class ChunkEvaluator(Evaluator): if main_program.current_block().idx != 0: raise ValueError("You can only invoke Evaluator in root block") - self.num_infer_chunks = self.create_state( + self.num_infer_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_infer_chunks') - self.num_label_chunks = self.create_state( + self.num_label_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_label_chunks') - self.num_correct_chunks = self.create_state( + self.num_correct_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_correct_chunks') precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( input=input, @@ -178,6 +214,8 @@ class ChunkEvaluator(Evaluator): class EditDistance(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance + instead. Accumulate edit distance sum and sequence number from mini-batches and compute the average edit_distance and instance error of all batches. @@ -188,15 +226,16 @@ class EditDistance(Evaluator): ignored_tokens(list of int): Tokens that should be removed before calculating edit distance. - Example: + Examples: + .. code-block:: python - exe = fluid.executor(place) - distance_evaluator = fluid.Evaluator.EditDistance(input, label) - for epoch in PASS_NUM: - distance_evaluator.reset(exe) - for data in batches: - loss = exe.run(fetch_list=[cost]) - distance, instance_error = distance_evaluator.eval(exe) + exe = fluid.executor(place) + distance_evaluator = fluid.Evaluator.EditDistance(input, label) + for epoch in PASS_NUM: + distance_evaluator.reset(exe) + for data in batches: + loss = exe.run(fetch_list=[cost]) + distance, instance_error = distance_evaluator.eval(exe) In the above example: 'distance' is the average of the edit distance in a pass. @@ -210,11 +249,11 @@ class EditDistance(Evaluator): if main_program.current_block().idx != 0: raise ValueError("You can only invoke Evaluator in root block") - self.total_distance = self.create_state( + self.total_distance = self._create_state( dtype='float32', shape=[1], suffix='total_distance') - self.seq_num = self.create_state( + self.seq_num = self._create_state( dtype='int64', shape=[1], suffix='seq_num') - self.instance_error = self.create_state( + self.instance_error = self._create_state( dtype='int64', shape=[1], suffix='instance_error') distances, seq_num = layers.edit_distance( input=input, label=label, ignored_tokens=ignored_tokens) @@ -256,9 +295,10 @@ class EditDistance(Evaluator): class DetectionMAP(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.DetectionMAP + instead. Calculate the detection mean average precision (mAP). - TODO (Dang Qingqing): update the following doc. The general steps are as follows: 1. calculate the true positive and false positive according to the input of detection and labels. @@ -293,17 +333,18 @@ class DetectionMAP(Evaluator): - 11point: the 11-point interpolated average precision. - integral: the natural integral of the precision-recall curve. - Example: + Examples: + .. code-block:: python - exe = fluid.executor(place) - map_evaluator = fluid.Evaluator.DetectionMAP(input, - gt_label, gt_box, gt_difficult) - cur_map, accum_map = map_evaluator.get_map_var() - fetch = [cost, cur_map, accum_map] - for epoch in PASS_NUM: - map_evaluator.reset(exe) - for data in batches: - loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) + exe = fluid.executor(place) + map_evaluator = fluid.Evaluator.DetectionMAP(input, + gt_label, gt_box, gt_difficult) + cur_map, accum_map = map_evaluator.get_map_var() + fetch = [cost, cur_map, accum_map] + for epoch in PASS_NUM: + map_evaluator.reset(exe) + for data in batches: + loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) In the above example: @@ -340,9 +381,10 @@ class DetectionMAP(Evaluator): evaluate_difficult=evaluate_difficult, ap_version=ap_version) - self.create_state(dtype='int32', shape=None, suffix='accum_pos_count') - self.create_state(dtype='float32', shape=None, suffix='accum_true_pos') - self.create_state(dtype='float32', shape=None, suffix='accum_false_pos') + self._create_state(dtype='int32', shape=None, suffix='accum_pos_count') + self._create_state(dtype='float32', shape=None, suffix='accum_true_pos') + self._create_state( + dtype='float32', shape=None, suffix='accum_false_pos') self.has_state = None var = self.helper.create_variable( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 159b0ca39eed547e4f3448e7ebf4807299d465b2..dc275674618ee147dad2e32c7db29132ab55eb29 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -18,7 +18,7 @@ from framework import Program, default_main_program, Variable from . import core __all__ = [ - 'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var' + 'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var' ] g_scope = core.Scope() @@ -35,7 +35,7 @@ def global_scope(): return g_scope -def switch_scope(scope): +def _switch_scope(scope): global g_scope ex = g_scope g_scope = scope @@ -57,12 +57,27 @@ def scope_guard(scope): Args: scope: The new global/default scope. """ - ex = switch_scope(scope) + ex = _switch_scope(scope) yield - switch_scope(ex) + _switch_scope(ex) def as_numpy(tensor): + """ + Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + Examples: + >>> import paddle.fluid as fluid + >>> outs = executor.run(...) + >>> np_outs = map(lambda x: as_numpy(x), outs) + >>> ... + + Args: + tensor(Variable): a instance of Tensor + + Returns: + numpy.ndarray + """ if isinstance(tensor, list): return [as_numpy(t) for t in tensor] assert isinstance(tensor, core.LoDTensor) @@ -186,7 +201,7 @@ def fetch_var(name, scope=None, return_numpy=True): return tensor -def get_program_cache_key(feed, fetch_list): +def _get_program_cache_key(feed, fetch_list): feed_var_names = feed.keys() def to_name_str(var): @@ -205,6 +220,25 @@ def get_program_cache_key(feed, fetch_list): class Executor(object): + """ + An Executor in Python, only support the single-GPU running. For multi-cards, please refer to + ParallelExecutor. + Python executor takes a program, add feed operators and fetch operators to this program according + to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides + the variables(or names) that user want to get after program run. Note: the executor will run all + operators in the program but not only the operators dependent by the fetch_list. + It store the global variables into the global scope, and create a local scope for the temporary + variables. The local scope contents will be discarded after every minibatch forward/backward finished. + But the global scope variables will be persistent through different runs. + All of ops in program will be running in sequence. + + Args: + place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device + + Note: For debugging complicated network in parallel-GPUs, you can test it on the executor. + They has the exactly same arguments, and expected the same results. + """ + def __init__(self, place): self.place = place p = core.Place() @@ -213,6 +247,23 @@ class Executor(object): self.program_caches = dict() def as_lodtensor(self, data): + """ + Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + + Examples: + >>> import paddle.fluid as fluid + >>> exe = fluid.executor(fluid.CPUPlace()) + >>> data = np.array(size=(100, 200, 300)) + >>> np_outs = map(lambda x: exe.as_lodtensor(x), data) + >>> ... + + Args: + data(numpy.ndarray): a instance of array + + Returns: + LoDTensor + """ if isinstance(data, list): raise RuntimeError("Some of your feed data hold LoD information. \ They can not be completely cast from a list of Python \ @@ -304,23 +355,47 @@ class Executor(object): scope=None, return_numpy=True, use_program_cache=False): - """ Run program by this Executor. Feed data by feed map, fetch result by fetch_list. - + """ + Run program by this Executor. Feed data by feed map, fetch result by fetch_list. Python executor takes a program, add feed operators and fetch operators to this program according to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides - the variables(or names) that user want to get after program run. Note: the executor will run all + the variables(or names) that user want to get after program run. + + Note: the executor will run all operators in the program but not only the operators dependent by the fetch_list - :param program: the program that need to run, if not provied, then default_main_program will be used. - :param feed: feed variable map, e.g. {"image": ImageData, "label": LableData} - :param fetch_list: a list of variable or variable names that user want to get, run will return them according - to this list. - :param feed_var_name: the name for the input variable of feed Operator. - :param fetch_var_name: the name for the output variable of feed Operator. - :param scope: the scope used to run this program, you can switch it to different scope. default is global_scope - :param return_numpy: if convert the fetched tensor to numpy - :param use_program_cache: set use_program_cache to true if program not changed compare to the last step. - :return: result according to fetch_list. + Args: + program(Program): the program that need to run, if not provied, then default_main_program will be used. + feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData} + fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list. + feed_var_name(str): the name for the input variable of feed Operator. + fetch_var_name(str): the name for the output variable of fetch Operator. + scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope + return_numpy(bool): if convert the fetched tensor to numpy + use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step. + + Returns: + + list(numpy.array): fetch result according to fetch_list. + + + Examples: + + >>> data = layers.data(name='X', shape=[1], dtype='float32') + >>> hidden = layers.fc(input=data, size=10) + >>> layers.assign(hidden, out) + >>> loss = layers.mean(out) + >>> adam = fluid.optimizer.Adam() + >>> adam.minimize(loss) + + >>> cpu = core.CPUPlace() + >>> exe = Executor(cpu) + >>> exe.run(default_startup_program()) + + >>> x = numpy.random.random(size=(10, 1)).astype('float32') + >>> outs = exe.run( + >>> feed={'X': x}, + >>> fetch_list=[loss.name]) """ if feed is None: feed = {} @@ -341,7 +416,7 @@ class Executor(object): if scope is None: scope = global_scope() - cache_key = get_program_cache_key(feed, fetch_list) + cache_key = _get_program_cache_key(feed, fetch_list) if use_program_cache: cached_program = self._get_program_cache(cache_key) if cached_program is None: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db21b1f3c03c40d79084b0dbb57d22f6d41fa270..4c1c8443a641cde40c392f1c647bc78d6cd3c13c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -558,15 +558,20 @@ class Operator(object): if (attr_name not in self.attrs) or ( self.attrs[attr_name] is None): continue - if isinstance(self.attrs[attr_name], Block): + attr_val = self.attrs[attr_name] + if isinstance(attr_val, Block): self.desc.set_block_attr(attr_name, self.attrs[attr_name].desc) - elif isinstance(self.attrs[attr_name], core.BlockDesc) or \ - isinstance(self.attrs[attr_name], core.ProgramDesc): + elif isinstance(attr_val, list) and attr_val and \ + all(isinstance(v, Block) for v in attr_val): + self.desc.set_blocks_attr(attr_name, + [v.desc for v in attr_val]) + elif isinstance(attr_val, core.BlockDesc) or \ + isinstance(attr_val, core.ProgramDesc): self.desc.set_serialized_attr( - attr_name, self.attrs[attr_name].serialize_to_string()) + attr_name, attr_val.serialize_to_string()) else: - self.desc.set_attr(attr_name, self.attrs[attr_name]) + self.desc.set_attr(attr_name, attr_val) self.desc.check_attrs() if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) @@ -715,6 +720,9 @@ class Operator(object): self.attrs[name] = val if isinstance(val, Block): self.desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + self.desc.set_blocks_attr(name, [v.desc for v in val]) elif isinstance(val, core.BlockDesc) or \ isinstance(val, core.ProgramDesc): self.desc.set_serialized_attr(name, val.serialize_to_string()) @@ -1387,7 +1395,11 @@ class Program(object): * Set for_test to True when we want to clone the program for testing. Notes: This API DOES NOT prune any operator. Use - :code:`clone(for_test=True)` before backward and optimization please. + :code:`clone(for_test=True)` before backward and optimization please. e.g. + + >>> test_program = fluid.default_main_program().clone(for_test=True) + >>> optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + >>> optimizer.minimize() Args: for_test(bool): True if change the :code:`is_test` attribute of diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index a568f61dcb2da976baa7847ae26281a34d6f88dd..cd1492da24d5e9d09a9eaac0b1b9c7aaffac6250 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -28,8 +28,8 @@ import math_op_patch from math_op_patch import * import detection from detection import * -import metric -from metric import * +import metric_op +from metric_op import * from learning_rate_scheduler import * __all__ = [] @@ -41,5 +41,5 @@ __all__ += control_flow.__all__ __all__ += ops.__all__ __all__ += device.__all__ __all__ += detection.__all__ -__all__ += metric.__all__ +__all__ += metric_op.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8d153b75cd49953770cfa89348914a375be82a82..f3ab47c96b1caa2facfd6d191af014b4c7380cbc 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -186,7 +186,6 @@ class ListenAndServ(object): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() - empty_block = Program().global_block() parent_block.append_op( type='listen_and_serv', @@ -195,8 +194,9 @@ class ListenAndServ(object): attrs={ 'endpoint': self.endpoint, 'Fanin': self.fan_in, - 'OptimizeBlock': current_block, - 'PrefetchBlock': empty_block, + 'optimize_blocks': [ + current_block + ], # did not support multiple optimize blocks in layers 'sync_mode': True, # did not support async now in layers 'grad_to_block_id': [""] }) diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric_op.py similarity index 99% rename from python/paddle/fluid/layers/metric.py rename to python/paddle/fluid/layers/metric_op.py index 58de1b6b9fe17a24203e93de6780190b9fc6b3e7..99e82fdd04282177fae63f1fb94b5e32d41c612e 100644 --- a/python/paddle/fluid/layers/metric.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -126,7 +126,7 @@ def auc(input, label, curve='ROC', num_thresholds=200): topk_out, topk_indices = nn.topk(input, k=k) auc_out = helper.create_tmp_variable(dtype="float32") helper.append_op( - type="accuracy", + type="auc", inputs={ "Out": [topk_out], "Indices": [topk_indices], diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2979ff3057a78ac3074cbb43b7a32966212073f6..be22bde4608807aff12ae8fa4b4c723211ffecce 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4920,16 +4920,16 @@ def random_crop(x, shape, seed=None): return out -def log(x): +def log(input): """ Calculates the natural log of the given input tensor, element-wise. .. math:: - Out = \\ln(x) + Out = \\ln(input) Args: - x (Variable): Input tensor. + input (Variable): Input tensor. Returns: Variable: The natural log of the input tensor computed element-wise. @@ -4938,27 +4938,27 @@ def log(x): .. code-block:: python - output = fluid.layers.log(x) + output = fluid.layers.log(input) """ helper = LayerHelper('log', **locals()) - dtype = helper.input_dtype() + dtype = helper.input_dtype(input_param_name='x') out = helper.create_tmp_variable(dtype) - helper.append_op(type="log", inputs={"X": input}, outputs={"Out": out}) + helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out}) return out -def relu(x): +def relu(input): """ Relu takes one input data (Tensor) and produces one output data (Tensor) - where the rectified linear function, y = max(0, x), is applied to + where the rectified linear function, y = max(0, input), is applied to the tensor elementwise. .. math:: - Out = \\max(0, x) + Out = \\max(0, input) Args: - x (Variable): The input tensor. + input (Variable): The input tensor. Returns: Variable: The output tensor with the same shape as input. @@ -4967,12 +4967,12 @@ def relu(x): .. code-block:: python - output = fluid.layers.relu(x) + output = fluid.layers.relu(input) """ helper = LayerHelper('relu', **locals()) - dtype = helper.input_dtype() + dtype = helper.input_dtype(input_param_name='x') out = helper.create_tmp_variable(dtype) - helper.append_op(type="relu", inputs={"X": input}, outputs={"Out": out}) + helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index b7a8bff30d3baffb7ec4d67a9bf6f5b00e3aa983..2a4fcf8ac18bbd441b6dc97be6d85cf87a2c5677 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -230,11 +230,11 @@ def sums(input, out=None): helper = LayerHelper('sum', **locals()) if out is None: out = helper.create_tmp_variable(dtype=helper.input_dtype()) - helper.append_op( - type='sum', - inputs={'X': input}, - outputs={'Out': out}, - attrs={'use_mkldnn': False}) + helper.append_op( + type='sum', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'use_mkldnn': False}) return out diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 572475b483ff0341a97a91b6c5309fcf337dacbe..c9cd881979a4ea4b14299ce219be4b5bd1f153fc 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -23,6 +23,8 @@ import warnings __all__ = [ 'MetricBase', 'CompositeMetric', + 'Precision', + 'Recall', 'Accuracy', 'ChunkEvaluator', 'EditDistance', @@ -46,33 +48,34 @@ def _is_number_or_matrix_(var): class MetricBase(object): """ - Base Class for all evaluators + Base Class for all Metrics. + MetricBase define a group of interfaces for the + model evaluation methods. Metrics accumulate metric states between + consecutive minibatches, at every minibatch, use update + interface to add current minibatch value to global states. + Use eval to compute accumative metric value from last reset() + or from scratch on. + If you need to custom a new metric, please inherit from MetricBase and + custom implementation. Args: - name(str): The name of evaluator. such as, "accuracy". Used for generate - temporary variable name. - Interface: - Note(*) : the states is the attributes who not has _ prefix. - - get_config(): print current states and configuration - reset(): clear the states. If the Metrics states type is not (int, float, np.ndarray), - Please override this method. - update(): update states at every minibatch - eval(): get metric evaluation in numpy type. + name(str): The name of metric instance. such as, "accuracy". + It needed if you want to distinct different metrics in a model. + """ - def __init__(self, name, **kwargs): + def __init__(self, name): self._name = str(name) if name != None else self.__class__.__name__ - self._kwargs = kwargs if kwargs != None else dict() - self.reset() def __str__(self): return self._name def reset(self): """ - states is the attributes who not has _ prefix. - reset the states of metrics. + reset clear the states of metrics. By default, the states + are the members who do not has _ prefix, reset set them to inital states. + If you violate the implicit name rule, please also custom the reset + interface. """ states = { attr: value @@ -90,61 +93,231 @@ class MetricBase(object): setattr(self, attr, None) def get_config(self): + """ + Get the metric and current states. + The states are the members who do not has "_" prefix. + + Args: + None + + Returns: + dict: a dict of metric and states + """ states = { attr: value for attr, value in self.__dict__.iteritems() if not attr.startswith("_") } - config = copy.deepcopy(self._kwargs) + config = {} config.update({"name": self._name, "states": copy.deepcopy(states)}) return config - def update(self): - raise NotImplementedError() + def update(self, preds, labels): + """ + Updates the metric states at every minibatch. + One user can compute the minibatch metric via pure Python, or + via a c++ operator. + + Args: + preds(numpy.array): the predictions of current minibatch + labels(numpy.array): the labels of current minibatch, if the label is one-hot + or soft-label, should custom the corresponding update rule. + """ + raise NotImplementedError( + "Should not use it directly, please extend it.") def eval(self): - raise NotImplementedError() + """ + Evalute the current metrics based the accumulated states. + + Returns: + float|list(float)|numpy.array: the metrics via Python. + """ + raise NotImplementedError( + "Should not use it directly, please extend it.") class CompositeMetric(MetricBase): """ - Compute multiple metrics in each minibatch. + Composite multiple metrics in one instance. for example, merge F1, accuracy, recall into one Metric. + + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + comp = fluid.metrics.CompositeMetric() + acc = fluid.metrics.Precision() + recall = fluid.metrics.Recall() + comp.add_metric(acc) + comp.add_metric(recall) + for pass in range(PASSES): + comp.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + comp.update(preds=preds, labels=labels) + numpy_acc, numpy_recall = comp.eval() """ - def __init__(self, name=None, **kwargs): - super(CompositeMetric, self).__init__(name, kwargs) + def __init__(self, name=None): + super(CompositeMetric, self).__init__(name) self._metrics = [] def add_metric(self, metric): + """ + add one metric instance to CompositeMetric. + + Args: + metric: a instance of MetricBase. + """ if not isinstance(metric, MetricBase): raise ValueError("SubMetric should be inherit from MetricBase.") self._metrics.append(metric) + def update(self, preds, labels): + """ + Update every metrics in sequence. + + Args: + preds(numpy.array): the predictions of current minibatch + labels(numpy.array): the labels of current minibatch, if the label is one-hot + or soft-label, should custom the corresponding update rule. + """ + for m in self._metrics: + ans.append(m.update(preds, labels)) + def eval(self): + """ + Evaluate every metrics in sequence. + + Returns: + list(float|numpy.array): a list of metrics value in Python. + """ ans = [] for m in self._metrics: ans.append(m.eval()) return ans +class Precision(MetricBase): + """ + Precision (also called positive predictive value) is the fraction of + relevant instances among the retrieved instances. + https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers + + Note Precision is different with Accuracy in binary classifiers. + accuracy = true positive / total instances + precision = true positive / all positive instance + + Examples: + .. code-block:: python + + metric = fluid.metrics.Precision() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_precision = metric.eval() + """ + + def __init__(self, name=None): + super(Precision, self).__init__(name) + self.tp = 0 # true positive + self.fp = 0 # false positive + + def update(self, preds, labels): + if not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray.") + if not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray.") + sample_num = labels[0] + for i in range(sample_num): + pred = preds[i].astype("int32") + label = labels[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + self.fp += 1 + + def eval(self): + ap = self.tp + self.fp + return float(self.tp) / ap if ap != 0 else .0 + + +class Recall(MetricBase): + """ + Recall (also known as sensitivity) is the fraction of + relevant instances that have been retrieved over the + total amount of relevant instances + + https://en.wikipedia.org/wiki/Precision_and_recall + + Examples: + .. code-block:: python + + metric = fluid.metrics.Recall() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_recall = metric.eval() + """ + + def __init__(self, name=None): + super(Recall, self).__init__(name) + self.tp = 0 # true positive + self.fn = 0 # false negtive + + def update(self, preds, labels): + if not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray.") + if not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray.") + sample_num = labels[0] + for i in range(sample_num): + pred = preds[i].astype("int32") + label = labels[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + if pred != label: + self.fn += 1 + + def eval(self): + recall = self.tp + self.fn + return float(self.tp) / recall if recall != 0 else .0 + + class Accuracy(MetricBase): """ Accumulate the accuracy from minibatches and compute the average accuracy for every pass. + https://en.wikipedia.org/wiki/Accuracy_and_precision Args: name: the metrics name - Example: - minibatch_accuracy = fluid.layers.accuracy(pred, label) - accuracy_evaluator = fluid.metrics.Accuracy() - for epoch in PASS_NUM: - accuracy_evaluator.reset() - for data in batches: - loss = exe.run(fetch_list=[cost, minibatch_accuracy]) - accuracy_evaluator.update(value=minibatch_accuracy, weight=batches) - accuracy = accuracy_evaluator.eval() + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + minibatch_accuracy = fluid.layers.accuracy(pred, label) + accuracy_evaluator = fluid.metrics.Accuracy() + for pass in range(PASSES): + accuracy_evaluator.reset() + for data in train_reader(): + batch_size = data[0] + loss = exe.run(fetch_list=[cost, minibatch_accuracy]) + accuracy_evaluator.update(value=minibatch_accuracy, weight=batch_size) + numpy_acc = accuracy_evaluator.eval() """ def __init__(self, name=None): @@ -153,6 +326,13 @@ class Accuracy(MetricBase): self.weight = .0 def update(self, value, weight): + """ + Update minibatch states. + + Args: + value(float|numpy.array): accuracy of one minibatch. + weight(int|float): batch size. + """ if not _is_number_or_matrix_(value): raise ValueError( "The 'value' must be a number(int, float) or a numpy ndarray.") @@ -163,9 +343,8 @@ class Accuracy(MetricBase): def eval(self): if self.weight == 0: - raise ValueError( - "There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy." - ) + raise ValueError("There is no data in Accuracy Metrics. \ + Please check layers.accuracy output has added to Accuracy.") return self.value / self.weight @@ -174,6 +353,25 @@ class ChunkEvaluator(MetricBase): Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. + For some basics of chunking, please refer to + 'Chunking with Support Vector Machines '. + ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, + and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. + + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( + input=pred, + label=label) + metric = fluid.metrics.ChunkEvaluator() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks) + numpy_precision, numpy_recall, numpy_f1 = metric.eval() """ def __init__(self, name=None): @@ -183,9 +381,17 @@ class ChunkEvaluator(MetricBase): self.num_correct_chunks = 0 def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): + """ + Update the states based on the layers.chunk_eval() ouputs. + Args: + num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. + num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. + num_correct_chunks(int|float|numpy.array): The number of chunks both in Inference and Label on the + given mini-batch. + """ if not _is_number_or_matrix_(num_infer_chunks): raise ValueError( - "The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray." + "The 'num_infer_chunks' must be a number(int) or a numpy ndarray." ) if not _is_number_or_matrix_(num_label_chunks): raise ValueError( @@ -212,21 +418,28 @@ class ChunkEvaluator(MetricBase): class EditDistance(MetricBase): """ + Edit distance is a way of quantifying how dissimilar two strings + (e.g., words) are to one another by counting the minimum number + of operations required to transform one string into the other. + Refer to https://en.wikipedia.org/wiki/Edit_distance + Accumulate edit distance sum and sequence number from mini-batches and compute the average edit_distance and instance error of all batches. Args: name: the metrics name - Example: - edit_distance_metrics = fluid.layers.edit_distance(input, label) - distance_evaluator = fluid.metrics.EditDistance() - for epoch in PASS_NUM: - distance_evaluator.reset() - for data in batches: - loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) - distance_evaluator.update(*edit_distance_metrics) - distance, instance_error = distance_evaluator.eval() + Examples: + .. code-block:: python + + distances, seq_num = fluid.layers.edit_distance(input, label) + distance_evaluator = fluid.metrics.EditDistance() + for epoch in PASS_NUM: + distance_evaluator.reset() + for data in batches: + loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) + distance_evaluator.update(distances, seq_num) + distance, instance_error = distance_evaluator.eval() In the above example: 'distance' is the average of the edit distance in a pass. @@ -264,16 +477,38 @@ class EditDistance(MetricBase): class DetectionMAP(MetricBase): """ Calculate the detection mean average precision (mAP). - - TODO (Dang Qingqing): update the following doc. - The general steps are as follows: - 1. calculate the true positive and false positive according to the input - of detection and labels. - 2. calculate mAP value, support two versions: '11 point' and 'integral'. - + mAP is the metric to measure the accuracy of object detectors + like Faster R-CNN, SSD, etc. + It is the average of the maximum precisions at different recall values. Please get more information from the following articles: https://sanchom.wordpress.com/tag/average-precision/ + https://arxiv.org/abs/1512.02325 + + The general steps are as follows: + + 1. calculate the true positive and false positive according to the input + of detection and labels. + 2. calculate mAP value, support two versions: '11 point' and 'integral'. + + Examples: + .. code-block:: python + + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + batch_map = layers.detection_map( + input, + label, + class_num, + background_label, + overlap_threshold=overlap_threshold, + evaluate_difficult=evaluate_difficult, + ap_version=ap_version) + metric = fluid.metrics.DetectionMAP() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, batch_map]) + batch_size = data[0] + metric.update(value=batch_map, weight=batch_size) + numpy_map = metric.eval() """ def __init__(self, name=None): @@ -302,17 +537,18 @@ class DetectionMAP(MetricBase): class Auc(MetricBase): """ - Auc Metrics which adapts to binary classification. - Need to note that auc metrics compute the value via Python natively. + Auc metric adapts to the binary classification. + Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + Need to note that auc metric compute the value via Python natively. If you concern the speed, please use the fluid.layers.auc instead. The `auc` function creates four local variables, `true_positives`, - `true_negatives`, `false_positives` and `false_negatives` that are used to - compute the AUC. To discretize the AUC curve, a linearly spaced set of - thresholds is used to compute pairs of recall and precision values. The area - under the ROC-curve is therefore computed using the height of the recall - values by the false positive rate, while the area under the PR-curve is the - computed using the height of the precision values by the recall. + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the AUC. To discretize the AUC curve, a linearly spaced set of + thresholds is used to compute pairs of recall and precision values. The area + under the ROC-curve is therefore computed using the height of the recall + values by the false positive rate, while the area under the PR-curve is the + computed using the height of the precision values by the recall. Args: name: metric name @@ -322,6 +558,16 @@ class Auc(MetricBase): curve. "NOTE: only implement the ROC curve type via Python now." + + Examples: + .. code-block:: python + + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + metric = fluid.metrics.Auc() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds, labels) + numpy_auc = metric.eval() """ def __init__(self, name, curve='ROC', num_thresholds=200): @@ -334,10 +580,10 @@ class Auc(MetricBase): self.tn_list = np.zeros((num_thresholds, )) self.fp_list = np.zeros((num_thresholds, )) - def update(self, labels, predictions, axis=1): + def update(self, preds, labels): if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") - if not _is_numpy_(predictions): + if not _is_numpy_(preds): raise ValueError("The 'predictions' must be a numpy ndarray.") kepsilon = 1e-7 # to account for floating point imprecisions diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 21182393bd68db4a379fc3ecf83fc85d27ca9490..219ab9bc2cc74a3c16f7bda69d4d782283574d7e 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -15,7 +15,7 @@ if(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 -list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 +list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 @@ -43,8 +43,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) -# TODO(wuyi): this test hungs on CI, will add it back later -list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -52,3 +50,4 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$ py_test_modules(test_dist_train MODULES test_dist_train SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 9dec2acb1d7101f8f00565c56e0469edb143d0c6..1cdc69501043d120b9e3cc8ccda3a1212d205886 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -94,7 +94,7 @@ class TestListenAndServOp(OpTest): self._wait_ps_ready(p1.pid) # raise SIGTERM to pserver - os.kill(p1.pid, signal.SIGKILL) + os.kill(p1.pid, signal.SIGINT) p1.join() # run pserver on CPU in async mode @@ -102,7 +102,7 @@ class TestListenAndServOp(OpTest): self._wait_ps_ready(p2.pid) # raise SIGTERM to pserver - os.kill(p2.pid, signal.SIGKILL) + os.kill(p2.pid, signal.SIGTERM) p2.join() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d8d6a7e9418e1c2a9f82d58b5c9650d58604d46e..bb61f82a9cf7f837f0403082165a2375d18b574e 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -396,7 +396,7 @@ class DistributeTranspiler(object): return varname return "" - def __clone_lr_op_sub_block__(op, program, new_block): + def __clone_lr_op_sub_block__(op, program, lr_block): if not op.has_attr('sub_block'): return @@ -405,36 +405,41 @@ class DistributeTranspiler(object): assert isinstance(origin_block, Block) # we put the new sub block to new block to follow the block # hierarchy of the original blocks - new_sub_block = program.create_block(new_block.idx) + new_sub_block = program.create_block(lr_block.idx) # clone vars for var in origin_block.vars: new_sub_block.clone_variable(var) # clone ops - for op in origin_block.ops: - self._clone_lr_op(program, new_sub_block, op) + for origin_op in origin_block.ops: + cloned_op = self._clone_lr_op(program, new_sub_block, origin_op) # clone sub_block of op - __clone_lr_op_sub_block__(op, program, new_sub_block) + __clone_lr_op_sub_block__(cloned_op, program, new_sub_block) # reset the block of op op.set_attr('sub_block', new_sub_block) # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() + # record optimize blocks and we can run them on pserver parallel + optimize_blocks = [] if len(lr_ops) > 0: lr_decay_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(lr_decay_block) for _, op in enumerate(lr_ops): - self._append_pserver_non_opt_ops(lr_decay_block, op) + cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op) # append sub blocks to pserver_program in lr_decay_op - __clone_lr_op_sub_block__(op, pserver_program, lr_decay_block) + __clone_lr_op_sub_block__(cloned_op, pserver_program, + lr_decay_block) # append op to the current block grad_to_block_id = [] pre_block_idx = pserver_program.num_blocks - 1 for idx, opt_op in enumerate(opt_op_on_pserver): per_opt_block = pserver_program.create_block(pre_block_idx) + optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay for _, op in enumerate(self.optimize_ops): # find the origin @GRAD var before clipping @@ -453,6 +458,7 @@ class DistributeTranspiler(object): if global_ops: opt_state_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(opt_state_block) for glb_op in global_ops: __append_optimize_op__(glb_op, opt_state_block, grad_to_block_id, None) @@ -474,11 +480,11 @@ class DistributeTranspiler(object): assert len(prefetch_var_name_to_block_id) == 0 attrs = { - "OptimizeBlock": pserver_program.block(1), + "optimize_blocks": optimize_blocks, "endpoint": endpoint, "Fanin": self.trainer_num, "sync_mode": self.sync_mode, - "grad_to_block_id": grad_to_block_id + "grad_to_block_id": grad_to_block_id, } if len(prefetch_var_name_to_block_id) > 0: attrs['prefetch_var_name_to_block_id'] \ @@ -1211,7 +1217,7 @@ class DistributeTranspiler(object): if var not in program.global_block().vars: block.clone_variable(var) - block.append_op( + return block.append_op( type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) def _append_pserver_non_opt_ops(self, optimize_block, opt_op): @@ -1249,7 +1255,7 @@ class DistributeTranspiler(object): elif not program.global_block().vars.has_key(var.name): program.global_block().clone_variable(var) - optimize_block.append_op( + return optimize_block.append_op( type=opt_op.type, inputs=inputs, outputs=outputs, diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 0a2a1ced11ee5cb2fb407b229ce810d553c2fa46..662655c836dbc54bd6187dcd3dac7354d6c8ecd1 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' -def reader_creator(filename, sub_name): +def reader_creator(filename, sub_name, cycle=False): def read_batch(batch): data = batch['data'] labels = batch.get('labels', batch.get('fine_labels', None)) @@ -56,10 +56,13 @@ def reader_creator(filename, sub_name): names = (each_item.name for each_item in f if sub_name in each_item.name) - for name in names: - batch = cPickle.load(f.extractfile(name)) - for item in read_batch(batch): - yield item + while True: + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + yield item + if not cycle: + break return reader @@ -94,34 +97,40 @@ def test100(): 'test') -def train10(): +def train10(cycle=False): """ CIFAR-10 training set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Training reader creator :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'data_batch') + 'data_batch', + cycle=cycle) -def test10(): +def test10(cycle=False): """ CIFAR-10 test set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Test reader creator. :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'test_batch') + 'test_batch', + cycle=cycle) def fetch(): diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 357a4e9b000ea81afe291ff39dde2bed5c67e619..db12076d54064781bd1060947497622b14783768 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -76,7 +76,8 @@ def reader_creator(data_file, dataset_name, mapper, buffered_size=1024, - use_xmap=True): + use_xmap=True, + cycle=False): ''' 1. read images from tar file and merge images into batch files in 102flowers.tgz_batch/ @@ -96,6 +97,8 @@ def reader_creator(data_file, :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: data reader :rtype: callable ''' @@ -108,15 +111,18 @@ def reader_creator(data_file, file_list = batch_images_from_tar(data_file, dataset_name, img2label) def reader(): - for file in open(file_list): - file = file.strip() - batch = None - with open(file, 'r') as f: - batch = cPickle.load(f) - data = batch['data'] - labels = batch['label'] - for sample, label in itertools.izip(data, batch['label']): - yield sample, int(label) - 1 + while True: + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + if not cycle: + break if use_xmap: cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) @@ -125,7 +131,7 @@ def reader_creator(data_file, return map_readers(mapper, reader) -def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers training set reader. It returns a reader, each sample in the reader is @@ -138,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: train data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TRAIN_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) -def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers test set reader. It returns a reader, each sample in the reader is @@ -161,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: test data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TEST_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):