diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index 8b7f85912a6f877f1df9bbc7e58bfa1954c4b854..9511db9068820711ccd93269e672d860769a96b2 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -197,13 +197,27 @@ FetchResultType AsyncSSAGraphExecutor::Run(
   HandleException();
 
-  FeedFetchList ret;
-  auto &val = boost::get<FeedFetchList>(fetch_data);
+  FetchList ret;
+  auto &val = boost::get<FetchList>(fetch_data);
   for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
-    std::vector<const LoDTensor *> lodtensor_ptrs;
-    lodtensor_ptrs.push_back(&val.at(fetch_idx));
-    ret.emplace_back();
-    ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+    if (data_is_lod_tensor(val.at(fetch_idx))) {
+      std::vector<const LoDTensor *> lodtensor_ptrs;
+      lodtensor_ptrs.push_back(&(boost::get<LoDTensor>(val.at(fetch_idx))));
+      LoDTensor var;
+      var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      ret.emplace_back(var);
+    } else {
+      auto array = boost::get<LoDTensorArray>(val.at(fetch_idx));
+      LoDTensorArray item_array;
+      item_array.reserve(array.size());
+      for (size_t i = 0; i < array.size(); ++i) {
+        std::vector<const LoDTensor *> lodtensor_ptrs;
+        lodtensor_ptrs.push_back(&array[i]);
+        item_array.emplace_back();
+        item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      }
+      ret.emplace_back(item_array);
+    }
   }
   return ret;
 }
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 18bac69c8db148a4e315b2334f2e587ed1b1b3f4..9d1395c0356bd38d91c7d7378888921dcf85ee5b 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -63,7 +63,7 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
   FetchResultType fetches;
   if (return_merged) {
-    fetches = FeedFetchList(fetch_tensors.size());
+    fetches = FetchList(fetch_tensors.size());
   } else {
     fetches = FetchUnmergedList(fetch_tensors.size());
   }
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 87b3cec8cff4cddacc7186cca1797f93e13f54db..da9330780103aa334e1f3d53f1597a8a57e22ccd 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -39,51 +39,98 @@ void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
   PADDLE_THROW("Nobody should wait FetchOp. Unexpected Error");
 }
 
-void FetchOpHandle::WaitAndMergeCPUTensors() const {
+static void CheckDims(const framework::DDim &tensor_dims,
+                      const framework::DDim &ele_dims, const size_t offset) {
+  PADDLE_ENFORCE_EQ(
+      tensor_dims.size(), ele_dims.size(),
+      platform::errors::Fatal("The dimension sizes of fetched Tensors or "
+                              "the items of fetched LoDTensorArray are "
+                              "different from each other on different "
+                              "devices. And the error is caused by the %zu "
+                              "(th) fetched variable. Please set the "
+                              "parameter `return_merged = False` when you "
+                              "call the `Executor.run()` method.",
+                              offset));
+  for (int j = 1; j < tensor_dims.size(); j++) {
+    PADDLE_ENFORCE_EQ(
+        tensor_dims[j], ele_dims[j],
+        platform::errors::Fatal("The dimensions of fetched Tensors or "
+                                "the items of fetched LoDTensorArray are "
+                                "different from each other on different "
+                                "devices. And the error is caused by the "
+                                "%zu (th) fetched variable. Please set the "
+                                "parameter `return_merged = False` when "
+                                "you call the `Executor.run()` method.",
+                                offset));
+  }
+}
+
+void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
   if (return_merged_) {
-    const auto &tensor_dims = tensors_[0].dims();
-    for (size_t i = 1; i < tensors_.size(); i++) {
-      const auto &ele_dims = tensors_[i].dims();
-      PADDLE_ENFORCE_EQ(
-          tensor_dims.size(), ele_dims.size(),
-          platform::errors::Fatal("The dimension sizes of fetched Tensors are "
-                                  "different from each other on different "
-                                  "devices. And the error is caused by the %zu "
-                                  "(th) fetched variable. Please set the "
-                                  "parameter `return_merged = False` when you "
-                                  "call the `Executor.run()` method.",
-                                  offset_));
-      for (int j = 1; j < tensor_dims.size(); j++) {
-        PADDLE_ENFORCE_EQ(
-            tensor_dims[j], ele_dims[j],
-            platform::errors::Fatal("The dimensions of fetched Tensors are "
-                                    "different from each other on different "
-                                    "devices. And the error is caused by the "
-                                    "%zu (th) fetched variable. Please set the "
-                                    "parameter `return_merged = False` when "
-                                    "you call the `Executor.run()` method.",
-                                    offset_));
+    if (data_is_lod_tensor(tensors_[0])) {
+      const auto &tensor_dims = boost::get<LoDTensor>(tensors_[0]).dims();
+      for (size_t i = 1; i < tensors_.size(); i++) {
+        const auto &ele_dims = boost::get<LoDTensor>(tensors_[i]).dims();
+        CheckDims(tensor_dims, ele_dims, offset_);
       }
+      std::vector<const LoDTensor *> tensors_ptr;
+      tensors_ptr.reserve(tensors_.size());
+      for (auto &t : tensors_) {
+        tensors_ptr.emplace_back(&boost::get<LoDTensor>(t));
+      }
+      auto &val = boost::get<FetchList>(*data_);
+      LoDTensor var;
+      var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      val.at(offset_) = std::move(var);
+    } else {
+      auto &array = boost::get<LoDTensorArray>(tensors_[0]);
+      LoDTensorArray tmp_array;
+      tmp_array.reserve(array.size());
+      for (size_t i = 0; i < array.size(); ++i) {
+        const auto &tensor_dims = array[i].dims();
+        std::vector<const LoDTensor *> tensors_ptr;
+        tensors_ptr.reserve(tensors_.size());
+        tensors_ptr.push_back(&array[i]);
+        for (size_t j = 1; j < tensors_.size(); ++j) {
+          auto &element = boost::get<LoDTensorArray>(tensors_[j]);
+          const auto &ele_dims = element[i].dims();
+          CheckDims(tensor_dims, ele_dims, offset_);
+          tensors_ptr.push_back(&element[i]);
+        }
+        tmp_array.emplace_back();
+        tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      }
+      auto &val = boost::get<FetchList>(*data_);
+      val.at(offset_) = std::move(tmp_array);
     }
-    std::vector<const LoDTensor *> tensors_ptr;
-    tensors_ptr.reserve(tensors_.size());
-    for (auto &t : tensors_) {
-      tensors_ptr.emplace_back(&t);
-    }
-    auto &val = boost::get<FeedFetchList>(*data_);
-    val.at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
   } else {
     auto &val = boost::get<FetchUnmergedList>(*data_);
     val.at(offset_) = std::move(tensors_);
   }
 }
 
+static void TransData(const framework::LoDTensor &src_item,
+                      framework::LoDTensor *dst_item) {
+  if (src_item.IsInitialized() && src_item.numel() > 0) {
+    if (platform::is_gpu_place(src_item.place())) {
+#ifdef PADDLE_WITH_CUDA
+      TensorCopy(src_item, platform::CPUPlace(), dst_item);
+#endif
+    } else {
+      dst_item->ShareDataWith(src_item);
+    }
+  } else {
+    dst_item->clear();
+    dst_item->Resize({0});
+  }
+  dst_item->set_lod(src_item.lod());
+}
+
 void FetchOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());
 
   tensors_.resize(inputs_.size());
-  platform::CPUPlace cpu;
   auto &scopes = *local_exec_scopes_;
 
   for (size_t i = 0; i < inputs_.size(); ++i) {
@@ -93,23 +140,21 @@ void FetchOpHandle::RunImpl() {
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
                             var_handle->name());
-
-    auto &t = var->Get<framework::LoDTensor>();
-    if (t.IsInitialized() && t.numel() > 0) {
-      if (platform::is_gpu_place(t.place())) {
-#ifdef PADDLE_WITH_CUDA
-        TensorCopy(t, cpu, &tensors_[i]);
-#endif
-      } else {
-        tensors_[i].ShareDataWith(t);
-      }
+    if (var->IsType<framework::LoDTensor>()) {
+      auto &t = var->Get<framework::LoDTensor>();
+      auto &item = boost::get<LoDTensor>(tensors_[i]);
+      TransData(t, &item);
     } else {
-      tensors_[i].clear();
-      tensors_[i].Resize({0});
+      auto &t = var->Get<framework::LoDTensorArray>();
+      LoDTensorArray tmp(t.size());
+      tensors_[i] = tmp;
+      auto &item = boost::get<LoDTensorArray>(tensors_[i]);
+      for (size_t j = 0; j < t.size(); ++j) {
+        TransData(t[j], &item[j]);
+      }
     }
-    tensors_[i].set_lod(t.lod());
   }
-
-  this->WaitAndMergeCPUTensors();
+  this->WaitAndMergeCPUFetchVars();
 }
 
 void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 48753cb45d578bfb26d7b6572d24ebf4d23b6a97..31ffd1211d205ff943f5e65b95bddcfa76bcd05a 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -36,7 +36,7 @@ struct FetchOpHandle : public OpHandleBase {
 
   void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
 
-  void WaitAndMergeCPUTensors() const;
+  void WaitAndMergeCPUFetchVars() const;
 
   std::string Name() const override;
 
@@ -54,7 +54,7 @@ struct FetchOpHandle : public OpHandleBase {
   size_t offset_;
   std::vector<Scope *> *local_scopes_;
   std::vector<Scope *> *local_exec_scopes_;
-  std::vector<LoDTensor> tensors_;
+  std::vector<FetchType> tensors_;
 
   bool return_merged_;
 };
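Note on the merge rule used by WaitAndMergeCPUFetchVars above and by the parallel executor below: a fetched LoDTensorArray is merged element-wise, so item i from every device is concatenated into item i of the result, exactly as a plain LoDTensor fetch is merged along the batch dimension. A minimal sketch of that rule, assuming two per-device results of the same fetch target (the real code collects them in FetchOpHandle::tensors_); MergeArrays is a hypothetical helper:

    // Sketch only, not part of the patch.
    #include <vector>
    #include "paddle/fluid/framework/lod_tensor.h"
    #include "paddle/fluid/framework/lod_tensor_array.h"
    #include "paddle/fluid/platform/place.h"

    namespace fw = paddle::framework;

    fw::LoDTensorArray MergeArrays(const fw::LoDTensorArray &dev0,
                                   const fw::LoDTensorArray &dev1) {
      fw::LoDTensorArray merged(dev0.size());
      for (size_t i = 0; i < dev0.size(); ++i) {
        // Item i of each device is concatenated, like a plain LoDTensor fetch.
        std::vector<const fw::LoDTensor *> ptrs{&dev0[i], &dev1[i]};
        merged[i].MergeLoDTensor(ptrs, paddle::platform::CPUPlace());
      }
      return merged;
    }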
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 06b4d6e8d6d1268da8029b8bc5d0eb4495b351b9..dbc73048c551b991b9a7640016bc3f8506fcb5ac 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -179,7 +179,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
   }
 
   if (return_merged) {
-    return FeedFetchList();
+    return FetchList();
   } else {
     return FetchUnmergedList();
   }
@@ -245,22 +245,43 @@ FetchResultType ParallelSSAGraphExecutor::Run(
   }
 
   if (return_merged) {
-    FeedFetchList ret;
+    FetchList ret;
     ret.reserve(fetch_tensors.size());
-
     for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
       std::vector<const LoDTensor *> lodtensor_ptrs;
       lodtensor_ptrs.reserve(place_num);
+      std::vector<const LoDTensorArray *> lodtensorarray_ptrs;
+      lodtensorarray_ptrs.reserve(place_num);
       for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
         if (!is_valid[scope_idx]) {
           continue;
         }
-        const auto &fetch_list =
-            boost::get<FeedFetchList>(fetch_data[scope_idx]);
-        lodtensor_ptrs.push_back(&fetch_list[fetch_idx]);
+        const auto &fetch_list = boost::get<FetchList>(fetch_data[scope_idx]);
+        if (data_is_lod_tensor(fetch_list[fetch_idx])) {
+          lodtensor_ptrs.push_back(
+              &(boost::get<LoDTensor>(fetch_list[fetch_idx])));
+        } else {
+          lodtensorarray_ptrs.push_back(
+              &(boost::get<LoDTensorArray>(fetch_list[fetch_idx])));
+        }
+      }
+      if (lodtensor_ptrs.size() != 0) {
+        LoDTensor var;
+        var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+        ret.emplace_back(var);
+      } else {
+        LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
+        for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) {
+          LoDTensor var;
+          std::vector<const LoDTensor *> ptrs;
+          for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
+            ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
+          }
+          var.MergeLoDTensor(ptrs, platform::CPUPlace());
+          var_array[i] = std::move(var);
+        }
+        ret.emplace_back(var_array);
       }
-      ret.emplace_back();
-      ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
     }
     return ret;
   } else {
@@ -277,8 +298,8 @@ FetchResultType ParallelSSAGraphExecutor::Run(
           boost::get<FetchUnmergedList>(fetch_data[scope_idx]);
       PADDLE_ENFORCE_EQ(
           fetch_list[fetch_idx].size(), 1,
-          platform::errors::Fatal(
-              "Each place must have only one fetched LoDTensor!"));
+          platform::errors::Fatal("Each place must have only one fetched "
+                                  "LoDTensor/LoDTensorArray!"));
       ret.back().emplace_back(fetch_list[fetch_idx][0]);
     }
   }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 593b8543808e4469e8cf85260b85a88537373249..92c3a0cd6b9c01497199fece0a9bdafc89f64678 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -72,7 +72,7 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
   std::unordered_set<VarHandleBase *> fetch_dependencies;
   FetchResultType fetch_data;
   if (return_merged) {
-    fetch_data = FeedFetchList(fetch_tensors.size());
+    fetch_data = FetchList(fetch_tensors.size());
   } else {
     fetch_data = FetchUnmergedList(fetch_tensors.size());
   }
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 5bdb27683ee68440c590e6ca7f141ea0e80d9ba9..93919f6a3088d243b55872ca920203b5f7674021 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -256,7 +256,7 @@ static bool has_feed_operators(
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
     const BlockDesc& block,
-    const std::map<std::string, LoDTensor*>& fetch_targets,
+    const std::map<std::string, FetchType*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
   for (auto* op : block.AllOps()) {
@@ -306,7 +306,7 @@ static bool has_fetch_operators(
 void Executor::Run(const ProgramDesc& program, Scope* scope,
                    std::map<std::string, const LoDTensor*>* feed_targets,
-                   std::map<std::string, LoDTensor*>* fetch_targets,
+                   std::map<std::string, FetchType*>* fetch_targets,
                    bool create_local_scope, bool create_vars,
                    const std::string& feed_holder_name,
                    const std::string& fetch_holder_name) {
@@ -504,7 +504,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 void Executor::RunPreparedContext(
     ExecutorPrepareContext* ctx, Scope* scope,
     std::map<std::string, const LoDTensor*>* feed_targets,
-    std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
+    std::map<std::string, FetchType*>* fetch_targets, bool create_local_scope,
     bool create_vars, const std::string& feed_holder_name,
     const std::string& fetch_holder_name) {
   auto& global_block = ctx->prog_.Block(ctx->block_id_);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index aa70bb2d81e7c0a08ca4d35b41fdc70ca3362de6..fa6a65d5892f0098a95d1e30b11501bc1043b14f 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -87,7 +87,7 @@ class Executor {
   // This API is very slow.
   void Run(const ProgramDesc& program, Scope* scope,
            std::map<std::string, const LoDTensor*>* feed_targets,
-           std::map<std::string, LoDTensor*>* fetch_targets,
+           std::map<std::string, FetchType*>* fetch_targets,
           bool create_local_scope = true, bool create_vars = true,
           const std::string& feed_holder_name = "feed",
           const std::string& fetch_holder_name = "fetch");
 
   // This API is very slow.
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           std::map<std::string, const LoDTensor*>* feed_targets,
-                          std::map<std::string, LoDTensor*>* fetch_targets,
+                          std::map<std::string, FetchType*>* fetch_targets,
                           bool create_local_scope = true,
                           bool create_vars = true,
                           const std::string& feed_holder_name = "feed",
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 96530b2a3f9cfd9462627a42b2bb0fea98758f92..fd857f7735c1db2b5b3678517e5301d4ab8700ef 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -29,7 +29,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
   // be created.
   VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
   Variable* g_feed_value = scope->Var(var_name);
-  auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
+  auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
   if (index >= feed_inputs.size()) {
     feed_inputs.resize(index + 1);
   }
@@ -39,27 +39,35 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
   feed_inputs[index].set_lod(input.lod());
 }
 
-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index) {
-  // Since we want to fetch LodTensor from a variable, the variable must
+  // Since we want to fetch FetchType from a variable, the variable must
   // be created already.
   Variable* g_fetch_value = scope.FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
-  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
-                 "Only %s can be invoked by GetFetchVariable",
-                 typeid(FeedFetchList).name());
-  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+  PADDLE_ENFORCE_NOT_NULL(g_fetch_value,
+                          platform::errors::NotFound(
+                              "Variable %s is not found in scope.", var_name));
+  PADDLE_ENFORCE_EQ(g_fetch_value->IsType<FetchList>(), true,
+                    platform::errors::InvalidArgument(
+                        "Only %s can be invoked by GetFetchVariable",
+                        typeid(FetchList).name()));
+  auto& fetch_outputs = *g_fetch_value->GetMutable<FetchList>();
   auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
-  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+  VLOG(3) << "Fetch " << var_name << " with index " << index;
+  PADDLE_ENFORCE_LT(index, fetch_outputs.size(),
+                    platform::errors::InvalidArgument(
+                        "index must be less than the size of fetch_outputs."));
   return tensor;
 }
 
 LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
   Variable* var = scope.FindVar(var_name);
-  PADDLE_ENFORCE(var, "%s is not in scope", var_name);
-  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                      var_name));
+  PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
+                    platform::errors::InvalidArgument(
+                        "Only support lod tensor in GetVariableTensor now."));
   return *var->GetMutable<LoDTensor>();
 }
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index 031f8e01aa6128b803dcbfb990778e87d4fafc13..65c8b255ffb2fbe8a054dd871bccd665d284968d 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -24,7 +24,7 @@ namespace framework {
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index);
 
-LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index);
 
 LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
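Because GetFetchVariable now returns a FetchType (a boost::variant) rather than a LoDTensor, every caller has to unwrap the variant before touching tensor APIs; this is the migration pattern the inference code and tests later in this patch follow. A minimal caller sketch, assuming the fetched slot actually holds a plain LoDTensor (boost::get throws boost::bad_get if it holds a LoDTensorArray); FetchedNumel is a hypothetical helper:

    // Sketch only, not part of the patch.
    #include "paddle/fluid/framework/feed_fetch_method.h"
    #include "paddle/fluid/framework/scope.h"

    int64_t FetchedNumel(const paddle::framework::Scope &scope) {
      // New-style access: unwrap the variant first.
      paddle::framework::FetchType &fetch_var =
          paddle::framework::GetFetchVariable(scope, "fetch", 0 /*index*/);
      auto &tensor = boost::get<paddle::framework::LoDTensor>(fetch_var);
      return tensor.numel();
    }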
diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h
index 210d549edf2640759a0713b9d23529a1100ae3c9..1996327fe82bc0455df130006656e3a04ce1bf38 100644
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
@@ -15,14 +15,33 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
-using FeedFetchType = LoDTensor;
-using FeedFetchList = std::vector<FeedFetchType>;
-using FetchUnmergedList = std::vector<std::vector<LoDTensor>>;
-using FetchResultType = boost::variant<FeedFetchList, FetchUnmergedList>;
+using FeedType = LoDTensor;
+using FeedList = std::vector<FeedType>;
+
+using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
+using FetchList = std::vector<FetchType>;
+
+using FetchUnmergedList = std::vector<std::vector<FetchType>>;
+using FetchResultType = boost::variant<FetchList, FetchUnmergedList>;
+
+inline bool data_is_lod_tensor(const FetchType &data) {
+  if (data.type() == typeid(LoDTensor)) {
+    return true;
+  }
+  return false;
+}
+
+inline bool data_is_lod_tensor_array(const FetchType &data) {
+  if (data.type() == typeid(LoDTensorArray)) {
+    return true;
+  }
+  return false;
+}
 
 static const char kFeedOpType[] = "feed";
 static const char kFetchOpType[] = "fetch";
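The two data_is_* helpers above are the intended way to branch on a FetchType before calling boost::get, and the executors earlier in this patch use exactly this shape. A small consumer sketch over a FetchList, with a hypothetical CountFetchedTensors helper:

    // Sketch only, not part of the patch. Counts LoDTensors in a FetchList,
    // counting each element of a fetched LoDTensorArray separately.
    #include "paddle/fluid/framework/feed_fetch_type.h"

    size_t CountFetchedTensors(const paddle::framework::FetchList &fetch_list) {
      size_t n = 0;
      for (const auto &item : fetch_list) {
        if (paddle::framework::data_is_lod_tensor(item)) {
          n += 1;  // a single merged LoDTensor
        } else if (paddle::framework::data_is_lod_tensor_array(item)) {
          n += boost::get<paddle::framework::LoDTensorArray>(item).size();
        }
      }
      return n;
    }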
diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h
index 7b15289c1b5121a148f9b3d5d72cc40b026c9106..36a5c3c5d601390beedaf37ceb98ee2c63ecf5a6 100644
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -20,7 +20,6 @@ namespace paddle {
 namespace framework {
 
 using LoDTensorArray = std::vector<LoDTensor>;
-using LoDTensor2DArray = std::vector<std::vector<LoDTensor>>;
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
index 73be446f71f193bea203c986b482e6b98a9826c5..43e9ed553bea84aaaaa18a46fe81f06a18b124af 100644
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -36,6 +36,7 @@ inline proto::VarType::Type ToVarType(int type) {
     case proto::VarType::SELECTED_ROWS:
     case proto::VarType::LOD_RANK_TABLE:
     case proto::VarType::LOD_TENSOR_ARRAY:
+    case proto::VarType::FETCH_LIST:
     case proto::VarType::READER:
       return static_cast<proto::VarType::Type>(type);
     default:
@@ -61,6 +62,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
     case proto::VarType::READER:
       visitor(var.Get<ReaderHolder>());
       return;
+    case proto::VarType::FETCH_LIST:
+      visitor(var.Get<FetchList>());
+      return;
     default:
       PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
   }
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 9a1e90a0268eb81df6d8bf1add79e090a2ee3a3e..4875956096a09bea7f9feb3eaf12a16dc28ee255 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -19,6 +19,7 @@
 #include <string>
 #include <tuple>
 #include <vector>
+#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"
@@ -139,7 +140,7 @@ struct VarTypeRegistryImpl {
 using VarTypeRegistry = detail::VarTypeRegistryImpl<
     Tensor, LoDTensor, SelectedRows, std::vector<Scope *>, LoDRankTable,
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
-    operators::reader::LoDTensorBlockingQueueHolder,
+    operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
 #ifdef PADDLE_WITH_CUDA
 #if defined(PADDLE_WITH_NCCL)
@@ -178,6 +179,7 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
 REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
 REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
 REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
+REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
 REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
 REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc
index 65c939af173a8a2a22d69c636de355293f95dec6..34adbbc0abc879f305618bbd1f3a159600c3496c 100644
--- a/paddle/fluid/framework/variable_helper.cc
+++ b/paddle/fluid/framework/variable_helper.cc
@@ -34,9 +34,9 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   } else if (var_type == proto::VarType::SELECTED_ROWS) {
     var->GetMutable<SelectedRows>();
   } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
+    var->GetMutable<FeedList>();
   } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
+    var->GetMutable<FetchList>();
   } else if (var_type == proto::VarType::STEP_SCOPES) {
     var->GetMutable<std::vector<Scope *>>();
   } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 6768a490cda169edfd245d18f00c21d0f0ddc19b..44abfc48db217929deb93baca31229dbaa040de6 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -383,8 +383,9 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   for (size_t i = 0; i < fetches_.size(); ++i) {
     int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
+    framework::FetchType &fetch_var =
         framework::GetFetchVariable(*scope, "fetch", idx);
+    auto &fetch = boost::get<framework::LoDTensor>(fetch_var);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetches_[idx]->Input("X")[0];
@@ -583,9 +584,9 @@ void AnalysisPredictor::PrepareFeedFetch() {
 void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
   PADDLE_ENFORCE_NOT_NULL(scope);
   auto *var = scope->Var("feed");
-  var->GetMutable<framework::FeedFetchList>();
+  var->GetMutable<framework::FeedList>();
   var = scope->Var("fetch");
-  var->GetMutable<framework::FeedFetchList>();
+  var->GetMutable<framework::FetchList>();
 }
 
 std::vector<std::string> AnalysisPredictor::GetInputNames() {
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 84b367336dc31339b89075caec7349cf2ba11790..529d641d46db3247c8907edc7a579f4ff9db864a 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -286,8 +286,9 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
-    framework::LoDTensor &fetch =
+    framework::FetchType &fetch_var =
         framework::GetFetchVariable(*scope, "fetch", idx);
+    auto fetch = boost::get<framework::LoDTensor>(fetch_var);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index c80187adfa721fb0a81652ae59556ad4ad9a3e88..86f8de97083d5ffdd49dfcf2dd50c51288227e02 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -102,14 +102,15 @@ void MainWord2Vec(bool use_gpu) {
   cpu_feeds.push_back(&third_word);
   cpu_feeds.push_back(&fourth_word);
 
-  framework::LoDTensor output1;
-  std::vector<framework::LoDTensor*> cpu_fetchs1;
+  framework::FetchType output1;
+  std::vector<framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
 
-  float* lod_data = output1.data<float>();
-  for (int i = 0; i < output1.numel(); ++i) {
+  auto output1_tensor = boost::get<framework::LoDTensor>(output1);
+  float* lod_data = output1_tensor.data<float>();
+  for (int i = 0; i < output1_tensor.numel(); ++i) {
     EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
     EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
   }
@@ -137,8 +138,8 @@ void MainImageClassification(bool use_gpu) {
   std::vector<framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
-  framework::LoDTensor output1;
-  std::vector<framework::LoDTensor*> cpu_fetchs1;
+  framework::FetchType output1;
+  std::vector<framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   TestInference<platform::CPUPlace, false, true>(
@@ -153,7 +154,8 @@ void MainImageClassification(bool use_gpu) {
   ASSERT_EQ(outputs.size(), 1UL);
   size_t len = outputs[0].data.length();
   float* data = static_cast<float*>(outputs[0].data.data());
-  float* lod_data = output1.data<float>();
+  float* lod_data =
+      boost::get<framework::LoDTensor>(output1).data<float>();
   for (size_t j = 0; j < len / sizeof(float); ++j) {
     EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
   }
@@ -168,7 +170,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
   constexpr int num_jobs = 3;
   std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
   std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
+  std::vector<framework::FetchType> refs(num_jobs);
   for (size_t i = 0; i < jobs.size(); ++i) {
     // each job has 4 words
     jobs[i].resize(4);
@@ -181,7 +183,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
 
     // get reference result of each job
     std::vector<framework::LoDTensor*> ref_feeds;
-    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    std::vector<framework::FetchType*> ref_fetches(1, &refs[i]);
     for (auto& word : jobs[i]) {
       ref_feeds.push_back(&word);
     }
@@ -207,9 +209,10 @@ void MainThreadsWord2Vec(bool use_gpu) {
       }
 
       // check outputs correctness
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
+      auto ref_tensor = boost::get<framework::LoDTensor>(refs[tid]);
+      float* ref_data = ref_tensor.data<float>();
+      EXPECT_EQ(ref_tensor.numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < ref_tensor.numel(); ++i) {
        EXPECT_NEAR(ref_data[i], data[i], 2e-3);
      }
    });
@@ -230,7 +233,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
   std::vector<framework::LoDTensor> jobs(num_jobs);
   std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
-  std::vector<framework::LoDTensor> refs(num_jobs);
+  std::vector<framework::FetchType> refs(num_jobs);
   for (size_t i = 0; i < jobs.size(); ++i) {
     // prepare inputs
     std::vector<std::vector<int64_t>> feed_target_shapes =
@@ -242,7 +245,7 @@ void MainThreadsImageClassification(bool use_gpu) {
 
     // get reference result of each job
     std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
-    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    std::vector<framework::FetchType*> ref_fetches(1, &refs[i]);
     TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
   }
@@ -259,9 +262,10 @@ void MainThreadsImageClassification(bool use_gpu) {
       ASSERT_EQ(local_outputs.size(), 1UL);
       const size_t len = local_outputs[0].data.length();
       float* data = static_cast<float*>(local_outputs[0].data.data());
-      float* ref_data = refs[tid].data<float>();
-      EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
-      for (int i = 0; i < refs[tid].numel(); ++i) {
+      auto ref_tensor = boost::get<framework::LoDTensor>(refs[tid]);
+      float* ref_data = ref_tensor.data<float>();
+      EXPECT_EQ((size_t)ref_tensor.numel(), len / sizeof(float));
+      for (int i = 0; i < ref_tensor.numel(); ++i) {
         EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
       }
     });
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 2c5b66a32903f4ffdedb074b31aec53ae6cacaf3..c605e8f2f9ec84684f7b51b4c36266f70707ec2e 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -40,10 +40,10 @@ TEST(inference, fit_a_line) {
     cpu_feeds[i].push_back(input);
   }
 
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+  std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs1;
   cpu_fetchs1.resize(num_threads);
   for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
+    auto* output = new paddle::framework::FetchType();
     cpu_fetchs1[i].push_back(output);
   }
 
@@ -58,10 +58,10 @@ TEST(inference, fit_a_line) {
   }
 
 #ifdef PADDLE_WITH_CUDA
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+  std::vector<std::vector<paddle::framework::FetchType*>> cpu_fetchs2;
   cpu_fetchs2.resize(num_threads);
   for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
+    auto* output = new paddle::framework::FetchType();
     cpu_fetchs2[i].push_back(output);
   }
 
@@ -76,7 +76,9 @@ TEST(inference, fit_a_line) {
   }
 
   for (int i = 0; i < num_threads; ++i) {
-    CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
+    CheckError<float>(
+        boost::get<paddle::framework::LoDTensor>(*cpu_fetchs1[i][0]),
+        boost::get<paddle::framework::LoDTensor>(*cpu_fetchs2[i][0]));
     delete cpu_fetchs2[i][0];
   }
 #endif
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index 60c761c5281e2f535aab0200c93fb738addcdb87..c6c5b1cadaa37fd81e5e50f004797e045bdd1e3d 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -50,9 +50,9 @@ TEST(inference, image_classification) {
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
-  paddle::framework::LoDTensor output1;
+  paddle::framework::FetchType output1;
   if (!FLAGS_skip_cpu) {
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs1;
     cpu_fetchs1.push_back(&output1);
 
     // Run inference on CPU
     LOG(INFO) << "Batch size is " << FLAGS_batch_size;
     TestInference<paddle::platform::CPUPlace, false, true>(
         dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
+    LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output1).dims();
   }
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
@@ -73,17 +73,18 @@ TEST(inference, image_classification) {
   LOG(INFO) << "Batch size is " << FLAGS_batch_size;
   TestInference<paddle::platform::CUDAPlace, false, true>(
       dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
-  LOG(INFO) << output2.dims();
+  LOG(INFO) << boost::get<paddle::framework::LoDTensor>(output2).dims();
 
   if (!FLAGS_skip_cpu) {
-    CheckError<float>(output1, output2);
+    CheckError<float>(boost::get<paddle::framework::LoDTensor>(output1),
+                      boost::get<paddle::framework::LoDTensor>(output2));
   }
 
   // float16 inference requires cuda GPUs with >= 5.3 compute capability
   if (!FLAGS_fp16_dirname.empty() &&
       paddle::platform::GetCUDAComputeCapability(0) >= 53) {
-    paddle::framework::LoDTensor output3;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
+    paddle::framework::FetchType output3;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs3;
     cpu_fetchs3.push_back(&output3);
 
     LOG(INFO) << "--- GPU Runs in float16 mode: ---";
@@ -92,7 +93,8 @@ TEST(inference, image_classification) {
     TestInference<paddle::platform::CUDAPlace, false, true>(
         FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);
 
-    CheckError<float>(output2, output3);
+    CheckError<float>(boost::get<paddle::framework::LoDTensor>(output2),
+                      boost::get<paddle::framework::LoDTensor>(output3));
   }
 #endif
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
index 84bb855fea5fa397ff71e2c922fea3302951b7ca..18163985e1c90f0665225d0eb7534ffa37acc552 100644
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -63,25 +63,27 @@ TEST(inference, label_semantic_roles) {
   cpu_feeds.push_back(&ctx_p2);
   cpu_feeds.push_back(&mark);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 5c1204b9e6b78e42d999b12a2b7be6f822ecf818..39ca328b9aa758284c2f82cf7d8f74c1d00096c8 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -118,8 +118,8 @@ void ThreadRunInfer(
       inference_program->GetFetchTargetNames();
   PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
 
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  paddle::framework::LoDTensor outtensor;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
+  paddle::framework::FetchType outtensor;
   fetch_targets[fetch_target_names[0]] = &outtensor;
 
   std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
@@ -150,7 +150,8 @@ void ThreadRunInfer(
       std::string fetch_target_name = op->Input("X")[0];
       int idx = boost::get<int>(op->GetAttr("col"));
       *fetch_targets[fetch_target_name] =
-          paddle::framework::GetFetchVariable(*scope, "fetch", idx);
+          boost::get<paddle::framework::LoDTensor>(
+              paddle::framework::GetFetchVariable(*scope, "fetch", idx));
     }
   }
 
@@ -215,8 +216,8 @@ TEST(inference, nlp) {
   const std::vector<std::string>& fetch_target_names =
       inference_program->GetFetchTargetNames();
   PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
-  paddle::framework::LoDTensor outtensor;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
+  paddle::framework::FetchType outtensor;
   fetch_targets[fetch_target_names[0]] = &outtensor;
 
   // prepare feed
diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
index f12828a2685305c20d26492dbf04fa9ddacf9317..da417b8deea8eeb3a0b366f6e5cc2d287c9c03db 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@@ -41,28 +41,30 @@ TEST(inference, recognize_digits) {
   cpu_feeds.push_back(&input);
 
   for (auto is_combined : {false, true}) {
-    paddle::framework::LoDTensor output1;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    paddle::framework::FetchType output1;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs1;
     cpu_fetchs1.push_back(&output1);
 
     // Run inference on CPU
     LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
     TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
                                               FLAGS_repeat, is_combined);
-    LOG(INFO) << output1.dims();
+    auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+    LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-    paddle::framework::LoDTensor output2;
-    std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+    paddle::framework::FetchType output2;
+    std::vector<paddle::framework::FetchType*> cpu_fetchs2;
     cpu_fetchs2.push_back(&output2);
 
     // Run inference on CUDA GPU
     LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
     TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
                                                FLAGS_repeat, is_combined);
-    LOG(INFO) << output2.dims();
+    auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+    LOG(INFO) << output2_tensor.dims();
 
-    CheckError<float>(output1, output2);
+    CheckError<float>(output1_tensor, output2_tensor);
 #endif
   }
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
index 70aa6b194d4417fc85384cc3f615089f024f928e..acec418f0ee9827061085889d0ab43a9e225f0e5 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
@@ -65,23 +65,25 @@ TEST(inference, recommender_system) {
   cpu_feeds.push_back(&category_id);
   cpu_feeds.push_back(&movie_title);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
index e15c3f59acb1eac535120554a3799c37e9d4e951..efe9f0a8db950eb0ff2c6d5760b533ea816deeed 100644
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -41,25 +41,27 @@ TEST(inference, rnn_encoder_decoder) {
   cpu_feeds.push_back(&word_data);
   cpu_feeds.push_back(&trg_word);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
index 0dbb6a30405eb64133613052ad57b1f705a9e7b4..f05e14afa8fede4969005797423824d226a25cce 100644
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -39,25 +39,27 @@ TEST(inference, understand_sentiment) {
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&words);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
index c9328eb21b4fdb06c5f65ba0f7337b1e79fa1927..0c61623a40c10c3beed53b7d298e293d4d8f49fa 100644
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -44,25 +44,27 @@ TEST(inference, word2vec) {
   cpu_feeds.push_back(&third_word);
   cpu_feeds.push_back(&fourth_word);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  paddle::framework::FetchType output1;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs1;
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.lod();
-  LOG(INFO) << output1.dims();
+  auto output1_tensor = boost::get<paddle::framework::LoDTensor>(output1);
+  LOG(INFO) << output1_tensor.lod();
+  LOG(INFO) << output1_tensor.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  paddle::framework::FetchType output2;
+  std::vector<paddle::framework::FetchType*> cpu_fetchs2;
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.lod();
-  LOG(INFO) << output2.dims();
+  auto output2_tensor = boost::get<paddle::framework::LoDTensor>(output2);
+  LOG(INFO) << output2_tensor.lod();
+  LOG(INFO) << output2_tensor.dims();
 
-  CheckError<float>(output1, output2);
+  CheckError<float>(output1_tensor, output2_tensor);
 #endif
 }
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 861f69f4d2143b16bdec546d92ce7bd13ca53ed3..7183cbac71562bfe4092bf78270096996b74c525 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <map>
+#include <memory>
 #include <random>
 #include <string>
 #include <vector>
@@ -142,7 +143,7 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const std::vector<paddle::framework::FetchType*>& cpu_fetchs,
                    const int repeat = 1, const bool is_combined = false) {
   // 1. Define place, executor, scope
   auto place = Place();
@@ -194,7 +195,7 @@ void TestInference(const std::string& dirname,
   }
 
   // 5. Define Tensor to get the outputs: set up maps for fetch targets
-  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  std::map<std::string, paddle::framework::FetchType*> fetch_targets;
   for (size_t i = 0; i < fetch_target_names.size(); ++i) {
     fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
   }
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
index b40d7b0ce8f12172dfeee5c13f02225b116e78de..088413ce223594ac8a32f0d06662dd3b8342c096 100644
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -58,7 +58,7 @@ class FeedOp : public framework::OperatorBase {
     VLOG(3) << "Feed variable " << feed_var_name << "'s " << col
             << " column to variable " << out_name;
 
-    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
+    auto &feed_list = feed_var->Get<framework::FeedList>();
     PADDLE_ENFORCE_LT(
         static_cast<size_t>(col), feed_list.size(),
         platform::errors::InvalidArgument(
@@ -68,7 +68,7 @@ class FeedOp : public framework::OperatorBase {
             col, feed_list.size()));
 
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
-    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
+    auto *out_item = out_var->GetMutable<framework::LoDTensor>();
 
     if (platform::is_same_place(feed_item.place(), place)) {
       out_item->ShareDataWith(feed_item);
diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc
index ad481c3492330289d7fadb4c1978d3a3790b972b..3f22680b86ad8c766c2ccc35fdc5943f44a7fc9e 100644
--- a/paddle/fluid/operators/controlflow/fetch_op.cc
+++ b/paddle/fluid/operators/controlflow/fetch_op.cc
@@ -21,6 +21,39 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+// FIXME(yuyang18): Should we assume the fetch operator always generate
+// CPU outputs?
+static void DataCopy(const framework::LoDTensor &src_item,
+                     const std::string &fetch_var_name,
+                     framework::LoDTensor *dst_item) {
+  if (src_item.IsInitialized() && src_item.numel() > 0) {
+#ifdef PADDLE_WITH_MKLDNN
+    // Conversion from MKL-DNN to Paddle
+    if (src_item.layout() == framework::DataLayout::kMKLDNN) {
+      framework::Tensor out;
+      // Convert to desired Paddle layout, apart from grads of filter
+      // as params are not a subject to paddle's data_format
+      framework::innerTransDataLayoutFromMKLDNN(
+          src_item.layout(),
+          fetch_var_name == framework::GradVarName("Filter")
+              ? framework::DataLayout::kNCHW
+              : paddle::platform::get_cur_paddle_data_layout(),
+          src_item, &out, platform::CPUPlace());
+      TensorCopySync(out, platform::CPUPlace(), dst_item);
+    } else {
+      TensorCopySync(src_item, platform::CPUPlace(), dst_item);
+    }
+#else
+    TensorCopySync(src_item, platform::CPUPlace(), dst_item);
+#endif
+  } else {
+    // Not copy, if the src tensor is empty.
+    dst_item->clear();
+    dst_item->Resize({0});
+  }
+  dst_item->set_lod(src_item.lod());
+}
+
 class FetchOp : public framework::OperatorBase {
  public:
   FetchOp(const std::string &type, const framework::VariableNameMap &inputs,
@@ -66,42 +99,26 @@ class FetchOp : public framework::OperatorBase {
     VLOG(3) << "Fetch variable " << fetch_var_name << " to variable "
             << out_name << "'s " << col << " column.";
 
-    auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>();
-    auto &src_item = fetch_var->Get<framework::LoDTensor>();
+    auto *fetch_list = out_var->GetMutable<framework::FetchList>();
 
     if (static_cast<size_t>(col) >= fetch_list->size()) {
       fetch_list->resize(col + 1);
     }
-    auto &dst_item = fetch_list->at(col);
 
-    // FIXME(yuyang18): Should we assume the fetch operator always generate
-    // CPU outputs?
-    if (src_item.IsInitialized() && src_item.numel() > 0) {
-#ifdef PADDLE_WITH_MKLDNN
-      // Conversion from MKL-DNN to Paddle
-      if (src_item.layout() == framework::DataLayout::kMKLDNN) {
-        framework::Tensor out;
-        // Convert to desired Paddle layout, apart from grads of filter
-        // as params are not a subject to paddle's data_format
-        framework::innerTransDataLayoutFromMKLDNN(
-            src_item.layout(),
-            fetch_var_name == framework::GradVarName("Filter")
-                ? framework::DataLayout::kNCHW
-                : paddle::platform::get_cur_paddle_data_layout(),
-            src_item, &out, platform::CPUPlace());
-        TensorCopySync(out, platform::CPUPlace(), &dst_item);
-      } else {
-        TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
-      }
-#else
-      TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
-#endif
+    if (fetch_var->IsType<framework::LoDTensor>()) {
+      auto &src_item = fetch_var->Get<framework::LoDTensor>();
+      auto *dst_item = &(boost::get<framework::LoDTensor>(fetch_list->at(col)));
+      DataCopy(src_item, fetch_var_name, dst_item);
     } else {
-      // Not copy, if the src tensor is empty.
-      dst_item.clear();
-      dst_item.Resize({0});
+      auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
+      framework::LoDTensorArray tmp(src_item.size());
+      fetch_list->at(col) = tmp;
+      auto &dst_item =
+          boost::get<framework::LoDTensorArray>(fetch_list->at(col));
+      for (size_t i = 0; i < src_item.size(); ++i) {
+        DataCopy(src_item[i], fetch_var_name, &dst_item[i]);
+      }
     }
-    dst_item.set_lod(src_item.lod());
   }
 };
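At the Python boundary the same dispatch repeats once more: each FetchType must become either a single Tensor object or a Python list of Tensors. The pybind.cc changes below implement this for get_fetch_variable and for FetchList._move_to_list; a condensed sketch of the conversion, with a hypothetical FetchTypeToPyObject helper:

    // Sketch only, not part of the patch.
    #include <pybind11/pybind11.h>
    #include "paddle/fluid/framework/feed_fetch_type.h"

    namespace py = pybind11;

    py::object FetchTypeToPyObject(const paddle::framework::FetchType &var) {
      if (paddle::framework::data_is_lod_tensor(var)) {
        return py::cast(boost::get<paddle::framework::LoDTensor>(var));
      }
      const auto &array = boost::get<paddle::framework::LoDTensorArray>(var);
      py::list result(array.size());
      for (size_t i = 0; i < array.size(); ++i) {
        result[i] = py::cast(array[i]);  // one Tensor per array element
      }
      return result;  // a py::list is a py::object
    }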
.def("run_prepared_ctx", [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, std::map *feed_targets, - std::map *fetch_targets, + std::map *fetch_targets, bool create_local_scope = true, bool create_vars = true, const std::string &feed_holder_name = "feed", const std::string &fetch_holder_name = "fetch") { @@ -1503,7 +1508,16 @@ All parameter, weight, gradient are variables in Paddle. #endif m.def("set_feed_variable", framework::SetFeedVariable); - m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("get_fetch_variable", + [](const Scope &scope, const std::string &var_name, + size_t index) -> py::object { + auto &var = framework::GetFetchVariable(scope, var_name, index); + if (data_is_lod_tensor(var)) { + return py::cast(boost::get(var)); + } else { + return py::cast(boost::get(var)); + } + }); m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); @@ -1583,16 +1597,70 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::take_ownership); - py::class_(m, "LoDTensor2DArray", R"DOC( - LoDTensor2DArray is 2-D array of LoDTensor. + py::class_(m, "FetchList", R"DOC( FetchList is a + vector of boost::variant. + )DOC") + .def("_move_to_list", + [](FetchList &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + if (data_is_lod_tensor(self[i])) { + auto &data = boost::get(self[i]); + res[i] = py::cast(std::move(data)); + } else { + auto &data = boost::get(self[i]); + py::list tmp(data.size()); + for (size_t j = 0; j < data.size(); ++j) { + tmp[j] = py::cast(std::move(data[j])); + } + res[i] = std::move(tmp); + } + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership) + + .def("append", + [](FetchList &self, const LoDTensor &t) { + self.emplace_back(); + auto &lod_tensor = boost::get(self.back()); + lod_tensor.ShareDataWith(t); + lod_tensor.set_lod(t.lod()); + }, + py::arg("var")) + + .def("append", + [](FetchList &self, const LoDTensorArray &t) { + self.emplace_back(); + auto &lod_tensor_array = boost::get(self.back()); + for (size_t i = 0; i < t.size(); ++i) { + lod_tensor_array[i].ShareDataWith(t[i]); + lod_tensor_array[i].set_lod(t[i].lod()); + } + }, + py::arg("var")); + + py::class_(m, "FetchUnmergedList", R"DOC( + FetchUnmergedList is 2-D array of FetchType(boost::variant(LoDTensor, LoDTensorArray)). )DOC") .def("_move_to_list", - [](LoDTensor2DArray &self) -> py::list { + [](FetchUnmergedList &self) -> py::list { py::list res(self.size()); for (size_t i = 0; i < self.size(); ++i) { py::list tmp(self[i].size()); for (size_t j = 0; j < self[i].size(); ++j) { - tmp[j] = py::cast(std::move(self[i][j])); + if (data_is_lod_tensor(self[i][j])) { + auto &var = boost::get(self[i][j]); + tmp[j] = py::cast(std::move(var)); + } else { + auto &var = boost::get(self[i][j]); + py::list tmp_array(var.size()); + for (size_t k = 0; k < var.size(); ++k) { + tmp_array[k] = std::move(var[k]); + } + tmp[j] = std::move(tmp_array); + } } res[i] = std::move(tmp); self[i].clear(); @@ -2326,8 +2394,8 @@ All parameter, weight, gradient are variables in Paddle. 
             ret = self.Run(fetch_tensors, return_merged);
           }
           if (return_merged) {
-            return py::cast(std::move(
-                boost::get<paddle::framework::FeedFetchList>(ret)));
+            return py::cast(
+                std::move(boost::get<paddle::framework::FetchList>(ret)));
           } else {
             return py::cast(std::move(
                 boost::get<paddle::framework::FetchUnmergedList>(ret)));
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 8ceba92dd0742ccc915a6c2e3a18e655caa105d5..b635fbebe18a49a7d8da56983b3bf8b702c6d5bd 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -931,14 +931,14 @@ class Executor(object):
             return_merged(bool): This parameter indicates whether fetched variables (the variables
                 specified in the fetch list) should be merged according to the execution device dimension.
                 If :code:`return_merged` is False, the type of the return value is a two-dimensional list
-                of :code:`Tensor` ( :code:`return_numpy` is False) or a two-dimensional list of
-                :code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True,
-                the type of the return value is an one-dimensional list of :code:`Tensor` ( :code:`return_numpy`
-                is False) or an one-dimensional list of :code:`numpy.ndarray` ( :code:`return_numpy` is True).
-                Please see Examples 2 for more details. If the lengths of fetched results are variant, please
-                set :code:`return_merged` as False, which denotes that the fetched results will not be merged.
-                The default is True, but it is just for the compatibility, and may use False as default value
-                in the future version.
+                of :code:`Tensor` / :code:`LoDTensorArray` ( :code:`return_numpy` is False) or a two-dimensional
+                list of :code:`numpy.ndarray` ( :code:`return_numpy` is True). If :code:`return_merged` is True,
+                the type of the return value is a one-dimensional list of :code:`Tensor` / :code:`LoDTensorArray`
+                ( :code:`return_numpy` is False) or a one-dimensional list of :code:`numpy.ndarray`
+                ( :code:`return_numpy` is True). Please see Examples 2 for more details. If the sizes of the
+                fetched results differ across devices, please set :code:`return_merged` as False, which denotes
+                that the fetched results will not be merged. The default is True, but it is just for
+                compatibility, and may use False as the default value in a future version.
             use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned.
                 If the parameter is True, the program will be pruned according to the given feed and fetch_list,
                 which means the operators and variables in program that generate :code:`feed` and are not
@@ -980,13 +984,17 @@ class Executor(object):
                 loss = fluid.layers.mean(hidden)
                 adam = fluid.optimizer.Adam()
                 adam.minimize(loss)
+                i = fluid.layers.zeros(shape=[1], dtype='int64')
+                array = fluid.layers.array_write(x=loss, i=i)
 
                 # Run the startup program once and only once.
                 exe.run(fluid.default_startup_program())
 
                 x = numpy.random.random(size=(10, 1)).astype('float32')
-                outs = exe.run(feed={'X': x},
-                               fetch_list=[loss.name])
+                loss_val, array_val = exe.run(feed={'X': x},
+                                              fetch_list=[loss.name, array.name])
+                print(array_val)
+                # [array([0.02153828], dtype=float32)]
 
         Examples 2:
             .. code-block:: python
@@ -1226,7 +1230,7 @@ class Executor(object):
         else:
             self._default_executor.run_prepared_ctx(ctx, scope, False, False,
                                                     False)
-        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
+        arr = scope.find_var(fetch_var_name).get_fetch_list()
         tensors = arr._move_to_list()
         if return_numpy:
             return as_numpy(tensors)
diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
index 1269735049fc7445c9367238998ebcd1a0dd6d43..17507c70d90d2a3d3264c50e37c61d37ef7c9e7d 100644
--- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
@@ -58,8 +58,11 @@ def convolutional_neural_network(use_py_reader):
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_loss = fluid.layers.mean(loss)
     acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return img, label, prediction, avg_loss, acc, py_reader
+    i = fluid.layers.zeros(shape=[1], dtype='int64')
+    array = fluid.layers.array_write(x=prediction, i=i)
+    fluid.layers.increment(i)
+    fluid.layers.array_write(x=acc, i=i, array=array)
+    return array, img, label, prediction, avg_loss, acc, py_reader
 
 
 def test():
@@ -69,7 +72,7 @@ def test():
     test_reader = paddle.batch(
         paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
 
-    img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
+    array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
         use_py_reader=False)
 
     feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
@@ -102,7 +105,7 @@ def train(use_cuda, thread_num, cpu_num):
         print("paddle is not compiled with cuda, exit!")
         return
 
-    img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
+    array, img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network(
         use_py_reader=True)
     print("build convolutional neural network done.")
 
@@ -150,7 +153,12 @@ def train(use_cuda, thread_num, cpu_num):
     py_reader.start()
     try:
         while True:
-            loss_val = pe.run(fetch_list=[avg_loss.name])
+            array_v, acc_v, prediction_v, loss_val = pe.run(
+                fetch_list=[array, acc, prediction, avg_loss.name])
+
+            assert numpy.allclose(array_v[0], prediction_v) == True
+            assert numpy.allclose(array_v[1], acc_v) == True
+
             loss_val = numpy.mean(loss_val)
             if step % 10 == 0:
                 print("Pass %d, Batch %d, Cost %f, queue size %d" %
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
index efac29c2e041f804be358a0b75fe508f601f6def..ebe820cb90ae235c4a95819f823b74afbc98e3b3 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
@@ -19,25 +19,40 @@ import unittest
 
 import numpy
 import paddle.fluid.core as core
 from paddle.fluid.executor import Executor
-from paddle.fluid.layers import mul, data
+from paddle.fluid.layers import mul, data, zeros, array_write, increment
 
 
 class TestExecutor(unittest.TestCase):
     def test_mul(self):
+        i = zeros(shape=[1], dtype='int64')
         a = data(name='a', shape=[784], dtype='float32')
+        array = array_write(x=a, i=i)
+
+        i = increment(i)
         b = data(
             name='b', shape=[784, 100], dtype='float32', append_batch_size=False)
+        array_write(x=b, i=i, array=array)
+
+        i = increment(i)
         out = mul(x=a, y=b)
+        array_write(x=out, i=i, array=array)
+
         a_np = numpy.random.random((100, 784)).astype('float32')
         b_np = numpy.random.random((784, 100)).astype('float32')
+
         exe = Executor()
-        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
-        out = outs[0]
-        self.assertEqual((100, 100), out.shape)
-        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+        res, res_array = exe.run(feed={'a': a_np,
+                                       'b': b_np},
+                                 fetch_list=[out, array])
+
+        self.assertEqual((100, 100), res.shape)
+        self.assertTrue(numpy.allclose(res, numpy.dot(a_np, b_np)))
+        self.assertTrue(numpy.allclose(res_array[0], a_np))
+        self.assertTrue(numpy.allclose(res_array[1], b_np))
+        self.assertTrue(numpy.allclose(res_array[2], res))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
index b823d397e9530362f5fee417278e36477d65f6f5..d1842001379ec75d259fde8a032a0ce599ed9b2f 100644
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
@@ -31,7 +31,9 @@ class TestFeedFetch(unittest.TestCase):
 
         core.set_feed_variable(scope, input_tensor, "feed", 0)
 
-        output_tensor = core.get_fetch_variable(scope, "feed", 0)
+        output = scope.var("fetch").get_fetch_list()
+        output.append(input_tensor)
+        output_tensor = core.get_fetch_variable(scope, "fetch", 0)
 
         output_lod = output_tensor.recursive_sequence_lengths()
         self.assertEqual(2, output_lod[0][0])
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..50ad2a4087afe312aacba524c065d467fc36ef89
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+import unittest
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from simple_nets import simple_fc_net_with_inputs, simple_fc_net
+
+
+class TestFetchLoDTensorArray(unittest.TestCase):
+    def build_program(self, main_program, startup_program):
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main_program, startup_program):
+                i = layers.zeros(shape=[1], dtype='int64')
+                img = fluid.data(name='image', shape=[-1, 784], dtype='float32')
+                label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
+                loss = simple_fc_net_with_inputs(img, label, class_num=10)
+                loss = simple_fc_net()
+                opt = fluid.optimizer.SGD(learning_rate=0.001)
+                opt.minimize(loss)
+
+                array = layers.array_write(x=img, i=i)
+                i = layers.increment(i)
+                layers.array_write(x=label, i=i, array=array)
+                i = layers.increment(i)
+                layers.array_write(x=loss, i=i, array=array)
+
+        return loss, array
+
+    def check_network(self, use_cuda=True):
+        os.environ["CPU_NUM"] = str(2)
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+
+        loss, array = self.build_program(main_program, startup_program)
+
+        batch_size = 32
+        image = np.random.normal(size=(batch_size, 784)).astype('float32')
+        label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_program)
+        feed_dict = {'image': image, 'label': label}
+
+        build_strategy = fluid.BuildStrategy()
+        binary = fluid.CompiledProgram(main_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
+
+        device_num = fluid.core.get_cuda_device_count() if use_cuda else 2
+        for _ in range(3):
+            loss_v, array_v = exe.run(binary,
+                                      feed=feed_dict,
+                                      fetch_list=[loss, array],
+                                      return_merged=False)
+            self.assertEqual(np.array(loss_v).shape, (device_num, 1))
+            self.assertEqual(
+                np.array(array_v[0][0]).shape, (batch_size / device_num, 784))
+            self.assertEqual(
+                np.array(array_v[0][1]).shape, (batch_size / device_num, 1))
+            self.assertEqual(np.array(array_v[0][2]).shape, (1, ))
+
+        for _ in range(3):
+            loss_v, array_v = exe.run(binary,
+                                      feed=feed_dict,
+                                      fetch_list=[loss, array],
+                                      return_merged=True)
+            self.assertEqual(np.array(loss_v).shape, (device_num, ))
+            self.assertEqual(np.array(array_v[0]).shape, (batch_size, 784))
+            self.assertEqual(np.array(array_v[1]).shape, (batch_size, 1))
+            self.assertTrue(np.allclose(loss_v, array_v[2]))
+
+    def test_fetch_lod_tensor_array(self):
+        if fluid.core.is_compiled_with_cuda():
+            self.check_network(use_cuda=True)
+        self.check_network(use_cuda=False)
+
+    def test_fetch_unmerged_parallel_graph(self):
+        fluid.core.globals()['FLAGS_enable_parallel_graph'] = True
+        if fluid.core.is_compiled_with_cuda():
+            self.check_network(use_cuda=True)
+        self.check_network(use_cuda=False)
+        fluid.core.globals()['FLAGS_enable_parallel_graph'] = False
+
+
+if __name__ == '__main__':
+    unittest.main()