diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc
index c92097eaaa871fc80f721ef261419ef175159272..f7be0cb6b81bb1555f3789529ec97a76d6a8f33b 100644
--- a/mindspore/ccsrc/backend/session/ascend_session.cc
+++ b/mindspore/ccsrc/backend/session/ascend_session.cc
@@ -608,14 +608,20 @@ py::tuple AscendSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &gr
   MS_EXCEPTION_IF_NULL(graph);
   MS_LOG(INFO) << "Run op " << op_run_info.op_name << " start!";
   // malloc mem
-  RunOpMemoryAlloc(input_tensors, graph.get());
+  RunOpMemoryAlloc(op_run_info.value, input_tensors, graph.get());
   // load input data to device
   LoadInputData(graph, input_tensors);
   // run op
   RunOpExecTask(graph);
   // get output
   VectorRef outputs;
-  UpdateOutputs(graph, &outputs, input_tensors);
+  if (op_run_info.value != nullptr) {
+    std::vector<tensor::TensorPtr> pre_output_tensors;
+    TensorValueToTensor(op_run_info.value, &pre_output_tensors);
+    std::copy(pre_output_tensors.begin(), pre_output_tensors.end(), std::back_inserter(outputs));
+  } else {
+    UpdateOutputs(graph, &outputs, input_tensors);
+  }
   // trans output to tuple
   auto output_tensors = TransformBaseRefListToTuple(outputs);
   if (!utils::isa<VectorRef>(output_tensors) ||
@@ -744,14 +750,15 @@ void AscendSession::MemoryAlloc(KernelGraph *kernel_graph) const {
   MS_LOG(INFO) << "Finish!";
 }
 
-void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors,
+void AscendSession::RunOpMemoryAlloc(const ValuePtr &pre_output_value,
+                                     const std::vector<tensor::TensorPtr> &input_tensors,
                                      KernelGraph *kernel_graph) const {
   MS_LOG(INFO) << "Start memory alloc!";
   MS_EXCEPTION_IF_NULL(kernel_graph);
   opt::RemoveNopNode(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
+  runtime_instance->RunOpAssignMemory(pre_output_value, input_tensors, kernel_graph);
   MS_LOG(INFO) << "Finish!";
 }
diff --git a/mindspore/ccsrc/backend/session/ascend_session.h b/mindspore/ccsrc/backend/session/ascend_session.h
index b0f2a5d5dc8f07d5076f5f503b0497e68caa736e..ddd9f8d7aa43c599e3a84af10a9b303f8856df17 100755
--- a/mindspore/ccsrc/backend/session/ascend_session.h
+++ b/mindspore/ccsrc/backend/session/ascend_session.h
@@ -79,7 +79,8 @@ class AscendSession : public SessionBasic {
   void AssignStream(NotNull<KernelGraphPtr> kernel_graph) const;
   void BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   void MemoryAlloc(KernelGraph *kernel_graph) const;
-  void RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph) const;
+  void RunOpMemoryAlloc(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
+                        KernelGraph *kernel_graph) const;
   void RunOpMemoryClear(const KernelGraph *kernel_graph) const;
   void GenerateTaskInfo(const std::shared_ptr<KernelGraph> &kernel_graph) const;
   void LoadTask(const std::shared_ptr<KernelGraph> &kernel_graph) const;
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 4398d9a0375aebb25b5dee36462326737dc416a9..8462664f67f50872febf04007c5e959edb05bee9 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -102,12 +102,13 @@ void GPUSession::AllocateMemory(KernelGraph *kernel_graph) const {
   runtime_instance->AssignMemory(kernel_graph);
 }
 
-void GPUSession::RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors,
+void GPUSession::RunOpAllocateMemory(const ValuePtr &pre_output_value,
+                                     const std::vector<tensor::TensorPtr> &input_tensors,
                                      KernelGraph *kernel_graph) const {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
   MS_EXCEPTION_IF_NULL(runtime_instance);
-  runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph);
+  runtime_instance->RunOpAssignMemory(pre_output_value, input_tensors, kernel_graph);
 }
 
 void GPUSession::RunOpClearMemory(KernelGraph *kernel_graph) const {
@@ -292,7 +293,7 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph
   MS_EXCEPTION_IF_NULL(kernel_graph);
   // Remove NoOp from execution graph
   opt::RemoveNopNode(kernel_graph.get());
-  RunOpAllocateMemory(input_tensors, kernel_graph.get());
+  RunOpAllocateMemory(op_run_info.value, input_tensors, kernel_graph.get());
   // Execute the computation
   LoadInputData(kernel_graph, input_tensors);
   Execute(kernel_graph);
diff --git a/mindspore/ccsrc/backend/session/gpu_session.h b/mindspore/ccsrc/backend/session/gpu_session.h
index 3e4e84a29bbf5b44de7a62d2ad60abcd73dd8cb0..70d904ef7aebc8853ef173d5d9ada581e3911372 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.h
+++ b/mindspore/ccsrc/backend/session/gpu_session.h
@@ -59,7 +59,8 @@ class GPUSession : public SessionBasic {
 
   void AllocateMemory(KernelGraph *kernel_graph) const;
 
-  void RunOpAllocateMemory(const std::vector<tensor::TensorPtr> &input_tensors, KernelGraph *kernel_graph) const;
+  void RunOpAllocateMemory(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
+                           KernelGraph *kernel_graph) const;
 
   void RunOpClearMemory(KernelGraph *kernel_graph) const;
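Both session backends make the same two-part change: RunOp threads op_run_info.value (the forward value recorded on a previous execution of this op) into memory allocation, and returns the cached tensors directly instead of reading fresh outputs back. A minimal sketch of that fast path, using simplified Tensor/Value stand-ins rather than the real MindSpore types:

// Sketch of the forward-value fast path in RunOp; Tensor and Value here
// are hypothetical stand-ins for tensor::Tensor and ValuePtr.
#include <iostream>
#include <memory>
#include <vector>

struct Tensor { int device_address = -1; };  // -1 means "not on device"
using TensorPtr = std::shared_ptr<Tensor>;
struct Value { std::vector<TensorPtr> tensors; };  // recorded forward output
using ValuePtr = std::shared_ptr<Value>;

std::vector<TensorPtr> RunOp(const ValuePtr &pre_output, const std::vector<TensorPtr> &inputs) {
  if (pre_output != nullptr) {
    // Fast path: the op already ran in an earlier step; reuse the cached
    // output tensors (and their device addresses) instead of recomputing.
    return pre_output->tensors;
  }
  // Slow path: allocate, execute, and collect fresh outputs.
  auto out = std::make_shared<Tensor>();
  out->device_address = 0;  // pretend we allocated device memory here
  return {out};
}

int main() {
  auto cached = std::make_shared<Value>();
  cached->tensors.push_back(std::make_shared<Tensor>());
  std::cout << RunOp(nullptr, {}).size() << " fresh output\n";
  std::cout << RunOp(cached, {}).size() << " cached output\n";
}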
diff --git a/mindspore/ccsrc/backend/session/kernel_graph.cc b/mindspore/ccsrc/backend/session/kernel_graph.cc
index b9e4e003bf29fe885dc3ee04a3cf21ad666190bc..4313be73797f27a6a77cb619f6fee97379cc7744 100644
--- a/mindspore/ccsrc/backend/session/kernel_graph.cc
+++ b/mindspore/ccsrc/backend/session/kernel_graph.cc
@@ -95,6 +95,38 @@ bool IsSameLabel(const CNodePtr &left, const CNodePtr &right) {
   }
   return false;
 }
+
+void SyncDeviceInfoToValueNode(const ValueNodePtr &value_node, std::vector<std::string> *device_formats,
+                               std::vector<TypeId> *device_types) {
+  MS_EXCEPTION_IF_NULL(value_node);
+  MS_EXCEPTION_IF_NULL(device_formats);
+  MS_EXCEPTION_IF_NULL(device_types);
+  ValuePtr value = value_node->value();
+  std::vector<tensor::TensorPtr> tensors;
+  TensorValueToTensor(value, &tensors);
+  if (!tensors.empty()) {
+    if (tensors.size() != AnfAlgo::GetOutputTensorNum(value_node)) {
+      MS_LOG(EXCEPTION) << "The size of tensors converted from value [" << tensors.size()
+                        << "] is not equal to output size of value node [" << AnfAlgo::GetOutputTensorNum(value_node)
+                        << "]";
+    }
+    device_formats->clear();
+    device_types->clear();
+    for (const auto &tensor : tensors) {
+      MS_EXCEPTION_IF_NULL(tensor);
+      auto device_sync = tensor->device_address();
+      if (device_sync != nullptr) {
+        auto device_address = std::dynamic_pointer_cast<device::DeviceAddress>(device_sync);
+        MS_EXCEPTION_IF_NULL(device_address);
+        device_formats->emplace_back(device_address->format());
+        device_types->emplace_back(device_address->type_id());
+        continue;
+      }
+      device_formats->emplace_back(kOpFormat_DEFAULT);
+      device_types->emplace_back(kTypeUnknown);
+    }
+  }
+}
 }  // namespace
 AnfNodePtr KernelGraph::MakeValueNode(const AnfNodePtr &node) {
   auto value_node = node->cast<ValueNodePtr>();
@@ -347,10 +379,12 @@ void KernelGraph::SetKernelInfoForNode(const AnfNodePtr &node) const {
   auto kernel_build_info_builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
   // set the format of value_node to DEFAULT_FORMAT
   std::vector<TypeId> types;
-  kernel_build_info_builder->SetOutputsFormat(std::vector<std::string>{kOpFormat_DEFAULT});
+  std::vector<std::string> formats = {kOpFormat_DEFAULT};
   if (node->isa<ValueNode>()) {
     kernel_info->SetFeatureMapFlag(false);
     types.emplace_back(kTypeUnknown);
+    auto value_node = node->cast<ValueNodePtr>();
+    SyncDeviceInfoToValueNode(value_node, &formats, &types);
   }
   if (node->isa<Parameter>()) {
     auto parameter = node->cast<ParameterPtr>();
@@ -360,6 +394,7 @@ void KernelGraph::SetKernelInfoForNode(const AnfNodePtr &node) const {
     types.push_back(is_weight ? kTypeUnknown : AnfAlgo::GetOutputInferDataType(parameter, 0));
   }
   // set parameter initial device data type
+  kernel_build_info_builder->SetOutputsFormat(formats);
   kernel_build_info_builder->SetOutputsDeviceType(types);
   AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), node.get());
 }
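SyncDeviceInfoToValueNode derives per-output format and dtype for a value node from the device addresses of its cached tensors, falling back to the default format and unknown type for host-only tensors. A sketch of just that fallback, with DeviceAddress as a hypothetical stand-in and an arbitrary numeric type id:

// Per-tensor device-info fallback: tensors already on device keep their
// real format/type; host-only tensors get defaults.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct DeviceAddress { std::string format; int type_id; };
struct Tensor { std::shared_ptr<DeviceAddress> device_address; };

void SyncDeviceInfo(const std::vector<Tensor> &tensors, std::vector<std::string> *formats, std::vector<int> *types) {
  const std::string kDefaultFormat = "DefaultFormat";  // stand-in for kOpFormat_DEFAULT
  const int kUnknownType = 0;                          // stand-in for kTypeUnknown
  formats->clear();
  types->clear();
  for (const auto &t : tensors) {
    if (t.device_address != nullptr) {
      formats->push_back(t.device_address->format);
      types->push_back(t.device_address->type_id);
    } else {
      formats->push_back(kDefaultFormat);
      types->push_back(kUnknownType);
    }
  }
}

int main() {
  std::vector<Tensor> ts;
  ts.push_back(Tensor{std::make_shared<DeviceAddress>(DeviceAddress{"NC1HWC0", 1})});
  ts.push_back(Tensor{nullptr});
  std::vector<std::string> formats;
  std::vector<int> types;
  SyncDeviceInfo(ts, &formats, &types);
  std::cout << formats[0] << " " << formats[1] << "\n";  // NC1HWC0 DefaultFormat
}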
diff --git a/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.cc b/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.cc
index b314b22f818a713ff0644156c952ce86fa7d4419..681bd90eb1630d278c02a7976e1954722a66f1d4 100644
--- a/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.cc
+++ b/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.cc
@@ -216,6 +216,7 @@ AdjointPtr DFunctor::MapMorphism(const AnfNodePtr &morph) {
   TraceManager::DebugTrace(std::make_shared<TraceGradFpropApp>(cnode_morph->debug_info()));
   auto k_app = k_graph_->NewCNode(inputs);
   TraceManager::EndTrace();
+  ReplaceEquivdout(k_app, cnode_morph->forward());
   for (size_t i = 0; i < param_adjoints.size(); ++i) {
     param_adjoints[i]->RegisterKUser(k_app, i);
   }
@@ -237,6 +238,37 @@ AdjointPtr DFunctor::MapMorphism(const AnfNodePtr &morph) {
   return node_adjoint;
 }
 
+void DFunctor::ReplaceEquivdout(const CNodePtr &cnode, const ValuePtr &forward) {
+  if (forward == nullptr) {
+    return;
+  }
+  auto &input = cnode->input(0);
+  if (!IsValueNode<FuncGraph>(input)) {
+    return;
+  }
+  auto fg = GetValueNode<FuncGraphPtr>(input);
+  auto output = fg->output();
+  if (!output->isa<CNode>()) {
+    return;
+  }
+  auto cnode_output = output->cast<CNodePtr>();
+  auto &cnode_input = cnode_output->input(1);
+  if (!cnode_input->isa<CNode>()) {
+    return;
+  }
+  auto &input_fg = cnode_output->input(2);
+  if (!IsValueNode<FuncGraph>(input_fg)) {
+    return;
+  }
+  auto equivdout = cnode_input->cast<CNodePtr>();
+  auto func_graph = GetValueNode<FuncGraphPtr>(input_fg);
+  auto manager = Manage({fg, func_graph}, false);
+  MS_LOG(DEBUG) << "Replace: " << equivdout->ToString() << " with " << forward;
+  auto value_node = NewValueNode(forward);
+  value_node->set_has_new_value(true);
+  manager->Replace(equivdout, value_node);
+}
+
 bool DFunctor::IsFreeMorphism(const AnfNodePtr &node) {
   // Do not care about non-CNode
   if (!node->isa<CNode>()) {
diff --git a/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.h b/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.h
index 4202ed192978c75f12c41a09c254deabbf99e653..0a25d3e396976a3c4a60480ff826b535344fc358 100644
--- a/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.h
+++ b/mindspore/ccsrc/frontend/optimizer/ad/dfunctor.h
@@ -95,6 +95,7 @@ class DFunctor : public std::enable_shared_from_this<DFunctor> {
   // Update k hole with adjoint_definition, only applied in recursive case.
   void UpdateAdjoint(const AdjointPtr &adjoint_definition);
   void CallDoutHoleOnTape();
+  void ReplaceEquivdout(const CNodePtr &cnode, const ValuePtr &forward);
 
   std::unordered_map<AnfNodePtr, AdjointPtr> anfnode_to_adjoin_;
   // Cache for indirect fv backpropagation, K o K can only do backprop layer by layer.
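ReplaceEquivdout is the grad-side hook: when a CNode carries a recorded forward value, the equivalent-output node inside its K graph is replaced by a value node holding that value, and the node is flagged has_new_value so later passes treat it as runtime data rather than a compile-time constant. A toy version of the rewrite, with hypothetical Node/CNode/ValueNode stand-ins for the ANF classes:

// Toy IR rewrite: the K graph's output pair (equivdout, bprop) gets its
// first element swapped for a constant holding the recorded forward value.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Node {
  virtual ~Node() = default;
  virtual std::string ToString() const = 0;
};
struct ValueNode : Node {
  double value;
  bool has_new_value = false;
  explicit ValueNode(double v) : value(v) {}
  std::string ToString() const override { return "Value(" + std::to_string(value) + ")"; }
};
struct CNode : Node {
  std::vector<std::shared_ptr<Node>> inputs;  // inputs[1] = equivdout, inputs[2] = bprop
  std::string ToString() const override { return "CNode"; }
};

int main() {
  auto output = std::make_shared<CNode>();
  output->inputs = {nullptr, std::make_shared<CNode>(), std::make_shared<CNode>()};
  // Replace the forward result (equivdout) with the recorded value and mark
  // it fresh so abstract interpretation broadens it instead of folding it.
  auto replacement = std::make_shared<ValueNode>(3.14);
  replacement->has_new_value = true;
  output->inputs[1] = replacement;
  std::cout << output->inputs[1]->ToString() << "\n";  // Value(3.140000)
}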
diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h b/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h
index 6b5a56a7fd4c1c9be463d2c61d7ee10b2a4c1753..e794671f982f3b87beacd0cc3e2e6301f3f8816b 100644
--- a/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h
+++ b/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h
@@ -88,7 +88,9 @@ class GetitemConstEliminater : public AnfVisitor {
     AnfVisitor::Match(prim::kPrimListGetItem, {IsVNode, IsVNode})(node);
 
     if (is_match_) {
-      return NewValueNode((*tuple_)[id_]);
+      auto out = NewValueNode((*tuple_)[id_]);
+      out->set_has_new_value(has_new_value_);
+      return out;
     }
     return nullptr;
   }
@@ -96,6 +98,7 @@ class GetitemConstEliminater : public AnfVisitor {
   void Visit(const ValueNodePtr &vnode) override {
     if (IsValueNode<ValueTuple>(vnode)) {
       tuple_ = GetValueNode<ValueTuplePtr>(vnode);
+      has_new_value_ = vnode->has_new_value();
    }
     if (tuple_ != nullptr && IsValueNode<Int32Imm>(vnode)) {
       id_ = IntToSize(GetValue<int>(vnode->value()));
@@ -115,6 +118,7 @@ class GetitemConstEliminater : public AnfVisitor {
   bool is_match_{false};
   size_t id_{0};
   ValueTuplePtr tuple_{nullptr};
+  bool has_new_value_{false};
 };
 
 // setitem((a, b, c, ...), 0, z) => (z, b, c, ...)
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc
index b8543bad48dc7f50767fb1e0b48f206f1a076e12..da71f3996c0ea9e1be9393acfa577fa714c0975d 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/static_analysis.cc
@@ -205,7 +205,11 @@ EvalResultPtr AnalysisEngine::Eval(const AnfNodeConfigPtr &conf) {
 AbstractBasePtr AnalysisEngine::EvalValueNode(const ValueNodePtr &value_node, const AnfNodeConfigPtr &conf) {
   MS_EXCEPTION_IF_NULL(conf);
   MS_EXCEPTION_IF_NULL(value_node);
-  return ToAbstract(value_node->value(), conf->context(), conf);
+  auto out = ToAbstract(value_node->value(), conf->context(), conf);
+  if (value_node->has_new_value()) {
+    out = out->Broaden();
+  }
+  return out;
 }
 
 EvalResultPtr AnalysisEngine::EvalCNode(const CNodePtr &cnode, const AnfNodeConfigPtr &conf) {
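The flag matters because GetitemConstEliminater propagates constants: once the forward value is folded into a value node, abstract interpretation would otherwise specialize the graph on that concrete value. Broaden() keeps the type and shape but drops the constant. A sketch of that contract, with AbstractScalar as a hypothetical stand-in for the MindSpore class:

// Why has_new_value triggers Broaden(): a runtime-captured value must keep
// only its type information, not the concrete constant.
#include <iostream>
#include <memory>
#include <optional>
#include <string>

struct AbstractScalar {
  std::string type;
  std::optional<double> value;  // set only for genuine compile-time constants
  std::shared_ptr<AbstractScalar> Broaden() const {
    auto out = std::make_shared<AbstractScalar>(*this);
    out->value.reset();  // drop the concrete value, keep the type
    return out;
  }
};

int main() {
  AbstractScalar a{"float32", 2.5};
  auto broadened = a.Broaden();
  std::cout << a.value.has_value() << " -> " << broadened->value.has_value() << "\n";  // 1 -> 0
}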
diff --git a/mindspore/ccsrc/pipeline/pynative/base.h b/mindspore/ccsrc/pipeline/pynative/base.h
index 2a380d690080ac88113523b676647c33b23510e8..efde3f2e58f0a4e4095361158d3f937d3b1acb36 100644
--- a/mindspore/ccsrc/pipeline/pynative/base.h
+++ b/mindspore/ccsrc/pipeline/pynative/base.h
@@ -26,6 +26,7 @@
 #include <vector>
 
 #include "pybind11/pybind11.h"
+#include "ir/anf.h"
 #include "ir/primitive_py.h"
 #include "abstract/abstract_value.h"
 
@@ -51,6 +52,7 @@ struct OpExecInfo {
   PrimitivePyPtr py_primitive;
   std::string op_name;
   AbstractBasePtr abstract;
+  ValuePtr value = nullptr;
 
   py::tuple op_inputs;
   py::tuple inputs_mask;
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
index a39a931b294d85f0897390737ee1bd0d1b459130..224d10c214f7667c725f3d9437d7ceec0247cb2a 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
@@ -111,7 +111,7 @@ inline ValuePtr PyAttrValue(const py::object &obj) {
   return converted_ret;
 }
 
-std::string GetId(const py::object &obj) {
+static std::string GetId(const py::object &obj) {
   py::object to_process = obj;
   std::string prefix = "";
   if (py::isinstance<py::tuple>(to_process)) {
@@ -141,6 +141,11 @@ std::string GetId(const py::object &obj) {
   return py::cast<std::string>(ret);
 }
 
+static std::string GetOpId(const OpExecInfoPtr &op_exec_info) {
+  auto id = GetId(op_exec_info->py_primitive->GetPyObj());
+  return id;
+}
+
 py::object GetTupleObj(const py::object &obj) {
   py::module mod = parse::python_adapter::GetPyModule(parse::PYTHON_MOD_PARSE_MODULE);
   py::object obj_tuple = parse::python_adapter::CallPyModFn(mod, parse::PYTHON_MOD_GET_DEFAULT_INPUT, obj);
@@ -317,6 +322,7 @@ OpExecInfoPtr GenerateOpExecInfo(const py::args &args, py::list *const out_args)
   }
   op_exec_info->py_primitive = prim;
   op_exec_info->op_attrs = py::getattr(args[PY_PRIM], "attrs");
+  op_exec_info->value = PynativeExecutor::GetInstance()->GetForwardValue(op_exec_info);
   if (op_exec_info->op_inputs.size() != op_exec_info->inputs_mask.size()) {
     MS_LOG(ERROR) << "Op:" << op_exec_info->op_name << " inputs size not equal op_mask";
     return nullptr;
@@ -606,7 +612,20 @@ py::object RunOpWithBackendPolicy(MsBackendPolicy backend_policy, const OpExecIn
   return result;
 }
 
-AnfNodePtr PynativeExecutor::MakeCNode(const OpExecInfoPtr &op_exec_info, const py::args &args, const py::tuple &out) {
+ValuePtr PynativeExecutor::GetForwardValue(const OpExecInfoPtr &op_exec_info) {
+  auto id = GetOpId(op_exec_info);
+  auto op = id;
+  op.append(std::to_string(op_id_map_[id]));
+  auto iter = op_forward_map_.find(op);
+  if (iter != op_forward_map_.end()) {
+    ++op_id_map_[id];
+    MS_LOG(DEBUG) << "Get: " << op_exec_info->op_name << "(" << op << "), " << iter->second;
+    return iter->second;
+  }
+  return nullptr;
+}
+
+CNodePtr PynativeExecutor::MakeCNode(const OpExecInfoPtr &op_exec_info, const py::args &args, const py::tuple &out) {
   if (!grad_flag_ || graph_info_map_.empty()) {
     return nullptr;
   }
@@ -645,6 +664,34 @@ AnfNodePtr PynativeExecutor::MakeCNode(const OpExecInfoPtr &op_exec_info, const
   return cnode;
 }
 
+void PynativeExecutor::SaveOpForwardValue(const OpExecInfoPtr &op_exec_info, const ValuePtr &value) {
+  auto id = GetOpId(op_exec_info);
+  auto op = id;
+  op.append(std::to_string(op_id_map_[id]));
+  auto iter = op_forward_map_.find(op);
+  if (iter != op_forward_map_.end()) {
+    return;
+  }
+  op_forward_map_[op] = value;
+  ++op_id_map_[id];
+  MS_LOG(DEBUG) << "Save: " << op_exec_info->op_name << "(" << op << "), " << value;
+}
+
+void PynativeExecutor::SaveAllResult(const OpExecInfoPtr &op_exec_info, const CNodePtr &cnode, const py::tuple &out) {
+  if (!grad_flag_ || op_exec_info->value != nullptr) {
+    return;
+  }
+  py::object out_real = out;
+  if (out.size() == 1) {
+    out_real = out[0];
+  }
+  auto value = PyAttrValue(out_real);
+  if (cnode != nullptr) {
+    cnode->set_forward(value);
+  }
+  SaveOpForwardValue(op_exec_info, value);
+}
+
 AnfNodePtr PynativeExecutor::GetObjNode(const py::object &obj) {
   auto &out = graph_info_map_[curr_g_].obj_node_map[GetId(obj)];
   if (out.second.size() == 1 && out.second[0] == -1) {
@@ -657,6 +704,7 @@ AnfNodePtr PynativeExecutor::GetObjNode(const py::object &obj) {
     node = curr_g_->NewCNode(tuple_get_item_inputs);
   }
   MS_LOG(DEBUG) << "GetObjNode output" << node->DebugString(6);
+  node->cast<CNodePtr>()->set_forward(PyAttrValue(obj));
   return node;
 }
 
@@ -690,11 +738,12 @@ py::tuple RunOpInner(const OpExecInfoPtr &op_exec_info, const py::args &args) {
     return err_ret;
   }
 
-  auto node = PynativeExecutor::GetInstance()->MakeCNode(op_exec_info, args, result);
-  if (node != nullptr) {
-    node->set_abstract(op_exec_info->abstract);
-    MS_LOG(DEBUG) << "RunOp MakeCnode,new node is: " << node->DebugString();
+  auto cnode = PynativeExecutor::GetInstance()->MakeCNode(op_exec_info, args, result);
+  if (cnode != nullptr) {
+    cnode->set_abstract(op_exec_info->abstract);
+    MS_LOG(DEBUG) << "RunOp MakeCnode, new node is: " << cnode->DebugString();
   }
+  PynativeExecutor::GetInstance()->SaveAllResult(op_exec_info, cnode, result);
   MS_LOG(DEBUG) << "RunOp end";
   return result;
 }
@@ -1072,7 +1121,7 @@ void PynativeExecutor::GradNetInner(const GradOperationPtr &grad, const py::obje
 
 void PynativeExecutor::Clear(const std::string &flag) {
   if (!flag.empty()) {
-    MS_LOG(INFO) << "Clear res";
+    MS_LOG(DEBUG) << "Clear res";
     (void)graph_map_.erase(flag);
     (void)cell_graph_map_.erase(flag);
     Clean();
@@ -1084,17 +1133,19 @@ void PynativeExecutor::Clear(const std::string &flag) {
     return;
   }
 
-  MS_LOG(INFO) << "Clear";
+  MS_LOG(DEBUG) << "Clear";
   top_g_ = nullptr;
   curr_g_ = nullptr;
   graph_info_map_.clear();
+  op_id_map_.clear();
   std::stack<FuncGraphPtr>().swap(graph_p_);
 }
 
 void PynativeExecutor::Clean() {
-  MS_LOG(INFO) << "Clean all res";
+  MS_LOG(DEBUG) << "Clean all res";
   Clear();
   grad_flag_ = false;
+  op_forward_map_.clear();
   df_builder_ = nullptr;
   ad::CleanRes();
   pipeline::ReclaimOptimizer();
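The executor keys its forward cache on the primitive's id plus a per-primitive running index, so the n-th call of an op within one step maps to the n-th recorded value; Clear() resets only the indices (op_id_map_) while Clean() also drops the recorded values (op_forward_map_). A self-contained sketch of that keying scheme, with double standing in for ValuePtr:

// Forward-value cache keyed by "<primitive id><running index>".
#include <iostream>
#include <string>
#include <unordered_map>

class ForwardCache {
 public:
  void Save(const std::string &id, double value) {
    auto key = id + std::to_string(op_id_map_[id]);
    if (op_forward_map_.count(key) > 0) {
      return;  // first write wins, mirroring SaveOpForwardValue
    }
    op_forward_map_[key] = value;
    ++op_id_map_[id];
  }
  bool Get(const std::string &id, double *value) {
    auto key = id + std::to_string(op_id_map_[id]);
    auto it = op_forward_map_.find(key);
    if (it == op_forward_map_.end()) {
      return false;
    }
    ++op_id_map_[id];  // advance so the next call of this op gets the next value
    *value = it->second;
    return true;
  }
  void ResetStep() { op_id_map_.clear(); }  // mirrors Clear(): indices restart, values survive

 private:
  std::unordered_map<std::string, double> op_forward_map_;
  std::unordered_map<std::string, int> op_id_map_;
};

int main() {
  ForwardCache cache;
  cache.Save("MatMul", 1.0);  // recorded under key "MatMul0"
  cache.Save("MatMul", 2.0);  // recorded under key "MatMul1"
  cache.ResetStep();          // next step: per-op indices restart at 0
  double v = 0;
  if (cache.Get("MatMul", &v)) std::cout << v << "\n";  // 1
  if (cache.Get("MatMul", &v)) std::cout << v << "\n";  // 2
}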
diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
index 4a2c525534059b0de4550454f22b370178ae6b38..1a5cb7408b3d2927ec0e9a26b3c181ff32437405 100644
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.h
@@ -95,7 +95,11 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   void set_obj_node_map(FuncGraphPtr g, const std::string obj, AnfNodePtr node, std::vector<int> index) {
     graph_info_map_[g].obj_node_map[obj] = std::make_pair(node, index);
   }
-  AnfNodePtr MakeCNode(const OpExecInfoPtr &op_exec_info, const py::args &args, const py::tuple &out);
+  CNodePtr MakeCNode(const OpExecInfoPtr &op_exec_info, const py::args &args, const py::tuple &out);
+  ValuePtr GetForwardValue(const OpExecInfoPtr &op_exec_info);
+  void SaveOpForwardValue(const OpExecInfoPtr &op_exec_info, const ValuePtr &value);
+  void SaveForwardResult(const CNodePtr &cnode, const py::object &out);
+  void SaveAllResult(const OpExecInfoPtr &op_exec_info, const CNodePtr &cnode, const py::tuple &out);
   py::object Run(const py::tuple &args, const py::object &phase);
 
   void Pushp();
@@ -116,6 +120,8 @@ class PynativeExecutor : public std::enable_shared_from_this<PynativeExecutor> {
   std::unordered_map<std::string, FuncGraphPtr> graph_map_;
   std::unordered_map<std::string, FuncGraphPtr> cell_graph_map_;
   std::unordered_map<FuncGraphPtr, GraphInfo> graph_info_map_;
+  std::unordered_map<std::string, ValuePtr> op_forward_map_;
+  std::unordered_map<std::string, int> op_id_map_;
   std::stack<FuncGraphPtr> graph_p_;
   FuncGraphPtr top_g_;
   FuncGraphPtr df_builder_;
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
index c6f3c6a7351f38b8f8046ffbb3c1dfba65af79e1..0ff65c4784444326ae04764dbcc3dc9347ff1963 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc
@@ -29,6 +29,7 @@
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/kernel_compiler/common_utils.h"
 #include "backend/kernel_compiler/oplib/oplib.h"
+#include "backend/optimizer/common/helper.h"
 #include "ir/value.h"
 using mindspore::kernel::Address;
 using mindspore::kernel::AddressPtr;
@@ -150,11 +151,13 @@ void KernelRuntime::AssignMemory(session::KernelGraph *graph) {
   UpdateRefNodeOutputMem(graph);
 }
 
-void KernelRuntime::RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors,
+void KernelRuntime::RunOpAssignMemory(const ValuePtr &pre_output_value,
+                                      const std::vector<tensor::TensorPtr> &input_tensors,
                                       session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
   RunOpAssignInputMemory(input_tensors, graph);
   AssignStaticMemoryValueNode(graph);
+  RunOpAssignOutputNodeMemory(pre_output_value, graph);
   for (const auto &cnode : graph->execution_order()) {
     RunOpAssignOutputMemory(cnode);
     RunOpAssignWorkSpaceMemory(cnode);
@@ -322,6 +325,45 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
   }
 }
 
+void KernelRuntime::RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph) {
+  if (pre_output_value == nullptr) {
+    return;
+  }
+  std::vector<tensor::TensorPtr> pre_output_tensors;
+  TensorValueToTensor(pre_output_value, &pre_output_tensors);
+  MS_EXCEPTION_IF_NULL(graph);
+  auto output_nodes = graph->outputs();
+  if (pre_output_tensors.size() != output_nodes.size()) {
+    MS_LOG(EXCEPTION) << "The size of pre output tensors [" << pre_output_tensors.size()
+                      << "] is not equal to the size of output nodes of graph [" << output_nodes.size() << "]";
+  }
+  // share output address with pre output tensors
+  for (size_t i = 0; i < output_nodes.size(); ++i) {
+    auto output_node_with_index = AnfAlgo::VisitKernel(output_nodes[i], 0);
+    if (!output_node_with_index.first->isa<CNode>()) {
+      MS_LOG(EXCEPTION) << "The output node should be a cnode, but it is "
+                        << output_node_with_index.first->DebugString();
+    }
+    auto real_output_cnode = output_node_with_index.first->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(real_output_cnode);
+    MS_EXCEPTION_IF_NULL(pre_output_tensors[i]);
+    if (pre_output_tensors[i]->device_address() == nullptr) {
+      MS_LOG(EXCEPTION) << "The address of pre output tensor [" << i << "] is a nullptr!";
+    }
+    if (opt::IsNopNode(real_output_cnode)) {
+      if (real_output_cnode->inputs().size() < 2) {
+        MS_LOG(EXCEPTION) << "The input size of output node: " << real_output_cnode->DebugString()
+                          << " should be larger than one!";
+      }
+      AnfAlgo::SetOutputAddr(std::dynamic_pointer_cast<device::DeviceAddress>(pre_output_tensors[i]->device_address()),
+                             output_node_with_index.second, real_output_cnode->input(1).get());
+    } else {
+      AnfAlgo::SetOutputAddr(std::dynamic_pointer_cast<device::DeviceAddress>(pre_output_tensors[i]->device_address()),
+                             output_node_with_index.second, output_node_with_index.first.get());
+    }
+  }
+}
+
 void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(mem_manager_);
@@ -573,32 +615,40 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
   MS_EXCEPTION_IF_NULL(mem_manager_);
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
-  auto tensor = node_value->cast<tensor::TensorPtr>();
-  if (tensor == nullptr) {
-    MS_LOG(WARNING) << "Tensor is null";
-    return;
-  }
-  size_t tensor_size = tensor->data().nbytes();
-  auto node_size = CountNodeDeviceMemorySize(value_node, output_idx);
-  TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(value_node, output_idx);
-  if (output_type_id == kTypeUnknown) {
-    output_type_id = AnfAlgo::GetOutputInferDataType(value_node, output_idx);
-  }
-  auto output_format = AnfAlgo::GetOutputFormat(value_node, output_idx);
-  DeviceAddressPtr address = nullptr;
-  address = CreateDeviceAddress(nullptr, node_size, output_format, output_type_id);
-  MS_EXCEPTION_IF_NULL(address);
-  if (ms_context->enable_pynative_infer() && !mem_manager_->MallocMemFromMemPool(address, node_size)) {
-    MS_LOG(EXCEPTION) << "Cannot alloc address from memory pool when tensor size is: " << node_size;
-  } else if (mem_manager_->MallocMem(address, kStaticMem, node_size) == nullptr) {
-    MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size;
-  }
-  AnfAlgo::SetOutputAddr(address, output_idx, value_node.get());
-  if (!address->SyncHostToDevice(trans::GetRuntimePaddingShape(value_node, 0), tensor_size, tensor->data_type(),
-                                 tensor->data_c())) {
-    MS_EXCEPTION(NotExistsError) << "ValueNode SyncHostToDevice fail!" << value_node->DebugString() << "node format is"
-                                 << AnfAlgo::GetOutputFormat(value_node, output_idx) << "node dtype is "
-                                 << AnfAlgo::GetOutputInferDataType(value_node, output_idx);
+  std::vector<tensor::TensorPtr> tensors;
+  TensorValueToTensor(node_value, &tensors);
+  for (const auto &tensor : tensors) {
+    if (tensor == nullptr) {
+      MS_LOG(WARNING) << "Tensor is null";
+      return;
+    }
+    if (tensor->device_address() != nullptr) {
+      AnfAlgo::SetOutputAddr(std::dynamic_pointer_cast<device::DeviceAddress>(tensor->device_address()), output_idx++,
+                             value_node.get());
+      continue;
+    }
+    size_t tensor_size = tensor->data().nbytes();
+    auto node_size = CountNodeDeviceMemorySize(value_node, output_idx);
+    TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(value_node, output_idx);
+    if (output_type_id == kTypeUnknown) {
+      output_type_id = AnfAlgo::GetOutputInferDataType(value_node, output_idx);
+    }
+    auto output_format = AnfAlgo::GetOutputFormat(value_node, output_idx);
+    DeviceAddressPtr address = nullptr;
+    address = CreateDeviceAddress(nullptr, node_size, output_format, output_type_id);
+    MS_EXCEPTION_IF_NULL(address);
+    if (ms_context->enable_pynative_infer() && !mem_manager_->MallocMemFromMemPool(address, node_size)) {
+      MS_LOG(EXCEPTION) << "Cannot alloc address from memory pool when tensor size is: " << node_size;
+    } else if (mem_manager_->MallocMem(address, kStaticMem, node_size) == nullptr) {
+      MS_LOG(EXCEPTION) << "Cannot alloc address when flag is: " << kStaticMem << ", tensor size is: " << node_size;
+    }
+    AnfAlgo::SetOutputAddr(address, output_idx, value_node.get());
+    if (!address->SyncHostToDevice(trans::GetRuntimePaddingShape(value_node, 0), tensor_size, tensor->data_type(),
+                                   tensor->data_c())) {
+      MS_EXCEPTION(NotExistsError) << "ValueNode SyncHostToDevice fail!" << value_node->DebugString()
+                                   << "node format is" << AnfAlgo::GetOutputFormat(value_node, output_idx)
+                                   << "node dtype is " << AnfAlgo::GetOutputInferDataType(value_node, output_idx);
+    }
   }
 }
 
@@ -615,7 +665,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
     }
     auto &node_value = value_node->value();
     MS_EXCEPTION_IF_NULL(node_value);
-    if (node_value->isa<tensor::Tensor>()) {
+    if (node_value->isa<tensor::Tensor>() || node_value->isa<ValueTuple>()) {
       AssignValueNodeTensor(value_node, node_value, 0);
     } else if (node_value->isa<StringImm>()) {
      auto value = GetValue<std::string>(node_value);
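RunOpAssignOutputNodeMemory is what lets the single-op graph write straight into the previously recorded buffers: each graph output inherits the device address of the matching cached tensor, and for a nop node the address is pinned on its real input (input(1)), since the nop forwards that buffer unchanged. A toy version of the sharing step, with hypothetical stand-in types:

// Address sharing between cached output tensors and graph output nodes.
#include <iostream>
#include <memory>
#include <vector>

struct DeviceAddress { int id; };
struct OutputNode {
  bool is_nop = false;
  std::shared_ptr<DeviceAddress> addr;        // address of the node itself
  std::shared_ptr<DeviceAddress> input_addr;  // address of input(1), used for nop nodes
};

void ShareOutputAddresses(const std::vector<std::shared_ptr<DeviceAddress>> &cached,
                          std::vector<OutputNode> *outputs) {
  // Assumes cached.size() == outputs->size(), as the real code enforces.
  for (size_t i = 0; i < outputs->size(); ++i) {
    auto &node = (*outputs)[i];
    if (node.is_nop) {
      node.input_addr = cached[i];  // nop forwards its input buffer unchanged
    } else {
      node.addr = cached[i];
    }
  }
}

int main() {
  std::vector<std::shared_ptr<DeviceAddress>> cached = {std::make_shared<DeviceAddress>(DeviceAddress{7})};
  std::vector<OutputNode> outs(1);
  outs[0].is_nop = true;
  ShareOutputAddresses(cached, &outs);
  std::cout << outs[0].input_addr->id << "\n";  // 7
}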
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h
index e56c80bca08062bc144331e40a96058c3a874d88..3405d6d5911433f3f9ecde7fa62f19535478887b 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -53,7 +53,8 @@ class KernelRuntime {
   virtual ~KernelRuntime();
   virtual bool Init() = 0;
   virtual void AssignMemory(session::KernelGraph *graph);
-  void RunOpAssignMemory(const std::vector<tensor::TensorPtr> &input_tensors, session::KernelGraph *graph);
+  void RunOpAssignMemory(const ValuePtr &pre_output_value, const std::vector<tensor::TensorPtr> &input_tensors,
+                         session::KernelGraph *graph);
   void RunOpClearMemory(const session::KernelGraph *graph);
   bool DumpDataEnabled();
   bool DumpDataEnabledIteration();
@@ -108,6 +109,7 @@ class KernelRuntime {
   void RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> &input_tensors, const session::KernelGraph *graph);
   void RunOpAssignOutputMemory(const AnfNodePtr &kernel);
   void RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel);
+  void RunOpAssignOutputNodeMemory(const ValuePtr &pre_output_value, session::KernelGraph *graph);
   void AssignValueNodeTensor(const ValueNodePtr &value_node, const ValuePtr &node_value, size_t output_idx);
   DeviceAddressPtr PreAssignCNodeMemory(const AnfNodePtr &anf_node, size_t index);
diff --git a/mindspore/ccsrc/utils/convert_utils.cc b/mindspore/ccsrc/utils/convert_utils.cc
index b1847d1df53aeb1348a46a35f12568a41d278e1f..140bcdcfb1d70c5d3fa2382a7077fe8c3a9b288d 100644
--- a/mindspore/ccsrc/utils/convert_utils.cc
+++ b/mindspore/ccsrc/utils/convert_utils.cc
@@ -607,4 +607,25 @@ tensor::TensorPtr ScalarToTensor(const ScalarPtr &scalar) {
   MS_EXCEPTION_IF_NULL(tensor);
   return tensor;
 }
+
+void TensorValueToTensor(const ValuePtr &value, std::vector<tensor::TensorPtr> *tensors) {
+  MS_EXCEPTION_IF_NULL(value);
+  MS_EXCEPTION_IF_NULL(tensors);
+  if (value->isa<ValueTuple>()) {
+    auto value_tuple = value->cast<ValueTuplePtr>();
+    MS_EXCEPTION_IF_NULL(value_tuple);
+    for (size_t i = 0; i < value_tuple->size(); ++i) {
+      ValuePtr element = value_tuple->value()[i];
+      if (element->isa<tensor::Tensor>()) {
+        auto tensor = element->cast<tensor::TensorPtr>();
+        MS_EXCEPTION_IF_NULL(tensor);
+        tensors->push_back(tensor);
+      }
+    }
+  } else if (value->isa<tensor::Tensor>()) {
+    tensor::TensorPtr tensor = value->cast<tensor::TensorPtr>();
+    MS_EXCEPTION_IF_NULL(tensor);
+    tensors->push_back(tensor);
+  }
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/utils/convert_utils.h b/mindspore/ccsrc/utils/convert_utils.h
index d4ecbf4408dc1f2b9fece905a24c5353cf40961e..55953bb80ca26f1a3db7444c6c47f988f335a741 100644
--- a/mindspore/ccsrc/utils/convert_utils.h
+++ b/mindspore/ccsrc/utils/convert_utils.h
@@ -21,6 +21,7 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <vector>
 #include <unordered_map>
 #include <utility>
 
@@ -69,6 +70,8 @@ using NodeMapEquiv = std::unordered_map<AnfNodePtr, AnfNodePtr>;
 bool Isomorphic(FuncGraphPtr g1, FuncGraphPtr g2, FuncGraphPairMapEquiv *equiv_func_graph, NodeMapEquiv *equiv_node);
 
 tensor::TensorPtr ScalarToTensor(const ScalarPtr &scalar);
+
+void TensorValueToTensor(const ValuePtr &value, std::vector<tensor::TensorPtr> *tensors);
 }  // namespace mindspore
 
 #endif  // MINDSPORE_CCSRC_UTILS_CONVERT_UTILS_H_
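TensorValueToTensor gives every caller above one flattening rule: a lone tensor yields itself, a value tuple yields its tensor elements in order, and any other value yields nothing. A sketch of the same contract over a simplified variant type:

// Flattening contract of TensorValueToTensor over stand-in types.
#include <iostream>
#include <memory>
#include <variant>
#include <vector>

struct Tensor { int id; };
using TensorPtr = std::shared_ptr<Tensor>;
using Value = std::variant<TensorPtr, std::vector<TensorPtr>, int>;  // tensor | tuple | scalar

void TensorValueToTensor(const Value &value, std::vector<TensorPtr> *tensors) {
  if (auto t = std::get_if<TensorPtr>(&value)) {
    tensors->push_back(*t);
  } else if (auto tuple = std::get_if<std::vector<TensorPtr>>(&value)) {
    for (const auto &element : *tuple) tensors->push_back(element);
  }  // scalars and other values contribute nothing
}

int main() {
  std::vector<TensorPtr> out;
  TensorValueToTensor(Value{std::vector<TensorPtr>{std::make_shared<Tensor>(Tensor{1}),
                                                   std::make_shared<Tensor>(Tensor{2})}}, &out);
  TensorValueToTensor(Value{42}, &out);  // scalar: ignored
  std::cout << out.size() << "\n";  // 2
}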
diff --git a/mindspore/core/ir/anf.h b/mindspore/core/ir/anf.h
index 092e70e828ec196386fd936a2241e23e9324e826..c3bf85fba5f3861385dbe6e7e2eec0d9867120ab 100644
--- a/mindspore/core/ir/anf.h
+++ b/mindspore/core/ir/anf.h
@@ -50,8 +50,13 @@ using BaseShapePtr = std::shared_ptr<BaseShape>;
 using AbstractBasePtr = std::shared_ptr<AbstractBase>;
 using AbstractBasePtrList = std::vector<AbstractBasePtr>;
 
+class Value;
+using ValuePtr = std::shared_ptr<Value>;
+using ValuePtrList = std::vector<ValuePtr>;
+
 class ValueNode;
 using ValueNodePtr = std::shared_ptr<ValueNode>;
+
 class CNode;
 using CNodePtr = std::shared_ptr<CNode>;
 
@@ -225,6 +230,9 @@ class CNode : public AnfNode {
   void set_input(size_t i, const AnfNodePtr &input);
   void set_inputs(const std::vector<AnfNodePtr> &inputs) { inputs_ = inputs; }
 
+  void set_forward(const ValuePtr &forward) { forward_ = forward; }
+  const ValuePtr &forward() const { return forward_; }
+
   bool stop_gradient() const { return stop_gradient_; }
   void set_stop_gradient(bool stop_gradient) { stop_gradient_ = stop_gradient; }
 
@@ -243,6 +251,7 @@ class CNode : public AnfNode {
   VarPtr func_graph_as_var_;
   bool stop_gradient_;
   bool in_forward_flag_ = false;
+  ValuePtr forward_ = nullptr;
 };
 
 // ANode represents the atomic node. Its derived classes are Parameter and ValueNode.
@@ -321,8 +330,6 @@ class Value : public Base {
  protected:
   TypePtr type_{nullptr};
 };
-using ValuePtr = std::shared_ptr<Value>;
-using ValuePtrList = std::vector<ValuePtr>;
 
 // ValueNode is used to hold value. Unlike CNode and Parameter, ValueNode
 // does not belong to any particular function graph.
@@ -333,9 +340,13 @@ class ValueNode : public ANode {
   MS_DECLARE_PARENT(ValueNode, ANode);
   void accept(AnfIrVisitor *v) override;
 
+  void set_value(const ValuePtr &value) { value_ = value; }
   const ValuePtr &value() const { return value_; }
   std::string fullname_with_scope() override;
 
+  void set_has_new_value(bool flag) { has_new_value_ = flag; }
+  bool has_new_value() const { return has_new_value_; }
+
   std::string ToString() const override;
   std::string DebugString(int recursive_level = 1) const override;
   std::string DebugString(bool recursive) const override { return DebugString(recursive ? 1 : 0); }
@@ -355,6 +366,7 @@ class ValueNode : public ANode {
 
  private:
   ValuePtr value_;
+  bool has_new_value_ = false;
 };
 
 template <typename T>
diff --git a/mindspore/core/ir/func_graph_cloner.cc b/mindspore/core/ir/func_graph_cloner.cc
index 4b116ff52792328187263b9f35e8c70adeb1f7df..a06710c485416ba5fd8f132fde8e4a7bc282ed60 100644
--- a/mindspore/core/ir/func_graph_cloner.cc
+++ b/mindspore/core/ir/func_graph_cloner.cc
@@ -88,6 +88,7 @@ void Cloner::CloneCNode(const AnfNodePtr &node, const FuncGraphPtr &target) {
   CNodePtr new_node = std::make_shared<CNode>(AnfNodePtrList{}, target);
   auto old_node = node->cast<CNodePtr>();
   new_node->set_abstract(old_node->abstract());
+  new_node->set_forward(old_node->forward());
   ScopePtr scope = (node->scope() != kDefaultScope) ? node->scope() : this->scope();
   new_node->set_scope(scope);
   new_node->set_kernel_info(old_node->kernel_info_ptr());
@@ -103,6 +104,7 @@ void Cloner::CloneValueNode(const AnfNodePtr &node) {
   ScopePtr scope = (node->scope() != kDefaultScope) ? node->scope() : this->scope();
   new_const->set_scope(scope);
   new_const->set_abstract(node->abstract());
+  new_const->set_has_new_value(node->cast<ValueNodePtr>()->has_new_value());
   repl_node_[node] = new_const;
   TraceManager::EndTrace();
 }
@@ -115,6 +117,7 @@ void Cloner::CloneValueNode(const AnfNodePtr &node, const FuncGraphPtr &target) {
   ScopePtr scope = (node->scope() != kDefaultScope) ? node->scope() : this->scope();
   new_const->set_scope(scope);
   new_const->set_abstract(node->abstract());
+  new_const->set_has_new_value(node->cast<ValueNodePtr>()->has_new_value());
   repl_node_[node] = new_const;
   TraceManager::EndTrace();
 }
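The anf.h reshuffle exists only so CNode can own a ValuePtr member: the Value forward declaration and its pointer aliases move above CNode, since a shared_ptr member needs no more than an incomplete type. A compilable illustration of that ordering:

// Forward-declaration ordering that lets CNode hold a ValuePtr member.
#include <memory>

class Value;                              // forward declaration is enough...
using ValuePtr = std::shared_ptr<Value>;  // ...to form smart-pointer aliases

class CNode {
 public:
  void set_forward(const ValuePtr &forward) { forward_ = forward; }
  const ValuePtr &forward() const { return forward_; }

 private:
  ValuePtr forward_ = nullptr;  // shared_ptr to an incomplete type is fine here
};

class Value {};  // the full definition can come later in the header

int main() {
  CNode node;
  node.set_forward(std::make_shared<Value>());
  return node.forward() == nullptr ? 1 : 0;
}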
diff --git a/tests/ut/python/pynative_mode/test_high_order_grad.py b/tests/ut/python/pynative_mode/test_high_order_grad.py
index 97fe7c3b68aa21adfb8fc692c2057d6a63358111..71a7dda94d99eb3c11475e8c0ca826bd72863da9 100644
--- a/tests/ut/python/pynative_mode/test_high_order_grad.py
+++ b/tests/ut/python/pynative_mode/test_high_order_grad.py
@@ -19,7 +19,7 @@ from mindspore.ops.composite import grad, grad_all, grad_all_with_sens
 
 
 def setup_module(module):
-    context.set_context(mode=context.PYNATIVE_MODE)
+    context.set_context(mode=context.PYNATIVE_MODE, check_bprop=False)
 
 
 def single(x):
diff --git a/tests/vm_impl/vm_me.py b/tests/vm_impl/vm_me.py
index 7216ec613bd891fc3456618ad11e10374cb6ff28..85e2d822ff3a41cf6bf3d439f310cad455a4cefd 100644
--- a/tests/vm_impl/vm_me.py
+++ b/tests/vm_impl/vm_me.py
@@ -554,9 +554,7 @@ def softmax_cross_entropy_with_logits(logits, labels):
     sample_num = labels.shape[0]
     prob = softmax(logits)
     log_likelihood = -np.log(prob[range(sample_num)]) * labels
-    # loss = np.sum(log_likelihood)
-    loss = log_likelihood
-
+    loss = np.sum(log_likelihood)
    dx = prob.copy()
    dx[range(sample_num)] -= labels
    return loss, dx
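The vm_me.py change restores the scalar reduction that the commented-out line had disabled: with one-hot labels y, softmax probabilities p, and logits z, the reference VM now returns the unscaled sum, consistent with the gradient dx = p - y it already computed:

\[
L = -\sum_{i=1}^{N} \sum_{c} y_{ic} \log p_{ic}, \qquad
\frac{\partial L}{\partial z_{ic}} = p_{ic} - y_{ic}
\]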