diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h index ce0093cf34ef67b6b19832b16cbd6d8a3e84d0e1..f2ca5c32968ba9c6fe0a8db206adf442dac57020 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/transpose_gpu_kernel.h @@ -62,7 +62,7 @@ class TransposeGpuFwdKernel : public GpuKernel { MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output."; return false; } - auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); shape_size_ = input_shape.size(); if (shape_size_ > TRANSPOSE_MAX_DIMENSION) { MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.cc b/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.cc index cf7c1af9cca7520200747f933b4529ee7499d48f..6daade89ea9538e41c44b78641c5a55f4b60bfd5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.cc @@ -52,6 +52,8 @@ TypeId KernelBuildInfo::GetOutputDeviceType(size_t output_index) const { return outputs_device_type_[output_index]; } +const std::string &KernelBuildInfo::GetOriginDataFormat() const { return origin_data_format_; } + const std::vector &KernelBuildInfo::GetAllInputFormats() const { return inputs_format_; } const std::vector &KernelBuildInfo::GetAllOutputFormats() const { return outputs_format_; } @@ -132,6 +134,11 @@ void KernelBuildInfo::KernelBuildInfoBuilder::SetKernelType(const KernelType &ke kernel_build_info_->kernel_type_ = kernel_type; } +void KernelBuildInfo::KernelBuildInfoBuilder::SetOriginDataFormat(const std::string &origin_data_format) { + 
MS_EXCEPTION_IF_NULL(kernel_build_info_); + kernel_build_info_->origin_data_format_ = origin_data_format; +} + void KernelBuildInfo::KernelBuildInfoBuilder::SetInputsFormat(const std::vector &inputs_format) { MS_EXCEPTION_IF_NULL(kernel_build_info_); kernel_build_info_->inputs_format_ = inputs_format; diff --git a/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.h b/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.h index 71d9825d5a76303ae8d3b929edce54d6c6c45eec..845930f81593a966da6c653452a035ad1ae33db8 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.h +++ b/mindspore/ccsrc/backend/kernel_compiler/kernel_build_info.h @@ -38,6 +38,7 @@ class KernelBuildInfo { op_pattern_ = kCommonPattern; input_reshape_type_ = {}; output_reshape_type_ = {}; + origin_data_format_ = kOpFormat_DEFAULT; inputs_format_ = {}; outputs_format_ = {}; inputs_device_type_ = {}; @@ -64,6 +65,8 @@ class KernelBuildInfo { std::vector GetOutputReshapeType(size_t input_index) const; + const std::string &GetOriginDataFormat() const; + const std::vector &GetAllInputFormats() const; const std::vector &GetAllOutputFormats() const; @@ -97,6 +100,7 @@ class KernelBuildInfo { private: KernelType kernel_type_; + std::string origin_data_format_; std::vector inputs_format_; OpPattern op_pattern_; std::vector outputs_format_; @@ -135,6 +139,8 @@ class KernelBuildInfo::KernelBuildInfoBuilder { void SetKernelType(const KernelType &kernel_type); + void SetOriginDataFormat(const std::string &origin_data_format); + void SetInputsFormat(const std::vector &inputs_format); void SetOutputsFormat(const std::vector &outputs_format); diff --git a/mindspore/ccsrc/backend/optimizer/common/helper.cc b/mindspore/ccsrc/backend/optimizer/common/helper.cc index 5dd3f3b9dcf12ba778e643d8a7cfa58ece8f5913..6c2276f28bcea0f6dbd4cc07b6e94b7ec77c54dd 100644 --- a/mindspore/ccsrc/backend/optimizer/common/helper.cc +++ b/mindspore/ccsrc/backend/optimizer/common/helper.cc @@ -506,6 +506,45 @@ 
std::shared_ptr>> GetRealNodeUsedList(con return output_node_list; } +std::shared_ptr>> GetRealNodeUsedListByOutputIdx(const FuncGraphPtr &graph, + const AnfNodePtr &node, + size_t output_index) { + auto output_node_list = std::make_shared>>(); + MS_EXCEPTION_IF_NULL(graph); + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + auto iter = manager->node_users().find(node); + if (iter == manager->node_users().end()) { + MS_LOG(EXCEPTION) << "node has no output in manager"; + } + auto output_info_list = iter->second; + for (const auto &output_info : output_info_list) { + if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimControlDepend->name()) { + continue; + } + if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimDepend->name() && + output_info.second == kDependAttachNodeIndex) { + continue; + } + size_t used_output_index; + if (AnfAlgo::GetCNodeName(output_info.first) == prim::kPrimTupleGetItem->name()) { + used_output_index = AnfAlgo::GetTupleGetItemOutIndex(utils::cast(output_info.first)); + } else if (AnfAlgo::GetCNodeName(node) == prim::kPrimTupleGetItem->name()) { + used_output_index = output_index; + } else { + auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(output_info.first, output_info.second - 1); + if (kernel_with_index.first.get() != node.get()) { + MS_LOG(EXCEPTION) << "Get used node failed for op[" << AnfAlgo::GetCNodeName(node) << "]"; + } + used_output_index = kernel_with_index.second; + } + if (used_output_index == output_index) { + output_node_list->push_back(output_info); + } + } + return output_node_list; +} + bool IsUsedByOthers(const FuncGraphPtr &graph, const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(node); diff --git a/mindspore/ccsrc/backend/optimizer/common/helper.h b/mindspore/ccsrc/backend/optimizer/common/helper.h index f21637a199e1b243c9485a3019a1382650afb701..144f6e3e7753aa1389e902836bf14f3e3472d4ea 100644 --- a/mindspore/ccsrc/backend/optimizer/common/helper.h +++ 
b/mindspore/ccsrc/backend/optimizer/common/helper.h @@ -172,6 +172,10 @@ bool IsUsedByOthers(const FuncGraphPtr &graph, const AnfNodePtr &node); std::shared_ptr>> GetRealNodeUsedList(const FuncGraphPtr &graph, const AnfNodePtr &node); +std::shared_ptr>> GetRealNodeUsedListByOutputIdx(const FuncGraphPtr &graph, + const AnfNodePtr &node, + size_t output_index); + void ConstInputToAttr(const CNodePtr &cnode, const std::unordered_set &input_attrs); bool AnfEqual(const BaseRef &a, const BaseRef &b); diff --git a/mindspore/ccsrc/backend/optimizer/gpu/insert_format_transform_op.cc b/mindspore/ccsrc/backend/optimizer/gpu/insert_format_transform_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1ab13bbd02ed7b00cef53d1479226717e1fd8aa --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/insert_format_transform_op.cc @@ -0,0 +1,151 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "backend/optimizer/gpu/insert_format_transform_op.h" +#include +#include +#include +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" +#include "runtime/device/gpu/kernel_info_setter.h" + +namespace mindspore { +namespace opt { +namespace { +std::vector TransposeAxis(const std::string &src_format, const std::string &dst_format) { + if ((src_format == kOpFormat_NCHW) && (dst_format == kOpFormat_NHWC)) { + return {0, 2, 3, 1}; + } else if ((src_format == kOpFormat_NHWC) && (dst_format == kOpFormat_NCHW)) { + return {0, 3, 1, 2}; + } else { + MS_LOG(EXCEPTION) << "Invalid format transform, from " << src_format << " to " << dst_format; + } +} + +void SetTransposeOpBuildInfo(const std::string &input_format, const std::string &output_format, + const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + auto input_type = AnfAlgo::GetPrevNodeOutputInferDataType(node, 0); + auto output_type = AnfAlgo::GetOutputInferDataType(node, 0); + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + builder.SetInputsFormat({input_format}); + builder.SetInputsDeviceType({input_type}); + builder.SetOutputsFormat({output_format}); + builder.SetOutputsDeviceType({output_type}); + builder.SetKernelType(UNKNOWN_KERNEL_TYPE); + builder.SetProcessor(kernel::Processor::CUDA); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), node.get()); +} + +// Insert transpose op between node and used_node whose position is used_node_index. +CNodePtr InsertTransposeOp(const FuncGraphPtr &graph, const AnfNodePtr &node, const AnfNodePtr &used_node, + int used_node_index, const std::vector &transpose_perm) { + MS_EXCEPTION_IF_NULL(graph); + // 1.Create a transpose node. + auto transpose_prim = std::make_shared(prim::kPrimTranspose->name()); + MS_EXCEPTION_IF_NULL(transpose_prim); + // 2.Set the input of transpose.
+ std::vector transpose_input = {NewValueNode(transpose_prim), node}; + auto transpose_op = graph->NewCNode(transpose_input); + // 3.Set the output info of transpose. + auto transpose_type = {AnfAlgo::GetPrevNodeOutputInferDataType(used_node, used_node_index)}; + auto transpose_shape = {AnfAlgo::GetPrevNodeOutputInferShape(used_node, used_node_index)}; + AnfAlgo::SetOutputInferTypeAndShape(transpose_type, transpose_shape, transpose_op.get()); + AnfAlgo::SetNodeAttr(kAttrPerm, MakeValue(transpose_perm), transpose_op); + // 4.Set the input of used_node. + MS_LOG(DEBUG) << "Node: " << node->fullname_with_scope() << ", used node: " << used_node->fullname_with_scope() + << ", index: " << used_node_index; + AnfAlgo::SetNodeInput(utils::cast(used_node), transpose_op, used_node_index); + // 5. Update the manager info of transpose op. + FuncGraphManagerPtr manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + manager->Clear(); + manager->AddFuncGraph(graph); + return transpose_op; +} +} // namespace + +const AnfNodePtr InsertFormatTransformOp::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &equiv) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(equiv); + if (!AnfAlgo::IsRealCNodeKernel(node)) { + return nullptr; + } + auto iter = device::gpu::kKernelFormatPositionMap.find(AnfAlgo::GetCNodeName(node)); + if (iter == device::gpu::kKernelFormatPositionMap.end()) { + return nullptr; + } + auto origin_data_format = AnfAlgo::GetOriginDataFormat(node); + if (origin_data_format == kOpFormat_DEFAULT) { + origin_data_format = kOpFormat_NCHW; + } + MS_LOG(DEBUG) << "Process node: " << node->fullname_with_scope(); + // Insert input transpose from origin_data_format to input_format. 
+ auto inputs_format = AnfAlgo::GetAllInputFormats(node); + for (size_t i = 0; i < inputs_format.size(); i++) { + if ((inputs_format[i] != kOpFormat_DEFAULT) && (inputs_format[i] != origin_data_format)) { + auto input_node = AnfAlgo::GetInputNode(utils::cast(node), i); + MS_EXCEPTION_IF_NULL(input_node); + auto transpose_perm = TransposeAxis(origin_data_format, inputs_format[i]); + auto transpose_op = InsertTransposeOp(graph, input_node, node, i, transpose_perm); + SetTransposeOpBuildInfo(kOpFormat_DEFAULT, inputs_format[i], transpose_op); + } + } + + // Insert output transpose from output_format to origin_data_format. + auto outputs_format = AnfAlgo::GetAllOutputFormats(node); + for (size_t i = 0; i < outputs_format.size(); i++) { + if ((outputs_format[i] != kOpFormat_DEFAULT) && (outputs_format[i] != origin_data_format)) { + // Find all nodes connected with node output, and change their inputs to transpose. + auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, i); + for (size_t j = 0; j < used_node_list->size(); j++) { + auto used_node = used_node_list->at(j).first; + auto used_node_index = used_node_list->at(j).second - 1; + auto transpose_perm = TransposeAxis(outputs_format[i], origin_data_format); + if (AnfAlgo::GetCNodeName(used_node) == prim::kPrimTupleGetItem->name()) { + MS_LOG(DEBUG) << "The used node of [" << node->fullname_with_scope() << "] is tuple item."; + // The tuple item need get next used nodes again. 
+ ProcessForTupleItem(graph, used_node, used_node_index, transpose_perm, outputs_format[i]); + continue; + } + auto transpose_op = InsertTransposeOp(graph, node, used_node, used_node_index, transpose_perm); + SetTransposeOpBuildInfo(outputs_format[i], kOpFormat_DEFAULT, transpose_op); + } + } + } + return node; +} + +void InsertFormatTransformOp::ProcessForTupleItem(const FuncGraphPtr &graph, const AnfNodePtr &node, int node_index, + const std::vector &transpose_perm, + const std::string &transpose_format) const { + auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, node_index); + for (size_t i = 0; i < used_node_list->size(); i++) { + auto used_node = used_node_list->at(i).first; + auto used_node_index = used_node_list->at(i).second - 1; + if (AnfAlgo::GetCNodeName(used_node) == prim::kPrimTupleGetItem->name()) { + MS_LOG(EXCEPTION) << "The used node of tuple item can't be tuple item."; + } + auto transpose_op = InsertTransposeOp(graph, node, used_node, used_node_index, transpose_perm); + SetTransposeOpBuildInfo(transpose_format, kOpFormat_DEFAULT, transpose_op); + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/insert_format_transform_op.h b/mindspore/ccsrc/backend/optimizer/gpu/insert_format_transform_op.h new file mode 100644 index 0000000000000000000000000000000000000000..648eda87e20aa86de81d8192cf530806893a6d90 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/insert_format_transform_op.h @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_INSERT_FORMAT_TRANSFORM_OP_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_INSERT_FORMAT_TRANSFORM_OP_H_ + +#include +#include +#include +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class InsertFormatTransformOp : public PatternProcessPass { + public: + explicit InsertFormatTransformOp(bool multigraph = true) + : PatternProcessPass("insert_format_transform_op", multigraph) {} + ~InsertFormatTransformOp() override = default; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + void ProcessForTupleItem(const FuncGraphPtr &graph, const AnfNodePtr &node, int node_index, + const std::vector &transpose_perm, const std::string &transpose_format) const; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_INSERT_FORMAT_TRANSFORM_OP_H_ diff --git a/mindspore/ccsrc/backend/optimizer/gpu/remove_format_transform_pair.cc b/mindspore/ccsrc/backend/optimizer/gpu/remove_format_transform_pair.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b4779a2aa5342504d8fa2b3fa7f0eae582dd0a8 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/remove_format_transform_pair.cc @@ -0,0 +1,65 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/optimizer/gpu/remove_format_transform_pair.h" +#include +#include +#include +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" + +namespace mindspore { +namespace opt { +const BaseRef RemoveFormatTransformPair::DefinePattern() const { + VarPtr X = std::make_shared(); + MS_EXCEPTION_IF_NULL(X); + VectorRef transpose1 = VectorRef({prim::kPrimTranspose, X}); + VectorRef transpose2 = VectorRef({prim::kPrimTranspose, transpose1}); + return transpose2; +} + +const AnfNodePtr RemoveFormatTransformPair::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &equiv) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(equiv); + MS_LOG(DEBUG) << "Process node:" << node->fullname_with_scope(); + auto input_node = AnfAlgo::GetInputNode(utils::cast(node), 0); + MS_EXCEPTION_IF_NULL(input_node); + if (AnfAlgo::GetCNodeName(node) != prim::kPrimTranspose->name() || + AnfAlgo::GetCNodeName(input_node) != prim::kPrimTranspose->name()) { + MS_LOG(EXCEPTION) << "The pattern is not transpose pair, " + << "node:" << AnfAlgo::GetCNodeName(node) << " node input:" << AnfAlgo::GetCNodeName(input_node); + } + // If transpose operator used by more than one other operators, it can not be deleted directly.
+ if (IsUsedByOthers(graph, input_node)) { + MS_LOG(DEBUG) << "The transpose node [" << input_node->fullname_with_scope() + << "] is used by more than one other operators."; + return nullptr; + } + auto transpose1_input_shape = AnfAlgo::GetInputDeviceShape(input_node, 0); + auto transpose2_output_shape = AnfAlgo::GetOutputDeviceShape(node, 0); + if (transpose2_output_shape == transpose1_input_shape) { + auto transpose1_input_node = AnfAlgo::GetInputNode(utils::cast(input_node), 0); + MS_EXCEPTION_IF_NULL(transpose1_input_node); + return transpose1_input_node; + } + return nullptr; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/remove_format_transform_pair.h b/mindspore/ccsrc/backend/optimizer/gpu/remove_format_transform_pair.h new file mode 100644 index 0000000000000000000000000000000000000000..3ca197b237d8b48b504357860616e5f1c928b6c7 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/remove_format_transform_pair.h @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REMOVE_FORMAT_TRANSFORM_PAIR_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REMOVE_FORMAT_TRANSFORM_PAIR_H_ + +#include +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class RemoveFormatTransformPair : public PatternProcessPass { + public: + explicit RemoveFormatTransformPair(bool multigraph = true) + : PatternProcessPass("remove_format_transform_pair", multigraph) {} + ~RemoveFormatTransformPair() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REMOVE_FORMAT_TRANSFORM_PAIR_H_ diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc index 799754dad3afaaa880cd7aa59e5e33b8a9d720d9..b70286ae4243a532f537e0d83bf930d6c75cc6b1 100644 --- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc +++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.cc @@ -353,6 +353,48 @@ size_t AnfRuntimeAlgorithm::GetOutputTensorNum(const AnfNodePtr &node) { } } +std::vector AnfRuntimeAlgorithm::GetAllOutputFormats(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + if (!AnfAlgo::IsRealKernel(node)) { + MS_LOG(EXCEPTION) << "Not real kernel:" + << "#node [" << node->DebugString() << "]"; + } + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + auto build_info = kernel_info->select_kernel_build_info(); + MS_EXCEPTION_IF_NULL(build_info); + auto format = build_info->GetAllOutputFormats(); + return format; +} + +std::vector AnfRuntimeAlgorithm::GetAllInputFormats(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + if (!AnfAlgo::IsRealKernel(node)) { + MS_LOG(EXCEPTION) << "Not real kernel:" + << "#node [" << node->DebugString() << "]"; + } + 
auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + auto build_info = kernel_info->select_kernel_build_info(); + MS_EXCEPTION_IF_NULL(build_info); + auto format = build_info->GetAllInputFormats(); + return format; +} + +std::string AnfRuntimeAlgorithm::GetOriginDataFormat(const AnfNodePtr &node) { + MS_EXCEPTION_IF_NULL(node); + if (!AnfAlgo::IsRealKernel(node)) { + MS_LOG(EXCEPTION) << "Not real kernel:" + << "#node [" << node->DebugString() << "]"; + } + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + auto build_info = kernel_info->select_kernel_build_info(); + MS_EXCEPTION_IF_NULL(build_info); + auto format = build_info->GetOriginDataFormat(); + return format; +} + std::string AnfRuntimeAlgorithm::GetOutputFormat(const AnfNodePtr &node, size_t output_idx) { MS_EXCEPTION_IF_NULL(node); if (output_idx > GetOutputTensorNum(node)) { @@ -829,7 +871,7 @@ void AnfRuntimeAlgorithm::SetKernelMod(const KernelModPtr &kernel_mod, AnfNode * bool AnfRuntimeAlgorithm::IsRealKernel(const AnfNodePtr &node) { MS_EXCEPTION_IF_NULL(node); - // parameter and value node is not a real kernel too + // parameter and value node is a real kernel too if (!node->isa()) { return true; } diff --git a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h index 9e884994c52fde5a18d81aea8ae71d6f5cbf97c0..191ab887d96dfc9e256ced0879cb7f060bedf9f6 100644 --- a/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h +++ b/mindspore/ccsrc/backend/session/anf_runtime_algorithm.h @@ -101,6 +101,12 @@ class AnfRuntimeAlgorithm { static size_t GetInputTensorNum(const AnfNodePtr &node); // get the num of output real_kernel(which can be build and run in device) static size_t GetOutputTensorNum(const AnfNodePtr &node); + // get all outputs format select of anf node + static std::vector GetAllOutputFormats(const AnfNodePtr &node); + // get all inputs format select 
of anf node + static std::vector GetAllInputFormats(const AnfNodePtr &node); + // get origin data format select of anf node + static std::string GetOriginDataFormat(const AnfNodePtr &node); // get output format select of anf node static std::string GetOutputFormat(const AnfNodePtr &node, size_t output_idx); // get input format select of anf node diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 042b7d1699db2bd94da98484918e11b69ae6a033..fc8b2dcfdfbb0e3ecd5324df1fc1e32a7a83187c 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -30,6 +30,8 @@ #include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h" #include "backend/optimizer/gpu/replace_momentum_cast_fusion.h" #include "backend/optimizer/gpu/replace_addn_fusion.h" +#include "backend/optimizer/gpu/insert_format_transform_op.h" +#include "backend/optimizer/gpu/remove_format_transform_pair.h" #include "runtime/device/kernel_runtime_manager.h" #include "utils/ms_utils.h" #include "common/trans.h" @@ -76,6 +78,8 @@ void GPUSession::Optimize(const std::shared_ptr &kernel_graph) { void GPUSession::HardwareOptimize(const std::shared_ptr &kernel_graph) { auto optimizer = std::make_shared(); auto pm = std::make_shared(); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); pm->AddPass(std::make_shared()); pm->AddPass(std::make_shared()); optimizer->AddPassManager(pm); @@ -203,7 +207,7 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList } // Assign CUDA streams AssignStream(graph); - // Hide NoOp from execution graph + // Hide NopOp from execution graph opt::HideNopNode(graph.get()); // Build kernel if node is cnode BuildKernel(graph); @@ -213,7 +217,7 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList graph->set_execution_order(execution_order); // Get summary nodes. 
SetSummaryNodes(graph.get()); - // Remove NoOp from execution graph + // Remove NopOp from execution graph opt::RemoveNopNode(graph.get()); // Set graph manager. MS_EXCEPTION_IF_NULL(context_); @@ -272,7 +276,7 @@ void GPUSession::BuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_in MS_EXCEPTION_IF_NULL(kernel_graph); SelectKernel(kernel_graph); StartKernelRT(); - // Hide NoOp from execution graph + // Hide NopOp from execution graph opt::HideNopNode(kernel_graph.get()); BuildKernel(kernel_graph); run_op_graphs_[graph_info] = kernel_graph; @@ -282,7 +286,7 @@ py::tuple GPUSession::RunOp(const OpRunInfo &op_run_info, const GraphInfo &graph const std::vector &input_tensors) { auto kernel_graph = run_op_graphs_[graph_info]; MS_EXCEPTION_IF_NULL(kernel_graph); - // Remove NoOp from execution graph + // Remove NopOp from execution graph opt::RemoveNopNode(kernel_graph.get()); RunOpAllocateMemory(op_run_info.value, input_tensors, kernel_graph.get()); // Execute the computation diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc index 0a162ed7994aaf0ea42419333923143be3959700..86a8f5eb90cac392c4a2cdb3b0e41ae3e6a15ad1 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc @@ -252,7 +252,7 @@ Status DeviceQueueOp::RetryPushGPUData(const std::vector &data_size, con return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "invalid input Data, please check it."); } else { if (!stop_send_) { - MS_LOG(WARNING) << "Retry pushing data..."; + MS_LOG(DEBUG) << "Retry pushing data..."; continue; } break; diff --git a/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc b/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc index 5cb97ac1e7b6e47d7790df28acdda102c1377272..a4bb29731aa4284cc9653e63a4445807aaac6bd7 100644 --- 
a/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc +++ b/mindspore/ccsrc/runtime/device/gpu/blocking_queue.cc @@ -96,10 +96,10 @@ BlockQueueStatus_T BlockingQueue::Create(void *addr, const std::vector & void BlockingQueue::RegisterRelease(const std::function &func) { queue_->RegisterRelease(func); } -BlockQueueStatus_T BlockingQueue::Push(const std::vector &data, unsigned int timeout_in_sec) { +BlockQueueStatus_T BlockingQueue::Push(const std::vector &data, unsigned int) { std::unique_lock locker(mutex_); if (queue_->IsFull()) { - if (not_full_cond_.wait_for(locker, std::chrono::seconds(timeout_in_sec)) == std::cv_status::timeout) { + if (not_full_cond_.wait_for(locker, std::chrono::microseconds(100)) == std::cv_status::timeout) { return TIMEOUT; } } diff --git a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc index 00bae0f6c0a1870190da00f2f76a5e59223ba2c9..3fb2d89d1929374416acc91f6fae8776394c65a6 100644 --- a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc +++ b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc @@ -19,6 +19,7 @@ #include #include "backend/kernel_compiler/kernel.h" #include "utils/utils.h" +#include "utils/ms_context.h" #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" #include "backend/kernel_compiler/kernel_build_info.h" #include "backend/session/anf_runtime_algorithm.h" @@ -157,25 +158,87 @@ void SetTensorDeviceInfo(const kernel::KernelBuildInfo &selected_kernel_info, co } } } + +bool IsNeedProcessFormatInfo(const CNodePtr &kernel_node, const std::vector &inputs_type) { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->execution_mode() == kPynativeMode) { + return false; + } + if (!AnfAlgo::IsRealCNodeKernel(kernel_node)) { + return false; + } + auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); + auto iter = kKernelFormatPositionMap.find(kernel_name); + if (iter == 
kKernelFormatPositionMap.end()) { + return false; + } + if (inputs_type.size() == 0) { + return false; + } + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + if (input_shape.size() != 4) { + return false; + } + return true; +} + +void UpdateKernelFormatInfo(const CNodePtr &kernel_node, const std::vector &inputs_type, + std::vector *inputs_format, std::vector *outputs_format, + std::string *origin_data_format) { + auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); + auto iter = kKernelFormatPositionMap.find(kernel_name); + if (iter == kKernelFormatPositionMap.end()) { + return; + } + auto cal_format = (inputs_type[0] == kNumberTypeFloat16) ? kOpFormat_NHWC : kOpFormat_NCHW; + MS_LOG(DEBUG) << "Kernel node: " << kernel_node->fullname_with_scope() << ", format: " << cal_format; + auto inputs_format_position = iter->second.first; + for (const auto &input_format_position : inputs_format_position) { + if (input_format_position >= inputs_format->size()) { + MS_LOG(EXCEPTION) << "The position [" << input_format_position << "] is out of range of the input size [" + << inputs_format->size() << "] #kernel_node [" << kernel_node->fullname_with_scope() << "]"; + } + (*inputs_format)[input_format_position] = cal_format; + } + auto outputs_format_position = iter->second.second; + for (const auto &output_format_position : outputs_format_position) { + if (output_format_position >= outputs_format->size()) { + MS_LOG(EXCEPTION) << "The position [" << output_format_position << "] is out of range of the output size [" + << outputs_format->size() << "] #kernel_node [" << kernel_node->fullname_with_scope() << "]"; + } + (*outputs_format)[output_format_position] = cal_format; + } + auto prim = AnfAlgo::GetCNodePrimitive(kernel_node); + MS_EXCEPTION_IF_NULL(prim); + if (prim->HasAttr("data_format")) { + *origin_data_format = AnfAlgo::GetNodeAttr(kernel_node, "data_format"); + } +} } // namespace void SetKernelInfo(const CNodePtr &kernel_node) { std::vector
inputs_format; std::vector inputs_type; - std::shared_ptr builder = - std::make_shared(); for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); ++input_index) { inputs_format.emplace_back(kOpFormat_DEFAULT); inputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index)); } - builder->SetInputsFormat(inputs_format); - builder->SetInputsDeviceType(inputs_type); std::vector outputs_format; std::vector outputs_type; for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(kernel_node); ++output_index) { outputs_format.emplace_back(kOpFormat_DEFAULT); outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel_node, output_index)); } + std::string origin_data_format = kOpFormat_DEFAULT; + if (IsNeedProcessFormatInfo(kernel_node, inputs_type)) { + UpdateKernelFormatInfo(kernel_node, inputs_type, &inputs_format, &outputs_format, &origin_data_format); + } + std::shared_ptr builder = + std::make_shared(); + builder->SetOriginDataFormat(origin_data_format); + builder->SetInputsFormat(inputs_format); + builder->SetInputsDeviceType(inputs_type); builder->SetOutputsFormat(outputs_format); builder->SetOutputsDeviceType(outputs_type); diff --git a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h index 00a2851f02e884b046b25f88de8ba2e590cb5226..c78fed7b9e168ee9d46c80e377640c4a052ef6c7 100644 --- a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h +++ b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.h @@ -20,13 +20,35 @@ #include #include #include +#include #include "ir/anf.h" #include "ir/dtype.h" #include "utils/utils.h" +#include "frontend/operator/ops.h" namespace mindspore { namespace device { namespace gpu { +// map, used for getting the insert position of format transform. 
+static std::map, std::vector>> kKernelFormatPositionMap = { + {prim::kPrimConv2D->name(), {{0, 1}, {0}}}, + {prim::kPrimConv2DBackpropInput->name(), {{0, 1}, {0}}}, + {prim::kPrimConv2DBackpropFilter->name(), {{0, 1}, {0}}}, + {prim::kPrimRelu->name(), {{0}, {0}}}, + {prim::kPrimReluGrad->name(), {{0, 1}, {0}}}, + {prim::kPrimMaxPool->name(), {{0}, {0}}}, + {prim::kPrimMaxPoolGrad->name(), {{0, 1, 2}, {0}}}, + {kAvgPoolOpName, {{0}, {0}}}, + {kAvgPoolGradGpuOpName, {{0, 1, 2}, {0}}}, + {kTensorAddOpName, {{0, 1}, {0}}}, + {kFusedBatchNormEx, {{0}, {0}}}, + {kFusedBatchNormExWithActivation, {{0}, {0}}}, + {kFusedBatchNormExWithAddAndActivation, {{0, 5}, {0}}}, + {kFusedBatchNormGradEx, {{0, 1}, {0}}}, + {kFusedBatchNormGradExWithActivation, {{0, 1, 7}, {0}}}, + {kFusedBatchNormGradExWithAddAndActivation, {{0, 1, 7}, {0, 3}}}, +}; + void SetKernelInfo(const CNodePtr &apply_kernel_ptr); class KernelAttr { diff --git a/mindspore/ccsrc/utils/utils.h b/mindspore/ccsrc/utils/utils.h index 02d79aaac2cbe1b957f9511301ab0f0df4de785c..af1243e9fc66a6c9e8ea5e6ef10b191ffedb1e0d 100644 --- a/mindspore/ccsrc/utils/utils.h +++ b/mindspore/ccsrc/utils/utils.h @@ -189,6 +189,9 @@ constexpr auto kPullOpName = "Pull"; constexpr auto kEmbeddingLookupOpName = "EmbeddingLookup"; constexpr auto kEmbeddingLookupProxyOpName = "EmbeddingLookupProxy"; constexpr auto kPaddingOpName = "Padding"; +constexpr auto kAvgPoolOpName = "AvgPool"; +constexpr auto kAvgPoolGradGpuOpName = "AvgPoolGradGpu"; +constexpr auto kTensorAddOpName = "TensorAdd"; // attr key name constexpr auto kAttrInputNames = "input_names";