From eae2675dedf5368a9d78f0064b8efc9f4b6349fe Mon Sep 17 00:00:00 2001
From: caifubi
Date: Sat, 25 Jul 2020 17:23:45 +0800
Subject: [PATCH] Add Async Dump to tutorial

---
 .../customized_debugging_information.md | 48 ++++++++++++++++++
 .../customized_debugging_information.md | 49 +++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/tutorials/source_en/advanced_use/customized_debugging_information.md b/tutorials/source_en/advanced_use/customized_debugging_information.md
index b866f005..c59a0eb4 100644
--- a/tutorials/source_en/advanced_use/customized_debugging_information.md
+++ b/tutorials/source_en/advanced_use/customized_debugging_information.md
@@ -10,6 +10,7 @@
 - [Custom Callback](#custom-callback)
 - [MindSpore Metrics](#mindspore-metrics)
 - [MindSpore Print Operator](#mindspore-print-operator)
+ - [Asynchronous Data Dump](#asynchronous-data-dump)
 - [Log-related Environment Variables and Configurations](#log-related-environment-variables-and-configurations)

@@ -216,6 +217,53 @@
 val:[[1 1]
  [1 1]]
 ```

+## Asynchronous Data Dump
+
+When the training result on Ascend deviates from the expectation, the inputs and outputs of operators can be saved for debugging through Asynchronous Data Dump.
+
+1. Enable saving of the graph IR: `context.set_context(save_graphs=True)` (see the script sketch after this list).
+2. Execute the training script.
+3. Open `hwopt_d_end_graph_{graph id}.ir` in the directory where you executed the script and find the full scope names of the operators you want to dump.
+4. Configure the JSON file `data_dump.json`.
+
+    ```json
+    {
+        "DumpSettings": {
+            "net_name": "ResNet50",
+            "mode": 1,
+            "iteration": 0,
+            "kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
+        },
+
+        "DumpSettingsSpec": {
+            "net_name": "net name, e.g. ResNet50",
+            "mode": "0: dump all kernels, 1: dump only the kernels in the kernels list",
+            "iteration": "the iteration to dump",
+            "kernels": "full scope names of the operators to dump"
+        }
+    }
+    ```
+
+5. Set the environment variables.
+
+    ```bash
+    export ENABLE_DATA_DUMP=1
+    export DATA_DUMP_PATH=/test
+    export DATA_DUMP_CONFIG_PATH=data_dump.json
+    ```
+
+    > Set the environment variables before executing the training script; setting them while training is running has no effect.
+
+    > In the distributed scenario, the dump environment variables must be configured before `mindspore.communication.management.init` is called.
+
+6. Execute the training script again to perform the asynchronous data dump.
+7. Parse the dump file (see the loading example after this list).
+
+    After training, change to the `/var/log/npu/ide_daemon/dump/` directory and run the following command to parse a dump data file:
+
+    ```bash
+    python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/dump_data_conversion.pyc -type offline -target numpy -i ./{dump file path} -o ./{output file path}
+    ```
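+
+The following is a minimal sketch of a training script that matches steps 1–3 above. The toy network, the tensor shapes, and the single-device Ascend setup are illustrative assumptions rather than part of the dump feature itself; the dump environment variables from step 5 are expected to be exported in the shell before the script is launched.
+
+```python
+import numpy as np
+
+import mindspore.nn as nn
+from mindspore import Tensor, context
+from mindspore.ops import operations as P
+
+# Step 1: enable IR saving so that hwopt_d_end_graph_{graph id}.ir is generated.
+context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
+
+
+class Net(nn.Cell):
+    """Toy network; its TensorAdd kernel can be looked up in the saved IR (step 3)."""
+
+    def __init__(self):
+        super(Net, self).__init__()
+        self.add = P.TensorAdd()
+
+    def construct(self, x, y):
+        return self.add(x, y)
+
+
+if __name__ == "__main__":
+    x = Tensor(np.ones([2, 2]).astype(np.float32))
+    y = Tensor(np.ones([2, 2]).astype(np.float32))
+    print(Net()(x, y))
+```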
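+
+Step 7 converts the dump data to NumPy format. Below is a sketch of inspecting one converted file; the file name is hypothetical (it depends on the dumped operator), so replace it with an actual file found under the output path passed to `-o`.
+
+```python
+import numpy as np
+
+# Hypothetical file name: use a real file produced in the -o output directory.
+data = np.load("./output/Default_TensorAdd-op10_output_0.npy")
+print(data.shape, data.dtype)
+print(data)
+```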

 ## Log-related Environment Variables and Configurations

 MindSpore uses glog to output logs. The following environment variables are commonly used:

diff --git a/tutorials/source_zh_cn/advanced_use/customized_debugging_information.md b/tutorials/source_zh_cn/advanced_use/customized_debugging_information.md
index 7f41783c..d59e35de 100644
--- a/tutorials/source_zh_cn/advanced_use/customized_debugging_information.md
+++ b/tutorials/source_zh_cn/advanced_use/customized_debugging_information.md
@@ -9,6 +9,7 @@
 - [自定义Callback](#自定义callback)
 - [MindSpore metrics功能介绍](#mindspore-metrics功能介绍)
 - [print算子功能介绍](#print算子功能介绍)
+ - [异步数据Dump功能介绍](#异步数据dump功能介绍)
 - [日志相关的环境变量和配置](#日志相关的环境变量和配置)

@@ -216,6 +217,54 @@
 val:[[1 1]
  [1 1]]
 ```

+## 异步数据Dump功能介绍
+
+在Ascend环境上执行训练，当训练结果和预期有偏差时，可以通过异步数据Dump功能保存算子的输入输出进行调试。
+
+1. 开启IR保存开关：`context.set_context(save_graphs=True)`。
+2. 执行网络脚本。
+3. 查看执行目录下的`hwopt_d_end_graph_{graph id}.ir`，找到需要Dump的算子的完整名称。
+4. 配置Dump的json配置文件`data_dump.json`。
+
+    ```json
+    {
+        "DumpSettings": {
+            "net_name": "ResNet50",
+            "mode": 1,
+            "iteration": 0,
+            "kernels": ["Default/Conv2D-op2", "Default/TensorAdd-op10"]
+        },
+
+        "DumpSettingsSpec": {
+            "net_name": "net name, e.g. ResNet50",
+            "mode": "0: dump all kernels, 1: dump only the kernels in the kernels list",
+            "iteration": "the iteration to dump",
+            "kernels": "full scope names of the operators to dump"
+        }
+    }
+    ```
+
+5. 设置数据Dump的环境变量。
+
+    ```bash
+    export ENABLE_DATA_DUMP=1
+    export DATA_DUMP_PATH=/test
+    export DATA_DUMP_CONFIG_PATH=data_dump.json
+    ```
+
+    > 在网络脚本执行前设置好环境变量；网络脚本执行过程中设置将不会生效。
+
+    > 在分布式场景下，Dump环境变量需要在调用`mindspore.communication.management.init`之前配置。
+
+6. 再次执行用例进行异步数据Dump。
+7. 解析Dump文件。
+
+    执行完用例后进入`/var/log/npu/ide_daemon/dump/`目录，运行如下命令解析Dump数据：
+
+    ```bash
+    python /usr/local/Ascend/toolkit/tools/operator_cmp/compare/dump_data_conversion.pyc -type offline -target numpy -i ./{Dump出来的文件} -o ./{解析的文件路径}
+    ```

 ## 日志相关的环境变量和配置

 MindSpore采用glog来输出日志，常用的几个环境变量如下：
--
GitLab