From 6433fbb8f924d7b7da5d099e4434706f077cca5c Mon Sep 17 00:00:00 2001 From: ph Date: Thu, 18 Jun 2020 22:56:53 +0800 Subject: [PATCH] UI add profiling minddata page(2nd commit) --- mindinsight/ui/src/locales/zh-cn.json | 155 ++++-- .../ui/src/services/request-service.js | 79 +++ .../ui/src/views/train-manage/operator.vue | 7 +- .../train-manage/profiling-dashboard.vue | 497 +++++++++++++++++- .../ui/src/views/train-manage/profiling.vue | 188 ++++++- 5 files changed, 858 insertions(+), 68 deletions(-) diff --git a/mindinsight/ui/src/locales/zh-cn.json b/mindinsight/ui/src/locales/zh-cn.json index 3aef60e..8f7126d 100644 --- a/mindinsight/ui/src/locales/zh-cn.json +++ b/mindinsight/ui/src/locales/zh-cn.json @@ -9,7 +9,6 @@ "dataError": "获取到的数据异常", "regIllegal": "请输入正确的检索条件", "stayTuned": "敬请期待", - "select": "请选择", "search": "请搜索", "enter": "请输入", @@ -72,7 +71,7 @@ "unhide": "取消隐藏", "hideData": "条数据", "totalHide": "本页共隐藏{n}条数据", - "mustExist":"必选项", + "mustExist": "必选项", "remarkValidation": "备注为英文字母、数字、中文、下划线、中划线、点的组合,长度范围为[1,128]字符", "changeSuccess": "修改成功", "metricLabel": "Metric", @@ -142,7 +141,6 @@ "isDelete": "是否删除当前阈值", "noData": "无" }, - "images": { "titleText": "图像", "tagSelectTitle": "标签选择", @@ -232,43 +230,137 @@ "path": "路径", "number": "卡编号", "distribution": "分布图", - "queueEmptyRatio": "队列空比例", - "queueFullRatio": "队列满比例", + "queueEmptyRatio": "队列空比例:", + "queueFullRatio": "队列满比例:", "dataQueueDis": "数据队列分布图", "operatorTimeConAnalysis": "算子耗时分析", "timeConStastic": "耗时统计", - "avgCost": "平均总耗时", - "getCost": "平均取数据耗时", - "pushCost": "平均push耗时", + "avgCost": "平均总耗时:", + "getCost": "平均取数据耗时:", + "pushCost": "平均推送数据耗时:", "lterationGap": "迭代间隙", "lterationTail": "迭代拖尾", "propertion": "占比", + "minddataTitle": "数据准备详情", "title": "迭代间隙流程分析", "dataDeal": "数据处理", "dataQueue": "数据队列", "errorTip": "个step出现异常", - "pipelineError": "Pipeline异常step", - "deviceQueueError": "device_queue_op异常step", - "getNextError": "get_next异常step", - "smartHelper":"小助手", - 
"suggestions":"优化建议", - "stepSelect":"Step选择", - "curCard":"当前卡片", - "stepTrace":"迭代轨迹", - "mindData":"数据准备", - "timeLine":"时间线", - "rankOfOperator":"算子耗时统计排名", - "stepTraceDetail":"迭代轨迹详情", - "viewDetail":"查看详情", - "stepNum":"耗时step数", - "iterGapTimeLabel":"迭代间隙时长", - "iterGapRateLabel":"迭代间隙占比", - "fpBpTimeLabel":"FP+BP时长", - "fpBpRateLabel":"FP+BP占比", - "tailTimeLabel":"迭代拖尾时长", - "tailRateLabel":"迭代拖尾占比", + "smartHelper": "小助手", + "suggestions": "优化建议", + "common-profiler_tutorial": { + "desc": "如何使用Profiler进行性能分析", + "anchor": [ + "desc" + ], + "url": [ + "https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/visualization_tutorials.html" + ] + }, + "step_trace-proposer_type_label": { + "desc": "迭代轨迹性能优化参考建议" + }, + "step_trace-iter_interval": { + "desc": "在开启图模式和数据集下沉模式后,如果平均迭代间隙时间大于{n1}ms,则数据处理至计算图执行之间的流程可能存在优化空间。" + }, + "common-proposer_type_label": { + "desc": "性能分析与优化指导" + }, + "minddata_pipeline-proposer_type_label": { + "desc": "Minddata pipeline性能优化建议" + }, + "minddata_pipeline-general": { + "desc": "Pipeline中的算子{n1}可能存在性能瓶颈,请用户重点关注。" + }, + "minddata_pipeline-dataset_op": { + "desc": "对于算子{n1},用户可以尝试调整num_parallel_workers参数。" + }, + "minddata_pipeline-generator_op": { + "desc": "对于算子{n1},用户可以尝试调整num_parallel_workers参数或优化训练脚本,如果性能没有得到优化,可以尝试替换为MindRecordDataset算子。" + }, + "minddata_pipeline-map_op": { + "desc": "对于算子{n1},用户可以尝试调整num_parallel_workers参数,如果使用的是Python的算子,可以尝试优化训练脚本。" + }, + "minddata_pipeline-batch_op": { + "desc": "对于算子{n1},用户可以尝试增加prefetch_size大小。" + }, + "minddata_warning_op": { + "desc": "经过上述判断,算子{n1}可能存在优化空间。" + }, + "minddata-proposer_type_label": { + "desc": "数据处理性能分析" + }, + "minddata_device_queue": { + "desc": "主机侧队列为空比例{n1}/{n2},为满比例{n3}/{n4}。" + }, + "minddata_get_next_queue": { + "desc": "芯片侧队列为空比例{n1}/{n2}。" + }, + "millisecond": "ms", + "stepSelect": "Step选择", + "curCard": "当前卡片", + "stepTrace": "迭代轨迹", + "mindData": "数据准备", + "timeLine": "时间线", + "rankOfOperator": "算子耗时统计排名", + "stepTraceDetail": "迭代轨迹详情", + 
"viewDetail": "查看详情", + "stepNum": "耗时step数", + "iterGapTimeLabel": "迭代间隙时长", + "iterGapRateLabel": "迭代间隙占比", + "fpBpTimeLabel": "前向+反向时长", + "fpBpRateLabel": "前向+反向占比", + "tailTimeLabel": "迭代拖尾时长", + "tailRateLabel": "迭代拖尾占比", "operatorDetail": "算子详情", - "times": "次" + "times": "次", + "queueStep": "队列step分布图", + "queueInfo": "迭代间隙", + "pipeline": "数据处理", + "pipelineTopTitle": "算子间队列平均使用率", + "pipelineMiddleTitle": "算子间队列关系", + "deviceQueueOp": "数据发送", + "deviceQueueOpTip": "数据发送算子", + "getNext": "取数据算子", + "connectorQuene": "主机队列", + "getData": "数据获取", + "opTotalTime": "算子执行总时间:", + "streamNum": "执行流数量:", + "opNum": "算子数目:", + "opTimes": "算子执行总次数:", + "features": "功能介绍:", + "iterationInfo": "迭代轨迹展示的是每个step从上个迭代开始至该step结束的耗时信息,主体时间分为3部分:迭代间隙、前向+反向、迭代拖尾。", + "iterationGapInfo": "主要负责从数据队列中读取数据,如果该部分耗时较长,建议前往数据处理部分进一步分析;", + "fpbpTitle": "前向反向", + "fpbpInfo": "执行网络中的前向算子以及反向算子,承载了一个step主要的计算工作,如果该部分耗时较长,建议前往算子统计或时间线中进一步分析;", + "iterativeTailingTitle": "迭代拖尾", + "iterativeTailingInfo": "主要在多卡场景下执行参数聚合参数更新操作,如果该部分耗时较长,建议查看all_reduce耗时以及并行情况。", + "statistics": "统计信息:", + "totalTime": "总耗时:", + "totalSteps": "总step数:", + "fpbpTimeRatio": "前向+反向耗时占比:", + "iterationGapTimeRatio": "迭代间隙耗时占比:", + "iterativeTailingTimeRatio": "迭代拖尾耗时占比:", + "dataProcess": "该图展示了数据处理阶段的流程,数据通过数据处理阶段存入主机队列,再通过数据传输阶段存入芯片侧的数据队列,最终由数据传输算子get_next发送给前向训练使用。", + "dataProcessInfo": "综合分析该阶段的流程,通过判断主机队列和数据队列为空的情况就可以初步判断可能出现性能异常的阶段。", + "analysisOne": "1、如果迭代间隙较长,并且芯片侧的数据队列部分batch为空,那么可能由于数据处理和数据传输阶段导致的性能异常,参考2,反之则定位数据传输算子get_next内部问题;", + "analysisTwo": "2、如果通过1定位为数据处理、数据传输阶段异常,则查看主机队列情况,如果大概率为空,则可能为数据处理阶段导致异常,如果大概率不为空,则可能数据传输阶段异常;", + "higherAnalysis": "注:可结合下方算子耗时进行高阶分析", + "chipInfo": "芯片侧数据队列为空比例:", + "hostIsEmpty": "主机侧队列为空比例:", + "hostIsFull": "主机侧队列为满比例:", + "operatorInfo": "{msg1}、{msg2}算子信息", + "workersNum": "使用线程数", + "queueDeepChartTitle": "{msg}队列深度折线图", + "sampleInterval": "采样间隔", + "deep": "深度", + "queueTip1": "队列为满比例:", + "queueTip2": "队列为空比例:", + "totalCapacity": "总容量", + 
"averageCapacity": "平均使用容量", + "stepTraceMessage": "当前FP和BP为自动选点,如不合乎预期,请自行修改。", + "FPMessage": "FP起始算子:", + "BPMessage": "BP终止算子:" }, "components": { "summaryTitle": "训练选择", @@ -299,7 +391,6 @@ "50542215": "查询参数错误", "50542216": "Summary日志文件未找到", "50542217": "Summary日志路径错误", - "50542218": "筛选参数错误", - "50546102": "step_id取值错误" + "50542218": "筛选参数错误" } -} +} \ No newline at end of file diff --git a/mindinsight/ui/src/services/request-service.js b/mindinsight/ui/src/services/request-service.js index 53f4bbb..bbba920 100644 --- a/mindinsight/ui/src/services/request-service.js +++ b/mindinsight/ui/src/services/request-service.js @@ -176,6 +176,14 @@ export default { }, }); }, + // get data of helper + queryDataOfProfileHelper(params) { + return axios({ + method: 'get', + url: '/v1/mindinsight/profile/summary/propose', + params: params, + }); + }, // query training trace queryTrainingTrace(params) { return axios({ @@ -197,4 +205,75 @@ export default { }, }); }, + queryTimeline(params) { + return axios({ + method: 'get', + url: '/v1/mindinsight/profile/timeline', + params: params, + headers: { + ignoreError: true, + }, + }); + }, + queryTimlineInfo(params) { + return axios({ + method: 'get', + url: 'v1/mindinsight/profile/timeline-summary', + params: params, + headers: { + ignoreError: true, + }, + }); + }, + queryOpQueue(params) { + return axios({ + method: 'post', + url: 'v1/mindinsight/profile/minddata-pipeline/op-queue', + params: params.params, + data: params.body, + headers: { + ignoreError: true, + }, + }); + }, + queryQueue(params) { + return axios({ + method: 'get', + url: 'v1/mindinsight/profile/minddata-pipeline/queue', + params: params, + headers: { + ignoreError: true, + }, + }); + }, + queryProcessSummary(params) { + return axios({ + method: 'get', + url: 'v1/mindinsight/profile/process_summary', + params: params, + headers: { + ignoreError: true, + }, + }); + }, + queueInfo(params) { + return axios({ + method: 'get', + url: 
'v1/mindinsight/profile/queue_info', + params: params, + headers: { + ignoreError: true, + }, + }); + }, + minddataOp(params) { + return axios({ + method: 'get', + url: 'v1/mindinsight/profile/minddata_op', + params: params, + headers: { + ignoreError: true, + }, + }); + }, }; diff --git a/mindinsight/ui/src/views/train-manage/operator.vue b/mindinsight/ui/src/views/train-manage/operator.vue index 8431525..613c8b9 100644 --- a/mindinsight/ui/src/views/train-manage/operator.vue +++ b/mindinsight/ui/src/views/train-manage/operator.vue @@ -145,14 +145,13 @@ + class="cpu-tab" + name="cpu">
- {{ $t('operator.operatorTypeStatistics') }} + {{ $t('operator.operatorStatistics') }}
diff --git a/mindinsight/ui/src/views/train-manage/profiling-dashboard.vue b/mindinsight/ui/src/views/train-manage/profiling-dashboard.vue index 616e010..ca2d8fc 100644 --- a/mindinsight/ui/src/views/train-manage/profiling-dashboard.vue +++ b/mindinsight/ui/src/views/train-manage/profiling-dashboard.vue @@ -25,11 +25,33 @@ limitations under the License. :class="{disabled:svg.noData && svg.data.length === 0}">{{ $t('profiling.viewDetail') }}
-
- + +
+
{{$t("profiling.features")}}
+
{{$t('profiling.iterationInfo')}}
+
+ {{$t('profiling.queueInfo')}}  + {{$t('profiling.iterationGapInfo')}} +
+
+ {{$t('profiling.fpbpTitle')}}  + {{$t('profiling.fpbpInfo')}} +
+
+ {{$t('profiling.iterativeTailingTitle')}}  + {{$t('profiling.iterativeTailingInfo')}} +
+
+
{{$t('profiling.statistics')}}
+
{{$t('profiling.totalTime')}}{{totalTime}}{{$t('profiling.millisecond')}}
+
{{$t('profiling.totalSteps')}}{{totalSteps}}
+
{{$t('profiling.fpbpTimeRatio')}}{{fpAndBp}}
+
{{$t('profiling.iterationGapTimeRatio')}}{{iterationInterval}}
+
{{$t('profiling.iterativeTailingTimeRatio')}}{{tail}}
+
@@ -78,19 +100,144 @@ limitations under the License.
{{ $t('profiling.mindData') }}
-
-
+
+ +
+
{{$t("profiling.features")}}
+
{{$t('profiling.dataProcess')}}
+
{{$t('profiling.dataProcessInfo')}}
+
{{$t('profiling.analysisOne')}}
+
{{$t('profiling.analysisTwo')}}
+
{{$t('profiling.higherAnalysis')}}
+
+
{{$t('profiling.statistics')}}
+
{{$t('profiling.chipInfo')}} + {{queueInfoEmptyNum}}/{{queueInfoTotalNum}} +
+
+
{{$t('profiling.hostIsEmpty')}} + {{deviceInfoEmptyNum}}/{{deviceInfoTotalNum}} +
+
{{$t('profiling.hostIsFull')}} + {{deviceInfoFullNum}}/{{deviceInfoTotalNum}} +
+
+
+ +
+
-
-
- -

- {{$t("public.stayTuned")}} -

+
+
+
+ {{$t('profiling.pipeline')}} +
+
+ +
+
+
+ +
+
+ +
+
+ +
+
+
{{$t('profiling.connectorQuene')}}
+
+
+ {{$t('profiling.queueTip2')}} + + {{processSummary.device.empty}}/{{processSummary.device.total}} + +
+
+ {{$t('profiling.queueTip1')}} + + {{processSummary.device.full}}/{{processSummary.device.total}} + +
+
+ +
+
+ {{$t('profiling.deviceQueueOp')}} +
+
{{$t('profiling.deviceQueueOpTip')}} | TDT
+
+ +
+
+
+ +
+
+ +
+
+ +
+
+
{{$t('profiling.dataQueue')}}
+
+
+ {{$t('profiling.queueTip2')}} + + {{processSummary.get_next.empty}}/{{processSummary.get_next.total}} + +
+
+ {{$t('profiling.queueTip1')}} + + {{processSummary.get_next.full}}/{{processSummary.get_next.total}} + +
+
+
+ +
+
+ {{$t('profiling.getData')}} +
+
+
+
+
+ +
+

{{$t("public.noData")}}

@@ -140,18 +287,35 @@ limitations under the License.
{{ $t('profiling.timeLine') }}
-
- {{ $t('profiling.viewDetail') }} +
+
-
-
- -

- {{$t("public.stayTuned")}} -

+
+
+ {{$t('profiling.opTotalTime')}}{{timelineInfo.totalTime}}ms +
+
+ {{$t('profiling.streamNum')}}{{timelineInfo.streamNum}} +
+
+ {{$t('profiling.opNum')}}{{timelineInfo.opNum}}
+
+ {{$t('profiling.opTimes')}}{{timelineInfo.opTimes + $t('profiling.times')}}
+ +
+
+
+ +
+

{{$t("public.noData")}}

@@ -164,6 +328,18 @@ import CommonProperty from '../../common/common-property'; export default { data() { return { + fpAndBp: '--', + iterationInterval: '--', + totalSteps: '--', + totalTime: '--', + tail: '--', + queueInfoShow: false, + deviceInfoShow: false, + queueInfoEmptyNum: '--', + queueInfoTotalNum: '--', + deviceInfoEmptyNum: '--', + deviceInfoTotalNum: '--', + deviceInfoFullNum: '--', svg: { data: [], svgPadding: 20, @@ -193,6 +369,33 @@ export default { topN: [], colorList: ['#6C92FA', '#6CBFFF', '#4EDED2', '#7ADFA0', '#A6DD82'], }, + perfetto: { + url: 'https://ui.perfetto.dev/#!', + data: null, + delay: 5000, + waiting: true, + }, + timelineInfo: { + totalTime: 0, + streamNum: 0, + opNum: 0, + opTimes: 0, + noData: true, + }, + processSummary: { + noData: true, + count: 6, + device: { + empty: 0, + full: 0, + total: 0, + }, + get_next: { + empty: 0, + full: 0, + total: 0, + }, + }, }; }, mounted() { @@ -231,10 +434,47 @@ export default { }, methods: { init() { + this.queryTimeline(); this.queryTrainingTrace(); + this.getProccessSummary(); this.initPieChart(); window.addEventListener('resize', this.resizeTrace, false); }, + getProccessSummary() { + const params = { + train_id: this.trainingJobId, + profile: this.summaryPath, + device_id: this.currentCard, + }; + RequestService.queryProcessSummary(params).then((resp) => { + if (resp && resp.data) { + const data = JSON.parse(JSON.stringify(resp.data)); + this.processSummary.count = Object.keys(data).length; + this.dealProcess(data); + + // 芯片侧 + if (resp.data.get_next_queue_info) { + this.queueInfoShow = true; + this.queueInfoEmptyNum = + resp.data.get_next_queue_info.summary.empty_batch_count; + this.queueInfoTotalNum = + resp.data.get_next_queue_info.summary.total_batch; + } + // 主机侧 + if (resp.data.device_queue_info) { + this.deviceInfoShow = true; + this.deviceInfoEmptyNum = + resp.data.device_queue_info.summary.empty_batch_count; + this.deviceInfoTotalNum = + 
resp.data.device_queue_info.summary.total_batch; + this.deviceInfoFullNum = + resp.data.device_queue_info.summary.full_batch_count; + } + } else { + this.dealProcess(null); + } + }); + }, viewDetail(path) { this.$router.push({ path, @@ -250,13 +490,15 @@ export default { option.tooltip = { trigger: 'item', formatter: (params) => { - return `${params.marker} ${params.data.name} ${params.percent}%`; + return `${params.data.name}
${params.marker}${params.percent}%`; }, + confine: true, + extraCssText: 'white-space:normal; word-break:break-word;', }; option.series = [ { type: 'pie', - center: ['50%', '50%'], + center: ['55%', '55%'], data: this.pieChart.data, radius: '50%', lable: { @@ -370,6 +612,19 @@ export default { setTimeout(() => { this.dealTraceData(); }, 100); + if (res.data.summary) { + this.fpAndBp = res.data.summary.fp_and_bp; + this.iterationInterval = res.data.summary.iteration_interval; + this.totalSteps = res.data.summary.total_steps; + this.totalTime = res.data.summary.total_time; + this.tail = res.data.summary.tail; + } else { + this.fpAndBp = '--'; + this.iterationInterval = '--'; + this.totalSteps = '--'; + this.totalTime = '--'; + this.tail = '--'; + } } else { document.querySelector('#trace').style.height = '0px'; this.svg.noData = true; @@ -568,6 +823,85 @@ export default { } return new Uint8Array(arr); }, + toPerfetto() { + if (this.perfetto.data) { + const popupwin = window.open(this.perfetto.url); + setTimeout(() => { + const params = { + perfetto: { + title: '', + buffer: this.perfetto.data, + }, + }; + if (popupwin) { + popupwin.postMessage(params, this.perfetto.url); + } + }, this.perfetto.delay); + } else { + this.perfetto.waiting = true; + } + }, + queryTimeline() { + const params = { + dir: this.relativePath, + device_id: this.currentCard, + }; + RequestService.queryTimlineInfo(params) + .then((res) => { + if (res && res.data) { + this.timelineInfo.noData = false; + this.timelineInfo.totalTime = res.data.total_time.toFixed(4); + this.timelineInfo.streamNum = res.data.num_of_streams; + this.timelineInfo.opNum = res.data.num_of_ops; + this.timelineInfo.opTimes = res.data.op_exe_times; + } else { + this.timelineInfo.noData = true; + } + }) + .catch(() => { + this.timelineInfo.noData = true; + }); + this.perfetto.waiting = true; + RequestService.queryTimeline(params).then((res) => { + if (res && res.data) { + this.perfetto.data = this.stringToUint8Array( + 
JSON.stringify(res.data), + ); + this.perfetto.waiting = false; + } + }); + }, + dealProcess(data) { + this.processSummary.device = { + empty: 0, + full: 0, + total: 0, + }; + this.processSummary.get_next = { + empty: 0, + full: 0, + total: 0, + }; + this.processSummary.noData = true; + + if (data) { + if (data.device_queue_info && data.device_queue_info.summary) { + this.processSummary.device = { + empty: data.device_queue_info.summary.empty_batch_count, + full: data.device_queue_info.summary.full_batch_count, + total: data.device_queue_info.summary.total_batch, + }; + } + if (data.get_next_queue_info && data.get_next_queue_info.summary) { + this.processSummary.get_next = { + empty: data.get_next_queue_info.summary.empty_batch_count, + full: data.get_next_queue_info.summary.full_batch_count, + total: data.get_next_queue_info.summary.total_batch, + }; + } + this.processSummary.noData = false; + } + }, }, destroyed() { window.removeEventListener('resize', this.resizeTrace, false); @@ -576,13 +910,20 @@ export default { };