Commit 58b013c3 authored by mindspore-ci-bot, committed by Gitee

!363 clear the warning scan by package

Merge pull request !363 from SanjayChan/labao
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2020 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
......
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2020 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
......
@@ -19,7 +19,6 @@
 namespace mindspore {
 namespace kernel {
 DropoutGpuFwdKernel::DropoutGpuFwdKernel()
     : cudnn_handle_(nullptr),
       is_null_input_(false),
......
@@ -18,7 +18,6 @@
 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(BatchNormFold2,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)
......
@@ -132,7 +132,6 @@ class BatchNormFold2GpuKernel : public GpuKernel {
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;
 };
 }  // namespace kernel
 }  // namespace mindspore
......
@@ -18,7 +18,6 @@
 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(BatchNormFold2Grad,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)
......
@@ -18,7 +18,6 @@
 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(BatchNormFold,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)
......
@@ -54,7 +54,6 @@ class CorrectionMulGpuKernel : public GpuKernel {
     }
     auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
     if (input_shape.size() != 4) {
       MS_LOG(ERROR) << "CorrectionMulGpuKernel input shape needs (N,C,H,W).";
       return false;
......
@@ -19,7 +19,6 @@
 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(CorrectionMulGrad,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)
......
@@ -61,7 +61,6 @@ class CorrectionMulGradGpuKernel : public GpuKernel {
     }
     auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
     if (input_shape.size() != 4) {
       MS_LOG(ERROR) << "CorrectionMulGradGpuKernel input shape needs (N,C,H,W).";
       return false;
......
@@ -114,6 +114,36 @@ void FakeQuantPerChannelGpuKernel::InitSizeLists() {
   workspace_size_list_.push_back(workspace_size_);
 }
+
+void FakeQuantPerChannelGpuKernel::CalFakeQuantizeForTraining(float *input, float *output, float *input_min,
+                                                              float *input_max, float *d_nudge_min, float *d_nudge_max,
+                                                              float *d_scale, uintptr_t stream_ptr) {
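+  // Training mode: refresh the per-channel min/max via EMA, then fake-quantize once quant_delay has elapsed.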
+  // Calculate the input min and max according to the parameters ema and ema_decay.
+  CalMinMaxPerChannel(input, input_min, input_max, input_size_ / sizeof(float), channel_out_, ema_decay_, ema_,
+                      reinterpret_cast<cudaStream_t>(stream_ptr));
+  // control flow for quant_delay
+  if (global_step_ >= quant_delay_) {
+    // real launch
+    CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
+                       reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max,
+                              d_scale, symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
+  } else {
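+    // Before quant_delay steps have elapsed, pass the input through unchanged.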
+    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpy(output, input, input_size_, cudaMemcpyDeviceToDevice),
+                              "Copy gpu memory failed.");
+  }
+  global_step_++;
+}
+
+void FakeQuantPerChannelGpuKernel::CalFakeQuantizeForInfer(float *input, float *output, float *input_min,
+                                                           float *input_max, float *d_nudge_min, float *d_nudge_max,
+                                                           float *d_scale, uintptr_t stream_ptr) {
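+  // Inference mode: nudge the recorded min/max and quantize directly; no EMA update or quant_delay handling.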
+  // real launch
+  CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
+                     reinterpret_cast<cudaStream_t>(stream_ptr));
+  CalFakeQuantizePerChannel(input, output, input_size_ / sizeof(float), channel_out_, d_nudge_min, d_nudge_max,
+                            d_scale, symmetric_, reinterpret_cast<cudaStream_t>(stream_ptr));
+}
 bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                                           const std::vector<AddressPtr> &workspace,
                                           const std::vector<AddressPtr> &outputs, uintptr_t stream_ptr) {
@@ -126,11 +156,8 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
   if (input == nullptr) {
     MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input is null.";
   }
-  if (input_min == nullptr) {
-    MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input min is null.";
-  }
-  if (input_max == nullptr) {
-    MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input max is null.";
+  if (input_min == nullptr || input_max == nullptr) {
+    MS_LOG(EXCEPTION) << "FakeQuantPerChannelGpuKernel input min or max is null.";
   }
   // Allocate space for device copies
@@ -143,30 +170,11 @@ bool FakeQuantPerChannelGpuKernel::Launch(const std::vector<AddressPtr> &inputs,
                             "Malloc gpu memory failed");
   CHECK_CUDA_RET_WITH_ERROR(cudaMalloc(reinterpret_cast<void **>(&d_nudge_max), sizeof(float) * channel_out_),
                             "Malloc gpu memory failed");
-  int total_size = input_size_ / sizeof(float);
-  bool symmetric = false;
   if (training_) {
-    // calculate the input min and max according by the parameter ema and ema_decay.
-    CalMinMaxPerChannel(input, input_min, input_max, total_size, channel_out_, ema_decay_, ema_,
-                        reinterpret_cast<cudaStream_t>(stream_ptr));
-    // control flow for quant_delay
-    if (global_step_ >= quant_delay_) {
-      // real launch
-      CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
-                         reinterpret_cast<cudaStream_t>(stream_ptr));
-      CalFakeQuantizePerChannel(input, output, total_size, channel_out_, d_nudge_min, d_nudge_max, d_scale, symmetric,
-                                reinterpret_cast<cudaStream_t>(stream_ptr));
-    } else {
-      CHECK_CUDA_RET_WITH_ERROR(cudaMemcpy(output, input, input_size_, cudaMemcpyDeviceToDevice),
-                                "Copy gpu memory failed.");
-    }
-    global_step_++;
+    CalFakeQuantizeForTraining(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr);
   } else {
-    // real launch
-    CalNudgePerChannel(input_min, input_max, quant_min_, quant_max_, d_nudge_min, d_nudge_max, d_scale, channel_out_,
-                       reinterpret_cast<cudaStream_t>(stream_ptr));
-    CalFakeQuantizePerChannel(input, output, total_size, channel_out_, d_nudge_min, d_nudge_max, d_scale, symmetric,
-                              reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalFakeQuantizeForInfer(input, output, input_min, input_max, d_nudge_min, d_nudge_max, d_scale, stream_ptr);
   }
   // Cleanup
......
@@ -39,6 +39,11 @@ class FakeQuantPerChannelGpuKernel : public GpuKernel {
   void InitSizeLists() override;
 
  private:
+  void CalFakeQuantizeForTraining(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min,
+                                  float *d_nudge_max, float *d_scale, uintptr_t stream_ptr);
+  void CalFakeQuantizeForInfer(float *input, float *output, float *input_min, float *input_max, float *d_nudge_min,
+                               float *d_nudge_max, float *d_scale, uintptr_t stream_ptr);
+
   size_t input_size_;
   size_t min_size_;
   size_t max_size_;
......