提交 cc5a2408 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!2491 add cpu kernel profiling log

Merge pull request !2491 from kisnwang/add-cpu-kernel-profiling
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "device/cpu/cpu_device_address.h" #include "device/cpu/cpu_device_address.h"
#include "utils/context/ms_context.h" #include "utils/context/ms_context.h"
#include "utils/config_manager.h" #include "utils/config_manager.h"
#include "utils/profile.h"
#include "common/utils.h" #include "common/utils.h"
#include "session/anf_runtime_algorithm.h" #include "session/anf_runtime_algorithm.h"
#include "session/session_basic.h" #include "session/session_basic.h"
...@@ -270,6 +271,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { ...@@ -270,6 +271,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
auto kernels = kernel_graph->execution_order(); auto kernels = kernel_graph->execution_order();
for (const auto &kernel : kernels) { for (const auto &kernel : kernels) {
#ifdef ENABLE_PROFILE
double start_time = GetTime();
#endif
std::vector<kernel::AddressPtr> kernel_inputs; std::vector<kernel::AddressPtr> kernel_inputs;
std::vector<kernel::AddressPtr> kernel_workspaces; std::vector<kernel::AddressPtr> kernel_workspaces;
std::vector<kernel::AddressPtr> kernel_outputs; std::vector<kernel::AddressPtr> kernel_outputs;
...@@ -297,6 +301,10 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { ...@@ -297,6 +301,10 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
if (!ret) { if (!ret) {
MS_LOG(EXCEPTION) << "Launch kernel failed."; MS_LOG(EXCEPTION) << "Launch kernel failed.";
} }
#ifdef ENABLE_PROFILE
double cost_time = GetTime() - start_time;
MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us";
#endif
} }
return true; return true;
} }
......
...@@ -29,7 +29,7 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en ...@@ -29,7 +29,7 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
auto linear = input_params->linear_; auto linear = input_params->linear_;
auto lr = input_params->lr_; auto lr = input_params->lr_;
auto l1 = input_params->l1_; auto l1 = input_params->l1_;
auto l2 = input_params->l2_; auto l2_plus = 2 * input_params->l2_;
auto lr_power = input_params->lr_power_; auto lr_power = input_params->lr_power_;
auto unique_sparse_grad = input_params->sparse_grad_; auto unique_sparse_grad = input_params->sparse_grad_;
auto var_first_dim_size = input_params->var_first_dim_size_; auto var_first_dim_size = input_params->var_first_dim_size_;
...@@ -44,21 +44,18 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en ...@@ -44,21 +44,18 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k]; auto summed_grad = unique_sparse_grad.value_[k];
auto accum_new = accum[j] + summed_grad * summed_grad; auto accum_new = accum[j] + summed_grad * summed_grad;
if (lr_power == -0.5) {
linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j];
} else {
linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * var[j];
}
auto x = Sign(linear[j]) * l1 - linear[j];
float y; float y;
if (lr_power == -0.5) { if (lr_power == -0.5) {
y = std::sqrt(accum_new) / lr + 2 * l2; y = std::sqrt(accum_new);
linear[j] += summed_grad - (y - std::sqrt(accum[j])) / lr * var[j];
} else { } else {
y = std::pow(accum_new, -lr_power) / lr + 2 * l2; y = std::pow(accum_new, -lr_power);
linear[j] += summed_grad - (y - std::pow(accum[j], -lr_power)) / lr * var[j];
} }
auto pre_shrink = x / y;
var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0;
accum[j] = accum_new; accum[j] = accum_new;
auto x = Sign(linear[j]) * l1 - linear[j];
y = y / lr + l2_plus;
var[j] = std::fabs(linear[j]) > l1 ? x / y : 0;
} }
} }
} }
......
...@@ -112,10 +112,10 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, ...@@ -112,10 +112,10 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
auto tensor_address = tensor->device_address(); auto tensor_address = tensor->device_address();
bool need_sync = false; bool need_sync = false;
if (ms_context->enable_pynative_infer()) { if (ms_context->enable_pynative_infer()) {
if (tensor_address.get() == nullptr || tensor_address != device_address) { if (tensor_address == nullptr || tensor_address != device_address) {
need_sync = true; need_sync = true;
} }
} else if (tensor->is_dirty()) { } else if (tensor->is_dirty() || tensor_address == nullptr) {
need_sync = true; need_sync = true;
} else if (tensor_address != device_address) { } else if (tensor_address != device_address) {
if (tensor_address->DeviceType() == device_address->DeviceType()) { if (tensor_address->DeviceType() == device_address->DeviceType()) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册