提交 cc5a2408 作者: mindspore-ci-bot 提交者: Gitee

!2491 add cpu kernel profiling log

Merge pull request !2491 from kisnwang/add-cpu-kernel-profiling
......@@ -26,6 +26,7 @@
#include "device/cpu/cpu_device_address.h"
#include "utils/context/ms_context.h"
#include "utils/config_manager.h"
#include "utils/profile.h"
#include "common/utils.h"
#include "session/anf_runtime_algorithm.h"
#include "session/session_basic.h"
......@@ -270,6 +271,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
auto kernels = kernel_graph->execution_order();
for (const auto &kernel : kernels) {
#ifdef ENABLE_PROFILE
double start_time = GetTime();
#endif
std::vector<kernel::AddressPtr> kernel_inputs;
std::vector<kernel::AddressPtr> kernel_workspaces;
std::vector<kernel::AddressPtr> kernel_outputs;
......@@ -297,6 +301,10 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) {
if (!ret) {
MS_LOG(EXCEPTION) << "Launch kernel failed.";
}
#ifdef ENABLE_PROFILE
double cost_time = GetTime() - start_time;
MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us";
#endif
}
return true;
}
......
......@@ -29,7 +29,7 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
auto linear = input_params->linear_;
auto lr = input_params->lr_;
auto l1 = input_params->l1_;
auto l2 = input_params->l2_;
auto l2_plus = 2 * input_params->l2_;
auto lr_power = input_params->lr_power_;
auto unique_sparse_grad = input_params->sparse_grad_;
auto var_first_dim_size = input_params->var_first_dim_size_;
......@@ -44,21 +44,18 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en
for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) {
auto summed_grad = unique_sparse_grad.value_[k];
auto accum_new = accum[j] + summed_grad * summed_grad;
if (lr_power == -0.5) {
linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j];
} else {
linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * var[j];
}
auto x = Sign(linear[j]) * l1 - linear[j];
float y;
if (lr_power == -0.5) {
y = std::sqrt(accum_new) / lr + 2 * l2;
y = std::sqrt(accum_new);
linear[j] += summed_grad - (y - std::sqrt(accum[j])) / lr * var[j];
} else {
y = std::pow(accum_new, -lr_power) / lr + 2 * l2;
y = std::pow(accum_new, -lr_power);
linear[j] += summed_grad - (y - std::pow(accum[j], -lr_power)) / lr * var[j];
}
auto pre_shrink = x / y;
var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0;
accum[j] = accum_new;
auto x = Sign(linear[j]) * l1 - linear[j];
y = y / lr + l2_plus;
var[j] = std::fabs(linear[j]) > l1 ? x / y : 0;
}
}
}
......
......@@ -112,10 +112,10 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
auto tensor_address = tensor->device_address();
bool need_sync = false;
if (ms_context->enable_pynative_infer()) {
if (tensor_address.get() == nullptr || tensor_address != device_address) {
if (tensor_address == nullptr || tensor_address != device_address) {
need_sync = true;
}
} else if (tensor->is_dirty()) {
} else if (tensor->is_dirty() || tensor_address == nullptr) {
need_sync = true;
} else if (tensor_address != device_address) {
if (tensor_address->DeviceType() == device_address->DeviceType()) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册