提交 d1bf0e2c 编写于 作者: M Moshe Shemesh 提交者: Saeed Mahameed

net/mlx5: Report devlink health on FW issues

Use devlink_health_report() to report any symptom of a FW issue, such as
a FW counter miss or a new health syndrome.
FW issues are detected in mlx5 during poll_health(), which is called in
timer (atomic) context, so the health work queue is used to schedule the
reports.
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
上级 fd1483fe
...@@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, ...@@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg); return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
} }
static void mlx5_fw_reporter_err_work(struct work_struct *work)
{
struct mlx5_fw_reporter_ctx fw_reporter_ctx;
struct mlx5_core_health *health;
health = container_of(work, struct mlx5_core_health, report_work);
if (IS_ERR_OR_NULL(health->fw_reporter))
return;
fw_reporter_ctx.err_synd = health->synd;
fw_reporter_ctx.miss_counter = health->miss_counter;
if (fw_reporter_ctx.err_synd) {
devlink_health_report(health->fw_reporter,
"FW syndrom reported", &fw_reporter_ctx);
return;
}
if (fw_reporter_ctx.miss_counter)
devlink_health_report(health->fw_reporter,
"FW miss counter reported",
&fw_reporter_ctx);
}
static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
.name = "fw", .name = "fw",
.diagnose = mlx5_fw_reporter_diagnose, .diagnose = mlx5_fw_reporter_diagnose,
...@@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t) ...@@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t)
{ {
struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
struct mlx5_core_health *health = &dev->priv.health; struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;
u32 fatal_error; u32 fatal_error;
u8 prev_synd;
u32 count; u32 count;
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
...@@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t) ...@@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t)
if (health->miss_counter == MAX_MISSES) { if (health->miss_counter == MAX_MISSES) {
mlx5_core_err(dev, "device's health compromised - reached miss count\n"); mlx5_core_err(dev, "device's health compromised - reached miss count\n");
print_health_info(dev); print_health_info(dev);
queue_work(health->wq, &health->report_work);
} }
prev_synd = health->synd;
health->synd = ioread8(&h->synd);
if (health->synd && health->synd != prev_synd)
queue_work(health->wq, &health->report_work);
fatal_error = check_fatal_sensors(dev); fatal_error = check_fatal_sensors(dev);
if (fatal_error && !health->fatal_error) { if (fatal_error && !health->fatal_error) {
...@@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) ...@@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
spin_lock_irqsave(&health->wq_lock, flags); spin_lock_irqsave(&health->wq_lock, flags);
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
spin_unlock_irqrestore(&health->wq_lock, flags); spin_unlock_irqrestore(&health->wq_lock, flags);
cancel_work_sync(&health->report_work);
cancel_work_sync(&health->work); cancel_work_sync(&health->work);
} }
...@@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) ...@@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
return -ENOMEM; return -ENOMEM;
spin_lock_init(&health->wq_lock); spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_care); INIT_WORK(&health->work, health_care);
INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
mlx5_fw_reporter_create(dev); mlx5_fw_reporter_create(dev);
......
...@@ -435,7 +435,7 @@ struct mlx5_core_health { ...@@ -435,7 +435,7 @@ struct mlx5_core_health {
struct timer_list timer; struct timer_list timer;
u32 prev; u32 prev;
int miss_counter; int miss_counter;
bool sick; u8 synd;
u32 fatal_error; u32 fatal_error;
u32 crdump_size; u32 crdump_size;
/* wq spinlock to synchronize draining */ /* wq spinlock to synchronize draining */
...@@ -443,6 +443,7 @@ struct mlx5_core_health { ...@@ -443,6 +443,7 @@ struct mlx5_core_health {
struct workqueue_struct *wq; struct workqueue_struct *wq;
unsigned long flags; unsigned long flags;
struct work_struct work; struct work_struct work;
struct work_struct report_work;
struct delayed_work recover_work; struct delayed_work recover_work;
struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_reporter;
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册