提交 d94aed5a 编写于 作者: M Marek Olšák 提交者: Alex Deucher

drm/amdgpu: add and implement the GPU reset status query

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Jammy Zhou <Jammy.Zhou@amd.com>
上级 1f8d9625
...@@ -1040,7 +1040,7 @@ struct amdgpu_vm_manager { ...@@ -1040,7 +1040,7 @@ struct amdgpu_vm_manager {
struct amdgpu_ctx_state { struct amdgpu_ctx_state {
uint64_t flags; uint64_t flags;
uint64_t hangs; uint32_t hangs;
}; };
struct amdgpu_ctx { struct amdgpu_ctx {
...@@ -1049,6 +1049,7 @@ struct amdgpu_ctx { ...@@ -1049,6 +1049,7 @@ struct amdgpu_ctx {
struct amdgpu_fpriv *fpriv; struct amdgpu_fpriv *fpriv;
struct amdgpu_ctx_state state; struct amdgpu_ctx_state state;
uint32_t id; uint32_t id;
unsigned reset_counter;
}; };
struct amdgpu_ctx_mgr { struct amdgpu_ctx_mgr {
...@@ -1897,8 +1898,6 @@ int amdgpu_ctx_alloc(struct amdgpu_device *adev,struct amdgpu_fpriv *fpriv, ...@@ -1897,8 +1898,6 @@ int amdgpu_ctx_alloc(struct amdgpu_device *adev,struct amdgpu_fpriv *fpriv,
uint32_t *id,uint32_t flags); uint32_t *id,uint32_t flags);
int amdgpu_ctx_free(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, int amdgpu_ctx_free(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv,
uint32_t id); uint32_t id);
int amdgpu_ctx_query(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv,
uint32_t id,struct amdgpu_ctx_state *state);
void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv); void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv);
struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id); struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id);
...@@ -2006,6 +2005,7 @@ struct amdgpu_device { ...@@ -2006,6 +2005,7 @@ struct amdgpu_device {
atomic64_t vram_vis_usage; atomic64_t vram_vis_usage;
atomic64_t gtt_usage; atomic64_t gtt_usage;
atomic64_t num_bytes_moved; atomic64_t num_bytes_moved;
atomic_t gpu_reset_counter;
/* display */ /* display */
struct amdgpu_mode_info mode_info; struct amdgpu_mode_info mode_info;
......
...@@ -81,21 +81,36 @@ int amdgpu_ctx_free(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, uint ...@@ -81,21 +81,36 @@ int amdgpu_ctx_free(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, uint
return -EINVAL; return -EINVAL;
} }
int amdgpu_ctx_query(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, uint32_t id, struct amdgpu_ctx_state *state) static int amdgpu_ctx_query(struct amdgpu_device *adev,
struct amdgpu_fpriv *fpriv, uint32_t id,
union drm_amdgpu_ctx_out *out)
{ {
struct amdgpu_ctx *ctx; struct amdgpu_ctx *ctx;
struct amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr; struct amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr;
unsigned reset_counter;
mutex_lock(&mgr->lock); mutex_lock(&mgr->lock);
ctx = idr_find(&mgr->ctx_handles, id); ctx = idr_find(&mgr->ctx_handles, id);
if (ctx) { if (!ctx) {
/* state should alter with CS activity */
*state = ctx->state;
mutex_unlock(&mgr->lock); mutex_unlock(&mgr->lock);
return 0; return -EINVAL;
} }
/* TODO: these two are always zero */
out->state.flags = ctx->state.flags;
out->state.hangs = ctx->state.hangs;
/* determine if a GPU reset has occurred since the last call */
reset_counter = atomic_read(&adev->gpu_reset_counter);
/* TODO: this should ideally return NO, GUILTY, or INNOCENT. */
if (ctx->reset_counter == reset_counter)
out->state.reset_status = AMDGPU_CTX_NO_RESET;
else
out->state.reset_status = AMDGPU_CTX_UNKNOWN_RESET;
ctx->reset_counter = reset_counter;
mutex_unlock(&mgr->lock); mutex_unlock(&mgr->lock);
return -EINVAL; return 0;
} }
void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv) void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv)
...@@ -115,12 +130,11 @@ void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv) ...@@ -115,12 +130,11 @@ void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv)
} }
int amdgpu_ctx_ioctl(struct drm_device *dev, void *data, int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp) struct drm_file *filp)
{ {
int r; int r;
uint32_t id; uint32_t id;
uint32_t flags; uint32_t flags;
struct amdgpu_ctx_state state;
union drm_amdgpu_ctx *args = data; union drm_amdgpu_ctx *args = data;
struct amdgpu_device *adev = dev->dev_private; struct amdgpu_device *adev = dev->dev_private;
...@@ -139,11 +153,7 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data, ...@@ -139,11 +153,7 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
r = amdgpu_ctx_free(adev, fpriv, id); r = amdgpu_ctx_free(adev, fpriv, id);
break; break;
case AMDGPU_CTX_OP_QUERY_STATE: case AMDGPU_CTX_OP_QUERY_STATE:
r = amdgpu_ctx_query(adev, fpriv, id, &state); r = amdgpu_ctx_query(adev, fpriv, id, &args->out);
if (r == 0) {
args->out.state.flags = state.flags;
args->out.state.hangs = state.hangs;
}
break; break;
default: default:
return -EINVAL; return -EINVAL;
......
...@@ -1781,6 +1781,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) ...@@ -1781,6 +1781,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
} }
adev->needs_reset = false; adev->needs_reset = false;
atomic_inc(&adev->gpu_reset_counter);
/* block TTM */ /* block TTM */
resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
......
...@@ -149,6 +149,12 @@ union drm_amdgpu_bo_list { ...@@ -149,6 +149,12 @@ union drm_amdgpu_bo_list {
#define AMDGPU_CTX_OP_STATE_RUNNING 1 #define AMDGPU_CTX_OP_STATE_RUNNING 1
/* GPU reset status */
#define AMDGPU_CTX_NO_RESET 0
#define AMDGPU_CTX_GUILTY_RESET 1 /* this context caused it */
#define AMDGPU_CTX_INNOCENT_RESET 2 /* some other context caused it */
#define AMDGPU_CTX_UNKNOWN_RESET 3 /* unknown cause */
struct drm_amdgpu_ctx_in { struct drm_amdgpu_ctx_in {
uint32_t op; uint32_t op;
uint32_t flags; uint32_t flags;
...@@ -164,7 +170,10 @@ union drm_amdgpu_ctx_out { ...@@ -164,7 +170,10 @@ union drm_amdgpu_ctx_out {
struct { struct {
uint64_t flags; uint64_t flags;
uint64_t hangs; /** Number of resets caused by this context so far. */
uint32_t hangs;
/** Reset status since the last call of the ioctl. */
uint32_t reset_status;
} state; } state;
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册