/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"

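/*
 * drm_sched timeout handler: try a lightweight soft recovery of the ring
 * first; if that is not possible, fall back to a full GPU reset, or just
 * suspend the timeout when recovery is not allowed.
 */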
static void amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_device *adev = ring->adev;

	memset(&ti, 0, sizeof(struct amdgpu_task_info));

	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		return;
	}

	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);
	DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		amdgpu_device_gpu_recover(ring->adev, job);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}
}

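/*
 * Allocate a job together with storage for num_ibs IBs and initialize its
 * sync objects; the caller fills in the IBs and then submits or frees the
 * job.
 */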
int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
		     struct amdgpu_job **job, struct amdgpu_vm *vm)
{
	size_t size = sizeof(struct amdgpu_job);

	if (num_ibs == 0)
		return -EINVAL;

	size += sizeof(struct amdgpu_ib) * num_ibs;

	*job = kzalloc(size, GFP_KERNEL);
	if (!*job)
		return -ENOMEM;

	/*
	 * Initialize the scheduler to at least some ring so that we always
	 * have a pointer to adev.
	 */
	(*job)->base.sched = &adev->rings[0]->sched;
	(*job)->vm = vm;
	(*job)->ibs = (void *)&(*job)[1];
	(*job)->num_ibs = num_ibs;

	amdgpu_sync_create(&(*job)->sync);
	amdgpu_sync_create(&(*job)->sched_sync);
	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

	return 0;
}

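/*
 * Convenience wrapper: allocate a job with a single IB of the given size
 * from the selected IB pool.
 */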
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
		enum amdgpu_ib_pool_type pool_type,
		struct amdgpu_job **job)
{
	int r;

	r = amdgpu_job_alloc(adev, 1, job, NULL);
	if (r)
		return r;

	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
	if (r)
		kfree(*job);

	return r;
}

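/*
 * Free the job's IBs; the scheduler's finished fence (or the hardware
 * fence as fallback) guards reuse of the IB memory.
 */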
void amdgpu_job_free_resources(struct amdgpu_job *job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
	struct dma_fence *f;
	unsigned i;

	/* use sched fence if available */
	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;

	for (i = 0; i < job->num_ibs; ++i)
		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
}

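/*
 * free_job callback of the scheduler: drop the job's fence and sync
 * objects and free the job itself once the scheduler is done with it.
 */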
static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	drm_sched_job_cleanup(s_job);

	dma_fence_put(job->fence);
	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);
	kfree(job);
}

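/*
 * Free a job that was not handed over to the scheduler, e.g. on error
 * paths or after a direct submission.
 */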
void amdgpu_job_free(struct amdgpu_job *job)
{
	amdgpu_job_free_resources(job);

	dma_fence_put(job->fence);
	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);
	kfree(job);
}

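/*
 * Hand the job over to a scheduler entity; the fence returned in *f
 * signals when the job has finished executing.
 */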
int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
		      void *owner, struct dma_fence **f)
{
	int r;

	if (!f)
		return -EINVAL;

	r = drm_sched_job_init(&job->base, entity, owner);
	if (r)
		return r;

	*f = dma_fence_get(&job->base.s_fence->finished);
	amdgpu_job_free_resources(job);
	drm_sched_entity_push_job(&job->base, entity);

	return 0;
}

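/*
 * Submit the job's IBs directly on a ring, bypassing the scheduler, and
 * free the job immediately afterwards.
 */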
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence)
{
	int r;

	job->base.sched = &ring->sched;
	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
	job->fence = dma_fence_get(*fence);
	if (r)
		return r;

	amdgpu_job_free(job);
	return 0;
}

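/*
 * dependency callback of the scheduler: return the next fence this job
 * still has to wait for before it can run.
 */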
static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
					       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct amdgpu_vm *vm = job->vm;
	struct dma_fence *fence;
	int r;

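	/*
	 * Grab the next fence to wait for; if the scheduler can optimize the
	 * dependency away (same ring), remember it in sched_sync so the
	 * submission path still knows about it.
	 */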
	fence = amdgpu_sync_get_fence(&job->sync);
	if (fence && drm_sched_dependency_optimized(fence, s_entity)) {
		r = amdgpu_sync_fence(&job->sched_sync, fence);
		if (r)
			DRM_ERROR("Error adding fence (%d)\n", r);
	}

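	/*
	 * While there is nothing else to wait for, make sure the job has a
	 * VMID assigned; grabbing one can add new fences to the sync object.
	 */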
	while (fence == NULL && vm && !job->vmid) {
		r = amdgpu_vmid_grab(vm, ring, &job->sync,
				     &job->base.s_fence->finished,
				     job);
		if (r)
			DRM_ERROR("Error getting VM ID (%d)\n", r);

		fence = amdgpu_sync_get_fence(&job->sync);
	}

	return fence;
}

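/*
 * run_job callback of the scheduler: push the job's IBs to the ring and
 * return the hardware fence of the submission.
 */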
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));

	trace_amdgpu_sched_run_job(job);

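	/*
	 * Skip the actual submission while a GPU reset is in progress; the
	 * finished fence is then flagged with -ECANCELED instead.
	 */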
	if (down_read_trylock(&ring->adev->reset_sem)) {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
					&fence);
		up_read(&ring->adev->reset_sem);
		if (r)
			DRM_ERROR("Error scheduling IBs (%d)\n", r);
	} else {
		dma_fence_set_error(finished, -ECANCELED);
		DRM_INFO("Skip scheduling IBs!\n");
	}

	/* if gpu reset, hw fence will be replaced here */
	dma_fence_put(job->fence);
	job->fence = dma_fence_get(fence);

	amdgpu_job_free_resources(job);

	fence = r ? ERR_PTR(r) : fence;
	return fence;
}

#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

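/*
 * Signal and fail all jobs of a scheduler, both the ones still sitting in
 * the entity queues and the ones already pushed to the hardware ring.
 */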
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		struct drm_sched_rq *rq = &sched->sched_rq[i];

		if (!rq)
			continue;

		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}

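/* Scheduler backend callbacks used for amdgpu rings. */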
const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.dependency = amdgpu_job_dependency,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};