amdgpu_job.c
/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"

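/*
 * Timeout handler called by the DRM scheduler when a job did not complete
 * within the ring's timeout: try a lightweight soft recovery first and fall
 * back to a full GPU reset only if that is not enough.
 */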
static void amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_device *adev = ring->adev;

	memset(&ti, 0, sizeof(struct amdgpu_task_info));

	if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		return;
	}

	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);
	DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

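	/*
	 * Soft recovery did not help: either start a full GPU reset or, if
	 * recovery is not wanted, just keep the scheduler timeout suspended
	 * (and flag the TDR for SR-IOV debugging).
	 */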
	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		amdgpu_device_gpu_recover(ring->adev, job);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}
}

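/*
 * Allocate a job with storage for @num_ibs IBs placed directly behind it.
 * The scheduler pointer is initialized to the first ring only so that adev
 * is always reachable before the job is pushed to a real entity.
 */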
int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
		     struct amdgpu_job **job, struct amdgpu_vm *vm)
{
	size_t size = sizeof(struct amdgpu_job);

	if (num_ibs == 0)
		return -EINVAL;

	size += sizeof(struct amdgpu_ib) * num_ibs;

	*job = kzalloc(size, GFP_KERNEL);
	if (!*job)
		return -ENOMEM;

	/*
	 * Initialize the scheduler to at least some ring so that we always
	 * have a pointer to adev.
	 */
	(*job)->base.sched = &adev->rings[0]->sched;
	(*job)->vm = vm;
	(*job)->ibs = (void *)&(*job)[1];
	(*job)->num_ibs = num_ibs;

	amdgpu_sync_create(&(*job)->sync);
	amdgpu_sync_create(&(*job)->sched_sync);
	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

	return 0;
}

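/*
 * Convenience wrapper: allocate a job carrying a single IB of @size bytes
 * taken from the given IB pool.
 */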
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
		enum amdgpu_ib_pool_type pool_type,
		struct amdgpu_job **job)
{
	int r;

	r = amdgpu_job_alloc(adev, 1, job, NULL);
	if (r)
		return r;

	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
	if (r)
		kfree(*job);

	return r;
}

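/*
 * Release the IBs attached to the job. They are freed against the
 * scheduler's finished fence when available, the hardware fence otherwise.
 */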
void amdgpu_job_free_resources(struct amdgpu_job *job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
	struct dma_fence *f;
	unsigned i;

	/* use sched fence if available */
	f = job->base.s_fence ? &job->base.s_fence->finished : job->fence;

	for (i = 0; i < job->num_ibs; ++i)
		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
}

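/*
 * free_job callback of the scheduler: tear down the scheduler bookkeeping
 * and drop the job's fence and sync containers.
 */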
static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	drm_sched_job_cleanup(s_job);

	dma_fence_put(job->fence);
	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);
	kfree(job);
}

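/*
 * Free a job that is not owned by the scheduler, e.g. on error paths before
 * submission or after direct submission to a ring.
 */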
void amdgpu_job_free(struct amdgpu_job *job)
{
	amdgpu_job_free_resources(job);

	dma_fence_put(job->fence);
	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);
	kfree(job);
}

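/*
 * Hand the job over to a scheduler entity. On success *f holds a reference
 * to the scheduler's finished fence and the scheduler owns the job.
 */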
int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
		      void *owner, struct dma_fence **f)
{
	int r;

	if (!f)
		return -EINVAL;

	r = drm_sched_job_init(&job->base, entity, owner);
	if (r)
		return r;

	*f = dma_fence_get(&job->base.s_fence->finished);
	amdgpu_job_free_resources(job);
	drm_sched_entity_push_job(&job->base, entity);

	return 0;
}

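/*
 * Direct submission: schedule the job's IBs on the ring right away instead
 * of going through the scheduler, return the hardware fence and free the
 * job.
 */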
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence)
{
	int r;

	job->base.sched = &ring->sched;
	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, NULL, fence);
	job->fence = dma_fence_get(*fence);
	if (r)
		return r;

	amdgpu_job_free(job);
	return 0;
}

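/*
 * dependency callback of the scheduler: return the next fence the job still
 * has to wait for and grab a VMID once all explicit dependencies are
 * satisfied.
 */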
static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
					       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct amdgpu_vm *vm = job->vm;
	struct dma_fence *fence;
	bool explicit = false;
	int r;

	fence = amdgpu_sync_get_fence(&job->sync, &explicit);
	if (fence && explicit) {
		if (drm_sched_dependency_optimized(fence, s_entity)) {
			r = amdgpu_sync_fence(&job->sched_sync, fence, false);
			if (r)
				DRM_ERROR("Error adding fence (%d)\n", r);
		}
	}

	while (fence == NULL && vm && !job->vmid) {
		r = amdgpu_vmid_grab(vm, ring, &job->sync,
				     &job->base.s_fence->finished,
				     job);
		if (r)
			DRM_ERROR("Error getting VM ID (%d)\n", r);

		fence = amdgpu_sync_get_fence(&job->sync, NULL);
	}

	return fence;
}

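/*
 * run_job callback of the scheduler: push the job's IBs to the hardware
 * ring and return the hardware fence. The IBs are skipped with -ECANCELED
 * if VRAM was lost since the job was created.
 */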
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));

	trace_amdgpu_sched_run_job(job);

	if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */

	if (finished->error < 0) {
		DRM_INFO("Skip scheduling IBs!\n");
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			DRM_ERROR("Error scheduling IBs (%d)\n", r);
	}
	/* if gpu reset, hw fence will be replaced here */
	dma_fence_put(job->fence);
	job->fence = dma_fence_get(fence);

	amdgpu_job_free_resources(job);

	fence = r ? ERR_PTR(r) : fence;
	return fence;
}

#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
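	/*
	 * Force-complete every job on this scheduler with -EHWPOISON: jobs
	 * still queued in the entities are signaled without being run, and
	 * jobs already pushed to the hardware get their finished fence
	 * poisoned as well.
	 */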
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		struct drm_sched_rq *rq = &sched->sched_rq[i];

		if (!rq)
			continue;

		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}

const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.dependency = amdgpu_job_dependency,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};