blk-ioc.c 9.4 KB
Newer Older
J
Jens Axboe 已提交
1 2 3 4 5 6 7 8 9
/*
 * Functions related to io context handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
10
#include <linux/slab.h>
J
Jens Axboe 已提交
11 12 13 14 15 16 17 18

#include "blk.h"

/*
 * For io context allocations
 *
 * Slab cache from which every struct io_context in this file is allocated
 * and to which it is returned.  Created once by blk_ioc_init().
 */
static struct kmem_cache *iocontext_cachep;

19 20 21 22 23 24 25 26 27 28 29 30 31
/**
 * get_io_context - increment reference count to io_context
 * @ioc: io_context to get
 *
 * Increment reference count to @ioc.
 */
void get_io_context(struct io_context *ioc)
{
	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
	atomic_long_inc(&ioc->refcount);
}
EXPORT_SYMBOL(get_io_context);

32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
/*
 * Releasing ioc may nest into another put_io_context() leading to nested
 * fast path release.  As the ioc's can't be the same, this is okay but
 * makes lockdep whine.  Keep track of nesting and use it as subclass.
 *
 * With CONFIG_LOCKDEP the depth is a per-queue counter
 * (q->ioc_release_depth).  ioc_release_depth() tolerates a NULL @q and
 * reports 0 for it; the inc/dec helpers dereference @q unconditionally.
 * Without CONFIG_LOCKDEP the depth is always 0 and inc/dec are no-ops.
 */
#ifdef CONFIG_LOCKDEP
#define ioc_release_depth(q)		((q) ? (q)->ioc_release_depth : 0)
#define ioc_release_depth_inc(q)	(q)->ioc_release_depth++
#define ioc_release_depth_dec(q)	(q)->ioc_release_depth--
#else
#define ioc_release_depth(q)		0
#define ioc_release_depth_inc(q)	do { } while (0)
#define ioc_release_depth_dec(q)	do { } while (0)
#endif

/*
 * Slow path for ioc release in put_io_context().  Performs double-lock
 * dancing to unlink all cic's and then frees ioc.
 */
static void ioc_release_fn(struct work_struct *work)
J
Jens Axboe 已提交
52
{
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
	struct io_context *ioc = container_of(work, struct io_context,
					      release_work);
	struct request_queue *last_q = NULL;

	spin_lock_irq(&ioc->lock);

	while (!hlist_empty(&ioc->cic_list)) {
		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
							 struct cfq_io_context,
							 cic_list);
		struct request_queue *this_q = cic->q;

		if (this_q != last_q) {
			/*
			 * Need to switch to @this_q.  Once we release
			 * @ioc->lock, it can go away along with @cic.
			 * Hold on to it.
			 */
			__blk_get_queue(this_q);

			/*
			 * blk_put_queue() might sleep thanks to kobject
			 * idiocy.  Always release both locks, put and
			 * restart.
			 */
			if (last_q) {
				spin_unlock(last_q->queue_lock);
				spin_unlock_irq(&ioc->lock);
				blk_put_queue(last_q);
			} else {
				spin_unlock_irq(&ioc->lock);
			}

			last_q = this_q;
			spin_lock_irq(this_q->queue_lock);
			spin_lock(&ioc->lock);
			continue;
		}
		ioc_release_depth_inc(this_q);
		cic->exit(cic);
		cic->release(cic);
		ioc_release_depth_dec(this_q);
	}
96

97 98 99 100 101 102
	if (last_q) {
		spin_unlock(last_q->queue_lock);
		spin_unlock_irq(&ioc->lock);
		blk_put_queue(last_q);
	} else {
		spin_unlock_irq(&ioc->lock);
103
	}
104 105

	kmem_cache_free(iocontext_cachep, ioc);
J
Jens Axboe 已提交
106 107
}

T
Tejun Heo 已提交
108 109 110
/**
 * put_io_context - put a reference of io_context
 * @ioc: io_context to put
111
 * @locked_q: request_queue the caller is holding queue_lock of (hint)
T
Tejun Heo 已提交
112 113
 *
 * Decrement reference count of @ioc and release it if the count reaches
114 115 116
 * zero.  If the caller is holding queue_lock of a queue, it can indicate
 * that with @locked_q.  This is an optimization hint and the caller is
 * allowed to pass in %NULL even when it's holding a queue_lock.
J
Jens Axboe 已提交
117
 */
118
void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
J
Jens Axboe 已提交
119
{
120 121 122
	struct request_queue *last_q = locked_q;
	unsigned long flags;

J
Jens Axboe 已提交
123
	if (ioc == NULL)
T
Tejun Heo 已提交
124
		return;
J
Jens Axboe 已提交
125

T
Tejun Heo 已提交
126
	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
127 128
	if (locked_q)
		lockdep_assert_held(locked_q->queue_lock);
J
Jens Axboe 已提交
129

T
Tejun Heo 已提交
130 131
	if (!atomic_long_dec_and_test(&ioc->refcount))
		return;
J
Jens Axboe 已提交
132

133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
	/*
	 * Destroy @ioc.  This is a bit messy because cic's are chained
	 * from both ioc and queue, and ioc->lock nests inside queue_lock.
	 * The inner ioc->lock should be held to walk our cic_list and then
	 * for each cic the outer matching queue_lock should be grabbed.
	 * ie. We need to do reverse-order double lock dancing.
	 *
	 * Another twist is that we are often called with one of the
	 * matching queue_locks held as indicated by @locked_q, which
	 * prevents performing double-lock dance for other queues.
	 *
	 * So, we do it in two stages.  The fast path uses the queue_lock
	 * the caller is holding and, if other queues need to be accessed,
	 * uses trylock to avoid introducing locking dependency.  This can
	 * handle most cases, especially if @ioc was performing IO on only
	 * single device.
	 *
	 * If trylock doesn't cut it, we defer to @ioc->release_work which
	 * can do all the double-locking dancing.
	 */
	spin_lock_irqsave_nested(&ioc->lock, flags,
				 ioc_release_depth(locked_q));

	while (!hlist_empty(&ioc->cic_list)) {
		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
							 struct cfq_io_context,
							 cic_list);
		struct request_queue *this_q = cic->q;

		if (this_q != last_q) {
			if (last_q && last_q != locked_q)
				spin_unlock(last_q->queue_lock);
			last_q = NULL;

			if (!spin_trylock(this_q->queue_lock))
				break;
			last_q = this_q;
			continue;
		}
		ioc_release_depth_inc(this_q);
		cic->exit(cic);
		cic->release(cic);
		ioc_release_depth_dec(this_q);
	}
J
Jens Axboe 已提交
177

178 179
	if (last_q && last_q != locked_q)
		spin_unlock(last_q->queue_lock);
J
Jens Axboe 已提交
180

181
	spin_unlock_irqrestore(&ioc->lock, flags);
182

183 184 185 186 187
	/* if no cic's left, we're done; otherwise, kick release_work */
	if (hlist_empty(&ioc->cic_list))
		kmem_cache_free(iocontext_cachep, ioc);
	else
		schedule_work(&ioc->release_work);
J
Jens Axboe 已提交
188
}
189
EXPORT_SYMBOL(put_io_context);
J
Jens Axboe 已提交
190

191
/* Called by the exiting task */
192
void exit_io_context(struct task_struct *task)
J
Jens Axboe 已提交
193 194 195
{
	struct io_context *ioc;

196 197 198
	/* PF_EXITING prevents new io_context from being attached to @task */
	WARN_ON_ONCE(!(current->flags & PF_EXITING));

199 200 201 202
	task_lock(task);
	ioc = task->io_context;
	task->io_context = NULL;
	task_unlock(task);
J
Jens Axboe 已提交
203

204 205
	atomic_dec(&ioc->nr_tasks);
	put_io_context(ioc, NULL);
J
Jens Axboe 已提交
206 207
}

208 209 210
/*
 * Allocate a zeroed io_context and try to install it as @task->io_context.
 *
 * @task:      task to attach the new io_context to
 * @gfp_flags: allocation flags (__GFP_ZERO is added here)
 * @node:      allocation node
 * @take_ref:  if true, take an extra reference on the returned io_context
 *
 * The new io_context starts with refcount 1 and nr_tasks 1.  Installation
 * happens under task_lock() and only if @task has no io_context yet and is
 * not PF_EXITING; otherwise the fresh object is freed and whatever
 * @task->io_context currently holds (possibly NULL) is used instead.
 *
 * Returns the io_context now attached to @task, or NULL if the allocation
 * failed or nothing is attached.
 */
static struct io_context *create_task_io_context(struct task_struct *task,
						 gfp_t gfp_flags, int node,
						 bool take_ref)
{
	struct io_context *ioc;

	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
				    node);
	if (unlikely(!ioc))
		return NULL;

	/* initialize */
	atomic_long_set(&ioc->refcount, 1);
	atomic_set(&ioc->nr_tasks, 1);
	spin_lock_init(&ioc->lock);
	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
	INIT_HLIST_HEAD(&ioc->cic_list);
	INIT_WORK(&ioc->release_work, ioc_release_fn);

	/* try to install, somebody might already have beaten us to it */
	task_lock(task);

	if (!task->io_context && !(task->flags & PF_EXITING)) {
		task->io_context = ioc;
	} else {
		kmem_cache_free(iocontext_cachep, ioc);
		ioc = task->io_context;
	}

	/* take the caller's reference while task_lock still pins @ioc */
	if (ioc && take_ref)
		get_io_context(ioc);

	task_unlock(task);
	return ioc;
}

T
Tejun Heo 已提交
244 245 246 247
/**
 * current_io_context - get io_context of %current
 * @gfp_flags: allocation flags, used if allocation is necessary
 * @node: allocation node, used if allocation is necessary
 *
 * Return io_context of %current.  If it doesn't exist, it is created with
 * @gfp_flags and @node.  The returned io_context does NOT have its
 * reference count incremented.  Because io_context is exited only on task
 * exit, %current can be sure that the returned io_context is valid and
 * alive as long as it is executing.
 *
 * May return %NULL if an io_context had to be created and that failed.
 */
struct io_context *current_io_context(gfp_t gfp_flags, int node)
{
	might_sleep_if(gfp_flags & __GFP_WAIT);

	/* fast path: already attached, no locking and no extra reference */
	if (current->io_context)
		return current->io_context;

	return create_task_io_context(current, gfp_flags, node, false);
}
EXPORT_SYMBOL(current_io_context);
J
Jens Axboe 已提交
265

266 267 268 269 270 271 272 273 274
/**
 * get_task_io_context - get io_context of a task
 * @task: task of interest
 * @gfp_flags: allocation flags, used if allocation is necessary
 * @node: allocation node, used if allocation is necessary
 *
 * Return io_context of @task.  If it doesn't exist, it is created with
 * @gfp_flags and @node.  The returned io_context has its reference count
 * incremented.
J
Jens Axboe 已提交
275
 *
276 277
 * This function always goes through task_lock() and it's better to use
 * current_io_context() + get_io_context() for %current.
J
Jens Axboe 已提交
278
 */
279 280
struct io_context *get_task_io_context(struct task_struct *task,
				       gfp_t gfp_flags, int node)
J
Jens Axboe 已提交
281
{
282
	struct io_context *ioc;
J
Jens Axboe 已提交
283

284 285 286 287 288 289 290 291 292 293 294 295
	might_sleep_if(gfp_flags & __GFP_WAIT);

	task_lock(task);
	ioc = task->io_context;
	if (likely(ioc)) {
		get_io_context(ioc);
		task_unlock(task);
		return ioc;
	}
	task_unlock(task);

	return create_task_io_context(task, gfp_flags, node, true);
J
Jens Axboe 已提交
296
}
297
EXPORT_SYMBOL(get_task_io_context);
J
Jens Axboe 已提交
298

299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
void ioc_set_changed(struct io_context *ioc, int which)
{
	struct cfq_io_context *cic;
	struct hlist_node *n;

	hlist_for_each_entry(cic, n, &ioc->cic_list, cic_list)
		set_bit(which, &cic->changed);
}

/**
 * ioc_ioprio_changed - record a new ioprio and notify all cic's
 * @ioc: io_context of interest
 * @ioprio: new ioprio
 *
 * Stores @ioprio in @ioc and sets %CIC_IOPRIO_CHANGED on every cic, all
 * under @ioc->lock with interrupts saved and disabled.  The iosched is
 * responsible for checking the bit and applying the change on the request
 * issue path.
 */
void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
{
	unsigned long irqflags;

	spin_lock_irqsave(&ioc->lock, irqflags);

	ioc->ioprio = ioprio;
	ioc_set_changed(ioc, CIC_IOPRIO_CHANGED);

	spin_unlock_irqrestore(&ioc->lock, irqflags);
}

/**
 * ioc_cgroup_changed - notify all cic's of a cgroup change
 * @ioc: io_context of interest
 *
 * Sets %CIC_CGROUP_CHANGED on every cic of @ioc under @ioc->lock with
 * interrupts saved and disabled.  The iosched is responsible for checking
 * the bit and applying the change on the request issue path.
 */
void ioc_cgroup_changed(struct io_context *ioc)
{
	unsigned long irqflags;

	spin_lock_irqsave(&ioc->lock, irqflags);

	ioc_set_changed(ioc, CIC_CGROUP_CHANGED);

	spin_unlock_irqrestore(&ioc->lock, irqflags);
}

A
Adrian Bunk 已提交
344
/*
 * Create the "blkdev_ioc" slab cache that backs struct io_context.
 *
 * Registered as a subsys initcall.  The cache is created with SLAB_PANIC
 * (panic if creation fails), so the result is stored without a NULL check
 * and the function always returns 0.
 */
static int __init blk_ioc_init(void)
{
	iocontext_cachep = kmem_cache_create("blkdev_ioc",
			sizeof(struct io_context), 0, SLAB_PANIC, NULL);
	return 0;
}
subsys_initcall(blk_ioc_init);