/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/iomap.h>
#include "internal.h"

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_HZP;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
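
/*
 * Illustrative sketch (hedged, not part of this file): a block-based
 * filesystem would typically call dax_do_io() from its ->direct_IO method
 * when the inode is DAX-enabled, passing its own get_block callback.  The
 * "foo" names below are placeholders, not real in-tree code:
 *
 *	static ssize_t foo_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, foo_get_block,
 *					 NULL, DIO_LOCKING);
 *		return foo_blockdev_direct_IO(iocb, iter);
 *	}
 */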

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked. The function must be called with
 * mapping->tree_lock held
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked. The function must be called with
 * mapping->tree_lock held
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked. The function must be called with
 * mapping->tree_lock held
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up the entry at the given index in the radix tree; if it is an
 * exceptional entry, wait for it to become unlocked and return it. The
 * caller must call put_unlocked_mapping_entry() if it decides not to lock
 * the entry, or put_locked_mapping_entry() once it has locked the entry and
 * wants to unlock it again.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!entry || !radix_tree_exceptional_entry(entry) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with the radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

/*
 * Find the radix tree entry at the given index. If it points to a page,
 * return with the page locked. If it points to an exceptional entry, return
 * with the radix tree entry locked. If the radix tree doesn't contain the
 * given index, create an empty exceptional entry for the index and return
 * with it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries (either zero pages or DAX entries)
 * within the 2MiB range that we are requesting.
 *
 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (!radix_tree_exceptional_entry(entry) ||
			    dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (radix_tree_exceptional_entry(entry) &&
			    dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index, entry);
			return ERR_PTR(err);
		}

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_range(mapping,
				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

		spin_lock_irq(&mapping->tree_lock);

		if (pmd_downgrade) {
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Someone already created the entry?  This is a
			 * normal failure when inserting PMDs in a range
			 * that already contains PTEs.  In that case we want
			 * to return -EEXIST immediately.
			 */
			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
				goto restart;
			/*
			 * Our insertion of a DAX PMD entry failed, most
			 * likely because it collided with a PTE sized entry
			 * at a different index in the PMD range.  We haven't
			 * inserted anything into the radix tree and have no
			 * waiters to wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(entry)) {
		struct page *page = entry;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	entry = lock_slot(mapping, slot);
 out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}

/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen an exceptional entry for this index, we had better
	 * find it at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists? Return it...  */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
		struct page *to, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector,
				      unsigned long flags)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
		/* replacing huge zero page with PMD block mapping */
		unmap_mapping_range(mapping,
			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(sector, flags);

	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = __radix_tree_insert(page_tree, index,
				dax_radix_order(new_entry), new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto unlock;
	}

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the sector we pull from
	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
	 * worry about partial PMD writebacks.
	 */
	dax.sector = dax_radix_sector(entry);
	dax.size = PAGE_SIZE << dax_radix_order(entry);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
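
/*
 * Illustrative sketch (hedged, not part of this file): a DAX-aware
 * filesystem would typically call this from its ->writepages method so that
 * fsync()/msync() flush dirty cache lines for DAX mappings out to the
 * persistent media.  The "foo" names are placeholders:
 *
 *	static int foo_dax_writepages(struct address_space *mapping,
 *				      struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 */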

static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, sector_t sector, size_t size,
		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
					bh.b_size, new_page, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			return VM_FAULT_LOCKED;
		}
		vmf->entry = entry;
		return VM_FAULT_DAX_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
			bh.b_size, &entry, vma, vmf);
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_fault);
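
/*
 * Illustrative sketch (hedged, not part of this file): a filesystem using
 * the get_block based fault path typically wires dax_fault() into its
 * vm_operations_struct, reusing the same handler for ->page_mkwrite and
 * using dax_pfn_mkwrite() below for ->pfn_mkwrite.  The "foo" names are
 * placeholders:
 *
 *	static int foo_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, foo_get_block);
 *	}
 *
 *	static const struct vm_operations_struct foo_dax_vm_ops = {
 *		.fault		= foo_dax_fault,
 *		.page_mkwrite	= foo_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 */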

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	put_unlocked_mapping_entry(mapping, index, entry);
out:
	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector		= sector,
		.size		= PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
		return -EINVAL;

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0 || !buffer_written(&bh))
		return err;

	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
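
/*
 * Illustrative sketch (hedged, not part of this file): when shrinking a DAX
 * inode, a filesystem would zero the partial tail page before updating
 * i_size so that stale data is not exposed past the new EOF.  The "foo"
 * names are placeholders:
 *
 *	static int foo_dax_setsize(struct inode *inode, loff_t newsize)
 *	{
 *		int error;
 *
 *		error = dax_truncate_page(inode, newsize, foo_get_block);
 *		if (error)
 *			return error;
 *		truncate_setsize(inode, newsize);
 *		return 0;
 *	}
 */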

#ifdef CONFIG_FS_IOMAP
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}

static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		struct blk_dax_ctl dax = { 0 };
		ssize_t map_len;

		dax.sector = dax_iomap_sector(iomap, pos);
		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
		map_len = dax_map_atomic(iomap->bdev, &dax);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		dax.addr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (iov_iter_rw(iter) == WRITE)
			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
		else
			map_len = copy_to_iter(dax.addr, map_len, iter);
		dax_unmap_atomic(iomap->bdev, &dax);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE)
		flags |= IOMAP_WRITE;

	/*
	 * Yes, even DAX files can have page cache attached to them:  A zeroed
	 * page is inserted into the pagecache when we have to serve a write
	 * fault on a hole.  It should never be dirtied and can simply be
	 * dropped from the pagecache once we get real data for the page.
	 *
	 * XXX: This is racy against mmap, and there's nothing we can do about
	 * it. We'll eventually need to shift this down even further so that
	 * we can check if we allocated blocks over a hole first.
	 */
	if (mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				pos >> PAGE_SHIFT,
				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
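
/*
 * Illustrative sketch (hedged, not part of this file): with the iomap based
 * path the filesystem only supplies its struct iomap_ops and handles the
 * inode locking itself.  The "foo" names are placeholders:
 *
 *	static ssize_t foo_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = dax_iomap_rw(iocb, to, &foo_iomap_ops);
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 */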

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in their fault
 * or mkwrite handler for DAX files. Assumes the caller has done all the
 * necessary locking for the page fault to proceed successfully.
 */
int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	sector_t sector;
	struct iomap iomap = { 0 };
	unsigned flags = 0;
	int error, major = 0;
	int locked_status = 0;
	void *entry;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (pos >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	/*
	 * Note that we don't bother to use iomap_apply here: DAX required
	 * the file system block size to be equal the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error)
		goto unlock_entry;
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		error = -EIO;		/* fs corruption? */
		goto finish_iomap;
	}

	sector = dax_iomap_sector(&iomap, pos);

	if (vmf->cow_page) {
		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
					vmf->cow_page, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto finish_iomap;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			locked_status = VM_FAULT_LOCKED;
		} else {
			vmf->entry = entry;
			locked_status = VM_FAULT_DAX_LOCKED;
		}
		goto finish_iomap;
	}

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_insert_mapping(mapping, iomap.bdev, sector,
				PAGE_SIZE, &entry, vma, vmf);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			locked_status = dax_load_hole(mapping, entry, vmf);
			break;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		if (error) {
			/* keep previous error */
			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PAGE_SIZE,
					PAGE_SIZE, flags, &iomap);
		}
	}
 unlock_entry:
	if (!locked_status || error)
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if (error < 0 && error != -EBUSY)
		return VM_FAULT_SIGBUS | major;
	if (locked_status) {
		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
		return locked_status;
	}
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
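
/*
 * Illustrative sketch (hedged, not part of this file): the iomap based
 * fault handler only needs the filesystem's iomap_ops.  The "foo" names are
 * placeholders:
 *
 *	static int foo_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		return dax_iomap_fault(vma, vmf, &foo_iomap_ops);
 *	}
 */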

#ifdef CONFIG_FS_DAX_PMD
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below functions.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, loff_t pos, bool write, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct block_device *bdev = iomap->bdev;
	struct blk_dax_ctl dax = {
		.sector = dax_iomap_sector(iomap, pos),
		.size = PMD_SIZE,
	};
	long length = dax_map_atomic(bdev, &dax);
	void *ret;

	if (length < 0) /* dax_map_atomic() failed */
		return VM_FAULT_FALLBACK;
	if (length < PMD_SIZE)
		goto unmap_fallback;
	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
		goto unmap_fallback;
	if (!pfn_t_devmap(dax.pfn))
		goto unmap_fallback;

	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);

 unmap_fallback:
	dax_unmap_atomic(bdev, &dax);
	return VM_FAULT_FALLBACK;
}

static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	void *ret;

	zero_page = mm_get_huge_zero_page(vma->vm_mm);

	if (unlikely(!zero_page))
		return VM_FAULT_FALLBACK;

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			RADIX_DAX_PMD | RADIX_DAX_HZP);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	ptl = pmd_lock(vma->vm_mm, pmd);
	if (!pmd_none(*pmd)) {
		spin_unlock(ptl);
		return VM_FAULT_FALLBACK;
	}

	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
	spin_unlock(ptl);
	return VM_FAULT_NOPAGE;
}

int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = write ? IOMAP_WRITE : 0;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	struct vm_fault vmf;
	void *entry;
	loff_t pos;
	int error;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	if (pgoff > max_pgoff)
		return VM_FAULT_SIGBUS;

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto unlock_entry;
	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
				&iomap, pos, write, &entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			goto finish_iomap;
		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
				&entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		if (result == VM_FAULT_FALLBACK) {
			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
					iomap_flags, &iomap);
			if (error)
				result = VM_FAULT_FALLBACK;
		}
	}
 unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, pmd, address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
	return result;
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
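
/*
 * Illustrative sketch (hedged, not part of this file): a filesystem that
 * wants huge page DAX faults wires this into ->pmd_fault next to its
 * PTE-sized ->fault handler.  The "foo" names are placeholders:
 *
 *	static int foo_dax_pmd_fault(struct vm_area_struct *vma,
 *			unsigned long addr, pmd_t *pmd, unsigned int flags)
 *	{
 *		return dax_iomap_pmd_fault(vma, addr, pmd, flags,
 *				&foo_iomap_ops);
 *	}
 */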
#endif /* CONFIG_FS_DAX_PMD */
#endif /* CONFIG_FS_IOMAP */