提交 691429e1 编写于 作者: L Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton:
 "10 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  dax: move writeback calls into the filesystems
  dax: give DAX clearing code correct bdev
  ext4: online defrag not supported with DAX
  ext2, ext4: only set S_DAX for regular inodes
  block: disable block device DAX by default
  ocfs2: unlock inode if deleting inode from orphan fails
  mm: ASLR: use get_random_long()
  drivers: char: random: add get_random_long()
  mm: numa: quickly fail allocations for NUMA balancing on full nodes
  mm: thp: fix SMP race condition between THP page fault and MADV_DONTNEED
......@@ -173,7 +173,7 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
......
......@@ -53,10 +53,10 @@ unsigned long arch_mmap_rnd(void)
#ifdef CONFIG_COMPAT
if (test_thread_flag(TIF_32BIT))
rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
else
#endif
rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
......
......@@ -146,7 +146,7 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
rnd = (unsigned long)get_random_int();
rnd = get_random_long();
rnd <<= PAGE_SHIFT;
if (TASK_IS_32BIT_ADDR)
rnd &= 0xfffffful;
......@@ -174,7 +174,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
static inline unsigned long brk_rnd(void)
{
unsigned long rnd = get_random_int();
unsigned long rnd = get_random_long();
rnd = rnd << PAGE_SHIFT;
/* 8MB for 32bit, 256MB for 64bit */
......
......@@ -1768,9 +1768,9 @@ static inline unsigned long brk_rnd(void)
/* 8MB for 32bit, 1GB for 64bit */
if (is_32bit_task())
rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT)));
else
rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT)));
return rnd << PAGE_SHIFT;
}
......
......@@ -59,9 +59,9 @@ unsigned long arch_mmap_rnd(void)
/* 8MB for 32bit, 1GB for 64bit */
if (is_32bit_task())
rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT));
rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
else
rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT));
rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
return rnd << PAGE_SHIFT;
}
......
......@@ -264,7 +264,7 @@ static unsigned long mmap_rnd(void)
unsigned long rnd = 0UL;
if (current->flags & PF_RANDOMIZE) {
unsigned long val = get_random_int();
unsigned long val = get_random_long();
if (test_thread_flag(TIF_32BIT))
rnd = (val % (1UL << (23UL-PAGE_SHIFT)));
else
......
......@@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void)
if (mmap_is_ia32())
#ifdef CONFIG_COMPAT
rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
#else
rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
#endif
else
rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
......
......@@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
config BLK_DEV_DAX
bool "Block device DAX support"
depends on FS_DAX
depends on BROKEN
help
When DAX support is available (CONFIG_FS_DAX) raw block
devices can also support direct userspace access to the
storage capacity via MMAP(2) similar to a file on a
DAX-enabled filesystem. However, the DAX I/O-path disables
some standard I/O-statistics, and the MMAP(2) path has some
operational differences due to bypassing the page
cache. If in doubt, say N.
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
......
......@@ -1818,6 +1818,28 @@ unsigned int get_random_int(void)
}
EXPORT_SYMBOL(get_random_int);
/*
* Same as get_random_int(), but returns unsigned long.
*/
unsigned long get_random_long(void)
{
__u32 *hash;
unsigned long ret;
if (arch_get_random_long(&ret))
return ret;
hash = get_cpu_var(get_random_int_hash);
hash[0] += current->pid + jiffies + random_get_entropy();
md5_transform(hash, random_int_secret);
ret = *(unsigned long *)hash;
put_cpu_var(get_random_int_hash);
return ret;
}
EXPORT_SYMBOL(get_random_long);
/*
* randomize_range() returns a start address such that
*
......
......@@ -653,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
random_variable = (unsigned long) get_random_int();
random_variable = get_random_long();
random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
......
......@@ -1201,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
bdev->bd_inode->i_flags = S_DAX;
else
bdev->bd_inode->i_flags = 0;
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
......@@ -1693,13 +1697,24 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
if (dax_mapping(mapping)) {
struct block_device *bdev = I_BDEV(mapping->host);
return dax_writeback_mapping_range(mapping, bdev, wbc);
}
return generic_writepages(mapping, wbc);
}
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.writepages = generic_writepages,
.writepages = blkdev_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
.is_dirty_writeback = buffer_check_dirty_writeback,
......
......@@ -79,15 +79,14 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
}
/*
* dax_clear_blocks() is called from within transaction context from XFS,
* dax_clear_sectors() is called from within transaction context from XFS,
* and hence this means the stack from this point must follow GFP_NOFS
* semantics for all operations.
*/
int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
{
struct block_device *bdev = inode->i_sb->s_bdev;
struct blk_dax_ctl dax = {
.sector = block << (inode->i_blkbits - 9),
.sector = _sector,
.size = _size,
};
......@@ -109,7 +108,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
EXPORT_SYMBOL_GPL(dax_clear_sectors);
/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
......@@ -485,11 +484,10 @@ static int dax_writeback_one(struct block_device *bdev,
* end]. This is required by data integrity operations to ensure file data is
* on persistent storage prior to completion of the operation.
*/
int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
loff_t end)
int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct block_device *bdev = inode->i_sb->s_bdev;
pgoff_t start_index, end_index, pmd_index;
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
......@@ -500,8 +498,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
start_index = start >> PAGE_CACHE_SHIFT;
end_index = end >> PAGE_CACHE_SHIFT;
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
return 0;
start_index = wbc->range_start >> PAGE_CACHE_SHIFT;
end_index = wbc->range_end >> PAGE_CACHE_SHIFT;
pmd_index = DAX_PMD_INDEX(start_index);
rcu_read_lock();
......
......@@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode,
* so that it's not found by another thread before it's
* initialised
*/
err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
1 << inode->i_blkbits);
err = dax_clear_sectors(inode->i_sb->s_bdev,
le32_to_cpu(chain[depth-1].key) <<
(inode->i_blkbits - 9),
1 << inode->i_blkbits);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
......@@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
#ifdef CONFIG_FS_DAX
if (dax_mapping(mapping)) {
return dax_writeback_mapping_range(mapping,
mapping->host->i_sb->s_bdev,
wbc);
}
#endif
return mpage_writepages(mapping, wbc, ext2_get_block);
}
......@@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
if (test_opt(inode->i_sb, DAX))
if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
inode->i_flags |= S_DAX;
}
......
......@@ -2478,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping,
trace_ext4_writepages(inode, wbc);
if (dax_mapping(mapping))
return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
* a transaction for special inodes like journal inode on last iput()
......@@ -4155,7 +4159,7 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
if (test_opt(inode->i_sb, DAX))
if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
......
......@@ -583,6 +583,11 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
goto mext_out;
} else if (IS_DAX(inode)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with DAX");
err = -EOPNOTSUPP;
goto mext_out;
}
err = mnt_want_write_file(filp);
......
......@@ -956,6 +956,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
update_isize, end);
if (tmp_ret < 0) {
ocfs2_inode_unlock(inode, 1);
ret = tmp_ret;
mlog_errno(ret);
brelse(di_bh);
......
......@@ -55,7 +55,7 @@ xfs_count_page_state(
} while ((bh = bh->b_this_page) != head);
}
STATIC struct block_device *
struct block_device *
xfs_find_bdev_for_inode(
struct inode *inode)
{
......@@ -1208,6 +1208,10 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
if (dax_mapping(mapping))
return dax_writeback_mapping_range(mapping,
xfs_find_bdev_for_inode(mapping->host), wbc);
return generic_writepages(mapping, wbc);
}
......
......@@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
#endif /* __XFS_AOPS_H__ */
......@@ -75,7 +75,8 @@ xfs_zero_extent(
ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
if (IS_DAX(VFS_I(ip)))
return dax_clear_blocks(VFS_I(ip), block, size);
return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
sector, size);
/*
* let the block layer decide on the fastest method of
......
......@@ -7,7 +7,7 @@
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
get_block_t, dio_iodone_t, int flags);
int dax_clear_blocks(struct inode *, sector_t block, long size);
int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
......@@ -52,6 +52,8 @@ static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
}
int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
loff_t end);
struct writeback_control;
int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc);
#endif
......@@ -34,6 +34,7 @@ extern const struct file_operations random_fops, urandom_fops;
#endif
unsigned int get_random_int(void);
unsigned long get_random_long(void);
unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
u32 prandom_u32(void);
......
......@@ -446,7 +446,8 @@ int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
if (mapping->nrpages) {
if ((!dax_mapping(mapping) && mapping->nrpages) ||
(dax_mapping(mapping) && mapping->nrexceptional)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
......@@ -482,13 +483,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
if (dax_mapping(mapping) && mapping->nrexceptional) {
err = dax_writeback_mapping_range(mapping, lstart, lend);
if (err)
return err;
}
if (mapping->nrpages) {
if ((!dax_mapping(mapping) && mapping->nrpages) ||
(dax_mapping(mapping) && mapping->nrexceptional)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
......
......@@ -3404,8 +3404,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
/*
* If a huge pmd materialized under us just retry later. Use
* pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
* didn't become pmd_trans_huge under us and then back to pmd_none, as
* a result of MADV_DONTNEED running immediately after a huge pmd fault
* in a different thread of this mm, in turn leading to a misleading
* pmd_trans_huge() retval. All we have to ensure is that it is a
* regular pmd that we can walk with pte_offset_map() and we can do that
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
......
......@@ -1582,7 +1582,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
(GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE | __GFP_NOMEMALLOC |
__GFP_NORETRY | __GFP_NOWARN) &
~(__GFP_IO | __GFP_FS), 0);
~__GFP_RECLAIM, 0);
return newpage;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册