Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton: "15 fixes" [ This does not merge the "fortify: use WARN instead of BUG for now" patch, which needs a bit of extra work to build cleanly with all configurations. Arnd is on it. - Linus ] * emailed patches from Andrew Morton <akpm@linux-foundation.org>: ocfs2: don't clear SGID when inheriting ACLs mm: allow page_cache_get_speculative in interrupt context userfaultfd: non-cooperative: flush event_wqh at release time ipc: add missing container_of()s for randstruct cpuset: fix a deadlock due to incomplete patching of cpusets_enabled() userfaultfd_zeropage: return -ENOSPC in case mm has gone mm: take memory hotplug lock within numa_zonelist_order_handler() mm/page_io.c: fix oops during block io poll in swapin path zram: do not free pool->size_class kthread: fix documentation build warning kasan: avoid -Wmaybe-uninitialized warning userfaultfd: non-cooperative: notify about unmap of destination during mremap mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries pid: kill pidhash_size in pidhash_init() mm/hugetlb.c: __get_user_pages ignores certain follow_hugetlb_page errors

Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton: "15 fixes" [ This does not merge the "fortify: use WARN instead of BUG for now" patch, which needs a bit of extra work to build cleanly with all configurations. Arnd is on it. - Linus ] * emailed patches from Andrew Morton <akpm@linux-foundation.org>: ocfs2: don't clear SGID when inheriting ACLs mm: allow page_cache_get_speculative in interrupt context userfaultfd: non-cooperative: flush event_wqh at release time ipc: add missing container_of()s for randstruct cpuset: fix a deadlock due to incomplete patching of cpusets_enabled() userfaultfd_zeropage: return -ENOSPC in case mm has gone mm: take memory hotplug lock within numa_zonelist_order_handler() mm/page_io.c: fix oops during block io poll in swapin path zram: do not free pool->size_class kthread: fix documentation build warning kasan: avoid -Wmaybe-uninitialized warning userfaultfd: non-cooperative: notify about unmap of destination during mremap mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries pid: kill pidhash_size in pidhash_init() mm/hugetlb.c: __get_user_pages ignores certain follow_hugetlb_page errors
995d03ae · Linus Torvalds · 8d3fe85f · 19ec8e48 · 995d03ae · 995d03ae
22 changed file
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -240,18 +240,6 @@ int ocfs2_set_acl(handle_t *handle,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
-		if (acl) {
-			umode_t mode;
-
-			ret = posix_acl_update_mode(inode, &mode, &acl);
-			if (ret)
-				return ret;
-
-			ret = ocfs2_acl_set_mode(inode, di_bh,
-						 handle, mode);
-			if (ret)
-				return ret;
-		}
 		break;
 	case ACL_TYPE_DEFAULT:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
@@ -289,7 +277,19 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
 	if (had_lock < 0)
 		return had_lock;
+	if (type == ACL_TYPE_ACCESS && acl) {
+		umode_t mode;
+
+		status = posix_acl_update_mode(inode, &mode, &acl);
+		if (status)
+			goto unlock;
+
+		status = ocfs2_acl_set_mode(inode, bh, NULL, mode);
+		if (status)
+			goto unlock;
+	}
 	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+unlock:
 	ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
 	brelse(bh);
 	return status;

--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -854,6 +854,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
 	spin_unlock(&ctx->fault_pending_wqh.lock);

+	/* Flush pending events that may still wait on event_wqh */
+	wake_up_all(&ctx->event_wqh);
+
 	wake_up_poll(&ctx->fd_wqh, POLLHUP);
 	userfaultfd_ctx_put(ctx);
 	return 0;
@@ -1643,6 +1646,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
 				     uffdio_zeropage.range.len);
 		mmput(ctx->mm);
+	} else {
+		return -ENOSPC;
 	}
 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
 		return -EFAULT;

--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,6 +18,19 @@

 #ifdef CONFIG_CPUSETS

+/*
+ * Static branch rewrites can happen in an arbitrary order for a given
+ * key. In code paths where we need to loop with read_mems_allowed_begin() and
+ * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
+ * to ensure that begin() always gets rewritten before retry() in the
+ * disabled -> enabled transition. If not, then if local irqs are disabled
+ * around the loop, we can deadlock since retry() would always be
+ * comparing the latest value of the mems_allowed seqcount against 0 as
+ * begin() still would see cpusets_enabled() as false. The enabled -> disabled
+ * transition should happen in reverse order for the same reasons (want to stop
+ * looking at real value of mems_allowed.sequence in retry() first).
+ */
+extern struct static_key_false cpusets_pre_enable_key;
 extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
@@ -32,12 +45,14 @@ static inline int nr_cpusets(void)

 static inline void cpuset_inc(void)
 {
+	static_branch_inc(&cpusets_pre_enable_key);
 	static_branch_inc(&cpusets_enabled_key);
 }

 static inline void cpuset_dec(void)
 {
 	static_branch_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_pre_enable_key);
 }

 extern int cpuset_init(void);
@@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void);
 */
 static inline unsigned int read_mems_allowed_begin(void)
 {
-	if (!cpusets_enabled())
+	if (!static_branch_unlikely(&cpusets_pre_enable_key))
 		return 0;

 	return read_seqcount_begin(&current->mems_allowed_seq);
@@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void)
 */
 static inline bool read_mems_allowed_retry(unsigned int seq)
 {
-	if (!cpusets_enabled())
+	if (!static_branch_unlikely(&cpusets_enabled_key))
 		return false;

 	return read_seqcount_retry(&current->mems_allowed_seq, seq);

--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -15,7 +15,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 * @threadfn: the function to run in the thread
 * @data: data pointer for @threadfn()
 * @namefmt: printf-style format string for the thread name
- * @...: arguments for @namefmt.
+ * @arg...: arguments for @namefmt.
 *
 * This macro will create a kthread on the current node, leaving it in
 * the stopped state.  This is just a helper for kthread_create_on_node();

--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -494,6 +494,10 @@ struct mm_struct {
 	 * PROT_NONE or PROT_NUMA mapped page.
 	 */
 	bool tlb_flush_pending;
+#endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	/* See flush_tlb_batched_pending() */
+	bool tlb_flush_batched;
 #endif
 	struct uprobes_state uprobes_state;
 #ifdef CONFIG_HUGETLB_PAGE

--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -163,8 +163,6 @@ void release_pages(struct page **pages, int nr, bool cold);
 */
 static inline int page_cache_get_speculative(struct page *page)
 {
-	VM_BUG_ON(in_interrupt());
-
 #ifdef CONFIG_TINY_RCU
 # ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic() && !irqs_disabled());

--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1034,7 +1034,8 @@ void msg_exit_ns(struct ipc_namespace *ns)
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct msg_queue *msq = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);

 	seq_printf(s,
 		   "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",

--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2179,7 +2179,8 @@ void exit_sem(struct task_struct *tsk)
 static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct sem_array *sma = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
 	time_t sem_otime;

 	/*

--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1380,9 +1380,11 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct shmid_kernel *shp = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct shmid_kernel *shp;
 	unsigned long rss = 0, swp = 0;

+	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
 	shm_add_rss_swap(shp, &rss, &swp);

 #if BITS_PER_LONG <= 32

--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -63,6 +63,7 @@
 #include <linux/cgroup.h>
 #include <linux/wait.h>

+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

 /* See "Frequency meter" comments, below. */

--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 */
 void __init pidhash_init(void)
 {
-	unsigned int pidhash_size;
-
 	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
 					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
 					   &pidhash_shift, NULL,
 					   0, 4096);
-	pidhash_size = 1U << pidhash_shift;
 }

 void __init pidmap_init(void)

--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4078,6 +4078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
+	int err = -EFAULT;

 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -4154,11 +4155,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			}
 			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
 			if (ret & VM_FAULT_ERROR) {
-				int err = vm_fault_to_errno(ret, flags);
-
-				if (err)
-					return err;
-
+				err = vm_fault_to_errno(ret, flags);
 				remainder = 0;
 				break;
 			}
@@ -4213,7 +4210,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	*position = vaddr;

-	return i ? i : -EFAULT;
+	return i ? i : err;
 }

 #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE

--- a/mm/internal.h
+++ b/mm/internal.h
@@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
 #else
 static inline void try_to_unmap_flush(void)
 {
@@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void)
 static inline void try_to_unmap_flush_dirty(void)
 {
 }
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

 extern const struct trace_print_flags pageflag_names[];

--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size,
 	disable_trace_on_warning();

 	info.access_addr = (void *)addr;
+	info.first_bad_addr = (void *)addr;
 	info.access_size = size;
 	info.is_write = is_write;
 	info.ip = ip;

--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,

 	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1197,6 +1197,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;

--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	    atomic_read(&vma->vm_mm->mm_users) == 1)
 		target_node = numa_node_id();

+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;

--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();

 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
@@ -428,6 +429,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
 		struct vm_userfaultfd_ctx *uf,
+		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = current->mm;
@@ -446,7 +448,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (addr + old_len > new_addr && new_addr + new_len > addr)
 		goto out;

-	ret = do_munmap(mm, new_addr, new_len, NULL);
+	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
 	if (ret)
 		goto out;

@@ -514,6 +516,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long charged = 0;
 	bool locked = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);

 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
@@ -541,7 +544,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,

 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap);
+				&locked, &uf, &uf_unmap_early, &uf_unmap);
 		goto out;
 	}

@@ -621,6 +624,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	up_write(&current->mm->mmap_sem);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
+	userfaultfd_unmap_complete(mm, &uf_unmap_early);
 	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap);
 	return ret;

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4891,9 +4891,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
+			mem_hotplug_begin();
 			mutex_lock(&zonelists_mutex);
 			build_all_zonelists(NULL, NULL);
 			mutex_unlock(&zonelists_mutex);
+			mem_hotplug_done();
 		}
 	}
 out:

--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -22,6 +22,7 @@
 #include <linux/frontswap.h>
 #include <linux/blkdev.h>
 #include <linux/uio.h>
+#include <linux/sched/task.h>
 #include <asm/pgtable.h>

 static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -136,6 +137,7 @@ static void end_swap_bio_read(struct bio *bio)
 	WRITE_ONCE(bio->bi_private, NULL);
 	bio_put(bio);
 	wake_up_process(waiter);
+	put_task_struct(waiter);
 }

 int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -378,6 +380,11 @@ int swap_readpage(struct page *page, bool do_poll)
 		goto out;
 	}
 	bdev = bio->bi_bdev;
+	/*
+	 * Keep this task valid during swap readpage because the oom killer may
+	 * attempt to access it in the page fault retry time check.
+	 */
+	get_task_struct(current);
 	bio->bi_private = current;
 	bio_set_op_attrs(bio, REQ_OP_READ, 0);
 	count_vm_event(PSWPIN);

--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -604,6 +604,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
 	tlb_ubc->flush_required = true;

+	/*
+	 * Ensure compiler does not re-order the setting of tlb_flush_batched
+	 * before the PTE is cleared.
+	 */
+	barrier();
+	mm->tlb_flush_batched = true;
+
 	/*
 	 * If the PTE was dirty then it's best to assume it's writable. The
 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)

 	return should_defer;
 }
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and mumap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	if (mm->tlb_flush_batched) {
+		flush_tlb_mm(mm);
+
+		/*
+		 * Do not allow the compiler to re-order the clearing of
+		 * tlb_flush_batched before the tlb is flushed.
+		 */
+		barrier();
+		mm->tlb_flush_batched = false;
+	}
+}
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 {

--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2453,7 +2453,6 @@ void zs_destroy_pool(struct zs_pool *pool)
 	}

 	destroy_cache(pool);
-	kfree(pool->size_class);
 	kfree(pool->name);
 	kfree(pool);
 }