diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 427091fcfcf10b0fb4d8a893b3d5f150ba6ed9e9..b72fbbfe3fcb7013c3f803bdda27d52b8842e050 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -76,10 +76,14 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #1319537 | ARM64_ERRATUM_1319367 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A57 | #1742098 | ARM64_ERRATUM_1742098 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A72 | #853709 | N/A | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A72 | #1319367 | ARM64_ERRATUM_1319367 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A72 | #1655431 | ARM64_ERRATUM_1742098 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1188873,1418040| ARM64_ERRATUM_1418040 | diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ba92a15196cf7dc0891b3dfd977ab8bb450ed2dd..0d42eaf7acb4f919828443ffa2b0263932a1e3e3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1022,6 +1022,17 @@ config ARM_ERRATA_764369 relevant cache maintenance functions and sets a specific bit in the diagnostic control register of the SCU. +config ARM_ERRATA_764319 + bool "ARM errata: Read to DBGPRSR and DBGOSLSR may generate Undefined instruction" + depends on CPU_V7 + help + This option enables the workaround for the 764319 Cortex A-9 erratum. + CP14 read accesses to the DBGPRSR and DBGOSLSR registers generate an + unexpected Undefined Instruction exception when the DBGSWENABLE + external pin is set to 0, even when the CP14 accesses are performed + from a privileged mode. This work around catches the exception in a + way the kernel does not stop execution. + config ARM_ERRATA_775420 bool "ARM errata: A data cache maintenance operation which aborts, might lead to deadlock" depends on CPU_V7 diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index b1423fb130ea4a04d5192ec61beb8453b29b04af..054e9199f30db654e3fba1116b48ccf9ae7bafb6 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -941,6 +941,23 @@ static int hw_breakpoint_pending(unsigned long addr, unsigned int fsr, return ret; } +#ifdef CONFIG_ARM_ERRATA_764319 +static int oslsr_fault; + +static int debug_oslsr_trap(struct pt_regs *regs, unsigned int instr) +{ + oslsr_fault = 1; + instruction_pointer(regs) += 4; + return 0; +} + +static struct undef_hook debug_oslsr_hook = { + .instr_mask = 0xffffffff, + .instr_val = 0xee115e91, + .fn = debug_oslsr_trap, +}; +#endif + /* * One-time initialisation. */ @@ -974,7 +991,16 @@ static bool core_has_os_save_restore(void) case ARM_DEBUG_ARCH_V7_1: return true; case ARM_DEBUG_ARCH_V7_ECP14: +#ifdef CONFIG_ARM_ERRATA_764319 + oslsr_fault = 0; + register_undef_hook(&debug_oslsr_hook); ARM_DBG_READ(c1, c1, 4, oslsr); + unregister_undef_hook(&debug_oslsr_hook); + if (oslsr_fault) + return false; +#else + ARM_DBG_READ(c1, c1, 4, oslsr); +#endif if (oslsr & ARM_OSLSR_OSLM0) return true; fallthrough; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ddbdbc7ecdd94bdb6b5743b6ca2ad924679e5156..3427154e218dfa635e30fbb7c4a0398c380536c0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -492,6 +492,22 @@ config ARM64_ERRATUM_834220 If unsure, say Y. +config ARM64_ERRATUM_1742098 + bool "Cortex-A57/A72: 1742098: ELR recorded incorrectly on interrupt taken between cryptographic instructions in a sequence" + depends on COMPAT + default y + help + This option removes the AES hwcap for aarch32 user-space to + workaround erratum 1742098 on Cortex-A57 and Cortex-A72. + + Affected parts may corrupt the AES state if an interrupt is + taken between a pair of AES instructions. These instructions + are only present if the cryptography extensions are present. + All software should have a fallback implementation for CPUs + that don't implement the cryptography extensions. + + If unsure, say Y. + config ARM64_ERRATUM_845719 bool "Cortex-A53: 845719: a load might read incorrect data" depends on AARCH32_EL0 diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 3ffa6108c96dbfeb86146270d9ddea7c7b7afc02..2f4e8c06674e0644e898ce8f81af16367b502da9 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -72,6 +72,7 @@ #define ARM64_HAS_ECV 64 #define ARM64_HAS_EPAN 65 #define ARM64_SPECTRE_BHB 66 +#define ARM64_WORKAROUND_1742098 67 #define ARM64_NCAPS 80 diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index cc30f82cf9943a518752b24084099958800e5d1e..a975374e140e933828f46382bb32c1b82c59c344 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -427,6 +427,14 @@ static const struct midr_range erratum_1463225[] = { }; #endif +#ifdef CONFIG_ARM64_ERRATUM_1742098 +static struct midr_range broken_aarch32_aes[] = { + MIDR_RANGE(MIDR_CORTEX_A57, 0, 1, 0xf, 0xf), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -626,6 +634,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { 1, 0), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_1742098 + { + .desc = "ARM erratum 1742098", + .capability = ARM64_WORKAROUND_1742098, + CAP_MIDR_RANGE_LIST(broken_aarch32_aes), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + }, +#endif #ifdef CONFIG_HISILICON_ERRATUM_HIP08_RU_PREFETCH { .desc = "HiSilicon HIP08 Cache Readunique Prefetch Disable", diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8401e1e0fe04132a418f21825c1b4b37e033ffd3..d096942b57d6a8abefe96e0659f2ebb3dae59bd4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -1771,6 +1772,14 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) } #endif /* CONFIG_ARM64_MTE */ +static void elf_hwcap_fixup(void) +{ +#ifdef CONFIG_ARM64_ERRATUM_1742098 + if (cpus_have_const_cap(ARM64_WORKAROUND_1742098)) + a32_elf_hwcap2 &= ~COMPAT_HWCAP2_AES; +#endif /* ARM64_ERRATUM_1742098 */ +} + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2837,8 +2846,10 @@ void __init setup_cpu_features(void) setup_system_capabilities(); setup_elf_hwcaps(arm64_elf_hwcaps); - if (system_supports_32bit_el0()) + if (system_supports_32bit_el0()) { setup_elf_hwcaps(a32_elf_hwcaps); + elf_hwcap_fixup(); + } if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); diff --git a/block/blk-core.c b/block/blk-core.c index f0e28624ef9c8abe7c5d2592a5658cea3c25e8e6..06fb25bd24df5ec0709e4d4669c9489dcd7d5b73 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -400,8 +400,7 @@ void blk_cleanup_queue(struct request_queue *q) * prevent that blk_mq_run_hw_queues() accesses the hardware queues * after draining finished. */ - blk_freeze_queue_start(q); - blk_mq_freeze_queue_wait_sync(q); + blk_freeze_queue(q); rq_qos_exit(q); @@ -432,8 +431,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_mq_sched_free_rqs(q); mutex_unlock(&q->sysfs_lock); - percpu_ref_exit(&q->q_usage_counter); - /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } @@ -518,7 +515,6 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); - blk_queue_flag_set(QUEUE_FLAG_USAGE_COUNT_SYNC, q); wake_up_all(&q->mq_freeze_wq); } diff --git a/block/blk-mq.c b/block/blk-mq.c index e0ed97465b0f074ce3f5cbb9723e9949e9ae9ee8..5f8660ef8c37239335f58d961526fab55cfa9773 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -195,7 +195,6 @@ void blk_freeze_queue_start(struct request_queue *q) { mutex_lock(&q->mq_freeze_lock); if (++q->mq_freeze_depth == 1) { - blk_queue_flag_clear(QUEUE_FLAG_USAGE_COUNT_SYNC, q); percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) @@ -206,12 +205,6 @@ void blk_freeze_queue_start(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); -void blk_mq_freeze_queue_wait_sync(struct request_queue *q) -{ - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter) && - test_bit(QUEUE_FLAG_USAGE_COUNT_SYNC, &q->queue_flags)); -} - void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 780f02cbda84ff6d3c5c3d151510d36e1926902d..169e63ee05cbe8280e9d9b527bac959acbefb567 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -726,6 +726,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); + + percpu_ref_exit(&q->q_usage_counter); kmem_cache_free(blk_requestq_cachep, q); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 454d90b785b97dd6e25bf15f5b4cfbdab5fd85e7..27f961aa271489d2e512f95d3f4b972d99ad43eb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5429,7 +5429,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set); md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); raid_bio->bi_next = (void *)rdev; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index c2b6e5c966d047caec06d1c2aac2fb846f480b98..300c63d75df205e07becec442617064e071a9407 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -5831,6 +5831,11 @@ static s32 brcmf_get_assoc_ies(struct brcmf_cfg80211_info *cfg, (struct brcmf_cfg80211_assoc_ielen_le *)cfg->extra_buf; req_len = le32_to_cpu(assoc_info->req_len); resp_len = le32_to_cpu(assoc_info->resp_len); + if (req_len > WL_EXTRA_BUF_MAX || resp_len > WL_EXTRA_BUF_MAX) { + bphy_err(drvr, "invalid lengths in assoc info: req %u resp %u\n", + req_len, resp_len); + return -EINVAL; + } if (req_len) { err = brcmf_fil_iovar_data_get(ifp, "assoc_req_ies", cfg->extra_buf, diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 669aef77a0bd0e2a5bcbcaba7cd6938b2e301ade..c37d2657308cd7062a5ff1d49740e63149e0e93e 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1237,14 +1237,16 @@ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver, { struct tty_struct *tty; - if (driver->ops->lookup) + if (driver->ops->lookup) { if (!file) tty = ERR_PTR(-EIO); else tty = driver->ops->lookup(driver, file, idx); - else + } else { + if (idx >= driver->num) + return ERR_PTR(-EINVAL); tty = driver->ttys[idx]; - + } if (!IS_ERR(tty)) tty_kref_get(tty); return tty; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ed507d27034b1dc804e6482d7403bd6bcc0025cd..213864bc7e8c0f7394d5d065821c5e6cc92bf953 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1613,17 +1613,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... */ -static int fill_files_note(struct memelfnote *note) +static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; + int i; /* *Estimated* file count and total data size needed */ - count = mm->map_count; + count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1645,11 +1644,12 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; - file = vma->vm_file; + file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); @@ -1669,9 +1669,9 @@ static int fill_files_note(struct memelfnote *note) memmove(name_curpos, filename, n); name_curpos += n; - *start_end_ofs++ = vma->vm_start; - *start_end_ofs++ = vma->vm_end; - *start_end_ofs++ = vma->vm_pgoff; + *start_end_ofs++ = m->start; + *start_end_ofs++ = m->end; + *start_end_ofs++ = m->pgoff; count++; } @@ -1682,7 +1682,7 @@ static int fill_files_note(struct memelfnote *note) * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = mm->map_count - count; + n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -1797,7 +1797,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1869,7 +1869,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) + if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, &info->size)) return 0; /* @@ -1878,13 +1878,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); - fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo); info->size += notesize(&info->signote); fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); - if (fill_files_note(&info->files) == 0) + if (fill_files_note(&info->files, cprm) == 0) info->size += notesize(&info->files); return 1; @@ -2026,7 +2026,7 @@ static int elf_note_info_init(struct elf_note_info *info) static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct core_thread *ct; struct elf_thread_status *ets; @@ -2047,13 +2047,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, list_for_each_entry(ets, &info->thread_list, list) { int sz; - sz = elf_dump_thread_status(siginfo->si_signo, ets); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); info->thread_status_size += sz; } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(info->prstatus, current, siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, regs); + fill_prstatus(info->prstatus, current, cprm->siginfo->si_signo); + elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); @@ -2069,18 +2069,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_note(info->notes + 1, "CORE", NT_PRPSINFO, sizeof(*info->psinfo), info->psinfo); - fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); + fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; - if (fill_files_note(info->notes + info->numnote) == 0) { + if (fill_files_note(info->notes + info->numnote, cprm) == 0) { info->notes_files = info->notes + info->numnote; info->numnote++; } /* Try to dump the FPU. */ - info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, - info->fpu); + info->prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); @@ -2166,8 +2166,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs, i; - size_t vma_data_size; + int segs, i; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2175,16 +2174,12 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - struct core_vma_metadata *vma_meta; - - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - return 0; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2198,7 +2193,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm)) goto end_coredump; has_dumped = 1; @@ -2222,7 +2217,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2242,8 +2237,8 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; @@ -2280,8 +2275,8 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; @@ -2299,7 +2294,6 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index be4062b8ba75eef23d71b19de1c52a9e65efd064..5764295a3f0ff93d2e457b5667c6e08e8b9c2db8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1479,7 +1479,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm, static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs; + int segs; int i; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -1494,8 +1494,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - struct core_vma_metadata *vma_meta = NULL; - size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1505,9 +1503,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - goto end_coredump; - for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1527,7 +1522,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1572,7 +1567,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1592,8 +1587,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; size_t sz; @@ -1643,7 +1638,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) + if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1667,7 +1662,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) thread_list = thread_list->next; kfree(tmp); } - kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/coredump.c b/fs/coredump.c index a85987fe868d261674c7515d765ef8b6a849cb23..21cda234685f255cd3b209b175dcb8eb00d90e84 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -53,6 +53,9 @@ #include +static bool dump_vma_snapshot(struct coredump_params *cprm); +static void free_vma_snapshot(struct coredump_params *cprm); + int core_uses_pid; unsigned int core_pipe_limit; char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -602,6 +605,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) * by any locks. */ .mm_flags = mm->flags, + .vma_meta = NULL, }; audit_core_dumps(siginfo->si_signo); @@ -807,9 +811,13 @@ void do_coredump(const kernel_siginfo_t *siginfo) pr_info("Core dump to |%s disabled\n", cn.corename); goto close_fail; } + if (!dump_vma_snapshot(&cprm)) + goto close_fail; + file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); file_end_write(cprm.file); + free_vma_snapshot(&cprm); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -1081,18 +1089,29 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, return gate_vma; } +static void free_vma_snapshot(struct coredump_params *cprm) +{ + if (cprm->vma_meta) { + int i; + for (i = 0; i < cprm->vma_count; i++) { + struct file *file = cprm->vma_meta[i].file; + if (file) + fput(file); + } + kvfree(cprm->vma_meta); + cprm->vma_meta = NULL; + } +} + /* * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. */ -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr) +static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *vma, *gate_vma; struct mm_struct *mm = current->mm; int i; - size_t vma_data_size = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1100,36 +1119,37 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, * mmap_lock in read mode. */ if (mmap_write_lock_killable(mm)) - return -EINTR; + return false; + cprm->vma_data_size = 0; gate_vma = get_gate_vma(mm); - *vma_count = mm->map_count + (gate_vma ? 1 : 0); + cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); - *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); - if (!*vma_meta) { + cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL); + if (!cprm->vma_meta) { mmap_write_unlock(mm); - return -ENOMEM; + return false; } for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma), i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); + m->pgoff = vma->vm_pgoff; + + m->file = vma->vm_file; + if (m->file) + get_file(m->file); } mmap_write_unlock(mm); - if (WARN_ON(i != *vma_count)) { - kvfree(*vma_meta); - return -EFAULT; - } - - for (i = 0; i < *vma_count; i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = cprm->vma_meta + i; if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { char elfmag[SELFMAG]; @@ -1142,9 +1162,8 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, } } - vma_data_size += m->dump_size; + cprm->vma_data_size += m->dump_size; } - *vma_data_size_ptr = vma_data_size; - return 0; + return true; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index da8bd8031119020c720b46190388ebd343eab7b7..ffac35e66bd978dcb7cc9c2c8e0fca2c4ac45e3d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5539,6 +5539,7 @@ static int ext4_load_journal(struct super_block *sb, err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); @@ -5547,6 +5548,14 @@ static int ext4_load_journal(struct super_block *sb, memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); kfree(save); + es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & + EXT4_ERROR_FS); + /* Write out restored error information to the superblock */ + if (!bdev_read_only(sb->s_bdev)) { + int err2; + err2 = ext4_commit_super(sb); + err = err ? : err2; + } } if (err) { @@ -5769,11 +5778,13 @@ static int ext4_clear_journal_err(struct super_block *sb, errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, "Marking fs in need of filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb); + j_errno = ext4_commit_super(sb); + if (j_errno) + return j_errno; + ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 986020dcf02670c33f76eb9aebe122d32f77f132..33ddce5c5bbccb58dc7a79361fffac363793df89 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -21,6 +21,8 @@ #include "../internal.h" +#define IOEND_BATCH_SIZE 4096 + /* * Structure allocated for each page or THP when block size < page size * to track sub-page uptodate status and I/O completions. @@ -1061,7 +1063,7 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static void +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; @@ -1070,6 +1072,7 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) u64 start = bio->bi_iter.bi_sector; loff_t offset = ioend->io_offset; bool quiet = bio_flagged(bio, BIO_QUIET); + u32 folio_count = 0; for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bv; @@ -1085,9 +1088,11 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) next = bio->bi_private; /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bv, bio, iter_all) + bio_for_each_segment_all(bv, bio, iter_all) { iomap_finish_page_writeback(inode, bv->bv_page, error, bv->bv_len); + folio_count++; + } bio_put(bio); } /* The ioend has been freed by bio_put() */ @@ -1097,20 +1102,36 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) "%s: writeback error on inode %lu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, offset, start); } + return folio_count; } +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ void iomap_finish_ioends(struct iomap_ioend *ioend, int error) { struct list_head tmp; + u32 completions; + + might_sleep(); list_replace_init(&ioend->io_list, &tmp); - iomap_finish_ioend(ioend, error); + completions = iomap_finish_ioend(ioend, error); while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); list_del_init(&ioend->io_list); - iomap_finish_ioend(ioend, error); + completions += iomap_finish_ioend(ioend, error); } } EXPORT_SYMBOL_GPL(iomap_finish_ioends); @@ -1131,6 +1152,18 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) return false; if (ioend->io_offset + ioend->io_size != next->io_offset) return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) + return false; return true; } @@ -1236,9 +1269,11 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; + ioend->io_folios = 0; ioend->io_offset = offset; ioend->io_private = NULL; ioend->io_bio = bio; + ioend->io_sector = sector; return ioend; } @@ -1279,6 +1314,13 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, return false; if (sector != bio_end_sector(wpc->ioend->io_bio)) return false; + /* + * Limit ioend bio chain lengths to minimise IO completion latency. This + * also prevents long tight loops ending page writeback on all the + * folios in the ioend. + */ + if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + return false; return true; } @@ -1372,6 +1414,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, &submit_list); count++; } + if (count) + wpc->ioend->io_folios++; WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); WARN_ON_ONCE(!PageLocked(page)); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index eecf78e5d754f8f942e5b43d452ec0849f2a8c19..a4ee2954346f7a0dba5150becf9f6102b5040b8d 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -98,6 +98,12 @@ static struct inode *ntfs_read_mft(struct inode *inode, /* Record should contain $I30 root. */ is_dir = rec->flags & RECORD_FLAG_DIR; + /* MFT_REC_MFT is not a dir */ + if (is_dir && ino == MFT_REC_MFT) { + err = -EINVAL; + goto out; + } + inode->i_generation = le16_to_cpu(rec->seq); /* Enumerate all struct Attributes MFT. */ @@ -129,6 +135,9 @@ static struct inode *ntfs_read_mft(struct inode *inode, rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size); asize = le32_to_cpu(attr->size); + if (le16_to_cpu(attr->name_off) + attr->name_len > asize) + goto out; + switch (attr->type) { case ATTR_STD: if (attr->non_res || diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 861e35791506e801dc446414d935d5463e64a1fb..b06af26c00a07e9e7f7e763590ec0e7d02f3012f 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -260,6 +260,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (t16 + t32 > asize) return NULL; + if (attr->name_len && + le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > t16) { + return NULL; + } + return attr; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e341d6531e687ce94c863bb82214546c6bdeaea0..d4d531c78ae4e9f75c5ddbcc6b7334764e2129ca 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -191,7 +191,20 @@ xfs_ioend_merge_private( } } -/* Finish all pending io completions. */ +/* + * Finish all pending IO completions that require transactional modifications. + * + * We try to merge physical and logically contiguous ioends before completion to + * minimise the number of transactions we need to perform during IO completion. + * Both unwritten extent conversion and COW remapping need to iterate and modify + * one physical extent at a time, so we gain nothing by merging physically + * discontiguous extents here. + * + * The ioend chain length that we can be processing here is largely unbound in + * length and we may have to perform significant amounts of work on each ioend + * to complete it. Hence we have to be careful about holding the CPU for too + * long in this loop. + */ void xfs_end_io( struct work_struct *work) @@ -212,6 +225,7 @@ xfs_end_io( list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private); xfs_end_ioend(ioend); + cond_resched(); } } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 0571701ab1c57cc0d3dcddb119ea0c503e1ba641..1716a07a9809b5e7e0b863d6e850ac6886899125 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -82,6 +82,9 @@ struct coredump_params { unsigned long mm_flags; loff_t written; loff_t pos; + KABI_EXTEND(int vma_count) + KABI_EXTEND(size_t vma_data_size) + KABI_EXTEND(struct core_vma_metadata *vma_meta) }; /* diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f3d78b939e0145b6a79a2f61e57f2193af1b8750..2eca0b76b3e95736724d58329e0fd77a12de2c63 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -566,7 +566,6 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); -void blk_mq_freeze_queue_wait_sync(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b04613bc3ed57092ac056f1380b30f0da5c58812..9ede0a81e46676b5f445726482e15ddc98d55cf7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -643,8 +643,6 @@ struct request_queue { #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ /*at least one blk-mq hctx can't get driver tag */ #define QUEUE_FLAG_HCTX_WAIT 30 -/* sync for q_usage_counter */ -#define QUEUE_FLAG_USAGE_COUNT_SYNC 31 #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ diff --git a/include/linux/coredump.h b/include/linux/coredump.h index e58e8c2077828d3d3f7f526f34976fff0062d40b..5300d0cb70f7ee0d8613c276106b53cc3bdcf8b9 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -11,6 +11,8 @@ struct core_vma_metadata { unsigned long start, end; unsigned long flags; unsigned long dump_size; + unsigned long pgoff; + struct file *file; }; /* @@ -24,9 +26,6 @@ extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else diff --git a/include/linux/filter.h b/include/linux/filter.h index 45ba2a61f9e55146f1c092d8936757200cd7f313..0418d88aa0f3d09a76c10335be84ff65c5399a58 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -578,12 +578,13 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ struct bpf_prog_stats *__stats; \ u64 __start = sched_clock(); \ + unsigned long flags; \ __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ __stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&__stats->syncp); \ + flags = u64_stats_update_begin_irqsave(&__stats->syncp);\ __stats->cnt++; \ __stats->nsecs += sched_clock() - __start; \ - u64_stats_update_end(&__stats->syncp); \ + u64_stats_update_end_irqrestore(&__stats->syncp, flags);\ } else { \ __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ } \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d84fc7e55e219a78541263832c5a6c166e650e60..0c95321f42fdc1cc7e1508c9b60f23fd793cb509 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -199,10 +199,12 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ + u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ void *io_private; /* file system private data */ + sector_t io_sector; /* start sector of ioend */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index fba28f17e61aa7b353cb8d42b3fef9eae740c0d3..a8326316992df08148e4c81e7859c464f3361c56 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3675,6 +3675,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env, break; } + if (btf_type_is_resolve_source_only(arg_type)) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + return -EINVAL; + } + if (args[i].name_off && (!btf_name_offset_valid(btf, args[i].name_off) || !btf_name_valid_identifier(btf, args[i].name_off))) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 87becf77cc759f1155cf36fe6ebad225866519ed..02637c6cedde4c8842efc76942b26a9837d96edb 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -526,11 +526,13 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) * Hence check that 'start' is not zero. */ start) { + unsigned long flags; + stats = this_cpu_ptr(prog->aux->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } migrate_enable(); rcu_read_unlock(); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 72513ed2a5fc664be5d3a4a1ac83ec3d365c6370..0df62a3a1f3742a6f8c6e00e81477b85d1ac0113 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -144,7 +144,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; if (type) @@ -238,7 +238,7 @@ static ssize_t default_affinity_write(struct file *file, cpumask_var_t new_value; int err; - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); diff --git a/kernel/profile.c b/kernel/profile.c index 737b1c704aa888fea351aee5a6a3ac183a8b3ac8..0db1122855c0d5b8f94692d755dd24104c03446b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -438,7 +438,7 @@ static ssize_t prof_cpu_mask_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); diff --git a/kernel/softirq.c b/kernel/softirq.c index 09229ad822096c618060be948c3c4f2914d759cf..19668d614f4738d511df3289d650e55ca3bb7c8d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -376,7 +376,7 @@ static inline void invoke_softirq(void) if (ksoftirqd_running(local_softirq_pending())) return; - if (!force_irqthreads) { + if (!force_irqthreads || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 81a7b622c691703e8dca0d6f5cd47ce2916e1f37..b1e255e4f8e7cedd540fcb0da631e987b74d6c63 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1072,6 +1072,9 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; + /* Task should not be pid=1 to avoid kernel panic. */ + if (unlikely(is_global_init(current))) + return -EPERM; if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4e130e2bb566999a44dcb32bfbd021ea93f54e53..951181d5b9fbb8229b2cfd14982a9321cc616b7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4792,7 +4792,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, cpumask_var_t tracing_cpumask_new; int err; - if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); diff --git a/mm/compaction.c b/mm/compaction.c index 80116210c2e64ec2db3d95c87eabbb052dae072e..8e8017d909b745973d44c75eeea1bc7e11a9d5f8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1926,7 +1926,9 @@ static inline bool is_via_compact_memory(int order) static bool kswapd_is_running(pg_data_t *pgdat) { - return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING); + struct task_struct *t = READ_ONCE(pgdat->kswapd); + + return t && (t->state == TASK_RUNNING); } /* @@ -2947,7 +2949,8 @@ static int kcompactd_cpu_online(unsigned int cpu) if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kcompactd, mask); + if (pgdat->kcompactd) + set_cpus_allowed_ptr(pgdat->kcompactd, mask); } return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7456d825414da42fc463c39bf6fd01b159f601f8..d2dd2bfcaac32c423aa5f1d98f3d51b6d1def35c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1483,8 +1483,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { - kswapd_stop(node); kcompactd_stop(node); + kswapd_stop(node); } writeback_set_ratelimit(); diff --git a/mm/swapfile.c b/mm/swapfile.c index eaf483c7c83e7691297d12b818bfe20ffa1104e8..9fe12f0b92058760582d28cb3f3f41f1b607a327 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1104,6 +1104,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); + cond_resched(); spin_lock(&swap_avail_lock); nextsi: diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a97d3011aa9ace1699fb796e95006c83c58399c5..dadbea29241dc9417b573171999ed4f9f26a2e61 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2637,17 +2637,14 @@ static void __vunmap(const void *addr, int deallocate_pages) vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { + unsigned int page_order = vm_area_page_order(area); int i; - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i]; BUG_ON(!page); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - __free_pages(page, 0); + __free_pages(page, page_order); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2927,7 +2924,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; int p; - page = alloc_pages_node(node, gfp_mask, page_order); + /* Compound pages required for remap_vmalloc_page */ + page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; @@ -2939,16 +2937,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - /* - * Higher order allocations must be able to be treated as - * indepdenent small pages by callers (as they can with - * small-page vmallocs). Some drivers do their own refcounting - * on vmalloc_to_page() pages, some use page->mapping, - * page->lru, etc. - */ - if (page_order) - split_page(page, page_order); - for (p = 0; p < (1U << page_order); p++) area->pages[i + p] = page + p; diff --git a/mm/vmscan.c b/mm/vmscan.c index 1b333a4d247a999aa905c90c2d69a181950e6713..a177de05a1a4c9c2912f44b16767ade56ac84dc4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4329,17 +4329,19 @@ int kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); int ret = 0; + struct task_struct *t; - if (pgdat->kswapd) + if (READ_ONCE(pgdat->kswapd)) return 0; - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { + t = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(t)) { /* failure at boot is fatal */ BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); - ret = PTR_ERR(pgdat->kswapd); - pgdat->kswapd = NULL; + ret = PTR_ERR(t); + } else { + WRITE_ONCE(pgdat->kswapd, t); } return ret; } @@ -4350,11 +4352,11 @@ int kswapd_run(int nid) */ void kswapd_stop(int nid) { - struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + struct task_struct *kswapd = READ_ONCE(NODE_DATA(nid)->kswapd); if (kswapd) { kthread_stop(kswapd); - NODE_DATA(nid)->kswapd = NULL; + WRITE_ONCE(NODE_DATA(nid)->kswapd, NULL); } } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index eaf2308c355a6ed9f9ecafc660f296fbbd1ea698..ddb1730cdf9b5e369e3a123244b29916719cefda 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -273,6 +273,9 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!len) + return 0; + psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e9a8a2c86bbdd033b3dca556fcdb1e79b835b5cd..56612caf7833e6ed453f36d5b71b38b239c67dcd 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -338,6 +339,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcf_result cr = {}; int err, balloc = 0; struct tcf_exts e; + bool update_h = false; err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) @@ -455,10 +457,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } } - if (cp->perfect) + if (cp->perfect) { r = cp->perfect + handle; - else - r = tcindex_lookup(cp, handle) ? : &new_filter_result; + } else { + /* imperfect area is updated in-place using rcu */ + update_h = !!tcindex_lookup(cp, handle); + r = &new_filter_result; + } if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); @@ -492,7 +497,28 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, rcu_assign_pointer(tp->root, cp); - if (r == &new_filter_result) { + if (update_h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *cf; + + f->result.res = r->res; + tcf_exts_change(&f->result.exts, &r->exts); + + /* imperfect area bucket */ + fp = cp->h + (handle % cp->hash); + + /* lookup the filter, guaranteed to exist */ + for (cf = rcu_dereference_bh_rtnl(*fp); cf; + fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) + if (cf->key == (u16)handle) + break; + + f->next = cf->next; + + cf = rcu_replace_pointer(*fp, f, 1); + tcf_exts_get_net(&cf->result.exts); + tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work); + } else if (r == &new_filter_result) { struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e537085b184fe5bc753456574c197d505397fa5f..54863e68f30405a9d1958bb3105b307cd859a257 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -386,13 +386,11 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, rc = -EINVAL; goto out; } - lock_sock(sk); memcpy(crypto_info_aes_gcm_128->iv, cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); memcpy(crypto_info_aes_gcm_128->rec_seq, cctx->rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); - release_sock(sk); if (copy_to_user(optval, crypto_info_aes_gcm_128, sizeof(*crypto_info_aes_gcm_128))) @@ -410,13 +408,11 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, rc = -EINVAL; goto out; } - lock_sock(sk); memcpy(crypto_info_aes_gcm_256->iv, cctx->iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE, TLS_CIPHER_AES_GCM_256_IV_SIZE); memcpy(crypto_info_aes_gcm_256->rec_seq, cctx->rec_seq, TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE); - release_sock(sk); if (copy_to_user(optval, crypto_info_aes_gcm_256, sizeof(*crypto_info_aes_gcm_256))) @@ -436,6 +432,8 @@ static int do_tls_getsockopt(struct sock *sk, int optname, { int rc = 0; + lock_sock(sk); + switch (optname) { case TLS_TX: case TLS_RX: @@ -446,6 +444,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, rc = -ENOPROTOOPT; break; } + + release_sock(sk); + return rc; } diff --git a/net/unix/diag.c b/net/unix/diag.c index 9ff64f9df1f3bbfed3cb2d4834d53006126db20f..951b33fa8f5cf0748bc4ab424273051f9d4fce4f 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -113,14 +113,16 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql); } -static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb) +static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb, + struct user_namespace *user_ns) { - uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk)); + uid_t uid = from_kuid_munged(user_ns, sock_i_uid(sk)); return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, - u32 portid, u32 seq, u32 flags, int sk_ino) + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct unix_diag_msg *rep; @@ -166,7 +168,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_UID) && - sk_diag_dump_uid(sk, skb)) + sk_diag_dump_uid(sk, skb, user_ns)) goto out_nlmsg_trim; nlmsg_end(skb, nlh); @@ -178,7 +180,8 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, - u32 portid, u32 seq, u32 flags) + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags) { int sk_ino; @@ -189,7 +192,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if (!sk_ino) return 0; - return sk_diag_fill(sk, skb, req, portid, seq, flags, sk_ino); + return sk_diag_fill(sk, skb, req, user_ns, portid, seq, flags, sk_ino); } static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) @@ -217,7 +220,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto next; if (!(req->udiag_states & (1 << sk->sk_state))) goto next; - if (sk_diag_dump(sk, skb, req, + if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) @@ -285,7 +288,8 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, if (!rep) goto out; - err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).portid, + err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, req->udiag_ino); if (err < 0) { nlmsg_free(rep); diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index d9f4599dee40a5af77a34148cec042ab7bcb06a4..6ecaf6834844e9fa154bf4b28e5b9512fb07255d 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -133,7 +133,9 @@ int ima_store_template(struct ima_template_entry *entry, entry->pcr = pcr; result = ima_add_template_entry(entry, violation, op, inode, filename); - if (!result && duplicated_entry) { + if (result) { + kfree(duplicated_entry); + } else if (duplicated_entry) { result = ima_add_template_entry(duplicated_entry, violation, op, inode, filename); if (result < 0)