diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 8602f06c759f7eec2786eaa17df10b6a6720e8ea..1a50e09c945bd41564a0ef94e43e3f19d8d05b88 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode) * * On Xen, we don't appear to have that guarantee, but Xen still * supplies a valid seqlock using the version field. - + * * We only do pvclock vdso timing at all if * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to * mean that all vCPUs have matching pvti and that the TSC is * synced, so we can just look at vCPU 0's pvti. */ - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { - *mode = VCLOCK_NONE; - return 0; - } - do { version = pvti->version; smp_rmb(); + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { + *mode = VCLOCK_NONE; + return 0; + } + tsc = rdtsc_ordered(); pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; pvti_tsc_shift = pvti->tsc_shift; diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 4fa687a47a62d85d4fd84062c2a5fc7c622c1cc7..6b8d6e8cd4494ad4938a444ed48816c3089e495f 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -27,7 +27,7 @@ #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ -#define BOOT_HEAP_SIZE 0x8000 +#define BOOT_HEAP_SIZE 0x10000 #endif /* !CONFIG_KERNEL_BZIP2 */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index eadcdd5bb9464af38645499004fee0755d6b071f..0fd440df63f18d77c040f8683f34018ed426b4b2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); +extern u64 fpu__get_supported_xfeatures_mask(void); /* * Debugging facility: diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 3a6c89b7030757e707a77be75a5a4d3a30e2982f..af30fdeb140da75691e91c52446f7f1bb7fe2e93 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -20,15 +20,16 @@ /* Supported features which support lazy state saving */ #define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE | \ + XFEATURE_MASK_SSE) + +/* Supported features which require eager state saving */ +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_YMM | \ - XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM) -/* Supported features which require eager state saving */ -#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) - /* All currently supported features */ #define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 3bbc07a57a31e9d68f232fd56651be6788491f95..73d0c9b92087c4eb64f834c21df64889529f72fd 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -12,7 +12,9 @@ #define GUEST_PL 1 /* Page for Switcher text itself, then two pages per cpu */ -#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids) +#define SWITCHER_TEXT_PAGES (1) +#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) +#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) /* Where we map the Switcher, in both Host and Guest. */ extern unsigned long switcher_addr; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 379cd3658799f3fb0623e5dfca5c3e7fde4a7a23..bfd9b2a35a0b14aa879ac9973202ef882dae52d7 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif cpumask_set_cpu(cpu, mm_cpumask(next)); - /* Re-load page tables */ + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ @@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * schedule, protecting us from simultaneous changes. */ cpumask_set_cpu(cpu, mm_cpumask(next)); + /* * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e678ddeed03067c3e1d63ebc3b7bd53239d3f8cb..a07956a08936e8ea56c1a73075d700d281895577 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) */ int ht_nodeid = c->initial_apicid; - if (ht_nodeid >= 0 && - __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7b2978ab30df478669bbf10b2304e2fac04d1235..6d9f0a7ef4c8e5da3d596444242de7478d7b9356 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -3,8 +3,11 @@ */ #include #include +#include +#include #include +#include /* * Initialize the TS bit in CR0 according to the style of context-switches @@ -270,20 +273,52 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -static int __init eager_fpu_setup(char *s) +/* + * Find supported xfeatures based on cpu features and command-line input. + * This must be called after fpu__init_parse_early_param() is called and + * xfeatures_mask is enumerated. + */ +u64 __init fpu__get_supported_xfeatures_mask(void) { - if (!strcmp(s, "on")) - eagerfpu = ENABLE; - else if (!strcmp(s, "off")) - eagerfpu = DISABLE; - else if (!strcmp(s, "auto")) - eagerfpu = AUTO; - return 1; + /* Support all xfeatures known to us */ + if (eagerfpu != DISABLE) + return XCNTXT_MASK; + + /* Warning of xfeatures being disabled for no eagerfpu mode */ + if (xfeatures_mask & XFEATURE_MASK_EAGER) { + pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", + xfeatures_mask & XFEATURE_MASK_EAGER); + } + + /* Return a mask that masks out all features requiring eagerfpu mode */ + return ~XFEATURE_MASK_EAGER; +} + +/* + * Disable features dependent on eagerfpu. + */ +static void __init fpu__clear_eager_fpu_features(void) +{ + setup_clear_cpu_cap(X86_FEATURE_MPX); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); } -__setup("eagerfpu=", eager_fpu_setup); /* * Pick the FPU context switching strategy: + * + * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of + * the following is true: + * + * (1) the cpu has xsaveopt, as it has the optimization and doing eager + * FPU switching has a relatively low cost compared to a plain xsave; + * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU + * switching. Should the kernel boot with noxsaveopt, we support MPX + * with eager FPU switching at a higher cost. */ static void __init fpu__init_system_ctx_switch(void) { @@ -295,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(current->thread.fpu.fpstate_active); current_thread_info()->status = 0; - /* Auto enable eagerfpu for xsaveopt */ if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; - if (xfeatures_mask & XFEATURE_MASK_EAGER) { - if (eagerfpu == DISABLE) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XFEATURE_MASK_EAGER); - xfeatures_mask &= ~XFEATURE_MASK_EAGER; - } else { - eagerfpu = ENABLE; - } - } + if (xfeatures_mask & XFEATURE_MASK_EAGER) + eagerfpu = ENABLE; if (eagerfpu == ENABLE) setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); @@ -315,12 +342,49 @@ static void __init fpu__init_system_ctx_switch(void) printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); } +/* + * We parse fpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init fpu__init_parse_early_param(void) +{ + /* + * No need to check "eagerfpu=auto" again, since it is the + * initial default. + */ + if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { + eagerfpu = DISABLE; + fpu__clear_eager_fpu_features(); + } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { + eagerfpu = ENABLE; + } + + if (cmdline_find_option_bool(boot_command_line, "no387")) + setup_clear_cpu_cap(X86_FEATURE_FPU); + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) { + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); + setup_clear_cpu_cap(X86_FEATURE_XMM); + } + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + fpu__xstate_clear_all_cpu_caps(); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); +} + /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ void __init fpu__init_system(struct cpuinfo_x86 *c) { + fpu__init_parse_early_param(); fpu__init_system_early_generic(c); /* @@ -344,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_ctx_switch(); } - -/* - * Boot parameter to turn off FPU support and fall back to math-emu: - */ -static int __init no_387(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FPU); - return 1; -} -__setup("no387", no_387); - -/* - * Disable all xstate CPU features: - */ -static int __init x86_noxsave_setup(char *s) -{ - if (strlen(s)) - return 0; - - fpu__xstate_clear_all_cpu_caps(); - - return 1; -} -__setup("noxsave", x86_noxsave_setup); - -/* - * Disable the XSAVEOPT instruction specifically: - */ -static int __init x86_noxsaveopt_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - return 1; -} -__setup("noxsaveopt", x86_noxsaveopt_setup); - -/* - * Disable the XSAVES instruction: - */ -static int __init x86_noxsaves_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - return 1; -} -__setup("noxsaves", x86_noxsaves_setup); - -/* - * Disable FX save/restore and SSE support: - */ -static int __init x86_nofxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); - setup_clear_cpu_cap(X86_FEATURE_XMM); - - return 1; -} -__setup("nofxsr", x86_nofxsr_setup); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 40f100285984eb9f73876115f263c53c85f86a59..d425cda5ae6d0f1544c5d8decdb14d8682d7f7dd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); setup_clear_cpu_cap(X86_FEATURE_MPX); + setup_clear_cpu_cap(X86_FEATURE_XGETBV1); } /* @@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void) BUG(); } - /* Support only the state known to the OS: */ - xfeatures_mask = xfeatures_mask & XCNTXT_MASK; + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d64889aa2d46d50fec76d7c653ac31094c8b0a30..ab0adc0fa5db4da281de34e46c71e2cfbb707c07 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the iMac10,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac10,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"), + }, + }, /* ASRock */ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 4cf401f581e7e720f858393476eb895a440d0ab2..07efb35ee4bc3fd5f4452af68214de79dc936716 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -48,31 +48,31 @@ verify_cpu: pushfl popl %eax cmpl %eax,%ebx - jz verify_cpu_no_longmode # cpu has no cpuid + jz .Lverify_cpu_no_longmode # cpu has no cpuid #endif movl $0x0,%eax # See if cpuid 1 is implemented cpuid cmpl $0x1,%eax - jb verify_cpu_no_longmode # no cpuid 1 + jb .Lverify_cpu_no_longmode # no cpuid 1 xor %di,%di cmpl $0x68747541,%ebx # AuthenticAMD - jnz verify_cpu_noamd + jnz .Lverify_cpu_noamd cmpl $0x69746e65,%edx - jnz verify_cpu_noamd + jnz .Lverify_cpu_noamd cmpl $0x444d4163,%ecx - jnz verify_cpu_noamd + jnz .Lverify_cpu_noamd mov $1,%di # cpu is from AMD - jmp verify_cpu_check + jmp .Lverify_cpu_check -verify_cpu_noamd: +.Lverify_cpu_noamd: cmpl $0x756e6547,%ebx # GenuineIntel? - jnz verify_cpu_check + jnz .Lverify_cpu_check cmpl $0x49656e69,%edx - jnz verify_cpu_check + jnz .Lverify_cpu_check cmpl $0x6c65746e,%ecx - jnz verify_cpu_check + jnz .Lverify_cpu_check # only call IA32_MISC_ENABLE when: # family > 6 || (family == 6 && model >= 0xd) @@ -83,59 +83,59 @@ verify_cpu_noamd: andl $0x0ff00f00, %eax # mask family and extended family shrl $8, %eax cmpl $6, %eax - ja verify_cpu_clear_xd # family > 6, ok - jb verify_cpu_check # family < 6, skip + ja .Lverify_cpu_clear_xd # family > 6, ok + jb .Lverify_cpu_check # family < 6, skip andl $0x000f00f0, %ecx # mask model and extended model shrl $4, %ecx cmpl $0xd, %ecx - jb verify_cpu_check # family == 6, model < 0xd, skip + jb .Lverify_cpu_check # family == 6, model < 0xd, skip -verify_cpu_clear_xd: +.Lverify_cpu_clear_xd: movl $MSR_IA32_MISC_ENABLE, %ecx rdmsr btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE - jnc verify_cpu_check # only write MSR if bit was changed + jnc .Lverify_cpu_check # only write MSR if bit was changed wrmsr -verify_cpu_check: +.Lverify_cpu_check: movl $0x1,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK0,%edx xorl $REQUIRED_MASK0,%edx - jnz verify_cpu_no_longmode + jnz .Lverify_cpu_no_longmode movl $0x80000000,%eax # See if extended cpuid is implemented cpuid cmpl $0x80000001,%eax - jb verify_cpu_no_longmode # no extended cpuid + jb .Lverify_cpu_no_longmode # no extended cpuid movl $0x80000001,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK1,%edx xorl $REQUIRED_MASK1,%edx - jnz verify_cpu_no_longmode + jnz .Lverify_cpu_no_longmode -verify_cpu_sse_test: +.Lverify_cpu_sse_test: movl $1,%eax cpuid andl $SSE_MASK,%edx cmpl $SSE_MASK,%edx - je verify_cpu_sse_ok + je .Lverify_cpu_sse_ok test %di,%di - jz verify_cpu_no_longmode # only try to force SSE on AMD + jz .Lverify_cpu_no_longmode # only try to force SSE on AMD movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr xor %di,%di # don't loop - jmp verify_cpu_sse_test # try again + jmp .Lverify_cpu_sse_test # try again -verify_cpu_no_longmode: +.Lverify_cpu_no_longmode: popf # Restore caller passed flags movl $1,%eax ret -verify_cpu_sse_ok: +.Lverify_cpu_sse_ok: popf # Restore caller passed flags xorl %eax, %eax ret diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec081fe0ce2c10246fec7126dd213aa6b4d75dbb..8829482d69ec2872adaeaeb9bd68539c1b6dda38 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, if (phys_addr < (phys_addr_t)0x40000000) return; - if (IS_ALIGNED(addr, PAGE_SIZE) && - IS_ALIGNED(next, PAGE_SIZE)) { + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { /* * Do not free direct mapping pages since they were * freed when offlining, or simplely not in use. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6000ad7f560c399226f06612c406268ac6a68820..fc6a4c8f6e2abc79c29803264ad16a48067904e8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages) static void split_page_count(int level) { + if (direct_pages_count[level] == 0) + return; + direct_pages_count[level]--; direct_pages_count[level - 1] += PTRS_PER_PTE; } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8ddb5d0d66fb6f6353e735760489a47a0b98dea7..8f4cc3dfac322a2911ab2b2e8471c6e8e87d8af0 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -161,7 +161,10 @@ void flush_tlb_current_task(void) preempt_disable(); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + + /* This is an implicit full barrier that synchronizes with switch_mm. */ local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); - if (current->active_mm != mm) + if (current->active_mm != mm) { + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; + } if (!current->mm) { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; } if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) base_pages_to_flush = (end - start) >> PAGE_SHIFT; + /* + * Both branches below are implicit full barriers (MOV to CR or + * INVLPG) that synchronize with switch_mm. + */ if (base_pages_to_flush > tlb_single_page_flush_ceiling) { base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); @@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) preempt_disable(); if (current->active_mm == mm) { - if (current->mm) + if (current->mm) { + /* + * Implicit full barrier (INVLPG) that synchronizes + * with switch_mm. + */ __flush_tlb_one(start); - else + } else { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + } } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 312ffd3d00177ca5a5e21393377c760e9bdec91e..9e385b38debfd1c9d6a0b374ee8a44098f577e0d 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -22,7 +22,8 @@ unsigned long switcher_addr; struct page **lg_switcher_pages; -static struct vm_struct *switcher_vma; +static struct vm_struct *switcher_text_vma; +static struct vm_struct *switcher_stacks_vma; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); @@ -82,55 +83,81 @@ static __init int map_switcher(void) } } + /* + * Copy in the compiled-in Switcher code (from x86/switcher_32.S). + * It goes in the first page, which we map in momentarily. + */ + memcpy(kmap(lg_switcher_pages[0]), start_switcher_text, + end_switcher_text - start_switcher_text); + kunmap(lg_switcher_pages[0]); + /* * We place the Switcher underneath the fixmap area, which is the * highest virtual address we can get. This is important, since we * tell the Guest it can't access this memory, so we want its ceiling * as high as possible. */ - switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; + switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE; /* - * Now we reserve the "virtual memory area" we want. We might - * not get it in theory, but in practice it's worked so far. - * The end address needs +1 because __get_vm_area allocates an - * extra guard page, so we need space for that. + * Now we reserve the "virtual memory area"s we want. We might + * not get them in theory, but in practice it's worked so far. + * + * We want the switcher text to be read-only and executable, and + * the stacks to be read-write and non-executable. */ - switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, - VM_ALLOC, switcher_addr, switcher_addr - + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); - if (!switcher_vma) { + switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD, + switcher_addr, + switcher_addr + PAGE_SIZE); + + if (!switcher_text_vma) { err = -ENOMEM; printk("lguest: could not map switcher pages high\n"); goto free_pages; } + switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE, + VM_ALLOC|VM_NO_GUARD, + switcher_addr + PAGE_SIZE, + switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE); + if (!switcher_stacks_vma) { + err = -ENOMEM; + printk("lguest: could not map switcher pages high\n"); + goto free_text_vma; + } + /* * This code actually sets up the pages we've allocated to appear at * switcher_addr. map_vm_area() takes the vma we allocated above, the - * kind of pages we're mapping (kernel pages), and a pointer to our - * array of struct pages. + * kind of pages we're mapping (kernel text pages and kernel writable + * pages respectively), and a pointer to our array of struct pages. */ - err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages); + err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages); + if (err) { + printk("lguest: text map_vm_area failed: %i\n", err); + goto free_vmas; + } + + err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL, + lg_switcher_pages + SWITCHER_TEXT_PAGES); if (err) { - printk("lguest: map_vm_area failed: %i\n", err); - goto free_vma; + printk("lguest: stacks map_vm_area failed: %i\n", err); + goto free_vmas; } /* * Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from x86/switcher_32.S). */ - memcpy(switcher_vma->addr, start_switcher_text, - end_switcher_text - start_switcher_text); - printk(KERN_INFO "lguest: mapped switcher at %p\n", - switcher_vma->addr); + switcher_text_vma->addr); /* And we succeeded... */ return 0; -free_vma: - vunmap(switcher_vma->addr); +free_vmas: + /* Undoes map_vm_area and __get_vm_area */ + vunmap(switcher_stacks_vma->addr); +free_text_vma: + vunmap(switcher_text_vma->addr); free_pages: i = TOTAL_SWITCHER_PAGES; free_some_pages: @@ -148,7 +175,8 @@ static void unmap_switcher(void) unsigned int i; /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */ - vunmap(switcher_vma->addr); + vunmap(switcher_text_vma->addr); + vunmap(switcher_stacks_vma->addr); /* Now we just need to free the pages we copied the switcher into */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) __free_pages(lg_switcher_pages[i], 0); diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index eabcff4119840aaf663da3bbd0a8c4154b072881..d0c473f6585058053d83ea2e6a3eee46321668a1 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,9 +4,11 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ - test_FCMOV test_FCOMI test_FISTTP + test_FCMOV test_FCOMI test_FISTTP \ + ldt_gdt \ + vdso_restorer TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c new file mode 100644 index 0000000000000000000000000000000000000000..cb038424a40327207100966e3a8fc4afcd6d4c4f --- /dev/null +++ b/tools/testing/selftests/x86/vdso_restorer.c @@ -0,0 +1,88 @@ +/* + * vdso_restorer.c - tests vDSO-based signal restore + * Copyright (c) 2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This makes sure that sa_restorer == NULL keeps working on 32-bit + * configurations. Modern glibc doesn't use it under any circumstances, + * so it's easy to overlook breakage. + * + * 64-bit userspace has never supported sa_restorer == NULL, so this is + * 32-bit only. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +/* Open-code this -- the headers are too messy to easily use them. */ +struct real_sigaction { + void *handler; + unsigned long flags; + void *restorer; + unsigned int mask[2]; +}; + +static volatile sig_atomic_t handler_called; + +static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void) +{ + handler_called = 1; +} + +static void handler_without_siginfo(int sig) +{ + handler_called = 1; +} + +int main() +{ + int nerrs = 0; + struct real_sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.handler = handler_with_siginfo; + sa.flags = SA_SIGINFO; + sa.restorer = NULL; /* request kernel-provided restorer */ + + if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0) + err(1, "raw rt_sigaction syscall"); + + raise(SIGUSR1); + + if (handler_called) { + printf("[OK]\tSA_SIGINFO handler returned successfully\n"); + } else { + printf("[FAIL]\tSA_SIGINFO handler was not called\n"); + nerrs++; + } + + sa.flags = 0; + sa.handler = handler_without_siginfo; + if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0) + err(1, "raw sigaction syscall"); + handler_called = 0; + + raise(SIGUSR1); + + if (handler_called) { + printf("[OK]\t!SA_SIGINFO handler returned successfully\n"); + } else { + printf("[FAIL]\t!SA_SIGINFO handler was not called\n"); + nerrs++; + } +}