diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 867bc5bea8dcf57c7616cb33a428eb7a565610fa..faff6934c05a21f874ce5839f4b9b8542f9c02e5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1295,14 +1295,14 @@ config ARCH_DMA_ADDR_T_64BIT
 	def_bool y
 	depends on X86_64 || HIGHMEM64G
 
-config DIRECT_GBPAGES
-	bool "Enable 1GB pages for kernel pagetables" if EXPERT
-	default y
-	depends on X86_64
+config X86_DIRECT_GBPAGES
+	def_bool y
+	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
 	---help---
-	  Allow the kernel linear mapping to use 1GB pages on CPUs that
-	  support it. This can improve the kernel's performance a tiny bit by
-	  reducing TLB pressure. If in doubt, say "Y".
+	  Certain kernel features effectively disable kernel
+	  linear 1 GB mappings (even if the CPU otherwise
+	  supports them), so don't confuse the user by printing
+	  that we have them enabled.
 
 # Common NUMA Features
 config NUMA
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 3563107b50601d7eb11e767340c7c25aa87b32dd..935588d95c82fa35b68e3638d9acfe0a2734b373 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -366,6 +366,7 @@ enum align_flags {
 struct va_alignment {
 	int flags;
 	unsigned long mask;
+	unsigned long bits;
 } ____cacheline_aligned;
 
 extern struct va_alignment va_align;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd9e50500297ef596e549c17db2cbd5a630e9756..fd470ebf924e574e827f1942b973075d3168a9b6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 
 #include <linux/io.h>
 #include <linux/sched.h>
+#include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
 		va_align.mask  = (upperbit - 1) & PAGE_MASK;
 		va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+		/* A random value per boot for bit slice [12:upper_bit) */
+		va_align.bits = get_random_int() & va_align.mask;
 	}
 }
 
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3d3503351242b7c6d8dcc0d32350e36e63da1939..6367a780cc8ca891b9513d2e4688717c8d5d3207 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void)
 	initial_boot_params = dt = early_memremap(initial_dtb, map_len);
 	size = of_get_flat_dt_size();
 	if (map_len < size) {
-		early_iounmap(dt, map_len);
+		early_memunmap(dt, map_len);
 		initial_boot_params = dt = early_memremap(initial_dtb, size);
 		map_len = size;
 	}
 
 	unflatten_and_copy_device_tree();
-	early_iounmap(dt, map_len);
+	early_memunmap(dt, map_len);
 }
 #else
 static inline void x86_flattree_get_config(void) { }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201deee923fffd686244546dc7039dbaab8b2b..7d46bb2603346b41dfb924eee28975cd76e704f3 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	early_iounmap(sdata, data_len);
+	early_memunmap(sdata, data_len);
 	printk(KERN_INFO "e820: extended physical RAM map:\n");
 	e820_print_map("extended");
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 014466b152b5008919b765d89934def79687c984..d74ac33290ae3eeef46b923c4556d644b72f0d5a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -354,7 +354,7 @@ static void __init relocate_initrd(void)
 		mapaddr = ramdisk_image & PAGE_MASK;
 		p = early_memremap(mapaddr, clen+slop);
 		memcpy(q, p+slop, clen);
-		early_iounmap(p, clen+slop);
+		early_memunmap(p, clen+slop);
 		q += clen;
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
@@ -438,7 +438,7 @@ static void __init parse_setup_data(void)
 		data_len = data->len + sizeof(struct setup_data);
 		data_type = data->type;
 		pa_next = data->next;
-		early_iounmap(data, sizeof(*data));
+		early_memunmap(data, sizeof(*data));
 
 		switch (data_type) {
 		case SETUP_E820_EXT:
@@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void)
 					 E820_RAM, E820_RESERVED_KERN);
 		found = 1;
 		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
+		early_memunmap(data, sizeof(*data));
 	}
 	if (!found)
 		return;
@@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 		data = early_memremap(pa_data, sizeof(*data));
 		memblock_reserve(pa_data, sizeof(*data) + data->len);
 		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
+		early_memunmap(data, sizeof(*data));
 	}
 }
 
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 30277e27431acde9a9320e0b1be4470bddb40e3a..10e0272d789a189b7215100a1d66a676d9b4bbfa 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void)
 	return va_align.mask;
 }
 
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly into the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+	return va_align.bits & get_align_mask();
+}
+
 unsigned long align_vdso_addr(unsigned long addr)
 {
 	unsigned long align_mask = get_align_mask();
-	return (addr + align_mask) & ~align_mask;
+	addr = (addr + align_mask) & ~align_mask;
+	return addr | get_align_bits();
 }
 
 static int __init control_va_addr_alignment(char *str)
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.length = len;
 	info.low_limit = begin;
 	info.high_limit = end;
-	info.align_mask = filp ? get_align_mask() : 0;
+	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
+	if (filp) {
+		info.align_mask = get_align_mask();
+		info.align_offset += get_align_bits();
+	}
 	return vm_unmapped_area(&info);
 }
 
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
 	info.high_limit = mm->mmap_base;
-	info.align_mask = filp ? get_align_mask() : 0;
+	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
+	if (filp) {
+		info.align_mask = get_align_mask();
+		info.align_offset += get_align_bits();
+	}
 	addr = vm_unmapped_area(&info);
 	if (!(addr & ~PAGE_MASK))
 		return addr;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 52417e771af9f66029f42812d02cbce58953b76a..1d553186c4345c02be5c0152764d988cfe365ae1 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,29 +29,33 @@
 /*
  * Tables translating between page_cache_type_t and pte encoding.
- * Minimal supported modes are defined statically, modified if more supported
- * cache modes are available.
- * Index into __cachemode2pte_tbl is the cachemode.
- * Index into __pte2cachemode_tbl are the caching attribute bits of the pte
- * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ *
+ * Minimal supported modes are defined statically; they are modified
+ * during bootup if more supported cache modes are available.
+ *
+ * Index into __cachemode2pte_tbl[] is the cachemode.
+ *
+ * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
+ * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
  */
 uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
-	[_PAGE_CACHE_MODE_WB]		= 0,
-	[_PAGE_CACHE_MODE_WC]		= _PAGE_PWT,
-	[_PAGE_CACHE_MODE_UC_MINUS]	= _PAGE_PCD,
-	[_PAGE_CACHE_MODE_UC]		= _PAGE_PCD | _PAGE_PWT,
-	[_PAGE_CACHE_MODE_WT]		= _PAGE_PCD,
-	[_PAGE_CACHE_MODE_WP]		= _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
+	[_PAGE_CACHE_MODE_WC      ]	= _PAGE_PWT | 0        ,
+	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
 };
 EXPORT_SYMBOL(__cachemode2pte_tbl);
+
 uint8_t __pte2cachemode_tbl[8] = {
-	[__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB,
-	[__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC,
-	[__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS,
-	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC,
-	[__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
-	[__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
-	[__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
+	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
+	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
+	[__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx( 0        | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
 EXPORT_SYMBOL(__pte2cachemode_tbl);
 
@@ -131,21 +135,7 @@ void __init early_alloc_pgt_buf(void)
 
 int after_bootmem;
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-	= 1
-#endif
-;
-
-static void __init init_gbpages(void)
-{
-#ifdef CONFIG_X86_64
-	if (direct_gbpages && cpu_has_gbpages)
-		printk(KERN_INFO "Using GB pages for direct mapping\n");
-	else
-		direct_gbpages = 0;
-#endif
-}
+early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
 
 struct map_range {
 	unsigned long start;
@@ -157,16 +147,12 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-	init_gbpages();
-
 #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
 	/*
 	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
-	if (direct_gbpages)
-		page_size_mask |= 1 << PG_LEVEL_1G;
 	if (cpu_has_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 #endif
@@ -181,6 +167,14 @@ static void __init probe_page_size_mask(void)
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	} else
 		__supported_pte_mask &= ~_PAGE_GLOBAL;
+
+	/* Enable 1 GB linear kernel mappings if available: */
+	if (direct_gbpages && cpu_has_gbpages) {
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+		page_size_mask |= 1 << PG_LEVEL_1G;
+	} else {
+		direct_gbpages = 0;
+	}
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 30eb05ae7061624f7faee0077ce41bddb4991ef7..3fba623e3ba558553d9740d3a994343a2960428f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 	return 0;
 }
 
-static int __init parse_direct_gbpages_off(char *arg)
-{
-	direct_gbpages = 0;
-	return 0;
-}
-early_param("nogbpages", parse_direct_gbpages_off);
-
-static int __init parse_direct_gbpages_on(char *arg)
-{
-	direct_gbpages = 1;
-	return 0;
-}
-early_param("gbpages", parse_direct_gbpages_on);
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 536ea2fb6e335677df559390520c3312976dcb81..89af288ec6740cfd793a4c4804e939bb023e9453 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m)
 	seq_printf(m, "DirectMap4M:    %8lu kB\n",
 			direct_pages_count[PG_LEVEL_2M] << 12);
 #endif
-#ifdef CONFIG_X86_64
 	if (direct_gbpages)
 		seq_printf(m, "DirectMap1G:    %8lu kB\n",
 			direct_pages_count[PG_LEVEL_1G] << 20);
-#endif
 }
 #else
 static inline void split_page_count(int level) { }
@@ -1654,13 +1652,11 @@ int set_memory_ro(unsigned long addr, int numpages)
 {
 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
 }
-EXPORT_SYMBOL_GPL(set_memory_ro);
 
 int set_memory_rw(unsigned long addr, int numpages)
 {
 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
 }
-EXPORT_SYMBOL_GPL(set_memory_rw);
 
 int set_memory_np(unsigned long addr, int numpages)
 {
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 7ac68698406c3b35e5ce0b0e98c73c5441e869a3..35af6771a95ad6c126cca1f352b896998116876e 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 }
 
 #ifdef CONFIG_STRICT_DEVMEM
-/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 {
 	return 1;
@@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
-				current->comm, from, to - 1);
+			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+			       current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7b22adaad4f1379a1195aec23b38a28a8db83f5c..5a7e5252c878bdcd2e45a11257adc15763cdea5f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -275,12 +275,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 	}
 }
 
+/*
+ * Xen paravirt assumes that the pgd table is in one page. The 64-bit
+ * kernel also assumes that the pgd is in one page.
+ *
+ * But a kernel with PAE paging that is not running as a Xen domain
+ * only needs to allocate 32 bytes for the pgd instead of one page.
+ */
+#ifdef CONFIG_X86_PAE
+
+#include <linux/slab.h>
+
+#define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_ALIGN	32
+
+static struct kmem_cache *pgd_cache;
+
+static int __init pgd_cache_init(void)
+{
+	/*
+	 * When a PAE kernel is running as a Xen domain, it does not use
+	 * a shared kernel pmd, and this requires a whole page for the pgd.
+	 */
+	if (!SHARED_KERNEL_PMD)
+		return 0;
+
+	/*
+	 * When a PAE kernel is not running as a Xen domain, it uses a
+	 * shared kernel pmd. A shared kernel pmd does not require a whole
+	 * page for the pgd; we can just allocate 32 bytes for it.
+	 * During boot we create a 32-byte slab for pgd table allocations.
+	 */
+	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
+				      SLAB_PANIC, NULL);
+	if (!pgd_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+core_initcall(pgd_cache_init);
+
+static inline pgd_t *_pgd_alloc(void)
+{
+	/*
+	 * If SHARED_KERNEL_PMD is not set, the PAE kernel is running as
+	 * a Xen domain, so we allocate one page for the pgd.
+	 */
+	if (!SHARED_KERNEL_PMD)
+		return (pgd_t *)__get_free_page(PGALLOC_GFP);
+
+	/*
+	 * Now the PAE kernel is not running as a Xen domain. We can
+	 * allocate a 32-byte slab object for the pgd to save memory.
+	 */
+	return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+	if (!SHARED_KERNEL_PMD)
+		free_page((unsigned long)pgd);
+	else
+		kmem_cache_free(pgd_cache, pgd);
+}
+#else
+static inline pgd_t *_pgd_alloc(void)
+{
+	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+	free_page((unsigned long)pgd);
+}
+#endif /* CONFIG_X86_PAE */
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
 
-	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+	pgd = _pgd_alloc();
 
 	if (pgd == NULL)
 		goto out;
@@ -310,7 +385,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
 	free_pmds(mm, pmds);
 out_free_pgd:
-	free_page((unsigned long)pgd);
+	_pgd_free(pgd);
 out:
 	return NULL;
 }
@@ -320,7 +395,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	pgd_mop_up_pmds(mm, pgd);
 	pgd_dtor(pgd);
 	paravirt_pgd_free(mm, pgd);
-	free_page((unsigned long)pgd);
+	_pgd_free(pgd);
 }
 
 /*
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index d143d216d52bec69b912128d88c3283cd0122c6c..d7f997f7c26d2502a1a188583d6817c3672417d1 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -67,7 +67,7 @@ void __init efi_bgrt_init(void)
 
 	image = efi_lookup_mapped_addr(bgrt_tab->image_address);
 	if (!image) {
-		image = early_memremap(bgrt_tab->image_address,
+		image = early_ioremap(bgrt_tab->image_address,
 				       sizeof(bmp_header));
 		ioremapped = true;
 		if (!image) {
@@ -89,7 +89,7 @@ void __init efi_bgrt_init(void)
 	}
 
 	if (ioremapped) {
-		image = early_memremap(bgrt_tab->image_address,
+		image = early_ioremap(bgrt_tab->image_address,
 				       bmp_header.size);
 		if (!image) {
 			pr_err("Ignoring BGRT: failed to map image memory\n");
diff --git a/include/linux/init.h b/include/linux/init.h
index 2df8e8dd10a483d55d723b6ac4875de807ab2044..21b6d768edd7a4e0f1aee98ed9f437000fa1b996 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -253,21 +253,41 @@ struct obs_kernel_param {
  * obs_kernel_param "array" too far apart in .init.setup.
  */
 #define __setup_param(str, unique_id, fn, early)			\
-	static const char __setup_str_##unique_id[] __initconst	\
-		__aligned(1) = str; \
-	static struct obs_kernel_param __setup_##unique_id	\
-		__used __section(.init.setup)			\
-		__attribute__((aligned((sizeof(long)))))	\
+	static const char __setup_str_##unique_id[] __initconst	\
+		__aligned(1) = str;					\
+	static struct obs_kernel_param __setup_##unique_id		\
+		__used __section(.init.setup)				\
+		__attribute__((aligned((sizeof(long)))))		\
 	= { __setup_str_##unique_id, fn, early }
 
-#define __setup(str, fn) \
+#define __setup(str, fn)						\
 	__setup_param(str, fn, fn, 0)
 
-/* NOTE: fn is as per module_param, not __setup!  Emits warning if fn
- * returns non-zero. */
-#define early_param(str, fn) \
+/*
+ * NOTE: fn is as per module_param, not __setup!
+ * Emits warning if fn returns non-zero.
+ */
+#define early_param(str, fn)						\
 	__setup_param(str, fn, fn, 1)
 
+#define early_param_on_off(str_on, str_off, var, config)		\
+									\
+	int var = IS_ENABLED(config);					\
+									\
+	static int __init parse_##var##_on(char *arg)			\
+	{								\
+		var = 1;						\
+		return 0;						\
+	}								\
+	__setup_param(str_on, parse_##var##_on, parse_##var##_on, 1);	\
+									\
+	static int __init parse_##var##_off(char *arg)			\
+	{								\
+		var = 0;						\
+		return 0;						\
+	}								\
+	__setup_param(str_off, parse_##var##_off, parse_##var##_off, 1)
+
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
 void __init parse_early_options(char *cmdline);
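Note on the sys_x86_64.c and cpu/amd.c hunks above: the stand-alone user-space sketch below (not part of the patch; the upper bit of the slice, the mmap hint and the rand() stand-in for get_random_int() are illustrative assumptions) shows the arithmetic the per-boot ASLR relies on — align the address up with va_align.mask, then OR the per-boot random slice back into bits [12:upper_bit):

	#include <stdio.h>
	#include <stdlib.h>

	#define PAGE_SHIFT 12
	#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

	int main(void)
	{
		unsigned long upperbit = 1UL << 15;                 /* assumed slice end bit */
		unsigned long mask = (upperbit - 1) & PAGE_MASK;    /* bits [12:15), as in bsp_init_amd() */
		unsigned long bits = (unsigned long)rand() & mask;  /* stands in for get_random_int() */
		unsigned long addr = 0x7f1234567000UL;              /* hypothetical, page-aligned address */

		addr = (addr + mask) & ~mask;  /* align up, clearing bits [12:15) */
		addr |= bits;                  /* OR the per-boot slice back in */

		printf("mask=%#lx bits=%#lx addr=%#lx\n", mask, bits, addr);
		return 0;
	}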
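For the __pte2cachemode_tbl[] initializers made explicit in the mm/init.c hunk, the index packing described in the comment (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT at index bits 0, 1, 2) can be mimicked as below. This is an illustration only — the PTE bit positions (3, 4, 7) and the helper are assumptions mirroring what __pte2cm_idx() is described to do, not code from the patch:

	#include <stdio.h>

	#define _PAGE_BIT_PWT	3	/* assumed x86 4K-PTE bit positions */
	#define _PAGE_BIT_PCD	4
	#define _PAGE_BIT_PAT	7

	/* Pack PWT/PCD/PAT into index bits 0/1/2, as the table comment describes. */
	static unsigned int pte2cm_idx(unsigned long prot)
	{
		return ((prot >> _PAGE_BIT_PWT) & 1) |
		       (((prot >> _PAGE_BIT_PCD) & 1) << 1) |
		       (((prot >> _PAGE_BIT_PAT) & 1) << 2);
	}

	int main(void)
	{
		unsigned long pte = (1UL << _PAGE_BIT_PWT) | (1UL << _PAGE_BIT_PCD);

		printf("index = %u\n", pte2cm_idx(pte));	/* prints 3 -> the UC slot */
		return 0;
	}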
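The new early_param_on_off() macro in include/linux/init.h is used by the arch/x86/mm/init.c hunk as early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES). Hand-expanding the macro body gives roughly the following (an illustrative expansion, not compiler output), which matches the open-coded handlers removed from init_64.c:

	int direct_gbpages = IS_ENABLED(CONFIG_X86_DIRECT_GBPAGES);

	static int __init parse_direct_gbpages_on(char *arg)
	{
		direct_gbpages = 1;
		return 0;
	}
	__setup_param("gbpages", parse_direct_gbpages_on, parse_direct_gbpages_on, 1);

	static int __init parse_direct_gbpages_off(char *arg)
	{
		direct_gbpages = 0;
		return 0;
	}
	__setup_param("nogbpages", parse_direct_gbpages_off, parse_direct_gbpages_off, 1);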
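On the pgtable.c hunk: the 32-byte figure follows from the PAE layout, where the top level holds only four 8-byte entries. A quick stand-alone check (the values are assumed from the PAE page-table format, not taken from the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned int ptrs_per_pgd   = 4;	/* PTRS_PER_PGD under CONFIG_X86_PAE */
		unsigned int pgd_entry_size = 8;	/* sizeof(pgd_t): 64-bit entries */

		/* Mirrors #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) in the patch. */
		printf("PGD_SIZE = %u bytes, instead of a %u-byte page\n",
		       ptrs_per_pgd * pgd_entry_size, 4096);
		return 0;
	}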