Commit 99792e0c authored by Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijlstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
====================================================
Complete virtual memory map with 4-level page tables
====================================================
Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
hole caused by [47:63] sign extension
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
vaddr_end for KASLR
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Virtual memory map with 5 level page tables:
0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
vaddr_end for KASLR
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
... unused hole ...
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Notes:
- Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
from the top of the 64-bit address space. It's easier to understand the layout
when seen both in absolute addresses and in distance-from-top notation.
For example, 0xffffe90000000000 == -23 TB: it's 23 TB lower than the top of the
64-bit address space (ffffffffffffffff).
Note that as we get closer to the top of the address space, the notation changes
from TB to GB and then MB/KB.
- "16M TB" might look weird at first sight, but it's an easier to visualize size
notation than "16 EB", which few will recognize at first sight as 16 exabytes.
It also shows it nicely how incredibly large 64-bit address space is.
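To make the distance-from-top notation concrete, here is a small stand-alone
sketch (not part of the patch; it simply relies on ordinary unsigned 64-bit
wrap-around arithmetic) that reproduces the -23 TB example above:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          /* 1 TB and the "-23 TB" address from the example above */
          uint64_t tb   = 1ULL << 40;
          uint64_t addr = 0 - 23 * tb;    /* unsigned wrap-around: 2^64 - 23 TB */

          printf("-23 TB == %#llx\n", (unsigned long long)addr);
          /* prints: -23 TB == 0xffffe90000000000 */
          return 0;
  }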
========================================================================================================================
Start addr | Offset | End addr | Size | VM area description
========================================================================================================================
| | | |
0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________
| | | |
0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -128 TB
| | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________
|
| Kernel-space virtual memory, shared between all processes:
____________________________________________________________|___________________________________________________________
| | | |
ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor
ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base)
ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole
ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base)
ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole
ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base)
ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole
ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
__________________|____________|__________________|_________|____________________________________________________________
|
| Identical layout to the 47-bit one from here on:
____________________________________________________________|____________________________________________________________
| | | |
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
__________________|____________|__________________|_________|___________________________________________________________
====================================================
Complete virtual memory map with 5-level page tables
====================================================
Notes:
- With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
offset, and many of the regions expand to accommodate the much larger amount
of physical memory that can be supported.
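As a quick sanity check of the sizes quoted above, a stand-alone sketch (again
not part of the patch, just illustrative arithmetic):

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint64_t user_4level = 1ULL << 47;      /* 128 TB = 0.125 PB */
          uint64_t user_5level = 1ULL << 56;      /*  64 PB */

          printf("4-level user space: %llu TB\n",
                 (unsigned long long)(user_4level >> 40));           /* 128 */
          printf("5-level user space: %llu PB\n",
                 (unsigned long long)(user_5level >> 50));           /* 64  */
          printf("expansion factor:   %llux\n",
                 (unsigned long long)(user_5level / user_4level));   /* 512 */
          return 0;
  }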
========================================================================================================================
Start addr | Offset | End addr | Size | VM area description
========================================================================================================================
| | | |
0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
__________________|____________|__________________|_________|___________________________________________________________
| | | |
0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
| | | | virtual memory addresses up to the -64 PB
| | | | starting offset of kernel mappings.
__________________|____________|__________________|_________|___________________________________________________________
|
| Kernel-space virtual memory, shared between all processes:
____________________________________________________________|___________________________________________________________
| | | |
ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor
ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base)
ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI
ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
__________________|____________|__________________|_________|____________________________________________________________
|
| Identical layout to the 47-bit one from here on:
____________________________________________________________|____________________________________________________________
| | | |
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
__________________|____________|__________________|_________|___________________________________________________________
The architecture defines a 64-bit virtual address. Implementations can support
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
......
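The "hole caused by sign extension" entries in the maps above come from the
canonical-address rule: the unimplemented upper bits must all be copies of the
most-significant implemented bit. A minimal illustration of that check for
48-bit virtual addresses (a hypothetical helper written for this note, not a
kernel API):

  #include <stdbool.h>
  #include <stdint.h>

  /*
   * Canonical for a 48-bit implementation means bits 63:48 are all copies
   * of bit 47; anything else lands in the non-canonical hole shown above.
   */
  static bool is_canonical_48(uint64_t vaddr)
  {
          uint64_t upper = vaddr >> 47;           /* bits 63:47 */

          return upper == 0 || upper == 0x1ffff;  /* all copies of bit 47 */
  }

  /*
   * is_canonical_48(0x00007fffffffffff) -> true   (top of user space)
   * is_canonical_48(0x0000800000000000) -> false  (start of the hole)
   * is_canonical_48(0xffff800000000000) -> true   (start of kernel space)
   */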
......@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
supports them), so don't confuse the user by printing
that we have them enabled.
config X86_CPA_STATISTICS
bool "Enable statistic for Change Page Attribute"
depends on DEBUG_FS
---help---
Expose statistics about the Change Page Attribute mechanism, which
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
config ARCH_HAS_MEM_ENCRYPT
def_bool y
......
......@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
#define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
/**
* ioremap - map bus memory into CPU space
......
......@@ -67,7 +67,7 @@ struct kimage;
/* Memory to backup during crash kdump */
#define KEXEC_BACKUP_SRC_START (0UL)
#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/*
* CPU does not save ss and sp on stack if execution is already
......
......@@ -59,13 +59,16 @@
#endif
/*
* Kernel image size is limited to 1GiB due to the fixmap living in the
* next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
* 512MiB by default, leaving 1.5GiB for modules once the page tables
* are fully set up. If kernel ASLR is configured, it can extend the
* kernel page table mapping, reducing the size of the modules area.
* Maximum kernel image size is limited to 1 GiB, due to the fixmap living
* in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
*
* On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
* page tables are fully set up.
*
* If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
* of the modules area to 1.5 GiB.
*/
#if defined(CONFIG_RANDOMIZE_BASE)
#ifdef CONFIG_RANDOMIZE_BASE
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
......
......@@ -6,16 +6,23 @@
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush(tlb) \
{ \
if (!tlb->fullmm && !tlb->need_flush_all) \
flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
else \
flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
}
static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
unsigned long start = 0UL, end = TLB_FLUSH_ALL;
unsigned int stride_shift = tlb_get_unmap_shift(tlb);
if (!tlb->fullmm && !tlb->need_flush_all) {
start = tlb->start;
end = tlb->end;
}
flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}
/*
* While x86 architecture in general requires an IPI to perform TLB
* shootdown, enablement code for several hypervisors overrides
......
......@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
static inline bool tlb_defer_switch_to_init_mm(void)
{
/*
* If we have PCID, then switching to init_mm is reasonably
* fast. If we don't have PCID, then switching to init_mm is
* quite slow, so we try to defer it in the hopes that we can
* avoid it entirely. The latter approach runs the risk of
* receiving otherwise unnecessary IPIs.
*
* This choice is just a heuristic. The tlb code can handle this
* function returning true or false regardless of whether we have
* PCID.
*/
return !static_cpu_has(X86_FEATURE_PCID);
}
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
......@@ -547,23 +531,30 @@ struct flush_tlb_info {
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
unsigned int stride_shift;
bool freed_tables;
};
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
#define flush_tlb_mm(mm) \
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
: PAGE_SHIFT, false)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
unsigned long end, unsigned int stride_shift,
bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
......
......@@ -11,40 +11,62 @@
#include <linux/uaccess.h>
#include <linux/io.h>
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf,
bool encrypted)
{
void *vaddr;
if (!csize)
return 0;
vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (encrypted)
vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
else
vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
if (userbuf) {
if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
set_iounmap_nonlazy();
iounmap(vaddr);
iounmap((void __iomem *)vaddr);
return csize;
}
/**
* copy_oldmem_page - copy one page of memory
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from the old kernel's memory. For this page, there is no pte
* mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
}
/**
* copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
* memory with the encryption mask set to accommodate kdump on SME-enabled
* machines.
*/
ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
}
......@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
ldt->slot = slot;
return 0;
......
......@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}
......
......@@ -19,7 +19,9 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/pci.h>
#include <asm/e820/types.h>
#include <asm/pgtable.h>
/*
......@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
return (signed long)(u << shift) >> shift;
}
static void note_wx(struct pg_state *st)
{
unsigned long npages;
npages = (st->current_address - st->start_address) / PAGE_SIZE;
#ifdef CONFIG_PCI_BIOS
/*
* If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
* Inform about it, but avoid the warning.
*/
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
st->current_address <= PAGE_OFFSET + BIOS_END) {
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
return;
}
#endif
/* Account the WX pages */
st->wx_pages += npages;
WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
(void *)st->start_address);
}
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
......@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
unsigned long delta;
int width = sizeof(unsigned long) * 2;
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
WARN_ONCE(1,
"x86/mm: Found insecure W+X mapping at address %p/%pS\n",
(void *)st->start_address,
(void *)st->start_address);
st->wx_pages += (st->current_address -
st->start_address) / PAGE_SIZE;
}
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
note_wx(st);
/*
* Now print the actual finished series
......
......@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
show_opcodes(regs, loglvl);
}
/*
* The (legacy) vsyscall page is the lone page in the kernel portion
* of the address space that has user-accessible permissions.
*/
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, int si_code)
......@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
#ifdef CONFIG_X86_64
/*
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* To avoid leaking information about the kernel page table
* layout, pretend that user-mode accesses to kernel addresses
......@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
}
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set X86_PF_PK.
*/
if ((error_code & X86_PF_PK))
return 1;
return 1;
}
......@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* (Optional Invalidation).
*/
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
......@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
return 0;
if (p4d_large(*p4d))
return spurious_fault_check(error_code, (pte_t *) p4d);
return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return spurious_fault_check(error_code, (pte_t *) pud);
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
ret = spurious_fault_check(error_code, pte);
ret = spurious_kernel_fault_check(error_code, pte);
if (!ret)
return 0;
......@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
* Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables:
*/
ret = spurious_fault_check(error_code, (pte_t *) pmd);
ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret;
}
NOKPROBE_SYMBOL(spurious_fault);
NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1;
......@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
static int fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
* TASK_SIZE_MAX, but is not considered part of the kernel
* address space.
*/
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
return false;
return address >= TASK_SIZE_MAX;
}
......@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
* ran in userspace or the kernel.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
tsk = current;
mm = tsk->mm;
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* Protection keys exceptions only happen on user pages. We
* have no user pages in the kernel portion of the address
* space, so do not expect them here.
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
/*
* We fault-in kernel-space virtual memory on-demand. The
* We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
......@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
* Before doing this on-demand faulting, ensure that the
* fault is not any of the following:
* 1. A fault on a PTE with a reserved bit set.
* 2. A fault caused by a user-mode access. (Do not demand-
* fault kernel memory due to user-mode accesses).
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
*/
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address, NULL);
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
}
/*
* Note, despite being a "bad area", there are quite a few
* acceptable reasons to get here, such as erratum fixups
* and handling kernel code that can fault, like get_user().
*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
/* Handle faults in the user portion of the address space */
static inline
void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
tsk = current;
mm = tsk->mm;
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
/*
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
if (unlikely(hw_error_code & X86_PF_RSVD))
pgtable_bad(regs, hw_error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Check for invalid kernel (supervisor) access to user
* pages in the user address space.
*/
if (unlikely(smap_violation(hw_error_code, regs))) {
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
......@@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
/*
* hw_error_code is literally the "page fault error code" passed to
* the kernel directly from the hardware. But, we will shortly be
* modifying it in software, so give it a new name.
*/
sw_error_code = hw_error_code;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
......@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
error_code |= X86_PF_USER;
/*
* Up to this point, X86_PF_USER set in hw_error_code
* indicated a user-mode access. But, after this,
* X86_PF_USER in sw_error_code will indicate either
* that, *or* an implicit kernel(supervisor)-mode access
* which originated from user mode.
*/
if (!(hw_error_code & X86_PF_USER)) {
/*
* The CPU was in user mode, but the CPU says
* the fault was not a user-mode access.
* Must be an implicit kernel-mode access,
* which we do not expect to happen in the
* user address space.
*/
pr_warn_once("kernel-mode error from user-mode: %lx\n",
hw_error_code);
sw_error_code |= X86_PF_USER;
}
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
......@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & X86_PF_WRITE)
if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
* Instruction fetch faults in the vsyscall page might need
* emulation. The vsyscall page is at a high address
* (>PAGE_OFFSET), but is considered to be part of the user
* address space.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*/
if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
* tables. But, an erroneous kernel fault occurring outside one of
* those areas which also holds mmap_sem might deadlock attempting
* to validate the fault against the address space.
*
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
* 1. Failed to acquire mmap_sem, and
* 2. The access did not originate in userspace. Note: either the
* hardware or earlier page fault code may set X86_PF_USER
* in sw_error_code.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!(error_code & X86_PF_USER) &&
if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Fault from code in kernel from
* which we do not expect faults.
*/
bad_area_nosemaphore(regs, sw_error_code, address, NULL);
return;
}
retry:
......@@ -1351,16 +1419,16 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (error_code & X86_PF_USER) {
if (sw_error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
......@@ -1368,12 +1436,12 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
......@@ -1382,8 +1450,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* we can handle it..
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
if (unlikely(access_error(sw_error_code, vma))) {
bad_area_access_error(regs, sw_error_code, address, vma);
return;
}
......@@ -1425,13 +1493,13 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
return;
/* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault);
mm_fault_error(regs, sw_error_code, address, &pkey, fault);
return;
}
......@@ -1449,6 +1517,28 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
prefetchw(&current->mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/* Was the fault on kernel-controlled part of the address space? */
if (unlikely(fault_in_kernel_space(address)))
do_kern_addr_fault(regs, hw_error_code, address);
else
do_user_addr_fault(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(__do_page_fault);
static nokprobe_inline void
......
......@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
unsigned long size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
start, start+size);
set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
size >> 10);
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
pr_info("Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
......
......@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, enum page_cache_mode pcm, void *caller)
unsigned long size, enum page_cache_mode pcm,
void *caller, bool encrypted)
{
unsigned long offset, vaddr;
resource_size_t last_addr;
......@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* resulting mapping.
*/
prot = PAGE_KERNEL_IO;
if (sev_active() && mem_flags.desc_other)
if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
switch (pcm) {
......@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_nocache);
......@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL_GPL(ioremap_uc);
......@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wc);
......@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wt);
void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0), true);
}
EXPORT_SYMBOL(ioremap_encrypted);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_cache);
......@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
{
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_prot);
......
(This diff has been collapsed.)
......@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
......@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
* We don't currently support having a real mm loaded without
* our cpu set in mm_cpumask(). We have all the bookkeeping
* in place to figure out whether we would need to flush
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
return;
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
*/
if (!was_lazy)
return;
/*
* Read the tlb_gen to check whether a flush is needed.
* If the TLB is up to date, just use it.
* The barrier synchronizes with the tlb_gen increment in
* the TLB shootdown code.
*/
smp_mb();
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
next_tlb_gen)
return;
/*
* TLB contents went out of date while we were in lazy
* mode. Fall through to the TLB switching code below.
*/
new_asid = prev_asid;
need_flush = true;
} else {
u16 new_asid;
bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
......@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/* Let nmi_uaccess_okay() know that we're changing CR3. */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
}
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
load_mm_cr4(next);
switch_ldt(real_prev, next);
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
}
/*
......@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
if (tlb_defer_switch_to_init_mm()) {
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
......@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
......@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += PAGE_SIZE;
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
......@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
static bool tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
......@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
smp_call_function_many(cpumask, flush_tlb_func_remote,
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. Those CPUs will flush their TLBs themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
}
/*
......@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
......@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
!(vmflag & VM_HUGETLB) &&
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
......
......@@ -22,6 +22,7 @@
#include <linux/tick.h>
#include <linux/nmi.h>
#include <linux/cpuhotplug.h>
#include <linux/stackprotector.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
......@@ -88,6 +89,7 @@ static void cpu_bringup(void)
asmlinkage __visible void cpu_bringup_and_idle(void)
{
cpu_bringup();
boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
......
......@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
old_devtb_phys = entry & PAGE_MASK;
/*
* When SME is enabled in the first kernel, the entry includes the
* memory encryption mask (sme_me_mask), so we must remove the memory
* encryption mask to obtain the true physical address in the kdump kernel.
*/
old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
if (old_devtb_phys >= 0x100000000ULL) {
pr_err("The address of old device table is above 4G, not trustworthy!\n");
return false;
}
old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
old_devtb = (sme_active() && is_kdump_kernel())
? (__force void *)ioremap_encrypted(old_devtb_phys,
dev_table_size)
: memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
if (!old_devtb)
return false;
......
......@@ -24,6 +24,8 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
#include <asm/pgtable.h>
#include <asm/io.h>
#include "internal.h"
......@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
/* Reads a page from the oldmem device from given offset. */
static ssize_t read_from_oldmem(char *buf, size_t count,
u64 *ppos, int userbuf)
u64 *ppos, int userbuf,
bool encrypted)
{
unsigned long pfn, offset;
size_t nr_bytes;
......@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes);
else {
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
if (encrypted)
tmp = copy_oldmem_page_encrypted(pfn, buf,
nr_bytes,
offset,
userbuf);
else
tmp = copy_oldmem_page(pfn, buf, nr_bytes,
offset, userbuf);
if (tmp < 0)
return tmp;
}
......@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
return read_from_oldmem(buf, count, ppos, 0);
return read_from_oldmem(buf, count, ppos, 0, false);
}
/*
......@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
return read_from_oldmem(buf, count, ppos, 0);
return read_from_oldmem(buf, count, ppos, 0, sme_active());
}
/*
......@@ -173,9 +183,20 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
prot = pgprot_encrypted(prot);
return remap_pfn_range(vma, from, pfn, size, prot);
}
/*
* Architectures which support memory encryption override this.
*/
ssize_t __weak
copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
}
/*
* Copy to either kernel or user space
*/
......@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
m->offset + m->size - *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
tmp = read_from_oldmem(buffer, tsz, &start,
userbuf, sme_active());
if (tmp < 0)
return tmp;
buflen -= tsz;
......
......@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
unsigned long, int);
extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
size_t csize, unsigned long offset,
int userbuf);
void vmcore_cleanup(void);
/* Architecture code defines this if there are other possible ELF
......
......@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags);
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags, const struct cpumask *mask);
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
#ifdef CONFIG_SMP
......
......@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
}
}
/* Ensure that these pages are decrypted if SME is enabled. */
if (pages)
arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
return pages;
}
......@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = -ENOMEM;
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
......@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
......
......@@ -318,33 +318,34 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/*
* Finds the lowest iomem resource existing within [res->start.res->end).
* The caller must specify res->start, res->end, res->flags, and optionally
* desc. If found, returns 0, res is overwritten, if not found, returns -1.
* This function walks the whole tree and not just first level children until
* and unless first_level_children_only is true.
/**
* Finds the lowest iomem resource that covers part of [start..end]. The
* caller must specify start, end, flags, and desc (which may be
* IORES_DESC_NONE).
*
* If a resource is found, returns 0 and *res is overwritten with the part
* of the resource that's within [start..end]; if none is found, returns
* -1.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
*/
static int find_next_iomem_res(struct resource *res, unsigned long desc,
bool first_level_children_only)
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, struct resource *res)
{
resource_size_t start, end;
struct resource *p;
bool sibling_only = false;
BUG_ON(!res);
start = res->start;
end = res->end;
BUG_ON(start >= end);
if (!res)
return -EINVAL;
if (first_level_children_only)
sibling_only = true;
if (start >= end)
return -EINVAL;
read_lock(&resource_lock);
for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
if ((p->flags & res->flags) != res->flags)
for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
if ((p->flags & flags) != flags)
continue;
if ((desc != IORES_DESC_NONE) && (desc != p->desc))
continue;
......@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
p = NULL;
break;
}
if ((p->end >= start) && (p->start < end))
if ((p->end >= start) && (p->start <= end))
break;
}
read_unlock(&resource_lock);
if (!p)
return -1;
/* copy data */
if (res->start < p->start)
res->start = p->start;
if (res->end > p->end)
res->end = p->end;
res->start = max(start, p->start);
res->end = min(end, p->end);
res->flags = p->flags;
res->desc = p->desc;
return 0;
}
static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
bool first_level_children_only,
void *arg,
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, void *arg,
int (*func)(struct resource *, void *))
{
u64 orig_end = res->end;
struct resource res;
int ret = -1;
while ((res->start < res->end) &&
!find_next_iomem_res(res, desc, first_level_children_only)) {
ret = (*func)(res, arg);
while (start < end &&
!find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
res->start = res->end + 1;
res->end = orig_end;
start = res.end + 1;
}
return ret;
}
/*
/**
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
* All the memory ranges which overlap start,end and also match flags and
......@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(struct resource *, void *))
{
struct resource res;
res.start = start;
res.end = end;
res.flags = flags;
return __walk_iomem_res_desc(&res, desc, false, arg, func);
return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
}
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
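As a usage sketch of the exported walker (hypothetical caller, names are illustrative), summing up all busy System RAM reported by the walk:

static int count_ram(struct resource *res, void *arg)
{
        u64 *total = arg;

        *total += resource_size(res);
        return 0;               /* keep walking */
}

static u64 total_busy_ram(void)
{
        u64 total = 0;

        walk_iomem_res_desc(IORES_DESC_NONE,
                            IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
                            0, -1ULL, &total, count_ram);
        return total;
}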
......@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *))
int (*func)(struct resource *, void *))
{
struct resource res;
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
res.start = start;
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
arg, func);
}
......@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
int walk_mem_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *))
{
struct resource res;
res.start = start;
res.end = end;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
arg, func);
}
......@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
* It is to be used only for System RAM.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
void *arg, int (*func)(unsigned long, unsigned long, void *))
{
resource_size_t start, end;
unsigned long flags;
struct resource res;
unsigned long pfn, end_pfn;
u64 orig_end;
int ret = -1;
res.start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
start = (u64) start_pfn << PAGE_SHIFT;
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
true, &res)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
break;
res.start = res.end + 1;
res.end = orig_end;
start = res.end + 1;
}
return ret;
}
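The pfn-based walker follows the same iteration pattern but hands page-frame ranges to the callback. A hypothetical caller (illustrative names) counting the RAM pages in the first 4 GB:

static int count_ram_pages(unsigned long start_pfn, unsigned long nr_pages,
                           void *arg)
{
        unsigned long *count = arg;

        *count += nr_pages;
        return 0;
}

static unsigned long ram_pages_below_4g(void)
{
        unsigned long nr = 0;

        /* Walk busy System RAM inside the first 4 GB, one chunk per call. */
        walk_system_ram_range(0, SZ_4G >> PAGE_SHIFT, &nr, count_ram_pages);
        return nr;
}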
......@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
* @constraint: the size and alignment constraints to be met.
*/
static int reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize,
struct resource_constraint *constraint)
resource_size_t newsize,
struct resource_constraint *constraint)
{
int err=0;
struct resource new = *old;
......@@ -972,7 +957,7 @@ static int __adjust_resource(struct resource *res, resource_size_t start,
* Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size)
resource_size_t size)
{
int result;
......@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
}
EXPORT_SYMBOL(adjust_resource);
static void __init __reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
static void __init
__reserve_region_with_split(struct resource *root, resource_size_t start,
resource_size_t end, const char *name)
{
struct resource *parent = root;
struct resource *conflict;
......@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
}
void __init reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
void __init
reserve_region_with_split(struct resource *root, resource_size_t start,
resource_size_t end, const char *name)
{
int abort = 0;
......@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
* The described resource region must match a currently busy region.
*/
void __release_region(struct resource *parent, resource_size_t start,
resource_size_t n)
resource_size_t n)
{
struct resource **p;
resource_size_t end;
......@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
* simplicity. Enhance this logic when necessary.
*/
int release_mem_region_adjustable(struct resource *parent,
resource_size_t start, resource_size_t size)
resource_size_t start, resource_size_t size)
{
struct resource **p;
struct resource *res;
......@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
this->start == match->start && this->n == match->n;
}
struct resource * __devm_request_region(struct device *dev,
struct resource *parent, resource_size_t start,
resource_size_t n, const char *name)
struct resource *
__devm_request_region(struct device *dev, struct resource *parent,
resource_size_t start, resource_size_t n, const char *name)
{
struct region_devres *dr = NULL;
struct resource *res;
......
......@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
void cpu_startup_entry(enum cpuhp_state state)
{
/*
* This #ifdef needs to die, but it's too late in the cycle to
* make this generic (ARM and SH have never invoked the canary
* init for the non boot CPUs!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
* If we're the non-boot CPU, nothing set the stack canary up
* for us. The boot CPU already has it initialized but no harm
* in doing it again. This is a good place for updating it, as
* we wont ever return from this function (so the invalid
* canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
......
......@@ -56,7 +56,6 @@
#include <linux/profile.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stackprotector.h>
#include <linux/stop_machine.h>
#include <linux/suspend.h>
#include <linux/swait.h>
......
......@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
gfp_t gfp_flags, const struct cpumask *mask)
{
cpumask_var_t cpus;
int cpu, ret;
......@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
preempt_disable();
for_each_online_cpu(cpu)
for_each_cpu(cpu, mask)
if (cond_func(cpu, info))
cpumask_set_cpu(cpu, cpus);
__cpumask_set_cpu(cpu, cpus);
on_each_cpu_mask(cpus, func, info, wait);
preempt_enable();
free_cpumask_var(cpus);
......@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
* just have to IPI them one by one.
*/
preempt_disable();
for_each_online_cpu(cpu)
for_each_cpu(cpu, mask)
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
......@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
preempt_enable();
}
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
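The new _mask variant exists so callers can restrict the conditional IPIs to a pre-computed mask (the x86 lazy-TLB flush path passes the mm's CPU mask). A hedged sketch of such a caller, with illustrative type and function names:

struct pending_work {                   /* illustrative type */
        struct cpumask pending;
};

static bool cpu_has_pending(int cpu, void *info)
{
        struct pending_work *w = info;

        return cpumask_test_cpu(cpu, &w->pending);
}

static void do_remote_work(void *info)
{
        /* runs on each CPU for which cpu_has_pending() returned true */
}

static void kick_mm_cpus(struct mm_struct *mm, struct pending_work *w)
{
        /* Only CPUs currently in mm_cpumask(mm) are even considered. */
        on_each_cpu_cond_mask(cpu_has_pending, do_remote_work, w, true,
                              GFP_ATOMIC, mm_cpumask(mm));
}

The switch to __cpumask_set_cpu() in the allocation path above presumably reflects that the cpumask is local to the function, so the atomic set is unnecessary.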
static void do_nothing(void *unused)
......
......@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the
* same conditions in UP and SMP.
*/
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags, const struct cpumask *mask)
{
unsigned long flags;
......@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
}
preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond);
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
......
......@@ -8,6 +8,7 @@
*/
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>
......