提交 93c26d7d 编写于 作者: L Linus Torvalds

Merge branch 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull protection keys syscall interface from Thomas Gleixner:
 "This is the final step of Protection Keys support which adds the
  syscalls so user space can actually allocate keys and protect memory
  areas with them. Details and usage examples can be found in the
  documentation.

  The mm side of this has been acked by Mel"

* 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/pkeys: Update documentation
  x86/mm/pkeys: Do not skip PKRU register if debug registers are not used
  x86/pkeys: Fix pkeys build breakage for some non-x86 arches
  x86/pkeys: Add self-tests
  x86/pkeys: Allow configuration of init_pkru
  x86/pkeys: Default to a restrictive init PKRU
  pkeys: Add details of system call use to Documentation/
  generic syscalls: Wire up memory protection keys syscalls
  x86: Wire up protection keys system calls
  x86/pkeys: Allocation/free syscalls
  x86/pkeys: Make mprotect_key() mask off additional vm_flags
  mm: Implement new pkey_mprotect() system call
  x86/pkeys: Add fault handling for PF_PK page fault bit
......@@ -1666,6 +1666,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
initrd= [BOOT] Specify the location of the initial ramdisk
init_pkru= [x86] Specify the default memory protection keys rights
register contents for all processes. 0x55555554 by
default (disallow access to all but pkey 0). Can
override in debugfs after boot.
inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
Format: <irq>
......
......@@ -18,10 +18,68 @@ even though there is theoretically space in the PAE PTEs. These
permissions are enforced on data access only and have no effect on
instruction fetches.
=========================== Config Option ===========================
=========================== Syscalls ===========================
This config option adds approximately 1.5kb of text and 50 bytes of
data to the executable. A workload which does large O_DIRECT reads
of holes in XFS files was run to exercise get_user_pages_fast(). No
performance delta was observed with the config option
enabled or disabled.
There are 3 system calls which directly interact with pkeys:
int pkey_alloc(unsigned long flags, unsigned long init_access_rights);
int pkey_free(int pkey);
int pkey_mprotect(unsigned long start, size_t len,
unsigned long prot, int pkey);
Before a pkey can be used, it must first be allocated with
pkey_alloc(). An application calls the WRPKRU instruction
directly in order to change access permissions to memory covered
with a key. In this example WRPKRU is wrapped by a C function
called pkey_set().
int real_prot = PROT_READ|PROT_WRITE;
pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
... application runs here
Now, if the application needs to update the data at 'ptr', it can
gain access, do the update, then remove its write access:
pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE
*ptr = foo; // assign something
pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again
Now when it frees the memory, it will also free the pkey since it
is no longer in use:
munmap(ptr, PAGE_SIZE);
pkey_free(pkey);
(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
An example implementation can be found in
tools/testing/selftests/x86/protection_keys.c)
=========================== Behavior ===========================
The kernel attempts to make protection keys consistent with the
behavior of a plain mprotect(). For instance if you do this:
mprotect(ptr, size, PROT_NONE);
something(ptr);
you can expect the same effects with protection keys when doing this:
pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE);
pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey);
something(ptr);
That should be true whether something() is a direct access to 'ptr'
like:
*ptr = foo;
or when the kernel does the access on the application's behalf like
with a read():
read(fd, ptr, 1);
The kernel will send a SIGSEGV in both cases, but si_code will be set
to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when
the plain mprotect() permissions are violated.
......@@ -78,4 +78,9 @@
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK 0x3f
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
#endif /* __ALPHA_MMAN_H__ */
......@@ -105,4 +105,9 @@
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK 0x3f
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
#endif /* _ASM_MMAN_H */
......@@ -75,4 +75,9 @@
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK 0x3f
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
#endif /* __PARISC_MMAN_H__ */
......@@ -386,3 +386,8 @@
377 i386 copy_file_range sys_copy_file_range
378 i386 preadv2 sys_preadv2 compat_sys_preadv2
379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2
380 i386 pkey_mprotect sys_pkey_mprotect
381 i386 pkey_alloc sys_pkey_alloc
382 i386 pkey_free sys_pkey_free
#383 i386 pkey_get sys_pkey_get
#384 i386 pkey_set sys_pkey_set
......@@ -335,6 +335,11 @@
326 common copy_file_range sys_copy_file_range
327 64 preadv2 sys_preadv2
328 64 pwritev2 sys_pwritev2
329 common pkey_mprotect sys_pkey_mprotect
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
#332 common pkey_get sys_pkey_get
#333 common pkey_set sys_pkey_set
#
# x32-specific system call numbers start at 512 to avoid cache impact
......
......@@ -23,6 +23,14 @@ typedef struct {
const struct vdso_image *vdso_image; /* vdso image in use */
atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/*
* One bit per protection key says whether userspace can
* use it or not. protected by mmap_sem.
*/
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
} mm_context_t;
#ifdef CONFIG_SMP
......
......@@ -4,6 +4,7 @@
#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>
#include <trace/events/tlb.h>
......@@ -107,7 +108,16 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
/*
 * Set up the architecture-specific context of a new mm.
 *
 * When protection keys are compiled in and the CPU reports OSPKE, the
 * mm starts with only pkey 0 marked allocated and with no execute-only
 * pkey assigned.  LDT state is then set up by init_new_context_ldt().
 *
 * Always returns 0.
 */
static inline int init_new_context(struct task_struct *tsk,
				   struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
		/* No execute-only key yet: -1 means unallocated or invalid. */
		mm->context.execute_only_pkey = -1;
		/* Bit 0 set: pkey 0 is the default key, always allocated. */
		mm->context.pkey_allocation_map = 0x1;
	}
#endif
	init_new_context_ldt(tsk, mm);

	return 0;
}
static inline void destroy_context(struct mm_struct *mm)
......@@ -195,16 +205,20 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
mpx_notify_unmap(mm, vma, start, end);
}
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
static inline int vma_pkey(struct vm_area_struct *vma)
{
u16 pkey = 0;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
VM_PKEY_BIT2 | VM_PKEY_BIT3;
pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
#endif
return pkey;
return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
}
#else
static inline int vma_pkey(struct vm_area_struct *vma)
{
return 0;
}
#endif
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
......@@ -258,5 +272,4 @@ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
}
#endif /* _ASM_X86_MMU_CONTEXT_H */
......@@ -10,7 +10,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
* Try to dedicate one of the protection keys to be used as an
* execute-only protection key.
*/
#define PKEY_DEDICATED_EXECUTE_ONLY 15
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
......@@ -31,4 +30,76 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
return __arch_override_mprotect_pkey(vma, prot, pkey);
}
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
/*
 * Per-mm bitmap of allocated protection keys: bit N set means pkey N
 * has been handed out.  The accessor expands to an lvalue so that the
 * two helpers below can update the map in place.
 *
 * All macro arguments are parenthesised so that expressions such as
 * '&obj' or 'a | b' expand with the intended precedence.
 */
#define mm_pkey_allocation_map(mm)	((mm)->context.pkey_allocation_map)
/* Mark @pkey as allocated in @mm's allocation map. */
#define mm_set_pkey_allocated(mm, pkey) do {		\
	mm_pkey_allocation_map(mm) |= (1U << (pkey));	\
} while (0)
/* Mark @pkey as free in @mm's allocation map. */
#define mm_set_pkey_free(mm, pkey) do {			\
	mm_pkey_allocation_map(mm) &= ~(1U << (pkey));	\
} while (0)
/*
 * Report whether @pkey is currently marked allocated in @mm.
 *
 * @pkey can come straight from userspace via pkey_mprotect(), so it
 * must be range-checked before being used as a shift count: shifting
 * by a negative amount, or by the width of the type or more, is
 * undefined behaviour.  Out-of-range keys are reported as not
 * allocated.
 */
static inline
bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
	if (pkey < 0 || pkey >= arch_max_pkey())
		return false;

	return mm_pkey_allocation_map(mm) & (1U << pkey);
}
/*
 * Allocate the lowest-numbered free protection key in @mm.
 *
 * Returns the key number on success, or -1 when every key is taken.
 */
static inline
int mm_pkey_alloc(struct mm_struct *mm)
{
	/*
	 * This mask is the single point where keys are limited to what
	 * the hardware supports; the rest of the kernel trusts that
	 * only valid pkeys are handed out from here.
	 */
	const u16 full_map = ((1U << arch_max_pkey()) - 1);
	int key = -1;

	/*
	 * ffz() behavior is undefined when there are no zero bits, so
	 * only search the map when at least one key is still free.
	 */
	if (mm_pkey_allocation_map(mm) != full_map) {
		key = ffz(mm_pkey_allocation_map(mm));
		mm_set_pkey_allocated(mm, key);
	}

	return key;
}
/*
 * Release @pkey back to @mm's pool of free keys.
 *
 * Returns 0 on success, or -EINVAL if @pkey is 0 or is not currently
 * allocated.
 */
static inline
int mm_pkey_free(struct mm_struct *mm, int pkey)
{
	/* pkey 0 is special: always allocated, never freeable. */
	if (pkey == 0 || !mm_pkey_is_allocated(mm, pkey))
		return -EINVAL;

	mm_set_pkey_free(mm, pkey);

	return 0;
}
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
extern void copy_init_pkru_to_fpregs(void);
#endif /*_ASM_X86_PKEYS_H */
......@@ -12,6 +12,7 @@
#include <asm/traps.h>
#include <linux/hardirq.h>
#include <linux/pkeys.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>
......@@ -505,6 +506,9 @@ static inline void copy_init_fpstate_to_fpregs(void)
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
copy_kernel_to_fregs(&init_fpstate.fsave);
if (boot_cpu_has(X86_FEATURE_OSPKE))
copy_init_pkru_to_fpregs();
}
/*
......
......@@ -5,6 +5,7 @@
*/
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/pkeys.h>
#include <asm/fpu/api.h>
......@@ -866,9 +867,10 @@ const void *get_xsave_field_ptr(int xsave_state)
return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
#ifdef CONFIG_ARCH_HAS_PKEYS
#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
/*
* This will go out and modify PKRU register to set the access
* rights for @pkey to @init_val.
......@@ -914,6 +916,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
return 0;
}
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
/*
* This is similar to user_regset_copyout(), but will not add offset to
......
......@@ -109,12 +109,13 @@ void __show_regs(struct pt_regs *regs, int all)
get_debugreg(d7, 7);
/* Only print out debug registers if they are in their non-default state. */
if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
(d6 == DR6_RESERVED) && (d7 == 0x400))
return;
printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
(d6 == DR6_RESERVED) && (d7 == 0x400))) {
printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
d0, d1, d2);
printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
d3, d6, d7);
}
if (boot_cpu_has(X86_FEATURE_OSPKE))
printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
......
......@@ -1144,6 +1144,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
{
/* This is only called for the current mm, so: */
bool foreign = false;
/*
* Read or write was blocked by protection keys. This is
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
if (error_code & PF_PK)
return 1;
/*
* Make sure to check the VMA so that we do not perform
* faults just to hit a PF_PK as soon as we fill in a
......
......@@ -11,6 +11,7 @@
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/debugfs.h> /* debugfs_create_u32() */
#include <linux/mm_types.h> /* mm_struct, vma, etc... */
#include <linux/pkeys.h> /* PKEY_* */
#include <uapi/asm-generic/mman-common.h>
......@@ -21,8 +22,19 @@
int __execute_only_pkey(struct mm_struct *mm)
{
bool need_to_set_mm_pkey = false;
int execute_only_pkey = mm->context.execute_only_pkey;
int ret;
/* Do we need to assign a pkey for mm's execute-only maps? */
if (execute_only_pkey == -1) {
/* Go allocate one to use, which might fail */
execute_only_pkey = mm_pkey_alloc(mm);
if (execute_only_pkey < 0)
return -1;
need_to_set_mm_pkey = true;
}
/*
* We do not want to go through the relatively costly
* dance to set PKRU if we do not need to. Check it
......@@ -32,22 +44,33 @@ int __execute_only_pkey(struct mm_struct *mm)
* can make fpregs inactive.
*/
preempt_disable();
if (fpregs_active() &&
!__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
if (!need_to_set_mm_pkey &&
fpregs_active() &&
!__pkru_allows_read(read_pkru(), execute_only_pkey)) {
preempt_enable();
return PKEY_DEDICATED_EXECUTE_ONLY;
return execute_only_pkey;
}
preempt_enable();
ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
/*
* Set up PKRU so that it denies access for everything
* other than execution.
*/
ret = arch_set_user_pkey_access(current, execute_only_pkey,
PKEY_DISABLE_ACCESS);
/*
* If the PKRU-set operation failed somehow, just return
* 0 and effectively disable execute-only support.
*/
if (ret)
return 0;
if (ret) {
mm_set_pkey_free(mm, execute_only_pkey);
return -1;
}
return PKEY_DEDICATED_EXECUTE_ONLY;
/* We got one, store it and use it from here on out */
if (need_to_set_mm_pkey)
mm->context.execute_only_pkey = execute_only_pkey;
return execute_only_pkey;
}
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
......@@ -55,7 +78,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
/* Do this check first since the vm_flags should be hot */
if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
return false;
if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
return false;
return true;
......@@ -99,3 +122,106 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
*/
return vma_pkey(vma);
}
/* PKRU_AD_BIT shifted into the PKRU slot that belongs to @pkey. */
#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
/*
 * Make the default PKRU value (at execve() time) as restrictive
 * as possible. This ensures that any threads clone()'d early
 * in the process's lifetime will not accidentally get access
 * to data which is pkey-protected later on.
 *
 * PKRU_AD_KEY() is set for keys 1 through 15; pkey 0 is the only
 * key left out of the mask.
 */
u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
/*
 * Load the system-wide default PKRU value into the PKRU register.
 *
 * Called from the FPU code when creating a fresh set of FPU
 * registers.  This is a very specific context where the FPU
 * registers are known to be safe for use, so PKRU can be accessed
 * directly.  The fact that PKRU is only available when we are
 * using eagerfpu mode makes this possible.
 */
void copy_init_pkru_to_fpregs(void)
{
	u32 default_pkru = READ_ONCE(init_pkru_value);

	/*
	 * Any write to PKRU takes it out of the XSAVE 'init state',
	 * which increases context switch cost.  So only write when the
	 * new value is non-zero or the register is not already zero;
	 * doing so overrides the PKRU state that came from
	 * 'init_fpstate' with the configured baseline.
	 */
	if (default_pkru || read_pkru())
		write_pkru(default_pkru);
}
/*
 * debugfs read handler: format init_pkru_value as "0x<hex>\n" and copy
 * the requested window of that text to userspace.
 */
static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
				   size_t count, loff_t *ppos)
{
	char text[32];
	unsigned int text_len = sprintf(text, "0x%x\n", init_pkru_value);

	return simple_read_from_buffer(user_buf, count, ppos, text, text_len);
}
/*
 * debugfs write handler: parse a new default PKRU value from userspace
 * and store it in init_pkru_value.
 *
 * Returns @count on success, -EFAULT if the user buffer cannot be
 * read, or -EINVAL if the text is not a valid unsigned integer or the
 * value would restrict pkey 0.
 */
static ssize_t init_pkru_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char input[32];
	ssize_t nbytes = min(count, sizeof(input) - 1);
	u32 parsed;

	if (copy_from_user(input, user_buf, nbytes))
		return -EFAULT;

	/* Terminate the copy so the parser cannot run off the buffer. */
	input[nbytes] = '\0';

	if (kstrtouint(input, 0, &parsed))
		return -EINVAL;

	/*
	 * Refuse settings that disable access or writes for pkey 0;
	 * those would blow the system up immediately.
	 */
	if (parsed & (PKRU_AD_BIT|PKRU_WD_BIT))
		return -EINVAL;

	WRITE_ONCE(init_pkru_value, parsed);

	return count;
}
/*
 * File operations for the "init_pkru" debugfs file: reads go to
 * init_pkru_read_file(), writes go to init_pkru_write_file().
 */
static const struct file_operations fops_init_pkru = {
.read = init_pkru_read_file,
.write = init_pkru_write_file,
.llseek = default_llseek,
};
/*
 * Create the "init_pkru" debugfs file (owner read/write) under
 * arch_debugfs_dir, registered to run as a late initcall.
 * The result of debugfs_create_file() is not checked; this always
 * returns 0.
 */
static int __init create_init_pkru_value(void)
{
debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_init_pkru);
return 0;
}
late_initcall(create_init_pkru_value);
static __init int setup_init_pkru(char *opt)
{
u32 new_init_pkru;
if (kstrtouint(opt, 0, &new_init_pkru))
return 1;
WRITE_ONCE(init_pkru_value, new_init_pkru);
return 1;
}
__setup("init_pkru=", setup_init_pkru);
......@@ -117,4 +117,9 @@
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK 0x3f
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
#endif /* _XTENSA_MMAN_H */
......@@ -4,11 +4,6 @@
#include <linux/mm_types.h>
#include <asm/mmu_context.h>
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
#ifdef CONFIG_ARCH_HAS_PKEYS
#include <asm/pkeys.h>
#else /* ! CONFIG_ARCH_HAS_PKEYS */
......@@ -16,18 +11,34 @@
#define execute_only_pkey(mm) (0)
#define arch_override_mprotect_pkey(vma, prot, pkey) (0)
#define PKEY_DEDICATED_EXECUTE_ONLY 0
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
#define ARCH_VM_PKEY_FLAGS 0
/* Stub variant: only pkey 0 is ever reported as allocated. */
static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
	return !pkey;
}
/* Stub variant: no protection key can be allocated, so always return -1. */
static inline int mm_pkey_alloc(struct mm_struct *mm)
{
return -1;
}
/*
* This is called from mprotect_pkey().
*
* Returns true if the protection keys is valid.
*/
static inline bool validate_pkey(int pkey)
static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
{
if (pkey < 0)
return false;
return (pkey < arch_max_pkey());
WARN_ONCE(1, "free of protection key when disabled");
return -EINVAL;
}
/* Stub variant: nothing is changed; always reports success (0). */
static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
{
return 0;
}
/* Stub variant: intentionally empty. */
static inline void copy_init_pkru_to_fpregs(void)
{
}
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
#endif /* _LINUX_PKEYS_H */
......@@ -898,4 +898,12 @@ asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
unsigned long prot, int pkey);
asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
asmlinkage long sys_pkey_free(int pkey);
//asmlinkage long sys_pkey_get(int pkey, unsigned long flags);
//asmlinkage long sys_pkey_set(int pkey, unsigned long access_rights,
// unsigned long flags);
#endif
......@@ -72,4 +72,9 @@
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK 0x3f
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
#endif /* __ASM_GENERIC_MMAN_COMMON_H */
......@@ -724,9 +724,19 @@ __SYSCALL(__NR_copy_file_range, sys_copy_file_range)
__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2)
#define __NR_pwritev2 287
__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2)
#define __NR_pkey_mprotect 288
__SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect)
#define __NR_pkey_alloc 289
__SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
#define __NR_pkey_free 290
__SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_pkey_get 291
//__SYSCALL(__NR_pkey_get, sys_pkey_get)
#define __NR_pkey_set 292
//__SYSCALL(__NR_pkey_set, sys_pkey_set)
#undef __NR_syscalls
#define __NR_syscalls 288
#define __NR_syscalls 291
/*
* All syscalls below here should go away really,
......
......@@ -250,3 +250,8 @@ cond_syscall(sys_execveat);
/* membarrier */
cond_syscall(sys_membarrier);
/* memory protection keys */
cond_syscall(sys_pkey_mprotect);
cond_syscall(sys_pkey_alloc);
cond_syscall(sys_pkey_free);
......@@ -23,11 +23,13 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include "internal.h"
......@@ -353,8 +355,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
return error;
}
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
/*
* pkey==-1 when doing a legacy mprotect()
*/
static int do_mprotect_pkey(unsigned long start, size_t len,
unsigned long prot, int pkey)
{
unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
......@@ -383,6 +388,14 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
if (down_write_killable(&current->mm->mmap_sem))
return -EINTR;
/*
* If userspace did not allocate the pkey, do not let
* them use it here.
*/
error = -EINVAL;
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;
vma = find_vma(current->mm, start);
error = -ENOMEM;
if (!vma)
......@@ -409,8 +422,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
prev = vma;
for (nstart = start ; ; ) {
unsigned long mask_off_old_flags;
unsigned long newflags;
int pkey = arch_override_mprotect_pkey(vma, prot, -1);
int new_vma_pkey;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
......@@ -418,8 +432,17 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
if (rier && (vma->vm_flags & VM_MAYEXEC))
prot |= PROT_EXEC;
newflags = calc_vm_prot_bits(prot, pkey);
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/*
* Each mprotect() call explicitly passes r/w/x permissions.
* If a permission is not passed to mprotect(), it must be
* cleared from the VMA.
*/
mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
ARCH_VM_PKEY_FLAGS;
new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
newflags = calc_vm_prot_bits(prot, new_vma_pkey);
newflags |= (vma->vm_flags & ~mask_off_old_flags);
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
......@@ -455,3 +478,60 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
up_write(&current->mm->mmap_sem);
return error;
}
/*
 * Legacy mprotect(): forwards to do_mprotect_pkey() with pkey == -1,
 * i.e. without an explicit protection key.
 */
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
return do_mprotect_pkey(start, len, prot, -1);
}
/*
 * pkey_mprotect(): like mprotect(), but passes the caller-supplied
 * @pkey through to do_mprotect_pkey().
 */
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
unsigned long, prot, int, pkey)
{
return do_mprotect_pkey(start, len, prot, pkey);
}
/*
 * pkey_alloc(): allocate a protection key for the calling process and
 * give it the initial access rights in @init_val.
 *
 * Returns the new pkey on success, -EINVAL for unsupported @flags or
 * @init_val bits, -ENOSPC when no key is free, or the error reported
 * by arch_set_user_pkey_access().
 */
SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int new_pkey;
	int err;

	/* No flags are supported yet; rights must fit PKEY_ACCESS_MASK. */
	if (flags || (init_val & ~PKEY_ACCESS_MASK))
		return -EINVAL;

	down_write(&current->mm->mmap_sem);

	new_pkey = mm_pkey_alloc(current->mm);
	if (new_pkey == -1) {
		err = -ENOSPC;
		goto unlock;
	}

	err = arch_set_user_pkey_access(current, new_pkey, init_val);
	if (err)
		/* Setting the rights failed: hand the key back. */
		mm_pkey_free(current->mm, new_pkey);
	else
		err = new_pkey;
unlock:
	up_write(&current->mm->mmap_sem);
	return err;
}
/*
 * pkey_free(): release @pkey for the calling process.
 *
 * Returns whatever mm_pkey_free() reports, with mmap_sem held for
 * writing around the call.
 */
SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int err;

	down_write(&current->mm->mmap_sem);
	err = mm_pkey_free(current->mm, pkey);
	up_write(&current->mm->mmap_sem);

	/*
	 * VMAs that still have this pkey set are not checked for;
	 * warnings or errors could be provided here.
	 */
	return err;
}
......@@ -5,7 +5,8 @@ include ../lib.mk
.PHONY: all all_32 all_64 warn_32bit_failure clean
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test
check_initial_reg_state sigreturn ldt_gdt iopl \
protection_keys
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
......
#ifndef _PKEYS_HELPER_H
#define _PKEYS_HELPER_H
#define _GNU_SOURCE
#include <string.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <signal.h>
#include <assert.h>
#include <stdlib.h>
#include <ucontext.h>
#include <unistd.h>
#include <sys/mman.h>
#define NR_PKEYS 16
#define PKRU_BITS_PER_PKEY 2
#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 0
#endif
#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
extern int dprint_in_signal;
extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
/*
 * printf() replacement that can also be used from signal handlers.
 *
 * Outside a signal handler (dprint_in_signal == 0) this is plain
 * vprintf().  Inside one, the message is formatted into
 * dprint_in_signal_buffer and emitted with a single write() to fd 1.
 * Output longer than the buffer is truncated.
 */
static inline void sigsafe_printf(const char *format, ...)
{
	va_list ap;

	va_start(ap, format);
	if (!dprint_in_signal) {
		vprintf(format, ap);
	} else {
		int len = vsnprintf(dprint_in_signal_buffer,
				    DPRINT_IN_SIGNAL_BUF_SIZE,
				    format, ap);
		/*
		 * vsnprintf() returns the length the complete output
		 * would have had (without the terminating NUL), or a
		 * negative value on error.  When truncated, only
		 * BUF_SIZE - 1 characters are stored, so clamp to that
		 * to avoid writing the NUL byte, and skip the write
		 * entirely when there is nothing valid to emit.
		 */
		if (len >= DPRINT_IN_SIGNAL_BUF_SIZE)
			len = DPRINT_IN_SIGNAL_BUF_SIZE - 1;
		if (len > 0)
			write(1, dprint_in_signal_buffer, len);
	}
	va_end(ap);
}
/*
 * Print through sigsafe_printf() when @level is at or below
 * DEBUG_LEVEL.  Note that fflush(NULL) runs on every invocation,
 * whether or not anything was printed.  @level is parenthesised so
 * that an expression argument compares as a whole.
 */
#define dprintf_level(level, args...) do {	\
	if ((level) <= DEBUG_LEVEL)		\
		sigsafe_printf(args);		\
	fflush(NULL);				\
} while (0)
/* Fixed-level shorthands: dprintfN() prints when DEBUG_LEVEL >= N. */
#define dprintf0(args...) dprintf_level(0, args)
#define dprintf1(args...) dprintf_level(1, args)
#define dprintf2(args...) dprintf_level(2, args)
#define dprintf3(args...) dprintf_level(3, args)
#define dprintf4(args...) dprintf_level(4, args)
extern unsigned int shadow_pkru;
/*
 * Read the PKRU register.
 *
 * The instruction is emitted as raw opcode bytes with ECX preset to 0;
 * the value delivered in EAX is returned and EDX is discarded.
 */
static inline unsigned int __rdpkru(void)
{
	unsigned int lo, hi;
	unsigned int sel = 0;

	asm volatile(".byte 0x0f,0x01,0xee\n\t"
		     : "=a" (lo), "=d" (hi)
		     : "c" (sel));

	return lo;
}
static inline unsigned int _rdpkru(int line)
{
unsigned int pkru = __rdpkru();
dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
line, pkru, shadow_pkru);
assert(pkru == shadow_pkru);
return pkru;
}
#define rdpkru() _rdpkru(__LINE__)
/*
 * Write @pkru to the PKRU register and assert that it reads back
 * unchanged.  The instruction is emitted as raw opcode bytes with the
 * new value in EAX and with ECX and EDX set to 0.  shadow_pkru is not
 * touched here.
 */
static inline void __wrpkru(unsigned int pkru)
{
	unsigned int ecx_zero = 0;
	unsigned int edx_zero = 0;

	dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
	asm volatile(".byte 0x0f,0x01,0xef\n\t"
		     : : "a" (pkru), "c" (ecx_zero), "d" (edx_zero));
	assert(pkru == __rdpkru());
}
/*
 * Checked PKRU write: verify the register against shadow_pkru, write
 * @pkru to the hardware, then record @pkru as the new shadow value.
 */
static inline void wrpkru(unsigned int pkru)
{
	dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);

	/* rdpkru() asserts that hardware and shadow still agree. */
	rdpkru();

	__wrpkru(pkru);
	shadow_pkru = pkru;

	dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
}
/*
 * Allow (@do_allow != 0) or deny access through @pkey by clearing or
 * setting the low bit of that key's two-bit PKRU field.  All other
 * PKRU bits are preserved.
 *
 * Technically racy, since something could change PKRU between the
 * read and the write.
 */
static inline void __pkey_access_allow(int pkey, int do_allow)
{
	unsigned int pkru = rdpkru();
	int bit = pkey * 2;

	if (do_allow)
		pkru &= ~(1U << bit);	/* clear only this key's bit */
	else
		pkru |= (1U << bit);

	wrpkru(pkru);
	dprintf4("pkru now: %08x\n", rdpkru());
}
/*
 * Allow (@do_allow_write != 0) or deny writes through @pkey by
 * clearing or setting the high bit of that key's two-bit PKRU field.
 * All other PKRU bits are preserved.
 *
 * The value is kept in an unsigned int and the mask is built from 1U
 * because the bit index reaches 31 for pkey 15, which would overflow a
 * signed int shift.
 *
 * Technically racy, since something could change PKRU between the
 * read and the write.
 */
static inline void __pkey_write_allow(int pkey, int do_allow_write)
{
	unsigned int pkru = rdpkru();
	int bit = pkey * 2 + 1;

	if (do_allow_write)
		pkru &= ~(1U << bit);	/* clear only this key's bit */
	else
		pkru |= (1U << bit);

	wrpkru(pkru);
	dprintf4("pkru now: %08x\n", rdpkru());
}
#define PROT_PKEY0 0x10 /* protection key value (bit 0) */
#define PROT_PKEY1 0x20 /* protection key value (bit 1) */
#define PROT_PKEY2 0x40 /* protection key value (bit 2) */
#define PROT_PKEY3 0x80 /* protection key value (bit 3) */
#define PAGE_SIZE 4096
#define MB (1<<20)
/*
 * Execute CPUID.
 *
 * On entry *eax and *ecx supply the input values (ecx is often an
 * input as well as an output); on return all four pointers hold the
 * values the instruction left in EAX, EBX, ECX and EDX.
 */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
			   unsigned int *ecx, unsigned int *edx)
{
	asm volatile("cpuid;"
		     : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
		     : "0" (*eax), "2" (*ecx));
}
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */

/*
 * Query CPUID leaf 7, sub-leaf 0 and report whether both the PKU and
 * OSPKE bits are set in ECX.  Returns 1 if so, 0 otherwise.
 */
static inline int cpu_has_pku(void)
{
	unsigned int eax = 0x7;
	unsigned int ebx;
	unsigned int ecx = 0x0;
	unsigned int edx;

	__cpuid(&eax, &ebx, &ecx, &edx);

	if (!(ecx & X86_FEATURE_PKU)) {
		dprintf2("cpu does not have PKU\n");
		return 0;
	}
	if (!(ecx & X86_FEATURE_OSPKE)) {
		dprintf2("cpu does not have OSPKE\n");
		return 0;
	}

	return 1;
}
#define XSTATE_PKRU_BIT	(9)
#define XSTATE_PKRU	0x200

/*
 * Look up where PKRU lives in the XSAVE area.
 *
 * Queries CPUID leaf 0xd with sub-leaf XSTATE_PKRU_BIT, taking EAX as
 * the size of the component and EBX as its offset.  Assumes that
 * XSTATE_PKRU is set in XCR0.
 *
 * Returns the offset, or 0 (after printing a message) when the
 * reported size is 0.
 *
 * Defined 'static inline' because this lives in a header: a plain
 * external definition would clash at link time if the header were
 * included from more than one source file.
 */
static inline int pkru_xstate_offset(void)
{
	unsigned int eax = 0xd;
	unsigned int ebx = 0;
	unsigned int ecx = XSTATE_PKRU_BIT;
	unsigned int edx = 0;
	int xstate_offset;
	int xstate_size;

	__cpuid(&eax, &ebx, &ecx, &edx);
	xstate_offset = ebx;
	xstate_size = eax;

	if (xstate_size == 0) {
		printf("could not find size/offset of PKRU in xsave state\n");
		return 0;
	}

	return xstate_offset;
}
#endif /* _PKEYS_HELPER_H */
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册