diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d9c171ce4190845950e7c14e362265b4d26adc74..372cc66bba23286c485abd8aa178abe2f3119fe7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2233,6 +2233,17 @@ memory contents and reserves bad memory regions that are detected. + mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control + Valid arguments: on, off + Default (depends on kernel configuration option): + on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) + off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n) + mem_encrypt=on: Activate SME + mem_encrypt=off: Do not activate SME + + Refer to Documentation/x86/amd-memory-encryption.txt + for details on when memory encryption can be activated. + mem_sleep_default= [SUSPEND] Default system suspend mode: s2idle - Suspend-To-Idle shallow - Power-On Suspend or equivalent (if supported) @@ -2696,6 +2707,8 @@ nopat [X86] Disable PAT (page attribute table extension of pagetables) support. + nopcid [X86-64] Disable the PCID cpu feature. + norandmaps Don't use address space randomization. Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 3e7b946dea2778441ce0599643535188699e16ca..5e40e1f68873b0f2b594c22a6e69ddb218ce56ec 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -228,7 +228,7 @@ Learning on the device port should be enabled, as well as learning_sync: bridge link set dev DEV learning on self bridge link set dev DEV learning_sync on self -Learning_sync attribute enables syncing of the learned/forgotton FDB entry to +Learning_sync attribute enables syncing of the learned/forgotten FDB entry to the bridge's FDB. It's possible, but not optimal, to enable learning on the device port and on the bridge port, and disable learning_sync. @@ -245,7 +245,7 @@ the responsibility of the port driver/device to age out these entries. If the port device supports ageing, when the FDB entry expires, it will notify the driver which in turn will notify the bridge with SWITCHDEV_FDB_DEL. If the device does not support ageing, the driver can simulate ageing using a -garbage collection timer to monitor FBD entries. Expired entries will be +garbage collection timer to monitor FDB entries. Expired entries will be notified to the bridge using SWITCHDEV_FDB_DEL. See rocker driver for example of driver running ageing timer. diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 14db18c970b1b048c0ecc50d75e10efb674715ca..28596e03220b5528891c19316b7cfe2b53fcf331 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -35,9 +35,34 @@ Table : Subdirectories in /proc/sys/net bpf_jit_enable -------------- -This enables Berkeley Packet Filter Just in Time compiler. -Currently supported on x86_64 architecture, bpf_jit provides a framework -to speed packet filtering, the one used by tcpdump/libpcap for example. +This enables the BPF Just in Time (JIT) compiler. BPF is a flexible +and efficient infrastructure allowing to execute bytecode at various +hook points. It is used in a number of Linux kernel subsystems such +as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints) +and security (e.g. seccomp). LLVM has a BPF back end that can compile +restricted C into a sequence of BPF instructions. After program load +through bpf(2) and passing a verifier in the kernel, a JIT will then +translate these BPF proglets into native CPU instructions. There are +two flavors of JITs, the newer eBPF JIT currently supported on: + - x86_64 + - arm64 + - ppc64 + - sparc64 + - mips64 + - s390x + +And the older cBPF JIT supported on the following archs: + - arm + - mips + - ppc + - sparc + +eBPF JITs are a superset of cBPF JITs, meaning the kernel will +migrate cBPF instructions into eBPF instructions and then JIT +compile them transparently. Older cBPF JITs can only translate +tcpdump filters, seccomp rules, etc, but not mentioned eBPF +programs loaded through bpf(2). + Values : 0 - disable the JIT (default value) 1 - enable the JIT @@ -46,9 +71,9 @@ Values : bpf_jit_harden -------------- -This enables hardening for the Berkeley Packet Filter Just in Time compiler. -Supported are eBPF JIT backends. Enabling hardening trades off performance, -but can mitigate JIT spraying. +This enables hardening for the BPF JIT compiler. Supported are eBPF +JIT backends. Enabling hardening trades off performance, but can +mitigate JIT spraying. Values : 0 - disable JIT hardening (default value) 1 - enable JIT hardening for unprivileged users only @@ -57,11 +82,11 @@ Values : bpf_jit_kallsyms ---------------- -When Berkeley Packet Filter Just in Time compiler is enabled, then compiled -images are unknown addresses to the kernel, meaning they neither show up in -traces nor in /proc/kallsyms. This enables export of these addresses, which -can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature -is disabled. +When BPF JIT compiler is enabled, then compiled images are unknown +addresses to the kernel, meaning they neither show up in traces nor +in /proc/kallsyms. This enables export of these addresses, which can +be used for debugging/tracing. If bpf_jit_harden is enabled, this +feature is disabled. Values : 0 - disable JIT kallsyms export (default value) 1 - enable JIT kallsyms export for privileged users only diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt new file mode 100644 index 0000000000000000000000000000000000000000..f512ab7185411e813a233f0473853b41a94f7d2f --- /dev/null +++ b/Documentation/x86/amd-memory-encryption.txt @@ -0,0 +1,68 @@ +Secure Memory Encryption (SME) is a feature found on AMD processors. + +SME provides the ability to mark individual pages of memory as encrypted using +the standard x86 page tables. A page that is marked encrypted will be +automatically decrypted when read from DRAM and encrypted when written to +DRAM. SME can therefore be used to protect the contents of DRAM from physical +attacks on the system. + +A page is encrypted when a page table entry has the encryption bit set (see +below on how to determine its position). The encryption bit can also be +specified in the cr3 register, allowing the PGD table to be encrypted. Each +successive level of page tables can also be encrypted by setting the encryption +bit in the page table entry that points to the next table. This allows the full +page table hierarchy to be encrypted. Note, this means that just because the +encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted. +Each page table entry in the hierarchy needs to have the encryption bit set to +achieve that. So, theoretically, you could have the encryption bit set in cr3 +so that the PGD is encrypted, but not set the encryption bit in the PGD entry +for a PUD which results in the PUD pointed to by that entry to not be +encrypted. + +Support for SME can be determined through the CPUID instruction. The CPUID +function 0x8000001f reports information related to SME: + + 0x8000001f[eax]: + Bit[0] indicates support for SME + 0x8000001f[ebx]: + Bits[5:0] pagetable bit number used to activate memory + encryption + Bits[11:6] reduction in physical address space, in bits, when + memory encryption is enabled (this only affects + system physical addresses, not guest physical + addresses) + +If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to +determine if SME is enabled and/or to enable memory encryption: + + 0xc0010010: + Bit[23] 0 = memory encryption features are disabled + 1 = memory encryption features are enabled + +Linux relies on BIOS to set this bit if BIOS has determined that the reduction +in the physical address space as a result of enabling memory encryption (see +CPUID information above) will not conflict with the address space resource +requirements for the system. If this bit is not set upon Linux startup then +Linux itself will not set it and memory encryption will not be possible. + +The state of SME in the Linux kernel can be documented as follows: + - Supported: + The CPU supports SME (determined through CPUID instruction). + + - Enabled: + Supported and bit 23 of MSR_K8_SYSCFG is set. + + - Active: + Supported, Enabled and the Linux kernel is actively applying + the encryption bit to page table entries (the SME mask in the + kernel is non-zero). + +SME can also be enabled and activated in the BIOS. If SME is enabled and +activated in the BIOS, then all memory accesses will be encrypted and it will +not be necessary to activate the Linux memory encryption support. If the BIOS +merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate +memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or +by supplying mem_encrypt=on on the kernel command line. However, if BIOS does +not enable SME, then Linux will not be able to activate memory encryption, even +if configured to do so by default or the mem_encrypt=on command line parameter +is specified. diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt index b643045408218669de1af81b9a9a661c035ffc70..fa46dcb347bc1d2ac60901c4621bd3bad81de601 100644 --- a/Documentation/x86/protection-keys.txt +++ b/Documentation/x86/protection-keys.txt @@ -34,7 +34,7 @@ with a key. In this example WRPKRU is wrapped by a C function called pkey_set(). int real_prot = PROT_READ|PROT_WRITE; - pkey = pkey_alloc(0, PKEY_DENY_WRITE); + pkey = pkey_alloc(0, PKEY_DISABLE_WRITE); ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); ... application runs here @@ -42,9 +42,9 @@ called pkey_set(). Now, if the application needs to update the data at 'ptr', it can gain access, do the update, then remove its write access: - pkey_set(pkey, 0); // clear PKEY_DENY_WRITE + pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE *ptr = foo; // assign something - pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again + pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again Now when it frees the memory, it will also free the pkey since it is no longer in use: diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt new file mode 100644 index 0000000000000000000000000000000000000000..087251a0d99cbd6ea2a76a1da687c3fbe528e769 --- /dev/null +++ b/Documentation/x86/x86_64/5level-paging.txt @@ -0,0 +1,64 @@ +== Overview == + +Original x86-64 was limited by 4-level paing to 256 TiB of virtual address +space and 64 TiB of physical address space. We are already bumping into +this limit: some vendors offers servers with 64 TiB of memory today. + +To overcome the limitation upcoming hardware will introduce support for +5-level paging. It is a straight-forward extension of the current page +table structure adding one more layer of translation. + +It bumps the limits to 128 PiB of virtual address space and 4 PiB of +physical address space. This "ought to be enough for anybody" ©. + +QEMU 2.9 and later support 5-level paging. + +Virtual memory layout for 5-level paging is described in +Documentation/x86/x86_64/mm.txt + +== Enabling 5-level paging == + +CONFIG_X86_5LEVEL=y enables the feature. + +So far, a kernel compiled with the option enabled will be able to boot +only on machines that supports the feature -- see for 'la57' flag in +/proc/cpuinfo. + +The plan is to implement boot-time switching between 4- and 5-level paging +in the future. + +== User-space and large virtual address space == + +On x86, 5-level paging enables 56-bit userspace virtual address space. +Not all user space is ready to handle wide addresses. It's known that +at least some JIT compilers use higher bits in pointers to encode their +information. It collides with valid pointers with 5-level paging and +leads to crashes. + +To mitigate this, we are not going to allocate virtual address space +above 47-bit by default. + +But userspace can ask for allocation from full address space by +specifying hint address (with or without MAP_FIXED) above 47-bits. + +If hint address set above 47-bit, but MAP_FIXED is not specified, we try +to look for unmapped area by specified address. If it's already +occupied, we look for unmapped area in *full* address space, rather than +from 47-bit window. + +A high hint address would only affect the allocation in question, but not +any future mmap()s. + +Specifying high hint address on older kernel or on machine without 5-level +paging support is safe. The hint will be ignored and kernel will fall back +to allocation from 47-bit address space. + +This approach helps to easily make application's memory allocator aware +about large address space without manually tracking allocated virtual +address space. + +One important case we need to handle here is interaction with MPX. +MPX (without MAWA extension) cannot handle addresses above 47-bit, so we +need to make sure that MPX cannot be enabled we already have VMA above +the boundary and forbid creating such VMAs once MPX is enabled. + diff --git a/Makefile b/Makefile index 235826f957411e8a3f02225439c38d92adf00802..dda88e744d5f3132ab926c43f512fe49f5a0c3b4 100644 --- a/Makefile +++ b/Makefile @@ -396,7 +396,7 @@ LINUXINCLUDE := \ KBUILD_CPPFLAGS := -D__KERNEL__ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ - -fno-strict-aliasing -fno-common \ + -fno-strict-aliasing -fno-common -fshort-wchar \ -Werror-implicit-function-declaration \ -Wno-format-security \ -std=gnu89 $(call cc-option,-fno-PIE) @@ -442,7 +442,7 @@ export RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn \ # =========================================================================== # Rules shared between *config targets and build targets -# Basic helpers built in scripts/ +# Basic helpers built in scripts/basic/ PHONY += scripts_basic scripts_basic: $(Q)$(MAKE) $(build)=scripts/basic @@ -505,7 +505,7 @@ ifeq ($(KBUILD_EXTMOD),) endif endif endif -# install and module_install need also be processed one by one +# install and modules_install need also be processed one by one ifneq ($(filter install,$(MAKECMDGOALS)),) ifneq ($(filter modules_install,$(MAKECMDGOALS)),) mixed-targets := 1 @@ -964,7 +964,7 @@ export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y2) $(drivers-y) $(net-y) $(virt- export KBUILD_VMLINUX_LIBS := $(libs-y1) export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds export LDFLAGS_vmlinux -# used by scripts/pacmage/Makefile +# used by scripts/package/Makefile export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools) vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) $(KBUILD_VMLINUX_LIBS) @@ -992,8 +992,8 @@ include/generated/autoksyms.h: FORCE ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink) # Final link of vmlinux with optional arch pass after final link - cmd_link-vmlinux = \ - $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) ; \ +cmd_link-vmlinux = \ + $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) ; \ $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) vmlinux: scripts/link-vmlinux.sh vmlinux_prereq $(vmlinux-deps) FORCE @@ -1184,6 +1184,7 @@ PHONY += kselftest kselftest: $(Q)$(MAKE) -C tools/testing/selftests run_tests +PHONY += kselftest-clean kselftest-clean: $(Q)$(MAKE) -C tools/testing/selftests clean diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index a5459698f0ee3762917c82c8cbf133e34643f021..7db85ab00c522315c3108930365fc57c59fd311c 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -96,7 +96,6 @@ menu "ARC Architecture Configuration" menu "ARC Platform/SoC/Board" -source "arch/arc/plat-sim/Kconfig" source "arch/arc/plat-tb10x/Kconfig" source "arch/arc/plat-axs10x/Kconfig" #New platform adds here diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 44ef35d339564623b2beb4a380a81da6a7790087..3a61cfcc38c0dd78f58247512a4d31231a2daa4b 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -107,7 +107,7 @@ core-y += arch/arc/ # w/o this dtb won't embed into kernel binary core-y += arch/arc/boot/dts/ -core-$(CONFIG_ARC_PLAT_SIM) += arch/arc/plat-sim/ +core-y += arch/arc/plat-sim/ core-$(CONFIG_ARC_PLAT_TB10X) += arch/arc/plat-tb10x/ core-$(CONFIG_ARC_PLAT_AXS10X) += arch/arc/plat-axs10x/ core-$(CONFIG_ARC_PLAT_EZNPS) += arch/arc/plat-eznps/ diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi index 53ce226f77a59857615f8fc53fca3ba1c4f09749..a380ffa1a4589b11de92a5a0c21bc831e6dc751b 100644 --- a/arch/arc/boot/dts/axc001.dtsi +++ b/arch/arc/boot/dts/axc001.dtsi @@ -15,15 +15,15 @@ / { compatible = "snps,arc"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; cpu_card { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; core_clk: core_clk { #clock-cells = <0>; @@ -91,23 +91,21 @@ mb_intc: dw-apb-ictl@0xe0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; - reg = < 0xe0012000 0x200 >; + reg = < 0x0 0xe0012000 0x0 0x200 >; interrupt-controller; interrupt-parent = <&core_intc>; interrupts = < 7 >; }; memory { - #address-cells = <1>; - #size-cells = <1>; - ranges = <0x00000000 0x80000000 0x20000000>; device_type = "memory"; - reg = <0x80000000 0x1b000000>; /* (512 - 32) MiB */ + /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */ + reg = <0x0 0x80000000 0x0 0x1b000000>; /* (512 - 32) MiB */ }; reserved-memory { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; ranges; /* * We just move frame buffer area to the very end of @@ -118,7 +116,7 @@ */ frame_buffer: frame_buffer@9e000000 { compatible = "shared-dma-pool"; - reg = <0x9e000000 0x2000000>; + reg = <0x0 0x9e000000 0x0 0x2000000>; no-map; }; }; diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index 14df46f141bf3450b8df0660bc0f248beef4669b..cc9239ef8d08c9f6efbe6dbee0323f283c4f5e37 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -14,15 +14,15 @@ / { compatible = "snps,arc"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; cpu_card { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; core_clk: core_clk { #clock-cells = <0>; @@ -94,30 +94,29 @@ mb_intc: dw-apb-ictl@0xe0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; - reg = < 0xe0012000 0x200 >; + reg = < 0x0 0xe0012000 0x0 0x200 >; interrupt-controller; interrupt-parent = <&core_intc>; interrupts = < 24 >; }; memory { - #address-cells = <1>; - #size-cells = <1>; - ranges = <0x00000000 0x80000000 0x40000000>; device_type = "memory"; - reg = <0x80000000 0x20000000>; /* 512MiB */ + /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */ + reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MiB low mem */ + 0x1 0xc0000000 0x0 0x40000000>; /* 1 GiB highmem */ }; reserved-memory { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; ranges; /* * Move frame buffer out of IOC aperture (0x8z-0xAz). */ frame_buffer: frame_buffer@be000000 { compatible = "shared-dma-pool"; - reg = <0xbe000000 0x2000000>; + reg = <0x0 0xbe000000 0x0 0x2000000>; no-map; }; }; diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index 695f9fa1996bcbb2689e1bf98dc8fa661cc1c216..4ebb2170abecc7e00cd581a7ca6dcef6e94231e0 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -14,15 +14,15 @@ / { compatible = "snps,arc"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; cpu_card { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; core_clk: core_clk { #clock-cells = <0>; @@ -100,30 +100,29 @@ mb_intc: dw-apb-ictl@0xe0012000 { #interrupt-cells = <1>; compatible = "snps,dw-apb-ictl"; - reg = < 0xe0012000 0x200 >; + reg = < 0x0 0xe0012000 0x0 0x200 >; interrupt-controller; interrupt-parent = <&idu_intc>; interrupts = <0>; }; memory { - #address-cells = <1>; - #size-cells = <1>; - ranges = <0x00000000 0x80000000 0x40000000>; device_type = "memory"; - reg = <0x80000000 0x20000000>; /* 512MiB */ + /* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */ + reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MiB low mem */ + 0x1 0xc0000000 0x0 0x40000000>; /* 1 GiB highmem */ }; reserved-memory { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; ranges; /* * Move frame buffer out of IOC aperture (0x8z-0xAz). */ frame_buffer: frame_buffer@be000000 { compatible = "shared-dma-pool"; - reg = <0xbe000000 0x2000000>; + reg = <0x0 0xbe000000 0x0 0x2000000>; no-map; }; }; diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index 41cfb29b62c142ca48193359e8fedf20151cc9bf..0ff7e07edcd4d28bcf6266cedd590034b8aebd13 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -13,7 +13,7 @@ compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; - ranges = <0x00000000 0xe0000000 0x10000000>; + ranges = <0x00000000 0x0 0xe0000000 0x10000000>; interrupt-parent = <&mb_intc>; i2sclk: i2sclk@100a0 { diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig index 57b3e599322f83d5babd0014c7d9ccf3a03578eb..db04ea4dd2d97237b843fd4b90706f2e0c8ef6d9 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -21,7 +21,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index f85985adebb245981d8a4130e8c0b91a45abe7dd..821a2e562f3f12422844556630d7136ad63097a8 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_SMP=y CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs_idu" diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig index ede625c76216616b097365a3274e09ccea97d3d6..7c9c706ae7f66eb29d4cf48ca0b95d5dd44630f5 100644 --- a/arch/arc/configs/nps_defconfig +++ b/arch/arc/configs/nps_defconfig @@ -39,7 +39,6 @@ CONFIG_IP_PNP=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig index b0066a749d4c49d8a23e3bd33b4a52789789c913..6dff83a238b8591ba08f889e7d7013734ff587db 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ARC_BUILTIN_DTB_NAME="nsim_700" CONFIG_PREEMPT=y # CONFIG_COMPACTION is not set diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig index ebe9ebb92933302af79f98dae000e8567b86ec6b..31ee51b987e7c5b97c2f044794f9d1c5b4eea73e 100644 --- a/arch/arc/configs/nsim_hs_defconfig +++ b/arch/arc/configs/nsim_hs_defconfig @@ -26,7 +26,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig index 4bde43278be6757c5c739bfd58a9f030a80c7568..8d3b1f67cae421c423e55c2622b5a78ea3d67794 100644 --- a/arch/arc/configs/nsim_hs_smp_defconfig +++ b/arch/arc/configs/nsim_hs_smp_defconfig @@ -24,7 +24,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_SMP=y CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs_idu" diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index f6fb3d26557eb7c63b2f77263acfed80f0b45387..6168ce2ac2efdd869705a8a75e78f7f9b9c55af4 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci" # CONFIG_COMPACTION is not set CONFIG_NET=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig index b9f0fe00044b6c44d62a81a91064583cd9badee3..a70bdeb2b3fd03bdefd10388e69cd8e5b120ac0f 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci_hs" # CONFIG_COMPACTION is not set diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig index 155add7761ed63080bd81e830c819483fb55ee61..ef96406c446e823d4706e5c84b6ac0626d4b5812 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -18,7 +18,6 @@ CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set -CONFIG_ARC_PLAT_SIM=y CONFIG_ISA_ARCV2=y CONFIG_SMP=y # CONFIG_ARC_TIMERS_64BIT is not set diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 4c5118384eb5f1c393208cde601fddc1be773494..f3018254939508e373662a2ae98f9d08be9425a2 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -38,7 +38,6 @@ CONFIG_IP_MULTICAST=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index 19ebddffb279db05ca8e53e2c4753665376cc9ad..02fd1cece6ef339209c993a6b56b87312e39d648 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -96,7 +96,9 @@ extern unsigned long perip_base, perip_end; #define ARC_REG_SLC_FLUSH 0x904 #define ARC_REG_SLC_INVALIDATE 0x905 #define ARC_REG_SLC_RGN_START 0x914 +#define ARC_REG_SLC_RGN_START1 0x915 #define ARC_REG_SLC_RGN_END 0x916 +#define ARC_REG_SLC_RGN_END1 0x917 /* Bit val in SLC_CONTROL */ #define SLC_CTRL_DIS 0x001 diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h index db7319e9b506e9ece826fde788aba60adada59cb..efb79fafff1d12214dc41923a75b237a969214f5 100644 --- a/arch/arc/include/asm/mmu.h +++ b/arch/arc/include/asm/mmu.h @@ -94,6 +94,8 @@ static inline int is_pae40_enabled(void) return IS_ENABLED(CONFIG_ARC_HAS_PAE40); } +extern int pae40_exist_but_not_enab(void); + #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c index f928795fd07a813e9f6f3e6d86f325b45561b3dc..cf90714a676d6baabbdc1beb2119fe530e1c84f3 100644 --- a/arch/arc/kernel/intc-arcv2.c +++ b/arch/arc/kernel/intc-arcv2.c @@ -75,10 +75,13 @@ void arc_init_IRQ(void) * Set a default priority for all available interrupts to prevent * switching of register banks if Fast IRQ and multiple register banks * are supported by CPU. + * Also disable all IRQ lines so faulty external hardware won't + * trigger interrupt that kernel is not ready to handle. */ for (i = NR_EXCEPTIONS; i < irq_bcr.irqs + NR_EXCEPTIONS; i++) { write_aux_reg(AUX_IRQ_SELECT, i); write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO); + write_aux_reg(AUX_IRQ_ENABLE, 0); } /* setup status32, don't enable intr yet as kernel doesn't want */ diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c index 7e608c6b0a018697fb22c52c7fb4ac46a45fe2f0..cef388025adf43aed091869f102e01e1bee72092 100644 --- a/arch/arc/kernel/intc-compact.c +++ b/arch/arc/kernel/intc-compact.c @@ -27,7 +27,7 @@ */ void arc_init_IRQ(void) { - int level_mask = 0; + int level_mask = 0, i; /* Is timer high priority Interrupt (Level2 in ARCompact jargon) */ level_mask |= IS_ENABLED(CONFIG_ARC_COMPACT_IRQ_LEVELS) << TIMER0_IRQ; @@ -40,6 +40,18 @@ void arc_init_IRQ(void) if (level_mask) pr_info("Level-2 interrupts bitset %x\n", level_mask); + + /* + * Disable all IRQ lines so faulty external hardware won't + * trigger interrupt that kernel is not ready to handle. + */ + for (i = TIMER0_IRQ; i < NR_CPU_IRQS; i++) { + unsigned int ienb; + + ienb = read_aux_reg(AUX_IENABLE); + ienb &= ~(1 << i); + write_aux_reg(AUX_IENABLE, ienb); + } } /* diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index a867575a758b9845c09a84fb4515f17f08f1272b..7db283b46ebde8daccf779525e82526e47ad2722 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -665,6 +665,7 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op) static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned int ctrl; + phys_addr_t end; spin_lock_irqsave(&lock, flags); @@ -694,8 +695,19 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op) * END needs to be setup before START (latter triggers the operation) * END can't be same as START, so add (l2_line_sz - 1) to sz */ - write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1)); - write_aux_reg(ARC_REG_SLC_RGN_START, paddr); + end = paddr + sz + l2_line_sz - 1; + if (is_pae40_enabled()) + write_aux_reg(ARC_REG_SLC_RGN_END1, upper_32_bits(end)); + + write_aux_reg(ARC_REG_SLC_RGN_END, lower_32_bits(end)); + + if (is_pae40_enabled()) + write_aux_reg(ARC_REG_SLC_RGN_START1, upper_32_bits(paddr)); + + write_aux_reg(ARC_REG_SLC_RGN_START, lower_32_bits(paddr)); + + /* Make sure "busy" bit reports correct stataus, see STAR 9001165532 */ + read_aux_reg(ARC_REG_SLC_CTRL); while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY); @@ -1111,6 +1123,13 @@ noinline void __init arc_ioc_setup(void) __dc_enable(); } +/* + * Cache related boot time checks/setups only needed on master CPU: + * - Geometry checks (kernel build and hardware agree: e.g. L1_CACHE_BYTES) + * Assume SMP only, so all cores will have same cache config. A check on + * one core suffices for all + * - IOC setup / dma callbacks only need to be done once + */ void __init arc_cache_init_master(void) { unsigned int __maybe_unused cpu = smp_processor_id(); @@ -1190,12 +1209,27 @@ void __ref arc_cache_init(void) printk(arc_cache_mumbojumbo(0, str, sizeof(str))); - /* - * Only master CPU needs to execute rest of function: - * - Assume SMP so all cores will have same cache config so - * any geomtry checks will be same for all - * - IOC setup / dma callbacks only need to be setup once - */ if (!cpu) arc_cache_init_master(); + + /* + * In PAE regime, TLB and cache maintenance ops take wider addresses + * And even if PAE is not enabled in kernel, the upper 32-bits still need + * to be zeroed to keep the ops sane. + * As an optimization for more common !PAE enabled case, zero them out + * once at init, rather than checking/setting to 0 for every runtime op + */ + if (is_isa_arcv2() && pae40_exist_but_not_enab()) { + + if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE)) + write_aux_reg(ARC_REG_IC_PTAG_HI, 0); + + if (IS_ENABLED(CONFIG_ARC_HAS_DCACHE)) + write_aux_reg(ARC_REG_DC_PTAG_HI, 0); + + if (l2_line_sz) { + write_aux_reg(ARC_REG_SLC_RGN_END1, 0); + write_aux_reg(ARC_REG_SLC_RGN_START1, 0); + } + } } diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 71d3efff99d35c56c6f30bea8a6ab1cf1702d753..e9d93604ad0ff7b76a5458235b3ab4d0ef18aefb 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -153,6 +153,19 @@ static void _dma_cache_sync(phys_addr_t paddr, size_t size, } } +/* + * arc_dma_map_page - map a portion of a page for streaming DMA + * + * Ensure that any data held in the cache is appropriately discarded + * or written back. + * + * The device owns this memory once this call has completed. The CPU + * can regain ownership by calling dma_unmap_page(). + * + * Note: while it takes struct page as arg, caller can "abuse" it to pass + * a region larger than PAGE_SIZE, provided it is physically contiguous + * and this still works correctly + */ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -165,6 +178,24 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page, return plat_phys_to_dma(dev, paddr); } +/* + * arc_dma_unmap_page - unmap a buffer previously mapped through dma_map_page() + * + * After this call, reads by the CPU to the buffer are guaranteed to see + * whatever the device wrote there. + * + * Note: historically this routine was not implemented for ARC + */ +static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t paddr = plat_dma_to_phys(dev, handle); + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + _dma_cache_sync(paddr, size, dir); +} + static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -178,6 +209,18 @@ static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg, return nents; } +static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) + arc_dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir, + attrs); +} + static void arc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { @@ -223,7 +266,9 @@ const struct dma_map_ops arc_dma_ops = { .free = arc_dma_free, .mmap = arc_dma_mmap, .map_page = arc_dma_map_page, + .unmap_page = arc_dma_unmap_page, .map_sg = arc_dma_map_sg, + .unmap_sg = arc_dma_unmap_sg, .sync_single_for_device = arc_dma_sync_single_for_device, .sync_single_for_cpu = arc_dma_sync_single_for_cpu, .sync_sg_for_cpu = arc_dma_sync_sg_for_cpu, diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index d0126fdfe2d8546d62883cbef419e4b76589cf18..b181f3ee38aab54565d8de8c1e5136375c6358cc 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -104,6 +104,8 @@ /* A copy of the ASID from the PID reg is kept in asid_cache */ DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE; +static int __read_mostly pae_exists; + /* * Utility Routine to erase a J-TLB entry * Caller needs to setup Index Reg (manually or via getIndex) @@ -784,7 +786,7 @@ void read_decode_mmu_bcr(void) mmu->u_dtlb = mmu4->u_dtlb * 4; mmu->u_itlb = mmu4->u_itlb * 4; mmu->sasid = mmu4->sasid; - mmu->pae = mmu4->pae; + pae_exists = mmu->pae = mmu4->pae; } } @@ -809,6 +811,11 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len) return buf; } +int pae40_exist_but_not_enab(void) +{ + return pae_exists && !is_pae40_enabled(); +} + void arc_mmu_init(void) { char str[256]; @@ -859,6 +866,9 @@ void arc_mmu_init(void) /* swapper_pg_dir is the pgd for the kernel, used by vmalloc */ write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir); #endif + + if (pae40_exist_but_not_enab()) + write_aux_reg(ARC_REG_TLBPD1HI, 0); } /* diff --git a/arch/arc/plat-sim/Kconfig b/arch/arc/plat-sim/Kconfig deleted file mode 100644 index ac6af96a82f3208980f80e07d488a02761ede1a5..0000000000000000000000000000000000000000 --- a/arch/arc/plat-sim/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -# -# Copyright (C) 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. -# - -menuconfig ARC_PLAT_SIM - bool "ARC nSIM based simulation virtual platforms" - help - Support for nSIM based ARC simulation platforms - This includes the standalone nSIM (uart only) vs. System C OSCI VP diff --git a/arch/arc/plat-sim/platform.c b/arch/arc/plat-sim/platform.c index aea87389e44bd517b196ec87de0edb4c0216b4b1..5cda56b1a2ead7bc54ca1ac63296fcd1f54fc43f 100644 --- a/arch/arc/plat-sim/platform.c +++ b/arch/arc/plat-sim/platform.c @@ -20,11 +20,14 @@ */ static const char *simulation_compat[] __initconst = { +#ifdef CONFIG_ISA_ARCOMPACT "snps,nsim", - "snps,nsim_hs", "snps,nsimosci", +#else + "snps,nsim_hs", "snps,nsimosci_hs", "snps,zebu_hs", +#endif NULL, }; diff --git a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi index f92f957412073279a3be9713794bb2336dbf4752..a183b56283f8ff9e9d41d616edfc00a82d200267 100644 --- a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi +++ b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi @@ -266,6 +266,7 @@ &hdmicec { status = "okay"; + needs-hpd; }; &hsi2c_4 { diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig index d735e5fc47727b0536c943b6fdb15223d7394360..195da38cb9a220f22c2f890c5bcb8fc1f2c7c621 100644 --- a/arch/arm/mach-at91/Kconfig +++ b/arch/arm/mach-at91/Kconfig @@ -1,7 +1,7 @@ menuconfig ARCH_AT91 bool "Atmel SoCs" depends on ARCH_MULTI_V4T || ARCH_MULTI_V5 || ARCH_MULTI_V7 || ARM_SINGLE_ARMV7M - select ARM_CPU_SUSPEND if PM + select ARM_CPU_SUSPEND if PM && ARCH_MULTI_V7 select COMMON_CLK_AT91 select GPIOLIB select PINCTRL diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 667fddac38561eff3f25788ccbf9e05ac8214d84..5036f996e694a533d66f82743b901385ae048007 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -608,6 +608,9 @@ static void __init at91_pm_init(void (*pm_idle)(void)) void __init at91rm9200_pm_init(void) { + if (!IS_ENABLED(CONFIG_SOC_AT91RM9200)) + return; + at91_dt_ramc(); /* @@ -620,18 +623,27 @@ void __init at91rm9200_pm_init(void) void __init at91sam9_pm_init(void) { + if (!IS_ENABLED(CONFIG_SOC_AT91SAM9)) + return; + at91_dt_ramc(); at91_pm_init(at91sam9_idle); } void __init sama5_pm_init(void) { + if (!IS_ENABLED(CONFIG_SOC_SAMA5)) + return; + at91_dt_ramc(); at91_pm_init(NULL); } void __init sama5d2_pm_init(void) { + if (!IS_ENABLED(CONFIG_SOC_SAMA5D2)) + return; + at91_pm_backup_init(); sama5_pm_init(); } diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 06da8ea16bbe5e150bb7330ab0049c7438b25f8d..c7b4995868e14d78216c4b06ce87476dfe8a4789 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -161,9 +161,11 @@ void fpsimd_flush_thread(void) { if (!system_supports_fpsimd()) return; + preempt_disable(); memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); + preempt_enable(); } /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 973df7de7bf8d3e4fcde6dc2ba1590fa051964a2..adb0910b88f5f26c1562a7fef6f00f29ee9ec391 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -354,7 +354,6 @@ __primary_switched: tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? b.ne 0f mov x0, x21 // pass FDT address in x0 - mov x1, x23 // pass modulo offset in x1 bl kaslr_early_init // parse FDT for KASLR options cbz x0, 0f // KASLR disabled? just proceed orr x23, x23, x0 // record KASLR offset diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index a9710efb8c017a735f9aa6518463d11b8d6264c9..47080c49cc7e77a3df120f061f7b10f90a8c2ec5 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -75,7 +75,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, * containing function pointers) to be reinitialized, and zero-initialized * .bss variables will be reset to 0. */ -u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset) +u64 __init kaslr_early_init(u64 dt_phys) { void *fdt; u64 seed, offset, mask, module_range; @@ -131,15 +131,17 @@ u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset) /* * The kernel Image should not extend across a 1GB/32MB/512MB alignment * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this - * happens, increase the KASLR offset by the size of the kernel image - * rounded up by SWAPPER_BLOCK_SIZE. + * happens, round down the KASLR offset by (1 << SWAPPER_TABLE_SHIFT). + * + * NOTE: The references to _text and _end below will already take the + * modulo offset (the physical displacement modulo 2 MB) into + * account, given that the physical placement is controlled by + * the loader, and will not change as a result of the virtual + * mapping we choose. */ - if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) != - (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) { - u64 kimg_sz = _end - _text; - offset = (offset + round_up(kimg_sz, SWAPPER_BLOCK_SIZE)) - & mask; - } + if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + offset = round_down(offset, 1 << SWAPPER_TABLE_SHIFT); if (IS_ENABLED(CONFIG_KASAN)) /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2509e4fe699225675f74876032e22b24b338a3b1..1f22a41565a38ac08083b29fcad216ca4ea1a2c9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -435,8 +435,11 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, * the mmap_sem because it would already be released * in __lock_page_or_retry in mm/filemap.c. */ - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current)) { + if (!user_mode(regs)) + goto no_context; return 0; + } /* * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index a3d0211970e95e5152edbaeeecd8ab3a42041ac6..c86a947f5368633b86d1a8e5dc7ddd1803982c64 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; } -#define acpi_unlazy_tlb(x) - #ifdef CONFIG_ACPI_NUMA extern cpumask_t early_cpu_possible_map; #define for_each_possible_early_cpu(cpu) \ diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 121295637d0df831fbbf919983ee9a0cd1a7d366..81416000c5e07c18f16032bb0b9c9f77abc8b883 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size) return 0; } -u32 +int efi_mem_type (unsigned long phys_addr) { efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); if (md) return md->type; - return 0; + return -EINVAL; } u64 diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index 0efd0583a8c9da1afc2acc508f2e2e1a5388441d..6249214148c24bb44f2203117200768a410cee2d 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -68,6 +68,7 @@ typedef struct { unsigned long iopgprot; } iopgprot_t; #define iopgprot_val(x) ((x).iopgprot) #define __pte(x) ((pte_t) { (x) } ) +#define __pmd(x) ((pmd_t) { { (x) }, }) #define __iopte(x) ((iopte_t) { (x) } ) #define __pgd(x) ((pgd_t) { (x) } ) #define __ctxd(x) ((ctxd_t) { (x) } ) @@ -95,6 +96,7 @@ typedef unsigned long iopgprot_t; #define iopgprot_val(x) (x) #define __pte(x) (x) +#define __pmd(x) ((pmd_t) { { (x) }, }) #define __iopte(x) (x) #define __pgd(x) (x) #define __ctxd(x) (x) diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index f10e2f7123949dedc8904ea37895d402ed097d2b..9ebebf1fd93d2f5db8c09804c336b006041b28af 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -1266,8 +1266,6 @@ static int pci_sun4v_probe(struct platform_device *op) * ATU group, but ATU hcalls won't be available. */ hv_atu = false; - pr_err(PFX "Could not register hvapi ATU err=%d\n", - err); } else { pr_info(PFX "Registered hvapi ATU major[%lu] minor[%lu]\n", vatu_major, vatu_minor); diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index a38787b843220807d0f5527f864981fc75e8486f..732af9a9f6ddef510e39da7d6eca5547e210296e 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -602,7 +602,7 @@ void pcibios_fixup_bus(struct pci_bus *bus) { struct pci_dev *dev; int i, has_io, has_mem; - unsigned int cmd; + unsigned int cmd = 0; struct linux_pcic *pcic; /* struct linux_pbm_info* pbm = &pcic->pbm; */ int node; diff --git a/arch/sparc/lib/multi3.S b/arch/sparc/lib/multi3.S index d6b6c97fe3c73061f911737a5da23ab41a3eab77..703127aaf4a5015ee4a1099e6129735f7162222b 100644 --- a/arch/sparc/lib/multi3.S +++ b/arch/sparc/lib/multi3.S @@ -5,26 +5,26 @@ .align 4 ENTRY(__multi3) /* %o0 = u, %o1 = v */ mov %o1, %g1 - srl %o3, 0, %g4 - mulx %g4, %g1, %o1 + srl %o3, 0, %o4 + mulx %o4, %g1, %o1 srlx %g1, 0x20, %g3 - mulx %g3, %g4, %g5 - sllx %g5, 0x20, %o5 - srl %g1, 0, %g4 + mulx %g3, %o4, %g7 + sllx %g7, 0x20, %o5 + srl %g1, 0, %o4 sub %o1, %o5, %o5 srlx %o5, 0x20, %o5 - addcc %g5, %o5, %g5 + addcc %g7, %o5, %g7 srlx %o3, 0x20, %o5 - mulx %g4, %o5, %g4 + mulx %o4, %o5, %o4 mulx %g3, %o5, %o5 sethi %hi(0x80000000), %g3 - addcc %g5, %g4, %g5 - srlx %g5, 0x20, %g5 + addcc %g7, %o4, %g7 + srlx %g7, 0x20, %g7 add %g3, %g3, %g3 movcc %xcc, %g0, %g3 - addcc %o5, %g5, %o5 - sllx %g4, 0x20, %g4 - add %o1, %g4, %o1 + addcc %o5, %g7, %o5 + sllx %o4, 0x20, %o4 + add %o1, %o4, %o1 add %o5, %g3, %g2 mulx %g1, %o2, %g1 add %g1, %g2, %g1 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 323cb065be5eda120b44dac79618a13301ece231..e4844e934728c58f054eefaf1375e6b018dc982a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -327,6 +327,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int + default 5 if X86_5LEVEL default 4 if X86_64 default 3 if X86_PAE default 2 @@ -1399,6 +1400,24 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config X86_5LEVEL + bool "Enable 5-level page tables support" + depends on X86_64 + ---help--- + 5-level paging enables access to larger address space: + upto 128 PiB of virtual address space and 4 PiB of + physical address space. + + It will be supported by future Intel CPUs. + + Note: a kernel with this option enabled can only be booted + on machines that support the feature. + + See Documentation/x86/x86_64/5level-paging.txt for more + information. + + Say N if unsure. + config ARCH_PHYS_ADDR_T_64BIT def_bool y depends on X86_64 || X86_PAE @@ -1416,6 +1435,35 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config ARCH_HAS_MEM_ENCRYPT + def_bool y + +config AMD_MEM_ENCRYPT + bool "AMD Secure Memory Encryption (SME) support" + depends on X86_64 && CPU_SUP_AMD + ---help--- + Say yes to enable support for the encryption of system memory. + This requires an AMD processor that supports Secure Memory + Encryption (SME). + +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT + bool "Activate AMD Secure Memory Encryption (SME) by default" + default y + depends on AMD_MEM_ENCRYPT + ---help--- + Say yes to have system memory encrypted by default if running on + an AMD processor that supports Secure Memory Encryption (SME). + + If set to Y, then the encryption of system memory can be + deactivated with the mem_encrypt=off command line option. + + If set to N, then the encryption of system memory can be + activated with the mem_encrypt=on command line option. + +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 91f27ab970ef74347c1915e7ff6ac5f8a8803b8d..99c7194f7ea626379ac649faa7da83fdb95776bf 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -479,35 +479,31 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct boot_e820_entry *entry, +static void process_mem_region(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { struct mem_vector region, overlap; struct slot_area slot_area; unsigned long start_orig, end; - struct boot_e820_entry cur_entry; - - /* Skip non-RAM entries. */ - if (entry->type != E820_TYPE_RAM) - return; + struct mem_vector cur_entry; /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) + if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) + if (entry->start + entry->size < minimum) return; /* Ignore entries above memory limit */ - end = min(entry->size + entry->addr, mem_limit); - if (entry->addr >= end) + end = min(entry->size + entry->start, mem_limit); + if (entry->start >= end) return; - cur_entry.addr = entry->addr; - cur_entry.size = end - entry->addr; + cur_entry.start = entry->start; + cur_entry.size = end - entry->start; - region.start = cur_entry.addr; + region.start = cur_entry.start; region.size = cur_entry.size; /* Give up if slot area array is full. */ @@ -521,8 +517,8 @@ static void process_e820_entry(struct boot_e820_entry *entry, /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Did we raise the address above this e820 region? */ - if (region.start > cur_entry.addr + cur_entry.size) + /* Did we raise the address above the passed in memory entry? */ + if (region.start > cur_entry.start + cur_entry.size) return; /* Reduce size by any delta from the original address. */ @@ -562,12 +558,32 @@ static void process_e820_entry(struct boot_e820_entry *entry, } } -static unsigned long find_random_phys_addr(unsigned long minimum, - unsigned long image_size) +static void process_e820_entries(unsigned long minimum, + unsigned long image_size) { int i; - unsigned long addr; + struct mem_vector region; + struct boot_e820_entry *entry; + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params->e820_entries; i++) { + entry = &boot_params->e820_table[i]; + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + region.start = entry->addr; + region.size = entry->size; + process_mem_region(®ion, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820 scan (slot_areas full)!\n"); + break; + } + } +} +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ /* Check if we had too many memmaps. */ if (memmap_too_large) { debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); @@ -577,16 +593,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum, /* Make sure minimum is aligned. */ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_table[i], minimum, - image_size); - if (slot_area_index == MAX_SLOT_AREA) { - debug_putstr("Aborted e820 scan (slot_areas full)!\n"); - break; - } - } - + process_e820_entries(minimum, image_size); return slots_fetch_random(); } diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 28029be47fbb839f248826b517a9e295f4389395..f1aa43854bed423e7bfccaa84ff66ea91996b731 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2efc768e43627a48118d28729c380c0fbcaa7681..72d867f6b518e4db5a79a10c924f858a3edb0af8 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -150,8 +150,6 @@ static inline void disable_acpi(void) { } extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ -#define acpi_unlazy_tlb(x) leave_mm(x) - #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { @@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) * you call efi_mem_attributes() during boot and at runtime, * you could theoretically see different attributes. * - * Since we are yet to see any x86 platforms that require - * anything other than PAGE_KERNEL (some arm64 platforms - * require the equivalent of PAGE_KERNEL_NOCACHE), return that - * until we know differently. + * We are yet to see any x86 platforms that require anything + * other than PAGE_KERNEL (some ARM64 platforms require the + * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME + * is active, the ACPI information will not be encrypted, + * so return PAGE_KERNEL_NOENC until we know differently. */ - return PAGE_KERNEL; + return PAGE_KERNEL_NOENC; } #endif diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c5711db4f1fe130938cee36169965..84ae170bc3d0cd73cfa7d08fa6f730bd3106e4ea 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5a28e8e55e36fd2164c0cda175288a7fcc534c4b..66ac08607471c4a6fcdfb919304f2fd3405458ef 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dff775af7cd6456f7177d9ce5888ae78dc6bc10..c10c9128f54e6b7296014a74e7a253a1eedaacd9 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 398c79889f5c43d384b72238ce025f140debeb0b..1387dafdba2d2c24061adb014354b82174f620d4 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr; + return __sme_set(paddr); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr; + return __sme_clr(daddr); } #endif /* CONFIG_X86_DMA_REMAP */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 3c69fed215c56c3203e59d97a6c7e11381c97cf8..a8e15b04565b842def6a1bd1fa4b3b03db756c3c 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_early_remap early_ioremap -#define dmi_early_unmap early_iounmap -#define dmi_remap ioremap_cache -#define dmi_unmap iounmap +#define dmi_early_remap early_memremap +#define dmi_early_unmap early_memunmap +#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) +#define dmi_unmap(_x) memunmap(_x) #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index a504adc661a4954ca0d1c5ed04149fece0e042a3..cd266d830e4960fd023aa0411c991a2701db09f9 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void); extern void e820__reallocate_tables(void); extern void e820__register_nosave_regions(unsigned long limit_pfn); +extern int e820__get_entry_type(u64 start, u64 end); + /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9aeb91935ce02387d8dae5e2f51bf1750f420a43..a3de31ffb72254199d769afd9266997f563c67d5 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -304,8 +304,8 @@ static inline int mmap_is_ia32(void) test_thread_flag(TIF_ADDR32)); } -extern unsigned long tasksize_32bit(void); -extern unsigned long tasksize_64bit(void); +extern unsigned long task_size_32bit(void); +extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760a72b49b680c3f70923ddedf684d2..dcd9fb55e67991821d46602754a392c6f2ed0e06 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. + */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 474eb8c66feeb2c98de2f5d6fe1db84de752c806..05c4aa00cc862e3b1dad1b344b0eddb9d6f44db4 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -7,6 +7,7 @@ struct x86_mapping_info { unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ bool direct_gbpages; /* PUD level 1GB page support */ + unsigned long kernpg_flag; /* kernel pagetable flag override */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 48febf07e828099d0580be40ad82a8baee6ade18..4bc6f459a8b6dd9861f078c794d6c26534db5148 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -381,4 +381,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_can_ram_remap(resource_size_t offset, + unsigned long size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + +extern bool phys_mem_access_encrypted(unsigned long phys_addr, + unsigned long size); + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 70ef205489f00e53ff568180c4dcbf6fb9e6ded1..942c1f444da88ddeb182e57f582a068c15cb2717 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -147,7 +147,8 @@ unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, unsigned long start_address, - unsigned int preserve_context); + unsigned int preserve_context, + unsigned int sme_active); #endif #define ARCH_HAS_KIMAGE_ARCH @@ -207,6 +208,14 @@ struct kexec_entry64_regs { uint64_t r15; uint64_t rip; }; + +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, + gfp_t gfp); +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages + +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages + #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 87ac4fba6d8e12f07e8a9f191bdb028a1c3e6234..7cbaab523f22dcd91812dc269021d1e884ec0df2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1078,7 +1078,7 @@ void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask); + u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 0000000000000000000000000000000000000000..8e618fcf1f7c9f842873988ab4cb91e3fb5078fd --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,80 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#include + +#include + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern unsigned long sme_me_mask; + +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, + unsigned long decrypted_kernel_vaddr, + unsigned long kernel_len, + unsigned long encryption_wa, + unsigned long encryption_pgd); + +void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size); +void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size); + +void __init sme_map_bootdata(char *real_mode_data); +void __init sme_unmap_bootdata(char *real_mode_data); + +void __init sme_early_init(void); + +void __init sme_encrypt_kernel(void); +void __init sme_enable(struct boot_params *bp); + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +static inline void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size) { } +static inline void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size) { } + +static inline void __init sme_map_bootdata(char *real_mode_data) { } +static inline void __init sme_unmap_bootdata(char *real_mode_data) { } + +static inline void __init sme_early_init(void) { } + +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(struct boot_params *bp) { } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. + */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 79b647a7ebd0079b96472e52634c898301d3a63e..bb8c597c2248a9c8341d04813b04e1d42e9c7019 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,12 +3,28 @@ #include #include +#include /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. + */ + atomic64_t tlb_gen; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -37,6 +53,11 @@ typedef struct { #endif } mm_context_t; +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 265c907d7d4c9b8c69e24792a20c5a3dfb6c95ee..d25d9f4abb15a1e06e83e6e4ebac1ef9f2c70c04 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -12,6 +12,9 @@ #include #include #include + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + cpumask_clear_cpu(cpu, mm_cpumask(mm)); } static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ @@ -292,6 +300,9 @@ static inline unsigned long __get_current_cr3_fast(void) { unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + if (static_cpu_has(X86_FEATURE_PCID)) + cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); + /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a0d662be4c5b8545a7b0b8c5c846187ced8d0020..7d7404756bb4a734bcb04c5f57931daff8af73b3 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm) } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); + +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { @@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } + +static inline unsigned long mpx_unmapped_area_check(unsigned long addr, + unsigned long len, unsigned long flags) +{ + return addr; +} #endif /* CONFIG_X86_INTEL_MPX */ #endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5573c75f8e4ced276c8585b71f0df9b786ea9e90..17f5c12e1afd0c6ddb3fa2e6fc94b1fec52f7d5c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -356,6 +356,8 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b4a0d43248cf3d6f2c7ef042200c31a25f29f122..b50df06ad251f143e105843ab84ac1a14f40abdf 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -51,6 +51,10 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384cac4ed0fa89a4e25c42e4a63ae5f9c..b98ed9d1463098936bcebbae46b59dc00495508c 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 77037b6f1caa22f622f50d67ea6cebd00f76f685..bbeae4a2bd01a3209e6d68f8f2af918eeb17dff2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include #include #include @@ -13,9 +14,18 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include +extern pgd_t early_top_pgt[PTRS_PER_PGD]; +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -38,6 +48,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ @@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ @@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) \ - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) \ - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define p4d_page(p4d) \ - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the third-level page table.. */ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) @@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee4215d4101d2836d5c3963d59e6dbab..399261ce904ca1df269e5194b8e523d73b8a3f69 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include +#include + #include #define FIRST_USER_ADDRESS 0UL @@ -121,10 +123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. The pte's @@ -159,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -187,22 +190,42 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE @@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d) #else #include +static inline p4d_t native_make_p4d(pudval_t val) +{ + return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; +} + static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 79aa2f98398d4eaabf45a1493baaa072f11782ec..dc723b64acf0675689c1feb187bfd957480c5d94 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -2,6 +2,7 @@ #define _ASM_X86_PROCESSOR_FLAGS_H #include +#include #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM @@ -32,16 +33,18 @@ * CR3_ADDR_MASK is the mask used by read_cr3_pa(). */ #ifdef CONFIG_X86_64 -/* Mask off the address space ID bits. */ -#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull -#define CR3_PCID_MASK 0xFFFull +/* Mask off the address space ID and SME encryption bits. */ +#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_PCID_MASK 0xFFFull +#define CR3_NOFLUSH BIT_ULL(63) #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save * a tiny bit of code size by setting all the bits. */ -#define CR3_ADDR_MASK 0xFFFFFFFFull -#define CR3_PCID_MASK 0ull +#define CR3_ADDR_MASK 0xFFFFFFFFull +#define CR3_PCID_MASK 0ull +#define CR3_NOFLUSH 0 #endif #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 028245e1c42b23d1498643427ebb73be25ded661..c61bab07a84e05a0a21afc9e166db6e37fa15442 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -29,6 +29,7 @@ struct vm86; #include #include #include +#include /* * We handle most unaligned accesses in hardware. On the other hand @@ -239,9 +240,14 @@ static inline unsigned long read_cr3_pa(void) return __read_cr3() & CR3_ADDR_MASK; } +static inline unsigned long native_read_cr3_pa(void) +{ + return __native_read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 @@ -802,7 +808,9 @@ static inline void spin_lock_prefetch(const void *x) */ #define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE #define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -842,7 +850,9 @@ static inline void spin_lock_prefetch(const void *x) * particular problem by preventing anything from being mapped * at the maximum canonical address. */ -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -850,12 +860,14 @@ static inline void spin_lock_prefetch(const void *x) #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define STACK_TOP TASK_SIZE +#define STACK_TOP TASK_SIZE_LOW #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ @@ -876,7 +888,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * space during mmap's. */ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) -#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 230e1903acf07faa831c8d2496a4457aaae9b5ea..90d91520c13ab09b566ca0a599a6170353153e47 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -1,6 +1,15 @@ #ifndef _ARCH_X86_REALMODE_H #define _ARCH_X86_REALMODE_H +/* + * Flag bit definitions for use with the flags field of the trampoline header + * in the CONFIG_X86_64 variant. + */ +#define TH_FLAGS_SME_ACTIVE_BIT 0 +#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) + +#ifndef __ASSEMBLY__ + #include #include @@ -38,6 +47,7 @@ struct trampoline_header { u64 start; u64 efer; u32 cr4; + u32 flags; #endif }; @@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void) void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); +#endif /* __ASSEMBLY__ */ + #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index eaec6c364e42d07f55930a80ef29b9d5c248165d..cd71273ec49d91aacffe3eff8883617ef94d8627 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -11,6 +11,7 @@ * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent + * Encryption : Encrypted, Decrypted * * Within a category, the attributes are mutually exclusive. * @@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 50ea3482e1d1d0babfecf7864a6e9307ba6651fe..d23e61dc0640e451d8d1f997fad65af11b30dbfa 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); } +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + u64 new_tlb_gen; + + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + smp_mb__before_atomic(); + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); + smp_mb__after_atomic(); + + return new_tlb_gen; +} + #ifdef CONFIG_PARAVIRT #include #else @@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * 6 because 6 should be plenty and struct tlb_state will fit in + * two cache lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts @@ -73,13 +101,35 @@ struct tlb_state { * mode even if we've already switched back to swapper_pg_dir. */ struct mm_struct *loaded_mm; - int state; + u16 loaded_mm_asid; + u16 next_asid; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * There is one per ASID that we use, and the ASID (what the + * CPU calls PCID) is the index into ctxts. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + * + * NB: context 0 is a bit special, since it's also used by + * various bits of init code. This is fine -- code that + * isn't aware of PCID will end up harmlessly flushing + * context 0. + */ + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -207,6 +257,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) @@ -231,9 +289,26 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { - struct mm_struct *mm; - unsigned long start; - unsigned long end; + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. .mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. + * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. + */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; }; #define local_flush_tlb() __flush_tlb() @@ -256,12 +331,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { + inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index c4b9dc2f67c5f6f7a095dd7d4a3a6bb7fd7b3975..9f42beefc67a3d5d5242e054425c713ca394553e 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -7,12 +7,24 @@ #ifndef _ASM_X86_VGA_H #define _ASM_X86_VGA_H +#include + /* * On the PC, we can just recalculate addresses and then * access the videoram directly without any black magic. + * To support memory encryption however, we need to access + * the videoram as decrypted memory. */ -#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x, s) \ +({ \ + unsigned long start = (unsigned long)phys_to_virt(x); \ + \ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \ + set_memory_decrypted(start, (s) >> PAGE_SHIFT); \ + \ + start; \ +}) #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7491e73d92530bf868a9b91b2c71c892bcbdb94a..97bb2caf342879ba365582d3bd7b1caab5f77dfe 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { #define ACPI_INVALID_GSI INT_MIN /* - * This is just a simple wrapper around early_ioremap(), + * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. */ char *__init __acpi_map_table(unsigned long phys, unsigned long size) @@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) if (!phys || !size) return NULL; - return early_ioremap(phys, size); + return early_memremap(phys, size); } void __init __acpi_unmap_table(char *map, unsigned long size) @@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size) if (!map || !size) return; - early_iounmap(map, size); + early_memunmap(map, size); } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3b9e220621f83c8a5161e8b57b297233370f72ea..110ca5d2bb872a7f15cffe4349ffab74b03a4c86 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -548,8 +548,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) static void early_init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + early_init_amd_mc(c); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate * with P/T states and does not stop in deep C-states @@ -612,6 +616,27 @@ static void early_init_amd(struct cpuinfo_x86 *c) */ if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); + + /* + * BIOS support is required for SME. If BIOS has enabled SME then + * adjust x86_phys_bits by the SME physical address space reduction + * value. If BIOS has not enabled SME then don't advertise the + * feature (set in scattered.c). Also, since the SME support requires + * long mode, don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME)) { + u64 msr; + + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + if (IS_ENABLED(CONFIG_X86_32)) + clear_cpu_cap(c, X86_FEATURE_SME); + } else { + clear_cpu_cap(c, X86_FEATURE_SME); + } + } } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -730,8 +755,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static void init_amd(struct cpuinfo_x86 *c) { - u32 dummy; - early_init_amd(c); /* @@ -793,8 +816,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); - /* 3DNow or LM implies PREFETCHW */ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0af86d9242da0f6882f1f5252dfa659038c627ac..db684880d74ae47fbff37888ff31dd13b9a4653b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,6 +21,14 @@ void __init check_bugs(void) { +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c8b39870f33e8d5579eb1b30ecd24ace23de9d86..b95cd94ca97bc191121e87bd5c0471d0ad8de494 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_pcid_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 1; +} +__setup("nopcid", x86_pcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ @@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static void setup_pcid(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smep(c); setup_smap(c); + /* Set up PCID */ + setup_pcid(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6dde0497efc7514b9981bd2d33014a6686c65677..3b413065c61308104794db2efe6ab939b39411a1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m) return ret; } +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) + +void arch_unmap_kpfn(unsigned long pfn) +{ + unsigned long decoy_addr; + + /* + * Unmap this page from the kernel 1:1 mappings to make sure + * we don't log more errors because of speculative access to + * the page. + * We would like to just call: + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the posion page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_np() not checking whether we passed + * a legal address. + */ + +/* + * Build time check to see if we have a spare virtual bit. Don't want + * to leave this until run time because most developers don't have a + * system that can exercise this code path. This will only become a + * problem if/when we move beyond 5-level page tables. + * + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) + */ +#if PGDIR_SHIFT + 9 < 63 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); +#else +#error "no unused virtual bit available" +#endif + + if (set_memory_np(decoy_addr, 1)) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + +} +#endif + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 23c23508c0125e50caf69a6959aaf000151cf572..05459ad3db46e2139b7d97514899d398c321c541 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 532da61d605ccc2271067fdf67bac83c454616aa..71c11ad5643e80059d4f262002fc9620044b594b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any); * Note: this function only works correctly once the E820 table is sorted and * not-overlapping (at least for the range specified), which is the case normally. */ -bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +static struct e820_entry *__e820__mapped_all(u64 start, u64 end, + enum e820_type type) { int i; @@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) * coverage of the desired range exists: */ if (start >= end) - return 1; + return entry; } - return 0; + + return NULL; +} + +/* + * This function checks if the entire range is mapped with type. + */ +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +{ + return __e820__mapped_all(start, end, type); +} + +/* + * This function returns the type associated with the range . + */ +int e820__get_entry_type(u64 start, u64 end) +{ + struct e820_entry *entry = __e820__mapped_all(start, end, 0); + + return entry ? entry->type : -EINVAL; } /* diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6b91e2eb8d3f8a5b8ad1a57c7a2a47d0a3b4440d..9c4e7ba6870c142921cfbbd07b8bbf45285e5c07 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 9ba79543d9ee9f1ee19bce38955467afa04b125f..6a193b93fd952d59b4bca8a2071859edf8bcfcb6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -33,7 +34,6 @@ /* * Manage page tables very early on. */ -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); @@ -45,9 +45,11 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } -void __head __startup_64(unsigned long physaddr) +unsigned long __head __startup_64(unsigned long physaddr, + struct boot_params *bp) { unsigned long load_delta, *p; + unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -69,6 +71,12 @@ void __head __startup_64(unsigned long physaddr) if (load_delta & ~PMD_PAGE_MASK) for (;;); + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(bp); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); @@ -92,31 +100,35 @@ void __head __startup_64(unsigned long physaddr) * creates a bunch of nonsense entries but that is fine -- * it avoids problems around wraparound. */ + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 0] = (pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { @@ -137,9 +149,30 @@ void __head __startup_64(unsigned long physaddr) pmd[i] += load_delta; } - /* Fixup phys_base */ + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ p = fixup_pointer(&phys_base, physaddr); - *p += load_delta; + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); } /* Wipe all early page tables except for the kernel symbol map */ @@ -147,17 +180,17 @@ static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_top_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ -int __init early_make_pgtable(unsigned long address) +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; - pmdval_t pmd, *pmd_p; + pmdval_t *pmd_p; /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) @@ -216,12 +249,21 @@ int __init early_make_pgtable(unsigned long address) memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - pmd = (physaddr & PMD_MASK) + early_pmd_flags; pmd_p[pmd_index(address)] = pmd; return 0; } +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -244,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data) char * command_line; unsigned long cmd_line_ptr; + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); cmd_line_ptr = get_cmd_line_ptr(); @@ -251,6 +299,14 @@ static void __init copy_bootdata(char *real_mode_data) command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); } asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) @@ -280,6 +336,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_page(init_top_pgt); + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + kasan_early_init(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6225550883dfe1e98bcf35ac5ed870e334b34399..513cbb012eccc51f18ac16f9dac74a33abaa4c16 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,12 +73,19 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ leaq _text(%rip), %rdi pushq %rsi call __startup_64 popq %rsi - movq $(early_top_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -98,7 +105,16 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_top_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* Enable PAE mode, PGE and LA57 */ @@ -335,9 +351,9 @@ GLOBAL(name) NEXT_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(early_dynamic_pgts) @@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. @@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt) #ifdef CONFIG_X86_5LEVEL NEXT_PAGE(level4_kernel_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 38b64587b31be5611a763df6dafe8434db2a66b5..fd6f8fbbe6f2a05d061c3e0196c313293ae8ad08 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, struct setup_data_node *node = file->private_data; unsigned long remain; loff_t pos = *ppos; - struct page *pg; void *p; u64 pa; @@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, count = node->len - pos; pa = node->paddr + sizeof(struct setup_data) + pos; - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - p = ioremap_cache(pa, count); - if (!p) - return -ENXIO; - } else - p = __va(pa); + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; remain = copy_to_user(user_buf, p, count); - if (PageHighMem(pg)) - iounmap(p); + memunmap(p); if (remain) return -EFAULT; @@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent) struct setup_data *data; int error; struct dentry *d; - struct page *pg; u64 pa_data; int no = 0; @@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - data = ioremap_cache(pa_data, sizeof(*data)); - if (!data) { - kfree(node); - error = -ENXIO; - goto err_dir; - } - } else - data = __va(pa_data); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } node->paddr = pa_data; node->type = data->type; @@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = create_setup_data_node(d, no, node); pa_data = data->next; - if (PageHighMem(pg)) - iounmap(data); + memunmap(data); if (error) goto err_dir; no++; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 06e1ff5562c0b4a5a28f3c052ed74b545c7ccca0..4b0592ca9e47b332d0ce67f8bcf5f555653587b2 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -16,8 +16,8 @@ #include #include #include +#include -#include #include static ssize_t version_show(struct kobject *kobj, @@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr) *paddr = pa_data; return 0; } - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size) u64 pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; if (nr == i) { *size = data->len; - iounmap(data); + memunmap(data); return 0; } pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; ret = sprintf(buf, "0x%x\n", data->type); - iounmap(data); + memunmap(data); return ret; } @@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp, goto out; ret = count; - p = ioremap_cache(paddr + sizeof(*data), data->len); + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; } memcpy(buf, p + off, count); - iounmap(p); + memunmap(p); out: - iounmap(data); + memunmap(data); return ret; } @@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr) *nr = 0; while (pa_data) { *nr += 1; - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) { ret = -ENOMEM; goto out; } pa_data = data->next; - iounmap(data); + memunmap(data); } out: diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cb0a30473c2310b76695c73ec6fad3cd1e7b051f..1f790cf9d38fe0e10e46eaf9b5bef945d25a9370 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: free_transition_pgtable(image); @@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image) image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, - image->preserve_context); + image->preserve_context, + sme_active()); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } + +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + /* + * If SME is active we need to be sure that kexec pages are + * not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + /* + * If SME is active we need to reset the pages back to being + * an encrypted mapping before freeing them. + */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0d904d759ff1d42c97e57a7ef5e126bd961b8b03..5cbb3177ed17270b993a4c1a686282ce80608ed1 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) } } -static struct mpf_intel *mpf_found; +static unsigned long mpf_base; static unsigned long __init get_mpc_size(unsigned long physptr) { struct mpc_table *mpc; unsigned long size; - mpc = early_ioremap(physptr, PAGE_SIZE); + mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; - early_iounmap(mpc, PAGE_SIZE); + early_memunmap(mpc, PAGE_SIZE); apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); return size; @@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) unsigned long size; size = get_mpc_size(mpf->physptr); - mpc = early_ioremap(mpf->physptr, size); + mpc = early_memremap(mpf->physptr, size); + /* * Read the physical hardware table. Anything here will * override the defaults. @@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #endif pr_err("BIOS bug, MP table errors detected!...\n"); pr_cont("... disabling SMP support. (tell your hw vendor)\n"); - early_iounmap(mpc, size); + early_memunmap(mpc, size); return -1; } - early_iounmap(mpc, size); + early_memunmap(mpc, size); if (early) return -1; @@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) */ void __init default_get_smp_config(unsigned int early) { - struct mpf_intel *mpf = mpf_found; + struct mpf_intel *mpf; if (!smp_found_config) return; - if (!mpf) + if (!mpf_base) return; if (acpi_lapic && early) @@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: error mapping MP table\n"); + return; + } + pr_info("Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early) /* * Now see if we need to read further. */ - if (mpf->feature1 != 0) { + if (mpf->feature1) { if (early) { /* * local APIC has default address @@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - if (check_physptr(mpf, early)) + if (check_physptr(mpf, early)) { + early_memunmap(mpf, sizeof(*mpf)); return; + } } else BUG(); @@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early) /* * Only use the first configuration found. */ + + early_memunmap(mpf, sizeof(*mpf)); } static void __init smp_reserve_memory(struct mpf_intel *mpf) @@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) static int __init smp_scan_config(unsigned long base, unsigned long length) { - unsigned int *bp = phys_to_virt(base); + unsigned int *bp; struct mpf_intel *mpf; - unsigned long mem; + int ret = 0; apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { + bp = early_memremap(base, length); mpf = (struct mpf_intel *)bp; if ((*bp == SMP_MAGIC_IDENT) && (mpf->length == 1) && @@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; #endif - mpf_found = mpf; + mpf_base = base; - pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", - (unsigned long long) virt_to_phys(mpf), - (unsigned long long) virt_to_phys(mpf) + - sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", + base, base + sizeof(*mpf) - 1, mpf); - mem = virt_to_phys(mpf); - memblock_reserve(mem, sizeof(*mpf)); + memblock_reserve(base, sizeof(*mpf)); if (mpf->physptr) smp_reserve_memory(mpf); - return 1; + ret = 1; } - bp += 4; + early_memunmap(bp, length); + + if (ret) + break; + + base += 16; length -= 16; } - return 0; + return ret; } void __init default_find_smp_config(void) @@ -838,29 +852,40 @@ static int __init update_mp_table(void) char oem[10]; struct mpf_intel *mpf; struct mpc_table *mpc, *mpc_new; + unsigned long size; if (!enable_update_mptable) return 0; - mpf = mpf_found; - if (!mpf) + if (!mpf_base) return 0; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: mpf early_memremap() failed\n"); + return 0; + } + /* * Now see if we need to go further. */ - if (mpf->feature1 != 0) - return 0; + if (mpf->feature1) + goto do_unmap_mpf; if (!mpf->physptr) - return 0; + goto do_unmap_mpf; - mpc = phys_to_virt(mpf->physptr); + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + if (!mpc) { + pr_err("MPTABLE: mpc early_memremap() failed\n"); + goto do_unmap_mpf; + } if (!smp_check_mpc(mpc, oem, str)) - return 0; + goto do_unmap_mpc; - pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); + pr_info("mpf: %llx\n", (u64)mpf_base); pr_info("physptr: %x\n", mpf->physptr); if (mpc_new_phys && mpc->length > mpc_new_length) { @@ -878,21 +903,32 @@ static int __init update_mp_table(void) new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { pr_info("mpc is readonly, please try alloc_mptable instead\n"); - return 0; + goto do_unmap_mpc; } pr_info("use in-position replacing\n"); } else { + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); + if (!mpc_new) { + pr_err("MPTABLE: new mpc early_memremap() failed\n"); + goto do_unmap_mpc; + } mpf->physptr = mpc_new_phys; - mpc_new = phys_to_virt(mpc_new_phys); memcpy(mpc_new, mpc, mpc->length); + early_memunmap(mpc, size); mpc = mpc_new; + size = mpc_new_length; /* check if we can modify that */ if (mpc_new_phys - mpf->physptr) { struct mpf_intel *mpf_new; /* steal 16 bytes from [0, 1k) */ + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); + if (!mpf_new) { + pr_err("MPTABLE: new mpf early_memremap() failed\n"); + goto do_unmap_mpc; + } pr_info("mpf new: %x\n", 0x400 - 16); - mpf_new = phys_to_virt(0x400 - 16); memcpy(mpf_new, mpf, 16); + early_memunmap(mpf, sizeof(*mpf)); mpf = mpf_new; mpf->physptr = mpc_new_phys; } @@ -909,6 +945,12 @@ static int __init update_mp_table(void) */ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); +do_unmap_mpc: + early_memunmap(mpc, size); + +do_unmap_mpf: + early_memunmap(mpf, sizeof(*mpf)); + return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5e16d3f2959468f6d05aaa1be5eef2f8b945abb1..0accc2404b9214d1b318577668f103bc41bc8d35 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -93,9 +93,12 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size), flag); - if (page && page_to_phys(page) + size > dma_mask) { - dma_release_from_contiguous(dev, page, count); - page = NULL; + if (page) { + addr = phys_to_dma(dev, page_to_phys(page)); + if (addr + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } } } /* fallback */ @@ -104,7 +107,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, if (!page) return NULL; - addr = page_to_phys(page); + addr = phys_to_dma(dev, page_to_phys(page)); if (addr + size > dma_mask) { __free_pages(page, get_order(size)); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a6d404087fe3285f65de353613153abde4a698a0..4fc3cb60ea11a546b08ae8c2f3fdbb9007dee670 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t bus = page_to_phys(page) + offset; + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) return NOMMU_MAPPING_ERROR; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 1e23577e17cf10f87d584e4cdc38f4691db57862..677077510e308ebfabb56d8072a2afed439f23c8 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,12 +6,14 @@ #include #include #include +#include #include #include #include #include #include + int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_swiotlb_late_init); /* - * if 4GB or more detected (and iommu=off not set) return 1 - * and set swiotlb to 1. + * If 4GB or more detected (and iommu=off not set) or if SME is active + * then set swiotlb to 1 and return 1. */ int __init pci_swiotlb_detect_4gb(void) { @@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void) if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + + /* + * If SME is active then swiotlb will be set to 1 so that bounce + * buffers are allocated and used for devices that do not support + * the addressing range required for the encryption mask. + */ + if (sme_active()) + swiotlb = 1; + return swiotlb; } IOMMU_INIT(pci_swiotlb_detect_4gb, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3ca198080ea9294486ae9a1121e7815dfba7cb19..bd6b85fac66696da70e316656ad6f0d51291f8aa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -355,6 +355,7 @@ bool xen_set_default_idle(void) return ret; } #endif + void stop_this_cpu(void *dummy) { local_irq_disable(); @@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); - for (;;) - halt(); + for (;;) { + /* + * Use wbinvd followed by hlt to stop the processor. This + * provides support for kexec on a processor that supports + * SME. With kexec, going from SME inactive to SME active + * requires clearing cache entries so that addresses without + * the encryption bit set don't corrupt the same physical + * address that has the encryption bit set when caches are + * flushed. To achieve this a wbinvd is performed followed by + * a hlt. Even if the processor is not in the kexec/SME + * scenario this only adds a wbinvd to a halting processor. + */ + asm volatile("wbinvd; hlt" : : : "memory"); + } } /* diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 98111b38ebfd6eb9949242c5aae7b18bbbdb4489..307d3bac5f04ece485ac1fe42226ee111c0c6e85 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -47,6 +47,7 @@ relocate_kernel: * %rsi page_list * %rdx start address * %rcx preserve_context + * %r8 sme_active */ /* Save the CPU context, used for jumping back */ @@ -71,6 +72,9 @@ relocate_kernel: pushq $0 popfq + /* Save SME active flag */ + movq %r8, %r12 + /* * get physical address of control page now * this is impossible after page table switch @@ -132,6 +136,16 @@ identity_mapped: /* Flush the TLB (needed?) */ movq %r9, %cr3 + /* + * If SME is active, there could be old encrypted cache line + * entries that will conflict with the now unencrypted memory + * used by kexec. Flush the caches before copying the kernel. + */ + testq %r12, %r12 + jz 1f + wbinvd +1: + movq %rcx, %r11 call swap_pages diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d04988000b05344a590ce9ab8c86e96d0ec2..0bfe0c1628f638a0ae7b1ab69e9414485fbae147 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include