Commit 5579fd7e authored by Tejun Heo

Merge branch 'for-next' into for-linus

* pcpu_chunk_page_occupied() doesn't exist in for-next.
* pcpu_chunk_addr_search() updated to use raw_smp_processor_id().

Conflicts:
	mm/percpu.c
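
For reference, a rough sketch of how the resolved pcpu_chunk_addr_search() reads after this merge. This is not the verbatim resolution: the first-chunk and reserved-chunk checks are elided, and the pcpu_get_page_chunk() helper name is recalled from mm/percpu.c of this period, so treat it as illustrative.

static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* ... first-chunk and reserved-chunk checks elided ... */

	/*
	 * The address is relative to unit0, which may be unused and thus
	 * unmapped under the sparse unit mapping.  Offset it into the
	 * current CPU's unit before the vmalloc lookup; any cpu id works
	 * here, so the raw, preemption-safe variant is sufficient.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(vmalloc_to_page(addr));
}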
......@@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file
Format: { 0 | 1 }
See arch/parisc/kernel/pdc_chassis.c
percpu_alloc= [X86] Select which percpu first chunk allocator to use.
Allowed values are one of "lpage", "embed" and "4k".
See comments in arch/x86/kernel/setup_percpu.c for
details on each allocator. This parameter is primarily
for debugging and performance comparison.
percpu_alloc= Select which percpu first chunk allocator to use.
Currently supported values are "embed" and "page".
Archs may support subset or none of the selections.
See comments in mm/percpu.c for details on each
allocator. This parameter is primarily for debugging
and performance comparison.
pf. [PARIDE]
See Documentation/blockdev/paride.txt.
......
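
For context, the percpu_alloc= parameter is now parsed in mm/percpu.c rather than in arch code. Below is a sketch of that early_param handler, assembled from symbols visible elsewhere in this diff (pcpu_chosen_fc, PCPU_FC_EMBED, PCPU_FC_PAGE, and the NEED_PER_CPU_*_FIRST_CHUNK config options) rather than copied verbatim:

enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

static int __init percpu_alloc_setup(char *str)
{
	if (0)
		/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
	else if (!strcmp(str, "embed"))
		pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	else if (!strcmp(str, "page"))
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

Arch setup_per_cpu_areas() implementations then consult pcpu_chosen_fc, as the x86 hunk further down does.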
......@@ -325,7 +325,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
MODFLAGS = -DMODULE
CFLAGS_MODULE = $(MODFLAGS)
AFLAGS_MODULE = $(MODFLAGS)
LDFLAGS_MODULE =
LDFLAGS_MODULE = -T $(srctree)/scripts/module-common.lds
CFLAGS_KERNEL =
AFLAGS_KERNEL =
CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
......
#ifndef __ALPHA_PERCPU_H
#define __ALPHA_PERCPU_H
#include <linux/compiler.h>
#include <linux/threads.h>
#include <linux/percpu-defs.h>
/*
* Determine the real variable name from the name visible in the
* kernel sources.
*/
#define per_cpu_var(var) per_cpu__##var
#ifdef CONFIG_SMP
/*
* per_cpu_offset() is the offset that has to be added to a
* percpu variable to get to the instance for a certain processor.
*/
extern unsigned long __per_cpu_offset[NR_CPUS];
#define per_cpu_offset(x) (__per_cpu_offset[x])
#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id())
#ifdef CONFIG_DEBUG_PREEMPT
#define my_cpu_offset per_cpu_offset(smp_processor_id())
#else
#define my_cpu_offset __my_cpu_offset
#endif
#ifndef MODULE
#define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset))
#define PER_CPU_DEF_ATTRIBUTES
#else
/*
* To calculate addresses of locally defined variables, GCC uses 32-bit
* displacement from the GP. Which doesn't work for per cpu variables in
* modules, as an offset to the kernel per cpu area is way above 4G.
* To calculate addresses of locally defined variables, GCC uses
* 32-bit displacement from the GP. Which doesn't work for per cpu
* variables in modules, as an offset to the kernel per cpu area is
* way above 4G.
*
* This forces allocation of a GOT entry for per cpu variable using
* ldq instruction with a 'literal' relocation.
*/
#define SHIFT_PERCPU_PTR(var, offset) ({ \
extern int simple_identifier_##var(void); \
unsigned long __ptr, tmp_gp; \
asm ( "br %1, 1f \n\
1: ldgp %1, 0(%1) \n\
ldq %0, per_cpu__" #var"(%1)\t!literal" \
: "=&r"(__ptr), "=&r"(tmp_gp)); \
(typeof(&per_cpu_var(var)))(__ptr + (offset)); })
#define PER_CPU_DEF_ATTRIBUTES __used
#endif /* MODULE */
/*
* A percpu variable may point to a discarded regions. The following are
* established ways to produce a usable pointer from the percpu variable
* offset.
* Always use weak definitions for percpu variables in modules.
*/
#define per_cpu(var, cpu) \
(*SHIFT_PERCPU_PTR(var, per_cpu_offset(cpu)))
#define __get_cpu_var(var) \
(*SHIFT_PERCPU_PTR(var, my_cpu_offset))
#define __raw_get_cpu_var(var) \
(*SHIFT_PERCPU_PTR(var, __my_cpu_offset))
#else /* ! SMP */
#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var)))
#define __get_cpu_var(var) per_cpu_var(var)
#define __raw_get_cpu_var(var) per_cpu_var(var)
#define PER_CPU_DEF_ATTRIBUTES
#endif /* SMP */
#ifdef CONFIG_SMP
#define PER_CPU_BASE_SECTION ".data.percpu"
#else
#define PER_CPU_BASE_SECTION ".data"
#endif
#ifdef CONFIG_SMP
#ifdef MODULE
#define PER_CPU_SHARED_ALIGNED_SECTION ""
#else
#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
#endif
#define PER_CPU_FIRST_SECTION ".first"
#else
#define PER_CPU_SHARED_ALIGNED_SECTION ""
#define PER_CPU_FIRST_SECTION ""
#if defined(MODULE) && defined(CONFIG_SMP)
#define ARCH_NEEDS_WEAK_PER_CPU
#endif
#define PER_CPU_ATTRIBUTES
#include <asm-generic/percpu.h>
#endif /* __ALPHA_PERCPU_H */
......@@ -2,6 +2,7 @@
#define _ALPHA_TLBFLUSH_H
#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/compiler.h>
#include <asm/pgalloc.h>
......
......@@ -134,13 +134,6 @@ SECTIONS
__bss_stop = .;
_end = .;
/* Sections to be discarded */
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
.mdebug 0 : {
*(.mdebug)
}
......@@ -150,4 +143,6 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
DISCARDS
}
......@@ -83,6 +83,7 @@ SECTIONS
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
*(.discard)
*(.ARM.exidx.exit.text)
*(.ARM.extab.exit.text)
#ifndef CONFIG_HOTPLUG_CPU
......
......@@ -124,14 +124,11 @@ SECTIONS
_end = .;
}
DWARF_DEBUG
/* When something in the kernel is NOT compiled as a module, the module
* cleanup code and data are put into these segments. Both can then be
* thrown away, as cleanup code is never called unless it's a module.
*/
/DISCARD/ : {
EXIT_DATA
*(.exitcall.exit)
}
DWARF_DEBUG
DISCARDS
}
......@@ -277,8 +277,5 @@ SECTIONS
DWARF_DEBUG
/DISCARD/ :
{
*(.exitcall.exit)
}
DISCARDS
}
......@@ -42,9 +42,9 @@
#include <asm/mem_map.h>
#include "blackfin_sram.h"
static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1sram_lock);
static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_data_sram_lock);
static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_inst_sram_lock);
static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp;
/* the data structure for L1 scratchpad and DATA SRAM */
......
......@@ -17,7 +17,8 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* registers like cr3 on the i386
*/
extern volatile DEFINE_PER_CPU(pgd_t *,current_pgd); /* defined in arch/cris/mm/fault.c */
/* defined in arch/cris/mm/fault.c */
DECLARE_PER_CPU(pgd_t *, current_pgd);
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
......
......@@ -140,12 +140,7 @@ SECTIONS
_end = .;
__end = .;
/* Sections to be discarded */
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024;
DISCARDS
}
......@@ -29,7 +29,7 @@ extern void die_if_kernel(const char *, struct pt_regs *, long);
/* current active page directory */
volatile DEFINE_PER_CPU(pgd_t *,current_pgd);
DEFINE_PER_CPU(pgd_t *, current_pgd);
unsigned long cris_signal_return_page;
/*
......
......@@ -177,6 +177,8 @@ SECTIONS
.debug_ranges 0 : { *(.debug_ranges) }
.comment 0 : { *(.comment) }
DISCARDS
}
__kernel_image_size_no_bss = __bss_start - __kernel_image_start;
......@@ -152,9 +152,6 @@ SECTIONS
__end = . ;
__ramstart = .;
}
/DISCARD/ : {
*(.exitcall.exit)
}
.romfs :
{
*(.romfs*)
......@@ -165,4 +162,6 @@ SECTIONS
COMMAND_START = . - 0x200 ;
__ramend = . ;
}
DISCARDS
}
......@@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL
bool
default y
config HAVE_LEGACY_PER_CPU_AREA
def_bool y
config HAVE_SETUP_PER_CPU_AREA
def_bool y
......
......@@ -855,11 +855,17 @@ identify_cpu (struct cpuinfo_ia64 *c)
c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
}
/*
* In UP configuration, setup_per_cpu_areas() is defined in
* include/linux/percpu.h
*/
#ifdef CONFIG_SMP
void __init
setup_per_cpu_areas (void)
{
/* start_kernel() requires this... */
}
#endif
/*
* Do the following calculations:
......
......@@ -58,7 +58,8 @@ static struct local_tlb_flush_counts {
unsigned int count;
} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS];
static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned;
static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS],
shadow_flush_counts);
#define IPI_CALL_FUNC 0
#define IPI_CPU_STOP 1
......
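
The hunk above (and several below) moves the array dimension out of the variable name and into the type argument. A minimal illustration of the two spellings follows; the motivation is presumably that the name gets token-pasted into per_cpu__##name and, later in this series, into the __pcpu_scope_*/__pcpu_unique_* dummy symbols, so it wants to stay a bare identifier. Array accesses keep the same shape in either case, e.g. per_cpu(pmc_prev_left[idx], cpu) in the perf_counter hunk below.

/* old style: dimension attached to the name */
static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned;

/* new style: dimension carried by the type, name stays a plain identifier */
static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS], shadow_flush_counts);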
......@@ -24,14 +24,14 @@ PHDRS {
}
SECTIONS
{
/* Sections to be discarded */
/* unwind exit sections must be discarded before the rest of the
sections get included. */
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
*(.IA_64.unwind.exit.text)
*(.IA_64.unwind_info.exit.text)
}
*(.comment)
*(.note)
}
v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */
phys_start = _start - LOAD_OFFSET;
......@@ -316,7 +316,7 @@ SECTIONS
.debug_funcnames 0 : { *(.debug_funcnames) }
.debug_typenames 0 : { *(.debug_typenames) }
.debug_varnames 0 : { *(.debug_varnames) }
/* These must appear regardless of . */
/DISCARD/ : { *(.comment) }
/DISCARD/ : { *(.note) }
/* Default discards */
DISCARDS
}
......@@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info);
EXPORT_PER_CPU_SYMBOL(__sn_hub_info);
DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]);
DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid);
EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid);
DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda);
......
......@@ -120,13 +120,6 @@ SECTIONS
_end = . ;
/* Sections to be discarded */
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
......@@ -135,4 +128,7 @@ SECTIONS
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
/* Sections to be discarded */
DISCARDS
}
......@@ -82,13 +82,6 @@ SECTIONS
_end = . ;
/* Sections to be discarded */
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
......@@ -97,4 +90,7 @@ SECTIONS
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
/* Sections to be discarded */
DISCARDS
}
......@@ -77,13 +77,6 @@ __init_begin = .;
_end = . ;
/* Sections to be discarded */
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
.crap : {
/* Stabs debugging sections. */
*(.stab)
......@@ -96,4 +89,6 @@ __init_begin = .;
*(.note)
}
/* Sections to be discarded */
DISCARDS
}
......@@ -184,12 +184,6 @@ SECTIONS {
__init_end = .;
} > INIT
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
.bss : {
. = ALIGN(4);
_sbss = . ;
......@@ -200,5 +194,6 @@ SECTIONS {
_end = . ;
} > BSS
DISCARDS
}
......@@ -23,8 +23,8 @@ SECTIONS {
_stext = . ;
*(.text .text.*)
*(.fixup)
*(.exitcall.exit)
EXIT_TEXT
EXIT_CALL
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
......@@ -162,4 +162,6 @@ SECTIONS {
}
. = ALIGN(4096);
_end = .;
DISCARDS
}
......@@ -176,17 +176,6 @@ SECTIONS
_end = . ;
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
/* ABI crap starts here */
*(.MIPS.options)
*(.options)
*(.pdr)
*(.reginfo)
}
/* These mark the ABI of the kernel for debuggers. */
.mdebug.abi32 : {
KEEP(*(.mdebug.abi32))
......@@ -212,4 +201,14 @@ SECTIONS
*(.gptab.bss)
*(.gptab.sbss)
}
/* Sections to be discarded */
DISCARDS
/DISCARD/ : {
/* ABI crap starts here */
*(.MIPS.options)
*(.options)
*(.pdr)
*(.reginfo)
}
}
......@@ -115,12 +115,10 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
pg0 = .;
/* Sections to be discarded */
/DISCARD/ : {
EXIT_CALL
}
STABS_DEBUG
DWARF_DEBUG
/* Sections to be discarded */
DISCARDS
}
......@@ -237,9 +237,12 @@ SECTIONS
/* freed after init ends here */
_end = . ;
STABS_DEBUG
.note 0 : { *(.note) }
/* Sections to be discarded */
DISCARDS
/DISCARD/ : {
*(.exitcall.exit)
#ifdef CONFIG_64BIT
/* temporary hack until binutils is fixed to not emit these
* for static binaries
......@@ -252,7 +255,4 @@ SECTIONS
*(.gnu.hash)
#endif
}
STABS_DEBUG
.note 0 : { *(.note) }
}
......@@ -49,6 +49,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
config HAVE_SETUP_PER_CPU_AREA
def_bool PPC64
config NEED_PER_CPU_EMBED_FIRST_CHUNK
def_bool PPC64
config IRQ_PER_CPU
bool
default y
......
......@@ -57,6 +57,7 @@
#include <asm/cache.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/mmu-hash64.h>
#include <asm/firmware.h>
#include <asm/xmon.h>
#include <asm/udbg.h>
......@@ -569,25 +570,53 @@ void cpu_die(void)
}
#ifdef CONFIG_SMP
void __init setup_per_cpu_areas(void)
#define PCPU_DYN_SIZE ()
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
int i;
unsigned long size;
char *ptr;
/* Copy section for each CPU (we discard the original) */
size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
#ifdef CONFIG_MODULES
if (size < PERCPU_ENOUGH_ROOM)
size = PERCPU_ENOUGH_ROOM;
#endif
return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align,
__pa(MAX_DMA_ADDRESS));
}
for_each_possible_cpu(i) {
ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
static void __init pcpu_fc_free(void *ptr, size_t size)
{
free_bootmem(__pa(ptr), size);
}
paca[i].data_offset = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
}
static int pcpu_cpu_distance(unsigned int from, unsigned int to)
{
if (cpu_to_node(from) == cpu_to_node(to))
return LOCAL_DISTANCE;
else
return REMOTE_DISTANCE;
}
void __init setup_per_cpu_areas(void)
{
const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
size_t atom_size;
unsigned long delta;
unsigned int cpu;
int rc;
/*
* Linear mapping is one of 4K, 1M and 16M. For 4K, no need
* to group units. For larger mappings, use 1M atom which
* should be large enough to contain a number of units.
*/
if (mmu_linear_psize == MMU_PAGE_4K)
atom_size = PAGE_SIZE;
else
atom_size = 1 << 20;
rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
pcpu_fc_alloc, pcpu_fc_free);
if (rc < 0)
panic("cannot initialize percpu area (err=%d)", rc);
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
paca[cpu].data_offset = delta + pcpu_unit_offsets[cpu];
}
#endif
......
......@@ -37,12 +37,6 @@ jiffies = jiffies_64 + 4;
#endif
SECTIONS
{
/* Sections to be discarded. */
/DISCARD/ : {
*(.exitcall.exit)
EXIT_DATA
}
. = KERNELBASE;
/*
......@@ -298,4 +292,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
_end = . ;
PROVIDE32 (end = .);
/* Sections to be discarded. */
DISCARDS
}
......@@ -31,7 +31,7 @@ struct stab_entry {
#define NR_STAB_CACHE_ENTRIES 8
static DEFINE_PER_CPU(long, stab_cache_ptr);
static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
/*
* Create a segment table entry for the given esid/vsid pair.
......
......@@ -37,7 +37,7 @@
*/
#define MSG_COUNT 4
static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]);
static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs);
static void do_message_pass(int target, int msg)
{
......
#ifndef __ARCH_S390_PERCPU__
#define __ARCH_S390_PERCPU__
#include <linux/compiler.h>
#include <asm/lowcore.h>
/*
* s390 uses its own implementation for per cpu data, the offset of
* the cpu local data area is cached in the cpu's lowcore memory.
* For 64 bit module code s390 forces the use of a GOT slot for the
* address of the per cpu variable. This is needed because the module
* may be more than 4G above the per cpu area.
*/
#if defined(__s390x__) && defined(MODULE)
#define SHIFT_PERCPU_PTR(ptr,offset) (({ \
extern int simple_identifier_##var(void); \
unsigned long *__ptr; \
asm ( "larl %0, %1@GOTENT" \
: "=a" (__ptr) : "X" (ptr) ); \
(typeof(ptr))((*__ptr) + (offset)); }))
#else
#define SHIFT_PERCPU_PTR(ptr, offset) (({ \
extern int simple_identifier_##var(void); \
unsigned long __ptr; \
asm ( "" : "=a" (__ptr) : "0" (ptr) ); \
(typeof(ptr)) (__ptr + (offset)); }))
#define __my_cpu_offset S390_lowcore.percpu_offset
/*
* For 64 bit module code, the module may be more than 4G above the
* per cpu area, use weak definitions to force the compiler to
* generate external references.
*/
#if defined(CONFIG_SMP) && defined(__s390x__) && defined(MODULE)
#define ARCH_NEEDS_WEAK_PER_CPU
#endif
#define __my_cpu_offset S390_lowcore.percpu_offset
#include <asm-generic/percpu.h>
#endif /* __ARCH_S390_PERCPU__ */
......@@ -157,13 +157,10 @@ SECTIONS
_end = . ;
/* Sections to be discarded */
/DISCARD/ : {
EXIT_DATA
*(.exitcall.exit)
}
/* Debugging sections. */
STABS_DEBUG
DWARF_DEBUG
/* Sections to be discarded */
DISCARDS
}
......@@ -163,16 +163,14 @@ SECTIONS
_end = . ;
}
STABS_DEBUG
DWARF_DEBUG
/*
* When something in the kernel is NOT compiled as a module, the
* module cleanup code and data are put into these segments. Both
* can then be thrown away, as cleanup code is never called unless
* it's a module.
*/
/DISCARD/ : {
*(.exitcall.exit)
}
STABS_DEBUG
DWARF_DEBUG
DISCARDS
}
......@@ -95,7 +95,7 @@ config AUDIT_ARCH
config HAVE_SETUP_PER_CPU_AREA
def_bool y if SPARC64
config HAVE_DYNAMIC_PER_CPU_AREA
config NEED_PER_CPU_EMBED_FIRST_CHUNK
def_bool y if SPARC64
config GENERIC_HARDIRQS_NO__DO_IRQ
......
......@@ -1389,8 +1389,8 @@ void smp_send_stop(void)
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
unsigned long align)
static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
size_t align)
{
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
......@@ -1415,127 +1415,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
#endif
}
static size_t pcpur_size __initdata;
static void **pcpur_ptrs __initdata;
static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
static void __init pcpu_free_bootmem(void *ptr, size_t size)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
if (off >= pcpur_size)
return NULL;
return virt_to_page(pcpur_ptrs[cpu] + off);
free_bootmem(__pa(ptr), size);
}
#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
static void __init pcpu_map_range(unsigned long start, unsigned long end,
struct page *page)
static int pcpu_cpu_distance(unsigned int from, unsigned int to)
{
unsigned long pfn = page_to_pfn(page);
unsigned long pte_base;
BUG_ON((pfn<<PAGE_SHIFT)&(PCPU_CHUNK_SIZE - 1UL));
pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
_PAGE_CP_4U | _PAGE_CV_4U |
_PAGE_P_4U | _PAGE_W_4U);
if (tlb_type == hypervisor)
pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
_PAGE_CP_4V | _PAGE_CV_4V |
_PAGE_P_4V | _PAGE_W_4V);
while (start < end) {
pgd_t *pgd = pgd_offset_k(start);
unsigned long this_end;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pud = pud_offset(pgd, start);
if (pud_none(*pud)) {
pmd_t *new;
new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
pud_populate(&init_mm, pud, new);
}
pmd = pmd_offset(pud, start);
if (!pmd_present(*pmd)) {
pte_t *new;
new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
pmd_populate_kernel(&init_mm, pmd, new);
}
pte = pte_offset_kernel(pmd, start);
this_end = (start + PMD_SIZE) & PMD_MASK;
if (this_end > end)
this_end = end;
while (start < this_end) {
unsigned long paddr = pfn << PAGE_SHIFT;
pte_val(*pte) = (paddr | pte_base);
start += PAGE_SIZE;
pte++;
pfn++;
}
}
if (cpu_to_node(from) == cpu_to_node(to))
return LOCAL_DISTANCE;
else
return REMOTE_DISTANCE;
}
void __init setup_per_cpu_areas(void)
{
size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
static struct vm_struct vm;
unsigned long delta, cpu;
size_t pcpu_unit_size;
size_t ptrs_size;
pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE);
dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE;
unsigned long delta;
unsigned int cpu;
int rc;
ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0]));
pcpur_ptrs = alloc_bootmem(ptrs_size);
for_each_possible_cpu(cpu) {
pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
PCPU_CHUNK_SIZE);
free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
PCPU_CHUNK_SIZE - pcpur_size);
memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
}
/* allocate address and map */
vm.flags = VM_ALLOC;
vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE;
vm_area_register_early(&vm, PCPU_CHUNK_SIZE);
for_each_possible_cpu(cpu) {
unsigned long start = (unsigned long) vm.addr;
unsigned long end;
start += cpu * PCPU_CHUNK_SIZE;
end = start + PCPU_CHUNK_SIZE;
pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu]));
}
pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size,
PERCPU_MODULE_RESERVE, dyn_size,
PCPU_CHUNK_SIZE, vm.addr, NULL);
free_bootmem(__pa(pcpur_ptrs), ptrs_size);
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
PERCPU_DYNAMIC_RESERVE, 4 << 20,
pcpu_cpu_distance, pcpu_alloc_bootmem,
pcpu_free_bootmem);
if (rc)
panic("failed to initialize first chunk (%d)", rc);
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
__per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
}
for_each_possible_cpu(cpu)
__per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
/* Setup %g5 for the boot cpu. */
__local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
......
......@@ -171,12 +171,8 @@ SECTIONS
}
_end = . ;
/DISCARD/ : {
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
STABS_DEBUG
DWARF_DEBUG
DISCARDS
}
......@@ -123,8 +123,3 @@
__initramfs_end = .;
}
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
}
......@@ -156,4 +156,6 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
DISCARDS
}
......@@ -100,4 +100,6 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
DISCARDS
}
......@@ -150,7 +150,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
config HAVE_SETUP_PER_CPU_AREA
def_bool y
config HAVE_DYNAMIC_PER_CPU_AREA
config NEED_PER_CPU_EMBED_FIRST_CHUNK
def_bool y
config NEED_PER_CPU_PAGE_FIRST_CHUNK
def_bool y
config HAVE_CPUMASK_OF_CPU_MAP
......
......@@ -156,15 +156,6 @@ do { \
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
#ifdef CONFIG_NEED_MULTIPLE_NODES
void *pcpu_lpage_remapped(void *kaddr);
#else
static inline void *pcpu_lpage_remapped(void *kaddr)
{
return NULL;
}
#endif
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_SMP
......
......@@ -30,8 +30,8 @@
#include <asm/apic.h>
#include <asm/desc.h>
static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
static DEFINE_PER_CPU(int, cpu_priv_count);
static DEFINE_MUTEX(cpu_debug_lock);
......
......@@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status)
*/
static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
......@@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
n = &__get_cpu_var(next_interval);
n = &__get_cpu_var(mce_next_interval);
if (mce_notify_irq())
*n = max(*n/2, HZ/100);
else
......@@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
int *n = &__get_cpu_var(next_interval);
int *n = &__get_cpu_var(mce_next_interval);
if (mce_ignore_ce)
return;
......@@ -1912,7 +1912,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
t->expires = round_jiffies(jiffies +
__get_cpu_var(next_interval));
__get_cpu_var(mce_next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
......
......@@ -69,7 +69,7 @@ struct threshold_bank {
struct threshold_block *blocks;
cpumask_var_t cpus;
};
static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
......
......@@ -976,7 +976,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
x86_pmu_disable_counter(hwc, idx);
}
static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
......@@ -1015,7 +1015,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
per_cpu(prev_left[idx], smp_processor_id()) = left;
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
* The hw counter starts counting from this counter offset,
......@@ -1211,7 +1211,7 @@ void perf_counter_print_debug(void)
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
prev_left = per_cpu(prev_left[idx], cpu);
prev_left = per_cpu(pmc_prev_left[idx], cpu);
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
......@@ -1798,8 +1798,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
entry->ip[entry->nr++] = ip;
}
static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
static DEFINE_PER_CPU(int, in_nmi_frame);
......@@ -1952,9 +1952,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
struct perf_callchain_entry *entry;
if (in_nmi())
entry = &__get_cpu_var(nmi_entry);
entry = &__get_cpu_var(pmc_nmi_entry);
else
entry = &__get_cpu_var(irq_entry);
entry = &__get_cpu_var(pmc_irq_entry);
entry->nr = 0;
......
......@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
#define PERCPU_FIRST_CHUNK_RESERVE 0
#endif
#ifdef CONFIG_X86_32
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
......@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
#endif
return false;
}
#endif
/**
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
......@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
}
/*
* Large page remap allocator
*
* This allocator uses PMD page as unit. A PMD page is allocated for
* each cpu and each is remapped into vmalloc area using PMD mapping.
* As PMD page is quite large, only part of it is used for the first
* chunk. Unused part is returned to the bootmem allocator.
*
* So, the PMD pages are mapped twice - once to the physical mapping
* and to the vmalloc area for the first percpu chunk. The double
* mapping does add one more PMD TLB entry pressure but still is much
* better than only using 4k mappings while still being NUMA friendly.
* Helpers for first chunk memory allocation
*/
#ifdef CONFIG_NEED_MULTIPLE_NODES
struct pcpul_ent {
unsigned int cpu;
void *ptr;
};
static size_t pcpul_size;
static struct pcpul_ent *pcpul_map;
static struct vm_struct pcpul_vm;
static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
if (off >= pcpul_size)
return NULL;
return virt_to_page(pcpul_map[cpu].ptr + off);
return pcpu_alloc_bootmem(cpu, size, align);
}
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
size_t map_size, dyn_size;
unsigned int cpu;
int i, j;
ssize_t ret;
if (!chosen) {
size_t vm_size = VMALLOC_END - VMALLOC_START;
size_t tot_size = nr_cpu_ids * PMD_SIZE;
/* on non-NUMA, embedding is better */
if (!pcpu_need_numa())
return -EINVAL;
/* don't consume more than 20% of vmalloc area */
if (tot_size > vm_size / 5) {
pr_info("PERCPU: too large chunk size %zuMB for "
"large page remap\n", tot_size >> 20);
return -EINVAL;
}
}
/* need PSE */
if (!cpu_has_pse) {
pr_warning("PERCPU: lpage allocator requires PSE\n");
return -EINVAL;
}
/*
* Currently supports only single page. Supporting multiple
* pages won't be too difficult if it ever becomes necessary.
*/
pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE);
if (pcpul_size > PMD_SIZE) {
pr_warning("PERCPU: static data is larger than large page, "
"can't use large page\n");
return -EINVAL;
}
dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
/* allocate pointer array and alloc large pages */
map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
pcpul_map = alloc_bootmem(map_size);
for_each_possible_cpu(cpu) {
pcpul_map[cpu].cpu = cpu;
pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
PMD_SIZE);
if (!pcpul_map[cpu].ptr) {
pr_warning("PERCPU: failed to allocate large page "
"for cpu%u\n", cpu);
goto enomem;
}
/*
* Only use pcpul_size bytes and give back the rest.
*
* Ingo: The 2MB up-rounding bootmem is needed to make
* sure the partial 2MB page is still fully RAM - it's
* not well-specified to have a PAT-incompatible area
* (unmapped RAM, device memory, etc.) in that hole.
*/
free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
PMD_SIZE - pcpul_size);
memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
}
/* allocate address and map */
pcpul_vm.flags = VM_ALLOC;
pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
vm_area_register_early(&pcpul_vm, PMD_SIZE);
for_each_possible_cpu(cpu) {
pmd_t *pmd, pmd_v;
pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
cpu * PMD_SIZE);
pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
PAGE_KERNEL_LARGE);
set_pmd(pmd, pmd_v);
}
/* we're ready, commit */
pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", pcpul_vm.addr, static_size);
ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
PMD_SIZE, pcpul_vm.addr, NULL);
/* sort pcpul_map array for pcpu_lpage_remapped() */
for (i = 0; i < nr_cpu_ids - 1; i++)
for (j = i + 1; j < nr_cpu_ids; j++)
if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
struct pcpul_ent tmp = pcpul_map[i];
pcpul_map[i] = pcpul_map[j];
pcpul_map[j] = tmp;
}
return ret;
enomem:
for_each_possible_cpu(cpu)
if (pcpul_map[cpu].ptr)
free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
free_bootmem(__pa(pcpul_map), map_size);
return -ENOMEM;
free_bootmem(__pa(ptr), size);
}
/**
* pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
* @kaddr: the kernel address in question
*
* Determine whether @kaddr falls in the pcpul recycled area. This is
* used by pageattr to detect VM aliases and break up the pcpu PMD
* mapping such that the same physical page is not mapped under
* different attributes.
*
* The recycled area is always at the tail of a partially used PMD
* page.
*
* RETURNS:
* Address of corresponding remapped pcpu address if match is found;
* otherwise, NULL.
*/
void *pcpu_lpage_remapped(void *kaddr)
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
int left = 0, right = nr_cpu_ids - 1;
int pos;
/* pcpul in use at all? */
if (!pcpul_map)
return NULL;
/* okay, perform binary search */
while (left <= right) {
pos = (left + right) / 2;
if (pcpul_map[pos].ptr < pmd_addr)
left = pos + 1;
else if (pcpul_map[pos].ptr > pmd_addr)
right = pos - 1;
else {
/* it shouldn't be in the area for the first chunk */
WARN_ON(offset < pcpul_size);
return pcpul_vm.addr +
pcpul_map[pos].cpu * PMD_SIZE + offset;
}
}
return NULL;
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
if (early_cpu_to_node(from) == early_cpu_to_node(to))
return LOCAL_DISTANCE;
else
return REMOTE_DISTANCE;
#else
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
return -EINVAL;
}
return LOCAL_DISTANCE;
#endif
/*
* Embedding allocator
*
* The first chunk is sized to just contain the static area plus
* module and dynamic reserves and embedded into linear physical
* mapping so that it can use PMD mapping without additional TLB
* pressure.
*/
static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
{
size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
/*
* If large page isn't supported, there's no benefit in doing
* this. Also, embedding allocation doesn't play well with
* NUMA.
*/
if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
return -EINVAL;
return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
}
/*
* 4k page allocator
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page and most of initialization is done by the generic
* setup function.
*/
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_nr_static_pages __initdata;
static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
if (pageno < pcpu4k_nr_static_pages)
return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
return NULL;
}
static void __init pcpu4k_populate_pte(unsigned long addr)
static void __init pcpup_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
static ssize_t __init setup_pcpu_4k(size_t static_size)
{
size_t pages_size;
unsigned int cpu;
int i, j;
ssize_t ret;
pcpu4k_nr_static_pages = PFN_UP(static_size);
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
* sizeof(pcpu4k_pages[0]));
pcpu4k_pages = alloc_bootmem(pages_size);
/* allocate and copy */
j = 0;
for_each_possible_cpu(cpu)
for (i = 0; i < pcpu4k_nr_static_pages; i++) {
void *ptr;
ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
pr_warning("PERCPU: failed to allocate "
"4k page for cpu%u\n", cpu);
goto enomem;
}
memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
pcpu4k_pages[j++] = virt_to_page(ptr);
}
/* we're ready, commit */
pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
pcpu4k_nr_static_pages, static_size);
ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
PERCPU_FIRST_CHUNK_RESERVE, -1,
-1, NULL, pcpu4k_populate_pte);
goto out_free_ar;
enomem:
while (--j >= 0)
free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
ret = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pcpu4k_pages), pages_size);
return ret;
}
/* for explicit first chunk allocator selection */
static char pcpu_chosen_alloc[16] __initdata;
static int __init percpu_alloc_setup(char *str)
{
strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
......@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
void __init setup_per_cpu_areas(void)
{
size_t static_size = __per_cpu_end - __per_cpu_start;
unsigned int cpu;
unsigned long delta;
size_t pcpu_unit_size;
ssize_t ret;
int rc;
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
* Allocate percpu area. If PSE is supported, try to make use
* of large page mappings. Please read comments on top of
* each allocator for details.
* Allocate percpu area. Embedding allocator is our favorite;
* however, on NUMA configurations, it can result in very
* sparse unit mapping and vmalloc area isn't spacious enough
* on 32bit. Use page in that case.
*/
ret = -EINVAL;
if (strlen(pcpu_chosen_alloc)) {
if (strcmp(pcpu_chosen_alloc, "4k")) {
if (!strcmp(pcpu_chosen_alloc, "lpage"))
ret = setup_pcpu_lpage(static_size, true);
else if (!strcmp(pcpu_chosen_alloc, "embed"))
ret = setup_pcpu_embed(static_size, true);
else
pr_warning("PERCPU: unknown allocator %s "
"specified\n", pcpu_chosen_alloc);
if (ret < 0)
pr_warning("PERCPU: %s allocator failed (%zd), "
"falling back to 4k\n",
pcpu_chosen_alloc, ret);
}
} else {
ret = setup_pcpu_lpage(static_size, false);
if (ret < 0)
ret = setup_pcpu_embed(static_size, false);
#ifdef CONFIG_X86_32
if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
rc = -EINVAL;
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
const size_t dyn_size = PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
pcpu_fc_alloc, pcpu_fc_free);
if (rc < 0)
pr_warning("PERCPU: %s allocator failed (%d), "
"falling back to page size\n",
pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (ret < 0)
ret = setup_pcpu_4k(static_size);
if (ret < 0)
panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
static_size, ret);
pcpu_unit_size = ret;
if (rc < 0)
rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
pcpu_fc_alloc, pcpu_fc_free,
pcpup_populate_pte);
if (rc < 0)
panic("cannot initialize percpu area (err=%d)", rc);
/* alrighty, percpu areas up and running */
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
......
......@@ -380,15 +380,12 @@ SECTIONS
_end = .;
}
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
*(.eh_frame)
*(.discard)
}
STABS_DEBUG
DWARF_DEBUG
/* Sections to be discarded */
DISCARDS
/DISCARD/ : { *(.eh_frame) }
}
......
......@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <asm/e820.h>
#include <asm/processor.h>
......@@ -686,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
unsigned long vaddr, remapped;
unsigned long vaddr;
int ret;
if (cpa->pfn >= max_pfn_mapped)
......@@ -744,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa)
}
#endif
/*
* If the PMD page was partially used for per-cpu remapping,
* the recycled area needs to be split and modified. Because
* the area is always proper subset of a PMD page
* cpa->numpages is guaranteed to be 1 for these areas, so
* there's no need to loop over and check for further remaps.
*/
remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
if (remapped) {
WARN_ON(cpa->numpages > 1);
alias_cpa = *cpa;
alias_cpa.vaddr = &remapped;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
ret = __change_page_attr_set_clr(&alias_cpa, 0);
if (ret)
return ret;
}
return 0;
}
......
......@@ -280,15 +280,6 @@ SECTIONS
*(.ResetVector.text)
}
/* Sections to be discarded */
/DISCARD/ :
{
*(.exit.literal)
EXIT_TEXT
EXIT_DATA
*(.exitcall.exit)
}
.xt.lit : { *(.xt.lit) }
.xt.prop : { *(.xt.prop) }
......@@ -321,4 +312,8 @@ SECTIONS
*(.xt.lit)
*(.gnu.linkonce.p*)
}
/* Sections to be discarded */
DISCARDS
/DISCARD/ : { *(.exit.literal) }
}
......@@ -146,7 +146,7 @@ enum arq_state {
#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2)
#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state)
static DEFINE_PER_CPU(unsigned long, ioc_count);
static DEFINE_PER_CPU(unsigned long, as_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
......@@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad);
static void free_as_io_context(struct as_io_context *aic)
{
kfree(aic);
elv_ioc_count_dec(ioc_count);
elv_ioc_count_dec(as_ioc_count);
if (ioc_gone) {
/*
* AS scheduler is exiting, grab exit lock and check
......@@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic)
* complete ioc_gone and set it back to NULL.
*/
spin_lock(&ioc_gone_lock);
if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
complete(ioc_gone);
ioc_gone = NULL;
}
......@@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void)
ret->seek_total = 0;
ret->seek_samples = 0;
ret->seek_mean = 0;
elv_ioc_count_inc(ioc_count);
elv_ioc_count_inc(as_ioc_count);
}
return ret;
......@@ -1507,7 +1507,7 @@ static void __exit as_exit(void)
ioc_gone = &all_gone;
/* ioc_gone's update must be visible before reading ioc_count */
smp_wmb();
if (elv_ioc_count_read(ioc_count))
if (elv_ioc_count_read(as_ioc_count))
wait_for_completion(&all_gone);
synchronize_rcu();
}
......
......@@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125;
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
static DEFINE_PER_CPU(unsigned long, ioc_count);
static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
......@@ -1427,7 +1427,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
cic = container_of(head, struct cfq_io_context, rcu_head);
kmem_cache_free(cfq_ioc_pool, cic);
elv_ioc_count_dec(ioc_count);
elv_ioc_count_dec(cfq_ioc_count);
if (ioc_gone) {
/*
......@@ -1436,7 +1436,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head)
* complete ioc_gone and set it back to NULL
*/
spin_lock(&ioc_gone_lock);
if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
complete(ioc_gone);
ioc_gone = NULL;
}
......@@ -1562,7 +1562,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
INIT_HLIST_NODE(&cic->cic_list);
cic->dtor = cfq_free_io_context;
cic->exit = cfq_exit_io_context;
elv_ioc_count_inc(ioc_count);
elv_ioc_count_inc(cfq_ioc_count);
}
return cic;
......@@ -2668,7 +2668,7 @@ static void __exit cfq_exit(void)
* this also protects us from entering cfq_slab_kill() with
* pending RCU callbacks
*/
if (elv_ioc_count_read(ioc_count))
if (elv_ioc_count_read(cfq_ioc_count))
wait_for_completion(&all_gone);
cfq_slab_kill();
}
......
......@@ -71,7 +71,7 @@ struct cpu_dbs_info_s {
*/
struct mutex timer_mutex;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info);
static unsigned int dbs_enable; /* number of CPUs using this policy */
......@@ -137,7 +137,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info,
struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info,
freq->cpu);
struct cpufreq_policy *policy;
......@@ -297,7 +297,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu(j) {
struct cpu_dbs_info_s *dbs_info;
dbs_info = &per_cpu(cpu_dbs_info, j);
dbs_info = &per_cpu(cs_cpu_dbs_info, j);
dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->prev_cpu_wall);
if (dbs_tuners_ins.ignore_nice)
......@@ -387,7 +387,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
cputime64_t cur_wall_time, cur_idle_time;
unsigned int idle_time, wall_time;
j_dbs_info = &per_cpu(cpu_dbs_info, j);
j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
......@@ -521,7 +521,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
unsigned int j;
int rc;
this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu);
switch (event) {
case CPUFREQ_GOV_START:
......@@ -538,7 +538,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_info_s *j_dbs_info;
j_dbs_info = &per_cpu(cpu_dbs_info, j);
j_dbs_info = &per_cpu(cs_cpu_dbs_info, j);
j_dbs_info->cur_policy = policy;
j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
......
......@@ -78,7 +78,7 @@ struct cpu_dbs_info_s {
*/
struct mutex timer_mutex;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);
static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info);
static unsigned int dbs_enable; /* number of CPUs using this policy */
......@@ -149,7 +149,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
unsigned int freq_hi, freq_lo;
unsigned int index = 0;
unsigned int jiffies_total, jiffies_hi, jiffies_lo;
struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu);
struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
policy->cpu);
if (!dbs_info->freq_table) {
dbs_info->freq_lo = 0;
......@@ -192,7 +193,7 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
static void ondemand_powersave_bias_init_cpu(int cpu)
{
struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
dbs_info->freq_lo = 0;
}
......@@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu(j) {
struct cpu_dbs_info_s *dbs_info;
dbs_info = &per_cpu(cpu_dbs_info, j);
dbs_info = &per_cpu(od_cpu_dbs_info, j);
dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
&dbs_info->prev_cpu_wall);
if (dbs_tuners_ins.ignore_nice)
......@@ -388,7 +389,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
unsigned int load, load_freq;
int freq_avg;
j_dbs_info = &per_cpu(cpu_dbs_info, j);
j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
......@@ -535,7 +536,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
unsigned int j;
int rc;
this_dbs_info = &per_cpu(cpu_dbs_info, cpu);
this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
switch (event) {
case CPUFREQ_GOV_START:
......@@ -553,7 +554,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
dbs_enable++;
for_each_cpu(j, policy->cpus) {
struct cpu_dbs_info_s *j_dbs_info;
j_dbs_info = &per_cpu(cpu_dbs_info, j);
j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
j_dbs_info->cur_policy = policy;
j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
......
......@@ -47,10 +47,10 @@
static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
/* IRQ <-> IPI mapping */
static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
/* Interrupt types. */
enum xen_irq_type {
......@@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
static DEFINE_PER_CPU(unsigned, xed_nesting_count);
/*
* Search the CPUs pending events bitmasks. For each one found, map
* the event number to an irq, and feed it into do_IRQ() for
......@@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
struct pt_regs *old_regs = set_irq_regs(regs);
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
static DEFINE_PER_CPU(unsigned, nesting_count);
unsigned count;
exit_idle();
......@@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
vcpu_info->evtchn_upcall_pending = 0;
if (__get_cpu_var(nesting_count)++)
if (__get_cpu_var(xed_nesting_count)++)
goto out;
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
......@@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
BUG_ON(!irqs_disabled());
count = __get_cpu_var(nesting_count);
__get_cpu_var(nesting_count) = 0;
count = __get_cpu_var(xed_nesting_count);
__get_cpu_var(xed_nesting_count) = 0;
} while(count != 1);
out:
......
......@@ -33,13 +33,10 @@
* BSS_SECTION(0, 0, 0)
* _end = .;
*
* /DISCARD/ : {
* EXIT_TEXT
* EXIT_DATA
* EXIT_CALL
* }
* STABS_DEBUG
* DWARF_DEBUG
*
* DISCARDS // must be the last
* }
*
* [__init_begin, __init_end] is the init section that may be freed after init
......@@ -626,6 +623,23 @@
#define INIT_RAM_FS
#endif
/*
* Default discarded sections.
*
* Some archs want to discard exit text/data at runtime rather than
* link time due to cross-section references such as alt instructions,
* bug table, eh_frame, etc. DISCARDS must be the last of output
* section definitions so that such archs put those in earlier section
* definitions.
*/
#define DISCARDS \
/DISCARD/ : { \
EXIT_TEXT \
EXIT_DATA \
EXIT_CALL \
*(.discard) \
}
/**
* PERCPU_VADDR - define output section for percpu area
* @vaddr: explicit base address (optional)
......
......@@ -10,22 +10,70 @@
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
* 'section' argument. This may be used to affect the parameters governing the
* 'sec' argument. This may be used to affect the parameters governing the
* variable's storage.
*
* NOTE! The sections for the DECLARE and for the DEFINE must match, lest
* linkage errors occur due the compiler generating the wrong code to access
* that section.
*/
#define DECLARE_PER_CPU_SECTION(type, name, section) \
extern \
__attribute__((__section__(PER_CPU_BASE_SECTION section))) \
PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
#define DEFINE_PER_CPU_SECTION(type, name, section) \
__attribute__((__section__(PER_CPU_BASE_SECTION section))) \
PER_CPU_ATTRIBUTES PER_CPU_DEF_ATTRIBUTES \
#define __PCPU_ATTRS(sec) \
__attribute__((section(PER_CPU_BASE_SECTION sec))) \
PER_CPU_ATTRIBUTES
#define __PCPU_DUMMY_ATTRS \
__attribute__((section(".discard"), unused))
/*
* s390 and alpha modules require percpu variables to be defined as
* weak to force the compiler to generate GOT based external
* references for them. This is necessary because percpu sections
* will be located outside of the usually addressable area.
*
* This definition puts the following two extra restrictions when
* defining percpu variables.
*
* 1. The symbol must be globally unique, even the static ones.
* 2. Static percpu variables cannot be defined inside a function.
*
* Archs which need weak percpu definitions should define
* ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary.
*
* To ensure that the generic code observes the above two
* restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak
* definition is used for all cases.
*/
#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
/*
* __pcpu_scope_* dummy variable is used to enforce scope. It
* receives the static modifier when it's used in front of
* DEFINE_PER_CPU() and will trigger build failure if
* DECLARE_PER_CPU() is used for the same variable.
*
* __pcpu_unique_* dummy variable is used to enforce symbol uniqueness
* such that hidden weak symbol collision, which will cause unrelated
* variables to share the same address, can be detected during build.
*/
#define DECLARE_PER_CPU_SECTION(type, name, sec) \
extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
#define DEFINE_PER_CPU_SECTION(type, name, sec) \
__PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
__PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \
__PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \
__typeof__(type) per_cpu__##name
#else
/*
* Normal declaration and definition macros.
*/
#define DECLARE_PER_CPU_SECTION(type, name, sec) \
extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
#define DEFINE_PER_CPU_SECTION(type, name, sec) \
__PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \
__typeof__(type) per_cpu__##name
#endif
/*
* Variant on the per-CPU variable declaration/definition theme used for
......
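
To make the weak-percpu scheme above concrete: assuming PER_CPU_BASE_SECTION is ".data.percpu" and the PER_CPU_ATTRIBUTES/PER_CPU_DEF_ATTRIBUTES hooks are empty, DEFINE_PER_CPU(int, foo) -- i.e. DEFINE_PER_CPU_SECTION(int, foo, "") -- expands roughly to:

/* dummy symbols; they land in .discard and never reach the final image */
__attribute__((section(".discard"), unused)) char __pcpu_scope_foo;
__attribute__((section(".discard"), unused)) char __pcpu_unique_foo;

/* the real variable becomes a weak definition in the percpu section */
__attribute__((section(".data.percpu"))) __attribute__((weak)) __typeof__(int) per_cpu__foo;

Writing static DEFINE_PER_CPU(int, foo) makes __pcpu_scope_foo static, which clashes with the extern emitted by any DECLARE_PER_CPU(int, foo) and breaks the build; a second definition of foo elsewhere collides on the non-weak __pcpu_unique_foo at link time, instead of two weak per_cpu__foo symbols silently merging.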
......@@ -34,7 +34,7 @@
#ifdef CONFIG_SMP
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
/* minimum unit size, also is the maximum supported allocation size */
#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10)
......@@ -57,19 +57,70 @@
#endif
extern void *pcpu_base_addr;
extern const unsigned long *pcpu_unit_offsets;
typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
struct pcpu_group_info {
int nr_units; /* aligned # of units */
unsigned long base_offset; /* base address offset */
unsigned int *cpu_map; /* unit->cpu map, empty
* entries contain NR_CPUS */
};
struct pcpu_alloc_info {
size_t static_size;
size_t reserved_size;
size_t dyn_size;
size_t unit_size;
size_t atom_size;
size_t alloc_size;
size_t __ai_size; /* internal, don't use */
int nr_groups; /* 0 if grouping unnecessary */
struct pcpu_group_info groups[];
};
extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t static_size, size_t reserved_size,
ssize_t dyn_size, ssize_t unit_size,
void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn);
enum pcpu_fc {
PCPU_FC_AUTO,
PCPU_FC_EMBED,
PCPU_FC_PAGE,
extern ssize_t __init pcpu_embed_first_chunk(
size_t static_size, size_t reserved_size,
ssize_t dyn_size, ssize_t unit_size);
PCPU_FC_NR,
};
extern const char *pcpu_fc_names[PCPU_FC_NR];
extern enum pcpu_fc pcpu_chosen_fc;
typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
size_t align);
typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
int nr_units);
extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai);
extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
size_t reserved_size, ssize_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr);
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn);
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
extern int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn);
#endif
/*
* Use this to get to a cpu's version of the per-cpu object
......@@ -80,7 +131,7 @@ extern ssize_t __init pcpu_embed_first_chunk(
extern void *__alloc_reserved_percpu(size_t size, size_t align);
#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
struct percpu_data {
void *ptrs[1];
......@@ -99,11 +150,15 @@ struct percpu_data {
(__typeof__(ptr))__p->ptrs[(cpu)]; \
})
#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
extern void *__alloc_percpu(size_t size, size_t align);
extern void free_percpu(void *__pdata);
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
extern void __init setup_per_cpu_areas(void);
#endif
#else /* CONFIG_SMP */
#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
......@@ -124,6 +179,13 @@ static inline void free_percpu(void *p)
kfree(p);
}
static inline void __init setup_per_cpu_areas(void) { }
static inline void *pcpu_lpage_remapped(void *kaddr)
{
return NULL;
}
#endif /* CONFIG_SMP */
#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \
......
......@@ -115,4 +115,10 @@ extern rwlock_t vmlist_lock;
extern struct vm_struct *vmlist;
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
size_t align, gfp_t gfp_mask);
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
#endif /* _LINUX_VMALLOC_H */
......@@ -353,7 +353,6 @@ static void __init smp_init(void)
#define smp_init() do { } while (0)
#endif
static inline void setup_per_cpu_areas(void) { }
static inline void setup_nr_cpu_ids(void) { }
static inline void smp_prepare_cpus(unsigned int maxcpus) { }
......@@ -374,29 +373,6 @@ static void __init setup_nr_cpu_ids(void)
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
static void __init setup_per_cpu_areas(void)
{
unsigned long size, i;
char *ptr;
unsigned long nr_possible_cpus = num_possible_cpus();
/* Copy section for each CPU (we discard the original) */
size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
ptr = alloc_bootmem_pages(size * nr_possible_cpus);
for_each_possible_cpu(i) {
__per_cpu_offset[i] = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
ptr += size;
}
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
......
......@@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
static void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
......@@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme)
free_percpu(freeme);
}
#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
/* Number of blocks used and allocated. */
static unsigned int pcpu_num_used, pcpu_num_allocated;
......@@ -535,7 +535,7 @@ static int percpu_modinit(void)
}
__initcall(percpu_modinit);
#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
......
......@@ -100,16 +100,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
void __weak perf_counter_print_debug(void) { }
static DEFINE_PER_CPU(int, disable_count);
static DEFINE_PER_CPU(int, perf_disable_count);
void __perf_disable(void)
{
__get_cpu_var(disable_count)++;
__get_cpu_var(perf_disable_count)++;
}
bool __perf_enable(void)
{
return !--__get_cpu_var(disable_count);
return !--__get_cpu_var(perf_disable_count);
}
void perf_disable(void)
......
......@@ -318,12 +318,12 @@ struct task_group root_task_group;
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
......
......@@ -1334,7 +1334,7 @@ static __init void event_trace_self_tests(void)
#ifdef CONFIG_FUNCTION_TRACER
static DEFINE_PER_CPU(atomic_t, test_event_disable);
static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
static void
function_test_events_call(unsigned long ip, unsigned long parent_ip)
......@@ -1350,7 +1350,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
pc = preempt_count();
resched = ftrace_preempt_disable();
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
if (disabled != 1)
goto out;
......@@ -1368,7 +1368,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
trace_nowake_buffer_unlock_commit(event, flags, pc);
out:
atomic_dec(&per_cpu(test_event_disable, cpu));
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
ftrace_preempt_enable(resched);
}
......
......@@ -790,6 +790,21 @@ config DEBUG_BLOCK_EXT_DEVT
Say N if you are unsure.
config DEBUG_FORCE_WEAK_PER_CPU
bool "Force weak per-cpu definitions"
depends on DEBUG_KERNEL
help
s390 and alpha require percpu variables in modules to be
defined weak to work around an addressing range issue which
puts the following two restrictions on percpu variable
definitions.
1. percpu symbols must be unique whether static or not
2. percpu variables can't be defined inside a function
To ensure that generic code follows the above rules, this
option forces all percpu variables to be defined as weak.
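As an illustration of the two rules (editor's sketch with hypothetical names, not part of the patch), a module may do:

	/* OK: file-scope definition with a globally unique name */
	static DEFINE_PER_CPU(int, mydrv_event_count);

	static void mydrv_note_event(void)
	{
		/* NOT OK under rule 2: DEFINE_PER_CPU() inside a function */
		/* static DEFINE_PER_CPU(int, nested_count); */

		__get_cpu_var(mydrv_event_count)++;
	}

This option is meant to make such violations show up at build time on any architecture, not only on s390 and alpha.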
config LKDTM
tristate "Linux Kernel Dump Test Tool Module"
depends on DEBUG_KERNEL
......
......@@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
obj-$(CONFIG_SMP) += percpu.o
else
obj-$(CONFIG_SMP) += allocpercpu.o
......
......@@ -5,6 +5,8 @@
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/bootmem.h>
#include <asm/sections.h>
#ifndef cache_line_size
#define cache_line_size() L1_CACHE_BYTES
......@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
kfree(__percpu_disguise(__pdata));
}
EXPORT_SYMBOL_GPL(free_percpu);
/*
* Generic percpu area setup.
*/
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
void __init setup_per_cpu_areas(void)
{
unsigned long size, i;
char *ptr;
unsigned long nr_possible_cpus = num_possible_cpus();
/* Copy section for each CPU (we discard the original) */
size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
ptr = alloc_bootmem_pages(size * nr_possible_cpus);
for_each_possible_cpu(i) {
__per_cpu_offset[i] = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
ptr += size;
}
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
......@@ -36,7 +36,7 @@ struct test_node {
};
static LIST_HEAD(test_list);
static DEFINE_PER_CPU(void *, test_pointer);
static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
/*
* Some very simple testing. This function needs to be extended for
......@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
}
for_each_possible_cpu(i) {
per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
pr_info("kmemleak: kmalloc(129) = %p\n",
per_cpu(test_pointer, i));
per_cpu(kmemleak_test_pointer, i));
}
return 0;
......
......@@ -610,6 +610,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
......@@ -627,7 +629,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
......@@ -640,7 +641,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
* tasks in balance_dirty_pages(). Period.
*/
preempt_disable();
p = &__get_cpu_var(ratelimits);
p = &__get_cpu_var(bdp_ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
*p = 0;
......
......@@ -8,12 +8,13 @@
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
* chunk is consisted of nr_cpu_ids units and the first chunk is used
* for static percpu variables in the kernel image (special boot time
* alloc/init handling necessary as these areas need to be brought up
* before allocation services are running). Unit grows as necessary
* and all units grow or shrink in unison. When a chunk is filled up,
* another chunk is allocated. ie. in vmalloc area
* chunk consists of a boot-time determined number of units and the
* first chunk is used for static percpu variables in the kernel image
* (special boot time alloc/init handling necessary as these areas
* need to be brought up before allocation services are running).
* Unit grows as necessary and all units grow or shrink in unison.
* When a chunk is filled up, another chunk is allocated. ie. in
* vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
......@@ -22,11 +23,13 @@
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
* c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
* percpu base registers pcpu_unit_size apart.
* c1:u1, c1:u2 and c1:u3. On UMA, units correspond directly to
* cpus. On NUMA, the mapping can be non-linear and even sparse.
* Percpu access can be done by configuring percpu base registers
* according to cpu to unit mapping and pcpu_unit_size.
*
* There are usually many small percpu allocations many of them as
* small as 4 bytes. The allocator organizes chunks into lists
* There are usually many small percpu allocations, many of them being
* as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be equal to or larger than the maximum contiguous
......@@ -43,7 +46,7 @@
*
* To use this allocator, arch code should do the followings.
*
* - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
* - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
......@@ -55,7 +58,9 @@
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
......@@ -89,25 +94,38 @@ struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
struct vm_struct *vm; /* mapped vmalloc region */
void *base_addr; /* base address of this chunk */
int map_used; /* # of map entries used */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
struct vm_struct **vms; /* mapped vmalloc regions */
bool immutable; /* no [de]population allowed */
struct page **page; /* points to page array */
struct page *page_ar[]; /* #cpus * UNIT_PAGES */
unsigned long populated[]; /* populated bitmap */
};
static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_units __read_mostly;
static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
/* cpus with the lowest and highest unit numbers */
static unsigned int pcpu_first_unit_cpu __read_mostly;
static unsigned int pcpu_last_unit_cpu __read_mostly;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
/* group information, used for vm allocation */
static int pcpu_nr_groups __read_mostly;
static const unsigned long *pcpu_group_offsets __read_mostly;
static const size_t *pcpu_group_sizes __read_mostly;
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
......@@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit;
* Synchronization rules.
*
* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
* protects allocation/reclaim paths, chunks and chunk->page arrays.
* The latter is a spinlock and protects the index data structures -
* chunk slots, chunks and area maps in chunks.
* protects allocation/reclaim paths, chunks, populated bitmap and
* vmalloc mapping. The latter is a spinlock and protects the index
* data structures - chunk slots, chunks and area maps in chunks.
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
......@@ -178,31 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
return cpu * pcpu_unit_pages + page_idx;
}
static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
return &chunk->page[pcpu_page_idx(cpu, page_idx)];
return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
return (unsigned long)chunk->vm->addr +
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
(page_idx << PAGE_SHIFT);
}
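A worked example of the new address computation above, with made-up offsets: assuming PAGE_SHIFT == 12 and pcpu_unit_offsets[] maps cpu0 to 0x0 and cpu2 to 0x60000, then

	pcpu_chunk_addr(chunk, 2, 3)
		== (unsigned long)chunk->base_addr + 0x60000 + (3 << 12)
		== (unsigned long)chunk->base_addr + 0x63000

so a cpu's position within the chunk now comes entirely from its unit offset and no longer has to equal cpu * pcpu_unit_size.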
static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
int page_idx)
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
/*
* Any possible cpu id can be used here, so there's no need to
* worry about preemption or cpu hotplug.
*/
return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
page_idx) != NULL;
/* must not be used on pre-mapped chunk */
WARN_ON(chunk->immutable);
return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}
/* set the pointer to a chunk in a page struct */
......@@ -217,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
return (struct pcpu_chunk *)page->index;
}
static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
*rs = find_next_zero_bit(chunk->populated, end, *rs);
*re = find_next_bit(chunk->populated, end, *rs + 1);
}
static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
*rs = find_next_bit(chunk->populated, end, *rs);
*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
}
/*
* (Un)populated page region iterators. Iterate over (un)populated
* page regions between @start and @end in @chunk. @rs and @re should
* be integer variables and will be set to start and end page index of
* the current region.
*/
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
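A minimal usage sketch for the iterators above (editor's example, not in the patch; pcpu_count_unpop_pages() is a hypothetical helper):

	/* count pages in [0, pcpu_unit_pages) that still lack backing pages */
	static int pcpu_count_unpop_pages(struct pcpu_chunk *chunk)
	{
		int rs, re, nr = 0;

		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages)
			nr += re - rs;	/* [rs, re) is one unpopulated run */
		return nr;
	}

pcpu_populate_chunk() and pcpu_depopulate_chunk() below use the same pattern to walk only the page runs that actually need work.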
/**
* pcpu_mem_alloc - allocate memory
* @size: bytes to allocate
......@@ -292,10 +330,10 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
void *first_start = pcpu_first_chunk->vm->addr;
void *first_start = pcpu_first_chunk->base_addr;
/* is it in the first chunk? */
if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
if (addr >= first_start && addr < first_start + pcpu_unit_size) {
/* is it in the reserved area? */
if (addr < first_start + pcpu_reserved_chunk_limit)
return pcpu_reserved_chunk;
......@@ -309,7 +347,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
* space. Note that any possible cpu id can be used here, so
* there's no need to worry about preemption or cpu hotplug.
*/
addr += raw_smp_processor_id() * pcpu_unit_size;
addr += pcpu_unit_offsets[raw_smp_processor_id()];
return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
......@@ -558,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
}
/**
* pcpu_unmap - unmap pages out of a pcpu_chunk
* pcpu_get_pages_and_bitmap - get temp pages array and bitmap
* @chunk: chunk of interest
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
* @flush_tlb: whether to flush tlb or not
* @bitmapp: output parameter for bitmap
* @may_alloc: may allocate the array
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* If @flush is true, vcache is flushed before unmapping and tlb
* after.
* Returns pointer to array of pointers to struct page and bitmap,
* both of which can be indexed with pcpu_page_idx(). The returned
* array is cleared to zero and *@bitmapp is copied from
* @chunk->populated. Note that there is only one array and bitmap
* and access exclusion is the caller's responsibility.
*
* CONTEXT:
* pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
* Otherwise, don't care.
*
* RETURNS:
* Pointer to temp pages array on success, NULL on failure.
*/
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
bool flush_tlb)
static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
unsigned long **bitmapp,
bool may_alloc)
{
unsigned int last = nr_cpu_ids - 1;
unsigned int cpu;
static struct page **pages;
static unsigned long *bitmap;
size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
sizeof(unsigned long);
if (!pages || !bitmap) {
if (may_alloc && !pages)
pages = pcpu_mem_alloc(pages_size);
if (may_alloc && !bitmap)
bitmap = pcpu_mem_alloc(bitmap_size);
if (!pages || !bitmap)
return NULL;
}
/* unmap must not be done on immutable chunk */
WARN_ON(chunk->immutable);
memset(pages, 0, pages_size);
bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
/*
* Each flushing trial can be very expensive, issue flush on
* the whole region at once rather than doing it for each cpu.
* This could be an overkill but is more scalable.
*/
flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
*bitmapp = bitmap;
return pages;
}
for_each_possible_cpu(cpu)
unmap_kernel_range_noflush(
pcpu_chunk_addr(chunk, cpu, page_start),
(page_end - page_start) << PAGE_SHIFT);
/* ditto as flush_cache_vunmap() */
if (flush_tlb)
flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
/**
* pcpu_free_pages - free pages which were allocated for @chunk
* @chunk: chunk pages were allocated for
* @pages: array of pages to be freed, indexed by pcpu_page_idx()
* @populated: populated bitmap
* @page_start: page index of the first page to be freed
* @page_end: page index of the last page to be freed + 1
*
* Free pages [@page_start, @page_end) in @pages for all units.
* The pages were allocated for @chunk.
*/
static void pcpu_free_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{
unsigned int cpu;
int i;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page *page = pages[pcpu_page_idx(cpu, i)];
if (page)
__free_page(page);
}
}
}
/**
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
* @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
* and tlb after.
*
* CONTEXT:
* pcpu_alloc_mutex.
* pcpu_alloc_pages - allocates pages for @chunk
* @chunk: target chunk
* @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
* @populated: populated bitmap
* @page_start: page index of the first page to be allocated
* @page_end: page index of the last page to be allocated + 1
*
* Allocate pages [@page_start,@page_end) into @pages for all units.
* The allocation is for @chunk. Percpu core doesn't care about the
* content of @pages and will pass it verbatim to pcpu_map_pages().
*/
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
bool flush)
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
int unmap_start = -1;
int uninitialized_var(unmap_end);
const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
unsigned int cpu;
int i;
for (i = page_start; i < page_end; i++) {
for_each_possible_cpu(cpu) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
if (!*pagep) {
pcpu_free_pages(chunk, pages, populated,
page_start, page_end);
return -ENOMEM;
}
}
}
return 0;
}
if (!*pagep)
continue;
/**
* pcpu_pre_unmap_flush - flush cache prior to unmapping
* @chunk: chunk the regions to be flushed belongs to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages in [@page_start,@page_end) of @chunk are about to be
* unmapped. Flush cache. As each flushing trial can be very
* expensive, issue flush on the whole region at once rather than
* doing it for each cpu. This could be an overkill but is more
* scalable.
*/
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vunmap(
pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
}
__free_page(*pagep);
/**
* pcpu_unmap_pages - unmap pages out of a pcpu_chunk
* @chunk: chunk of interest
* @pages: pages array which can be used to pass information to free
* @populated: populated bitmap
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* Corresponding elements in @pages were cleared by the caller and can
* be used to carry information to pcpu_free_pages() which will be
* called after all unmaps are finished. The caller should call
* proper pre/post flush functions.
*/
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{
unsigned int cpu;
int i;
/*
* If it's partial depopulation, it might get
* populated or depopulated again. Mark the
* page gone.
*/
*pagep = NULL;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page *page;
unmap_start = unmap_start < 0 ? i : unmap_start;
unmap_end = i + 1;
page = pcpu_chunk_page(chunk, cpu, i);
WARN_ON(!page);
pages[pcpu_page_idx(cpu, i)] = page;
}
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
page_end - page_start);
}
if (unmap_start >= 0)
pcpu_unmap(chunk, unmap_start, unmap_end, flush);
for (i = page_start; i < page_end; i++)
__clear_bit(i, populated);
}
/**
* pcpu_post_unmap_tlb_flush - flush TLB after unmapping
* @chunk: pcpu_chunk the regions to be flushed belong to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
* TLB for the regions. This can be skipped if the area is to be
* returned to vmalloc as vmalloc will handle TLB flushing lazily.
*
* As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
* for the whole region.
*/
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_tlb_kernel_range(
pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
int nr_pages)
{
return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
PAGE_KERNEL, pages);
}
/**
* pcpu_map - map pages into a pcpu_chunk
* pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest
* @pages: pages array containing pages to be mapped
* @populated: populated bitmap
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
* For each cpu, map pages [@page_start,@page_end) into @chunk.
* vcache is flushed afterwards.
* For each cpu, map pages [@page_start,@page_end) into @chunk. The
* caller is responsible for calling pcpu_post_map_flush() after all
* mappings are complete.
*
* This function is responsible for setting corresponding bits in
* @chunk->populated bitmap and whatever is necessary for reverse
* lookup (addr -> chunk).
*/
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
static int pcpu_map_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{
unsigned int last = nr_cpu_ids - 1;
unsigned int cpu;
int err;
/* map must not be done on immutable chunk */
WARN_ON(chunk->immutable);
unsigned int cpu, tcpu;
int i, err;
for_each_possible_cpu(cpu) {
err = map_kernel_range_noflush(
pcpu_chunk_addr(chunk, cpu, page_start),
(page_end - page_start) << PAGE_SHIFT,
PAGE_KERNEL,
pcpu_chunk_pagep(chunk, cpu, page_start));
err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
&pages[pcpu_page_idx(cpu, page_start)],
page_end - page_start);
if (err < 0)
return err;
goto err;
}
/* mapping successful, link chunk and mark populated */
for (i = page_start; i < page_end; i++) {
for_each_possible_cpu(cpu)
pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
chunk);
__set_bit(i, populated);
}
/* flush at once, please read comments in pcpu_unmap() */
flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
return 0;
err:
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
break;
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
return err;
}
/**
* pcpu_post_map_flush - flush cache after mapping
* @chunk: pcpu_chunk the regions to be flushed belong to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages [@page_start,@page_end) of @chunk have been mapped. Flush
* cache.
*
* As with pcpu_pre_unmap_flush(), the cache flush is done at once
* for the whole region.
*/
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vmap(
pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}
/**
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
* @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
* and tlb after.
*
* CONTEXT:
* pcpu_alloc_mutex.
*/
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
struct page **pages;
unsigned long *populated;
int rs, re;
/* quick path, check whether it's empty already */
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
if (rs == page_start && re == page_end)
return;
break;
}
/* immutable chunks can't be depopulated */
WARN_ON(chunk->immutable);
/*
* If control reaches here, there must have been at least one
* successful population attempt so the temp pages array must
* be available now.
*/
pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
BUG_ON(!pages);
/* unmap and free */
pcpu_pre_unmap_flush(chunk, page_start, page_end);
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
pcpu_unmap_pages(chunk, pages, populated, rs, re);
/* no need to flush tlb, vmalloc will handle it lazily */
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
pcpu_free_pages(chunk, pages, populated, rs, re);
/* commit new bitmap */
bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
}
/**
......@@ -693,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
int map_start = -1;
int uninitialized_var(map_end);
int free_end = page_start, unmap_end = page_start;
struct page **pages;
unsigned long *populated;
unsigned int cpu;
int i;
int rs, re, rc;
for (i = page_start; i < page_end; i++) {
if (pcpu_chunk_page_occupied(chunk, i)) {
if (map_start >= 0) {
if (pcpu_map(chunk, map_start, map_end))
goto err;
map_start = -1;
}
continue;
}
/* quick path, check whether all pages are already there */
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
if (rs == page_start && re == page_end)
goto clear;
break;
}
map_start = map_start < 0 ? i : map_start;
map_end = i + 1;
/* need to allocate and map pages, this chunk can't be immutable */
WARN_ON(chunk->immutable);
for_each_possible_cpu(cpu) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
if (!pages)
return -ENOMEM;
*pagep = alloc_pages_node(cpu_to_node(cpu),
alloc_mask, 0);
if (!*pagep)
goto err;
pcpu_set_page_chunk(*pagep, chunk);
}
/* alloc and map */
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
if (rc)
goto err_free;
free_end = re;
}
if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
goto err;
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
rc = pcpu_map_pages(chunk, pages, populated, rs, re);
if (rc)
goto err_unmap;
unmap_end = re;
}
pcpu_post_map_flush(chunk, page_start, page_end);
/* commit new bitmap */
bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
clear:
for_each_possible_cpu(cpu)
memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
size);
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
return 0;
err:
/* likely under heavy memory pressure, give memory back */
pcpu_depopulate_chunk(chunk, off, size, true);
return -ENOMEM;
err_unmap:
pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
pcpu_unmap_pages(chunk, pages, populated, rs, re);
pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
err_free:
pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
pcpu_free_pages(chunk, pages, populated, rs, re);
return rc;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
if (chunk->vm)
free_vm_area(chunk->vm);
if (chunk->vms)
pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
kfree(chunk);
}
......@@ -760,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->page = chunk->page_ar;
chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
pcpu_nr_groups, pcpu_atom_size,
GFP_KERNEL);
if (!chunk->vms) {
free_pcpu_chunk(chunk);
return NULL;
}
......@@ -771,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
return chunk;
}
......@@ -860,7 +1112,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
mutex_unlock(&pcpu_alloc_mutex);
return __addr_to_pcpu_ptr(chunk->vm->addr + off);
/* return address relative to base address */
return __addr_to_pcpu_ptr(chunk->base_addr + off);
fail_unlock:
spin_unlock_irq(&pcpu_lock);
......@@ -938,12 +1191,13 @@ static void pcpu_reclaim(struct work_struct *work)
}
spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
mutex_unlock(&pcpu_alloc_mutex);
}
/**
......@@ -968,7 +1222,7 @@ void free_percpu(void *ptr)
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->vm->addr;
off = addr - chunk->base_addr;
pcpu_free_area(chunk, off);
......@@ -987,30 +1241,295 @@ void free_percpu(void *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)
{
size_t size_sum;
size_sum = PFN_ALIGN(static_size + reserved_size +
(*dyn_sizep >= 0 ? *dyn_sizep : 0));
if (*dyn_sizep != 0)
*dyn_sizep = size_sum - static_size - reserved_size;
return size_sum;
}
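A quick worked example for the helper above, with made-up sizes: static_size = 42K and reserved_size = 8K add up to 50K; with 4K pages PFN_ALIGN() rounds that to 52K, so for *dyn_sizep == -1 the function returns size_sum = 52K and rewrites *dyn_sizep to 52K - 42K - 8K = 2K. In other words, an "auto" dynamic area only soaks up the page-alignment slack, while an explicit *dyn_sizep of 0 is left untouched.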
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
* pcpu_alloc_alloc_info - allocate percpu allocation info
* @nr_groups: the number of groups
* @nr_units: the number of units
*
* Allocate ai which is large enough for @nr_groups groups containing
* @nr_units units. The returned ai's groups[0].cpu_map points to the
* cpu_map array which is long enough for @nr_units and filled with
* NR_CPUS. It's the caller's responsibility to initialize cpu_map
* pointer of other groups.
*
* RETURNS:
* Pointer to the allocated pcpu_alloc_info on success, NULL on
* failure.
*/
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
int nr_units)
{
struct pcpu_alloc_info *ai;
size_t base_size, ai_size;
void *ptr;
int unit;
base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
if (!ptr)
return NULL;
ai = ptr;
ptr += base_size;
ai->groups[0].cpu_map = ptr;
for (unit = 0; unit < nr_units; unit++)
ai->groups[0].cpu_map[unit] = NR_CPUS;
ai->nr_groups = nr_groups;
ai->__ai_size = PFN_ALIGN(ai_size);
return ai;
}
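The size arithmetic above packs everything into a single bootmem block, roughly laid out as follows (editor's sketch; padding introduced by the ALIGN() is omitted):

	/*
	 *  ai                           ai->groups[0].cpu_map
	 *  |                            |
	 *  v                            v
	 *  +--------------------------+-------------------------------+
	 *  | struct pcpu_alloc_info   | nr_units entries, all preset  |
	 *  | + nr_groups group infos  | to NR_CPUS (unit -> cpu map)  |
	 *  +--------------------------+-------------------------------+
	 *  |<------- base_size ------>|<- nr_units*sizeof(cpu_map[0])>|
	 */

Callers such as pcpu_build_alloc_info() then slice this single cpu_map array by pointing each group's cpu_map into it.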
/**
* pcpu_free_alloc_info - free percpu allocation info
* @ai: pcpu_alloc_info to free
*
* Free @ai which was allocated by pcpu_alloc_alloc_info().
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
free_bootmem(__pa(ai), ai->__ai_size);
}
/**
* pcpu_build_alloc_info - build alloc_info considering distances between CPUs
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
* @base_addr: mapped address, NULL for auto
* @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
*
* This function determines grouping of units, their mappings to cpus
* and other parameters considering needed percpu size, allocation
* atom size and distances between CPUs.
*
* Groups are always multiples of atom size and CPUs which are of
* LOCAL_DISTANCE both ways are grouped together and share space for
* units in the same group. The returned configuration is guaranteed
* to have CPUs on different nodes on different groups and >=75% usage
* of allocated virtual address space.
*
* RETURNS:
* On success, pointer to the new pcpu_alloc_info is returned. On
* failure, ERR_PTR value is returned.
*/
struct pcpu_alloc_info * __init pcpu_build_alloc_info(
size_t reserved_size, ssize_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
unsigned int *cpu_map;
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of atom_size and is the smallest
* which can accommodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
upa--;
max_upa = upa;
/* group cpus according to their proximity */
for_each_possible_cpu(cpu) {
group = 0;
next_group:
for_each_possible_cpu(tcpu) {
if (cpu == tcpu)
break;
if (group_map[tcpu] == group && cpu_distance_fn &&
(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
group++;
nr_groups = max(nr_groups, group + 1);
goto next_group;
}
}
group_map[cpu] = group;
group_cnt[group]++;
group_cnt_max = max(group_cnt_max, group_cnt[group]);
}
/*
* Expand unit size until address space usage goes over 75%
* and then as much as possible without using more address
* space.
*/
last_allocs = INT_MAX;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
continue;
for (group = 0; group < nr_groups; group++) {
int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
allocs += this_allocs;
wasted += this_allocs * upa - group_cnt[group];
}
/*
* Don't accept if wastage is over 25%. The
* greater-than comparison ensures upa==1 always
* passes the following check.
*/
if (wasted > num_possible_cpus() / 3)
continue;
/* and then don't consume more memory */
if (allocs > last_allocs)
break;
last_allocs = allocs;
best_upa = upa;
}
upa = best_upa;
/* allocate and fill alloc_info */
for (group = 0; group < nr_groups; group++)
nr_units += roundup(group_cnt[group], upa);
ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
if (!ai)
return ERR_PTR(-ENOMEM);
cpu_map = ai->groups[0].cpu_map;
for (group = 0; group < nr_groups; group++) {
ai->groups[group].cpu_map = cpu_map;
cpu_map += roundup(group_cnt[group], upa);
}
ai->static_size = static_size;
ai->reserved_size = reserved_size;
ai->dyn_size = dyn_size;
ai->unit_size = alloc_size / upa;
ai->atom_size = atom_size;
ai->alloc_size = alloc_size;
for (group = 0, unit = 0; group_cnt[group]; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
/*
* Initialize base_offset as if all groups are located
* back-to-back. The caller should update this to
* reflect actual allocation.
*/
gi->base_offset = unit * ai->unit_size;
for_each_possible_cpu(cpu)
if (group_map[cpu] == group)
gi->cpu_map[gi->nr_units++] = cpu;
gi->nr_units = roundup(gi->nr_units, upa);
unit += gi->nr_units;
}
BUG_ON(unit != nr_units);
return ai;
}
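To make the sizing logic above concrete, a worked example with made-up inputs: say pcpu_calc_fc_sizes() yields size_sum = 72K, PCPU_MIN_UNIT_SIZE is below that so min_unit_size = 72K, and atom_size = 2M. Then alloc_size = roundup(72K, 2M) = 2M and the starting upa is 2M / 72K = 28; the first while loop walks upa down until 2M divides evenly by upa and the resulting unit size is page aligned, which first happens at upa = 16 (unit size 128K), so max_upa = 16. The second loop then decreases upa further, enlarging the unit, as long as wastage stays within bounds and the number of atom-sized allocations does not grow.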
/**
* pcpu_dump_alloc_info - print out information about pcpu_alloc_info
* @lvl: loglevel
* @ai: allocation info to dump
*
* Print out information about @ai using loglevel @lvl.
*/
static void pcpu_dump_alloc_info(const char *lvl,
const struct pcpu_alloc_info *ai)
{
int group_width = 1, cpu_width = 1, width;
char empty_str[] = "--------";
int alloc = 0, alloc_end = 0;
int group, v;
int upa, apl; /* units per alloc, allocs per line */
v = ai->nr_groups;
while (v /= 10)
group_width++;
v = num_possible_cpus();
while (v /= 10)
cpu_width++;
empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
upa = ai->alloc_size / ai->unit_size;
width = upa * (cpu_width + 1) + group_width + 3;
apl = rounddown_pow_of_two(max(60 / width, 1));
printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
for (group = 0; group < ai->nr_groups; group++) {
const struct pcpu_group_info *gi = &ai->groups[group];
int unit = 0, unit_end = 0;
BUG_ON(gi->nr_units % upa);
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
printk("\n");
printk("%spcpu-alloc: ", lvl);
}
printk("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
printk("%0*d ", cpu_width,
gi->cpu_map[unit]);
else
printk("%s ", empty_str);
}
}
printk("\n");
}
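For reading the resulting boot output: the first line the function prints decodes as s = static size, r = reserved size, d = dynamic size and u = unit size (all in bytes), while alloc=N*A says each group allocation covers N atoms of A bytes (alloc_size = N * A). The bracketed rows that follow show, per allocation, the group number and the cpu owning each unit, with runs of '-' marking padding units that have no cpu assigned.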
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @ai: pcpu_alloc_info describing how the percpu area is shaped
* @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
* percpu area. This function is to be called from arch percpu area
* setup path. The first two parameters are mandatory. The rest are
* optional.
*
* @get_page_fn() should return pointer to percpu page given cpu
* number and page number. It should at least return enough pages to
* cover the static area. The returned pages for static area should
* have been initialized with valid data. If @unit_size is specified,
* it can also return pages after the static area. NULL return
* indicates end of pages for the cpu. Note that @get_page_fn() must
* return the same number of pages for all cpus.
*
* @reserved_size, if non-zero, specifies the amount of bytes to
* setup path.
*
* @ai contains all information necessary to initialize the first
* chunk and prime the dynamic percpu allocator.
*
* @ai->static_size is the size of static percpu area.
*
* @ai->reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
......@@ -1018,22 +1537,29 @@ EXPORT_SYMBOL_GPL(free_percpu);
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
* @dyn_size, if non-negative, determines the number of bytes
* available for dynamic allocation in the first chunk. Specifying
* non-negative value makes percpu leave alone the area beyond
* @static_size + @reserved_size + @dyn_size.
* @ai->dyn_size determines the number of bytes available for dynamic
* allocation in the first chunk. The area between @ai->static_size +
* @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
*
* @unit_size, if non-negative, specifies unit size and must be
* aligned to PAGE_SIZE and equal to or larger than @static_size +
* @reserved_size + if non-negative, @dyn_size.
* @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
* and equal to or larger than @ai->static_size + @ai->reserved_size +
* @ai->dyn_size.
*
* Non-null @base_addr means that the caller already allocated virtual
* region for the first chunk and mapped it. percpu must not mess
* with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
* @populate_pte_fn doesn't make any sense.
* @ai->atom_size is the allocation atom size and used as alignment
* for vm areas.
*
* @populate_pte_fn is used to populate the pagetable. NULL means the
* caller already populated the pagetable.
* @ai->alloc_size is the allocation size and always multiple of
* @ai->atom_size. This is larger than @ai->atom_size if
* @ai->unit_size is larger than @ai->atom_size.
*
* @ai->nr_groups and @ai->groups describe virtual memory layout of
* percpu areas. Units which should be colocated are put into the
* same group. Dynamic VM areas will be allocated according to these
* groupings. If @ai->nr_groups is zero, a single group containing
* all units is assumed.
*
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
*
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
......@@ -1043,49 +1569,83 @@ EXPORT_SYMBOL_GPL(free_percpu);
* and available for dynamic allocation like any other chunks.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
* 0 on success, -errno on failure.
*/
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t static_size, size_t reserved_size,
ssize_t dyn_size, ssize_t unit_size,
void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn)
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
static struct vm_struct first_vm;
static int smap[2], dmap[2];
size_t size_sum = static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0);
size_t dyn_size = ai->dyn_size;
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
struct pcpu_chunk *schunk, *dchunk = NULL;
unsigned long *group_offsets;
size_t *group_sizes;
unsigned long *unit_off;
unsigned int cpu;
int nr_pages;
int err, i;
int *unit_map;
int group, unit, i;
/* santiy checks */
/* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
if (unit_size >= 0) {
BUG_ON(unit_size < size_sum);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
} else
BUG_ON(base_addr);
BUG_ON(base_addr && populate_pte_fn);
if (unit_size >= 0)
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
else
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
PFN_UP(size_sum));
BUG_ON(ai->nr_groups <= 0);
BUG_ON(!ai->static_size);
BUG_ON(!base_addr);
BUG_ON(ai->unit_size < size_sum);
BUG_ON(ai->unit_size & ~PAGE_MASK);
BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
pcpu_dump_alloc_info(KERN_DEBUG, ai);
/* process group information and build config tables accordingly */
group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = NR_CPUS;
pcpu_first_unit_cpu = NR_CPUS;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
const struct pcpu_group_info *gi = &ai->groups[group];
group_offsets[group] = gi->base_offset;
group_sizes[group] = gi->nr_units * ai->unit_size;
for (i = 0; i < gi->nr_units; i++) {
cpu = gi->cpu_map[i];
if (cpu == NR_CPUS)
continue;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+ nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
BUG_ON(unit_map[cpu] != NR_CPUS);
if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size;
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
if (pcpu_first_unit_cpu == NR_CPUS)
pcpu_first_unit_cpu = cpu;
}
}
pcpu_last_unit_cpu = cpu;
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
BUG_ON(unit_map[cpu] == NR_CPUS);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
pcpu_group_sizes = group_sizes;
pcpu_unit_map = unit_map;
pcpu_unit_offsets = unit_off;
/* determine basic parameters */
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_atom_size = ai->atom_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
/*
* Allocate chunk slots. The additional last slot is for
......@@ -1105,189 +1665,351 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
*/
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->vm = &first_vm;
schunk->base_addr = base_addr;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
schunk->page = schunk->page_ar;
schunk->immutable = true;
bitmap_fill(schunk->populated, pcpu_unit_pages);
if (reserved_size) {
schunk->free_size = reserved_size;
if (ai->reserved_size) {
schunk->free_size = ai->reserved_size;
pcpu_reserved_chunk = schunk;
pcpu_reserved_chunk_limit = static_size + reserved_size;
pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
} else {
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
}
schunk->contig_hint = schunk->free_size;
schunk->map[schunk->map_used++] = -static_size;
schunk->map[schunk->map_used++] = -ai->static_size;
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;
/* init dynamic chunk if necessary */
if (dyn_size) {
dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm;
dchunk->base_addr = base_addr;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
dchunk->page = schunk->page_ar; /* share page map with schunk */
dchunk->immutable = true;
bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
/* allocate vm address */
first_vm.flags = VM_ALLOC;
first_vm.size = pcpu_chunk_size;
if (!base_addr)
vm_area_register_early(&first_vm, PAGE_SIZE);
else {
/*
* Pages already mapped. No need to remap into
* vmalloc area. In this case the first chunks can't
* be mapped or unmapped by percpu and are marked
* immutable.
*/
first_vm.addr = base_addr;
schunk->immutable = true;
if (dchunk)
dchunk->immutable = true;
}
/* assign pages */
nr_pages = -1;
for_each_possible_cpu(cpu) {
for (i = 0; i < pcpu_unit_pages; i++) {
struct page *page = get_page_fn(cpu, i);
if (!page)
break;
*pcpu_chunk_pagep(schunk, cpu, i) = page;
}
BUG_ON(i < PFN_UP(static_size));
if (nr_pages < 0)
nr_pages = i;
else
BUG_ON(nr_pages != i);
}
/* map them */
if (populate_pte_fn) {
for_each_possible_cpu(cpu)
for (i = 0; i < nr_pages; i++)
populate_pte_fn(pcpu_chunk_addr(schunk,
cpu, i));
err = pcpu_map(schunk, 0, nr_pages);
if (err)
panic("failed to setup static percpu area, err=%d\n",
err);
}
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
return pcpu_unit_size;
pcpu_base_addr = base_addr;
return 0;
}
/*
* Embedding first chunk setup helper.
*/
static void *pcpue_ptr __initdata;
static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
[PCPU_FC_PAGE] = "page",
};
static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
if (off >= pcpue_size)
return NULL;
static int __init percpu_alloc_setup(char *str)
{
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
else if (!strcmp(str, "embed"))
pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
else if (!strcmp(str, "page"))
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
else
pr_warning("PERCPU: unknown allocator %s specified\n", str);
return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);
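This handler is what backs the percpu_alloc= kernel parameter: booting with, for example,

	percpu_alloc=page

selects the page-by-page first chunk allocator on architectures that provide it, percpu_alloc=embed selects the embedding allocator, and anything else merely triggers the warning above and leaves the automatic selection in place.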
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
* @alloc_fn: function to allocate percpu page
* @free_fn: function to free percpu page
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* as a contiguous area using bootmem allocator and used as-is without
* being mapped into vmalloc area. This enables the first chunk to
* piggy back on the linear physical mapping which often uses larger
* page size.
* by calling @alloc_fn and used as-is without being mapped into
* vmalloc area. Allocations are always whole multiples of @atom_size
* aligned to @atom_size.
*
* This enables the first chunk to piggy back on the linear physical
* mapping which often uses larger page size. Please note that this
* can result in very sparse cpu->unit mapping on NUMA machines thus
* requiring large vmalloc address space. Don't use this allocator if
* vmalloc space is not orders of magnitude larger than distances
* between node memory addresses (ie. 32bit NUMA machines).
*
* When @dyn_size is positive, dynamic area might be larger than
* specified to fill page alignment. Also, when @dyn_size is auto,
* @dyn_size does not fill the whole first chunk but only what's
* necessary for page alignment after static and reserved areas.
* specified to fill page alignment. When @dyn_size is auto,
* @dyn_size is just big enough to fill page alignment after static
* and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
* size, the leftover is returned using @free_fn.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
* 0 on success, -errno on failure.
*/
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
ssize_t dyn_size, ssize_t unit_size)
int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn)
{
size_t chunk_size;
unsigned int cpu;
void *base = (void *)ULONG_MAX;
void **areas = NULL;
struct pcpu_alloc_info *ai;
size_t size_sum, areas_size;
int group, i, rc;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
if (IS_ERR(ai))
return PTR_ERR(ai);
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
areas = alloc_bootmem_nopanic(areas_size);
if (!areas) {
rc = -ENOMEM;
goto out_free;
}
/* determine parameters and allocate */
pcpue_size = PFN_ALIGN(static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0));
if (dyn_size != 0)
dyn_size = pcpue_size - static_size - reserved_size;
if (unit_size >= 0) {
BUG_ON(unit_size < pcpue_size);
pcpue_unit_size = unit_size;
} else
pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
chunk_size = pcpue_unit_size * nr_cpu_ids;
pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS));
if (!pcpue_ptr) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
return -ENOMEM;
/* allocate, copy and determine base address */
for (group = 0; group < ai->nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
unsigned int cpu = NR_CPUS;
void *ptr;
for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
cpu = gi->cpu_map[i];
BUG_ON(cpu == NR_CPUS);
/* allocate space for the whole group */
ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
if (!ptr) {
rc = -ENOMEM;
goto out_free_areas;
}
areas[group] = ptr;
base = min(ptr, base);
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
/* unused unit, free whole */
free_fn(ptr, ai->unit_size);
continue;
}
/* copy and return the unused part */
memcpy(ptr, __per_cpu_load, ai->static_size);
free_fn(ptr + size_sum, ai->unit_size - size_sum);
}
}
/* return the leftover and copy */
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
/* base address is now known, determine group base offsets */
for (group = 0; group < ai->nr_groups; group++)
ai->groups[group].base_offset = areas[group] - base;
pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
rc = pcpu_setup_first_chunk(ai, base);
goto out_free;
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
free_fn(areas[group],
ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
free_bootmem(__pa(areas), areas_size);
return rc;
}
#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
!CONFIG_HAVE_SETUP_PER_CPU_AREA */
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: function to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page into vmalloc area.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct vm;
struct pcpu_alloc_info *ai;
char psize_str[16];
int unit_pages;
size_t pages_size;
struct page **pages;
int unit, i, j, rc;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
unit_pages = ai->unit_size >> PAGE_SHIFT;
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
pages = alloc_bootmem(pages_size);
/* allocate pages */
j = 0;
for (unit = 0; unit < num_possible_cpus(); unit++)
for (i = 0; i < unit_pages; i++) {
unsigned int cpu = ai->groups[0].cpu_map[unit];
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
pr_warning("PERCPU: failed to allocate %s page "
"for cpu%u\n", psize_str, cpu);
goto enomem;
}
pages[j++] = virt_to_page(ptr);
}
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
vm.size = num_possible_cpus() * ai->unit_size;
vm_area_register_early(&vm, PAGE_SIZE);
for (unit = 0; unit < num_possible_cpus(); unit++) {
unsigned long unit_addr =
(unsigned long)vm.addr + unit * ai->unit_size;
for (i = 0; i < unit_pages; i++)
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
unit_pages);
if (rc < 0)
panic("failed to map percpu area, err=%d\n", rc);
if (cpu_possible(cpu)) {
free_bootmem(__pa(ptr + pcpue_size),
pcpue_unit_size - pcpue_size);
memcpy(ptr, __per_cpu_load, static_size);
} else
free_bootmem(__pa(ptr), pcpue_unit_size);
/*
* FIXME: Archs with virtual cache should flush local
* cache for the linear mapping here - something
* equivalent to flush_cache_vmap() on the local cpu.
* flush_cache_vmap() can't be used as most supporting
* data structures are not set up yet.
*/
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, ai->static_size,
ai->reserved_size, ai->dyn_size);
rc = pcpu_setup_first_chunk(ai, vm.addr);
goto out_free_ar;
enomem:
while (--j >= 0)
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
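/*
 * Editor's sketch, not part of this commit: an arch selecting
 * CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK would wire bootmem-backed callbacks
 * into pcpu_page_first_chunk() roughly as below.  The "example_" names and
 * the empty PTE hook are hypothetical stand-ins; a real arch must actually
 * populate page tables in its populate_pte callback.
 */
static void * __init example_pcpu_fc_alloc(unsigned int cpu, size_t size,
					   size_t align)
{
	/* one PAGE_SIZE allocation per call, as documented above */
	return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init example_pcpu_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

static void __init example_pcpu_fc_populate_pte(unsigned long addr)
{
	/* arch-specific: ensure PTE pages exist so __pcpu_map_pages() can't fail */
}

void __init example_setup_per_cpu_areas(void)
{
	int rc;

	rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
				   example_pcpu_fc_alloc,
				   example_pcpu_fc_free,
				   example_pcpu_fc_populate_pte);
	if (rc < 0)
		panic("Failed to initialize percpu first chunk");
	/* __per_cpu_offset[] would then be derived from pcpu_unit_offsets[] */
}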
/*
* Generic percpu area setup.
*
* The embedding helper is used because its behavior closely resembles
* the original non-dynamic generic percpu area setup. This is
* important because many archs have addressing restrictions and might
* fail if the percpu area is located far away from the previous
* location.  As an added bonus, in non-NUMA cases, embedding is
* generally a good idea TLB-wise because the percpu area can piggyback
* on the physical linear memory mapping, which uses large page
* mappings on applicable archs.
*/
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
}
return pcpu_setup_first_chunk(pcpue_get_page, static_size,
reserved_size, dyn_size,
pcpue_unit_size, pcpue_ptr, NULL);
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
free_bootmem(__pa(ptr), size);
}
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
unsigned int cpu;
int rc;
/*
* Always reserve area for module percpu variables. That's
* what the legacy allocator did.
*/
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
if (rc < 0)
panic("Failed to initialized percpu areas.");
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
......@@ -19,7 +19,7 @@
#include <linux/module.h>
#include <linux/quicklist.h>
DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
#define FRACTION_OF_NODE_MEM 16
......
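/*
 * Editor's sketch, not part of this commit: this hunk and the slub and
 * syncookie hunks below all apply the same conversion - the array
 * dimension moves into the type argument so DEFINE_PER_CPU() sees the
 * complete type of the per-cpu object.  With a hypothetical variable:
 */
DEFINE_PER_CPU(int [4], example_counters);	/* was: DEFINE_PER_CPU(int, example_counters)[4]; */

static inline void example_bump(unsigned int cpu)
{
	per_cpu(example_counters, cpu)[0]++;	/* accessors are unchanged */
}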
......@@ -2091,8 +2091,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
*/
#define NR_KMEM_CACHE_CPU 100
static DEFINE_PER_CPU(struct kmem_cache_cpu,
kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
kmem_cache_cpu);
static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
......
......@@ -265,6 +265,7 @@ struct vmap_area {
static DEFINE_SPINLOCK(vmap_area_lock);
static struct rb_root vmap_area_root = RB_ROOT;
static LIST_HEAD(vmap_area_list);
static unsigned long vmap_area_pcpu_hole;
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
......@@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va)
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
/*
* Track the highest possible candidate for pcpu area
* allocation.  Areas outside of the vmalloc area can be returned
* here too, so consider only end addresses which fall inside the
* vmalloc area proper.
*/
if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
call_rcu(&va->rcu_head, rcu_free_va);
}
......@@ -1038,6 +1048,9 @@ void __init vmalloc_init(void)
va->va_end = va->va_start + tmp->size;
__insert_vmap_area(va);
}
vmap_area_pcpu_hole = VMALLOC_END;
vmap_initialized = true;
}
......@@ -1122,13 +1135,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, void *caller)
{
struct vm_struct *tmp, **p;
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->private = vm;
va->flags |= VM_VM_AREA;
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= vm->addr)
break;
}
vm->next = *p;
*p = vm;
write_unlock(&vmlist_lock);
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long flags, unsigned long start, unsigned long end,
int node, gfp_t gfp_mask, void *caller)
{
static struct vmap_area *va;
struct vm_struct *area;
struct vm_struct *tmp, **p;
unsigned long align = 1;
BUG_ON(in_interrupt());
......@@ -1147,7 +1181,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (unlikely(!size))
return NULL;
area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
......@@ -1162,25 +1196,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
area->flags = flags;
area->addr = (void *)va->va_start;
area->size = size;
area->pages = NULL;
area->nr_pages = 0;
area->phys_addr = 0;
area->caller = caller;
va->private = area;
va->flags |= VM_VM_AREA;
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= area->addr)
break;
}
area->next = *p;
*p = area;
write_unlock(&vmlist_lock);
insert_vmalloc_vm(area, va, flags, caller);
return area;
}
......@@ -1818,6 +1834,286 @@ void free_vm_area(struct vm_struct *area)
}
EXPORT_SYMBOL_GPL(free_vm_area);
static struct vmap_area *node_to_va(struct rb_node *n)
{
return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
}
/**
* pvm_find_next_prev - find the next and prev vmap_area surrounding @end
* @end: target address
* @pnext: out arg for the next vmap_area
* @pprev: out arg for the previous vmap_area
*
* Returns: %true if either or both of next and prev are found,
* %false if no vmap_area exists
*
* Find the vmap_areas whose end addresses enclose @end, i.e. if not
* NULL, (*pnext)->va_end > @end and (*pprev)->va_end <= @end.
*/
static bool pvm_find_next_prev(unsigned long end,
struct vmap_area **pnext,
struct vmap_area **pprev)
{
struct rb_node *n = vmap_area_root.rb_node;
struct vmap_area *va = NULL;
while (n) {
va = rb_entry(n, struct vmap_area, rb_node);
if (end < va->va_end)
n = n->rb_left;
else if (end > va->va_end)
n = n->rb_right;
else
break;
}
if (!va)
return false;
if (va->va_end > end) {
*pnext = va;
*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
} else {
*pprev = va;
*pnext = node_to_va(rb_next(&(*pprev)->rb_node));
}
return true;
}
/**
* pvm_determine_end - find the highest aligned address between two vmap_areas
* @pnext: in/out arg for the next vmap_area
* @pprev: in/out arg for the previous vmap_area
* @align: alignment
*
* Returns: determined end address
*
* Find the highest aligned address between *@pnext and *@pprev below
* VMALLOC_END.  *@pnext and *@pprev are adjusted so that the
* aligned-down address lies between the end addresses of the two vmap_areas.
*
* Please note that the address returned by this function may fall
* inside *@pnext vmap_area. The caller is responsible for checking
* that.
*/
static unsigned long pvm_determine_end(struct vmap_area **pnext,
struct vmap_area **pprev,
unsigned long align)
{
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
unsigned long addr;
if (*pnext)
addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
else
addr = vmalloc_end;
while (*pprev && (*pprev)->va_end > addr) {
*pnext = *pprev;
*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
}
return addr;
}
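/*
 * Editor's worked example with hypothetical addresses (ignoring that real
 * addresses sit inside the vmalloc range): with existing vmap_areas
 * [0x800, 0x1000) and [0x2800, 0x3000) and align == 0x1000,
 * pvm_find_next_prev(0x2000, ...) sets *pnext to the [0x2800, 0x3000)
 * area and *pprev to the [0x800, 0x1000) area.  pvm_determine_end()
 * then aligns 0x2800 down to 0x2000, which is above pprev's end
 * (0x1000), so 0x2000 is returned as the highest usable end address.
 */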
/**
* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
* @offsets: array containing offset of each area
* @sizes: array containing size of each area
* @nr_vms: the number of areas to allocate
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
* @gfp_mask: allocation mask
*
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
* vm_structs on success, %NULL on failure
*
* The percpu allocator wants to use congruent vm areas so that it can
* maintain the offsets among percpu areas.  This function allocates
* congruent vmalloc areas for it.  These areas tend to be scattered
* pretty far apart, with the distance between two areas easily reaching
* gigabytes.  To avoid interacting with regular vmallocs, these areas
* are allocated from the top of the vmalloc address space.
*
* Despite its complicated look, this allocator is rather simple.  It
* does everything top-down and scans areas from the end looking for a
* matching slot.  While scanning, if any of the areas overlaps with an
* existing vmap_area, the base address is pulled down to fit the
* area.  Scanning is repeated until all the areas fit and then all
* necessary data structures are inserted and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
size_t align, gfp_t gfp_mask)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
struct vmap_area **vas, *prev, *next;
struct vm_struct **vms;
int area, area2, last_area, term_area;
unsigned long base, start, end, last_end;
bool purged = false;
gfp_mask &= GFP_RECLAIM_MASK;
/* verify parameters and allocate data structures */
BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
start = offsets[area];
end = start + sizes[area];
/* is everything aligned properly? */
BUG_ON(!IS_ALIGNED(offsets[area], align));
BUG_ON(!IS_ALIGNED(sizes[area], align));
/* detect the area with the highest address */
if (start > offsets[last_area])
last_area = area;
for (area2 = 0; area2 < nr_vms; area2++) {
unsigned long start2 = offsets[area2];
unsigned long end2 = start2 + sizes[area2];
if (area2 == area)
continue;
BUG_ON(start2 >= start && start2 < end);
BUG_ON(end2 <= end && end2 > start);
}
}
last_end = offsets[last_area] + sizes[last_area];
if (vmalloc_end - vmalloc_start < last_end) {
WARN_ON(true);
return NULL;
}
vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
if (!vas || !vms)
goto err_free;
for (area = 0; area < nr_vms; area++) {
vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
if (!vas[area] || !vms[area])
goto err_free;
}
retry:
spin_lock(&vmap_area_lock);
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
start = offsets[area];
end = start + sizes[area];
if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
base = vmalloc_end - last_end;
goto found;
}
base = pvm_determine_end(&next, &prev, align) - end;
while (true) {
BUG_ON(next && next->va_end <= base + end);
BUG_ON(prev && prev->va_end > base + end);
/*
* base might have underflowed, add last_end before
* comparing.
*/
if (base + last_end < vmalloc_start + last_end) {
spin_unlock(&vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = true;
goto retry;
}
goto err_free;
}
/*
* If next overlaps, move base downwards so that it's
* right below next and then recheck.
*/
if (next && next->va_start < base + end) {
base = pvm_determine_end(&next, &prev, align) - end;
term_area = area;
continue;
}
/*
* If prev overlaps, shift down next and prev and move
* base so that it's right below new next and then
* recheck.
*/
if (prev && prev->va_end > base + start) {
next = prev;
prev = node_to_va(rb_prev(&next->rb_node));
base = pvm_determine_end(&next, &prev, align) - end;
term_area = area;
continue;
}
/*
* This area fits, move on to the previous one. If
* the previous one is the terminal one, we're done.
*/
area = (area + nr_vms - 1) % nr_vms;
if (area == term_area)
break;
start = offsets[area];
end = start + sizes[area];
pvm_find_next_prev(base + end, &next, &prev);
}
found:
/* we've found a fitting base, insert all va's */
for (area = 0; area < nr_vms; area++) {
struct vmap_area *va = vas[area];
va->va_start = base + offsets[area];
va->va_end = va->va_start + sizes[area];
__insert_vmap_area(va);
}
vmap_area_pcpu_hole = base + offsets[last_area];
spin_unlock(&vmap_area_lock);
/* insert all vm's */
for (area = 0; area < nr_vms; area++)
insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
kfree(vas);
return vms;
err_free:
for (area = 0; area < nr_vms; area++) {
if (vas)
kfree(vas[area]);
if (vms)
kfree(vms[area]);
}
kfree(vas);
kfree(vms);
return NULL;
}
/**
* pcpu_free_vm_areas - free vmalloc areas for percpu allocator
* @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
* @nr_vms: the number of allocated areas
*
* Free vm_structs and the array allocated by pcpu_get_vm_areas().
*/
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
int i;
for (i = 0; i < nr_vms; i++)
free_vm_area(vms[i]);
kfree(vms);
}
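/*
 * Editor's sketch, not part of this commit: a caller such as the percpu
 * chunk allocator passes one offset/size pair per group and receives
 * congruent vm areas laid out at those offsets from a common base.  The
 * two-group layout below is hypothetical.
 */
static struct vm_struct **example_alloc_group_areas(void)
{
	static const unsigned long offsets[] = { 0, 4 * PAGE_SIZE };
	static const size_t sizes[] = { 2 * PAGE_SIZE, 2 * PAGE_SIZE };
	struct vm_struct **vms;

	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE, GFP_KERNEL);
	if (!vms)
		return NULL;

	/* vms[i]->addr == base + offsets[i]; release with pcpu_free_vm_areas(vms, 2) */
	return vms;
}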
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
......
......@@ -37,12 +37,13 @@ __initcall(init_syncookies);
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS];
static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
ipv4_cookie_scratch);
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
__u32 *tmp = __get_cpu_var(cookie_scratch);
__u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
tmp[0] = (__force u32)saddr;
......
......@@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
return child;
}
static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS];
static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
ipv6_cookie_scratch);
static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr,
__be16 sport, __be16 dport, u32 count, int c)
{
__u32 *tmp = __get_cpu_var(cookie_scratch);
__u32 *tmp = __get_cpu_var(ipv6_cookie_scratch);
/*
* we have 320 bits of information to hash, copy in the remaining
......
......@@ -37,7 +37,7 @@
#include "rds.h"
#include "ib.h"
DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
static char *rds_ib_stat_names[] = {
"ib_connect_raced",
......
......@@ -37,7 +37,7 @@
#include "rds.h"
#include "iw.h"
DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
static char *rds_iw_stat_names[] = {
"iw_connect_raced",
......
......@@ -39,7 +39,7 @@ struct rds_page_remainder {
unsigned long r_offset;
};
DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
......
/*
* Common module linker script, always used when linking a module.
* Archs are free to supply their own linker scripts. ld will
* combine them automatically.
*/
SECTIONS {
/DISCARD/ : { *(.discard) }
}
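/*
 * Editor's note, not part of this commit: the /DISCARD/ rule above drops any
 * input section named ".discard" from the linked module.  On the C side, a
 * hypothetical definition that must compile but should never reach the final
 * module image could be placed there, e.g.:
 *
 *	static int __attribute__((section(".discard"), unused)) example_marker;
 */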