diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 97e48b0eefc7c18f54f0d1ee76860eed43a53c4e..1841ef69183d8742db20194334f248b88ffac4a4 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -645,7 +645,7 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { pg_data_t *pgdat; struct zone *zone; @@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); ret = __add_pages(nid, zone, start_pfn, nr_pages); if (ret) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0f11819d8f1dc07441f6e13362a4dca53f55fb87..6571cfb056686fe0440d82acb08dff7ddf1ae625 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start) } #endif -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata; struct zone *zone; @@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size) /* this should work for most non-highmem platforms */ zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0); + zone_for_memory(nid, start, size, 0, for_device); return __add_pages(nid, zone, start_pfn, nr_pages); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 76e873748b56e91579ad06fd2fece4e2c346a51f..48ee78be88ba6cdc0756b1dad3c35f0a731ff2f1 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,7 +168,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2790b6a64157f79663fe5232afe9f857e6d81cb7..c1490096b8637f61eac9c7ee9a3703ce6680dae2 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { pg_data_t *pgdat; unsigned long start_pfn = start >> PAGE_SHIFT; @@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size) /* We only have ZONE_NORMAL, so this is easy.. */ ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL), + zone_for_memory(nid, start, size, ZONE_NORMAL, + for_device), start_pfn, nr_pages); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index 5bd252e3fdc506a6aa393419252a94261768d400..d4e1fc41d06db21a475b1bdbd6508df5b7f5ab8a 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -863,7 +863,7 @@ void __init mem_init(void) * memory to the highmem for now. */ #ifndef CONFIG_NEED_MULTIPLE_NODES -int arch_add_memory(u64 start, u64 size) +int arch_add_memory(u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = &contig_page_data; struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8340e45c891a1dc879a6e23ab95fe0e2e82b5eec..2a9237d20a701edb1adb0e0feacb15bf7c225478 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -822,11 +822,11 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = NODE_DATA(nid); struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM); + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3fba623e3ba558553d9740d3a994343a2960428f..30564e2752d361870e91a4e25a5afbb3d029b7d6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 6ffa0ac7f7d62a2521ba40d154f4f5248f882afd..8f60e899b33c574b58be6d0a0d928c8f094941ea 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -266,8 +266,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); -extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default); -extern int arch_add_memory(int nid, u64 start, u64 size); +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, + bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 754c25966a0a7828901deaf2c69dc1d5243736f4..9217fd93c25b411afe65a0705873c136d2b61745 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -319,7 +319,11 @@ enum zone_type { ZONE_HIGHMEM, #endif ZONE_MOVABLE, +#ifdef CONFIG_ZONE_DEVICE + ZONE_DEVICE, +#endif __MAX_NR_ZONES + }; #ifndef __GENERATING_BOUNDS_H @@ -794,6 +798,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; } +static inline int zone_id(const struct zone *zone) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + return zone - pgdat->node_zones; +} + +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_dev_zone(const struct zone *zone) +{ + return zone_id(zone) == ZONE_DEVICE; +} +#else +static inline bool is_dev_zone(const struct zone *zone) +{ + return false; +} +#endif + #include extern struct mutex zonelists_mutex; diff --git a/mm/Kconfig b/mm/Kconfig index e79de2bd12cd046c416537b8c6550ca3aea95a01..a0cd086df16bee0ed462b5dea5a388a6b5638309 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -654,3 +654,20 @@ config DEFERRED_STRUCT_PAGE_INIT when kswapd starts. This has a potential performance impact on processes running early in the lifetime of the systemm until kswapd finishes the initialisation. + +config ZONE_DEVICE + bool "Device memory (pmem, etc...) hotplug support" if EXPERT + default !ZONE_DMA + depends on !ZONE_DMA + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on X86_64 #arch_add_memory() comprehends device memory + + help + Device memory hotplug support allows for establishing pmem, + or other device driver discovered memory regions, in the + memmap. This allows pfn_to_page() lookups of otherwise + "device-physical" addresses which is needed for using a DAX + mapping in an O_DIRECT operation, among other things. + + If FS_DAX is enabled, then say Y. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 26fbba7d888f887c3383c1bb5829cb4f204e2040..24e4c76c951be7704a7bdb386d24560e41bf22ed 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -770,7 +770,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, start = phys_start_pfn << PAGE_SHIFT; size = nr_pages * PAGE_SIZE; - ret = release_mem_region_adjustable(&iomem_resource, start, size); + + /* in the ZONE_DEVICE case device driver owns the memory region */ + if (!is_dev_zone(zone)) + ret = release_mem_region_adjustable(&iomem_resource, start, size); if (ret) { resource_size_t endres = start + size - 1; @@ -1207,8 +1210,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size) return 0; } -int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +int zone_for_memory(int nid, u64 start, u64 size, int zone_default, + bool for_device) { +#ifdef CONFIG_ZONE_DEVICE + if (for_device) + return ZONE_DEVICE; +#endif if (should_add_memory_movable(nid, start, size)) return ZONE_MOVABLE; @@ -1249,7 +1257,7 @@ int __ref add_memory(int nid, u64 start, u64 size) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size); + ret = arch_add_memory(nid, start, size, false); if (ret < 0) goto error; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef19f22b2b7de1728fb4ed8d6451d29bb993a928..0f19b4e1823390aa42d3d01e2027db5af3815d4c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -207,6 +207,9 @@ static char * const zone_names[MAX_NR_ZONES] = { "HighMem", #endif "Movable", +#ifdef CONFIG_ZONE_DEVICE + "Device", +#endif }; int min_free_kbytes = 1024;