// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

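/*
 * Translate an online_type string from sysfs (e.g. "online_kernel") into the
 * corresponding MMOP_* index, or return -EINVAL if the string is unknown.
 */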
int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

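/*
 * A memory block covers sections_per_block consecutive memory sections;
 * convert a section number into the id of the block that contains it.
 */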
static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

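/* Invoke the memory hotplug notifier chain for the given event. */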
int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

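/*
 * Online all pages covered by this memory block in the zone selected for the
 * requested online type. Vmemmap pages (if any) are initialized first and
 * accounted to the same zone once onlining succeeded.
 */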
static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g vmemmaps
	 * belong to the same zone as the memory they backed.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

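/*
 * Transition a memory block from @from_state_req to @to_state, passing
 * through MEM_GOING_OFFLINE when offlining. The previous state is restored
 * if the action fails.
 */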
static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

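/*
 * sysfs "state" attribute: parse the requested online type and online or
 * offline the memory block via device_online()/device_offline() while
 * holding the device hotplug lock.
 */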
static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone cannot be offlined.
		 * This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	unsigned long block_id = memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

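/*
 * Allocate and register the memory block device for @block_id, optionally
 * linking it into the given memory group. Returns -EEXIST if a device for
 * this block is already present.
 */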
static int init_memory_block(unsigned long block_id, unsigned long state,
			     unsigned long nr_vmemmap_pages,
			     struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	ret = register_memory(mem);

	return ret;
}

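/*
 * Register a memory block device for boot memory if at least one of its
 * sections is present; such blocks start out in the MEM_ONLINE state.
 */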
static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(memory_block_id(base_section_nr),
				 MEM_ONLINE, 0,  NULL);
}

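/*
 * Tear down a memory block device: drop it from the block cache and its
 * memory group, drop the reference the caller obtained when looking it up,
 * and unregister the device.
 */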
static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
					group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* Return true if the memory block is offline, false otherwise. */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory is managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}