memory.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local xarray to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
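
/*
 * Illustrative sketch (not part of this file): a hotplug-aware driver can
 * subscribe to the notifier chain above roughly as follows. The callback and
 * notifier_block names are hypothetical.
 *
 *	static int example_mem_event(struct notifier_block *nb,
 *				     unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		pr_info("memory event %lu: start_pfn=%lx nr_pages=%lx\n",
 *			action, mn->start_pfn, mn->nr_pages);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_event,
 *	};
 *
 *	register_memory_notifier(&example_mem_nb);
 */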

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
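
/*
 * Architectures may override the __weak default above; a minimal sketch of
 * such an override (assuming a fixed 1 GiB block size, purely illustrative):
 *
 *	unsigned long memory_block_size_bytes(void)
 *	{
 *		return SZ_1G;
 *	}
 *
 * memory_dev_init() below validates that the value is a power of two and at
 * least MIN_MEMORY_BLOCK_SIZE.
 */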

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at the memory onlining/offlining
	 * stage keeps the accounting easier to follow - e.g., vmemmap pages
	 * belong to the same zone as the memory they describe.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
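
/*
 * Example usage from userspace (illustrative, block number arbitrary):
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# cat /sys/devices/system/memory/memory32/state
 *	online
 *
 * Writes are parsed by mhp_online_type_from_str() and routed through
 * device_online()/device_offline() under the device hotplug lock.
 */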

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing identification of which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * architectures never exposed a value other than 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone can not be offlined.
		 * This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif
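
/*
 * For an offline block, valid_zones typically reads as the default zone
 * followed by the other zones the block could be onlined to, e.g.
 * (illustrative output):
 *
 *	# cat /sys/devices/system/memory/memory32/valid_zones
 *	Normal Movable
 *
 * For an online block it reports the single zone the block belongs to, or
 * "none" if the block spans multiple zones and thus cannot be offlined.
 */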

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);
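
/*
 * The value is exported in hex without a "0x" prefix, e.g. (illustrative,
 * for a 128 MiB block size):
 *
 *	# cat /sys/devices/system/memory/block_size_bytes
 *	8000000
 */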

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
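
/*
 * Example usage (illustrative): make newly added memory blocks come up
 * online and movable by default.
 *
 *	# cat /sys/devices/system/memory/auto_online_blocks
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/auto_online_blocks
 *
 * The accepted strings are the entries of online_type_to_str[] above.
 */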

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
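
/*
 * Example usage of the probe interface (illustrative address, only
 * available with CONFIG_ARCH_MEMORY_PROBE):
 *
 *	# echo 0x100000000 > /sys/devices/system/memory/probe
 *
 * The address must be aligned to the memory block size; one block worth of
 * memory is then hot-added via __add_memory().
 */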

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
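
/*
 * Example usage (illustrative address, CONFIG_MEMORY_FAILURE only): both
 * files take a physical address, which the stores above convert to a PFN.
 *
 *	# echo 0x2f54a000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x2f54a000 > /sys/devices/system/memory/hard_offline_page
 */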

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static int init_memory_block(unsigned long block_id, unsigned long state,
			     unsigned long nr_vmemmap_pages,
			     struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	ret = register_memory(mem);

	return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(memory_block_id(base_section_nr),
				 MEM_ONLINE, 0, NULL);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
					group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
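
/*
 * Illustrative caller sketch (names hypothetical): count how many of the
 * blocks overlapping a range are still online. Must be called under the
 * device hotplug lock, as documented above.
 *
 *	static int example_count_online(struct memory_block *mem, void *arg)
 *	{
 *		unsigned int *online = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*online)++;
 *		return 0;
 *	}
 *
 *	unsigned int online = 0;
 *
 *	walk_memory_blocks(start, size, &online, example_count_online);
 */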

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);
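
/*
 * Illustrative sketch (driver-side, names hypothetical): a driver managing a
 * DIMM-like device can register a static group and hand the returned mgid to
 * the memory hotplug core (e.g. via MHP_NID_IS_MGID, as dax/kmem does):
 *
 *	int mgid = memory_group_register_static(nid, PFN_UP(dimm_size));
 *
 *	if (mgid < 0)
 *		return mgid;
 *	rc = add_memory_driver_managed(mgid, dimm_start, dimm_size,
 *				       resource_name, MHP_NID_IS_MGID);
 */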

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory is managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}