// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */
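
/*
 * Illustrative sketch of the resulting sysfs layout (which files exist
 * depends on the kernel configuration):
 *
 *	/sys/devices/system/memory/block_size_bytes
 *	/sys/devices/system/memory/auto_online_blocks
 *	/sys/devices/system/memory/memoryN/phys_index
 *	/sys/devices/system/memory/memoryN/state
 *	/sys/devices/system/memory/memoryN/phys_device
 *	/sys/devices/system/memory/memoryN/removable
 *	/sys/devices/system/memory/memoryN/valid_zones	(CONFIG_MEMORY_HOTREMOVE)
 */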

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}
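
/*
 * For example (purely illustrative): mhp_online_type_from_str("online")
 * returns MMOP_ONLINE, "online_movable" returns MMOP_ONLINE_MOVABLE, and any
 * string not listed in online_type_to_str[] yields -EINVAL.
 */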

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
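
/*
 * Worked example with hypothetical sizes: given a 2 GiB memory block size
 * and 128 MiB sections, sections_per_block == 16, so section 35 maps to
 * memory block 2; phys_to_block_id() first converts the physical address to
 * a PFN via PFN_DOWN() and then reuses the same lookup.
 */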

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
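
/*
 * Illustrative notifier sketch (names are hypothetical, not part of this
 * file): a registered callback receives one of the MEM_* actions together
 * with a struct memory_notify describing the affected PFN range:
 *
 *	static int example_mem_callback(struct notifier_block *nb,
 *					unsigned long action, void *data)
 *	{
 *		struct memory_notify *mn = data;
 *
 *		if (action == MEM_GOING_OFFLINE)
 *			pr_info("offlining PFNs [%lx, %lx)\n", mn->start_pfn,
 *				mn->start_pfn + mn->nr_pages);
 *		return NOTIFY_OK;
 *	}
 */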

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}
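
/*
 * Example (illustrative block number): for memory block 32,
 * "cat /sys/devices/system/memory/memory32/phys_index" prints "00000020".
 */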

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep the accounting easier to follow - e.g. vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
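
/*
 * Example usage from userspace (illustrative block number):
 *
 *	echo offline        > /sys/devices/system/memory/memory32/state
 *	echo online_movable > /sys/devices/system/memory/memory32/state
 *
 * Both writes take the device_hotplug_lock and go through
 * device_offline()/device_online(), matching the switch above.
 */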

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		default_zone = mem->zone;
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);
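
/*
 * Example (illustrative value): with a 128 MiB memory block size,
 * "cat /sys/devices/system/memory/block_size_bytes" prints "8000000"
 * (the size in bytes, in hex, without a "0x" prefix).
 */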

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
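
/*
 * Example (illustrative): have subsequently hot-added memory blocks onlined
 * to ZONE_MOVABLE automatically:
 *
 *	echo online_movable > /sys/devices/system/memory/auto_online_blocks
 */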

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
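
/*
 * Example (illustrative address): hot-add the memory block starting at
 * physical address 0x100000000; the address must be aligned to the memory
 * block size, as checked above:
 *
 *	echo 0x100000000 > /sys/devices/system/memory/probe
 */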
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
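
/*
 * Example (illustrative address): both files take a physical address, not a
 * PFN (the PAGE_SHIFT conversion happens above):
 *
 *	echo 0x1fa000000 > /sys/devices/system/memory/soft_offline_page
 *	echo 0x1fa000000 > /sys/devices/system/memory/hard_offline_page
 */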
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif

static int init_memory_block(unsigned long block_id, unsigned long state,
			     unsigned long nr_vmemmap_pages,
			     struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = register_memory(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(memory_block_id(base_section_nr),
				 MEM_ONLINE, 0,  NULL);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
					group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
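
/*
 * Illustrative callback sketch (names are hypothetical): count how many of
 * the walked memory blocks are offline.
 *
 *	static int example_count_offline(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *count = arg;
 *
 *		if (mem->state == MEM_OFFLINE)
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	...
 *	walk_memory_blocks(start, size, &count, example_count_offline);
 */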

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);
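
/*
 * Illustrative driver-side sketch (the flow and names below are only an
 * example, not taken from this file): register a static group for a
 * DIMM-like unit and hand the mgid to add_memory_driver_managed() via
 * MHP_NID_IS_MGID, unregistering the group again if adding fails:
 *
 *	mgid = memory_group_register_static(nid, PFN_UP(size));
 *	if (mgid < 0)
 *		return mgid;
 *	rc = add_memory_driver_managed(mgid, start, size,
 *				       "System RAM (example)", MHP_NID_IS_MGID);
 *	if (rc)
 *		memory_group_unregister(mgid);
 */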

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory is managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}