// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

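/*
 * Worked example (illustrative values, not tied to any particular arch):
 * with 16 MiB sections and a 256 MiB memory block size, sections_per_block
 * is 16, so section 35 maps to memory block 2, which covers sections 32..47.
 */
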
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local xarray to avoid a costly linear
 * search for the corresponding device on the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
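
/*
 * Illustrative sketch of a consumer (hedged example; the callback and
 * notifier_block names are made up and are not part of this file): a
 * subsystem that wants to react to hotplug events registers a notifier
 * and handles the MEM_* actions delivered via struct memory_notify.
 *
 *	static int example_memory_callback(struct notifier_block *nb,
 *					   unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_ONLINE:
 *			// prepare for mn->nr_pages pages at mn->start_pfn
 *			break;
 *		case MEM_OFFLINE:
 *			// release state tied to the now-offline range
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_memory_nb = {
 *		.notifier_call = example_memory_callback,
 *	};
 *
 *	// register_memory_notifier(&example_memory_nb);
 */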

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g., vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(zone, nr_vmemmap_pages);

	return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages) {
		zone = page_zone(pfn_to_page(start_pfn));
		adjust_present_page_count(zone, -nr_vmemmap_pages);
	}

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(zone, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
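
/*
 * Illustrative usage (hedged example; the block number is made up):
 * userspace can request a specific online policy for a single block, e.g.
 *	echo online_movable > /sys/devices/system/memory/memory32/state
 * or take the block offline again with
 *	echo offline > /sys/devices/system/memory/memory32/state
 */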

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *default_zone;
	int len = 0;
	int nid;

	/*
	 * Check the existing zone. Make sure that we do that only for online
	 * memory, otherwise page_zone() is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If the block contains more than one zone, it cannot be
		 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	nid = mem->nid;
	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
					  nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
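
/*
 * Illustrative usage (hedged example): the default policy applied to newly
 * added memory blocks can be queried and changed via sysfs, e.g.
 *	cat /sys/devices/system/memory/auto_online_blocks
 *	echo online_movable > /sys/devices/system/memory/auto_online_blocks
 */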

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
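/*
 * Illustrative usage (hedged example; the address is made up): with this
 * interface enabled, userspace hot-adds a block-aligned range by writing
 * its physical start address, e.g.
 *	echo 0x40000000 > /sys/devices/system/memory/probe
 */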
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	unsigned long block_id = memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static int init_memory_block(unsigned long block_id, unsigned long state,
			     unsigned long nr_vmemmap_pages)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;

	ret = register_memory(mem);

	return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(memory_block_id(base_section_nr),
				 MEM_ONLINE, 0);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
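
/*
 * Illustrative sketch of a walk_memory_blocks() caller (hedged example;
 * the function and variable names are made up): count how many of the
 * blocks overlapped by a physical range are currently online.
 *
 *	static int example_count_online(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *online = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*online)++;
 *		return 0;
 *	}
 *
 *	// unsigned long online = 0;
 *	// walk_memory_blocks(start, size, &online, example_count_online);
 */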

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}