/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Heterogeneous Memory Management (HMM)
 *
 * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and what
 * it is for. Here we focus on the HMM API description, with some explanation
 * of the underlying implementation.
 *
 * Short description: HMM provides a set of helpers to share a virtual address
 * space between CPU and a device, so that the device can access any valid
 * address of the process (while still obeying memory protection). HMM also
 * provides helpers to migrate process memory to device memory, and back. Each
 * set of functionality (address space mirroring, and migration to and from
 * device memory) can be used independently of the other.
 *
 *
 * HMM address space mirroring API:
 *
 * Use HMM address space mirroring if you want to mirror a range of the CPU
 * page tables of a process into a device page table. Here, "mirror" means "keep
 * synchronized". Prerequisites: the device must provide the ability to write-
 * protect its page tables (at PAGE_SIZE granularity), and must be able to
 * recover from the resulting potential page faults.
 *
 * HMM guarantees that at any point in time, a given virtual address points to
 * either the same memory in both CPU and device page tables (that is: CPU and
 * device page tables each point to the same pages), or that one page table (CPU
 * or device) points to no entry, while the other still points to the old page
 * for the address. The latter case happens when the CPU page table update
 * happens first, and then the update is mirrored over to the device page table.
 * This does not cause any issue, because the CPU page table cannot start
 * pointing to a new page until the device page table is invalidated.
 *
 * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
 * updates to each device driver that has registered a mirror. It also provides
 * some API calls to help with taking a snapshot of the CPU page table, and to
 * synchronize with any updates that might happen concurrently.
 *
 *
 * HMM migration to and from device memory:
 *
 * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
 * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
 * of the device memory, and allows the device driver to manage its memory
 * using those struct pages. Having struct pages for device memory makes
 * migration easier. Because that memory is not addressable by the CPU it must
 * never be pinned to the device; in other words, any CPU page fault can always
 * cause the device memory to be migrated (copied/moved) back to regular memory.
 *
 * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
 * allows use of a device DMA engine to perform the copy operation between
 * regular system memory and device memory.
 */
#ifndef LINUX_HMM_H
#define LINUX_HMM_H

#include <linux/kconfig.h>
#include <asm/pgtable.h>

#include <linux/device.h>
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
#include <linux/mmu_notifier.h>

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @ranges: list of range being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 * @wq: wait queue for user waiting on a range invalidation
 * @notifiers: count of active mmu notifiers
 */
struct hmm {
85
	struct mmu_notifier	mmu_notifier;
86
	spinlock_t		ranges_lock;
87 88 89 90 91 92
	struct list_head	ranges;
	struct list_head	mirrors;
	struct rw_semaphore	mirrors_sem;
	wait_queue_head_t	wq;
	long			notifiers;
};
93 94

/*
 * enum hmm_pfn_flag_e - indices into the driver provided hmm_range.flags array
 *
 * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
 * HMM_PFN_WRITE: CPU page table has write permission set
 * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
 * HMM_PFN_FLAG_MAX: number of flag indices above (not a flag itself)
 *
 * Each device driver supplies its own mapping from these page protection
 * flags to device PTE bits. For instance, if the driver valid bit for an entry
 * is bit 3, i.e., (entry & (1 << 3)), then the driver must provide an array in
 * hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
 * The same logic applies to all flags. This is the same idea as vm_page_prot
 * in vma except that this is per device driver rather than per architecture.
 */
enum hmm_pfn_flag_e {
	HMM_PFN_VALID		= 0,
	HMM_PFN_WRITE		= 1,
	HMM_PFN_DEVICE_PRIVATE	= 2,
	HMM_PFN_FLAG_MAX	= 3
};

/*
 * hmm_pfn_value_e - HMM pfn special value
 *
 * Values (used as indices into hmm_range.values):
 * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
 * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
 * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
 *      result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
 *      be mirrored by a device, because the entry will never have HMM_PFN_VALID
 *      set and the pfn value is undefined.
 * HMM_PFN_VALUE_MAX: number of special values above (not a value itself)
 *
 * Driver provides values for none entry, error entry, and special entry.
 * Driver can alias (i.e., use same value) error and special, but
 * it should not alias none with error or special.
 *
 * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
 * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
 * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
 * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one
 *
 * NOTE(review): hmm_vma_get_pfns() and hmm_vma_fault() are not declared in
 * this header; confirm whether this should now refer to hmm_range_fault().
 */
enum hmm_pfn_value_e {
	HMM_PFN_ERROR,
	HMM_PFN_NONE,
	HMM_PFN_SPECIAL,
	HMM_PFN_VALUE_MAX
};

/*
 * struct hmm_range - track invalidation lock on virtual address range
 *
146 147 148
 * @notifier: an optional mmu_interval_notifier
 * @notifier_seq: when notifier is used this is the result of
 *                mmu_interval_read_begin()
149
 * @hmm: the core HMM structure this range is active against
150 151 152 153 154 155 156
 * @vma: the vm area struct for the range
 * @list: all range lock are on a list
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of pfns (big enough for the range)
 * @flags: pfn flags to match device driver page table
 * @values: pfn value for some special case (none, special, error, ...)
157 158
 * @default_flags: default flags for the range (write, read, ... see hmm doc)
 * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
159 160 161 162
 * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
 * @valid: pfns array did not change since it has been fill by an HMM function
 */
struct hmm_range {
163 164
	struct mmu_interval_notifier *notifier;
	unsigned long		notifier_seq;
165
	struct hmm		*hmm;
166 167 168 169 170 171
	struct list_head	list;
	unsigned long		start;
	unsigned long		end;
	uint64_t		*pfns;
	const uint64_t		*flags;
	const uint64_t		*values;
172 173
	uint64_t		default_flags;
	uint64_t		pfn_flags_mask;
174 175 176
	uint8_t			pfn_shift;
	bool			valid;
};
177

178 179 180 181
/*
 * hmm_range_wait_until_valid() - wait for range to be valid
 * @range: range affected by invalidation to wait on
 * @timeout: time out for wait in ms (ie abort wait after that period of time)
182
 * Return: true if the range is valid, false otherwise.
183 184 185 186
 */
static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
					      unsigned long timeout)
{
187 188
	return wait_event_timeout(range->hmm->wq, range->valid,
				  msecs_to_jiffies(timeout)) != 0;
189 190 191 192 193
}

/*
 * hmm_range_valid() - report the current validity of a range
 * @range: range to test
 * Return: the value of range->valid (true if the range is valid, false
 * otherwise).
 */
static inline bool hmm_range_valid(struct hmm_range *range)
{
	const bool is_valid = range->valid;

	return is_valid;
}

201
/*
202 203 204
 * hmm_device_entry_to_page() - return struct page pointed to by a device entry
 * @range: range use to decode device entry value
 * @entry: device entry value to get corresponding struct page from
205
 * Return: struct page pointer if entry is a valid, NULL otherwise
206
 *
207 208
 * If the device entry is valid (ie valid flag set) then return the struct page
 * matching the entry value. Otherwise return NULL.
209
 */
210 211
static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
						    uint64_t entry)
212
{
213
	if (entry == range->values[HMM_PFN_NONE])
214
		return NULL;
215
	if (entry == range->values[HMM_PFN_ERROR])
216
		return NULL;
217
	if (entry == range->values[HMM_PFN_SPECIAL])
218
		return NULL;
219
	if (!(entry & range->flags[HMM_PFN_VALID]))
220
		return NULL;
221
	return pfn_to_page(entry >> range->pfn_shift);
222 223 224
}

/*
225 226 227
 * hmm_device_entry_to_pfn() - return pfn value store in a device entry
 * @range: range use to decode device entry value
 * @entry: device entry to extract pfn from
228
 * Return: pfn value if device entry is valid, -1UL otherwise
229
 */
230 231
static inline unsigned long
hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
232
{
233 234 235 236 237
	if (pfn == range->values[HMM_PFN_NONE])
		return -1UL;
	if (pfn == range->values[HMM_PFN_ERROR])
		return -1UL;
	if (pfn == range->values[HMM_PFN_SPECIAL])
238
		return -1UL;
239 240 241
	if (!(pfn & range->flags[HMM_PFN_VALID]))
		return -1UL;
	return (pfn >> range->pfn_shift);
242 243 244
}

/*
245
 * hmm_device_entry_from_page() - create a valid device entry for a page
246
 * @range: range use to encode HMM pfn value
247
 * @page: page for which to create the device entry
248
 * Return: valid device entry for the page
249
 */
250 251
static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
						  struct page *page)
252
{
253 254
	return (page_to_pfn(page) << range->pfn_shift) |
		range->flags[HMM_PFN_VALID];
255 256 257
}

/*
258
 * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
259
 * @range: range use to encode HMM pfn value
260
 * @pfn: pfn value for which to create the device entry
261
 * Return: valid device entry for the pfn
262
 */
263 264
static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
						 unsigned long pfn)
265
{
266 267
	return (pfn << range->pfn_shift) |
		range->flags[HMM_PFN_VALID];
268 269
}

/*
 * Mirroring: how to synchronize device page table with CPU page table.
 *
 * A device driver that is participating in HMM mirroring must always
 * synchronize with CPU page table updates. For this, device drivers can either
 * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
 * drivers can decide to register one mirror per device per process, or just
 * one mirror per process for a group of devices. The pattern is:
 *
 *      int device_bind_address_space(..., struct mm_struct *mm, ...)
 *      {
 *          struct device_address_space *das;
 *
 *          // Device driver specific initialization, and allocation of das
 *          // which contains an hmm_mirror struct as one of its fields.
 *          ...
 *
 *          das->mirror.ops = &device_mirror_ops;
 *          ret = hmm_mirror_register(&das->mirror, mm);
 *          if (ret) {
 *              // Cleanup on error
 *              return ret;
 *          }
 *
 *          // Other device driver specific initialization
 *          ...
 *      }
 *
 * Once an hmm_mirror is registered for an address space, the device driver
 * will get callbacks through sync_cpu_device_pagetables() operation (see
 * hmm_mirror_ops struct).
 *
 * Device driver must not free the struct containing the hmm_mirror struct
 * before calling hmm_mirror_unregister(). The expected usage is to do that when
 * the device driver is unbinding from an address space.
 *
 *
 *      void device_unbind_address_space(struct device_address_space *das)
 *      {
 *          // Device driver specific cleanup
 *          ...
 *
 *          hmm_mirror_unregister(&das->mirror);
 *
 *          // Other device driver specific cleanup, and now das can be freed
 *          ...
 *      }
 */

struct hmm_mirror;

/*
 * struct hmm_mirror_ops - HMM mirror device operations callback
 *
 * @release: callback invoked when the mm_struct is being released
 * @sync_cpu_device_pagetables: callback to synchronize the device page table
 *                              with a CPU page table update
 */
struct hmm_mirror_ops {
	/* release() - release hmm_mirror
	 *
	 * @mirror: pointer to struct hmm_mirror
	 *
	 * This is called when the mm_struct is being released.  The callback
	 * must ensure that all access to any pages obtained from this mirror
	 * is halted before the callback returns. All future access should
	 * fault.
	 */
	void (*release)(struct hmm_mirror *mirror);

	/* sync_cpu_device_pagetables() - synchronize page tables
	 *
	 * @mirror: pointer to struct hmm_mirror
	 * @update: update information (see struct mmu_notifier_range)
	 * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
	 * and callback needs to block, 0 otherwise.
	 *
	 * This callback ultimately originates from mmu_notifiers when the CPU
	 * page table is updated. The device driver must update its page table
	 * in response to this callback. The update argument tells what action
	 * to perform.
	 *
	 * The device driver must not return from this callback until the device
	 * page tables are completely updated (TLBs flushed, etc); this is a
	 * synchronous call.
	 */
	int (*sync_cpu_device_pagetables)(
		struct hmm_mirror *mirror,
		const struct mmu_notifier_range *update);
};

/*
 * struct hmm_mirror - mirror struct for a device driver
 *
 * @hmm: pointer to struct hmm (which is unique per mm_struct)
 * @ops: device driver callback for HMM mirror operations (see struct
 *       hmm_mirror_ops)
 * @list: for list of mirrors of a given mm (presumably linked on the
 *        mirrors list of struct hmm - confirm against the implementation)
 *
 * Each address space (mm_struct) being mirrored by a device must register one
 * instance of an hmm_mirror struct with HMM. HMM will track the list of all
 * mirrors for each mm_struct.
 */
struct hmm_mirror {
	struct hmm			*hmm;
	const struct hmm_mirror_ops	*ops;
	struct list_head		list;
};

/*
 * Fault flags; these are distinct bits and can be OR-ed together.
 * NOTE(review): presumably passed as the "flags" argument of
 * hmm_range_fault() / hmm_range_dma_map() - confirm against the implementation.
 */

/*
 * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
 */
#define HMM_FAULT_ALLOW_RETRY		(1 << 0)

/* Don't fault in missing PTEs, just snapshot the current state. */
#define HMM_FAULT_SNAPSHOT		(1 << 1)

#ifdef CONFIG_HMM_MIRROR
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);

/*
 * Please see Documentation/vm/hmm.rst for how to use the range API.
 */
int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror);
void hmm_range_unregister(struct hmm_range *range);

long hmm_range_fault(struct hmm_range *range, unsigned int flags);

long hmm_range_dma_map(struct hmm_range *range,
		       struct device *device,
		       dma_addr_t *daddrs,
		       unsigned int flags);
long hmm_range_dma_unmap(struct hmm_range *range,
			 struct device *device,
			 dma_addr_t *daddrs,
			 bool dirty);
#else /* !CONFIG_HMM_MIRROR */
/*
 * Stubs used when HMM mirroring is not configured: the functions returning a
 * status fail with -EOPNOTSUPP and the void ones do nothing.
 *
 * They must all be static inline. A plain (external) function definition in a
 * header is emitted by every translation unit that includes it, which results
 * in duplicate symbol definitions at link time.
 */
static inline int hmm_mirror_register(struct hmm_mirror *mirror,
				      struct mm_struct *mm)
{
	return -EOPNOTSUPP;
}

static inline void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
}

static inline int hmm_range_register(struct hmm_range *range,
				     struct hmm_mirror *mirror)
{
	return -EOPNOTSUPP;
}

static inline void hmm_range_unregister(struct hmm_range *range)
{
}

static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	return -EOPNOTSUPP;
}

static inline long hmm_range_dma_map(struct hmm_range *range,
				     struct device *device, dma_addr_t *daddrs,
				     unsigned int flags)
{
	return -EOPNOTSUPP;
}

static inline long hmm_range_dma_unmap(struct hmm_range *range,
				       struct device *device,
				       dma_addr_t *daddrs, bool dirty)
{
	return -EOPNOTSUPP;
}
#endif /* CONFIG_HMM_MIRROR */

/*
 * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
 *
 * When waiting for mmu notifiers we need some kind of timeout, otherwise we
 * could potentially wait forever; 1000ms (i.e. 1s) already sounds like a long
 * time to wait.
 */
#define HMM_RANGE_DEFAULT_TIMEOUT 1000

#endif /* LINUX_HMM_H */