/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Heterogeneous Memory Management (HMM)
 *
 * See Documentation/vm/hmm.rst for reasons and an overview of what HMM is and
 * what it is for. Here we focus on the HMM API description, with some
 * explanation of the underlying implementation.
 *
 * Short description: HMM provides a set of helpers to share a virtual address
 * space between CPU and a device, so that the device can access any valid
 * address of the process (while still obeying memory protection). HMM also
 * provides helpers to migrate process memory to device memory, and back. Each
 * set of functionality (address space mirroring, and migration to and from
 * device memory) can be used independently of the other.
 *
 *
 * HMM address space mirroring API:
 *
 * Use HMM address space mirroring if you want to mirror a range of the CPU
 * page tables of a process into a device page table. Here, "mirror" means "keep
 * synchronized". Prerequisites: the device must provide the ability to write-
 * protect its page tables (at PAGE_SIZE granularity), and must be able to
 * recover from the resulting potential page faults.
 *
 * HMM guarantees that at any point in time, a given virtual address points to
 * either the same memory in both CPU and device page tables (that is: CPU and
 * device page tables each point to the same pages), or that one page table (CPU
 * or device) points to no entry, while the other still points to the old page
 * for the address. The latter case happens when the CPU page table update
 * happens first, and then the update is mirrored over to the device page table.
 * This does not cause any issue, because the CPU page table cannot start
 * pointing to a new page until the device page table is invalidated.
 *
 * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
 * updates to each device driver that has registered a mirror. It also provides
 * some API calls to help with taking a snapshot of the CPU page table, and to
 * synchronize with any updates that might happen concurrently.
 *
 *
 * HMM migration to and from device memory:
 *
 * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
 * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
 * of the device memory, and allows the device driver to manage its memory
 * using those struct pages. Having struct pages for device memory makes
 * migration easier. Because that memory is not addressable by the CPU it must
 * never be pinned to the device; in other words, any CPU page fault can always
 * cause the device memory to be migrated (copied/moved) back to regular memory.
 *
 * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
 * allows use of a device DMA engine to perform the copy operation between
 * regular system memory and device memory.
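 *
 * A rough, illustrative sketch of a migration to device memory built on those
 * helpers (destination page allocation, error handling and the DMA copy are
 * driver specific and only hinted at; src_pfns/dst_pfns are caller-provided
 * arrays sized for the range):
 *
 *	struct migrate_vma args = {
 *		.vma	= vma,
 *		.start	= start,
 *		.end	= end,
 *		.src	= src_pfns,
 *		.dst	= dst_pfns,
 *	};
 *
 *	ret = migrate_vma_setup(&args);
 *	if (ret)
 *		return ret;
 *	// allocate device pages and fill args.dst[] using migrate_pfn(),
 *	// then use the device DMA engine to copy the data over
 *	migrate_vma_pages(&args);
 *	// wait for the copy to complete before finalizing
 *	migrate_vma_finalize(&args);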
 */
#ifndef LINUX_HMM_H
#define LINUX_HMM_H

#include <linux/kconfig.h>
#include <asm/pgtable.h>

#include <linux/device.h>
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
#include <linux/mmu_notifier.h>

/*
 * hmm_pfn_flag_e - HMM flag enums
 *
 * Flags:
 * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
 * HMM_PFN_WRITE: CPU page table has write permission set
 *
 * The driver provides a flags array for mapping page protections to device
 * PTE bits. If the driver valid bit for an entry is bit 3,
 * i.e., (entry & (1 << 3)), then the driver must provide
 * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
 * The same logic applies to all flags. This is the same idea as vm_page_prot
 * in vma, except that this is per device driver rather than per architecture.
 * A sketch of such a mapping follows the enum below.
 */
enum hmm_pfn_flag_e {
	HMM_PFN_VALID = 0,
	HMM_PFN_WRITE,
	HMM_PFN_FLAG_MAX
};
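
/*
 * A minimal sketch of such a mapping, for a hypothetical driver whose device
 * PTEs use bit 0 for "valid" and bit 1 for "writable" (the array name and the
 * bit choices are purely illustrative):
 *
 *	static const uint64_t mydev_hmm_flags[HMM_PFN_FLAG_MAX] = {
 *		[HMM_PFN_VALID] = 1ULL << 0,
 *		[HMM_PFN_WRITE] = 1ULL << 1,
 *	};
 *
 * The driver then points hmm_range.flags at this array before calling
 * hmm_range_fault().
 */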

/*
 * hmm_pfn_value_e - HMM pfn special value
 *
 * Flags:
 * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
 * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
 * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
 *      result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
 *      be mirrored by a device, because the entry will never have HMM_PFN_VALID
 *      set and the pfn value is undefined.
 *
 * The driver provides values for the none entry, error entry, and special
 * entry. The driver can alias (i.e., use the same value for) error and
 * special, but it should not alias none with error or special.
 *
 * The HMM pfn value returned by hmm_range_fault() will be:
 * hmm_range.values[HMM_PFN_ERROR] if the CPU page table entry is poisoned,
 * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
 * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one.
 * A sketch of a driver-provided values array follows the enum below.
 */
enum hmm_pfn_value_e {
	HMM_PFN_ERROR,
	HMM_PFN_NONE,
	HMM_PFN_SPECIAL,
	HMM_PFN_VALUE_MAX
};
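
/*
 * A minimal sketch of driver-provided special values (purely illustrative; any
 * encoding that follows the aliasing rules above works, and keeping the
 * driver's valid bit clear in these values guarantees they can never collide
 * with a real, valid entry):
 *
 *	static const uint64_t mydev_hmm_values[HMM_PFN_VALUE_MAX] = {
 *		[HMM_PFN_ERROR]   = 0xfffffffffffffffeULL,
 *		[HMM_PFN_NONE]    = 0,
 *		[HMM_PFN_SPECIAL] = 0xfffffffffffffffcULL,
 *	};
 *
 * The driver then points hmm_range.values at this array before calling
 * hmm_range_fault().
 */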

/*
 * struct hmm_range - track invalidation lock on virtual address range
 *
 * @notifier: a mmu_interval_notifier that includes the start/end
 * @notifier_seq: result of mmu_interval_read_begin()
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of pfns (big enough for the range)
 * @flags: pfn flags to match device driver page table
 * @values: pfn value for some special case (none, special, error, ...)
 * @default_flags: default flags for the range (write, read, ... see hmm doc)
 * @pfn_flags_mask: allows masking pfn flags so that only default_flags matter
 * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
 * @dev_private_owner: owner of device private pages
 */
struct hmm_range {
	struct mmu_interval_notifier *notifier;
	unsigned long		notifier_seq;
	unsigned long		start;
	unsigned long		end;
	uint64_t		*pfns;
	const uint64_t		*flags;
	const uint64_t		*values;
	uint64_t		default_flags;
	uint64_t		pfn_flags_mask;
	uint8_t			pfn_shift;
	void			*dev_private_owner;
};
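
/*
 * A minimal sketch of filling in a range before calling hmm_range_fault(),
 * reusing the illustrative mydev_hmm_flags/mydev_hmm_values arrays from the
 * examples above; interval_sub, addr, npages and pfns stand for the driver's
 * own mmu_interval_notifier, faulting address, page count and entry array.
 * Setting default_flags to the valid flag with a zero pfn_flags_mask requests
 * at least read permission for every page (see Documentation/vm/hmm.rst):
 *
 *	struct hmm_range range = {
 *		.notifier	= &interval_sub,
 *		.start		= addr,
 *		.end		= addr + (npages << PAGE_SHIFT),
 *		.pfns		= pfns,
 *		.flags		= mydev_hmm_flags,
 *		.values		= mydev_hmm_values,
 *		.pfn_shift	= PAGE_SHIFT,
 *		.default_flags	= mydev_hmm_flags[HMM_PFN_VALID],
 *		.pfn_flags_mask	= 0,
 *	};
 */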

/*
 * hmm_device_entry_to_page() - return struct page pointed to by a device entry
 * @range: range used to decode device entry value
 * @entry: device entry value to get corresponding struct page from
 * Return: struct page pointer if entry is valid, NULL otherwise
 *
 * If the device entry is valid (i.e. valid flag set) then return the struct page
 * matching the entry value. Otherwise return NULL.
 */
static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
						    uint64_t entry)
{
	if (entry == range->values[HMM_PFN_NONE])
		return NULL;
	if (entry == range->values[HMM_PFN_ERROR])
		return NULL;
	if (entry == range->values[HMM_PFN_SPECIAL])
		return NULL;
	if (!(entry & range->flags[HMM_PFN_VALID]))
		return NULL;
	return pfn_to_page(entry >> range->pfn_shift);
}
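
/*
 * For example, after a successful hmm_range_fault() a driver might walk the
 * snapshot and program one device PTE per entry under its own page table
 * lock; mydev_map_page() is a stand-in for whatever the driver actually does:
 *
 *	for (i = 0; i < npages; i++) {
 *		struct page *page;
 *
 *		page = hmm_device_entry_to_page(&range, range.pfns[i]);
 *		if (!page)
 *			continue;	// none, error or special entry
 *		mydev_map_page(mydev, range.start + (i << PAGE_SHIFT), page,
 *			       range.pfns[i] & range.flags[HMM_PFN_WRITE]);
 *	}
 */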

/*
 * hmm_device_entry_to_pfn() - return pfn value stored in a device entry
 * @range: range used to decode device entry value
 * @pfn: device entry to extract pfn from
 * Return: pfn value if device entry is valid, -1UL otherwise
 */
static inline unsigned long
hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
{
	if (pfn == range->values[HMM_PFN_NONE])
		return -1UL;
	if (pfn == range->values[HMM_PFN_ERROR])
		return -1UL;
	if (pfn == range->values[HMM_PFN_SPECIAL])
		return -1UL;
	if (!(pfn & range->flags[HMM_PFN_VALID]))
		return -1UL;
	return (pfn >> range->pfn_shift);
}

/*
 * hmm_device_entry_from_page() - create a valid device entry for a page
 * @range: range used to encode HMM pfn value
 * @page: page for which to create the device entry
 * Return: valid device entry for the page
 */
static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
						  struct page *page)
{
	return (page_to_pfn(page) << range->pfn_shift) |
		range->flags[HMM_PFN_VALID];
}

/*
 * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
 * @range: range used to encode HMM pfn value
 * @pfn: pfn value for which to create the device entry
 * Return: valid device entry for the pfn
 */
static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
						 unsigned long pfn)
{
	return (pfn << range->pfn_shift) |
		range->flags[HMM_PFN_VALID];
}

/* Don't fault in missing PTEs, just snapshot the current state. */
#define HMM_FAULT_SNAPSHOT		(1 << 1)

/*
 * Please see Documentation/vm/hmm.rst for how to use the range API.
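 *
 * A rough sketch of the usual calling pattern; take_driver_page_table_lock()
 * and release_driver_page_table_lock() stand in for the driver's own locking,
 * and mm is the mm_struct the notifier was registered against:
 *
 * again:
 *	range.notifier_seq = mmu_interval_read_begin(range.notifier);
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_fault(&range, 0);
 *	up_read(&mm->mmap_sem);
 *	if (ret < 0) {
 *		if (ret == -EBUSY)
 *			goto again;	// collided with an invalidation, retry
 *		return ret;
 *	}
 *
 *	take_driver_page_table_lock();
 *	if (mmu_interval_read_retry(range.notifier, range.notifier_seq)) {
 *		release_driver_page_table_lock();
 *		goto again;
 *	}
 *	// program the device page table from range.pfns[] under the lock
 *	release_driver_page_table_lock();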
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags);

/*
 * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
 *
 * When waiting for mmu notifiers we need some kind of time out, otherwise we
 * could potentially wait forever; 1000ms (i.e. 1s) already sounds like a long
 * time to wait.
 */
#define HMM_RANGE_DEFAULT_TIMEOUT 1000

#endif /* LINUX_HMM_H */