// SPDX-License-Identifier: GPL-2.0
/*
 * KVM dirty page logging test
 *
 * Copyright (C) 2018, Red Hat, Inc.
 */

#define _GNU_SOURCE /* for program_invocation_name */

#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>

#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"

/* ID of the vCPU that is created for, and runs, the guest code */
#define VCPU_ID				1

/* The memory slot index to track dirty pages */
#define TEST_MEM_SLOT_INDEX		1

/* Default guest test virtual memory offset */
#define DEFAULT_GUEST_TEST_MEM		0xc0000000

/* How many pages to dirty for each guest loop */
#define TEST_PAGES_PER_LOOP		1024

/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
#define TEST_HOST_LOOP_N		32UL

/* Interval for each host loop (ms) */
#define TEST_HOST_LOOP_INTERVAL		10UL

/*
 * Dirty bitmaps are always little endian, so we need to swap on big endian.
 *
 * The *_le() helpers below take a little-endian bit number and forward to
 * the corresponding native bit operation.
 */
#if defined(__s390x__)
/*
 * XOR-ing a bit number with this mask flips the byte-index bits inside a
 * long while leaving the low three bits (the bit position within the byte)
 * untouched.
 */
# define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
# define test_bit_le(nr, addr) \
	test_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define set_bit_le(nr, addr) \
	set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define clear_bit_le(nr, addr) \
	clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define test_and_set_bit_le(nr, addr) \
	test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
# define test_and_clear_bit_le(nr, addr) \
	test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
#else
/* No swizzling on the other supported architectures: use the native ops. */
# define test_bit_le		test_bit
# define set_bit_le		set_bit
# define clear_bit_le		clear_bit
# define test_and_set_bit_le	test_and_set_bit
# define test_and_clear_bit_le	test_and_clear_bit
#endif

P
Peter Xu 已提交
60
/*
61 62 63 64
 * Guest/Host shared variables. Ensure addr_gva2hva() and/or
 * sync_global_to/from_guest() are used when accessing from
 * the host. READ/WRITE_ONCE() should also be used with anything
 * that may change.
P
Peter Xu 已提交
65
 */
66 67
static uint64_t host_page_size;
static uint64_t guest_page_size;
68
static uint64_t guest_num_pages;
69 70
static uint64_t random_array[TEST_PAGES_PER_LOOP];
static uint64_t iteration;
P
Peter Xu 已提交
71

72
/*
73 74 75 76 77 78 79 80 81
 * Guest physical memory offset of the testing memory slot.
 * This will be set to the topmost valid physical address minus
 * the test memory size.
 */
static uint64_t guest_test_phys_mem;

/*
 * Guest virtual memory offset of the testing memory slot.
 * Must not conflict with identity mapped test code.
82
 */
83
static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
84

P
Peter Xu 已提交
85
/*
86 87
 * Continuously write to the first 8 bytes of a random pages within
 * the testing memory region.
P
Peter Xu 已提交
88
 */
89
static void guest_code(void)
P
Peter Xu 已提交
90
{
91
	uint64_t addr;
92
	int i;
P
Peter Xu 已提交
93

94 95 96 97 98 99 100 101 102 103 104
	/*
	 * On s390x, all pages of a 1M segment are initially marked as dirty
	 * when a page of the segment is written to for the very first time.
	 * To compensate this specialty in this test, we need to touch all
	 * pages during the first iteration.
	 */
	for (i = 0; i < guest_num_pages; i++) {
		addr = guest_test_virt_mem + i * guest_page_size;
		*(uint64_t *)addr = READ_ONCE(iteration);
	}

P
Peter Xu 已提交
105 106
	while (true) {
		for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
107
			addr = guest_test_virt_mem;
108
			addr += (READ_ONCE(random_array[i]) % guest_num_pages)
109 110 111
				* guest_page_size;
			addr &= ~(host_page_size - 1);
			*(uint64_t *)addr = READ_ONCE(iteration);
P
Peter Xu 已提交
112
		}
113

P
Peter Xu 已提交
114 115 116 117 118
		/* Tell the host that we need more random numbers */
		GUEST_SYNC(1);
	}
}

/* Host variables */

/* Set by the main thread to ask the vCPU worker thread to stop */
static bool host_quit;

/* Points to the test VM memory region on which we track dirty logs */
static void *host_test_mem;
static uint64_t host_num_pages;

/* For statistics only */
static uint64_t host_dirty_count;
static uint64_t host_clear_count;
static uint64_t host_track_next_count;

/*
 * We use this bitmap to track some pages that should have its dirty
 * bit set in the _next_ iteration.  For example, if we detected the
 * page value changed to current iteration but at the same time the
 * page bit is cleared in the latest bitmap, then the system must
 * report that write in the next get dirty log call.
 */
static unsigned long *host_bmap_track;

/*
 * Fill the first @size entries of @guest_array with values from random().
 *
 * @guest_array: destination buffer; must hold at least @size entries
 * @size:        number of entries to write (0 leaves the buffer untouched)
 */
static void generate_random_array(uint64_t *guest_array, uint64_t size)
{
	uint64_t i;

	for (i = 0; i < size; i++)
		guest_array[i] = random();
}

148
static void *vcpu_worker(void *data)
P
Peter Xu 已提交
149 150 151
{
	int ret;
	struct kvm_vm *vm = data;
152 153
	uint64_t *guest_array;
	uint64_t pages_count = 0;
P
Peter Xu 已提交
154 155 156 157
	struct kvm_run *run;

	run = vcpu_state(vm, VCPU_ID);

158
	guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
P
Peter Xu 已提交
159 160 161
	generate_random_array(guest_array, TEST_PAGES_PER_LOOP);

	while (!READ_ONCE(host_quit)) {
162
		/* Let the guest dirty the random pages */
P
Peter Xu 已提交
163
		ret = _vcpu_run(vm, VCPU_ID);
164
		TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
165
		if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
P
Peter Xu 已提交
166 167 168 169 170 171 172 173 174 175
			pages_count += TEST_PAGES_PER_LOOP;
			generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
		} else {
			TEST_ASSERT(false,
				    "Invalid guest sync status: "
				    "exit_reason=%s\n",
				    exit_reason_str(run->exit_reason));
		}
	}

176
	pr_info("Dirtied %"PRIu64" pages\n", pages_count);
P
Peter Xu 已提交
177 178 179 180

	return NULL;
}

181
static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
P
Peter Xu 已提交
182
{
183
	uint64_t step = vm_num_host_pages(mode, 1);
P
Peter Xu 已提交
184
	uint64_t page;
185
	uint64_t *value_ptr;
P
Peter Xu 已提交
186

187
	for (page = 0; page < host_num_pages; page += step) {
188
		value_ptr = host_test_mem + page * host_page_size;
P
Peter Xu 已提交
189 190

		/* If this is a special page that we were tracking... */
191
		if (test_and_clear_bit_le(page, host_bmap_track)) {
P
Peter Xu 已提交
192
			host_track_next_count++;
193
			TEST_ASSERT(test_bit_le(page, bmap),
P
Peter Xu 已提交
194 195 196 197 198
				    "Page %"PRIu64" should have its dirty bit "
				    "set in this iteration but it is missing",
				    page);
		}

199
		if (test_bit_le(page, bmap)) {
P
Peter Xu 已提交
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
			host_dirty_count++;
			/*
			 * If the bit is set, the value written onto
			 * the corresponding page should be either the
			 * previous iteration number or the current one.
			 */
			TEST_ASSERT(*value_ptr == iteration ||
				    *value_ptr == iteration - 1,
				    "Set page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
		} else {
			host_clear_count++;
			/*
			 * If cleared, the value written can be any
			 * value smaller or equals to the iteration
			 * number.  Note that the value can be exactly
			 * (iteration-1) if that write can happen
			 * like this:
			 *
			 * (1) increase loop count to "iteration-1"
			 * (2) write to page P happens (with value
			 *     "iteration-1")
			 * (3) get dirty log for "iteration-1"; we'll
			 *     see that page P bit is set (dirtied),
			 *     and not set the bit in host_bmap_track
			 * (4) increase loop count to "iteration"
			 *     (which is current iteration)
			 * (5) get dirty log for current iteration,
			 *     we'll see that page P is cleared, with
			 *     value "iteration-1".
			 */
			TEST_ASSERT(*value_ptr <= iteration,
				    "Clear page %"PRIu64" value %"PRIu64
				    " incorrect (iteration=%"PRIu64")",
				    page, *value_ptr, iteration);
			if (*value_ptr == iteration) {
				/*
				 * This page is _just_ modified; it
				 * should report its dirtyness in the
				 * next run
				 */
242
				set_bit_le(page, host_bmap_track);
P
Peter Xu 已提交
243 244 245 246 247
			}
		}
	}
}

248
static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
249
				uint64_t extra_mem_pages, void *guest_code)
P
Peter Xu 已提交
250
{
251 252 253
	struct kvm_vm *vm;
	uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;

254 255
	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));

256
	vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
257 258 259 260 261 262
	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__
	vm_create_irqchip(vm);
#endif
	vm_vcpu_add_default(vm, vcpuid, guest_code);
	return vm;
P
Peter Xu 已提交
263 264
}

/* log2 of the base size of the dirty-tracked test memory */
#define DIRTY_MEM_BITS 30 /* 1G */
/* log2 of a 4K page; used when sizing page tables */
#define PAGE_SHIFT_4K  12

#ifdef USE_CLEAR_DIRTY_LOG
/* Argument passed when enabling KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 */
static uint64_t dirty_log_manual_caps;
#endif

272
static void run_test(enum vm_guest_mode mode, unsigned long iterations,
273
		     unsigned long interval, uint64_t phys_offset)
P
Peter Xu 已提交
274 275 276
{
	pthread_t vcpu_thread;
	struct kvm_vm *vm;
277
	unsigned long *bmap;
P
Peter Xu 已提交
278

279 280 281 282 283 284 285 286 287 288 289 290
	/*
	 * We reserve page table for 2 times of extra dirty mem which
	 * will definitely cover the original (1G+) test range.  Here
	 * we do the calculation with 4K page size which is the
	 * smallest so the page number will be enough for all archs
	 * (e.g., 64K page size guest will need even less memory for
	 * page tables).
	 */
	vm = create_vm(mode, VCPU_ID,
		       2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
		       guest_code);

291
	guest_page_size = vm_get_page_size(vm);
292 293 294 295
	/*
	 * A little more than 1G of guest page sized pages.  Cover the
	 * case where the size is not aligned to 64 pages.
	 */
296
	guest_num_pages = (1ul << (DIRTY_MEM_BITS -
297 298
				   vm_get_page_shift(vm))) + 3;
	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
299

300
	host_page_size = getpagesize();
301
	host_num_pages = vm_num_host_pages(mode, guest_num_pages);
302

303
	if (!phys_offset) {
304 305
		guest_test_phys_mem = (vm_get_max_gfn(vm) -
				       guest_num_pages) * guest_page_size;
306
		guest_test_phys_mem &= ~(host_page_size - 1);
307 308
	} else {
		guest_test_phys_mem = phys_offset;
309 310
	}

311 312 313 314 315
#ifdef __s390x__
	/* Align to 1M (segment size) */
	guest_test_phys_mem &= ~((1 << 20) - 1);
#endif

316
	pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
317

318 319
	bmap = bitmap_alloc(host_num_pages);
	host_bmap_track = bitmap_alloc(host_num_pages);
P
Peter Xu 已提交
320

321 322 323
#ifdef USE_CLEAR_DIRTY_LOG
	struct kvm_enable_cap cap = {};

324
	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
325
	cap.args[0] = dirty_log_manual_caps;
326 327 328
	vm_enable_cap(vm, &cap);
#endif

P
Peter Xu 已提交
329 330
	/* Add an extra memory slot for testing dirty logging */
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
331
				    guest_test_phys_mem,
P
Peter Xu 已提交
332
				    TEST_MEM_SLOT_INDEX,
333
				    guest_num_pages,
P
Peter Xu 已提交
334 335
				    KVM_MEM_LOG_DIRTY_PAGES);

336
	/* Do mapping for the dirty track memory slot */
337
	virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
338 339

	/* Cache the HVA pointer of the region */
340
	host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
P
Peter Xu 已提交
341

342
#ifdef __x86_64__
P
Peter Xu 已提交
343
	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
344
#endif
345
	ucall_init(vm, NULL);
P
Peter Xu 已提交
346

347
	/* Export the shared variables to the guest */
348 349
	sync_global_to_guest(vm, host_page_size);
	sync_global_to_guest(vm, guest_page_size);
350
	sync_global_to_guest(vm, guest_test_virt_mem);
351
	sync_global_to_guest(vm, guest_num_pages);
P
Peter Xu 已提交
352 353

	/* Start the iterations */
354 355
	iteration = 1;
	sync_global_to_guest(vm, iteration);
356 357 358 359
	host_quit = false;
	host_dirty_count = 0;
	host_clear_count = 0;
	host_track_next_count = 0;
P
Peter Xu 已提交
360 361 362

	pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);

363
	while (iteration < iterations) {
P
Peter Xu 已提交
364 365 366
		/* Give the vcpu thread some time to dirty some pages */
		usleep(interval * 1000);
		kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
367 368
#ifdef USE_CLEAR_DIRTY_LOG
		kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
369
				       host_num_pages);
370
#endif
371
		vm_dirty_log_verify(mode, bmap);
372 373
		iteration++;
		sync_global_to_guest(vm, iteration);
P
Peter Xu 已提交
374 375 376 377 378 379
	}

	/* Tell the vcpu thread to quit */
	host_quit = true;
	pthread_join(vcpu_thread, NULL);

380 381 382
	pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
		"track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
		host_track_next_count);
P
Peter Xu 已提交
383 384 385

	free(bmap);
	free(host_bmap_track);
386
	ucall_uninit(vm);
P
Peter Xu 已提交
387
	kvm_vm_free(vm);
388 389
}

390
struct guest_mode {
391 392 393
	bool supported;
	bool enabled;
};
394
static struct guest_mode guest_modes[NUM_VM_MODES];
395

396 397
#define guest_mode_init(mode, supported, enabled) ({ \
	guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
398
})
399 400 401 402 403 404

static void help(char *name)
{
	int i;

	puts("");
405
	printf("usage: %s [-h] [-i iterations] [-I interval] "
406
	       "[-p offset] [-m mode]\n", name);
407 408 409 410 411
	puts("");
	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
	       TEST_HOST_LOOP_N);
	printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
	       TEST_HOST_LOOP_INTERVAL);
412 413
	printf(" -p: specify guest physical test memory offset\n"
	       "     Warning: a low offset can conflict with the loaded test code.\n");
414 415 416 417 418
	printf(" -m: specify the guest mode ID to test "
	       "(default: test all supported modes)\n"
	       "     This option may be used multiple times.\n"
	       "     Guest mode IDs:\n");
	for (i = 0; i < NUM_VM_MODES; ++i) {
419
		printf("         %d:    %s%s\n", i, vm_guest_mode_string(i),
420
		       guest_modes[i].supported ? " (supported)" : "");
421 422 423 424 425 426 427 428 429 430
	}
	puts("");
	exit(0);
}

int main(int argc, char *argv[])
{
	unsigned long iterations = TEST_HOST_LOOP_N;
	unsigned long interval = TEST_HOST_LOOP_INTERVAL;
	bool mode_selected = false;
431
	uint64_t phys_offset = 0;
432
	unsigned int mode;
433 434
	int opt, i;

435
#ifdef USE_CLEAR_DIRTY_LOG
436 437 438
	dirty_log_manual_caps =
		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
	if (!dirty_log_manual_caps) {
439
		print_skip("KVM_CLEAR_DIRTY_LOG not available");
440 441
		exit(KSFT_SKIP);
	}
442 443
	dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
				  KVM_DIRTY_LOG_INITIALLY_SET);
444 445
#endif

446
#ifdef __x86_64__
447
	guest_mode_init(VM_MODE_PXXV48_4K, true, true);
448 449
#endif
#ifdef __aarch64__
450 451 452
	guest_mode_init(VM_MODE_P40V48_4K, true, true);
	guest_mode_init(VM_MODE_P40V48_64K, true, true);

453 454 455 456
	{
		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);

		if (limit >= 52)
457
			guest_mode_init(VM_MODE_P52V48_64K, true, true);
458
		if (limit >= 48) {
459 460
			guest_mode_init(VM_MODE_P48V48_4K, true, true);
			guest_mode_init(VM_MODE_P48V48_64K, true, true);
461
		}
462
	}
463
#endif
464
#ifdef __s390x__
465
	guest_mode_init(VM_MODE_P40V48_4K, true, true);
466
#endif
467

468
	while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) {
469 470 471 472 473 474 475
		switch (opt) {
		case 'i':
			iterations = strtol(optarg, NULL, 10);
			break;
		case 'I':
			interval = strtol(optarg, NULL, 10);
			break;
476
		case 'p':
477
			phys_offset = strtoull(optarg, NULL, 0);
478
			break;
479 480 481
		case 'm':
			if (!mode_selected) {
				for (i = 0; i < NUM_VM_MODES; ++i)
482
					guest_modes[i].enabled = false;
483 484 485 486 487
				mode_selected = true;
			}
			mode = strtoul(optarg, NULL, 10);
			TEST_ASSERT(mode < NUM_VM_MODES,
				    "Guest mode ID %d too big", mode);
488
			guest_modes[mode].enabled = true;
489 490 491 492 493 494 495 496 497 498 499
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
	TEST_ASSERT(interval > 0, "Interval must be greater than zero");

500 501
	pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
		iterations, interval);
502 503 504 505

	srandom(time(0));

	for (i = 0; i < NUM_VM_MODES; ++i) {
506
		if (!guest_modes[i].enabled)
507
			continue;
508
		TEST_ASSERT(guest_modes[i].supported,
509
			    "Guest mode ID %d (%s) not supported.",
510 511
			    i, vm_guest_mode_string(i));
		run_test(i, iterations, interval, phys_offset);
512
	}
P
Peter Xu 已提交
513 514 515

	return 0;
}