vsyscall_64.c 9.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hints.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
27
#include <linux/clocksource.h>
28
#include <linux/getcpu.h>
29 30 31
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
L
Linus Torvalds 已提交
32 33 34 35

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
36
#include <asm/unistd.h>
L
Linus Torvalds 已提交
37 38 39
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
40 41 42
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
43
#include <asm/vgtod.h>
L
Linus Torvalds 已提交
44 45

/* Place a function into the numbered vsyscall slot: its own
   ".vsyscall_<nr>" section, which ends up in the fixed user-visible
   vsyscall page (see map_vsyscall() below). */
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))

/* Registers clobbered by the SYSCALL instruction in the asm fallbacks. */
#define __syscall_clobber "r11","rcx","memory"

/* Convert the virtual address of a symbol inside the vsyscall page to
   its physical address.  The empty asm launders the value through a
   register so gcc cannot constant-fold the arithmetic on the symbol
   address (see the "gcc has some trouble with __va(__pa())" note at
   the call site). */
#define __pa_vsymbol(x)			\
	({unsigned long v;  		\
	extern char __vsyscall_0; 	\
	  asm("" : "=r" (v) : "0" (x)); \
	  ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
L
Linus Torvalds 已提交
52

53 54 55 56 57 58
/*
 * vsyscall_gtod_data contains data that is :
 * - readonly from vsyscalls
 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
59
/* Which strategy vgetcpu() uses (e.g. VGETCPU_RDTSCP); placed in its
   own section so it is readable from the user-mapped vsyscall page. */
int __vgetcpu_mode __section_vgetcpu_mode;
L
Linus Torvalds 已提交
60

61
/* Time data exported to user space: written by update_vsyscall() under
   the seqlock, read locklessly (with retry) by the vsyscalls below. */
struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
	.lock = SEQLOCK_UNLOCKED,
	.sysctl_enabled = 1,	/* user-mode fast path on by default */
};
L
Linus Torvalds 已提交
66

67
/*
 * Publish the current wall time and clocksource parameters into the
 * user-mapped vsyscall_gtod_data, so vgettimeofday()/vtime() can
 * compute the time without entering the kernel.
 *
 * The write side of the seqlock is taken with interrupts disabled;
 * user-space readers detect a concurrent update via read_seqretry()
 * and retry (see do_vgettimeofday()).
 */
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
{
	unsigned long flags;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	vsyscall_gtod_data.clock.mult = clock->mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
	vsyscall_gtod_data.sys_tz = sys_tz;
	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

85 86 87
/* RED-PEN may want to re-add seq locking, but then the variable should be
 * write-once.
 */
static __always_inline void do_get_tz(struct timezone * tz)
{
	/* Snapshot of the kernel timezone, copied in by update_vsyscall().
	   Read without the seqlock (see RED-PEN above). */
	*tz = __vsyscall_gtod_data.sys_tz;
}

93
/*
 * Slow path: issue the real gettimeofday system call.  The "vsysc2"
 * label marks the SYSCALL instruction so vsyscall_sysctl_change() can
 * patch it (SYSCALL vs. two NOPs) when /proc/sys/kernel/vsyscall64 is
 * toggled — do not change this asm.
 */
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
	int ret;
	asm volatile("vsysc2: syscall"
		: "=a" (ret)
		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
		: __syscall_clobber );
	return ret;
}

103
static __always_inline long time_syscall(long *t)
L
Linus Torvalds 已提交
104 105 106 107 108 109 110 111
{
	long secs;
	asm volatile("vsysc1: syscall"
		: "=a" (secs)
		: "0" (__NR_time),"D" (t) : __syscall_clobber);
	return secs;
}

112 113 114
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	cycle_t now, base, mask, cycle_delta;
115 116
	unsigned seq;
	unsigned long mult, shift, nsec;
117 118 119 120 121 122
	cycle_t (*vread)(void);
	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);

		vread = __vsyscall_gtod_data.clock.vread;
		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
A
Al Viro 已提交
123
			gettimeofday(tv,NULL);
124 125 126 127 128 129 130 131
			return;
		}
		now = vread();
		base = __vsyscall_gtod_data.clock.cycle_last;
		mask = __vsyscall_gtod_data.clock.mask;
		mult = __vsyscall_gtod_data.clock.mult;
		shift = __vsyscall_gtod_data.clock.shift;

132 133
		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
134 135 136 137 138
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
139
	nsec += (cycle_delta * mult) >> shift;
140

141
	while (nsec >= NSEC_PER_SEC) {
142
		tv->tv_sec += 1;
143
		nsec -= NSEC_PER_SEC;
144
	}
145
	tv->tv_usec = nsec / NSEC_PER_USEC;
146 147
}

148
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
L
Linus Torvalds 已提交
149 150 151 152 153 154 155 156 157 158
{
	if (tv)
		do_vgettimeofday(tv);
	if (tz)
		do_get_tz(tz);
	return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
/* vsyscall slot 1: time(). */
time_t __vsyscall(1) vtime(time_t *t)
{
	struct timeval tv;
	time_t result;
	/* Fast path disabled via sysctl: use the real system call. */
	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
		return time_syscall(t);

	/* Reuse the gettimeofday fast path and keep only the seconds. */
	vgettimeofday(&tv, 0);
	result = tv.tv_sec;
	if (t)
		*t = result;
	return result;
}

/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to a two element sized long array.
   All arguments can be NULL. */
/* vsyscall slot 2: getcpu().  The value p encodes the CPU in the low
   12 bits and the node above them, matching the layout written by
   vsyscall_set_cpu(). */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int dummy, p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->blob[0] == (j = __jiffies)) {
		p = tcache->blob[1];
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		rdtscp(dummy, dummy, p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		/* Refresh the per-jiffy cache for the next caller. */
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}

/* vsyscall slot 3: unused; always fails with -ENOSYS. */
long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}

#ifdef CONFIG_SYSCTL

/* Instruction encodings patched into the vsyscall page. */
#define SYSCALL 0x050f
#define NOP2    0x9090

/*
 * NOP out syscall in vsyscall page when not needed.
 *
 * proc handler for /proc/sys/kernel/vsyscall64: after updating
 * sysctl_enabled, rewrite the two SYSCALL patch sites (vsysc1 in
 * time_syscall(), vsysc2 in gettimeofday()) through a temporary
 * ioremap of their physical addresses, since the vsyscall page
 * itself is mapped read-only.
 *
 * NOTE(review): when enabled, both sites are NOPed, yet
 * do_vgettimeofday() still calls the gettimeofday() fallback when
 * clock->vread is NULL — presumably that combination cannot occur
 * once a vread-capable clocksource is selected; worth confirming.
 */
static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
                        void __user *buffer, size_t *lenp, loff_t *ppos)
{
	extern u16 vsysc1, vsysc2;
	u16 __iomem *map1;
	u16 __iomem *map2;
	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
	if (!write)
		return ret;
	/* gcc has some trouble with __va(__pa()), so just do it this
	   way. */
	map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
	if (!map1)
		return -ENOMEM;
	map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
	if (!map2) {
		ret = -ENOMEM;
		goto out;
	}
	if (!vsyscall_gtod_data.sysctl_enabled) {
		/* Fast path off: the fallbacks must issue real SYSCALLs. */
		writew(SYSCALL, map1);
		writew(SYSCALL, map2);
	} else {
		/* Fast path on: neutralize the syscall instructions. */
		writew(NOP2, map1);
		writew(NOP2, map2);
	}
	iounmap(map2);
out:
	iounmap(map1);
	return ret;
}

/* No binary sys_sysctl(2) strategy for this entry; /proc access only. */
static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
				void __user *oldval, size_t __user *oldlenp,
				void __user *newval, size_t newlen)
{
	return -ENOSYS;
}

/* /proc/sys/kernel/vsyscall64 — toggles the user-mode fast path. */
static ctl_table kernel_table2[] = {
	{ .ctl_name = 99, .procname = "vsyscall64",
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
	  .mode = 0644,
	  .strategy = vsyscall_sysctl_nostrat,
	  .proc_handler = vsyscall_sysctl_change },
	{}
};

/* Root table hanging the entry above under "kernel". */
static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};

#endif

/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
/*
 * Per-CPU setup for vgetcpu(): publish (node << 12) | cpu both in the
 * RDTSCP auxiliary MSR (when available) and in a GDT entry's limit
 * field, so user space can recover it with either RDTSCP or LSL.
 */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long *d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
	*d = 0x0f40000000000ULL;	/* descriptor template; limit bits hold the data */
	*d |= cpu;			/* CPU in the low 12 bits of the limit */
	*d |= (node & 0xf) << 12;	/* low 4 node bits next to the CPU */
	*d |= (node >> 4) << 48;	/* remaining node bits in the upper limit field */
}

/* Per-CPU init hook: runs on the target CPU (via on_each_cpu /
   smp_call_function_single) and configures it for vgetcpu(). */
static void __cpuinit cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	vsyscall_set_cpu(raw_smp_processor_id());
}

/*
 * CPU hotplug callback: when a CPU comes online, run
 * cpu_vsyscall_init() on it so its vgetcpu() data is set up.
 */
static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
		break;
	default:
		break;
	}
	return NOTIFY_DONE;
}

/* Map the vsyscall code/data page at its fixed user-visible address. */
static void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

/*
 * Boot-time setup: sanity-check that the linker placed each vsyscall
 * at its ABI-fixed address, map the page, register the sysctl, and
 * initialize every CPU (current and future) for vgetcpu().
 */
static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
			VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
	map_vsyscall();
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	/* Set up CPUs already online, then catch later hotplug arrivals. */
	on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
	hotcpu_notifier(cpu_vsyscall_notifier, 0);
	return 0;
}

__initcall(vsyscall_init);