/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hints.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */
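
/*
 * Illustrative sketch (not part of this file; the constants below are
 * assumptions restating the layout described above): user space can call
 * a vsyscall directly through its fixed address, e.g. slot 0 for
 * vgettimeofday():
 *
 *	#include <sys/time.h>
 *
 *	#define VSYSCALL_START	0xffffffffff600000UL	// -10 Mbyte
 *	#define VSYSCALL_SIZE	1024UL			// one slot
 *
 *	typedef int (*vgtod_fn)(struct timeval *, struct timezone *);
 *
 *	int example_gettimeofday(struct timeval *tv)
 *	{
 *		vgtod_fn f = (vgtod_fn)(VSYSCALL_START + 0 * VSYSCALL_SIZE);
 *		return f(tv, NULL);
 *	}
 *
 * In practice libc typically resolves gettimeofday()/time() to this page
 * on x86-64, so applications rarely need to call it by hand.
 */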

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>

#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
#define __syscall_clobber "r11","cx","memory"
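
/*
 * Convert the address of a symbol living in the vsyscall page (linked at
 * the fixed VSYSCALL_START virtual address) into the physical address of
 * its copy in the kernel image, using __vsyscall_0 as the anchor.  The
 * empty asm() launders the address through a register so gcc treats it
 * as an opaque value instead of folding the arithmetic at compile time.
 */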
#define __pa_vsymbol(x)			\
	({unsigned long v;  		\
	extern char __vsyscall_0; 	\
	  asm("" : "=r" (v) : "0" (x)); \
	  ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })

/*
 * vsyscall_gtod_data contains data that is :
 * - readonly from vsyscalls
 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
int __vgetcpu_mode __section_vgetcpu_mode;

struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
	.lock = SEQLOCK_UNLOCKED,
	.sysctl_enabled = 1,
};

void update_vsyscall_tz(void)
{
	unsigned long flags;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* sys_tz has changed */
	vsyscall_gtod_data.sys_tz = sys_tz;
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

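/*
 * Called from the timekeeping core when wall time is updated: publish
 * the values that the user-space do_vgettimeofday() fast path below
 * reads under the same seqlock.
 */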
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
{
	unsigned long flags;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	vsyscall_gtod_data.clock.mult = clock->mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

/* RED-PEN may want to readd seq locking, but then the variable should be
 * write-once.
 */
static __always_inline void do_get_tz(struct timezone * tz)
{
	*tz = __vsyscall_gtod_data.sys_tz;
}

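/*
 * Fallback helpers that enter the kernel with a real system call.  The
 * vsysc1/vsysc2 labels mark the syscall instructions so that the
 * vsyscall64 sysctl handler below can patch them (SYSCALL when the fast
 * path is disabled, NOP2 when it is enabled).
 */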
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
	int ret;
	asm volatile("vsysc2: syscall"
		: "=a" (ret)
		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
		: __syscall_clobber );
	return ret;
}

static __always_inline long time_syscall(long *t)
{
	long secs;
	asm volatile("vsysc1: syscall"
		: "=a" (secs)
		: "0" (__NR_time),"D" (t) : __syscall_clobber);
	return secs;
}

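/*
 * Core of the gettimeofday() fast path: snapshot the clocksource
 * parameters and wall time under the gtod seqlock, retrying if an update
 * raced with us, then extrapolate nanoseconds from the user-readable
 * cycle counter (clock->vread, e.g. the TSC).  Falls back to the real
 * syscall when the vsyscall is disabled or the current clocksource has
 * no user-space read function.
 */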
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	cycle_t now, base, mask, cycle_delta;
	unsigned seq;
	unsigned long mult, shift, nsec;
	cycle_t (*vread)(void);
	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);

		vread = __vsyscall_gtod_data.clock.vread;
		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
			gettimeofday(tv,NULL);
			return;
		}
		now = vread();
		base = __vsyscall_gtod_data.clock.cycle_last;
		mask = __vsyscall_gtod_data.clock.mask;
		mult = __vsyscall_gtod_data.clock.mult;
		shift = __vsyscall_gtod_data.clock.shift;

		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
	nsec += (cycle_delta * mult) >> shift;

	while (nsec >= NSEC_PER_SEC) {
		tv->tv_sec += 1;
		nsec -= NSEC_PER_SEC;
	}
	tv->tv_usec = nsec / NSEC_PER_USEC;
}

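/*
 * The actual vsyscall entry points.  __vsyscall(N) places each function
 * in its own .vsyscall_N section, which the linker script pins at the
 * fixed address VSYSCALL_ADDR(N); vsyscall_init() sanity-checks this
 * with BUG_ON()s below.
 */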
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
	if (tv)
		do_vgettimeofday(tv);
	if (tz)
		do_get_tz(tz);
	return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
	struct timeval tv;
	time_t result;
	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
		return time_syscall(t);

	vgettimeofday(&tv, NULL);
	result = tv.tv_sec;
	if (t)
		*t = result;
	return result;
}

/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to a two element sized long array.
   All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int dummy, p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->blob[0] == (j = __jiffies)) {
		p = tcache->blob[1];
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		rdtscp(dummy, dummy, p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}

long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}

#ifdef CONFIG_SYSCTL

#define SYSCALL 0x050f
#define NOP2    0x9090

/*
 * NOP out syscall in vsyscall page when not needed.
 */
static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
                        void __user *buffer, size_t *lenp, loff_t *ppos)
{
	extern u16 vsysc1, vsysc2;
	u16 __iomem *map1;
	u16 __iomem *map2;
	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
	if (!write)
		return ret;
	/* gcc has some trouble with __va(__pa()), so just do it this
	   way. */
	map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
	if (!map1)
		return -ENOMEM;
	map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
	if (!map2) {
		ret = -ENOMEM;
		goto out;
	}
	if (!vsyscall_gtod_data.sysctl_enabled) {
		writew(SYSCALL, map1);
		writew(SYSCALL, map2);
	} else {
		writew(NOP2, map1);
		writew(NOP2, map2);
	}
	iounmap(map2);
out:
	iounmap(map1);
	return ret;
}

static ctl_table kernel_table2[] = {
	{ .procname = "vsyscall64",
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
	  .mode = 0644,
	  .proc_handler = vsyscall_sysctl_change },
	{}
};

static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};

#endif

/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long *d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
	*d = 0x0f40000000000ULL;
	*d |= cpu;
	*d |= (node & 0xf) << 12;
	*d |= (node >> 4) << 48;
}

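/*
 * Program the per-CPU GDT entry on every CPU: once at boot via
 * on_each_cpu() in vsyscall_init(), and again from the hotplug notifier
 * whenever a CPU comes online.
 */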
static void __cpuinit cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	vsyscall_set_cpu(raw_smp_processor_id());
}

static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;
	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
	return NOTIFY_DONE;
}

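/*
 * Map the physical page holding the vsyscall code (__vsyscall_0) at the
 * fixed virtual address via the fixmap.  PAGE_KERNEL_VSYSCALL makes the
 * page readable and executable from user mode.
 */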
static void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
			VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
	map_vsyscall();
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
	hotcpu_notifier(cpu_vsyscall_notifier, 0);
	return 0;
}

__initcall(vsyscall_init);