vsyscall_64.c 9.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hint.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
27
#include <linux/clocksource.h>
28
#include <linux/getcpu.h>
29 30 31
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
L
Linus Torvalds 已提交
32 33 34 35

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
36
#include <asm/unistd.h>
L
Linus Torvalds 已提交
37 38 39
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
40 41 42
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
43
#include <asm/vgtod.h>
L
Linus Torvalds 已提交
44 45

/* Place a function into the numbered vsyscall slot; each slot lives in its
   own ".vsyscall_<nr>" section at a fixed user-visible address.  "unused"
   silences gcc since the functions are only entered from user space. */
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))

/* Registers clobbered by the SYSCALL instruction in the asm fallbacks. */
#define __syscall_clobber "r11","rcx","memory"

/* Physical address of a symbol inside the vsyscall page.  The empty asm
   launders the address through a register so gcc cannot fold the
   arithmetic against the symbol's link-time (vsyscall) address. */
#define __pa_vsymbol(x)			\
	({unsigned long v;  		\
	extern char __vsyscall_0; 	\
	  asm("" : "=r" (v) : "0" (x)); \
	  ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
L
Linus Torvalds 已提交
52

53 54 55 56 57 58
/*
 * vsyscall_gtod_data contains data that is :
 * - readonly from vsyscalls
 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
59
int __vgetcpu_mode __section_vgetcpu_mode;
L
Linus Torvalds 已提交
60

61
/* Time-of-day snapshot exported read-only to user space through the
   vsyscall data section; writers are update_vsyscall() and the sysctl. */
struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
	.lock = SEQLOCK_UNLOCKED,	/* protects the snapshot below */
	.sysctl_enabled = 1,		/* fast user-space path on by default */
};
L
Linus Torvalds 已提交
66

67
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
L
Linus Torvalds 已提交
68
{
69
	unsigned long flags;
L
Linus Torvalds 已提交
70

71 72
	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
73 74 75 76 77 78 79
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	vsyscall_gtod_data.clock.mult = clock->mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
80
	vsyscall_gtod_data.sys_tz = sys_tz;
81 82
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
83
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
L
Linus Torvalds 已提交
84 85
}

86 87 88
/* RED-PEN may want to readd seq locking, but then the variable should be
 * write-once.
 */
/* Copy the cached timezone to user space; unlocked, see note above. */
static __always_inline void do_get_tz(struct timezone * tz)
{
	*tz = __vsyscall_gtod_data.sys_tz;
}

94
/* Slow path: issue the real gettimeofday(2) system call.  The "vsysc2"
   label marks the SYSCALL opcode so vsyscall_sysctl_change() can patch
   it in or out with NOPs. */
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
	int ret;
	asm volatile("vsysc2: syscall"
		: "=a" (ret)
		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
		: __syscall_clobber );
	return ret;
}

104
/* Slow path: issue the real time(2) system call.  The "vsysc1" label
   marks the SYSCALL opcode for the sysctl patching code. */
static __always_inline long time_syscall(long *t)
{
	long secs;
	asm volatile("vsysc1: syscall"
		: "=a" (secs)
		: "0" (__NR_time),"D" (t) : __syscall_clobber);
	return secs;
}

113 114 115
/*
 * Fast user-space gettimeofday: read the clocksource directly through
 * its vread hook and combine with the last wall-time snapshot published
 * by update_vsyscall().  The seqlock read loop retries whenever the
 * kernel updated the snapshot concurrently.
 */
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	cycle_t now, base, mask, cycle_delta;
	unsigned seq;
	unsigned long mult, shift, nsec;
	cycle_t (*vread)(void);
	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);

		vread = __vsyscall_gtod_data.clock.vread;
		/* No user-readable clocksource, or the fast path is
		   disabled via sysctl: fall back to the real syscall. */
		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
			gettimeofday(tv,NULL);
			return;
		}
		/* Snapshot everything inside the seqlock so the values
		   are mutually consistent. */
		now = vread();
		base = __vsyscall_gtod_data.clock.cycle_last;
		mask = __vsyscall_gtod_data.clock.mask;
		mult = __vsyscall_gtod_data.clock.mult;
		shift = __vsyscall_gtod_data.clock.shift;

		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
	nsec += (cycle_delta * mult) >> shift;

	/* normalize nanoseconds into [0, NSEC_PER_SEC) */
	while (nsec >= NSEC_PER_SEC) {
		tv->tv_sec += 1;
		nsec -= NSEC_PER_SEC;
	}
	tv->tv_usec = nsec / NSEC_PER_USEC;
}

149
/* vsyscall slot 0: user-space gettimeofday().  Either pointer may be
   NULL, in which case that part of the result is not produced. */
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
	if (tv != NULL)
		do_vgettimeofday(tv);
	if (tz != NULL)
		do_get_tz(tz);
	return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
160
time_t __vsyscall(1) vtime(time_t *t)
L
Linus Torvalds 已提交
161
{
J
john stultz 已提交
162
	struct timeval tv;
163
	time_t result;
164
	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
L
Linus Torvalds 已提交
165
		return time_syscall(t);
J
john stultz 已提交
166 167 168

	vgettimeofday(&tv, 0);
	result = tv.tv_sec;
169 170 171
	if (t)
		*t = result;
	return result;
L
Linus Torvalds 已提交
172 173
}

174 175 176 177 178 179 180 181 182 183
/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to a two element sized long array.
   All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int dummy, p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->blob[0] == (j = __jiffies)) {
		/* Cache hit: reuse the value computed this jiffy. */
		p = tcache->blob[1];
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		rdtscp(dummy, dummy, p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		/* Refresh the cache with this jiffy's value. */
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	/* p packs cpu in the low 12 bits and node above them, matching
	   the encoding written by vsyscall_set_cpu(). */
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}

216
/* vsyscall slot 3: reserved/unimplemented; always fails with -ENOSYS. */
long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}

#ifdef CONFIG_SYSCTL

/* Machine code patched into the vsyscall page: */
#define SYSCALL 0x050f		/* two-byte SYSCALL opcode */
#define NOP2    0x9090		/* two one-byte NOPs */

/*
 * NOP out syscall in vsyscall page when not needed.
 *
 * sysctl handler for /proc/sys/kernel/vsyscall64: after proc_dointvec
 * updates sysctl_enabled, patch the SYSCALL opcodes at labels vsysc1
 * and vsysc2 (the fallback paths) in or out accordingly.
 */
static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
                        void __user *buffer, size_t *lenp, loff_t *ppos)
{
	extern u16 vsysc1, vsysc2;
	u16 __iomem *map1;
	u16 __iomem *map2;
	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
	if (!write)
		return ret;
	/* gcc has some trouble with __va(__pa()), so just do it this
	   way. */
	map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
	if (!map1)
		return -ENOMEM;
	map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
	if (!map2) {
		ret = -ENOMEM;
		goto out;
	}
	if (!vsyscall_gtod_data.sysctl_enabled) {
		/* Fast path off: restore the real SYSCALL opcodes. */
		writew(SYSCALL, map1);
		writew(SYSCALL, map2);
	} else {
		/* Fast path on: NOP out the fallback syscalls. */
		writew(NOP2, map1);
		writew(NOP2, map2);
	}
	iounmap(map2);
out:
	iounmap(map1);
	return ret;
}

/* Binary sysctl(2) access to this entry is not supported; only the
   /proc interface (vsyscall_sysctl_change) works. */
static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
				void __user *oldval, size_t __user *oldlenp,
				void __user *newval, size_t newlen)
{
	return -ENOSYS;
}

/* /proc/sys/kernel/vsyscall64 - toggles the user-space fast path. */
static ctl_table kernel_table2[] = {
	{ .ctl_name = 99, .procname = "vsyscall64",
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
	  .mode = 0644,
	  .strategy = vsyscall_sysctl_nostrat,
	  .proc_handler = vsyscall_sysctl_change },
	{}
};

/* Root of this file's private sysctl tree: kernel/ -> kernel_table2. */
static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};

#endif

285 286 287
/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */
/* Publish this CPU's number and NUMA node where user space can read
   them: via the RDTSCP auxiliary MSR when the CPU supports RDTSCP, and
   always via the limit field of a per-CPU GDT entry (read with LSL in
   vgetcpu()). */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long *d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node[cpu];
#endif
	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node.
	   NOTE(review): the code below packs cpu in bits 0-11, node
	   bits 0-3 at bit 12 and node bits 4-7 at bit 48 of the
	   descriptor word -- presumably the split halves of the GDT
	   limit field; confirm against the descriptor layout. */
	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
	*d = 0x0f40000000000ULL;
	*d |= cpu;
	*d |= (node & 0xf) << 12;
	*d |= (node >> 4) << 48;
}

307 308 309 310 311 312 313 314 315 316
/* Per-CPU setup hook; runs on the target CPU (arg is unused). */
static void __cpuinit cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	vsyscall_set_cpu(raw_smp_processor_id());
}

/* CPU hotplug callback: redo the per-CPU vsyscall setup on the CPU
   that just came online. */
static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;
	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
	return NOTIFY_DONE;
}

L
Linus Torvalds 已提交
322 323 324 325 326
/* Map the vsyscall page at its fixed, user-visible virtual address. */
static void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

/* Boot-time init: verify the linker placed each vsyscall entry point at
   its ABI-fixed address, map the page, register the sysctl, and run the
   per-CPU setup on every CPU (plus future hotplugged ones). */
static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
			VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
	map_vsyscall();
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
	hotcpu_notifier(cpu_vsyscall_notifier, 0);
	return 0;
}

__initcall(vsyscall_init);