vsyscall_64.c 8.5 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hint.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
27
#include <linux/clocksource.h>
28
#include <linux/getcpu.h>
29 30 31
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
L
Linus Torvalds 已提交
32 33 34 35

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
36
#include <asm/unistd.h>
L
Linus Torvalds 已提交
37 38 39
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
40 41 42
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
43
#include <asm/vgtod.h>
L
Linus Torvalds 已提交
44 45

/*
 * Tag a definition so it is emitted into the ".vsyscall_<nr>" section.
 * The "unused" attribute is included because the tagged entry points are
 * not necessarily referenced from within this file.
 */
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
/* Clobber list shared by the inline "syscall" asm statements in this file. */
#define __syscall_clobber "r11","cx","memory"
L
Linus Torvalds 已提交
47

48 49 50
/*
 * vsyscall_gtod_data contains data that is :
 * - readonly from vsyscalls
S
Simon Arlott 已提交
51
 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
52 53
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
54
/* Compared against VGETCPU_RDTSCP in vgetcpu() to choose between the
 * RDTSCP read and the GDT limit (lsl) read of the CPU/node word. */
int __vgetcpu_mode __section_vgetcpu_mode;
L
Linus Torvalds 已提交
55

56
/*
 * Time data read by the vsyscall fast paths below.
 * Initial state: seqlock unlocked and sysctl_enabled = 1, i.e. the
 * readers do not take the "sysctl disabled" fallback to the real syscall.
 */
struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
	.lock = SEQLOCK_UNLOCKED,
	.sysctl_enabled = 1,
};
L
Linus Torvalds 已提交
61

62 63 64 65 66 67 68 69 70 71
/*
 * Copy the current sys_tz into the vsyscall time data.
 * The store is done while holding the gtod seqlock as a writer, with the
 * interrupt state saved on entry and restored on exit.
 */
void update_vsyscall_tz(void)
{
	unsigned long irq_state;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, irq_state);
	vsyscall_gtod_data.sys_tz = sys_tz;	/* sys_tz has changed */
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, irq_state);
}

72
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
L
Linus Torvalds 已提交
73
{
74
	unsigned long flags;
L
Linus Torvalds 已提交
75

76 77
	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
78 79 80 81 82 83 84
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	vsyscall_gtod_data.clock.mult = clock->mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
85
	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
86
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
L
Linus Torvalds 已提交
87 88
}

89 90 91
/* RED-PEN may want to readd seq locking, but then the variable should be
 * write-once.
 */
92
static __always_inline void do_get_tz(struct timezone * tz)
L
Linus Torvalds 已提交
93
{
94
	*tz = __vsyscall_gtod_data.sys_tz;
L
Linus Torvalds 已提交
95 96
}

97
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
L
Linus Torvalds 已提交
98 99
{
	int ret;
T
Thomas Gleixner 已提交
100
	asm volatile("syscall"
L
Linus Torvalds 已提交
101
		: "=a" (ret)
102 103
		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
		: __syscall_clobber );
L
Linus Torvalds 已提交
104 105 106
	return ret;
}

107
static __always_inline long time_syscall(long *t)
L
Linus Torvalds 已提交
108 109
{
	long secs;
T
Thomas Gleixner 已提交
110
	asm volatile("syscall"
L
Linus Torvalds 已提交
111 112 113 114 115
		: "=a" (secs)
		: "0" (__NR_time),"D" (t) : __syscall_clobber);
	return secs;
}

116 117 118
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	cycle_t now, base, mask, cycle_delta;
119 120
	unsigned seq;
	unsigned long mult, shift, nsec;
121 122 123 124 125 126
	cycle_t (*vread)(void);
	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);

		vread = __vsyscall_gtod_data.clock.vread;
		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
A
Al Viro 已提交
127
			gettimeofday(tv,NULL);
128 129 130 131 132 133 134 135
			return;
		}
		now = vread();
		base = __vsyscall_gtod_data.clock.cycle_last;
		mask = __vsyscall_gtod_data.clock.mask;
		mult = __vsyscall_gtod_data.clock.mult;
		shift = __vsyscall_gtod_data.clock.shift;

136 137
		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
138 139 140 141 142
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
143
	nsec += (cycle_delta * mult) >> shift;
144

145
	while (nsec >= NSEC_PER_SEC) {
146
		tv->tv_sec += 1;
147
		nsec -= NSEC_PER_SEC;
148
	}
149
	tv->tv_usec = nsec / NSEC_PER_USEC;
150 151
}

152
/*
 * vsyscall slot 0: gettimeofday.
 * Either argument may be NULL, in which case that part is skipped.
 * Always returns 0.
 */
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
	if (tv)
		do_vgettimeofday(tv);
	if (tz)
		do_get_tz(tz);
	return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
163
time_t __vsyscall(1) vtime(time_t *t)
L
Linus Torvalds 已提交
164
{
J
john stultz 已提交
165
	struct timeval tv;
166
	time_t result;
167
	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
L
Linus Torvalds 已提交
168
		return time_syscall(t);
J
john stultz 已提交
169

170
	vgettimeofday(&tv, NULL);
J
john stultz 已提交
171
	result = tv.tv_sec;
172 173 174
	if (t)
		*t = result;
	return result;
L
Linus Torvalds 已提交
175 176
}

177 178 179 180 181 182 183 184 185 186
/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   The CPU/node word is decoded as: CPU = low 12 bits, node = the bits
   above them.

   tcache must point to a two element sized long array.
   All arguments can be NULL. Always returns 0. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->blob[0] == (j = __jiffies)) {
		/* blob[0] holds the jiffies stamp, blob[1] the cached word */
		p = tcache->blob[1];
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		native_read_tscp(&p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		/* refresh the cache (rewrites the same values on a hit) */
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}

I
Ingo Molnar 已提交
219
/* vsyscall slot 3: no service implemented here; always returns -ENOSYS. */
static long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}

#ifdef CONFIG_SYSCTL
225 226 227 228 229 230 231 232

/*
 * proc handler for the "vsyscall64" sysctl entry.
 * Forwards every argument unchanged to proc_dointvec() and returns its
 * result.
 */
static int
vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int status;

	status = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
	return status;
}

L
Linus Torvalds 已提交
233
static ctl_table kernel_table2[] = {
234
	{ .procname = "vsyscall64",
235
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
236 237
	  .mode = 0644,
	  .proc_handler = vsyscall_sysctl_change },
238
	{}
L
Linus Torvalds 已提交
239 240 241 242 243
};

/*
 * Root table handed to register_sysctl_table(): a "kernel" directory
 * (CTL_KERN, mode 0555) whose children are kernel_table2. The empty
 * entry terminates the table.
 */
static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};
#endif

248 249 250
/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */

/*
 * Publish the CPU and node numbers of @cpu where vgetcpu() can read them:
 *  - if the CPU has X86_FEATURE_RDTSCP, in the RDTSCP aux value as
 *    (node << 12) | cpu;
 *  - always, in the GDT_ENTRY_PER_CPU descriptor of that CPU's GDT.
 * Without CONFIG_NUMA the node is 0.
 */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long *d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
	/* NOTE(review): base descriptor bits; presumably type/flag fields of
	   the segment descriptor - confirm against the descriptor layout. */
	*d = 0x0f40000000000ULL;
	*d |= cpu;			/* CPU number in the low bits */
	*d |= (node & 0xf) << 12;	/* low 4 node bits at bits 12-15 */
	*d |= (node >> 4) << 48;	/* remaining node bits from bit 48 up */
}

270 271 272 273 274 275 276 277 278 279
/*
 * Per-CPU callback: set up the vgetcpu data for the CPU this runs on.
 * @arg is unused.
 */
static void __cpuinit cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	int this_cpu = raw_smp_processor_id();

	vsyscall_set_cpu(this_cpu);
}

/*
 * CPU hotplug notifier.
 * On CPU_ONLINE / CPU_ONLINE_FROZEN, runs cpu_vsyscall_init() on the CPU
 * whose number is carried in @arg. Other actions are ignored.
 * Always returns NOTIFY_DONE.
 */
static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;
	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
	return NOTIFY_DONE;
}

I
Ingo Molnar 已提交
285
/*
 * Map the vsyscall page: point the VSYSCALL_FIRST_PAGE fixmap slot at the
 * physical address of the __vsyscall_0 symbol, using the
 * PAGE_KERNEL_VSYSCALL protection.
 */
void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

/*
 * Initcall for the vsyscall support.
 *
 * First asserts (BUG_ON) that vgettimeofday, vtime and vgetcpu sit at the
 * addresses VSYSCALL_ADDR() gives for their slot numbers, and that slot 0
 * matches the VSYSCALL_FIRST_PAGE fixmap address. Then registers the
 * sysctl table (CONFIG_SYSCTL only), runs cpu_vsyscall_init() on every
 * CPU, and installs the hotplug notifier. Returns 0.
 */
static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
			VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
	hotcpu_notifier(cpu_vsyscall_notifier, 0);
	return 0;
}

__initcall(vsyscall_init);