vsyscall_64.c 8.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hint.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */

20 21 22 23 24 25 26 27
/* Protect userspace from profiling */
#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
# undef likely
# undef unlikely
# define likely(x)		likely_notrace(x)
# define unlikely(x)		unlikely_notrace(x)
#endif

L
Linus Torvalds 已提交
28 29 30 31 32 33 34
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
35
#include <linux/clocksource.h>
36
#include <linux/getcpu.h>
37 38 39
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
L
Linus Torvalds 已提交
40 41 42 43

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
44
#include <asm/unistd.h>
L
Linus Torvalds 已提交
45 46 47
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
48 49 50
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
51
#include <asm/vgtod.h>
L
Linus Torvalds 已提交
52

53 54
/*
 * Attribute bundle for the vsyscall entry points defined in this file.
 * It places the function in the linker section ".vsyscall_<nr>", marks it
 * `unused` (so no "defined but not used" warning is raised when nothing in
 * this translation unit calls it) and applies `notrace`.
 */
#define __vsyscall(nr) \
		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
55
/*
 * Clobber list shared by the inline-asm `syscall` wrappers in this file
 * (gettimeofday() and time_syscall()): r11, cx and memory are declared as
 * modified so the compiler does not keep live values in them across the
 * instruction.  (The x86-64 SYSCALL instruction itself overwrites RCX and
 * R11 -- see the Intel SDM.)
 */
#define __syscall_clobber "r11","cx","memory"
L
Linus Torvalds 已提交
56

57 58 59
/*
 * vsyscall_gtod_data contains data that is :
 * - readonly from vsyscalls
 * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
63
/*
 * Selects how vgetcpu() obtains the packed cpu/node word: when it equals
 * VGETCPU_RDTSCP the value is read with RDTSCP, otherwise it is loaded from
 * the per-CPU GDT segment limit.  Nothing visible in this file assigns it.
 */
int __vgetcpu_mode __section_vgetcpu_mode;
L
Linus Torvalds 已提交
64

65
/*
 * Time data read by the vsyscall code in this file through the name
 * __vsyscall_gtod_data.  Only two members are initialized explicitly: the
 * seqlock starts unlocked and the fast path starts enabled
 * (sysctl_enabled = 1).  All remaining members are implicitly zero.
 *
 * NOTE(review): the writers in this file (update_vsyscall(),
 * update_vsyscall_tz(), the sysctl table) use the name vsyscall_gtod_data
 * without the leading underscores; presumably both names are bound to the
 * same storage by the section/linker setup -- confirm in the linker script.
 */
struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
	.lock = SEQLOCK_UNLOCKED,
	.sysctl_enabled = 1,
};
L
Linus Torvalds 已提交
70

71 72 73 74 75 76 77 78 79 80
/*
 * Copy the kernel's current timezone (sys_tz) into the vsyscall time data.
 *
 * The store is done on the write side of vsyscall_gtod_data.lock, using the
 * irqsave/irqrestore variants of the seqlock helpers.
 */
void update_vsyscall_tz(void)
{
	unsigned long irq_state;

	write_seqlock_irqsave(&vsyscall_gtod_data.lock, irq_state);
	vsyscall_gtod_data.sys_tz = sys_tz;	/* sys_tz has changed */
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, irq_state);
}

81
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
L
Linus Torvalds 已提交
82
{
83
	unsigned long flags;
L
Linus Torvalds 已提交
84

85 86
	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
	/* copy vsyscall data */
87 88 89 90 91 92 93
	vsyscall_gtod_data.clock.vread = clock->vread;
	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
	vsyscall_gtod_data.clock.mask = clock->mask;
	vsyscall_gtod_data.clock.mult = clock->mult;
	vsyscall_gtod_data.clock.shift = clock->shift;
	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
94
	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
95
	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
L
Linus Torvalds 已提交
96 97
}

98 99 100
/* RED-PEN may want to readd seq locking, but then the variable should be
 * write-once.
 */
101
static __always_inline void do_get_tz(struct timezone * tz)
L
Linus Torvalds 已提交
102
{
103
	*tz = __vsyscall_gtod_data.sys_tz;
L
Linus Torvalds 已提交
104 105
}

106
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
L
Linus Torvalds 已提交
107 108
{
	int ret;
T
Thomas Gleixner 已提交
109
	asm volatile("syscall"
L
Linus Torvalds 已提交
110
		: "=a" (ret)
111 112
		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
		: __syscall_clobber );
L
Linus Torvalds 已提交
113 114 115
	return ret;
}

116
static __always_inline long time_syscall(long *t)
L
Linus Torvalds 已提交
117 118
{
	long secs;
T
Thomas Gleixner 已提交
119
	asm volatile("syscall"
L
Linus Torvalds 已提交
120 121 122 123 124
		: "=a" (secs)
		: "0" (__NR_time),"D" (t) : __syscall_clobber);
	return secs;
}

125 126 127
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	cycle_t now, base, mask, cycle_delta;
128 129
	unsigned seq;
	unsigned long mult, shift, nsec;
130 131 132 133 134 135
	cycle_t (*vread)(void);
	do {
		seq = read_seqbegin(&__vsyscall_gtod_data.lock);

		vread = __vsyscall_gtod_data.clock.vread;
		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
A
Al Viro 已提交
136
			gettimeofday(tv,NULL);
137 138 139 140 141 142 143 144
			return;
		}
		now = vread();
		base = __vsyscall_gtod_data.clock.cycle_last;
		mask = __vsyscall_gtod_data.clock.mask;
		mult = __vsyscall_gtod_data.clock.mult;
		shift = __vsyscall_gtod_data.clock.shift;

145 146
		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
147 148 149 150 151
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));

	/* calculate interval: */
	cycle_delta = (now - base) & mask;
	/* convert to nsecs: */
152
	nsec += (cycle_delta * mult) >> shift;
153

154
	while (nsec >= NSEC_PER_SEC) {
155
		tv->tv_sec += 1;
156
		nsec -= NSEC_PER_SEC;
157
	}
158
	tv->tv_usec = nsec / NSEC_PER_USEC;
159 160
}

161
/*
 * vsyscall slot 0: gettimeofday.
 *
 * @tv: optional; when non-NULL it is filled by do_vgettimeofday().
 * @tz: optional; when non-NULL it is filled by do_get_tz().
 *
 * Always returns 0, including when the slow-path system call is taken
 * inside do_vgettimeofday().
 */
int __vsyscall(0) vgettimeofday(struct timeval *tv, struct timezone *tz)
{
	if (tv)
		do_vgettimeofday(tv);
	if (tz)
		do_get_tz(tz);
	return 0;
}

/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
172
time_t __vsyscall(1) vtime(time_t *t)
L
Linus Torvalds 已提交
173
{
J
john stultz 已提交
174
	struct timeval tv;
175
	time_t result;
176
	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
L
Linus Torvalds 已提交
177
		return time_syscall(t);
J
john stultz 已提交
178

179
	vgettimeofday(&tv, NULL);
J
john stultz 已提交
180
	result = tv.tv_sec;
181 182 183
	if (t)
		*t = result;
	return result;
L
Linus Torvalds 已提交
184 185
}

186 187 188 189 190 191 192 193 194 195
/*
 * vsyscall slot 2: fast way to get current CPU and node.
 *
 * This helps to do per node and per CPU caches in user space.
 * The result is not guaranteed without CPU affinity, but usually
 * works out because the scheduler tries to keep a thread on the same
 * CPU.
 *
 * @cpu:    optional; receives the low 12 bits of the packed value.
 * @node:   optional; receives the packed value shifted right by 12.
 * @tcache: optional; must point to a two element sized long array.
 *          blob[0] holds the jiffies value of the last lookup and blob[1]
 *          the packed value obtained then.
 *
 * All arguments can be NULL.  Always returns 0.
 */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	/* Note: j is loaded from __jiffies as a side effect of the test
	   whenever tcache is non-NULL, hit or miss. */
	if (tcache && tcache->blob[0] == (j = __jiffies)) {
		p = tcache->blob[1];
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		native_read_tscp(&p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		/* refresh the cache (rewrites the same values on a hit) */
		tcache->blob[0] = j;
		tcache->blob[1] = p;
	}
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}

I
Ingo Molnar 已提交
228
/*
 * vsyscall slot 3: placeholder with no implementation behind it.
 * Always returns -ENOSYS.
 */
static long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}

#ifdef CONFIG_SYSCTL
234 235 236 237 238 239 240 241

/*
 * proc handler for the "vsyscall64" sysctl entry.  It forwards all of its
 * arguments unchanged to proc_dointvec() and returns that function's result.
 */
static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file *filp,
				  void __user *buffer, size_t *lenp,
				  loff_t *ppos)
{
	int rc = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);

	return rc;
}

L
Linus Torvalds 已提交
242
static ctl_table kernel_table2[] = {
243
	{ .procname = "vsyscall64",
244
	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
245 246
	  .mode = 0644,
	  .proc_handler = vsyscall_sysctl_change },
247
	{}
L
Linus Torvalds 已提交
248 249 250 251 252
};

/*
 * Root table passed to register_sysctl_table(): a "kernel" directory
 * (CTL_KERN, mode 0555) whose children are the entries of kernel_table2.
 * The empty entry terminates the table.
 */
static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{}
};
#endif

257 258 259
/* Assume __initcall executes before all user space. Hopefully kmod
   doesn't violate that. We'll find out if it does. */

/*
 * Record the cpu and node numbers of @cpu where vgetcpu() can read them.
 *
 * @cpu: CPU whose RDTSCP aux value and per-CPU GDT entry are written.
 *
 * Two copies of the packed value are stored:
 *  - if the CPU has X86_FEATURE_RDTSCP, (node << 12) | cpu is written with
 *    write_rdtscp_aux();
 *  - the GDT entry GDT_ENTRY_PER_CPU of that CPU gets a descriptor whose
 *    limit field carries the same packing, so user space can fetch it with
 *    the `lsl` instruction.
 *
 * node stays 0 when CONFIG_NUMA is not set.
 */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	/* Base value is 0xf4 << 40; presumably the access byte of a present,
	   user-readable data segment -- confirm against the descriptor layout
	   in the SDM. */
	d = 0x0f40000000000ULL;
	/* NOTE(review): cpu is not masked to 12 bits here. */
	d |= cpu;
	/* low 4 node bits go into bits 12..15 */
	d |= (node & 0xf) << 12;
	/* remaining node bits start at bit 48.
	   NOTE(review): not masked, so a node number above 255 would set
	   bits beyond 48..51 -- confirm the node range guarantees this
	   cannot happen. */
	d |= (node >> 4) << 48;
	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

279 280 281 282 283 284 285 286 287 288
/*
 * Per-CPU callback: set up the vgetcpu data for the CPU this function is
 * currently running on.
 *
 * @arg: unused.
 */
static void __cpuinit cpu_vsyscall_init(void *arg)
{
	/* preemption should be already off */
	int this_cpu = raw_smp_processor_id();

	vsyscall_set_cpu(this_cpu);
}

/*
 * CPU hotplug notifier callback.
 *
 * @n:      notifier block (unused).
 * @action: hotplug event; only CPU_ONLINE and CPU_ONLINE_FROZEN are acted on.
 * @arg:    the CPU number, carried in the pointer value.
 *
 * For the two online events, cpu_vsyscall_init() is run on the affected CPU
 * through smp_call_function_single().  Always returns NOTIFY_DONE.
 */
static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
	return NOTIFY_DONE;
}

I
Ingo Molnar 已提交
294
/*
 * Install the vsyscall page mapping: the physical address of the symbol
 * __vsyscall_0 is mapped at fixmap slot VSYSCALL_FIRST_PAGE with
 * PAGE_KERNEL_VSYSCALL protection.
 */
void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

/*
 * Initcall that finishes vsyscall setup.
 *
 * It first checks (BUG_ON) that vgettimeofday, vtime and vgetcpu were
 * linked at the addresses VSYSCALL_ADDR() gives for their slot numbers, and
 * that slot 0 coincides with the fixmap address of VSYSCALL_FIRST_PAGE.
 * It then registers the sysctl table (when CONFIG_SYSCTL is set), runs
 * cpu_vsyscall_init() on every CPU, and installs the hotplug notifier.
 *
 * The return value of register_sysctl_table() is not checked.
 * Always returns 0.
 */
static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
			VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2);
#endif
	on_each_cpu(cpu_vsyscall_init, NULL, 1);
	hotcpu_notifier(cpu_vsyscall_notifier, 0);
	return 0;
}

__initcall(vsyscall_init);