see http://lkml.org/lkml/2007/1/14/27 Index: linux/arch/x86_64/kernel/time.c =================================================================== --- linux.orig/arch/x86_64/kernel/time.c 2007-03-21 23:24:05.334912771 -0700 +++ linux/arch/x86_64/kernel/time.c 2007-03-21 23:25:02.053771688 -0700 @@ -974,11 +974,6 @@ void time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; - if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) Index: linux/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux.orig/arch/x86_64/kernel/vsyscall.c 2007-03-21 23:24:05.338912973 -0700 +++ linux/arch/x86_64/kernel/vsyscall.c 2007-03-21 23:25:02.053771688 -0700 @@ -40,148 +40,12 @@ #include #include #include - -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define __syscall_clobber "r11","rcx","memory" - -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; -int __vgetcpu_mode __section_vgetcpu_mode; - #include -static __always_inline void timeval_normalize(struct timeval * tv) -{ - time_t __sec; - - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} - -static __always_inline void do_vgettimeofday(struct timeval * tv) -{ - long sequence, t; - unsigned long sec, usec; - - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = __xtime.tv_nsec / 1000; - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void __iomem *) - fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ -static __always_inline void do_get_tz(struct timezone * tz) -{ - *tz = __sys_tz; -} - -static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret; - asm volatile("vsysc2: syscall" - : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); - return ret; -} - -static __always_inline long time_syscall(long *t) -{ - long secs; - asm volatile("vsysc1: syscall" - : "=a" (secs) - : "0" (__NR_time),"D" (t) : __syscall_clobber); - return secs; -} - -int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) -{ - if (!__sysctl_vsyscall) - return gettimeofday(tv,tz); - if (tv) - do_vgettimeofday(tv); - if (tz) - do_get_tz(tz); - return 0; -} - -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ -time_t __vsyscall(1) vtime(time_t *t) -{ - if (!__sysctl_vsyscall) - return time_syscall(t); - else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; -} - -/* Fast way to get current CPU and node. - This helps to do per node and per CPU caches in user space. - The result is not guaranteed without CPU affinity, but usually - works out because the scheduler tries to keep a thread on the same - CPU. - - tcache must point to a two element sized long array. - All arguments can be NULL. */ -long __vsyscall(2) -vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) -{ - unsigned int dummy, p; - unsigned long j = 0; - - /* Fast cache - only recompute value once per jiffies and avoid - relatively costly rdtscp/cpuid otherwise. - This works because the scheduler usually keeps the process - on the same CPU and this syscall doesn't guarantee its - results anyways. - We do this here because otherwise user space would do it on - its own in a likely inferior way (no access to jiffies). - If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = __jiffies)) { - p = tcache->blob[1]; - } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } - if (tcache) { - tcache->blob[0] = j; - tcache->blob[1] = p; - } - if (cpu) - *cpu = p & 0xfff; - if (node) - *node = p >> 12; - return 0; -} - -long __vsyscall(3) venosys_1(void) -{ - return -ENOSYS; -} +/* the vsyscalls themselves */ +extern int vgettimeofday(struct timeval * tv, struct timezone * tz); +extern time_t vtime(time_t *t); +extern long vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache); #ifdef CONFIG_SYSCTL @@ -246,26 +110,46 @@ static ctl_table kernel_root_table2[] = #endif +/* is this necessary? */ +#ifndef CONFIG_NODES_SHIFT +#define CONFIG_NODES_SHIFT 0 +#endif + /* Assume __initcall executes before all user space. Hopefully kmod doesn't violate that. We'll find out if it does. */ static void __cpuinit vsyscall_set_cpu(int cpu) { - unsigned long *d; - unsigned long node = 0; + unsigned long cpu_node_encoding = cpu << CONFIG_NODES_SHIFT; + #ifdef CONFIG_NUMA - node = cpu_to_node[cpu]; + cpu_node_encoding |= cpu_to_node[cpu]; #endif + + /* Even though we never use rdtscp for vgetcpu we set up the rdtscp_aux + * register here for (future) use in vgettimeofday et al. + */ if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); + write_rdtscp_aux(cpu_node_encoding); +#ifdef VGETCPU_USE_SIDT + { + struct desc_ptr local_idt; + + local_idt.size = 0x1000 + cpu_node_encoding; + local_idt.address = idt_descr.address; + asm("lidt %0" :: "m" (local_idt)); + } +#else /* Store cpu number in limit so that it can be loaded quickly - in user space in vgetcpu. - 12 bits for the CPU and 8 bits for the node. */ - d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); - *d = 0x0f40000000000ULL; - *d |= cpu; - *d |= (node & 0xf) << 12; - *d |= (node >> 4) << 48; + in user space in vgetcpu. */ + { + unsigned long *d; + d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + *d = 0x0f40000000000ULL; + *d |= cpu_node_encoding & 0xffff; + *d |= (cpu_node_encoding >> 16) << 48; + }; +#endif } static void __cpuinit cpu_vsyscall_init(void *arg) Index: linux/include/asm-x86_64/vsyscall.h =================================================================== --- linux.orig/include/asm-x86_64/vsyscall.h 2007-03-21 23:24:05.362914182 -0700 +++ linux/include/asm-x86_64/vsyscall.h 2007-03-21 23:25:02.053771688 -0700 @@ -17,7 +17,6 @@ enum vsyscall_num { #include #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16))) -#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) #define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16))) #define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16))) @@ -28,9 +27,6 @@ enum vsyscall_num { #define VXTIME_HPET 2 #define VXTIME_PMTMR 3 -#define VGETCPU_RDTSCP 1 -#define VGETCPU_LSL 2 - struct vxtime_data { long hpet_address; /* HPET base address */ int last; @@ -45,7 +41,6 @@ struct vxtime_data { /* vsyscall space (readonly) */ extern struct vxtime_data __vxtime; -extern int __vgetcpu_mode; extern struct timespec __xtime; extern volatile unsigned long __jiffies; extern struct timezone __sys_tz; @@ -53,7 +48,6 @@ extern seqlock_t __xtime_lock; /* kernel space (writeable) */ extern struct vxtime_data vxtime; -extern int vgetcpu_mode; extern struct timezone sys_tz; extern int sysctl_vsyscall; extern seqlock_t xtime_lock; @@ -62,6 +56,28 @@ extern int sysctl_vsyscall; #define ARCH_HAVE_XTIME_LOCK 1 +/* + * To use the IDT limit for vgetcpu we encode things like so: + * + * 0x1000 + node + (cpu << CONFIG_NODES_SHIFT) + * + * this ensures a system using this method has an IDT limit other than + * 0xfff, while systems not using this method will have an IDT limit + * of 0xfff. (just in case anyone cares to have a test). + * + * This test verifies the various config options are in an appropriate + * range for the 16-bit limit field. + */ +#if 0x1000 + (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) <= 0x10000 +#define VGETCPU_USE_SIDT 1 + +/* might as well test this somewhere -- the lsl method of vgetcpu has + * only 20 bits available to it. + */ +#elif (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) >= (1<<20) +#error "(CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) out of range for existing vgetcpu implementations" +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_64_VSYSCALL_H_ */ Index: linux/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux.orig/arch/x86_64/kernel/vmlinux.lds.S 2007-03-21 23:24:05.346913376 -0700 +++ linux/arch/x86_64/kernel/vmlinux.lds.S 2007-03-21 23:25:02.053771688 -0700 @@ -94,9 +94,6 @@ SECTIONS .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } vxtime = VVIRT(.vxtime); - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } sys_tz = VVIRT(.sys_tz); Index: linux/arch/x86_64/kernel/Makefile =================================================================== --- linux.orig/arch/x86_64/kernel/Makefile 2007-03-21 23:24:05.354913779 -0700 +++ linux/arch/x86_64/kernel/Makefile 2007-03-21 23:25:02.065772292 -0700 @@ -6,7 +6,7 @@ extra-y := head.o head64.o init_task.o EXTRA_AFLAGS := -traditional obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ - x8664_ksyms.o i387.o syscall.o vsyscall.o \ + x8664_ksyms.o i387.o syscall.o vsyscall.o vsyscall_user.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ pci-dma.o pci-nommu.o alternative.o @@ -45,6 +45,7 @@ obj-y += topology.o obj-y += intel_cacheinfo.o CFLAGS_vsyscall.o := $(PROFILING) -g0 +CFLAGS_vsyscall_user.o := $(PROFILING) -g0 -mred-zone therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o bootflag-y += ../../i386/kernel/bootflag.o Index: linux/arch/x86_64/kernel/vsyscall_user.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux/arch/x86_64/kernel/vsyscall_user.c 2007-03-21 23:25:02.065772292 -0700 @@ -0,0 +1,201 @@ +/* + * linux/arch/x86_64/kernel/vsyscall_user.c + * + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located + * at virtual address -10Mbyte+1024bytes etc... There are at max 4 + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid + * jumping out of line if necessary. We cannot add more with this + * mechanism because older kernels won't return -ENOSYS. + * If we want more than four we need a vDSO. + * + * Note: the concept clashes with user mode linux. If you use UML and + * want per guest time just set the kernel.vsyscall64 sysctl to 0. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __syscall_clobber "r11","rcx","memory" + +int __sysctl_vsyscall __section_sysctl_vsyscall = 1; +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; + +#include + +static __always_inline void timeval_normalize(struct timeval * tv) +{ + time_t __sec; + + __sec = tv->tv_usec / 1000000; + if (__sec) { + tv->tv_usec %= 1000000; + tv->tv_sec += __sec; + } +} + +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + long sequence, t; + unsigned long sec, usec; + + do { + sequence = read_seqbegin(&__xtime_lock); + + sec = __xtime.tv_sec; + usec = __xtime.tv_nsec / 1000; + + if (__vxtime.mode != VXTIME_HPET) { + t = get_cycles_sync(); + if (t < __vxtime.last_tsc) + t = __vxtime.last_tsc; + usec += ((t - __vxtime.last_tsc) * + __vxtime.tsc_quot) >> 32; + /* See comment in x86_64 do_gettimeofday. */ + } else { + usec += ((readl((void __iomem *) + fix_to_virt(VSYSCALL_HPET) + 0xf0) - + __vxtime.last) * __vxtime.quot) >> 32; + } + } while (read_seqretry(&__xtime_lock, sequence)); + + tv->tv_sec = sec + usec / 1000000; + tv->tv_usec = usec % 1000000; +} + +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ +static __always_inline void do_get_tz(struct timezone * tz) +{ + *tz = __sys_tz; +} + +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret; + asm volatile( + ".globl vsysc2\n" + "vsysc2: syscall\n" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); + return ret; +} + +static __always_inline long time_syscall(long *t) +{ + long secs; + asm volatile( + ".globl vsysc1\n" + "vsysc1: syscall\n" + : "=a" (secs) + : "0" (__NR_time),"D" (t) : __syscall_clobber); + return secs; +} + +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +{ + if (!__sysctl_vsyscall) + return gettimeofday(tv,tz); + if (tv) + do_vgettimeofday(tv); + if (tz) + do_get_tz(tz); + return 0; +} + +/* This will break when the xtime seconds get inaccurate, but that is + * unlikely */ +time_t __vsyscall(1) vtime(time_t *t) +{ + if (!__sysctl_vsyscall) + return time_syscall(t); + else if (t) + *t = __xtime.tv_sec; + return __xtime.tv_sec; +} + +/* is this necessary? */ +#ifndef CONFIG_NODES_SHIFT +#define CONFIG_NODES_SHIFT 0 +#endif + +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +{ + unsigned int p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly lsl/sidt otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->blob[0] == (j = __jiffies)) { + p = tcache->blob[1]; + } + else { +#ifdef VGETCPU_USE_SIDT + struct { + char pad[6]; /* avoid unaligned stores */ + u16 size; + u64 address; + } idt; + + asm("sidt %0" : "=m" (idt.size)); + p = idt.size - 0x1000; +#else + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); +#endif + if (tcache) { + tcache->blob[0] = j; + tcache->blob[1] = p; + } + } + if (cpu) + *cpu = p >> CONFIG_NODES_SHIFT; + if (node) + *node = p & ((1<