From b342501cd31e5546d0c9ca8ceff5ded1832f9e5b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 20:20:29 +0100 Subject: sched: allow architectures to specify sched_clock_stable Allow CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures to still specify that their sched_clock() implementation is reliable. This will be used by x86 to switch on a faster sched_clock_cpu() implementation on certain CPU types. Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52..a063d19 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1670,6 +1670,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) return set_cpus_allowed_ptr(p, &new_mask); } +/* + * Architectures can set this to 1 if they have specified + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, + * but then during bootup it turns out that sched_clock() + * is reliable after all: + */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +extern int sched_clock_stable; +#endif + extern unsigned long long sched_clock(void); extern void sched_clock_init(void); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a0b0852..a755d02 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -24,11 +24,11 @@ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat * consistent between cpus (never more than 2 jiffies difference). */ -#include -#include #include -#include #include +#include +#include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -43,6 +43,10 @@ unsigned long long __attribute__((weak)) sched_clock(void) static __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable; +#else +static const int sched_clock_stable = 1; +#endif struct sched_clock_data { /* @@ -87,7 +91,7 @@ void sched_clock_init(void) } /* - * min,max except they take wrapping into account + * min, max except they take wrapping into account */ static inline u64 wrap_min(u64 x, u64 y) @@ -116,10 +120,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(delta < 0)) delta = 0; + if (unlikely(!sched_clock_running)) + return 0ull; + /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ clock = scd->tick_gtod + delta; @@ -148,12 +155,13 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); u64 now, clock, this_clock, remote_clock; + struct sched_clock_data *scd; - if (unlikely(!sched_clock_running)) - return 0ull; + if (sched_clock_stable) + return sched_clock(); + scd = cpu_sdc(cpu); WARN_ON_ONCE(!irqs_disabled()); now = sched_clock(); @@ -193,6 +201,8 @@ u64 sched_clock_cpu(int cpu) return clock; } +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); @@ -235,22 +245,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - -u64 sched_clock_cpu(int cpu) -{ - if (unlikely(!sched_clock_running)) - return 0; - - return sched_clock(); -} - -#endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { -- cgit v0.10.2 From 83ce400928680a6c8123d492684b27857f5a2d95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 20:16:58 +0100 Subject: x86: set X86_FEATURE_TSC_RELIABLE If the TSC is constant and non-stop, also set it reliable. (We will turn this off in DMI quirks for multi-chassis systems) The performance number on a 16-way Nehalem system running 32 tasks that context-switch between each other is significant: sched_clock_stable=0 sched_clock_stable=1 .................... .................... 22.456925 million/sec 24.306972 million/sec [+8.2%] lmbench's "lat_ctx -s 0 2" goes from 0.63 microseconds to 0.59 microseconds - a 6.7% increase in context-switching performance. Perfstat of 1 million pipe context switches between two tasks: Performance counter stats for './pipe-test-1m': [before] [after] ............ ............ 37621.421089 36436.848378 task clock ticks (msecs) 0 0 CPU migrations (events) 2000274 2000189 context switches (events) 194 193 pagefaults (events) 8433799643 8171016416 CPU cycles (events) -3.21% 8370133368 8180999694 instructions (events) -2.31% 4158565 3895941 cache references (events) -6.74% 44312 46264 cache misses (events) 2349.287976 2279.362465 wall-time (msecs) -3.06% The speedup comes straight from the reduction in the instruction count. sched_clock_cpu() got simpler and the whole workload thus executes faster. Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 24ff26a..5fff00c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -56,11 +57,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate - * with P/T states and does not stop in deep C-states + * with P/T states and does not stop in deep C-states. + * + * It is also reliable across cores and sockets. (but not across + * cabinets - we turn it off in that case explicitly.) */ if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); + sched_clock_stable = 1; } } -- cgit v0.10.2