author     Jiri Kosina <jkosina@suse.cz>  2016-07-15 16:26:12 +0200
committer  Jiri Kosina <jkosina@suse.cz>  2016-07-15 16:26:12 +0200
commit     7f67c69d79117e4b87add3e059ce4e4df320c9a6 (patch)
tree       58e9a3730e831cedc38dad53c3e05d482dd0d29a
parent     93278a0ff952c439f0946900d03951c761b826b8 (diff)
parent     6432abd721aa3e3367022b7fdd36d92d10404d52 (diff)
Merge remote-tracking branch 'origin/users/mgalbraith/SLE11-SP4/for-next' into SLE11-SP4
-rw-r--r--  patches.fixes/sched-Provide-update_curr-callbacks-for-stop-idle-scheduling-classes.patch  |  97
-rw-r--r--  patches.fixes/sched-cputime-Fix-clock_nanosleep-clock_gettime-inconsistency.patch         | 272
-rw-r--r--  patches.fixes/sched-cputime-Fix-cpu_timer_sample_group-double-accounting.patch            |  84
-rw-r--r--  series.conf                                                                               |   5
4 files changed, 458 insertions(+), 0 deletions(-)
diff --git a/patches.fixes/sched-Provide-update_curr-callbacks-for-stop-idle-scheduling-classes.patch b/patches.fixes/sched-Provide-update_curr-callbacks-for-stop-idle-scheduling-classes.patch
new file mode 100644
index 0000000000..d7db3c1ce6
--- /dev/null
+++ b/patches.fixes/sched-Provide-update_curr-callbacks-for-stop-idle-scheduling-classes.patch
@@ -0,0 +1,97 @@
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Sun, 23 Nov 2014 23:04:52 +0100
+Subject: sched: Provide update_curr callbacks for stop/idle scheduling classes
+Git-commit: 90e362f4a75d0911ca75e5cd95591a6cf1f169dc
+Patch-mainline: v3.18-rc6
+References: bnc#988498
+
+Chris bisected a NULL pointer dereference in task_sched_runtime() to
+commit 6e998916dfe3 'sched/cputime: Fix clock_nanosleep()/clock_gettime()
+inconsistency'.
+
+Chris observed crashes in atop or other /proc walking programs when he
+started fork bombs on his machine. He assumed that this was a new exit
+race, but that does not make sense when looking at that commit.
+
+What's interesting is that the commit provides update_curr callbacks
+for all scheduling classes except stop_task and idle_task.
+
+While nothing can ever hit that via the clock_nanosleep() and
+clock_gettime() interfaces, which were the target of the commit in
+question, the author obviously forgot that there are other code paths
+which invoke task_sched_runtime():
+
+do_task_stat()
+ thread_group_cputime_adjusted()
+ thread_group_cputime()
+ task_cputime()
+ task_sched_runtime()
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+          p->sched_class->update_curr(rq);
+ }
+
+If the stats are read for a stomp machine task, aka 'migration/N', and
+that task is current on its CPU, this happily calls through the NULL
+update_curr pointer of the stop_task class. Oops.
+
+Chris's observation that this happens faster when he runs the fork bomb
+makes sense: the fork bomb kicks migration threads more often, so the
+probability of hitting the issue increases.
+
+Add the missing update_curr callbacks to the scheduler classes stop_task
+and idle_task. While idle tasks cannot be monitored via /proc, we have
+other means to hit the idle case.
+
+Fixes: 6e998916dfe3 'sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency'
+Reported-by: Chris Mason <clm@fb.com>
+Reported-and-tested-by: Borislav Petkov <bp@alien8.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Stanislaw Gruszka <sgruszka@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
+---
+ kernel/sched_idletask.c | 5 +++++
+ kernel/sched_stoptask.c | 5 +++++
+ 2 files changed, 10 insertions(+)
+
+--- a/kernel/sched_idletask.c
++++ b/kernel/sched_idletask.c
+@@ -68,6 +68,10 @@ static unsigned int get_rr_interval_idle
+ return 0;
+ }
+
++static void update_curr_idle(struct rq *rq)
++{
++}
++
+ /*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+@@ -94,4 +98,5 @@ static const struct sched_class idle_sch
+
+ .prio_changed = prio_changed_idle,
+ .switched_to = switched_to_idle,
++ .update_curr = update_curr_idle,
+ };
+--- a/kernel/sched_stoptask.c
++++ b/kernel/sched_stoptask.c
+@@ -103,6 +103,10 @@ get_rr_interval_stop(struct rq *rq, stru
+ return 0;
+ }
+
++static void update_curr_stop(struct rq *rq)
++{
++}
++
+ /*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+@@ -129,4 +133,5 @@ static const struct sched_class stop_sch
+
+ .prio_changed = prio_changed_stop,
+ .switched_to = switched_to_stop,
++ .update_curr = update_curr_stop,
+ };
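
The crash path above is reachable from ordinary userspace. Below is a
minimal sketch (an assumption standing in for what atop-style /proc
walkers do, not part of this series) that spins reading /proc/<pid>/stat
for the per-CPU 'migration/N' stop-class threads; matching them by name
via /proc/<pid>/comm is also an assumption. On a pre-fix kernel each read
runs do_task_stat() -> task_sched_runtime(), which oopses whenever the
migration thread happens to be current on its CPU:

  #include <stdio.h>
  #include <string.h>
  #include <ctype.h>
  #include <dirent.h>

  int main(void)
  {
      char path[64], comm[64], buf[1024];
      struct dirent *de;

      for (;;) {                   /* hammer forever; the oops is the repro */
          DIR *proc = opendir("/proc");

          if (!proc)
              return 1;
          while ((de = readdir(proc)) != NULL) {
              FILE *f;

              if (!isdigit((unsigned char)de->d_name[0]))
                  continue;
              snprintf(path, sizeof(path), "/proc/%s/comm", de->d_name);
              f = fopen(path, "r");
              if (!f)
                  continue;
              if (!fgets(comm, sizeof(comm), f))
                  comm[0] = '\0';
              fclose(f);
              if (strncmp(comm, "migration/", 10) != 0)
                  continue;
              /* This read is what ends up in task_sched_runtime(). */
              snprintf(path, sizeof(path), "/proc/%s/stat", de->d_name);
              f = fopen(path, "r");
              if (f) {
                  fread(buf, 1, sizeof(buf), f);
                  fclose(f);
              }
          }
          closedir(proc);
      }
  }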
diff --git a/patches.fixes/sched-cputime-Fix-clock_nanosleep-clock_gettime-inconsistency.patch b/patches.fixes/sched-cputime-Fix-clock_nanosleep-clock_gettime-inconsistency.patch
new file mode 100644
index 0000000000..1cb6377167
--- /dev/null
+++ b/patches.fixes/sched-cputime-Fix-clock_nanosleep-clock_gettime-inconsistency.patch
@@ -0,0 +1,272 @@
+From: Stanislaw Gruszka <sgruszka@redhat.com>
+Date: Wed, 12 Nov 2014 16:58:44 +0100
+Subject: sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency
+Git-commit: 6e998916dfe327e785e7c2447959b2c1a3ea4930
+Patch-mainline: v3.18-rc6
+References: bnc#988498
+
+Commit d670ec13178d0 "posix-cpu-timers: Cure SMP wobbles" fixes one glibc
+test case at the cost of breaking another one. After that commit, calling
+clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result
+in Y being smaller than X.
+
+A reproducer/tester can be found further below; it can be compiled and run with:
+
+ gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread
+ while ./tst-cpuclock2 ; do : ; done
+
+This reproducer, when running on a buggy kernel, will complain
+about "clock_gettime difference too small".
+
+The issue happens because, at start, thread_group_cputimer() initializes
+the cputimer's sum_exec_runtime with thread runtime not yet accounted, and
+then the scheduler tick adds that runtime to the running cputimer again,
+making its sum_exec_runtime bigger than the threads' actual runtime.
+
+KOSAKI Motohiro posted a fix for this problem, but that patch was never
+applied: https://lkml.org/lkml/2013/5/26/191 .
+
+This patch takes a different approach to cure the problem. It calls
+update_curr() when the cputimer starts, which assures we have updated
+stats for the running threads and that on the next scheduler tick we
+account only the runtime that elapsed since the cputimer started. That
+also assures a consistent state between the CPU times of individual
+threads and the CPU time of the process made up of those threads.
+
+Full reproducer (tst-cpuclock2.c):
+
+ #define _GNU_SOURCE
+ #include <unistd.h>
+ #include <sys/syscall.h>
+ #include <stdio.h>
+ #include <time.h>
+ #include <pthread.h>
+ #include <stdint.h>
+ #include <inttypes.h>
+
+ /* Parameters for the Linux kernel ABI for CPU clocks. */
+ #define CPUCLOCK_SCHED 2
+ #define MAKE_PROCESS_CPUCLOCK(pid, clock) \
+ ((~(clockid_t) (pid) << 3) | (clockid_t) (clock))
+
+ static pthread_barrier_t barrier;
+
+ /* Help advance the clock. */
+ static void *chew_cpu(void *arg)
+ {
+ pthread_barrier_wait(&barrier);
+ while (1) ;
+
+ return NULL;
+ }
+
+ /* Don't use the glibc wrapper. */
+ static int do_nanosleep(int flags, const struct timespec *req)
+ {
+ clockid_t clock_id = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED);
+
+ return syscall(SYS_clock_nanosleep, clock_id, flags, req, NULL);
+ }
+
+ static int64_t tsdiff(const struct timespec *before, const struct timespec *after)
+ {
+ int64_t before_i = before->tv_sec * 1000000000ULL + before->tv_nsec;
+ int64_t after_i = after->tv_sec * 1000000000ULL + after->tv_nsec;
+
+ return after_i - before_i;
+ }
+
+ int main(void)
+ {
+ int result = 0;
+ pthread_t th;
+
+ pthread_barrier_init(&barrier, NULL, 2);
+
+ if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) {
+ perror("pthread_create");
+ return 1;
+ }
+
+ pthread_barrier_wait(&barrier);
+
+ /* The test. */
+ struct timespec before, after, sleeptimeabs;
+ int64_t sleepdiff, diffabs;
+ const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 };
+
+ /* The relative nanosleep. Not sure why this is needed, but its presence
+ seems to make it easier to reproduce the problem. */
+ if (do_nanosleep(0, &sleeptime) != 0) {
+ perror("clock_nanosleep");
+ return 1;
+ }
+
+ /* Get the current time. */
+ if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) {
+ perror("clock_gettime[2]");
+ return 1;
+ }
+
+ /* Compute the absolute sleep time based on the current time. */
+ uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec;
+ sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000;
+ sleeptimeabs.tv_nsec = nsec % 1000000000;
+
+ /* Sleep for the computed time. */
+ if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) {
+ perror("absolute clock_nanosleep");
+ return 1;
+ }
+
+ /* Get the time after the sleep. */
+ if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) {
+ perror("clock_gettime[3]");
+ return 1;
+ }
+
+ /* The time after sleep should always be equal to or after the absolute sleep
+ time passed to clock_nanosleep. */
+ sleepdiff = tsdiff(&sleeptimeabs, &after);
+ if (sleepdiff < 0) {
+ printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff);
+ result = 1;
+
+  printf("Before %llu.%09llu\n", (unsigned long long)before.tv_sec, (unsigned long long)before.tv_nsec);
+  printf("After %llu.%09llu\n", (unsigned long long)after.tv_sec, (unsigned long long)after.tv_nsec);
+  printf("Sleep %llu.%09llu\n", (unsigned long long)sleeptimeabs.tv_sec, (unsigned long long)sleeptimeabs.tv_nsec);
+ }
+
+ /* The difference between the timestamps taken before and after the
+ clock_nanosleep call should be equal to or more than the duration of the
+ sleep. */
+ diffabs = tsdiff(&before, &after);
+ if (diffabs < sleeptime.tv_nsec) {
+ printf("clock_gettime difference too small: %" PRId64 "\n", diffabs);
+ result = 1;
+ }
+
+ pthread_cancel(th);
+
+ return result;
+ }
+
+Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: http://lkml.kernel.org/r/20141112155843.GA24803@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
+---
+ include/linux/sched.h | 3 +++
+ kernel/sched.c | 38 +++++++++++---------------------------
+ kernel/sched_fair.c | 7 +++++++
+ kernel/sched_rt.c | 2 ++
+ 4 files changed, 23 insertions(+), 27 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1144,6 +1144,9 @@ struct sched_class {
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ void (*task_move_group) (struct task_struct *p, int on_rq);
+ #endif
++#ifndef __GENKSYMS__
++ void (*update_curr) (struct rq *rq);
++#endif
+ };
+
+ struct load_weight {
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -3898,31 +3898,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
+ EXPORT_PER_CPU_SYMBOL(kstat);
+
+ /*
+- * Return any ns on the sched_clock that have not yet been accounted in
+- * @p in case that task is currently running.
+- *
+- * Called with task_rq_lock() held on @rq.
+- */
+-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+-{
+- u64 ns = 0;
+-
+- /*
+- * Must be ->curr, ->on_cpu _and_ ->on_rq. If dequeued, we
+- * would project cycles that may never be accounted to this
+- * thread, breaking clock_gettime().
+- */
+- if (task_current(rq, p) && p->on_cpu && p->on_rq) {
+- update_rq_clock(rq);
+- ns = rq->clock_task - p->se.exec_start;
+- if ((s64)ns < 0)
+- ns = 0;
+- }
+-
+- return ns;
+-}
+-
+-/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+@@ -3931,10 +3906,19 @@ unsigned long long task_sched_runtime(st
+ {
+ unsigned long flags;
+ struct rq *rq;
+- u64 ns = 0;
++ u64 ns;
+
+ rq = task_rq_lock(p, &flags);
+- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
++ /*
++ * Must be ->curr _and_ ->on_rq. If dequeued, we would
++ * project cycles that may never be accounted to this
++ * thread, breaking clock_gettime().
++ */
++ if (task_current(rq, p) && p->on_rq) {
++ update_rq_clock(rq);
++ p->sched_class->update_curr(rq);
++ }
++ ns = p->se.sum_exec_runtime;
+ task_rq_unlock(rq, p, &flags);
+
+ return ns;
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -611,6 +611,11 @@ static void update_curr(struct cfs_rq *c
+ account_cfs_rq_runtime(cfs_rq, delta_exec);
+ }
+
++static void update_curr_fair(struct rq *rq)
++{
++ update_curr(cfs_rq_of(&rq->curr->se));
++}
++
+ static inline void
+ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ {
+@@ -5130,6 +5135,8 @@ static const struct sched_class fair_sch
+
+ .get_rr_interval = get_rr_interval_fair,
+
++ .update_curr = update_curr_fair,
++
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ .task_move_group = task_move_group_fair,
+ #endif
+--- a/kernel/sched_rt.c
++++ b/kernel/sched_rt.c
+@@ -1902,6 +1902,8 @@ static const struct sched_class rt_sched
+
+ .prio_changed = prio_changed_rt,
+ .switched_to = switched_to_rt,
++
++ .update_curr = update_curr_rt,
+ };
+
+ #ifdef CONFIG_SCHED_DEBUG
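
The double accounting described above is easy to model with concrete
numbers. A toy sketch, illustrative values only and not kernel code:

  #include <stdio.h>

  int main(void)
  {
      /* Runtime already folded into the thread's sum_exec_runtime. */
      unsigned long long sum_exec_runtime = 100;
      /* Runtime used by the thread that update_curr() has not accounted yet. */
      unsigned long long pending = 10;

      /* Buggy cputimer start: seeded with accounted + pending runtime. */
      unsigned long long cputimer = sum_exec_runtime + pending;   /* 110 */

      /* Next scheduler tick: update_curr() folds the pending delta in ... */
      sum_exec_runtime += pending;                                /* 110 */
      /* ... and the tick adds that same delta to the running cputimer. */
      cputimer += pending;                                        /* 120 */

      printf("thread total %llu, group cputimer %llu: delta counted twice\n",
             sum_exec_runtime, cputimer);
      /* With the fix, update_curr() runs before seeding, so 'pending' is 0. */
      return 0;
  }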
diff --git a/patches.fixes/sched-cputime-Fix-cpu_timer_sample_group-double-accounting.patch b/patches.fixes/sched-cputime-Fix-cpu_timer_sample_group-double-accounting.patch
new file mode 100644
index 0000000000..552534c52f
--- /dev/null
+++ b/patches.fixes/sched-cputime-Fix-cpu_timer_sample_group-double-accounting.patch
@@ -0,0 +1,84 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 12 Nov 2014 12:37:37 +0100
+Subject: sched/cputime: Fix cpu_timer_sample_group() double accounting
+Git-commit: 23cfa361f3e54a3e184a5e126bbbdd95f984881a
+Patch-mainline: v3.18-rc6
+References: bnc#988498
+
+While looking over the cpu-timer code I found that we appear to add
+the delta for the calling task twice, through:
+
+ cpu_timer_sample_group()
+ thread_group_cputimer()
+ thread_group_cputime()
+ times->sum_exec_runtime += task_sched_runtime();
+
+ *sample = cputime.sum_exec_runtime + task_delta_exec();
+
+This would make the sample run ahead, making the sleep too short.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Stanislaw Gruszka <sgruszka@redhat.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Tejun Heo <tj@kernel.org>
+Link: http://lkml.kernel.org/r/20141112113737.GI10476@twins.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
+---
+ include/linux/kernel_stat.h | 5 -----
+ kernel/posix-cpu-timers.c | 2 +-
+ kernel/sched.c | 13 -------------
+ 3 files changed, 1 insertion(+), 19 deletions(-)
+
+--- a/include/linux/kernel_stat.h
++++ b/include/linux/kernel_stat.h
+@@ -108,11 +108,6 @@ static inline unsigned int kstat_cpu_irq
+ return kstat_cpu(cpu).irqs_sum;
+ }
+
+-/*
+- * Lock/unlock the current runqueue - to extract task statistics:
+- */
+-extern unsigned long long task_delta_exec(struct task_struct *);
+-
+ extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
+ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
+ extern void account_steal_time(cputime_t);
+--- a/kernel/posix-cpu-timers.c
++++ b/kernel/posix-cpu-timers.c
+@@ -657,7 +657,7 @@ static int cpu_timer_sample_group(const
+ cpu->cpu = cputime.utime;
+ break;
+ case CPUCLOCK_SCHED:
+- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
++ cpu->sched = cputime.sum_exec_runtime;
+ break;
+ }
+ return 0;
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -3922,19 +3922,6 @@ static u64 do_task_delta_exec(struct tas
+ return ns;
+ }
+
+-unsigned long long task_delta_exec(struct task_struct *p)
+-{
+- unsigned long flags;
+- struct rq *rq;
+- u64 ns = 0;
+-
+- rq = task_rq_lock(p, &flags);
+- ns = do_task_delta_exec(p, rq);
+- task_rq_unlock(rq, p, &flags);
+-
+- return ns;
+-}
+-
+ /*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
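
The sample that cpu_timer_sample_group() produces is what process-wide CPU
timers expire against, so the extra task_delta_exec() above made
CPUCLOCK_SCHED expirations (and absolute CPU-clock sleeps) come early. A
minimal usage sketch of that path through the POSIX timer API, assuming
this is a reasonable way to exercise it (link with -lrt on older glibc):

  #include <signal.h>
  #include <stdio.h>
  #include <time.h>

  static volatile sig_atomic_t fired;

  static void on_timer(int sig)
  {
      (void)sig;
      fired = 1;
  }

  int main(void)
  {
      timer_t timerid;
      struct sigevent sev = { 0 };
      struct itimerspec its = { 0 };

      signal(SIGALRM, on_timer);
      sev.sigev_notify = SIGEV_SIGNAL;
      sev.sigev_signo = SIGALRM;

      /* The timer expires against the whole process's consumed CPU time. */
      if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &timerid) != 0) {
          perror("timer_create");
          return 1;
      }

      its.it_value.tv_nsec = 100000000;   /* fire after 100ms of CPU time */
      if (timer_settime(timerid, 0, &its, NULL) != 0) {
          perror("timer_settime");
          return 1;
      }

      while (!fired)
          ;   /* burn CPU so the process clock advances */

      puts("process CPU timer fired");
      return 0;
  }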
diff --git a/series.conf b/series.conf
index 07abc0bd72..2ae667550a 100644
--- a/series.conf
+++ b/series.conf
@@ -479,6 +479,11 @@
patches.fixes/cgroups-more-safe-tasklist-locking-in-cgroup_attach_proc.patch
patches.fixes/cgroups-don-t-attach-task-to-subsystem-if-migration-failed.patch
+ # bnc#988498
+ patches.fixes/sched-cputime-Fix-cpu_timer_sample_group-double-accounting.patch
+ patches.fixes/sched-cputime-Fix-clock_nanosleep-clock_gettime-inconsistency.patch
+ patches.fixes/sched-Provide-update_curr-callbacks-for-stop-idle-scheduling-classes.patch
+
########################################################
# futex
########################################################