Name: Debug Checks for CPU Hotplug and stop_machine Status: Booted on 2.6.7-bk8 Under stress, it seems the kstopmachine thread is getting messed up, possibly by being scheduled on two CPUs at once. It's hard to see how that could happen, so insert lots of debugging checks to try to catch it. diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11927-linux-2.6.7-bk8/kernel/sched.c .11927-linux-2.6.7-bk8.updated/kernel/sched.c --- .11927-linux-2.6.7-bk8/kernel/sched.c 2004-06-26 10:58:39.000000000 +1000 +++ .11927-linux-2.6.7-bk8.updated/kernel/sched.c 2004-06-29 13:53:53.000000000 +1000 @@ -579,6 +579,9 @@ static int migrate_task(task_t *p, int d { runqueue_t *rq = task_rq(p); + /* Don't move idle thread! */ + BUG_ON(p->tgid == 0); + /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. @@ -2297,6 +2300,11 @@ switch_tasks: prev->timestamp = now; if (likely(prev != next)) { + if (cpu_is_offline(cpu)) { + /* Only schedule is from kstopmaster-%i -> idle. 
*/ + WARN_ON(memcmp(prev->comm, "kstopmaster-", 12) != 0); + WARN_ON(next != rq->idle); + } next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -3371,6 +3379,8 @@ static void __migrate_task(struct task_s { runqueue_t *rq_dest, *rq_src; + BUG_ON(p->tgid == 0); + if (unlikely(cpu_is_offline(dest_cpu))) return; diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .11927-linux-2.6.7-bk8/kernel/stop_machine.c .11927-linux-2.6.7-bk8.updated/kernel/stop_machine.c --- .11927-linux-2.6.7-bk8/kernel/stop_machine.c 2004-05-10 15:13:59.000000000 +1000 +++ .11927-linux-2.6.7-bk8.updated/kernel/stop_machine.c 2004-06-29 13:49:25.000000000 +1000 @@ -30,6 +30,7 @@ static int stopmachine(void *cpu) int prepared = 0; set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); + sprintf(current->comm, "kstopmachine/%li\n", (long)cpu); /* Ack: we are alive */ mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ @@ -174,7 +175,7 @@ struct task_struct *__stop_machine_run(i if (cpu == NR_CPUS) cpu = smp_processor_id(); - p = kthread_create(do_stop, &smdata, "kstopmachine"); + p = kthread_create(do_stop, &smdata, "kstopmaster-%i", cpu); if (!IS_ERR(p)) { kthread_bind(p, cpu); wake_up_process(p);