diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c index 7dcb352..02c5d2c 100644 --- a/arch/arm/kernel/smp_tlb.c +++ b/arch/arm/kernel/smp_tlb.c @@ -13,18 +13,6 @@ #include #include -static void on_each_cpu_mask(void (*func)(void *), void *info, int wait, - const struct cpumask *mask) -{ - preempt_disable(); - - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(smp_processor_id(), mask)) - func(info); - - preempt_enable(); -} - /**********************************************************************/ /* @@ -87,7 +75,7 @@ void flush_tlb_all(void) void flush_tlb_mm(struct mm_struct *mm) { if (tlb_ops_need_broadcast()) - on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm)); + on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1); else local_flush_tlb_mm(mm); } @@ -98,7 +86,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; - on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, + &ta, 1); } else local_flush_tlb_page(vma, uaddr); } @@ -121,7 +110,8 @@ void flush_tlb_range(struct vm_area_struct *vma, ta.ta_vma = vma; ta.ta_start = start; ta.ta_end = end; - on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, + &ta, 1); } else local_flush_tlb_range(vma, start, end); } diff --git a/arch/tile/include/asm/smp.h b/arch/tile/include/asm/smp.h index 532124a..1aa759a 100644 --- a/arch/tile/include/asm/smp.h +++ b/arch/tile/include/asm/smp.h @@ -43,10 +43,6 @@ void evaluate_message(int tag); /* Boot a secondary cpu */ void online_secondary(void); -/* Call a function on a specified set of CPUs (may include this one). */ -extern void on_each_cpu_mask(const struct cpumask *mask, - void (*func)(void *), void *info, bool wait); - /* Topology of the supervisor tile grid, and coordinates of boot processor */ extern HV_Topology smp_topology; @@ -91,9 +87,6 @@ void print_disabled_cpus(void); #else /* !CONFIG_SMP */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { if (cpumask_test_cpu(0, (mask))) func(info); } while (0) - #define smp_master_cpu 0 #define smp_height 1 #define smp_width 1 diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index c52224d..a44e103 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -87,25 +87,6 @@ void send_IPI_allbutself(int tag) send_IPI_many(&mask, tag); } - -/* - * Provide smp_call_function_mask, but also run function locally - * if specified in the mask. - */ -void on_each_cpu_mask(const struct cpumask *mask, void (*func)(void *), - void *info, bool wait) -{ - int cpu = get_cpu(); - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); - func(info); - local_irq_enable(); - } - put_cpu(); -} - - /* * Functions related to starting/stopping cpus. 
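The arch-private copies of on_each_cpu_mask() removed above (ARM and tile) are replaced by the generic helper added to kernel/smp.c later in this patch; note that the generic version takes the cpumask first and the wait flag last, which is why the ARM TLB call sites are reordered. A minimal usage sketch, not part of the patch (my_flush_count, my_flush_fn and my_flush_cpus are hypothetical names):

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_flush_count);

static void my_flush_fn(void *info)
{
	/* Runs with interrupts disabled on every CPU in the mask. */
	__this_cpu_inc(my_flush_count);
}

static void my_flush_cpus(const struct cpumask *mask)
{
	/* New ordering: mask, func, info, wait; the local CPU is included. */
	on_each_cpu_mask(mask, my_flush_fn, NULL, true);
}
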
*/ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f70a65b..cafd56c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -468,11 +468,7 @@ extern void thread_do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -static inline void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} +extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); diff --git a/include/linux/sched.h b/include/linux/sched.h index 41c0979..1bb6414 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1607,6 +1607,7 @@ struct task_struct { #ifdef CONFIG_PREEMPT_RT_BASE struct rcu_head put_rcu; int softirq_nestcnt; + unsigned int softirqs_raised; #endif #ifdef CONFIG_PREEMPT_RT_FULL # if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 @@ -1823,6 +1824,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a32bcfd..0c674f6 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -52,7 +52,7 @@ struct kmem_cache_cpu { }; struct kmem_cache_node { - spinlock_t list_lock; /* Protect partial list and nr_partial */ + raw_spinlock_t list_lock; /* Protect partial list and nr_partial */ unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG diff --git a/include/linux/smp.h b/include/linux/smp.h index 78fd0a2..3001ba5 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -101,6 +101,22 @@ static inline void call_function_init(void) { } int on_each_cpu(smp_call_func_t func, void *info, int wait); /* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + +/* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. */ @@ -131,6 +147,36 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) local_irq_enable(); \ 0; \ }) +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. + */ +#define on_each_cpu_mask(mask, func, info, wait) \ + do { \ + if (cpumask_test_cpu(0, (mask))) { \ + local_irq_disable(); \ + (func)(info); \ + local_irq_enable(); \ + } \ + } while (0) +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. 
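The new on_each_cpu_cond() declared above sends the IPI only to CPUs for which the supplied predicate returns true; the SLUB flush_all() rework later in this patch is its first user. A minimal sketch of the calling pattern, not part of the patch (my_cpu_dirty, my_cpu_needs_flush, my_cpu_flush and my_flush_all are hypothetical names):

#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/gfp.h>

static DEFINE_PER_CPU(int, my_cpu_dirty);

static bool my_cpu_needs_flush(int cpu, void *info)
{
	/* Called with preemption disabled; decides whether to IPI @cpu. */
	return per_cpu(my_cpu_dirty, cpu) != 0;
}

static void my_cpu_flush(void *info)
{
	/* Runs on each selected CPU with interrupts disabled. */
	__this_cpu_write(my_cpu_dirty, 0);
}

static void my_flush_all(void)
{
	/* GFP_ATOMIC: the helper may allocate a temporary cpumask. */
	on_each_cpu_cond(my_cpu_needs_flush, my_cpu_flush, NULL, true,
			 GFP_ATOMIC);
}
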
+ */ +#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ + do { \ + void *__info = (info); \ + preempt_disable(); \ + if ((cond_func)(0, __info)) { \ + local_irq_disable(); \ + (func)(__info); \ + local_irq_enable(); \ + } \ + preempt_enable(); \ + } while (0) + static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) diff --git a/init/Kconfig b/init/Kconfig index aa6545f..cfb1668 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1240,7 +1240,6 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" - depends on !PREEMPT_RT_FULL help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 7021e6d..1b40641 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1036,7 +1036,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, && hrtimer_enqueue_reprogram(timer, new_base)) { if (wakeup -#ifdef CONFIG_PREEMPT_RT_BASE +#if defined(CONFIG_PREEMPT_RT_BASE) && defined(CONFIG_HIGH_RES_TIMERS) /* * Move softirq based timers away from the rbtree in * case it expired already. Otherwise we would have a diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f0a6606..6be8561 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -553,7 +553,7 @@ static void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_cpu_qs(); /* Hardware IRQ handlers cannot block. */ - if (in_irq()) { + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { local_irq_restore(flags); return; } diff --git a/kernel/smp.c b/kernel/smp.c index 9e800b2..d5f3238 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -712,3 +712,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) return ret; } EXPORT_SYMBOL(on_each_cpu); + +/** + * on_each_cpu_mask(): Run a function on processors specified by + * cpumask, which may include the local processor. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait) +{ + int cpu = get_cpu(); + + smp_call_function_many(mask, func, info, wait); + if (cpumask_test_cpu(cpu, mask)) { + local_irq_disable(); + func(info); + local_irq_enable(); + } + put_cpu(); +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * on_each_cpu_cond(): Call a function on each processor for which + * the supplied function cond_func returns true, optionally waiting + * for all the required CPUs to finish. This may include the local + * processor. + * @cond_func: A callback function that is passed a cpu id and + * the the info parameter. The function is called + * with preemption disabled. The function should + * return a blooean value indicating whether to IPI + * the specified CPU. + * @func: The function to run on all applicable CPUs. + * This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to both functions. + * @wait: If true, wait (atomically) until function has + * completed on other CPUs. 
+ * @gfp_flags: GFP flags to use when allocating the cpumask + * used internally by the function. + * + * The function might sleep if the GFP flags indicates a non + * atomic allocation is allowed. + * + * Preemption is disabled to protect against CPUs going offline but not online. + * CPUs going online during the call will not be seen or sent an IPI. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + cpumask_var_t cpus; + int cpu, ret; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) + cpumask_set_cpu(cpu, cpus); + on_each_cpu_mask(cpus, func, info, wait); + preempt_enable(); + free_cpumask_var(cpus); + } else { + /* + * No free cpumask, bother. No matter, we'll + * just have to IPI them one by one. + */ + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) { + ret = smp_call_function_single(cpu, func, + info, wait); + WARN_ON_ONCE(!ret); + } + preempt_enable(); + } +} +EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/kernel/softirq.c b/kernel/softirq.c index ca00a68..1a0145a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -65,45 +65,70 @@ char *softirq_to_name[NR_SOFTIRQS] = { #ifdef CONFIG_NO_HZ # ifdef CONFIG_PREEMPT_RT_FULL + +struct softirq_runner { + struct task_struct *runner[NR_SOFTIRQS]; +}; + +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners); + +static inline void softirq_set_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = current; +} + +static inline void softirq_clr_runner(unsigned int sirq) +{ + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + + sr->runner[sirq] = NULL; +} + /* - * On preempt-rt a softirq might be blocked on a lock. There might be - * no other runnable task on this CPU because the lock owner runs on - * some other CPU. So we have to go into idle with the pending bit - * set. Therefor we need to check this otherwise we warn about false - * positives which confuses users and defeats the whole purpose of - * this test. + * On preempt-rt a softirq running context might be blocked on a + * lock. There might be no other runnable task on this CPU because the + * lock owner runs on some other CPU. So we have to go into idle with + * the pending bit set. Therefor we need to check this otherwise we + * warn about false positives which confuses users and defeats the + * whole purpose of this test. * * This code is called with interrupts disabled. */ void softirq_check_pending_idle(void) { static int rate_limit; - u32 warnpending = 0, pending = local_softirq_pending(); + struct softirq_runner *sr = &__get_cpu_var(softirq_runners); + u32 warnpending = local_softirq_pending(); + int i; if (rate_limit >= 10) return; - if (pending) { - struct task_struct *tsk; + for (i = 0; i < NR_SOFTIRQS; i++) { + struct task_struct *tsk = sr->runner[i]; - tsk = __get_cpu_var(ksoftirqd); /* * The wakeup code in rtmutex.c wakes up the task * _before_ it sets pi_blocked_on to NULL under * tsk->pi_lock. So we need to check for both: state * and pi_blocked_on. 
*/ - raw_spin_lock(&tsk->pi_lock); - - if (!tsk->pi_blocked_on && !(tsk->state == TASK_RUNNING)) - warnpending = 1; - - raw_spin_unlock(&tsk->pi_lock); + if (tsk) { + raw_spin_lock(&tsk->pi_lock); + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) { + /* Clear all bits pending in that task */ + warnpending &= ~(tsk->softirqs_raised); + warnpending &= ~(1 << i); + } + raw_spin_unlock(&tsk->pi_lock); + } } if (warnpending) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - pending); + warnpending); rate_limit++; } } @@ -122,6 +147,10 @@ void softirq_check_pending_idle(void) } } # endif + +#else /* !NO_HZ */ +static inline void softirq_set_runner(unsigned int sirq) { } +static inline void softirq_clr_runner(unsigned int sirq) { } #endif /* @@ -139,36 +168,39 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) +static void handle_softirq(unsigned int vec_nr, int cpu, int need_rcu_bh_qs) { - struct softirq_action *h = softirq_vec; + struct softirq_action *h = softirq_vec + vec_nr; unsigned int prev_count = preempt_count(); - local_irq_enable(); - for ( ; pending; h++, pending >>= 1) { - unsigned int vec_nr = h - softirq_vec; + kstat_incr_softirqs_this_cpu(vec_nr); + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); - if (!(pending & 1)) - continue; + if (unlikely(prev_count != preempt_count())) { + pr_err("softirq %u %s %p preempt count leak: %08x -> %08x\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, (unsigned int) preempt_count()); + preempt_count() = prev_count; + } + if (need_rcu_bh_qs) + rcu_bh_qs(cpu); +} - kstat_incr_softirqs_this_cpu(vec_nr); - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR - "huh, entered softirq %u %s %p with preempt_count %08x exited with %08x?\n", - vec_nr, softirq_to_name[vec_nr], h->action, - prev_count, (unsigned int) preempt_count()); - preempt_count() = prev_count; - } - if (need_rcu_bh_qs) - rcu_bh_qs(cpu); +#ifndef CONFIG_PREEMPT_RT_FULL +static void handle_pending_softirqs(u32 pending, int cpu, int need_rcu_bh_qs) +{ + unsigned int vec_nr; + + local_irq_enable(); + for (vec_nr = 0; pending; vec_nr++, pending >>= 1) { + if (pending & 1) + handle_softirq(vec_nr, cpu, need_rcu_bh_qs); } local_irq_disable(); } -#ifndef CONFIG_PREEMPT_RT_FULL /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -372,29 +404,115 @@ asmlinkage void do_softirq(void) #endif +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + + /* + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will + * actually run the softirq once we return from + * the irq or softirq. + * + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. 
+ */ + if (!in_interrupt()) + wakeup_softirqd(); +} + +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + static inline void local_bh_disable_nort(void) { local_bh_disable(); } static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } static inline void ksoftirqd_set_sched_params(void) { } static inline void ksoftirqd_clr_sched_params(void) { } +static inline int ksoftirqd_softirq_pending(void) +{ + return local_softirq_pending(); +} + #else /* !PREEMPT_RT_FULL */ /* - * On RT we serialize softirq execution with a cpu local lock + * On RT we serialize softirq execution with a cpu local lock per softirq */ -static DEFINE_LOCAL_IRQ_LOCK(local_softirq_lock); -static DEFINE_PER_CPU(struct task_struct *, local_softirq_runner); +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks); -static void __do_softirq_common(int need_rcu_bh_qs); +void __init softirq_early_init(void) +{ + int i; + + for (i = 0; i < NR_SOFTIRQS; i++) + local_irq_lock_init(local_softirq_locks[i]); +} -void __do_softirq(void) +static void lock_softirq(int which) { - __do_softirq_common(0); + __local_lock(&__get_cpu_var(local_softirq_locks[which])); } -void __init softirq_early_init(void) +static void unlock_softirq(int which) +{ + __local_unlock(&__get_cpu_var(local_softirq_locks[which])); +} + +static void do_single_softirq(int which, int need_rcu_bh_qs) { - local_irq_lock_init(local_softirq_lock); + account_system_vtime(current); + current->flags |= PF_IN_SOFTIRQ; + lockdep_softirq_enter(); + local_irq_enable(); + handle_softirq(which, smp_processor_id(), need_rcu_bh_qs); + local_irq_disable(); + lockdep_softirq_exit(); + current->flags &= ~PF_IN_SOFTIRQ; + account_system_vtime(current); +} + +/* + * Called with interrupts disabled. Process softirqs which were raised + * in current context (or on behalf of ksoftirqd). + */ +static void do_current_softirqs(int need_rcu_bh_qs) +{ + while (current->softirqs_raised) { + int i = __ffs(current->softirqs_raised); + unsigned int pending, mask = (1U << i); + + current->softirqs_raised &= ~mask; + local_irq_enable(); + + /* + * If the lock is contended, we boost the owner to + * process the softirq or leave the critical section + * now. + */ + lock_softirq(i); + local_irq_disable(); + softirq_set_runner(i); + /* + * Check with the local_softirq_pending() bits, + * whether we need to process this still or if someone + * else took care of it. 
+ */ + pending = local_softirq_pending(); + if (pending & mask) { + set_softirq_pending(pending & ~mask); + do_single_softirq(i, need_rcu_bh_qs); + } + softirq_clr_runner(i); + unlock_softirq(i); + WARN_ON(current->softirq_nestcnt != 1); + } } void local_bh_disable(void) @@ -409,17 +527,11 @@ void local_bh_enable(void) if (WARN_ON(current->softirq_nestcnt == 0)) return; - if ((current->softirq_nestcnt == 1) && - local_softirq_pending() && - local_trylock(local_softirq_lock)) { + local_irq_disable(); + if (current->softirq_nestcnt == 1 && current->softirqs_raised) + do_current_softirqs(1); + local_irq_enable(); - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - local_unlock(local_softirq_lock); - WARN_ON(current->softirq_nestcnt != 1); - } current->softirq_nestcnt--; migrate_enable(); } @@ -438,56 +550,14 @@ void _local_bh_enable(void) } EXPORT_SYMBOL(_local_bh_enable); -/* For tracing */ -int notrace __in_softirq(void) -{ - if (__get_cpu_var(local_softirq_lock).owner == current) - return __get_cpu_var(local_softirq_lock).nestcnt; - return 0; -} - int in_serving_softirq(void) { - int res; - - preempt_disable(); - res = __get_cpu_var(local_softirq_runner) == current; - preempt_enable(); - return res; + return current->flags & PF_IN_SOFTIRQ; } EXPORT_SYMBOL(in_serving_softirq); -/* - * Called with bh and local interrupts disabled. For full RT cpu must - * be pinned. - */ -static void __do_softirq_common(int need_rcu_bh_qs) -{ - u32 pending = local_softirq_pending(); - int cpu = smp_processor_id(); - - current->softirq_nestcnt++; - - /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); - - __get_cpu_var(local_softirq_runner) = current; - - lockdep_softirq_enter(); - - handle_pending_softirqs(pending, cpu, need_rcu_bh_qs); - - pending = local_softirq_pending(); - if (pending) - wakeup_softirqd(); - - lockdep_softirq_exit(); - __get_cpu_var(local_softirq_runner) = NULL; - - current->softirq_nestcnt--; -} - -static int __thread_do_softirq(int cpu) +/* Called with preemption disabled */ +static int ksoftirqd_do_softirq(int cpu) { /* * Prevent the current cpu from going offline. @@ -498,45 +568,92 @@ static int __thread_do_softirq(int cpu) */ pin_current_cpu(); /* - * If called from ksoftirqd (cpu >= 0) we need to check - * whether we are on the wrong cpu due to cpu offlining. If - * called via thread_do_softirq() no action required. + * We need to check whether we are on the wrong cpu due to cpu + * offlining. */ - if (cpu >= 0 && cpu_is_offline(cpu)) { + if (cpu_is_offline(cpu)) { unpin_current_cpu(); return -1; } preempt_enable(); - local_lock(local_softirq_lock); local_irq_disable(); - /* - * We cannot switch stacks on RT as we want to be able to - * schedule! - */ - if (local_softirq_pending()) - __do_softirq_common(cpu >= 0); - local_unlock(local_softirq_lock); - unpin_current_cpu(); - preempt_disable(); + current->softirq_nestcnt++; + do_current_softirqs(1); + current->softirq_nestcnt--; local_irq_enable(); + + preempt_disable(); + unpin_current_cpu(); return 0; } /* - * Called from netif_rx_ni(). Preemption enabled. + * Called from netif_rx_ni(). Preemption enabled, but migration + * disabled. So the cpu can't go away under us. 
*/ void thread_do_softirq(void) { - if (!in_serving_softirq()) { - preempt_disable(); - __thread_do_softirq(-1); - preempt_enable(); + if (!in_serving_softirq() && current->softirqs_raised) { + current->softirq_nestcnt++; + do_current_softirqs(0); + current->softirq_nestcnt--; } } -static int ksoftirqd_do_softirq(int cpu) +static void do_raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); + + /* + * If we are not in a hard interrupt and inside a bh disabled + * region, we simply raise the flag on current. local_bh_enable() + * will make sure that the softirq is executed. Otherwise we + * delegate it to ksoftirqd. + */ + if (!in_irq() && current->softirq_nestcnt) + current->softirqs_raised |= (1U << nr); + else if (__this_cpu_read(ksoftirqd)) + __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr); +} + +void __raise_softirq_irqoff(unsigned int nr) +{ + do_raise_softirq_irqoff(nr); + if (!in_irq() && !current->softirq_nestcnt) + wakeup_softirqd(); +} + +/* + * This function must run with irqs disabled! + */ +void raise_softirq_irqoff(unsigned int nr) +{ + do_raise_softirq_irqoff(nr); + + /* + * If we're in an hard interrupt we let irq return code deal + * with the wakeup of ksoftirqd. + */ + if (in_irq()) + return; + + /* + * If we are in thread context but outside of a bh disabled + * region, we need to wake ksoftirqd as well. + * + * CHECKME: Some of the places which do that could be wrapped + * into local_bh_disable/enable pairs. Though it's unclear + * whether this is worth the effort. To find those places just + * raise a WARN() if the condition is met. + */ + if (!current->softirq_nestcnt) + wakeup_softirqd(); +} + +static inline int ksoftirqd_softirq_pending(void) { - return __thread_do_softirq(cpu); + return current->softirqs_raised; } static inline void local_bh_disable_nort(void) { } @@ -547,6 +664,10 @@ static inline void ksoftirqd_set_sched_params(void) struct sched_param param = { .sched_priority = 1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); + /* Take over all pending softirqs when starting */ + local_irq_disable(); + current->softirqs_raised = local_softirq_pending(); + local_irq_enable(); } static inline void ksoftirqd_clr_sched_params(void) @@ -578,39 +699,31 @@ void irq_enter(void) __irq_enter(); } -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED static inline void invoke_softirq(void) { #ifndef CONFIG_PREEMPT_RT_FULL - if (!force_irqthreads) + if (!force_irqthreads) { +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED __do_softirq(); - else { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); - wakeup_softirqd(); - __local_bh_enable(SOFTIRQ_OFFSET); - } -#else - wakeup_softirqd(); -#endif -} #else -static inline void invoke_softirq(void) -{ -#ifndef CONFIG_PREEMPT_RT_FULL - if (!force_irqthreads) do_softirq(); - else { +#endif + } else { __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } -#else - wakeup_softirqd(); +#else /* PREEMPT_RT_FULL */ + unsigned long flags; + + local_irq_save(flags); + if (__this_cpu_read(ksoftirqd) && + __this_cpu_read(ksoftirqd)->softirqs_raised) + wakeup_softirqd(); + local_irq_restore(flags); #endif } -#endif /* * Exit an interrupt context. Process softirqs if needed and possible: @@ -632,26 +745,6 @@ void irq_exit(void) __preempt_enable_no_resched(); } -/* - * This function must run with irqs disabled! 
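On RT the raise path above records the pending vector in current->softirqs_raised when the caller sits inside a bh-disabled section, so the work is executed by local_bh_enable() in the same task rather than handed to ksoftirqd. A sketch of what that means for an ordinary thread-context user, not part of the patch (my_tasklet_fn and my_thread_side_work are hypothetical names):

#include <linux/interrupt.h>

static void my_tasklet_fn(unsigned long data)
{
	/* Softirq context; under this patch the serving task has PF_IN_SOFTIRQ set. */
}

static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

static void my_thread_side_work(void)
{
	local_bh_disable();
	/*
	 * tasklet_schedule() ends up in raise_softirq_irqoff().  With
	 * softirq_nestcnt > 0 and outside hard interrupt context the bit
	 * is noted in current->softirqs_raised instead of waking ksoftirqd.
	 */
	tasklet_schedule(&my_tasklet);
	/*
	 * local_bh_enable() then runs the raised softirq via
	 * do_current_softirqs() before returning.
	 */
	local_bh_enable();
}
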
- */ -inline void raise_softirq_irqoff(unsigned int nr) -{ - __raise_softirq_irqoff(nr); - - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); -} - void raise_softirq(unsigned int nr) { unsigned long flags; @@ -1112,12 +1205,12 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) + if (!ksoftirqd_softirq_pending()) schedule_preempt_disabled(); __set_current_state(TASK_RUNNING); - while (local_softirq_pending()) { + while (ksoftirqd_softirq_pending()) { if (ksoftirqd_do_softirq((long) __bind_cpu)) goto wait_to_die; __preempt_enable_no_resched(); diff --git a/localversion-rt-feat b/localversion-rt-feat new file mode 100644 index 0000000..b349c49 --- /dev/null +++ b/localversion-rt-feat @@ -0,0 +1 @@ +-feat2 diff --git a/mm/slub.c b/mm/slub.c index 5710788..4c62b7f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1258,6 +1258,12 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) {} #endif /* CONFIG_SLUB_DEBUG */ +struct slub_free_list { + raw_spinlock_t lock; + struct list_head list; +}; +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); + /* * Slab allocation and freeing */ @@ -1279,10 +1285,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + bool enableirqs; flags &= gfp_allowed_mask; - if (flags & __GFP_WAIT) + enableirqs = (flags & __GFP_WAIT) != 0; +#ifdef CONFIG_PREEMPT_RT_FULL + enableirqs |= system_state == SYSTEM_RUNNING; +#endif + if (enableirqs) local_irq_enable(); flags |= s->allocflags; @@ -1306,7 +1317,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(s, ORDER_FALLBACK); } - if (flags & __GFP_WAIT) + if (enableirqs) local_irq_disable(); if (!page) @@ -1412,6 +1423,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +static void free_delayed(struct kmem_cache *s, struct list_head *h) +{ + while(!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_slab(s, page); + } +} + #define need_reserve_slab_rcu \ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) @@ -1446,6 +1467,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) } call_rcu(head, rcu_free_slab); + } else if (irqs_disabled()) { + struct slub_free_list *f = &__get_cpu_var(slub_free_list); + + raw_spin_lock(&f->lock); + list_add(&page->lru, &f->list); + raw_spin_unlock(&f->lock); } else __free_slab(s, page); } @@ -1545,7 +1572,7 @@ static void *get_partial_node(struct kmem_cache *s, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { void *t = acquire_slab(s, n, page, object == NULL); int available; @@ -1566,7 +1593,7 @@ static void *get_partial_node(struct kmem_cache *s, break; } - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return object; } @@ -1815,7 +1842,7 @@ redo: * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } else { m = M_FULL; @@ -1826,7 +1853,7 @@ redo: * slabs from 
diagnostic functions will not see * any frozen slabs. */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } @@ -1861,7 +1888,7 @@ redo: goto redo; if (lock) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); @@ -1870,11 +1897,15 @@ redo: } } -/* Unfreeze all the cpu partial slabs */ -static void unfreeze_partials(struct kmem_cache *s) +/* + * Unfreeze all the cpu partial slabs. + * + * This function must be called with interrupt disabled. + */ +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct kmem_cache_node *n = NULL; - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); struct page *page, *discard_page = NULL; while ((page = c->partial)) { @@ -1906,10 +1937,10 @@ static void unfreeze_partials(struct kmem_cache *s) m = M_PARTIAL; if (n != n2) { if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); n = n2; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } @@ -1935,7 +1966,7 @@ static void unfreeze_partials(struct kmem_cache *s) } if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); while (discard_page) { page = discard_page; @@ -1956,7 +1987,7 @@ static void unfreeze_partials(struct kmem_cache *s) * If we did not find a slot then simply move all the partials to the * per node partial list. */ -int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { struct page *oldpage; int pages; @@ -1971,14 +2002,21 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) pobjects = oldpage->pobjects; pages = oldpage->pages; if (drain && pobjects > s->cpu_partial) { + LIST_HEAD(tofree); + struct slub_free_list *f; unsigned long flags; /* * partial array is full. Move the existing * set to the per node partial list. 
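free_slab() above defers the actual page release to a per-CPU slub_free_list whenever it is called with interrupts disabled, and the callers drain that list through free_delayed() once interrupts are enabled again (see the put_cpu_partial(), flush_all() and __slab_alloc() hunks below). A generic sketch of this defer-and-drain pattern, not part of the patch (all names hypothetical):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>

struct deferred_page {
	struct list_head lru;
};

struct deferred_list {
	raw_spinlock_t lock;
	struct list_head list;
};
static DEFINE_PER_CPU(struct deferred_list, deferred_lists);

/* Called with interrupts disabled: queue the item instead of freeing it. */
static void defer_free(struct deferred_page *p)
{
	struct deferred_list *f = &__get_cpu_var(deferred_lists);

	raw_spin_lock(&f->lock);
	list_add(&p->lru, &f->list);
	raw_spin_unlock(&f->lock);
}

/* Called with interrupts enabled: splice under the lock, free outside it. */
static void drain_deferred(struct list_head *to_free)
{
	struct deferred_list *f = &__get_cpu_var(deferred_lists);

	raw_spin_lock_irq(&f->lock);
	list_splice_init(&f->list, to_free);
	raw_spin_unlock_irq(&f->lock);
	/* ...now release every entry on to_free; sleeping is allowed here. */
}
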
*/ local_irq_save(flags); - unfreeze_partials(s); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(s, &tofree); pobjects = 0; pages = 0; } @@ -2015,7 +2053,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) if (c->page) flush_slab(s, c); - unfreeze_partials(s); + unfreeze_partials(s, c); } } @@ -2026,9 +2064,32 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } +static bool has_cpu_slab(int cpu, void *info) +{ + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return !!(c->page); +} + static void flush_all(struct kmem_cache *s) { - on_each_cpu(flush_cpu_slab, s, 1); + LIST_HEAD(tofree); + int cpu; + + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); + for_each_online_cpu(cpu) { + struct slub_free_list *f; + + if (!has_cpu_slab(cpu, s)) + continue; + + f = &per_cpu(slub_free_list, cpu); + raw_spin_lock_irq(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock_irq(&f->lock); + free_delayed(s, &tofree); + } } /* @@ -2038,7 +2099,7 @@ static void flush_all(struct kmem_cache *s) static inline int node_match(struct kmem_cache_cpu *c, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && c->node != node) + if (!c->page || (node != NUMA_NO_NODE && c->node != node)) return 0; #endif return 1; @@ -2056,10 +2117,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, unsigned long x = 0; struct page *page; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) x += get_count(page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return x; } @@ -2155,6 +2216,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { + struct slub_free_list *f; + LIST_HEAD(tofree); void **object; unsigned long flags; struct page new; @@ -2221,7 +2284,13 @@ redo: load_freelist: c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); +out: + f = &__get_cpu_var(slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(s, &tofree); return object; new_slab: @@ -2246,8 +2315,7 @@ new_slab: if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); - return NULL; + goto out; } } @@ -2261,8 +2329,7 @@ new_slab: c->freelist = get_freepointer(s, object); deactivate_slab(s, c); c->node = NUMA_NO_NODE; - local_irq_restore(flags); - return object; + goto out; } /* @@ -2286,13 +2353,18 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, return NULL; redo: - /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is * enabled. We may switch back and forth between cpus while * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. + * + * Preemption is disabled for the retrieval of the tid because that + * must occur from the current processor. We cannot allow rescheduling + * on a different processor between the determination of the pointer + * and the retrieval of the tid. 
*/ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); /* @@ -2302,7 +2374,7 @@ redo: * linked list in between. */ tid = c->tid; - barrier(); + preempt_enable(); object = c->freelist; if (unlikely(!object || !node_match(c, node))) @@ -2449,7 +2521,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); } } @@ -2498,7 +2570,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_ADD_PARTIAL); } } - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -2512,7 +2584,7 @@ slab_empty: /* Slab must be on the full list */ remove_full(s, page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -2544,10 +2616,11 @@ redo: * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succedd. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); tid = c->tid; - barrier(); + preempt_enable(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); @@ -2741,7 +2814,7 @@ static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - spin_lock_init(&n->list_lock); + raw_spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); @@ -3481,7 +3554,7 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); /* * Build lists indexed by the items in use in each slab. 
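In slab_alloc() and slab_free() above, the tid snapshot is now taken inside a preempt_disable()/preempt_enable() pair instead of relying on barrier(), so the transaction id and the per-CPU slab structure are guaranteed to come from the same processor; the later this_cpu_cmpxchg_double() still catches any migration or preemption through the tid and retries. A reduced sketch of that snapshot rule, not part of the patch (my_cpu_state and snapshot_cpu_state are hypothetical names):

#include <linux/percpu.h>
#include <linux/preempt.h>

struct my_cpu_state {
	void *freelist;
	unsigned long tid;
};
static DEFINE_PER_CPU(struct my_cpu_state, my_cpu_state);

static void snapshot_cpu_state(void **freelist, unsigned long *tid)
{
	struct my_cpu_state *c;

	preempt_disable();
	c = __this_cpu_ptr(&my_cpu_state);
	*tid = c->tid;			/* tid and pointer read on the same CPU */
	*freelist = c->freelist;
	preempt_enable();
	/*
	 * A later cmpxchg against the live per-CPU tid fails and retries
	 * if this task was preempted or migrated after the snapshot.
	 */
}
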
@@ -3502,7 +3575,7 @@ int kmem_cache_shrink(struct kmem_cache *s) for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ list_for_each_entry_safe(page, t, slabs_by_inuse, lru) @@ -3668,10 +3741,15 @@ void __init kmem_cache_init(void) int i; int caches = 0; struct kmem_cache *temp_kmem_cache; - int order; + int order, cpu; struct kmem_cache *temp_kmem_cache_node; unsigned long kmalloc_size; + for_each_possible_cpu(cpu) { + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); + } + kmem_size = offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *); @@ -4092,7 +4170,7 @@ static int validate_slab_node(struct kmem_cache *s, struct page *page; unsigned long flags; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) { validate_slab_slab(s, page, map); @@ -4115,7 +4193,7 @@ static int validate_slab_node(struct kmem_cache *s, atomic_long_read(&n->nr_slabs)); out: - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return count; } @@ -4305,12 +4383,12 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!atomic_long_read(&n->nr_slabs)) continue; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) process_slab(&t, s, page, alloc, map); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); } for (i = 0; i < t.count; i++) { diff --git a/net/core/dev.c b/net/core/dev.c index 9942479..7de85c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3033,11 +3033,9 @@ int netif_rx_ni(struct sk_buff *skb) { int err; - migrate_disable(); + local_bh_disable(); err = netif_rx(skb); - if (local_softirq_pending()) - thread_do_softirq(); - migrate_enable(); + local_bh_enable(); return err; }
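
The netif_rx_ni() simplification at the end works because, with the softirq changes above, local_bh_enable() itself executes whatever softirqs the task raised while bh was disabled, so the explicit migrate_disable()/thread_do_softirq() sequence is no longer needed. A sketch of a process-context caller, not part of the patch (my_inject_skb is a hypothetical name; tun-style drivers use the same netif_rx_ni() entry point):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int my_inject_skb(struct sk_buff *skb)
{
	/*
	 * netif_rx_ni() now expands to local_bh_disable()/netif_rx()/
	 * local_bh_enable(); on RT any NET_RX_SOFTIRQ raised by netif_rx()
	 * is tracked in current->softirqs_raised and processed in this
	 * task before the call returns, not bounced to ksoftirqd.
	 */
	return netif_rx_ni(skb);
}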