diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 27ec49af1bf27..8006bf5a3d993 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6596,6 +6596,18 @@ Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. + threadprintk [KNL] + Force threaded printing of all legacy consoles. Be + aware that with this option, the shutdown, reboot, and + panic messages may not be printed on the legacy + consoles. Also, earlycon/earlyprintk printing will be + delayed until a regular console or the kthread is + available. + + Users can view /proc/consoles to see if their console + driver is legacy or not. Non-legacy (NBCON) console + drivers are already threaded and are shown with 'N'. + topology= [S390,EARLY] Format: {off | on} Specify if the kernel should make use of the cpu diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ee5115252aac4..92cb5d38b524b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -38,6 +38,7 @@ config ARM select ARCH_SUPPORTS_CFI_CLANG select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -77,7 +78,7 @@ config ARM select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL @@ -100,7 +101,7 @@ config ARM select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD - select HAVE_GUP_FAST if ARM_LPAE + select HAVE_GUP_FAST if ARM_LPAE && !(PREEMPT_RT && HIGHPTE) select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER @@ -123,6 +124,7 @@ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 67c425341a951..0320810ca0953 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -474,6 +474,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; @@ -544,6 +547,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + if (interrupts_enabled(regs)) + local_irq_enable(); + do_bad_area(addr, fsr, regs); return 0; } diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index b68efe643a12c..48745a3c52618 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -55,6 +55,34 @@ extern unsigned int VFP_arch_feroceon __alias(VFP_arch); */ union vfp_state *vfp_current_hw_state[NR_CPUS]; +/* + * Claim ownership of the VFP unit. 
+ * + * The caller may change VFP registers until vfp_unlock() is called. + * + * local_bh_disable() is used to disable preemption and to disable VFP + * processing in softirq context. On PREEMPT_RT kernels local_bh_disable() is + * not sufficient because it only serializes soft interrupt related sections + * via a local lock, but stays preemptible. Disabling preemption is the right + * choice here as bottom half processing is always in thread context on RT + * kernels so it implicitly prevents bottom half processing as well. + */ +static void vfp_lock(void) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); +} + +static void vfp_unlock(void) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); +} + /* * Is 'thread's most up to date state stored in this CPUs hardware? * Must be called from non-preemptible context. @@ -240,7 +268,7 @@ static void vfp_panic(char *reason, u32 inst) /* * Process bitmask of exception conditions. */ -static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_regs *regs) +static int vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr) { int si_code = 0; @@ -248,8 +276,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ if (exceptions == VFP_EXCEPTION_ERROR) { vfp_panic("unhandled bounce", inst); - vfp_raise_sigfpe(FPE_FLTINV, regs); - return; + return FPE_FLTINV; } /* @@ -277,8 +304,7 @@ static void vfp_raise_exceptions(u32 exceptions, u32 inst, u32 fpscr, struct pt_ RAISE(FPSCR_OFC, FPSCR_OFE, FPE_FLTOVF); RAISE(FPSCR_IOC, FPSCR_IOE, FPE_FLTINV); - if (si_code) - vfp_raise_sigfpe(si_code, regs); + return si_code; } /* @@ -324,6 +350,8 @@ static u32 vfp_emulate_instruction(u32 inst, u32 fpscr, struct pt_regs *regs) static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) { u32 fpscr, orig_fpscr, fpsid, exceptions; + int si_code2 = 0; + int si_code = 0; pr_debug("VFP: bounce: trigger %08x fpexc %08x\n", trigger, fpexc); @@ -369,8 +397,8 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) * unallocated VFP instruction but with FPSCR.IXE set and not * on VFP subarch 1. */ - vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs); - return; + si_code = vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr); + goto exit; } /* @@ -394,14 +422,14 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) */ exceptions = vfp_emulate_instruction(trigger, fpscr, regs); if (exceptions) - vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); + si_code2 = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); /* * If there isn't a second FP instruction, exit now. Note that * the FPEXC.FP2V bit is valid only if FPEXC.EX is 1. 
*/ if ((fpexc & (FPEXC_EX | FPEXC_FP2V)) != (FPEXC_EX | FPEXC_FP2V)) - return; + goto exit; /* * The barrier() here prevents fpinst2 being read @@ -413,7 +441,13 @@ static void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) emulate: exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs); if (exceptions) - vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs); + si_code = vfp_raise_exceptions(exceptions, trigger, orig_fpscr); +exit: + vfp_unlock(); + if (si_code2) + vfp_raise_sigfpe(si_code2, regs); + if (si_code) + vfp_raise_sigfpe(si_code, regs); } static void vfp_enable(void *unused) @@ -512,11 +546,9 @@ static inline void vfp_pm_init(void) { } */ void vfp_sync_hwstate(struct thread_info *thread) { - unsigned int cpu = get_cpu(); + vfp_lock(); - local_bh_disable(); - - if (vfp_state_in_hw(cpu, thread)) { + if (vfp_state_in_hw(raw_smp_processor_id(), thread)) { u32 fpexc = fmrx(FPEXC); /* @@ -527,8 +559,7 @@ void vfp_sync_hwstate(struct thread_info *thread) fmxr(FPEXC, fpexc); } - local_bh_enable(); - put_cpu(); + vfp_unlock(); } /* Ensure that the thread reloads the hardware VFP state on the next use. */ @@ -683,7 +714,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) if (!user_mode(regs)) return vfp_kmode_exception(regs, trigger); - local_bh_disable(); + vfp_lock(); fpexc = fmrx(FPEXC); /* @@ -748,6 +779,7 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) * replay the instruction that trapped. */ fmxr(FPEXC, fpexc); + vfp_unlock(); } else { /* Check for synchronous or asynchronous exceptions */ if (!(fpexc & (FPEXC_EX | FPEXC_DEX))) { @@ -762,17 +794,17 @@ static int vfp_support_entry(struct pt_regs *regs, u32 trigger) if (!(fpscr & FPSCR_IXE)) { if (!(fpscr & FPSCR_LENGTH_MASK)) { pr_debug("not VFP\n"); - local_bh_enable(); + vfp_unlock(); return -ENOEXEC; } fpexc |= FPEXC_DEX; } } bounce: regs->ARM_pc += 4; + /* VFP_bounce() will invoke vfp_unlock() */ VFP_bounce(trigger, fpexc, regs); } - local_bh_enable(); return 0; } @@ -837,7 +869,7 @@ void kernel_neon_begin(void) unsigned int cpu; u32 fpexc; - local_bh_disable(); + vfp_lock(); /* * Kernel mode NEON is only allowed outside of hardirq context with @@ -868,7 +900,7 @@ void kernel_neon_end(void) { /* Disable the NEON/VFP unit. 
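Annotation on the VFP changes above: with vfp_lock() mapping to preempt_disable() on PREEMPT_RT, VFP_bounce() can no longer deliver SIGFPE from inside the locked region (signal delivery may take sleeping locks there). That is why vfp_raise_exceptions() now returns a si_code and the signals are raised only after vfp_unlock(). A minimal, hedged sketch of that deferral pattern follows; handle_fpu_exception() is a hypothetical stand-in, while vfp_lock()/vfp_unlock()/vfp_raise_sigfpe() are the helpers from this file.

        static void fpu_trap(struct pt_regs *regs)
        {
                int si_code;

                vfp_lock();                             /* BH off on !RT, preemption off on RT */
                si_code = handle_fpu_exception();       /* hypothetical; must not sleep here */
                vfp_unlock();

                if (si_code)
                        vfp_raise_sigfpe(si_code, regs);        /* deferred until preemptible again */
        }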
*/ fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); - local_bh_enable(); + vfp_unlock(); } EXPORT_SYMBOL(kernel_neon_end); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5d91259ee7b53..537633203ff0d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -98,6 +98,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_RT select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c88c6d46a5bc0..c3b4e5bbe31dd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -168,6 +168,7 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @@ -272,6 +273,7 @@ config PPC select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select HAVE_RSEQ select HAVE_SETUP_PER_CPU_AREA if PPC64 select HAVE_SOFTIRQ_ON_OWN_STACK diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h index 283c346478565..4727f40052ddd 100644 --- a/arch/powerpc/include/asm/stackprotector.h +++ b/arch/powerpc/include/asm/stackprotector.h @@ -19,8 +19,13 @@ */ static __always_inline void boot_init_stack_canary(void) { - unsigned long canary = get_random_canary(); + unsigned long canary; +#ifndef CONFIG_PREEMPT_RT + canary = get_random_canary(); +#else + canary = ((unsigned long)&canary) & CANARY_MASK; +#endif current->stack_canary = canary; #ifdef CONFIG_PPC64 get_paca()->canary = canary; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index f23430adb68ad..9c71dd24ab997 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -261,12 +261,17 @@ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, get_mmu_str(), - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? 
" DEBUG_PAGEALLOC" : "", diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index dbfdc126bf144..82dd033cf7968 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -221,6 +221,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && PPC_E500 + depends on !PREEMPT_RT select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index afc0f6a613372..dc3f63c2687d4 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -2,6 +2,7 @@ config PPC_PSERIES depends on PPC64 && PPC_BOOK3S bool "IBM pSeries & new (POWER5-based) iSeries" + select GENERIC_ALLOCATOR select HAVE_PCSPKR_PLATFORM select MPIC select OF_DYNAMIC diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b1e6d275cda9e..9a8d3970da3c5 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -206,7 +207,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, return ret; } -static DEFINE_PER_CPU(__be64 *, tce_page); +struct tce_page { + __be64 * page; + local_lock_t lock; +}; +static DEFINE_PER_CPU(struct tce_page, tce_page) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @@ -229,9 +236,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, direction, attrs); } - local_irq_save(flags); /* to protect tcep and the page behind it */ + /* to protect tcep and the page behind it */ + local_lock_irqsave(&tce_page.lock, flags); - tcep = __this_cpu_read(tce_page); + tcep = __this_cpu_read(tce_page.page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @@ -240,12 +248,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, tceshift, npages, uaddr, direction, attrs); } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } rpn = __pa(uaddr) >> tceshift; @@ -275,7 +283,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum += limit; } while (npages > 0 && !rc); - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; @@ -459,16 +467,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, DMA_BIDIRECTIONAL, 0); } - local_irq_disable(); /* to protect tcep and the page behind it */ - tcep = __this_cpu_read(tce_page); + /* to protect tcep and the page behind it */ + local_lock_irq(&tce_page.lock); + tcep = __this_cpu_read(tce_page.page); if (!tcep) { tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return -ENOMEM; } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; @@ -511,7 +520,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, /* error cleanup: caller will clear whole range */ - local_irq_enable(); + 
local_unlock_irq(&tce_page.lock); return rc; } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 0525ee2d63c71..c72a3ded6e133 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -58,6 +58,7 @@ config RISCV select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU select ARCH_SUPPORTS_PER_VMA_LOCK if MMU + select ARCH_SUPPORTS_RT select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_MEMTEST @@ -162,6 +163,7 @@ config RISCV select HAVE_PERF_USER_STACK_DUMP select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_PREEMPT_DYNAMIC_KEY if !XIP_KERNEL + select HAVE_PREEMPT_AUTO select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RETHOOK if !XIP_KERNEL select HAVE_RSEQ diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 5d473343634b9..23b136286e927 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -94,6 +94,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); * - pending work-to-be-done flags are in lowest half-word * - other flags in upper half-word(s) */ +#define TIF_ARCH_RESCHED_LAZY 0 /* Lazy rescheduling */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ @@ -104,6 +105,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define TIF_32BIT 11 /* compat-mode 32bit process */ #define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ +#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d7122a1883e8..f9a09e9e43088 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_SUPPORTS_RT select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE @@ -122,6 +123,7 @@ config X86 select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 select ARCH_USE_MEMTEST @@ -280,6 +282,7 @@ config X86 select HAVE_STATIC_CALL select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL select HAVE_PREEMPT_DYNAMIC_CALL + select HAVE_PREEMPT_AUTO select HAVE_RSEQ select HAVE_RUST if X86_64 select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 12da7dfd5ef13..38e2cf05525e3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,8 +87,9 @@ struct thread_info { #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ -#define TIF_SSBD 5 /* Speculative store bypass disable */ +#define TIF_ARCH_RESCHED_LAZY 4 /* Lazy rescheduling */ +#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ +#define TIF_SSBD 6 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation 
mitigation */ #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -110,6 +111,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_ARCH_RESCHED_LAZY (1 << TIF_ARCH_RESCHED_LAZY) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 831fa4a121598..5af3ebec0f74a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -107,7 +107,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = { */ static void __cpuidle acpi_safe_halt(void) { - if (!tif_need_resched()) { + if (!need_resched()) { raw_safe_halt(); raw_local_irq_disable(); } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3acd7006ad2cc..a35d4bd2e60ef 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -57,19 +57,27 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_read_page(struct zram *zram, struct page *page, u32 index, struct bio *parent); +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) +{ + size_t index; + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); +} + static int zram_slot_trylock(struct zram *zram, u32 index) { - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); + return spin_trylock(&zram->table[index].lock); } static void zram_slot_lock(struct zram *zram, u32 index) { - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); + spin_lock(&zram->table[index].lock); } static void zram_slot_unlock(struct zram *zram, u32 index) { - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); } static inline bool init_done(struct zram *zram) @@ -1226,6 +1234,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + zram_meta_init_table_locks(zram, num_pages); return true; } @@ -1283,7 +1292,7 @@ static void zram_free_page(struct zram *zram, size_t index) zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); WARN_ON_ONCE(zram->table[index].flags & - ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); + ~(1UL << ZRAM_UNDER_WB)); } /* @@ -2403,9 +2412,10 @@ static void destroy_devices(void) static int __init zram_init(void) { + struct zram_table_entry zram_te; int ret; - BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 35e3221446292..531cefc666682 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -45,9 +45,7 @@ /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { - /* zram slot is locked */ - ZRAM_LOCK = ZRAM_FLAG_SHIFT, - ZRAM_SAME, /* Page consists the same element */ + ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ @@ -68,7 +66,8 @@ struct zram_table_entry { unsigned long 
handle; unsigned long element; }; - unsigned long flags; + unsigned int flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; #endif diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 5932024f8f954..a02162d6b710e 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -3,7 +3,6 @@ config DRM_I915 tristate "Intel 8xx/9xx/G3x/G4x/HD Graphics" depends on DRM depends on X86 && PCI - depends on !PREEMPT_RT select INTEL_GTT if X86 select INTERVAL_TREE # we need shmfs for the swappable backing store, and in particular diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c index 25593f6aae7de..22b80004574fa 100644 --- a/drivers/gpu/drm/i915/display/intel_crtc.c +++ b/drivers/gpu/drm/i915/display/intel_crtc.c @@ -512,7 +512,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, */ intel_psr_wait_for_idle_locked(new_crtc_state); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); crtc->debug.min_vbl = evade.min; crtc->debug.max_vbl = evade.max; @@ -530,7 +531,8 @@ void intel_pipe_update_start(struct intel_atomic_state *state, return; irq_disable: - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) @@ -632,7 +634,8 @@ void intel_pipe_update_end(struct intel_atomic_state *state, */ intel_vrr_send_push(new_crtc_state); - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); if (intel_vgpu_active(dev_priv)) goto out; diff --git a/drivers/gpu/drm/i915/display/intel_cursor.c b/drivers/gpu/drm/i915/display/intel_cursor.c index 23a122ee20c98..b5676e667c1ff 100644 --- a/drivers/gpu/drm/i915/display/intel_cursor.c +++ b/drivers/gpu/drm/i915/display/intel_cursor.c @@ -797,13 +797,15 @@ intel_legacy_cursor_update(struct drm_plane *_plane, */ intel_psr_wait_for_idle_locked(crtc_state); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); intel_vblank_evade(&evade); drm_crtc_vblank_put(&crtc->base); } else { - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } if (new_plane_state->uapi.visible) { @@ -813,7 +815,8 @@ intel_legacy_cursor_update(struct drm_plane *_plane, intel_plane_disable_arm(plane, crtc_state); } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); intel_psr_unlock(crtc_state); diff --git a/drivers/gpu/drm/i915/display/intel_display_trace.h b/drivers/gpu/drm/i915/display/intel_display_trace.h index 49a5e6d9dc0d7..b15c999d91e68 100644 --- a/drivers/gpu/drm/i915/display/intel_display_trace.h +++ b/drivers/gpu/drm/i915/display/intel_display_trace.h @@ -9,6 +9,10 @@ #if !defined(__INTEL_DISPLAY_TRACE_H__) || defined(TRACE_HEADER_MULTI_READ) #define __INTEL_DISPLAY_TRACE_H__ +#if defined(CONFIG_PREEMPT_RT) && !defined(NOTRACE) +#define NOTRACE +#endif + #include #include #include diff --git a/drivers/gpu/drm/i915/display/intel_vblank.c b/drivers/gpu/drm/i915/display/intel_vblank.c index baf7354cb6e2c..d639b51a49195 100644 --- a/drivers/gpu/drm/i915/display/intel_vblank.c +++ b/drivers/gpu/drm/i915/display/intel_vblank.c @@ -276,6 +276,26 @@ int intel_crtc_scanline_to_hw(struct intel_crtc *crtc, int scanline) * all register accesses to the same cacheline to be serialized, * otherwise they may hang. 
*/ +static void intel_vblank_section_enter_irqsave(struct drm_i915_private *i915, unsigned long *flags) + __acquires(i915->uncore.lock) +{ +#ifdef I915 + spin_lock_irqsave(&i915->uncore.lock, *flags); +#else + *flags = 0; +#endif +} + +static void intel_vblank_section_exit_irqrestore(struct drm_i915_private *i915, unsigned long flags) + __releases(i915->uncore.lock) +{ +#ifdef I915 + spin_unlock_irqrestore(&i915->uncore.lock, flags); +#else + if (flags) + return; +#endif +} static void intel_vblank_section_enter(struct drm_i915_private *i915) __acquires(i915->uncore.lock) { @@ -333,10 +353,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, * timing critical raw register reads, potentially with * preemption disabled, so the following code must not block. */ - local_irq_save(irqflags); - intel_vblank_section_enter(dev_priv); + intel_vblank_section_enter_irqsave(dev_priv, &irqflags); - /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); /* Get optional system timestamp before query. */ if (stime) @@ -400,10 +420,10 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, if (etime) *etime = ktime_get(); - /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); - intel_vblank_section_exit(dev_priv); - local_irq_restore(irqflags); + intel_vblank_section_exit_irqrestore(dev_priv, irqflags); /* * While in vblank, position will be negative @@ -441,13 +461,11 @@ int intel_get_crtc_scanline(struct intel_crtc *crtc) unsigned long irqflags; int position; - local_irq_save(irqflags); - intel_vblank_section_enter(dev_priv); + intel_vblank_section_enter_irqsave(dev_priv, &irqflags); position = __intel_get_crtc_scanline(crtc); - intel_vblank_section_exit(dev_priv); - local_irq_restore(irqflags); + intel_vblank_section_exit_irqrestore(dev_priv, irqflags); return position; } @@ -682,11 +700,13 @@ int intel_vblank_evade(struct intel_vblank_evade_ctx *evade) break; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); timeout = schedule_timeout(timeout); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } finish_wait(wq, &wait); diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 21829439e6867..ed29a2dd6ea0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -1303,7 +1303,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ - spin_lock(&sched_engine->lock); + spin_lock_irq(&sched_engine->lock); /* * If the queue is higher priority than the last @@ -1403,7 +1403,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. 
*/ - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); return; } } @@ -1429,7 +1429,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.sched_engine->lock); - spin_unlock(&engine->sched_engine->lock); + spin_unlock_irq(&engine->sched_engine->lock); return; /* leave this for another sibling */ } @@ -1591,7 +1591,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) */ sched_engine->queue_priority_hint = queue_prio(sched_engine); i915_sched_engine_reset_on_empty(sched_engine); - spin_unlock(&sched_engine->lock); + spin_unlock_irq(&sched_engine->lock); /* * We can skip poking the HW if we ended up with exactly the same set @@ -1617,13 +1617,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) } } -static void execlists_dequeue_irq(struct intel_engine_cs *engine) -{ - local_irq_disable(); /* Suspend interrupts across request submission */ - execlists_dequeue(engine); - local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */ -} - static void clear_ports(struct i915_request **ports, int count) { memset_p((void **)ports, NULL, count); @@ -2478,7 +2471,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) } if (!engine->execlists.pending[0]) { - execlists_dequeue_irq(engine); + execlists_dequeue(engine); start_timeslice(engine); } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index 57b9031327767..ff213b79ba83d 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -362,7 +362,7 @@ static inline int intel_guc_send_busy_loop(struct intel_guc *guc, { int err; unsigned int sleep_period_ms = 1; - bool not_atomic = !in_atomic() && !irqs_disabled(); + bool not_atomic = !in_atomic() && !irqs_disabled() && !rcu_preempt_depth(); /* * FIXME: Have caller pass in if we are in an atomic context to avoid diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 519e096c607cd..466b5ee8ed6d2 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -608,7 +608,6 @@ bool __i915_request_submit(struct i915_request *request) RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* @@ -717,7 +716,6 @@ void __i915_request_unsubmit(struct i915_request *request) */ RQ_TRACE(request, "\n"); - GEM_BUG_ON(!irqs_disabled()); lockdep_assert_held(&engine->sched_engine->lock); /* diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index ce1cbee1b39dd..247e7d9448d70 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -6,6 +6,10 @@ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ +#if defined(CONFIG_PREEMPT_RT) && !defined(NOTRACE) +#define NOTRACE +#endif + #include #include #include diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 06ec6ceb61d57..12cbf04990182 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h @@ -273,8 +273,13 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) (Wmax)) #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) -/* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
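Annotation on the execlists change: the former execlists_dequeue_irq() wrapper (local_irq_disable() around the call) is folded into spin_lock_irq()/spin_unlock_irq() inside execlists_dequeue() itself. That is the RT-friendly form, since on PREEMPT_RT spinlock_t does not disable interrupts and a bare local_irq_disable() in otherwise preemptible driver code is problematic there. A hedged before/after sketch with a hypothetical lock:

        struct my_engine {
                spinlock_t      lock;
                /* ... */
        };

        /* Old shape: explicit IRQ toggling around a plain spin_lock(). */
        static void dequeue_old(struct my_engine *e)
        {
                local_irq_disable();
                spin_lock(&e->lock);
                /* ... build and submit the request list ... */
                spin_unlock(&e->lock);
                local_irq_enable();
        }

        /* New shape: let the lock primitive handle interrupts (no IRQ-off on RT). */
        static void dequeue_new(struct my_engine *e)
        {
                spin_lock_irq(&e->lock);
                /* ... build and submit the request list ... */
                spin_unlock_irq(&e->lock);
        }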
*/ -#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) +/* + * If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. + * On PREEMPT_RT the context isn't becoming atomic because it is used in an + * interrupt handler or because a spinlock_t is acquired. This leads to + * warnings which don't occur otherwise and therefore the check is disabled. + */ +#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) #else # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) diff --git a/drivers/gpu/drm/ttm/tests/ttm_bo_test.c b/drivers/gpu/drm/ttm/tests/ttm_bo_test.c index 1f8a4f8adc929..9cc367a795341 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_bo_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_bo_test.c @@ -18,6 +18,12 @@ #define BO_SIZE SZ_8K +#ifdef CONFIG_PREEMPT_RT +#define ww_mutex_base_lock(b) rt_mutex_lock(b) +#else +#define ww_mutex_base_lock(b) mutex_lock(b) +#endif + struct ttm_bo_test_case { const char *description; bool interruptible; @@ -142,7 +148,7 @@ static void ttm_bo_reserve_deadlock(struct kunit *test) bo2 = ttm_bo_kunit_init(test, test->priv, BO_SIZE); ww_acquire_init(&ctx1, &reservation_ww_class); - mutex_lock(&bo2->base.resv->lock.base); + ww_mutex_base_lock(&bo2->base.resv->lock.base); /* The deadlock will be caught by WW mutex, don't warn about it */ lock_release(&bo2->base.resv->lock.base.dep_map, 1); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 4a5107117b4a6..1d06c560c5e65 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1661,6 +1661,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, int len, int *skb_xdp) { struct page_frag *alloc_frag = ¤t->task_frag; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; @@ -1700,6 +1701,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, local_bh_disable(); rcu_read_lock(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; @@ -1728,12 +1730,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; } + bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); out: + bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; @@ -2569,6 +2573,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; @@ -2577,6 +2582,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) local_bh_disable(); rcu_read_lock(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; @@ -2591,6 +2597,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); + bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index b0adafc44747d..fd462494f7b33 100644 --- a/drivers/tty/serial/8250/8250_core.c 
+++ b/drivers/tty/serial/8250/8250_core.c @@ -589,6 +589,7 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) #ifdef CONFIG_SERIAL_8250_CONSOLE +#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { @@ -596,6 +597,37 @@ static void univ8250_console_write(struct console *co, const char *s, serial8250_console_write(up, s, count); } +#else +static void univ8250_console_write_atomic(struct console *co, + struct nbcon_write_context *wctxt) +{ + struct uart_8250_port *up = &serial8250_ports[co->index]; + + serial8250_console_write_atomic(up, wctxt); +} + +static void univ8250_console_write_thread(struct console *co, + struct nbcon_write_context *wctxt) +{ + struct uart_8250_port *up = &serial8250_ports[co->index]; + + serial8250_console_write_thread(up, wctxt); +} + +static void univ8250_console_device_lock(struct console *con, unsigned long *flags) +{ + struct uart_port *up = &serial8250_ports[con->index].port; + + __uart_port_lock_irqsave(up, flags); +} + +static void univ8250_console_device_unlock(struct console *con, unsigned long flags) +{ + struct uart_port *up = &serial8250_ports[con->index].port; + + __uart_port_unlock_irqrestore(up, flags); +} +#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ static int univ8250_console_setup(struct console *co, char *options) { @@ -624,11 +656,11 @@ static int univ8250_console_setup(struct console *co, char *options) port = &serial8250_ports[co->index].port; /* link port to console */ - port->cons = co; + uart_port_set_cons(port, co); retval = serial8250_console_setup(port, options, false); if (retval != 0) - port->cons = NULL; + uart_port_set_cons(port, NULL); return retval; } @@ -686,7 +718,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, continue; co->index = i; - port->cons = co; + uart_port_set_cons(port, co); return serial8250_console_setup(port, options, true); } @@ -695,12 +727,20 @@ static int univ8250_console_match(struct console *co, char *name, int idx, static struct console univ8250_console = { .name = "ttyS", +#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE .write = univ8250_console_write, + .flags = CON_PRINTBUFFER | CON_ANYTIME, +#else + .write_atomic = univ8250_console_write_atomic, + .write_thread = univ8250_console_write_thread, + .device_lock = univ8250_console_device_lock, + .device_unlock = univ8250_console_device_unlock, + .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_NBCON, +#endif .device = uart_console_device, .setup = univ8250_console_setup, .exit = univ8250_console_exit, .match = univ8250_console_match, - .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &serial8250_reg, }; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 893bc493f6625..60c5926c23547 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -546,6 +546,13 @@ static int serial8250_em485_init(struct uart_8250_port *p) if (!p->em485) return -ENOMEM; +#ifndef CONFIG_SERIAL_8250_LEGACY_CONSOLE + if (uart_console(&p->port)) { + dev_warn(p->port.dev, "no atomic printing for rs485 consoles\n"); + p->port.cons->write_atomic = NULL; + } +#endif + hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, @@ -691,7 +698,11 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) serial8250_rpm_put(p); } -static void serial8250_clear_IER(struct uart_8250_port *up) +/* + * 
Only to be used by write_atomic() and the legacy write(), which do not + * require port lock. + */ +static void __serial8250_clear_IER(struct uart_8250_port *up) { if (up->capabilities & UART_CAP_UUE) serial_out(up, UART_IER, UART_IER_UUE); @@ -699,6 +710,14 @@ static void serial8250_clear_IER(struct uart_8250_port *up) serial_out(up, UART_IER, 0); } +static inline void serial8250_clear_IER(struct uart_8250_port *up) +{ + /* Port locked to synchronize UART_IER access against the console. */ + lockdep_assert_held_once(&up->port.lock); + + __serial8250_clear_IER(up); +} + #ifdef CONFIG_SERIAL_8250_RSA /* * Attempts to turn on the RSA FIFO. Returns zero on failure. @@ -3269,6 +3288,11 @@ static void serial8250_console_putchar(struct uart_port *port, unsigned char ch) wait_for_xmitr(up, UART_LSR_THRE); serial_port_out(port, UART_TX, ch); + + if (ch == '\n') + up->console_newline_needed = false; + else + up->console_newline_needed = true; } /* @@ -3297,6 +3321,7 @@ static void serial8250_console_restore(struct uart_8250_port *up) serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); } +#ifdef CONFIG_SERIAL_8250_LEGACY_CONSOLE /* * Print a string to the serial port using the device FIFO * @@ -3355,7 +3380,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, * First save the IER then disable the interrupts */ ier = serial_port_in(port, UART_IER); - serial8250_clear_IER(up); + __serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { @@ -3421,6 +3446,131 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (locked) uart_port_unlock_irqrestore(port, flags); } +#else +void serial8250_console_write_thread(struct uart_8250_port *up, + struct nbcon_write_context *wctxt) +{ + struct uart_8250_em485 *em485 = up->em485; + struct uart_port *port = &up->port; + unsigned int ier; + + touch_nmi_watchdog(); + + if (!nbcon_enter_unsafe(wctxt)) + return; + + /* First save IER then disable the interrupts. */ + ier = serial_port_in(port, UART_IER); + serial8250_clear_IER(up); + + /* Check scratch reg if port powered off during system sleep. */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { + serial8250_console_restore(up); + up->canary = 0; + } + + if (em485) { + if (em485->tx_stopped) + up->rs485_start_tx(up); + mdelay(port->rs485.delay_rts_before_send); + } + + if (nbcon_exit_unsafe(wctxt)) { + int len = READ_ONCE(wctxt->len); + int i; + + /* + * Write out the message. Toggle unsafe for each byte in order + * to give another (higher priority) context the opportunity + * for a friendly takeover. If such a takeover occurs, this + * context must reacquire ownership in order to perform final + * actions (such as re-enabling the interrupts). + * + * IMPORTANT: wctxt->outbuf and wctxt->len are no longer valid + * after a reacquire so writing the message must be + * aborted. + */ + for (i = 0; i < len; i++) { + if (!nbcon_enter_unsafe(wctxt)) { + nbcon_reacquire(wctxt); + break; + } + + uart_console_write(port, wctxt->outbuf + i, 1, serial8250_console_putchar); + + if (!nbcon_exit_unsafe(wctxt)) { + nbcon_reacquire(wctxt); + break; + } + } + } else { + nbcon_reacquire(wctxt); + } + + while (!nbcon_enter_unsafe(wctxt)) + nbcon_reacquire(wctxt); + + /* Finally, wait for transmitter to become empty and restore IER. 
*/ + wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); + if (em485) { + mdelay(port->rs485.delay_rts_after_send); + if (em485->tx_stopped) + up->rs485_stop_tx(up); + } + serial_port_out(port, UART_IER, ier); + + /* + * The receive handling will happen properly because the receive ready + * bit will still be set; it is not cleared on read. However, modem + * control will not, we must call it if we have saved something in the + * saved flags while processing with interrupts off. + */ + if (up->msr_saved_flags) + serial8250_modem_status(up); + + nbcon_exit_unsafe(wctxt); +} + +void serial8250_console_write_atomic(struct uart_8250_port *up, + struct nbcon_write_context *wctxt) +{ + struct uart_port *port = &up->port; + unsigned int ier; + + /* Atomic console not supported for rs485 mode. */ + if (WARN_ON_ONCE(up->em485)) + return; + + touch_nmi_watchdog(); + + if (!nbcon_enter_unsafe(wctxt)) + return; + + /* + * First save IER then disable the interrupts. The special variant to + * clear IER is used because atomic printing may occur without holding + * the port lock. + */ + ier = serial_port_in(port, UART_IER); + __serial8250_clear_IER(up); + + /* Check scratch reg if port powered off during system sleep. */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { + serial8250_console_restore(up); + up->canary = 0; + } + + if (up->console_newline_needed) + uart_console_write(port, "\n", 1, serial8250_console_putchar); + uart_console_write(port, wctxt->outbuf, wctxt->len, serial8250_console_putchar); + + /* Finally, wait for transmitter to become empty and restore IER. */ + wait_for_xmitr(up, UART_LSR_BOTH_EMPTY); + serial_port_out(port, UART_IER, ier); + + nbcon_exit_unsafe(wctxt); +} +#endif /* CONFIG_SERIAL_8250_LEGACY_CONSOLE */ static unsigned int probe_baud(struct uart_port *port) { @@ -3439,6 +3589,7 @@ static unsigned int probe_baud(struct uart_port *port) int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { + struct uart_8250_port *up = up_to_u8250p(port); int baud = 9600; int bits = 8; int parity = 'n'; @@ -3448,6 +3599,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) if (!port->iobase && !port->membase) return -ENODEV; + up->console_newline_needed = false; + if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 8b1644f5411ec..7d0134ecd82fa 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2480,7 +2480,7 @@ static int pl011_console_match(struct console *co, char *name, int idx, continue; co->index = i; - port->cons = co; + uart_port_set_cons(port, co); return pl011_console_setup(co, options); } diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 2a8006e3d6878..950d598d60764 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -3168,8 +3168,15 @@ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *u state->uart_port = uport; uport->state = state; + /* + * If this port is in use as a console then the spinlock is already + * initialised. 
+ */ + if (!uart_console_registered(uport)) + uart_port_spin_lock_init(uport); + state->pm_state = UART_PM_STATE_UNDEFINED; - uport->cons = drv->cons; + uart_port_set_cons(uport, drv->cons); uport->minor = drv->tty_driver->minor_start + uport->line; uport->name = kasprintf(GFP_KERNEL, "%s%d", drv->dev_name, drv->tty_driver->name_base + uport->line); @@ -3178,13 +3185,6 @@ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *u goto out; } - /* - * If this port is in use as a console then the spinlock is already - * initialised. - */ - if (!uart_console_registered(uport)) - uart_port_spin_lock_init(uport); - if (uport->cons && uport->dev) of_console_check(uport->dev->of_node, uport->cons->name, uport->line); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index e5974b8239c99..53f8c2329c30c 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -452,7 +452,7 @@ static const struct sysrq_key_op sysrq_unrt_op = { static void sysrq_handle_replay_logs(u8 key) { - console_replay_all(); + console_try_replay_all(); } static struct sysrq_key_op sysrq_replay_logs_op = { .handler = sysrq_handle_replay_logs, diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 407b0d87b7c10..c9c914bc033c9 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3567,8 +3567,13 @@ static ssize_t show_cons_active(struct device *dev, for_each_console(c) { if (!c->device) continue; - if (!c->write) - continue; + if (c->flags & CON_NBCON) { + if (!c->write_atomic && !c->write_thread) + continue; + } else { + if (!c->write) + continue; + } if ((c->flags & CON_ENABLED) == 0) continue; cs[i++] = c; diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index e0758fe7936dc..c3c01ec2273c5 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -21,12 +21,14 @@ static int show_console_dev(struct seq_file *m, void *v) { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, + { CON_NBCON, 'N' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, }; char flags[ARRAY_SIZE(con_flags) + 1]; struct console *con = v; + char con_write = '-'; unsigned int a; dev_t dev = 0; @@ -57,9 +59,15 @@ static int show_console_dev(struct seq_file *m, void *v) seq_setwidth(m, 21 - 1); seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); - seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', - con->write ? 'W' : '-', con->unblank ? 'U' : '-', - flags); + if (con->flags & CON_NBCON) { + if (con->write_atomic || con->write_thread) + con_write = 'W'; + } else { + if (con->write) + con_write = 'W'; + } + seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con_write, + con->unblank ? 
'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); @@ -68,6 +76,7 @@ static int show_console_dev(struct seq_file *m, void *v) } static void *c_start(struct seq_file *m, loff_t *pos) + __acquires(&console_mutex) { struct console *con; loff_t off = 0; @@ -94,6 +103,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) } static void c_stop(struct seq_file *m, void *v) + __releases(&console_mutex) { console_list_unlock(); } diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index fc53e0ad56d90..448bbef474564 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -35,8 +35,10 @@ static inline void local_bh_enable(void) #ifdef CONFIG_PREEMPT_RT extern bool local_bh_blocked(void); +extern void softirq_preempt(void); #else static inline bool local_bh_blocked(void) { return false; } +static inline void softirq_preempt(void) { } #endif #endif /* _LINUX_BH_H */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5e694a308081a..ff261349b5ed3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2492,7 +2492,7 @@ struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; -void __dev_flush(void); +void __dev_flush(struct list_head *flush_list); int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, @@ -2505,7 +2505,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); -void __cpu_map_flush(void); +void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, @@ -2642,8 +2642,6 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); -bool dev_check_flush(void); -bool cpu_map_check_flush(void); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2731,7 +2729,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } -static inline void __dev_flush(void) +static inline void __dev_flush(struct list_head *flush_list) { } @@ -2777,7 +2775,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, return 0; } -static inline void __cpu_map_flush(void) +static inline void __cpu_map_flush(struct list_head *flush_list) { } diff --git a/include/linux/console.h b/include/linux/console.h index 31a8f5b85f5d7..96c0923d023b8 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #include @@ -303,7 +305,7 @@ struct nbcon_write_context { /** * struct console - The console descriptor structure * @name: The name of the console driver - * @write: Write callback to output messages (Optional) + * @write: Legacy write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) @@ -320,10 +322,14 @@ struct nbcon_write_context { * @data: Driver private data * @node: hlist node for the console list * - * @write_atomic: Write callback for atomic context * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the 
next record for nbcon to print + * @nbcon_device_ctxt: Context available for non-printing operations + * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print * @pbufs: Pointer to nbcon private buffer + * @kthread: Printer kthread for this console + * @rcuwait: RCU-safe wait object for @kthread waking + * @irq_work: Defer @kthread waking to IRQ work context */ struct console { char name[16]; @@ -345,11 +351,111 @@ struct console { struct hlist_node node; /* nbcon console specific members */ - bool (*write_atomic)(struct console *con, - struct nbcon_write_context *wctxt); + + /** + * @write_atomic: + * + * NBCON callback to write out text in any context. + * + * This callback is called with the console already acquired. However, + * a higher priority context is allowed to take it over by default. + * + * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe() + * around any code where the takeover is not safe, for example, when + * manipulating the serial port registers. + * + * nbcon_enter_unsafe() will fail if the context has lost the console + * ownership in the meantime. In this case, the callback is no longer + * allowed to go forward. It must back out immediately and carefully. + * The buffer content is also no longer trusted since it no longer + * belongs to the context. + * + * The callback should allow the takeover whenever it is safe. It + * increases the chance to see messages when the system is in trouble. + * If the driver must reacquire ownership in order to finalize or + * revert hardware changes, nbcon_reacquire() can be used. However, + * on reacquire the buffer content is no longer available. A + * reacquire cannot be used to resume printing. + * + * The callback can be called from any context (including NMI). + * Therefore it must avoid usage of any locking and instead rely + * on the console ownership for synchronization. + */ + void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); + + /** + * @write_thread: + * + * NBCON callback to write out text in task context. + * + * This callback is called after device_lock() and with the nbcon + * console acquired. Any necessary driver synchronization should have + * been performed by the device_lock() callback. + * + * This callback is always called from task context but with migration + * disabled. + * + * The same criteria for console ownership verification and unsafe + * sections applies as with write_atomic(). The difference between + * this callback and write_atomic() is that this callback is used + * during normal operation and is always called from task context. + * This allows drivers to operate in their own locking context for + * synchronizing output to the hardware. + */ + void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); + + /** + * @device_lock: + * + * NBCON callback to begin synchronization with driver code. + * + * Console drivers typically must deal with access to the hardware + * via user input/output (such as an interactive login shell) and + * output of kernel messages via printk() calls. This callback is + * called by the printk-subsystem whenever it needs to synchronize + * with hardware access by the driver. It should be implemented to + * use whatever synchronization mechanism the driver is using for + * itself (for example, the port lock for uart serial consoles). + * + * The callback is always called from task context. It may use any + * synchronization method required by the driver. 
+ * + * IMPORTANT: The callback MUST disable migration. The console driver + * may be using a synchronization mechanism that already takes + * care of this (such as spinlocks). Otherwise this function must + * explicitly call migrate_disable(). + * + * The flags argument is provided as a convenience to the driver. It + * will be passed again to device_unlock(). It can be ignored if the + * driver does not need it. + */ + void (*device_lock)(struct console *con, unsigned long *flags); + + /** + * @device_unlock: + * + * NBCON callback to finish synchronization with driver code. + * + * It is the counterpart to device_lock(). + * + * This callback is always called from task context. It must + * appropriately re-enable migration (depending on how device_lock() + * disabled migration). + * + * The flags argument is the value of the same variable that was + * passed to device_lock(). + */ + void (*device_unlock)(struct console *con, unsigned long flags); + atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; + struct nbcon_context __private nbcon_device_ctxt; + atomic_long_t __private nbcon_prev_seq; + struct printk_buffers *pbufs; + struct task_struct *kthread; + struct rcuwait rcuwait; + struct irq_work irq_work; }; #ifdef CONFIG_LOCKDEP @@ -378,28 +484,34 @@ extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** - * console_srcu_read_flags - Locklessly read the console flags + * console_srcu_read_flags - Locklessly read flags of a possibly registered + * console * @con: struct console pointer of console to read flags from * - * This function provides the necessary READ_ONCE() and data_race() - * notation for locklessly reading the console flags. The READ_ONCE() - * in this function matches the WRITE_ONCE() when @flags are modified - * for registered consoles with console_srcu_write_flags(). + * Locklessly reading @con->flags provides a consistent read value because + * there is at most one CPU modifying @con->flags and that CPU is using only + * read-modify-write operations to do so. * - * Only use this function to read console flags when locklessly - * iterating the console list via srcu. + * Requires console_srcu_read_lock to be held, which implies that @con might + * be a registered console. The purpose of holding console_srcu_read_lock is + * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED) + * and that no exit/cleanup routines will run if the console is currently + * undergoing unregistration. + * + * If the caller is holding the console_list_lock or it is _certain_ that + * @con is not and will not become registered, the caller may read + * @con->flags directly instead. * * Context: Any context. + * Return: The current value of the @con->flags field. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* - * Locklessly reading console->flags provides a consistent - * read value because there is at most one CPU modifying - * console->flags and that CPU is using only read-modify-write - * operations to do so. + * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified + * for registered consoles with console_srcu_write_flags(). 
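Annotation on the NBCON callbacks documented above: the four hooks fit together roughly as in the following hedged skeleton. All my_* names are hypothetical; the univ8250 console conversion later in this series is the real reference implementation. The driver spinlock used by device_lock()/device_unlock() already disables migration, as the kernel-doc requires, and write_atomic()/write_thread() rely only on console ownership plus unsafe sections for synchronization.

        static DEFINE_SPINLOCK(my_hw_lock);             /* hypothetical driver lock */

        static void my_console_write_atomic(struct console *con,
                                            struct nbcon_write_context *wctxt)
        {
                int i;

                if (!nbcon_enter_unsafe(wctxt))         /* ownership may already be lost */
                        return;

                for (i = 0; i < wctxt->len; i++)
                        my_hw_emit_byte(wctxt->outbuf[i]);      /* hypothetical HW access */

                nbcon_exit_unsafe(wctxt);
        }

        static void my_console_write_thread(struct console *con,
                                            struct nbcon_write_context *wctxt)
        {
                /* Task context; same ownership and unsafe-section rules apply. */
                my_console_write_atomic(con, wctxt);
        }

        static void my_console_device_lock(struct console *con, unsigned long *flags)
        {
                spin_lock_irqsave(&my_hw_lock, *flags); /* also disables migration */
        }

        static void my_console_device_unlock(struct console *con, unsigned long flags)
        {
                spin_unlock_irqrestore(&my_hw_lock, flags);
        }

        static struct console my_console = {
                .name           = "myser",
                .write_atomic   = my_console_write_atomic,
                .write_thread   = my_console_write_thread,
                .device_lock    = my_console_device_lock,
                .device_unlock  = my_console_device_unlock,
                .flags          = CON_PRINTBUFFER | CON_NBCON,
                .index          = -1,
        };

        /* register_console(&my_console) from the driver's init path. */

A real write_atomic() would typically toggle the unsafe section around smaller units (the 8250 write_thread callback in this series does it per byte) so that a higher priority context can take over mid-message.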
*/ return data_race(READ_ONCE(con->flags)); } @@ -477,13 +589,21 @@ static inline bool console_is_registered(const struct console *con) hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK +extern void nbcon_cpu_emergency_enter(void); +extern void nbcon_cpu_emergency_exit(void); +extern void nbcon_cpu_emergency_flush(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); +extern void nbcon_reacquire(struct nbcon_write_context *wctxt); #else +static inline void nbcon_cpu_emergency_enter(void) { } +static inline void nbcon_cpu_emergency_exit(void) { } +static inline void nbcon_cpu_emergency_flush(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_reacquire(struct nbcon_write_context *wctxt) { } #endif extern int console_set_on_cmdline; diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b0fb775a600d9..f5bb19369973a 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -65,7 +65,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ - ARCH_EXIT_TO_USER_MODE_WORK) + _TIF_NEED_RESCHED_LAZY | ARCH_EXIT_TO_USER_MODE_WORK) /** * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 6813171afccb2..674a622c91be2 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -18,7 +18,7 @@ #define XFER_TO_GUEST_MODE_WORK \ (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ - _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) + _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED_LAZY | ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5669da513cd7c..b1d9bc714c987 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -733,21 +733,128 @@ struct bpf_nh_params { }; }; +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ +#define BPF_RI_F_RI_INIT BIT(1) +#define BPF_RI_F_CPU_MAP_INIT BIT(2) +#define BPF_RI_F_DEV_MAP_INIT BIT(3) +#define BPF_RI_F_XSK_MAP_INIT BIT(4) + struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; - u32 kern_flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; + u32 kern_flags; }; -DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +struct bpf_net_context { + struct bpf_redirect_info ri; + struct list_head cpu_map_flush_list; + struct list_head dev_map_flush_list; + struct list_head xskmap_map_flush_list; +}; -/* flags for bpf_redirect_info kern_flags */ -#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ +static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx) +{ + struct task_struct *tsk = current; + + if (tsk->bpf_net_context != NULL) + return NULL; + bpf_net_ctx->ri.kern_flags = 0; + + tsk->bpf_net_context = bpf_net_ctx; + return bpf_net_ctx; +} + +static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx) +{ + if 
(bpf_net_ctx) + current->bpf_net_context = NULL; +} + +static inline struct bpf_net_context *bpf_net_ctx_get(void) +{ + return current->bpf_net_context; +} + +static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) { + memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh)); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT; + } + + return &bpf_net_ctx->ri; +} + +static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT; + } + + return &bpf_net_ctx->cpu_map_flush_list; +} + +static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT; + } + + return &bpf_net_ctx->dev_map_flush_list; +} + +static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT; + } + + return &bpf_net_ctx->xskmap_map_flush_list; +} + +static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map, + struct list_head **lh_dev, + struct list_head **lh_xsk) +{ + struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + u32 kern_flags = bpf_net_ctx->ri.kern_flags; + struct list_head *lh; + + *lh_map = *lh_dev = *lh_xsk = NULL; + + if (!IS_ENABLED(CONFIG_BPF_SYSCALL)) + return; + + lh = &bpf_net_ctx->dev_map_flush_list; + if (kern_flags & BPF_RI_F_DEV_MAP_INIT && !list_empty(lh)) + *lh_dev = lh; + + lh = &bpf_net_ctx->cpu_map_flush_list; + if (kern_flags & BPF_RI_F_CPU_MAP_INIT && !list_empty(lh)) + *lh_map = lh; + + lh = &bpf_net_ctx->xskmap_map_flush_list; + if (IS_ENABLED(CONFIG_XDP_SOCKETS) && + kern_flags & BPF_RI_F_XSK_MAP_INIT && !list_empty(lh)) + *lh_xsk = lh; +} /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, @@ -1018,25 +1125,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); -void bpf_clear_redirect_map(struct bpf_map *map); - static inline bool xdp_return_frame_no_direct(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } @@ -1592,7 +1697,7 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde u64 flags, const u64 flag_mask, void 
*lookup_elem(struct bpf_map *map, u32 key)) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5c9bdd3ffccc8..42e91eaa971b2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -612,6 +612,35 @@ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +#ifdef CONFIG_PREEMPT_RT +DECLARE_PER_CPU(struct task_struct *, timersd); +DECLARE_PER_CPU(unsigned long, pending_timer_softirq); + +extern void raise_timer_softirq(void); +extern void raise_hrtimer_softirq(void); + +static inline unsigned int local_pending_timers(void) +{ + return __this_cpu_read(pending_timer_softirq); +} + +#else +static inline void raise_timer_softirq(void) +{ + raise_softirq(TIMER_SOFTIRQ); +} + +static inline void raise_hrtimer_softirq(void) +{ + raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + +static inline unsigned int local_pending_timers(void) +{ + return local_softirq_pending(); +} +#endif + DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index e55010fa73296..091dc0b6bdfb9 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -51,4 +51,25 @@ #define local_unlock_irqrestore(lock, flags) \ __local_unlock_irqrestore(lock, flags) +DEFINE_GUARD(local_lock, local_lock_t __percpu*, + local_lock(_T), + local_unlock(_T)) +DEFINE_GUARD(local_lock_irq, local_lock_t __percpu*, + local_lock_irq(_T), + local_unlock_irq(_T)) +DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu, + local_lock_irqsave(_T->lock, _T->flags), + local_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + +#define local_lock_nested_bh(_lock) \ + __local_lock_nested_bh(_lock) + +#define local_unlock_nested_bh(_lock) \ + __local_unlock_nested_bh(_lock) + +DEFINE_GUARD(local_lock_nested_bh, local_lock_t __percpu*, + local_lock_nested_bh(_T), + local_unlock_nested_bh(_T)) + #endif diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 975e33b793a77..8dd71fbbb6d2b 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -62,6 +62,17 @@ do { \ local_lock_debug_init(lock); \ } while (0) +#define __spinlock_nested_bh_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ + lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ + 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ + LD_LOCK_NORMAL); \ + local_lock_debug_init(lock); \ +} while (0) + #define __local_lock(lock) \ do { \ preempt_disable(); \ @@ -98,6 +109,15 @@ do { \ local_irq_restore(flags); \ } while (0) +#define __local_lock_nested_bh(lock) \ + do { \ + lockdep_assert_in_softirq(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock_nested_bh(lock) \ + local_lock_release(this_cpu_ptr(lock)) + #else /* !CONFIG_PREEMPT_RT */ /* @@ -138,4 +158,15 @@ typedef spinlock_t local_lock_t; #define __local_unlock_irqrestore(lock, flags) __local_unlock(lock) +#define __local_lock_nested_bh(lock) \ +do { \ + lockdep_assert_in_softirq_func(); \ + spin_lock(this_cpu_ptr(lock)); \ +} while (0) + 
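The guards and the nested-BH lock added above can be used roughly as in the sketch below. The per-CPU structure, its fields and the functions are invented for illustration; the point is that the nested-BH lock is only a lockdep-covered annotation on non-RT kernels and becomes a real per-CPU spinlock on PREEMPT_RT.

#include <linux/local_lock.h>

/* Illustrative per-CPU state that is only touched from softirq (BH). */
struct my_bh_stats {
        local_lock_t    bh_lock;
        unsigned long   counter;
};

static DEFINE_PER_CPU(struct my_bh_stats, my_stats) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static void my_update_from_softirq(void)
{
        local_lock_nested_bh(&my_stats.bh_lock);
        this_cpu_inc(my_stats.counter);
        local_unlock_nested_bh(&my_stats.bh_lock);
}

static void my_update_scoped(void)
{
        /* Same thing via the new guard; released at end of scope. */
        guard(local_lock_nested_bh)(&my_stats.bh_lock);
        this_cpu_inc(my_stats.counter);
}

The guard(local_lock), guard(local_lock_irq) and guard(local_lock_irqsave) classes defined above follow the same scope-based pattern for local locks that are also taken outside of BH context.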
+#define __local_unlock_nested_bh(lock) \ +do { \ + spin_unlock(this_cpu_ptr((lock))); \ +} while (0) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 08b0d1d9d78b7..3f5a551579cc9 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -600,6 +600,8 @@ do { \ (!in_softirq() || in_irq() || in_nmi())); \ } while (0) +extern void lockdep_assert_in_softirq_func(void); + #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) @@ -613,6 +615,7 @@ do { \ # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) # define lockdep_assert_in_softirq() do { } while (0) +# define lockdep_assert_in_softirq_func() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d20c6c99eb887..883d55005362c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -3200,6 +3201,7 @@ static inline bool dev_has_header(const struct net_device *dev) struct softnet_data { struct list_head poll_list; struct sk_buff_head process_queue; + local_lock_t process_queue_bh_lock; /* stats */ unsigned int processed; @@ -3222,13 +3224,7 @@ struct softnet_data { struct sk_buff_head xfrm_backlog; #endif /* written and read only by owning cpu: */ - struct { - u16 recursion; - u8 more; -#ifdef CONFIG_NET_EGRESS - u8 skip_txqueue; -#endif - } xmit; + struct netdev_xmit xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. @@ -3256,10 +3252,18 @@ struct softnet_data { DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +#ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } +#else +static inline int dev_recursion_level(void) +{ + return current->net_xmit.recursion; +} + +#endif void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); @@ -4874,18 +4878,35 @@ static inline ktime_t netdev_get_tstamp(struct net_device *dev, return hwtstamps->hwtstamp; } -static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, - struct sk_buff *skb, struct net_device *dev, - bool more) +#ifndef CONFIG_PREEMPT_RT +static inline void netdev_xmit_set_more(bool more) { __this_cpu_write(softnet_data.xmit.more, more); - return ops->ndo_start_xmit(skb, dev); } static inline bool netdev_xmit_more(void) { return __this_cpu_read(softnet_data.xmit.more); } +#else +static inline void netdev_xmit_set_more(bool more) +{ + current->net_xmit.more = more; +} + +static inline bool netdev_xmit_more(void) +{ + return current->net_xmit.more; +} +#endif + +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, + struct sk_buff *skb, struct net_device *dev, + bool more) +{ + netdev_xmit_set_more(more); + return ops->ndo_start_xmit(skb, dev); +} static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h new file mode 100644 index 0000000000000..38325e0702968 --- /dev/null +++ b/include/linux/netdevice_xmit.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_NETDEVICE_XMIT_H +#define _LINUX_NETDEVICE_XMIT_H + +struct 
netdev_xmit { + u16 recursion; + u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif +}; + +#endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a5304ae8c654f..65ece0d5b4b6d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -781,11 +781,12 @@ struct perf_event { unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; - unsigned int pending_sigtrap; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; + struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; + struct rcuwait pending_work_wait; atomic_t event_limit; @@ -962,7 +963,7 @@ struct perf_event_context { struct rcu_head rcu_head; /* - * Sum (event->pending_sigtrap + event->pending_work) + * Sum (event->pending_work + event->pending_work) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. @@ -970,12 +971,6 @@ struct perf_event_context { local_t nr_pending; }; -/* - * Number of contexts where an event can trigger: - * task, softirq, hardirq, nmi. - */ -#define PERF_NR_CONTEXTS 4 - struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; diff --git a/include/linux/printk.h b/include/linux/printk.h index 65c5184470f18..1da78bc3c3262 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -9,6 +9,8 @@ #include #include +struct console; + extern const char linux_banner[]; extern const char linux_proc_banner[]; @@ -157,15 +159,16 @@ int _printk(const char *fmt, ...); */ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); -extern void __printk_safe_enter(void); -extern void __printk_safe_exit(void); +extern void __printk_deferred_enter(void); +extern void __printk_deferred_exit(void); + /* * The printk_deferred_enter/exit macros are available only as a hack for * some code paths that need to defer all printk console printing. Interrupts * must be disabled for the deferred duration. 
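The comment above describes when printk_deferred_enter()/printk_deferred_exit() may be used; a typical (invented) caller looks like the sketch below, with interrupts disabled across the whole deferred region so the pending output is only flushed once it is safe again.

/* Illustrative only: log while console printing must be deferred. */
static void my_deferred_report(void)
{
        unsigned long flags;

        local_irq_save(flags);
        printk_deferred_enter();

        /* Recorded in the ring buffer, printed to consoles later. */
        pr_warn("example: state dumped with console output deferred\n");

        printk_deferred_exit();
        local_irq_restore(flags);
}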
*/ -#define printk_deferred_enter __printk_safe_enter -#define printk_deferred_exit __printk_safe_exit +#define printk_deferred_enter() __printk_deferred_enter() +#define printk_deferred_exit() __printk_deferred_exit() /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -192,7 +195,11 @@ void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); -void console_replay_all(void); +void console_try_replay_all(void); +void printk_legacy_allow_panic_sync(void); +extern bool nbcon_device_try_acquire(struct console *con); +extern void nbcon_device_release(struct console *con); +void nbcon_atomic_flush_unsafe(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -272,9 +279,28 @@ static inline void dump_stack(void) static inline void printk_trigger_flush(void) { } -static inline void console_replay_all(void) + +static inline void console_try_replay_all(void) { } + +static inline void printk_legacy_allow_panic_sync(void) +{ +} + +static inline bool nbcon_device_try_acquire(struct console *con) +{ + return false; +} + +static inline void nbcon_device_release(struct console *con) +{ +} + +static inline void nbcon_atomic_flush_unsafe(void) +{ +} + #endif bool this_cpu_in_panic(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index a5f4b48fca184..913f76cf0107a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,7 @@ struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; +struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; @@ -734,6 +736,12 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + struct wake_q_node { struct wake_q_node *next; }; @@ -975,7 +983,9 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif - +#ifdef CONFIG_PREEMPT_RT + struct netdev_xmit net_xmit; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. 
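nbcon_device_try_acquire()/nbcon_device_release(), declared above, let non-printing driver code own an nbcon console around direct hardware access. A rough sketch follows; the console pointer handling and the register helper are invented, and the loop mirrors what the uart port lock wrappers later in this series do internally.

/* Illustrative only: own the nbcon console for non-printing access. */
static void my_reprogram_console_hw(struct console *con)
{
        while (!nbcon_device_try_acquire(con))
                cpu_relax();

        my_uart_write_divisor(con->data);       /* invented helper */

        nbcon_device_release(con);
}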
*/ struct restart_block restart_block; @@ -1256,6 +1266,7 @@ struct task_struct { unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS + u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; @@ -1506,6 +1517,8 @@ struct task_struct { /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif + /* Used by BPF for per-TASK xdp storage */ + struct bpf_net_context *bpf_net_context; #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; @@ -1799,6 +1812,7 @@ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpu } #endif +extern bool task_is_pi_boosted(const struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); @@ -1941,17 +1955,17 @@ static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, update_ti_thread_flag(task_thread_info(tsk), flag, value); } -static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) +static inline bool test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } -static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) +static inline bool test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } -static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) +static inline bool test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } @@ -1964,9 +1978,11 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) static inline void clear_tsk_need_resched(struct task_struct *tsk) { clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + if (IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO)) + clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY); } -static inline int test_tsk_need_resched(struct task_struct *tsk) +static inline bool test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } @@ -2107,7 +2123,7 @@ static inline bool preempt_model_preemptible(void) static __always_inline bool need_resched(void) { - return unlikely(tif_need_resched()); + return unlikely(tif_need_resched_lazy() || tif_need_resched()); } /* diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index e670ac282333e..00fed88e7671b 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -63,7 +63,7 @@ static __always_inline bool __must_check current_set_polling_and_test(void) */ smp_mb__after_atomic(); - return unlikely(tif_need_resched()); + return unlikely(need_resched()); } static __always_inline bool __must_check current_clr_polling_and_test(void) @@ -76,7 +76,7 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) */ smp_mb__after_atomic(); - return unlikely(tif_need_resched()); + return unlikely(need_resched()); } #else @@ -85,11 +85,11 @@ static inline void __current_clr_polling(void) { } static inline bool __must_check current_set_polling_and_test(void) { - return unlikely(tif_need_resched()); + return unlikely(need_resched()); } static inline bool __must_check current_clr_polling_and_test(void) { - return unlikely(tif_need_resched()); + return unlikely(need_resched()); } #endif diff --git a/include/linux/serial_8250.h 
b/include/linux/serial_8250.h index fd59ed2cca53e..b74eae9c632be 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -153,6 +153,8 @@ struct uart_8250_port { #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + bool console_newline_needed; + struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @@ -204,6 +206,10 @@ void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); +void serial8250_console_write_atomic(struct uart_8250_port *up, + struct nbcon_write_context *wctxt); +void serial8250_console_write_thread(struct uart_8250_port *up, + struct nbcon_write_context *wctxt); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index aea25eef9a1a7..4ab65874a850b 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -590,6 +592,95 @@ struct uart_port { void *private_data; /* generic platform data pointer */ }; +/* + * Only for console->device_lock()/_unlock() callbacks and internal + * port lock wrapper synchronization. + */ +static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) +{ + spin_lock_irqsave(&up->lock, *flags); +} + +/* + * Only for console->device_lock()/_unlock() callbacks and internal + * port lock wrapper synchronization. + */ +static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) +{ + spin_unlock_irqrestore(&up->lock, flags); +} + +/** + * uart_port_set_cons - Safely set the @cons field for a uart + * @up: The uart port to set + * @con: The new console to set to + * + * This function must be used to set @up->cons. It uses the port lock to + * synchronize with the port lock wrappers in order to ensure that the console + * cannot change or disappear while another context is holding the port lock. + */ +static inline void uart_port_set_cons(struct uart_port *up, struct console *con) +{ + unsigned long flags; + + __uart_port_lock_irqsave(up, &flags); + up->cons = con; + __uart_port_unlock_irqrestore(up, flags); +} + +/* Only for internal port lock wrapper usage. */ +static inline bool __uart_port_using_nbcon(struct uart_port *up) +{ + lockdep_assert_held_once(&up->lock); + + if (likely(!uart_console(up))) + return false; + + /* + * @up->cons is only modified under the port lock. Therefore it is + * certain that it cannot disappear here. + * + * @up->cons->node is added/removed from the console list under the + * port lock. Therefore it is certain that the registration status + * cannot change here, thus @up->cons->flags can be read directly. + */ + if (hlist_unhashed_lockless(&up->cons->node) || + !(up->cons->flags & CON_NBCON) || + !up->cons->write_atomic) { + return false; + } + + return true; +} + +/* Only for internal port lock wrapper usage. */ +static inline bool __uart_port_nbcon_try_acquire(struct uart_port *up) +{ + if (!__uart_port_using_nbcon(up)) + return true; + + return nbcon_device_try_acquire(up->cons); +} + +/* Only for internal port lock wrapper usage. 
*/ +static inline void __uart_port_nbcon_acquire(struct uart_port *up) +{ + if (!__uart_port_using_nbcon(up)) + return; + + while (!nbcon_device_try_acquire(up->cons)) + cpu_relax(); +} + +/* Only for internal port lock wrapper usage. */ +static inline void __uart_port_nbcon_release(struct uart_port *up) +{ + if (!__uart_port_using_nbcon(up)) + return; + + nbcon_device_release(up->cons); +} + /** * uart_port_lock - Lock the UART port * @up: Pointer to UART port structure @@ -597,6 +688,7 @@ struct uart_port { static inline void uart_port_lock(struct uart_port *up) { spin_lock(&up->lock); + __uart_port_nbcon_acquire(up); } /** @@ -606,6 +698,7 @@ static inline void uart_port_lock(struct uart_port *up) static inline void uart_port_lock_irq(struct uart_port *up) { spin_lock_irq(&up->lock); + __uart_port_nbcon_acquire(up); } /** @@ -616,6 +709,7 @@ static inline void uart_port_lock_irq(struct uart_port *up) static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) { spin_lock_irqsave(&up->lock, *flags); + __uart_port_nbcon_acquire(up); } /** @@ -626,7 +720,15 @@ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *f */ static inline bool uart_port_trylock(struct uart_port *up) { - return spin_trylock(&up->lock); + if (!spin_trylock(&up->lock)) + return false; + + if (!__uart_port_nbcon_try_acquire(up)) { + spin_unlock(&up->lock); + return false; + } + + return true; } /** @@ -638,7 +740,15 @@ static inline bool uart_port_trylock(struct uart_port *up) */ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) { - return spin_trylock_irqsave(&up->lock, *flags); + if (!spin_trylock_irqsave(&up->lock, *flags)) + return false; + + if (!__uart_port_nbcon_try_acquire(up)) { + spin_unlock_irqrestore(&up->lock, *flags); + return false; + } + + return true; } /** @@ -647,6 +757,7 @@ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long */ static inline void uart_port_unlock(struct uart_port *up) { + __uart_port_nbcon_release(up); spin_unlock(&up->lock); } @@ -656,6 +767,7 @@ static inline void uart_port_unlock(struct uart_port *up) */ static inline void uart_port_unlock_irq(struct uart_port *up) { + __uart_port_nbcon_release(up); spin_unlock_irq(&up->lock); } @@ -666,6 +778,7 @@ static inline void uart_port_unlock_irq(struct uart_port *up) */ static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) { + __uart_port_nbcon_release(up); spin_unlock_irqrestore(&up->lock, flags); } diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 795ef5a684294..cf5e7e891a776 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -18,6 +18,7 @@ enum task_work_notify_mode { TWA_RESUME, TWA_SIGNAL, TWA_SIGNAL_NO_IPI, + TWA_NMI_CURRENT, }; static inline bool task_work_pending(struct task_struct *task) @@ -30,7 +31,8 @@ int task_work_add(struct task_struct *task, struct callback_head *twork, struct callback_head *task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data); -struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); +struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t); +bool task_work_cancel(struct task_struct *task, struct callback_head *cb); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 
9ea0b28068f49..5ded1450ac1a1 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -59,6 +59,16 @@ enum syscall_work_bit { #include +#ifdef CONFIG_PREEMPT_BUILD_AUTO +# define TIF_NEED_RESCHED_LAZY TIF_ARCH_RESCHED_LAZY +# define _TIF_NEED_RESCHED_LAZY _TIF_ARCH_RESCHED_LAZY +# define TIF_NEED_RESCHED_LAZY_OFFSET (TIF_NEED_RESCHED_LAZY - TIF_NEED_RESCHED) +#else +# define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED +# define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED +# define TIF_NEED_RESCHED_LAZY_OFFSET 0 +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data @@ -185,6 +195,13 @@ static __always_inline bool tif_need_resched(void) (unsigned long *)(¤t_thread_info()->flags)); } +static __always_inline bool tif_need_resched_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && + arch_test_bit(TIF_NEED_RESCHED_LAZY, + (unsigned long *)(¤t_thread_info()->flags)); +} + #else static __always_inline bool tif_need_resched(void) @@ -193,6 +210,13 @@ static __always_inline bool tif_need_resched(void) (unsigned long *)(¤t_thread_info()->flags)); } +static __always_inline bool tif_need_resched_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && + test_bit(TIF_NEED_RESCHED_LAZY, + (unsigned long *)(¤t_thread_info()->flags)); +} + #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626b..c52e89f407415 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -184,8 +184,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_IRQS_NOSUPPORT = 0x02, - TRACE_FLAG_NEED_RESCHED = 0x04, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, @@ -211,11 +211,11 @@ static inline unsigned int tracing_gen_ctx(void) static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); + return tracing_gen_ctx_irq_test(0); } static inline unsigned int tracing_gen_ctx(void) { - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); + return tracing_gen_ctx_irq_test(0); } #endif diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 2a536eea9424e..f88b682690127 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -93,17 +93,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state); -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo); +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm); -static inline void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) -{ - __inet_twsk_schedule(tw, timeo, false); -} - static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) { __inet_twsk_schedule(tw, timeo, true); diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 3fab9dec2ec45..888c1ce6f5272 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -19,6 +19,7 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, extern bool 
seg6_bpf_has_valid_srh(struct sk_buff *skb); struct seg6_bpf_srh_state { + local_lock_t bh_lock; struct ipv6_sr_hdr *srh; u16 hdrlen; bool valid; diff --git a/include/net/sock.h b/include/net/sock.h index 953c8dc4e259e..7d6784ebb26f5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -544,6 +544,11 @@ struct sock { netns_tracker ns_tracker; }; +struct sock_bh_locked { + struct sock *sock; + local_lock_t bh_lock; +}; + enum sk_pacing { SK_PACING_NONE = 0, SK_PACING_NEEDED = 1, diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 3d54de168a6d9..bfe625b55d55d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -121,7 +121,7 @@ struct xsk_tx_metadata_ops { int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); -void __xsk_map_flush(void); +void __xsk_map_flush(struct list_head *flush_list); /** * xsk_tx_metadata_to_compl - Save enough relevant metadata information @@ -206,7 +206,7 @@ static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) return -EOPNOTSUPP; } -static inline void __xsk_map_flush(void) +static inline void __xsk_map_flush(struct list_head *flush_list) { } @@ -228,14 +228,4 @@ static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, } #endif /* CONFIG_XDP_SOCKETS */ - -#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET) -bool xsk_map_check_flush(void); -#else -static inline bool xsk_map_check_flush(void) -{ - return false; -} -#endif - #endif /* _LINUX_XDP_SOCK_H */ diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c2f1fd95a8214..0f3d4c2a41cb7 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -11,6 +11,13 @@ config PREEMPT_BUILD select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK +config PREEMPT_BUILD_AUTO + bool + select PREEMPT_BUILD + +config HAVE_PREEMPT_AUTO + bool + choice prompt "Preemption Model" default PREEMPT_NONE @@ -67,9 +74,17 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. 
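With the flush lists moved into the per-task bpf_net_context (see the __xsk_map_flush() signature change above and the cpumap/devmap counterparts below), XDP processing is expected to run inside a set/run/flush/clear bracket. The condensed sketch below is not from the series itself; my_run_xdp_batch() and my_do_run() are invented and only show the shape of the bracketing that cpu_map_bpf_prog_run() uses below, combined with the usual xdp_do_flush() before the context is cleared.

#include <linux/filter.h>

static int my_run_xdp_batch(struct bpf_prog *prog, void **frames, int n)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int ret;

        rcu_read_lock_bh();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        ret = my_do_run(prog, frames, n);       /* may queue redirects */

        /* Drain whichever dev/cpu/xsk map flush lists were used. */
        xdp_do_flush();

        bpf_net_ctx_clear(bpf_net_ctx);
        rcu_read_unlock_bh();

        return ret;
}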
+config PREEMPT_AUTO + bool "Automagic preemption mode with runtime tweaking support" + depends on HAVE_PREEMPT_AUTO + select PREEMPT_BUILD_AUTO + help + Add some sensible blurb here + config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" depends on EXPERT && ARCH_SUPPORTS_RT + select PREEMPT_BUILD_AUTO if HAVE_PREEMPT_AUTO select PREEMPTION help This option turns the kernel into a real-time kernel by replacing @@ -95,7 +110,7 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" - depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD default y if HAVE_PREEMPT_DYNAMIC_CALL diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a8e34416e960f..fbdf5a1aabfe4 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -79,8 +79,6 @@ struct bpf_cpu_map { struct bpf_cpu_map_entry __rcu **cpu_map; }; -static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); - static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; @@ -240,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, int xdp_n, struct xdp_cpumap_stats *stats, struct list_head *list) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int nframes; if (!rcpu->prog) return xdp_n; rcu_read_lock_bh(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); @@ -255,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (unlikely(!list_empty(list))) cpu_map_bpf_prog_run_skb(rcpu, list, stats); + bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; @@ -706,7 +707,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) */ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) @@ -723,8 +723,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) */ bq->q[bq->count++] = xdpf; - if (!bq->flush_node.prev) + if (!bq->flush_node.prev) { + struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list(); + list_add(&bq->flush_node, flush_list); + } } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -756,9 +759,8 @@ int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, return ret; } -void __cpu_map_flush(void) +void __cpu_map_flush(struct list_head *flush_list) { - struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -768,24 +770,3 @@ void __cpu_map_flush(void) wake_up_process(bq->obj->kthread); } } - -#ifdef CONFIG_DEBUG_NET -bool cpu_map_check_flush(void) -{ - if (list_empty(this_cpu_ptr(&cpu_map_flush_list))) - return false; - __cpu_map_flush(); - return true; -} -#endif - -static int __init cpu_map_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); - return 0; -} - -subsys_initcall(cpu_map_init); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7f3b34452243c..b18d4a14a0a70 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -83,7 +83,6 @@ struct bpf_dtab { u32 n_buckets; }; -static 
DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -196,7 +195,14 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); + /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() + * during NAPI callback and cleared after the XDP redirect. There is no + * explicit RCU read section which protects bpf_redirect_info->map but + * local_bh_disable() also marks the beginning an RCU section. This + * makes the complete softirq callback RCU protected. Thus after + * following synchronize_rcu() there no bpf_redirect_info->map == map + * assignment. + */ synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ @@ -406,9 +412,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ -void __dev_flush(void) +void __dev_flush(struct list_head *flush_list) { - struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { @@ -419,16 +424,6 @@ void __dev_flush(void) } } -#ifdef CONFIG_DEBUG_NET -bool dev_check_flush(void) -{ - if (list_empty(this_cpu_ptr(&dev_flush_list))) - return false; - __dev_flush(); - return true; -} -#endif - /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. @@ -453,7 +448,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) @@ -467,6 +461,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, * are only ever modified together. 
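The new comment in dev_map_free() above argues that local_bh_disable() marks an RCU read-side section, so a synchronize_rcu() after unpublishing the map is enough to guarantee that no bpf_redirect_info::map reference remains. A condensed sketch of the two sides of that argument, with all function names invented:

/* Reader side (XDP fast path, softirq context). */
static void my_xdp_rx(struct xdp_buff *xdp)
{
        local_bh_disable();             /* implies an RCU read-side section */
        my_run_xdp_and_redirect(xdp);   /* may publish ri->map = map */
        local_bh_enable();
}

/* Writer side (map teardown). */
static void my_map_teardown(struct bpf_map *map)
{
        my_unpublish_map(map);          /* no new ri->map = map after this */
        synchronize_rcu();              /* waits for all such BH sections */
        my_free_map(map);               /* safe: no reference can remain */
}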
*/ if (!bq->dev_rx) { + struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); + bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); @@ -1153,15 +1149,11 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { - int cpu; - /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 90843cc385880..3f31e6b42b328 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_UPROBE) @@ -307,7 +307,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (test_tsk_need_resched(current)) preempt_schedule_irq(); } } diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 2e0f75bcb7fd1..d952fa5ee8801 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return -EINTR; } - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & (_TIF_NEED_RESCHED | TIF_NEED_RESCHED_LAZY)) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 1273be84392cf..ad57944b6c40e 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -29,7 +29,7 @@ static inline size_t perf_callchain_entry__sizeof(void) sysctl_perf_event_max_contexts_per_stack)); } -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f908f0779354..0acf6ee4df528 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) state = PERF_EVENT_STATE_OFF; } - if (event->pending_sigtrap) { - bool dec = true; - - event->pending_sigtrap = 0; - if (state != PERF_EVENT_STATE_OFF && - !event->pending_work) { - event->pending_work = 1; - dec = false; - WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); - task_work_add(current, &event->pending_task, TWA_RESUME); - } - if (dec) - local_dec(&event->ctx->nr_pending); - } - perf_event_set_state(event, state); if (!is_software_event(event)) @@ -2466,7 +2451,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_irq it's OK because event->ctx + * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. 
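__perf_event_overflow() below queues its SIGTRAP delivery as task work, using the new TWA_NMI_CURRENT notify mode when the overflow fires in NMI context. Reduced to the bare pattern (struct my_event, my_deliver_signal() and the init site are invented), that looks like:

#include <linux/task_work.h>

struct my_event {
        unsigned int            pending_work;
        struct callback_head    pending_task;   /* init_task_work() at creation */
};

/* Runs in task context, on the way back to user space. */
static void my_pending_task(struct callback_head *head)
{
        struct my_event *e = container_of(head, struct my_event, pending_task);

        if (e->pending_work) {
                e->pending_work = 0;
                my_deliver_signal(e);           /* invented */
        }
}

/* Overflow interrupt, possibly running in NMI context. */
static void my_event_overflow(struct my_event *e)
{
        enum task_work_notify_mode notify_mode;

        notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;

        if (!e->pending_work &&
            !task_work_add(current, &e->pending_task, notify_mode))
                e->pending_work = 1;
}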
*/ @@ -2506,7 +2491,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; - irq_work_queue(&event->pending_irq); + irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -5206,9 +5191,35 @@ static bool exclusive_event_installable(struct perf_event *event, static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); +static void perf_pending_task_sync(struct perf_event *event) +{ + struct callback_head *head = &event->pending_task; + + if (!event->pending_work) + return; + /* + * If the task is queued to the current task's queue, we + * obviously can't wait for it to complete. Simply cancel it. + */ + if (task_work_cancel(current, head)) { + event->pending_work = 0; + local_dec(&event->ctx->nr_pending); + return; + } + + /* + * All accesses related to the event are within the same RCU section in + * perf_pending_task(). The RCU grace period before the event is freed + * will make sure all those accesses are complete by then. + */ + rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); +} + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); unaccount_event(event); @@ -6750,7 +6761,7 @@ static void perf_sigtrap(struct perf_event *event) /* * Deliver the pending work in-event-context or follow the context. */ -static void __perf_pending_irq(struct perf_event *event) +static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); @@ -6765,11 +6776,6 @@ static void __perf_pending_irq(struct perf_event *event) * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { - if (event->pending_sigtrap) { - event->pending_sigtrap = 0; - perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); - } if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); @@ -6793,11 +6799,26 @@ static void __perf_pending_irq(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_irq() + * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending_irq, cpu); + irq_work_queue_on(&event->pending_disable_irq, cpu); +} + +static void perf_pending_disable(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + rctx = perf_swevent_get_recursion_context(); + __perf_pending_disable(event); + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) @@ -6820,8 +6841,6 @@ static void perf_pending_irq(struct irq_work *entry) perf_event_wakeup(event); } - __perf_pending_irq(event); - if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } @@ -6831,24 +6850,28 @@ static void perf_pending_task(struct callback_head *head) struct perf_event *event = container_of(head, struct perf_event, pending_task); int rctx; + /* + * All accesses to the event must belong to the same implicit RCU read-side + * critical section as the ->pending_work reset. See comment in + * perf_pending_task_sync(). 
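perf_pending_task_sync() above shows the teardown side of that pattern: either the queued task work can still be cancelled, or the caller must wait until the callback has consumed ->pending_work. Continuing the invented my_event sketch, and assuming it also carries a struct rcuwait pending_work_wait member that the callback wakes after clearing ->pending_work (as perf_pending_task() does):

#include <linux/rcuwait.h>
#include <linux/task_work.h>

static void my_pending_work_sync(struct my_event *e)
{
        if (!e->pending_work)
                return;

        /* Queued on the current task: cancelling it is enough. */
        if (task_work_cancel(current, &e->pending_task)) {
                e->pending_work = 0;
                return;
        }

        /* Otherwise wait for the callback to finish and wake us. */
        rcuwait_wait_event(&e->pending_work_wait, !e->pending_work,
                           TASK_UNINTERRUPTIBLE);
}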
+ */ + rcu_read_lock(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ - preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_pending); + rcuwait_wake_up(&event->pending_work_wait); } + rcu_read_unlock(); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); - preempt_enable_notrace(); - - put_event(event); } #ifdef CONFIG_GUEST_PERF_EVENTS @@ -9706,16 +9729,26 @@ static int __perf_event_overflow(struct perf_event *event, */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; + enum task_work_notify_mode notify_mode; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; - if (!event->pending_sigtrap) { - event->pending_sigtrap = pending_id; + + notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME; + + if (!event->pending_work && + !task_work_add(current, &event->pending_task, notify_mode)) { + event->pending_work = pending_id; local_inc(&event->ctx->nr_pending); + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without - * consuming pending_sigtrap; with exceptions: + * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. @@ -9725,13 +9758,8 @@ static int __perf_event_overflow(struct perf_event *event, * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ - WARN_ON_ONCE(event->pending_sigtrap != pending_id); + WARN_ON_ONCE(event->pending_work != pending_id); } - - event->pending_addr = 0; - if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) - event->pending_addr = data->addr; - irq_work_queue(&event->pending_irq); } READ_ONCE(event->overflow_handler)(event, data, regs); @@ -9759,11 +9787,7 @@ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; }; - static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* @@ -9961,17 +9985,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - return get_recursion_context(swhash->recursion); + return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - put_recursion_context(swhash->recursion, rctx); + put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) @@ -11961,7 +11981,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); + rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -13637,6 +13659,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; + memset(child->perf_recursion, 0, 
sizeof(child->perf_recursion)); child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5150d5f84c033..f0daaa6f2a33b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -208,7 +208,7 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) -static inline int get_recursion_context(int *recursion) +static inline int get_recursion_context(u8 *recursion) { unsigned char rctx = interrupt_context_level(); @@ -221,7 +221,7 @@ static inline int get_recursion_context(int *recursion) return rctx; } -static inline void put_recursion_context(int *recursion, int rctx) +static inline void put_recursion_context(u8 *recursion, unsigned char rctx) { barrier(); recursion[rctx]--; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 71b0fc2d0aeaa..dd53298ef1a5c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1337,7 +1337,7 @@ static int irq_thread(void *data) * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ - task_work_cancel(current, irq_thread_dtor); + task_work_cancel_func(current, irq_thread_dtor); return 0; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 07fb5987b42bd..f01bf89f48d24 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -181,6 +181,15 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif /* CONFIG_VMCORE_INFO */ +#if defined(CONFIG_PREEMPT_RT) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -278,6 +287,9 @@ static struct attribute * kernel_attrs[] = { #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, #endif NULL }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 151bd3de59363..c06842e037d89 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -574,8 +575,10 @@ static struct lock_trace *save_trace(void) if (!debug_locks_off_graph_unlock()) return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } @@ -888,11 +891,13 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { instrumentation_begin(); debug_locks_off(); + nbcon_cpu_emergency_enter(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); instrumentation_end(); return NULL; } @@ -969,11 +974,13 @@ static bool assign_lock_key(struct lockdep_map *lock) else { /* Debug-check: all keys must be persistent! 
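All of the lockdep hunks in this area follow one pattern: a multi-line diagnostic report is bracketed by nbcon_cpu_emergency_enter()/nbcon_cpu_emergency_exit() so that nbcon consoles can flush it as a unit, and long loops additionally call nbcon_cpu_emergency_flush(). A condensed sketch of the pattern, with an invented report:

#include <linux/console.h>

static void my_print_report(void)
{
        nbcon_cpu_emergency_enter();

        pr_warn("=========================\n");
        pr_warn("WARNING: example report\n");
        pr_warn("=========================\n");
        dump_stack();

        nbcon_cpu_emergency_exit();
}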
*/ debug_locks_off(); + nbcon_cpu_emergency_enter(); pr_err("INFO: trying to register non-static key.\n"); pr_err("The code is fine but needs lockdep annotation, or maybe\n"); pr_err("you didn't initialize this object before use?\n"); pr_err("turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); return false; } @@ -1317,8 +1324,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_lock_classes++; @@ -1350,11 +1359,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); + nbcon_cpu_emergency_enter(); printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); if (!graph_lock()) { return NULL; @@ -1393,8 +1404,10 @@ static struct lock_list *alloc_list_entry(void) if (!debug_locks_off_graph_unlock()) return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_list_entries++; @@ -2040,6 +2053,8 @@ static noinline void print_circular_bug(struct lock_list *this, depth = get_lock_depth(target); + nbcon_cpu_emergency_enter(); + print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); @@ -2058,6 +2073,8 @@ static noinline void print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static noinline void print_bfs_bug(int ret) @@ -2570,6 +2587,8 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", @@ -2619,11 +2638,13 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn(" and %s-irq-unsafe lock:\n", irqclass); next_root->trace = save_trace(); if (!next_root->trace) - return; + goto out; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); } static const char *state_names[] = { @@ -2988,6 +3009,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); @@ -3010,6 +3033,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } /* @@ -3607,6 +3632,8 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); @@ -3623,6 +3650,8 @@ static void print_collision(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } #endif @@ -3713,8 +3742,10 @@ static inline int add_chain_cache(struct task_struct *curr, if (!debug_locks_off_graph_unlock()) return 0; + 
nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } chain->chain_key = chain_key; @@ -3731,8 +3762,10 @@ static inline int add_chain_cache(struct task_struct *curr, if (!debug_locks_off_graph_unlock()) return 0; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } @@ -3971,6 +4004,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); @@ -3999,6 +4034,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } /* @@ -4033,6 +4070,8 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); @@ -4073,11 +4112,13 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); root->trace = save_trace(); if (!root->trace) - return; + goto out; print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); } /* @@ -4154,6 +4195,8 @@ void print_irqtrace_events(struct task_struct *curr) { const struct irqtrace_events *trace = &curr->irqtrace; + nbcon_cpu_emergency_enter(); + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, @@ -4167,6 +4210,8 @@ void print_irqtrace_events(struct task_struct *curr) printk("softirqs last disabled at (%u): [<%px>] %pS\n", trace->softirq_disable_event, (void *)trace->softirq_disable_ip, (void *)trace->softirq_disable_ip); + + nbcon_cpu_emergency_exit(); } static int HARDIRQ_verbose(struct lock_class *class) @@ -4687,10 +4732,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * We must printk outside of the graph_lock: */ if (ret == 2) { + nbcon_cpu_emergency_enter(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); dump_stack(); + nbcon_cpu_emergency_exit(); } return ret; @@ -4731,6 +4778,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, if (debug_locks_silent) return 0; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("[ BUG: Invalid wait context ]\n"); @@ -4750,6 +4799,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, pr_warn("stack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); + return 0; } @@ -4954,6 +5005,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); @@ -4974,6 +5027,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static int __lock_is_held(const struct lockdep_map *lock, int read); @@ -5019,11 +5074,13 @@ static int __lock_acquire(struct 
lockdep_map *lock, unsigned int subclass, debug_class_ops_inc(class); if (very_verbose(class)) { + nbcon_cpu_emergency_enter(); printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); } /* @@ -5150,6 +5207,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); @@ -5157,6 +5215,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockdep_print_held_locks(current); debug_show_all_locks(); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } @@ -5176,6 +5235,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); @@ -5192,6 +5253,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static noinstr int match_held_lock(const struct held_lock *hlock, @@ -5895,6 +5958,8 @@ static void print_lock_contention_bug(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); @@ -5911,6 +5976,8 @@ static void print_lock_contention_bug(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static void @@ -6524,6 +6591,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); @@ -6536,6 +6605,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static inline int not_in_range(const void* mem_from, unsigned long mem_len, @@ -6582,6 +6653,8 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", @@ -6591,6 +6664,8 @@ static void print_held_locks_bug(void) lockdep_print_held_locks(current); pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } void debug_check_no_locks_held(void) @@ -6616,6 +6691,7 @@ void debug_show_all_locks(void) if (!p->lockdep_depth) continue; lockdep_print_held_locks(p); + nbcon_cpu_emergency_flush(); touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); } @@ -6648,6 +6724,7 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); @@ -6656,6 +6733,7 @@ asmlinkage __visible void lockdep_sys_exit(void) pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); + nbcon_cpu_emergency_exit(); } /* @@ -6672,6 +6750,7 
@@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); @@ -6710,6 +6789,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 8475a0794f8c5..438c6086d540e 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -413,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr) && addr < (unsigned long)__lock_text_end; } EXPORT_SYMBOL(in_lock_functions); + +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT) +void notrace lockdep_assert_in_softirq_func(void) +{ + lockdep_assert_in_softirq(); +} +EXPORT_SYMBOL(lockdep_assert_in_softirq_func); +#endif diff --git a/kernel/panic.c b/kernel/panic.c index 8bff183d6180e..7e207092576be 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -367,6 +367,8 @@ void panic(const char *fmt, ...) panic_other_cpus_shutdown(_crash_kexec_post_notifiers); + printk_legacy_allow_panic_sync(); + /* * Run any panic handlers, including those that might need to * add information to the kmsg dump output. @@ -456,6 +458,7 @@ void panic(const char *fmt, ...) * Explicitly flush the kernel log buffer one last time. */ console_flush_on_panic(CONSOLE_FLUSH_PENDING); + nbcon_atomic_flush_unsafe(); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { @@ -634,6 +637,7 @@ bool oops_may_print(void) */ void oops_enter(void) { + nbcon_cpu_emergency_enter(); tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); @@ -656,6 +660,7 @@ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); + nbcon_cpu_emergency_exit(); kmsg_dump(KMSG_DUMP_OOPS); } @@ -667,6 +672,8 @@ struct warn_args { void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args) { + nbcon_cpu_emergency_enter(); + disable_trace_on_warning(); if (file) @@ -702,6 +709,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); + + nbcon_cpu_emergency_exit(); } #ifdef CONFIG_BUG diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 6c2afee5ef620..48c3564f95ebe 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -2,11 +2,13 @@ /* * internal.h - printk internal definitions */ -#include #include -#include "printk_ringbuffer.h" +#include +#include +#include #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +struct ctl_table; void __init printk_sysctl_init(void); int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); @@ -20,6 +22,13 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, (con->flags & CON_BOOT) ? 
"boot" : "", \ con->name, con->index, ##__VA_ARGS__) +#ifdef CONFIG_PREEMPT_RT +# define force_printkthreads() (true) +#else +DECLARE_STATIC_KEY_FALSE(force_printkthreads_key); +# define force_printkthreads() (static_branch_unlikely(&force_printkthreads_key)) +#endif + #ifdef CONFIG_PRINTK #ifdef CONFIG_PRINTK_CALLER @@ -43,7 +52,11 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; +struct printk_ringbuffer; +struct dev_printk_info; + extern struct printk_ringbuffer *prb; +extern bool printk_threads_enabled; __printf(4, 0) int vprintk_store(int facility, int level, @@ -53,6 +66,9 @@ int vprintk_store(int facility, int level, __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); +void __printk_safe_enter(void); +void __printk_safe_exit(void); + bool printk_percpu_data_ready(void); #define printk_safe_enter_irqsave(flags) \ @@ -69,14 +85,83 @@ bool printk_percpu_data_ready(void); void defer_console_output(void); +bool is_printk_deferred(void); + u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); +void console_lock_spinning_enable(void); +int console_lock_spinning_disable_and_check(int cookie); u64 nbcon_seq_read(struct console *con); void nbcon_seq_force(struct console *con, u64 seq); bool nbcon_alloc(struct console *con); -void nbcon_init(struct console *con); +void nbcon_init(struct console *con, u64 init_seq); void nbcon_free(struct console *con); +enum nbcon_prio nbcon_get_default_prio(void); +void nbcon_atomic_flush_pending(void); +bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic); +void nbcon_kthread_create(struct console *con); +void nbcon_wake_threads(void); +void nbcon_legacy_kthread_create(void); + +/* + * Check if the given console is currently capable and allowed to print + * records. Note that this function does not consider the current context, + * which can also play a role in deciding if @con can be used to print + * records. + */ +static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) +{ + if (!(flags & CON_ENABLED)) + return false; + + if ((flags & CON_SUSPENDED)) + return false; + + if (flags & CON_NBCON) { + if (use_atomic) { + if (!con->write_atomic) + return false; + } else { + if (!con->write_thread) + return false; + } + } else { + if (!con->write) + return false; + } + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) + return false; + + return true; +} + +/** + * nbcon_kthread_wake - Wake up a printk thread + * @con: Console to operate on + */ +static inline void nbcon_kthread_wake(struct console *con) +{ + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the rcuwait is empty. + * + * The full memory barrier in rcuwait_wake_up() pairs with the full + * memory barrier within set_current_state() of + * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() + * adds the waiter but before it has checked the wait condition. + * + * This pairs with nbcon_kthread_func:A. 
+ */ + rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ +} #else @@ -84,6 +169,10 @@ void nbcon_free(struct console *con); #define PRINTK_MESSAGE_MAX 0 #define PRINTKRB_RECORD_MAX 0 +static inline void nbcon_kthread_wake(struct console *con) { } +static inline void nbcon_kthread_create(struct console *con) { } +#define printk_threads_enabled (false) + /* * In !PRINTK builds we still export console_sem * semaphore and some of console functions (console_unlock()/etc.), so @@ -96,11 +185,29 @@ static inline bool printk_percpu_data_ready(void) { return false; } static inline u64 nbcon_seq_read(struct console *con) { return 0; } static inline void nbcon_seq_force(struct console *con, u64 seq) { } static inline bool nbcon_alloc(struct console *con) { return false; } -static inline void nbcon_init(struct console *con) { } +static inline void nbcon_init(struct console *con, u64 init_seq) { } static inline void nbcon_free(struct console *con) { } +static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } +static inline void nbcon_atomic_flush_pending(void) { } +static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic) { return false; } + +static inline bool console_is_usable(struct console *con, short flags, + bool use_atomic) { return false; } #endif /* CONFIG_PRINTK */ +extern bool have_boot_console; +extern bool have_legacy_console; + +/* + * Specifies if the console lock/unlock dance is needed for console + * printing. If @have_boot_console is true, the nbcon consoles will + * be printed serially along with the legacy consoles because nbcon + * consoles cannot print simultaneously with boot consoles. + */ +#define printing_via_unlock (have_legacy_console || have_boot_console) + extern struct printk_buffers printk_shared_pbufs; /** @@ -135,4 +242,5 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, #ifdef CONFIG_PRINTK void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); +void console_prepend_replay(struct printk_message *pmsg); #endif diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index c8093bcc01fe6..0813ce88a49c5 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -2,11 +2,26 @@ // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner -#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include #include "internal.h" +#include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles which does not depend * on the legacy style console_lock mechanism. @@ -172,9 +187,6 @@ void nbcon_seq_force(struct console *con, u64 seq) u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); - - /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */ - con->seq = 0; } /** @@ -201,6 +213,8 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) } } +bool printk_threads_enabled __ro_after_init; + /** * nbcon_context_try_acquire_direct - Try to acquire directly * @ctxt: The context of the caller @@ -531,6 +545,7 @@ static struct printk_buffers panic_nbcon_pbufs; * nbcon_context_try_acquire - Try to acquire nbcon console * @ctxt: The context of the caller * + * Context: Under @ctxt->con->device_lock() or local_irq_save(). 
* Return: True if the console was acquired. False otherwise. * * If the caller allowed an unsafe hostile takeover, on success the @@ -538,7 +553,6 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -__maybe_unused static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); @@ -824,9 +838,42 @@ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); +/** + * nbcon_reacquire - Reacquire a console after losing ownership while printing + * @wctxt: The write context that was handed to the write callback + * + * Since ownership can be lost at any time due to handover or takeover, a + * printing context _must_ be prepared to back out immediately and + * carefully. However, there are scenarios where the printing context must + * reacquire ownership in order to finalize or revert hardware changes. + * + * This function allows a printing context to reacquire ownership using the + * same priority as its previous ownership. + * + * Note that after a successful reacquire the printing context will have no + * output buffer because that has been lost. This function cannot be used to + * resume printing. + */ +void nbcon_reacquire(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + struct nbcon_state cur; + + while (!nbcon_context_try_acquire(ctxt)) + cpu_relax(); + + wctxt->outbuf = NULL; + wctxt->len = 0; + nbcon_state_read(con, &cur); + wctxt->unsafe_takeover = cur.unsafe_takeover; +} +EXPORT_SYMBOL_GPL(nbcon_reacquire); + /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function + * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. False if * ownership was handed over or taken. @@ -840,8 +887,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); * When true is returned, @wctxt->ctxt.backlog indicates whether there are * still records pending in the ringbuffer, */ -__maybe_unused -static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) +static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; @@ -852,7 +898,7 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) unsigned long con_dropped; struct nbcon_state cur; unsigned long dropped; - bool done; + unsigned long ulseq; /* * The printk buffers are filled within an unsafe section. This @@ -878,6 +924,28 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) if (dropped && !is_extended) console_prepend_dropped(&pmsg, dropped); + /* + * If the previous owner was assigned the same record, this context + * has taken over ownership and is replaying the record. Prepend a + * message to let the user know the record is replayed. + */ + ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); + if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { + console_prepend_replay(&pmsg); + } else { + /* + * Ensure this context is still the owner before trying to + * update @nbcon_prev_seq. Otherwise the value in @ulseq may + * not be from the previous owner. 
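To make the nbcon_reacquire() contract described above concrete, here is a sketch of a write_thread()-style callback that loses ownership mid-transfer and reacquires solely to revert a hardware change. Every my_hw_*() helper is invented; only the nbcon_*() calls belong to the interface added in this series.

#include <linux/console.h>

static void my_hw_write_thread(struct console *con, struct nbcon_write_context *wctxt)
{
	unsigned int i;

	/* Mask device interrupts for the duration of the transfer. */
	if (!nbcon_enter_unsafe(wctxt))
		return;
	my_hw_mask_irqs(con);
	if (!nbcon_exit_unsafe(wctxt))
		goto restore;

	for (i = 0; i < wctxt->len; i++) {
		/* Every character is a potential handover/takeover point. */
		if (!nbcon_enter_unsafe(wctxt))
			goto restore;
		my_hw_put_char(con, wctxt->outbuf[i]);
		if (!nbcon_exit_unsafe(wctxt))
			goto restore;
	}

	if (nbcon_enter_unsafe(wctxt)) {
		my_hw_unmask_irqs(con);
		nbcon_exit_unsafe(wctxt);
		return;
	}

restore:
	/*
	 * Ownership was lost while the interrupts were still masked.
	 * Reacquire at the previous priority purely to undo that;
	 * wctxt->outbuf is NULL afterwards, so printing is not resumed.
	 */
	nbcon_reacquire(wctxt);
	my_hw_unmask_irqs(con);
}

The nbcon_emit_next_record() hunk below treats a NULL wctxt->outbuf as lost ownership, so returning after the revert is all the callback has to do.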
+ */ + nbcon_state_read(con, &cur); + if (!nbcon_context_can_proceed(ctxt, &cur)) + return false; + + atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, + __u64seq_to_ulseq(pmsg.seq)); + } + if (!nbcon_context_exit_unsafe(ctxt)) return false; @@ -891,17 +959,32 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) nbcon_state_read(con, &cur); wctxt->unsafe_takeover = cur.unsafe_takeover; - if (con->write_atomic) { - done = con->write_atomic(con, wctxt); + if (use_atomic && + con->write_atomic) { + con->write_atomic(con, wctxt); + + } else if (!use_atomic && + con->write_thread) { + con->write_thread(con, wctxt); + } else { - nbcon_context_release(ctxt); + /* + * This function should never be called for legacy consoles. + * Handle it as if ownership was lost and try to continue. + */ WARN_ON_ONCE(1); - done = false; + nbcon_context_release(ctxt); + return false; } - /* If not done, the emit was aborted. */ - if (!done) + if (!wctxt->outbuf) { + /* + * Ownership was lost and reacquired by the driver. + * Handle it as if ownership was lost. + */ + nbcon_context_release(ctxt); return false; + } /* * Since any dropped message was successfully output, reset the @@ -928,6 +1011,674 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) return nbcon_context_exit_unsafe(ctxt); } +/** + * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup + * @con: Console to operate on + * @ctxt: The nbcon context from nbcon_context_try_acquire() + * + * Return: True if the thread should shutdown or if the console is + * allowed to print and a record is available. False otherwise. + * + * After the thread wakes up, it must first check if it should shutdown before + * attempting any printing. + */ +static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) +{ + bool ret = false; + short flags; + int cookie; + + if (kthread_should_stop()) + return true; + + cookie = console_srcu_read_lock(); + + flags = console_srcu_read_flags(con); + if (console_is_usable(con, flags, false)) { + /* Bring the sequence in @ctxt up to date */ + ctxt->seq = nbcon_seq_read(con); + + ret = prb_read_valid(prb, ctxt->seq, NULL); + } + + console_srcu_read_unlock(cookie); + return ret; +} + +/** + * nbcon_kthread_func - The printer thread function + * @__console: Console to operate on + * + * Return: 0 + */ +static int nbcon_kthread_func(void *__console) +{ + struct console *con = __console; + struct nbcon_write_context wctxt = { + .ctxt.console = con, + .ctxt.prio = NBCON_PRIO_NORMAL, + }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + short con_flags; + bool backlog; + int cookie; + int ret; + +wait_for_event: + /* + * Guarantee this task is visible on the rcuwait before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * ___rcuwait_wait_event() pairs with the full memory + * barrier within rcuwait_has_sleeper(). + * + * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. + */ + ret = rcuwait_wait_event(&con->rcuwait, + nbcon_kthread_should_wakeup(con, ctxt), + TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ + + if (kthread_should_stop()) + return 0; + + /* Wait was interrupted by a spurious signal, go back to sleep. 
*/ + if (ret) + goto wait_for_event; + + do { + backlog = false; + + cookie = console_srcu_read_lock(); + + con_flags = console_srcu_read_flags(con); + + if (console_is_usable(con, con_flags, false)) { + unsigned long lock_flags; + + con->device_lock(con, &lock_flags); + + /* + * Ensure this stays on the CPU to make handover and + * takeover possible. + */ + cant_migrate(); + + if (nbcon_context_try_acquire(ctxt)) { + /* + * If the emit fails, this context is no + * longer the owner. + */ + if (nbcon_emit_next_record(&wctxt, false)) { + nbcon_context_release(ctxt); + backlog = ctxt->backlog; + } + } + + con->device_unlock(con, lock_flags); + } + + console_srcu_read_unlock(cookie); + cond_resched(); + + } while (backlog); + + goto wait_for_event; +} + +/** + * nbcon_irq_work - irq work to wake printk thread + * @irq_work: The irq work to operate on + */ +static void nbcon_irq_work(struct irq_work *irq_work) +{ + struct console *con = container_of(irq_work, struct console, irq_work); + + nbcon_kthread_wake(con); +} + +static inline bool rcuwait_has_sleeper(struct rcuwait *w) +{ + bool has_sleeper; + + rcu_read_lock(); + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the rcuwait is empty. + * + * This full memory barrier pairs with the full memory barrier within + * set_current_state() of ___rcuwait_wait_event(), which is called + * after prepare_to_rcuwait() adds the waiter but before it has + * checked the wait condition. + * + * This pairs with nbcon_kthread_func:A. + */ + smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ + has_sleeper = !!rcu_dereference(w->task); + rcu_read_unlock(); + + return has_sleeper; +} + +/** + * nbcon_wake_threads - Wake up printing threads using irq_work + */ +void nbcon_wake_threads(void) +{ + struct console *con; + int cookie; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + /* + * Only schedule irq_work if the printing thread is + * actively waiting. If not waiting, the thread will + * notice by itself that it has work to do. + */ + if (con->kthread && rcuwait_has_sleeper(&con->rcuwait)) + irq_work_queue(&con->irq_work); + } + console_srcu_read_unlock(cookie); +} + +/* Track the nbcon emergency nesting per CPU. */ +static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); +static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; + +/** + * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer + * + * Return: Either a pointer to the per CPU emergency nesting counter of + * the current CPU or to the init data during early boot. + */ +static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) +{ + /* + * The value of __printk_percpu_data_ready gets set in normal + * context and before SMP initialization. As a result it could + * never change while inside an nbcon emergency section. + */ + if (!printk_percpu_data_ready()) + return &early_nbcon_pcpu_emergency_nesting; + + return this_cpu_ptr(&nbcon_pcpu_emergency_nesting); +} + +/** + * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon + * printing on the current CPU + * + * Context: Any context which could not be migrated to another CPU. + * Return: The nbcon_prio to use for acquiring an nbcon console in this + * context for printing. 
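The rcuwait choreography that nbcon_wake_threads() and nbcon_kthread_func() rely on above reduces to the following generic producer/consumer pattern. This is a sketch with invented names (my_waiter, new_work), not printk code; the comments restate the barrier pairing documented in the hunks.

#include <linux/kthread.h>
#include <linux/rcuwait.h>

static struct rcuwait my_waiter = __RCUWAIT_INITIALIZER(my_waiter);
static bool new_work;

static int my_consumer_thread(void *unused)
{
	int ret;

	for (;;) {
		/* The waiter is published before the condition is evaluated. */
		ret = rcuwait_wait_event(&my_waiter,
					 READ_ONCE(new_work) || kthread_should_stop(),
					 TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			return 0;
		if (ret)	/* spurious signal, go back to sleep */
			continue;

		WRITE_ONCE(new_work, false);
		/* ... consume whatever the producer published ... */
	}
}

static void my_producer(void)
{
	WRITE_ONCE(new_work, true);	/* publish the data first */
	rcuwait_wake_up(&my_waiter);	/* full barrier pairs with the waiter side */
}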
+ */ +enum nbcon_prio nbcon_get_default_prio(void) +{ + unsigned int *cpu_emergency_nesting; + + if (this_cpu_in_panic()) + return NBCON_PRIO_PANIC; + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + if (*cpu_emergency_nesting) + return NBCON_PRIO_EMERGENCY; + + return NBCON_PRIO_NORMAL; +} + +/* + * nbcon_emit_one - Print one record for an nbcon console using the + * specified callback + * @wctxt: An initialized write context struct to use for this context + * @use_atomic: True if the write_atomic() callback is to be used + * + * Return: True, when a record has been printed and there are still + * pending records. The caller might want to continue flushing. + * + * False, when there is no pending record, or when the console + * context cannot be acquired, or the ownership has been lost. + * The caller should give up. Either the job is done, cannot be + * done, or will be handled by the owning context. + * + * This is an internal helper to handle the locking of the console before + * calling nbcon_emit_next_record(). + */ +static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + if (!nbcon_context_try_acquire(ctxt)) + return false; + + /* + * nbcon_emit_next_record() returns false when the console was + * handed over or taken over. In both cases the context is no + * longer valid. + * + * The higher priority printing context takes over responsibility + * to print the pending records. + */ + if (!nbcon_emit_next_record(wctxt, use_atomic)) + return false; + + nbcon_context_release(ctxt); + + return ctxt->backlog; +} + +/** + * nbcon_legacy_emit_next_record - Print one record for an nbcon console + * in legacy contexts + * @con: The console to print on + * @handover: Will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding + * both the console_lock and the SRCU read lock. Otherwise it + * is set to false. + * @cookie: The cookie from the SRCU read lock. + * @use_atomic: True if the write_atomic() callback is to be used + * + * Context: Any context except NMI. + * Return: True, when a record has been printed and there are still + * pending records. The caller might want to continue flushing. + * + * False, when there is no pending record, or when the console + * context cannot be acquired, or the ownership has been lost. + * The caller should give up. Either the job is done, cannot be + * done, or will be handled by the owning context. + * + * This function is meant to be called by console_flush_all() to print records + * on nbcon consoles from legacy context (printing via console unlocking). + * Essentially it is the nbcon version of console_emit_next_record(). + */ +bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic) +{ + struct nbcon_write_context wctxt = { }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + unsigned long flags; + bool progress; + + ctxt->console = con; + + if (use_atomic) { + /* Use the same procedure as console_emit_next_record(). 
*/ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + stop_critical_timings(); + + ctxt->prio = nbcon_get_default_prio(); + progress = nbcon_emit_one(&wctxt, use_atomic); + + start_critical_timings(); + *handover = console_lock_spinning_disable_and_check(cookie); + printk_safe_exit_irqrestore(flags); + } else { + *handover = false; + + con->device_lock(con, &flags); + cant_migrate(); + + ctxt->prio = nbcon_get_default_prio(); + progress = nbcon_emit_one(&wctxt, use_atomic); + + con->device_unlock(con, flags); + } + + return progress; +} + +/** + * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its + * write_atomic() callback + * @con: The nbcon console to flush + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + * + * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on + * failure. + * + * Errors: + * + * -EPERM: Unable to acquire console ownership. + * + * -EAGAIN: Another context took over ownership while printing. + * + * -ENOENT: A record before @stop_seq is not available. + * + * If flushing up to @stop_seq was not successful, it only makes sense for the + * caller to try again when -EAGAIN was returned. When -EPERM is returned, + * this context is not allowed to acquire the console. When -ENOENT is + * returned, it cannot be expected that the unfinalized record will become + * available. + */ +static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, + bool allow_unsafe_takeover) +{ + struct nbcon_write_context wctxt = { }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + int err = 0; + + ctxt->console = con; + ctxt->spinwait_max_us = 2000; + ctxt->prio = nbcon_get_default_prio(); + ctxt->allow_unsafe_takeover = allow_unsafe_takeover; + + if (!nbcon_context_try_acquire(ctxt)) + return -EPERM; + + while (nbcon_seq_read(con) < stop_seq) { + /* + * nbcon_emit_next_record() returns false when the console was + * handed over or taken over. In both cases the context is no + * longer valid. + */ + if (!nbcon_emit_next_record(&wctxt, true)) + return -EAGAIN; + + if (!ctxt->backlog) { + /* Are there reserved but not yet finalized records? */ + if (nbcon_seq_read(con) < stop_seq) + err = -ENOENT; + break; + } + } + + nbcon_context_release(ctxt); + return err; +} + +/** + * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its + * write_atomic() callback + * @con: The nbcon console to flush + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + * + * This will stop flushing before @stop_seq if another context has ownership. + * That context is then responsible for the flushing. Likewise, if new records + * are added while this context was flushing and there is no other context + * to handle the printing, this context must also flush those records. + */ +static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, + bool allow_unsafe_takeover) +{ + unsigned long flags; + int err; + +again: + /* + * Atomic flushing does not use console driver synchronization (i.e. + * it does not hold the port lock for uart consoles). Therefore IRQs + * must be disabled to avoid being interrupted and then calling into + * a driver that will deadlock trying to acquire console ownership. 
+ */ + local_irq_save(flags); + + err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + + local_irq_restore(flags); + + /* + * If there was a new owner (-EPERM, -EAGAIN), that context is + * responsible for completing. + * + * Do not wait for records not yet finalized (-ENOENT) to avoid a + * possible deadlock. They will either get flushed by the writer or + * eventually skipped on panic CPU. + */ + if (err) + return; + + /* + * If flushing was successful but more records are available, this + * context must flush those remaining records if the printer thread + * is not available do it. + */ + if ((!con->kthread || (system_state > SYSTEM_RUNNING)) && + prb_read_valid(prb, nbcon_seq_read(con), NULL)) { + stop_seq = prb_next_reserve_seq(prb); + goto again; + } +} + +/** + * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their + * write_atomic() callback + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + */ +static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) +{ + struct console *con; + int cookie; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + + if (!(flags & CON_NBCON)) + continue; + + if (!console_is_usable(con, flags, true)) + continue; + + if (nbcon_seq_read(con) >= stop_seq) + continue; + + nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + } + console_srcu_read_unlock(cookie); +} + +/** + * nbcon_atomic_flush_pending - Flush all nbcon consoles using their + * write_atomic() callback + * + * Flush the backlog up through the currently newest record. Any new + * records added while flushing will not be flushed. This is to avoid + * one CPU printing unbounded because other CPUs continue to add records. + */ +void nbcon_atomic_flush_pending(void) +{ + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); +} + +/** + * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their + * write_atomic() callback and allowing unsafe hostile takeovers + * + * Flush the backlog up through the currently newest record. Unsafe hostile + * takeovers will be performed, if necessary. + */ +void nbcon_atomic_flush_unsafe(void) +{ + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); +} + +/** + * nbcon_cpu_emergency_enter - Enter an emergency section where printk() + * messages for that CPU are only stored + * + * Upon exiting the emergency section, all stored messages are flushed. + * + * Context: Any context. Disables preemption. + * + * When within an emergency section, no printing occurs on that CPU. This + * is to allow all emergency messages to be dumped into the ringbuffer before + * flushing the ringbuffer. The actual printing occurs when exiting the + * outermost emergency section. + */ +void nbcon_cpu_emergency_enter(void) +{ + unsigned int *cpu_emergency_nesting; + + preempt_disable(); + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + (*cpu_emergency_nesting)++; +} + +/** + * nbcon_cpu_emergency_exit - Exit an emergency section and flush the + * stored messages + * + * Flushing only occurs when exiting all nesting for the CPU. + * + * Context: Any context. Enables preemption. + */ +void nbcon_cpu_emergency_exit(void) +{ + unsigned int *cpu_emergency_nesting; + bool do_trigger_flush = false; + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + + /* + * Flush the messages before enabling preemtion to see them ASAP. 
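The lockdep, __warn() and oops hunks earlier in this series all follow the same calling pattern around their multi-line reports. Reduced to a template with an invented report_example_bug() helper, it looks like this: everything belonging to one report is stored first and flushed as a block when the outermost section exits.

#include <linux/printk.h>
#include <linux/sched.h>

static void report_example_bug(struct task_struct *curr)
{
	nbcon_cpu_emergency_enter();

	pr_warn("WARNING: example inconsistency detected\n");
	pr_warn("%s/%d tripped over it\n", curr->comm, task_pid_nr(curr));
	dump_stack();

	nbcon_cpu_emergency_exit();
}

Long reports that iterate over many CPUs or tasks additionally call nbcon_cpu_emergency_flush() inside the loop, as the debug_show_all_locks() hunk above does.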
+ * + * Reduce the risk of potential softlockup by using the + * flush_pending() variant which ignores messages added later. It is + * called before decrementing the counter so that the printing context + * for the emergency messages is NBCON_PRIO_EMERGENCY. + */ + if (*cpu_emergency_nesting == 1) { + nbcon_atomic_flush_pending(); + + /* + * Safely attempt to flush the legacy consoles in this + * context. Otherwise an irq_work context is triggered + * to handle it. + */ + do_trigger_flush = true; + if (!force_printkthreads() && + printing_via_unlock && + !is_printk_deferred()) { + if (console_trylock()) { + do_trigger_flush = false; + console_unlock(); + } + } + } + + if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) + (*cpu_emergency_nesting)--; + + preempt_enable(); + + if (do_trigger_flush) + printk_trigger_flush(); +} + +/** + * nbcon_cpu_emergency_flush - Explicitly flush consoles while + * within emergency context + * + * Both nbcon and legacy consoles are flushed. + * + * It should be used only when there are too many messages printed + * in emergency context, for example, printing backtraces of all + * CPUs or processes. It is typically needed when the watchdogs + * need to be touched as well. + */ +void nbcon_cpu_emergency_flush(void) +{ + bool is_emergency; + + /* + * If this context is not an emergency context, preemption might be + * enabled. To be sure, disable preemption when checking if this is + * an emergency context. + */ + preempt_disable(); + is_emergency = (*nbcon_get_cpu_emergency_nesting() != 0); + preempt_enable(); + + /* The explicit flush is needed only in the emergency context. */ + if (!is_emergency) + return; + + nbcon_atomic_flush_pending(); + + if (!force_printkthreads() && + printing_via_unlock && + !is_printk_deferred()) { + if (console_trylock()) + console_unlock(); + } +} + +/* + * nbcon_kthread_stop - Stop a printer thread + * @con: Console to operate on + */ +static void nbcon_kthread_stop(struct console *con) +{ + lockdep_assert_console_list_lock_held(); + + if (!con->kthread) + return; + + kthread_stop(con->kthread); + con->kthread = NULL; +} + +/** + * nbcon_kthread_create - Create a printer thread + * @con: Console to operate on + * + * If it fails, let the console proceed. The atomic part might + * be usable and useful. + */ +void nbcon_kthread_create(struct console *con) +{ + struct task_struct *kt; + + lockdep_assert_console_list_lock_held(); + + if (!(con->flags & CON_NBCON) || !con->write_thread) + return; + + if (!printk_threads_enabled || con->kthread) + return; + + /* + * Printer threads cannot be started as long as any boot console is + * registered because there is no way to synchronize the hardware + * registers between boot console code and regular console code. + */ + if (have_boot_console) + return; + + kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); + if (IS_ERR(kt)) { + con_printk(KERN_ERR, con, "failed to start printing thread\n"); + return; + } + + con->kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. 
+ */ + sched_set_normal(con->kthread, -20); +} + +static int __init printk_setup_threads(void) +{ + struct console *con; + + console_list_lock(); + printk_threads_enabled = true; + for_each_console(con) + nbcon_kthread_create(con); + if (force_printkthreads() && printing_via_unlock) + nbcon_legacy_kthread_create(); + console_list_unlock(); + return 0; +} +early_initcall(printk_setup_threads); + /** * nbcon_alloc - Allocate buffers needed by the nbcon console * @con: Console to allocate buffers for @@ -961,21 +1712,24 @@ bool nbcon_alloc(struct console *con) /** * nbcon_init - Initialize the nbcon console specific data * @con: Console to initialize + * @init_seq: Sequence number of the first record to be emitted * * nbcon_alloc() *must* be called and succeed before this function * is called. - * - * This function expects that the legacy @con->seq has been set. */ -void nbcon_init(struct console *con) +void nbcon_init(struct console *con, u64 init_seq) { struct nbcon_state state = { }; /* nbcon_alloc() must have been called and successful! */ BUG_ON(!con->pbufs); - nbcon_seq_force(con, con->seq); + rcuwait_init(&con->rcuwait); + init_irq_work(&con->irq_work, nbcon_irq_work); + nbcon_seq_force(con, init_seq); + atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); nbcon_state_set(con, &state); + nbcon_kthread_create(con); } /** @@ -986,6 +1740,7 @@ void nbcon_free(struct console *con) { struct nbcon_state state = { }; + nbcon_kthread_stop(con); nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ @@ -994,3 +1749,101 @@ void nbcon_free(struct console *con) con->pbufs = NULL; } + +/** + * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe + * section + * @con: The nbcon console to acquire + * + * Context: Under the locking mechanism implemented in + * @con->device_lock() including disabling migration. + * Return: True if the console was acquired. False otherwise. + * + * Console drivers will usually use their own internal synchronization + * mechasism to synchronize between console printing and non-printing + * activities (such as setting baud rates). However, nbcon console drivers + * supporting atomic consoles may also want to mark unsafe sections when + * performing non-printing activities in order to synchronize against their + * atomic_write() callback. + * + * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL + * and marks it unsafe for handover/takeover. + */ +bool nbcon_device_try_acquire(struct console *con) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); + + cant_migrate(); + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->prio = NBCON_PRIO_NORMAL; + + if (!nbcon_context_try_acquire(ctxt)) + return false; + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(nbcon_device_try_acquire); + +/** + * nbcon_device_release - Exit unsafe section and release the nbcon console + * @con: The nbcon console acquired in nbcon_device_try_acquire() + */ +void nbcon_device_release(struct console *con) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); + int cookie; + + if (!nbcon_context_exit_unsafe(ctxt)) + return; + + nbcon_context_release(ctxt); + + /* + * This context must flush any new records added while the console + * was locked. The console_srcu_read_lock must be taken to ensure + * the console is usable throughout flushing. 
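On the driver side, nbcon_device_try_acquire()/nbcon_device_release() bracket non-printing hardware access under the same lock that backs device_lock(). A sketch with an invented my_uart device follows; in this sketch contention is resolved by spinning, on the assumption that console ownership is only ever held briefly.

#include <linux/console.h>
#include <linux/spinlock.h>

struct my_uart {
	spinlock_t	lock;		/* also backs the console's device_lock() */
	struct console	*cons;		/* the registered nbcon console */
};

static void my_uart_set_baud(struct my_uart *up, unsigned int baud)
{
	unsigned long flags;

	spin_lock_irqsave(&up->lock, flags);

	/* Keep the atomic and threaded printers away while reprogramming. */
	while (!nbcon_device_try_acquire(up->cons))
		cpu_relax();

	my_uart_program_divisor(up, baud);	/* invented hardware helper */

	nbcon_device_release(up->cons);
	spin_unlock_irqrestore(&up->lock, flags);
}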
+ */ + cookie = console_srcu_read_lock(); + if (console_is_usable(con, console_srcu_read_flags(con), true) && + (!con->kthread || (system_state > SYSTEM_RUNNING)) && + prb_read_valid(prb, nbcon_seq_read(con), NULL)) { + __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); + } + console_srcu_read_unlock(cookie); +} +EXPORT_SYMBOL_GPL(nbcon_device_release); + +/** + * printk_kthread_shutdown - shutdown all threaded printers + * + * On system shutdown all threaded printers are stopped. This allows printk + * to transition back to atomic printing, thus providing a robust mechanism + * for the final shutdown/reboot messages to be output. + */ +static void printk_kthread_shutdown(void) +{ + struct console *con; + + console_list_lock(); + for_each_console(con) { + if (con->flags & CON_NBCON) + nbcon_kthread_stop(con); + } + console_list_unlock(); +} + +static struct syscore_ops printk_syscore_ops = { + .shutdown = printk_kthread_shutdown, +}; + +static int __init printk_init_ops(void) +{ + register_syscore_ops(&printk_syscore_ops); + return 0; +} +device_initcall(printk_init_ops); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dddb15f48d595..5e9778f692067 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -195,6 +195,17 @@ static int __init control_devkmsg(char *str) } __setup("printk.devkmsg=", control_devkmsg); +#if !defined(CONFIG_PREEMPT_RT) +DEFINE_STATIC_KEY_FALSE(force_printkthreads_key); + +static int __init setup_forced_printkthreads(char *arg) +{ + static_branch_enable(&force_printkthreads_key); + return 0; +} +early_param("threadprintk", setup_forced_printkthreads); +#endif + char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, @@ -282,6 +293,7 @@ EXPORT_SYMBOL(console_list_unlock); * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) + __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } @@ -295,6 +307,7 @@ EXPORT_SYMBOL(console_srcu_read_lock); * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) + __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } @@ -461,8 +474,33 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); +/* + * Specifies if a legacy console is registered. If legacy consoles are + * present, it is necessary to perform the console lock/unlock dance + * whenever console flushing should occur. + */ +bool have_legacy_console; + +/* + * Specifies if an nbcon console is registered. If nbcon consoles are present, + * synchronous printing of legacy consoles will not occur during panic until + * the backtrace has been stored to the ringbuffer. + */ +static bool have_nbcon_console; + +/* + * Specifies if a boot console is registered. If boot consoles are present, + * nbcon consoles cannot print simultaneously and must be synchronized by + * the console lock. This is because boot consoles and nbcon consoles may + * have mapped the same hardware. + */ +bool have_boot_console; + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); + +static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); + /* All 3 protected by @syslog_lock. 
*/ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; @@ -1850,7 +1888,7 @@ static bool console_waiter; * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ -static void console_lock_spinning_enable(void) +void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. @@ -1889,7 +1927,7 @@ static void console_lock_spinning_enable(void) * * Return: 1 if the lock rights were passed, 0 otherwise. */ -static int console_lock_spinning_disable_and_check(int cookie) +int console_lock_spinning_disable_and_check(int cookie) { int waiter; @@ -2300,12 +2338,30 @@ int vprintk_store(int facility, int level, return ret; } +static bool legacy_allow_panic_sync; + +/* + * This acts as a one-way switch to allow legacy consoles to print from + * the printk() caller context on a panic CPU. It also attempts to flush + * the legacy consoles in this context. + */ +void printk_legacy_allow_panic_sync(void) +{ + legacy_allow_panic_sync = true; + + if (printing_via_unlock && !in_nmi()) { + if (console_trylock()) + console_unlock(); + } +} + asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { + bool do_trylock_unlock = !force_printkthreads() && + printing_via_unlock; int printed_len; - bool in_sched = false; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2321,38 +2377,85 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; - in_sched = true; + /* If called from the scheduler, we can not call up(). */ + do_trylock_unlock = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (have_nbcon_console && !have_boot_console) { + bool is_panic_context = this_cpu_in_panic(); + + /* + * In panic, the legacy consoles are not allowed to print from + * the printk calling context unless explicitly allowed. This + * gives the safe nbcon consoles a chance to print out all the + * panic messages first. This restriction only applies if + * there are nbcon consoles registered. + */ + if (is_panic_context) + do_trylock_unlock &= legacy_allow_panic_sync; + + /* + * There are situations where nbcon atomic printing should + * happen in the printk() caller context: + * + * - When this CPU is in panic. + * + * - When booting, before the printing threads have been + * started. + * + * - During shutdown, since the printing threads may not get + * a chance to print the final messages. + * + * Note that if boot consoles are registered, the console + * lock/unlock dance must be relied upon instead because nbcon + * consoles cannot print simultaneously with boot consoles. + */ + if (is_panic_context || + !printk_threads_enabled || + (system_state > SYSTEM_RUNNING)) { + nbcon_atomic_flush_pending(); + } + } + + nbcon_wake_threads(); + + if (do_trylock_unlock) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. + * + * Also, nbcon_get_default_prio() requires migration disabled. 
*/ preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. + * + * Skip it in EMERGENCY priority. The console will be + * explicitly flushed when exiting the emergency section. */ - if (console_trylock_spinning()) - console_unlock(); + if (nbcon_get_default_prio() != NBCON_PRIO_EMERGENCY) { + if (console_trylock_spinning()) + console_unlock(); + } + preempt_enable(); } - if (in_sched) - defer_console_output(); - else + if (do_trylock_unlock) wake_up_klogd(); + else + defer_console_output(); return printed_len; } @@ -2380,6 +2483,14 @@ EXPORT_SYMBOL(_printk); static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); +static struct task_struct *nbcon_legacy_kthread; + +static inline void wake_up_legacy_kthread(void) +{ + if (nbcon_legacy_kthread) + wake_up_interruptible(&legacy_wait); +} + #else /* CONFIG_PRINTK */ #define printk_time false @@ -2393,6 +2504,8 @@ static u64 syslog_seq; static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } +static inline void nbcon_legacy_kthread_create(void) { } +static inline void wake_up_legacy_kthread(void) { } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK @@ -2608,6 +2721,8 @@ void suspend_console(void) void resume_console(void) { struct console *con; + short flags; + int cookie; if (!console_suspend_enabled) return; @@ -2624,6 +2739,20 @@ void resume_console(void) */ synchronize_srcu(&console_srcu); + /* + * Since this runs in task context, wake the threaded printers + * directly rather than scheduling irq_work to do it. + */ + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + flags = console_srcu_read_flags(con); + if (flags & CON_NBCON) + nbcon_kthread_wake(con); + } + console_srcu_read_unlock(cookie); + + wake_up_legacy_kthread(); + pr_flush(1000, true); } @@ -2638,7 +2767,9 @@ void resume_console(void) */ static int console_cpu_notify(unsigned int cpu) { - if (!cpuhp_tasks_frozen) { + if (!force_printkthreads() && + !cpuhp_tasks_frozen && + printing_via_unlock) { /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); @@ -2695,36 +2826,6 @@ int is_console_locked(void) } EXPORT_SYMBOL(is_console_locked); -/* - * Check if the given console is currently capable and allowed to print - * records. - * - * Requires the console_srcu_read_lock. - */ -static inline bool console_is_usable(struct console *con) -{ - short flags = console_srcu_read_flags(con); - - if (!(flags & CON_ENABLED)) - return false; - - if ((flags & CON_SUSPENDED)) - return false; - - if (!con->write) - return false; - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - static void __console_unlock(void) { console_locked = 0; @@ -2734,30 +2835,25 @@ static void __console_unlock(void) #ifdef CONFIG_PRINTK /* - * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". 
This - * is achieved by shifting the existing message over and inserting the dropped - * message. + * Prepend the message in @pmsg->pbufs->outbuf with the message in + * @pmsg->pbufs->scratchbuf. This is achieved by shifting the existing message + * over and inserting the scratchbuf message. * * @pmsg is the printk message to prepend. * - * @dropped is the dropped count to report in the dropped message. + * @len is the length of the message in @pmsg->pbufs->scratchbuf. * * If the message text in @pmsg->pbufs->outbuf does not have enough space for - * the dropped message, the message text will be sufficiently truncated. + * the scratchbuf message, the message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ -void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +static void __console_prepend_scratch(struct printk_message *pmsg, size_t len) { struct printk_buffers *pbufs = pmsg->pbufs; - const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; - size_t len; - - len = scnprintf(scratchbuf, scratchbuf_sz, - "** %lu printk messages dropped **\n", dropped); /* * Make sure outbuf is sufficiently large before prepending. @@ -2779,6 +2875,46 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) pmsg->outbuf_len += len; } +/* + * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". + * @pmsg->outbuf_len is updated appropriately. + * + * @pmsg is the printk message to prepend. + * + * @dropped is the dropped count to report in the dropped message. + */ +void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +{ + struct printk_buffers *pbufs = pmsg->pbufs; + const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); + char *scratchbuf = &pbufs->scratchbuf[0]; + size_t len; + + len = scnprintf(scratchbuf, scratchbuf_sz, + "** %lu printk messages dropped **\n", dropped); + + __console_prepend_scratch(pmsg, len); +} + +/* + * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". + * @pmsg->outbuf_len is updated appropriately. + * + * @pmsg is the printk message to prepend. + */ +void console_prepend_replay(struct printk_message *pmsg) +{ + struct printk_buffers *pbufs = pmsg->pbufs; + const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); + char *scratchbuf = &pbufs->scratchbuf[0]; + size_t len; + + len = scnprintf(scratchbuf, scratchbuf_sz, + "** replaying previous printk message **\n"); + + __console_prepend_scratch(pmsg, len); +} + /* * Read and format the specified record (or a later record if the specified * record is not available). @@ -2844,6 +2980,33 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, return true; } +/* + * Legacy console printing from printk() caller context does not respect + * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a + * false positive. For PREEMPT_RT the false positive condition does not + * occur. + * + * This map is used to establish LD_WAIT_SLEEP context for the console write + * callbacks when legacy printing to avoid false positive lockdep complaints, + * thus allowing lockdep to continue to function for real issues. 
+ */ +#ifdef CONFIG_PREEMPT_RT +static inline void printk_legacy_lock_map_acquire_try(void) { } +static inline void printk_legacy_lock_map_release(void) { } +#else +static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); + +static inline void printk_legacy_lock_map_acquire_try(void) +{ + lock_map_acquire_try(&printk_legacy_map); +} + +static inline void printk_legacy_lock_map_release(void) +{ + lock_map_release(&printk_legacy_map); +} +#endif /* CONFIG_PREEMPT_RT */ + /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. @@ -2893,31 +3056,45 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co con->dropped = 0; } - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - /* Do not trace print latency. */ - stop_critical_timings(); - /* Write everything out to the hardware. */ - con->write(con, outbuf, pmsg.outbuf_len); - start_critical_timings(); + if (force_printkthreads()) { + /* + * With forced threading this function is either in a thread + * or panic context. So there is no need for concern about + * printk reentrance, handovers, or lockdep complaints. + */ - con->seq = pmsg.seq + 1; + con->write(con, outbuf, pmsg.outbuf_len); + con->seq = pmsg.seq + 1; + } else { + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); - *handover = console_lock_spinning_disable_and_check(cookie); - printk_safe_exit_irqrestore(flags); + /* Do not trace print latency. */ + stop_critical_timings(); + + printk_legacy_lock_map_acquire_try(); + con->write(con, outbuf, pmsg.outbuf_len); + printk_legacy_lock_map_release(); + + start_critical_timings(); + + con->seq = pmsg.seq + 1; + + *handover = console_lock_spinning_disable_and_check(cookie); + printk_safe_exit_irqrestore(flags); + } skip: return true; } @@ -2970,13 +3147,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove cookie = console_srcu_read_lock(); for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; bool progress; - if (!console_is_usable(con)) + /* + * console_flush_all() is only for legacy consoles, + * unless the nbcon console has no kthread printer. 
+ */ + if ((flags & CON_NBCON) && con->kthread) + continue; + + if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; - progress = console_emit_next_record(con, handover, cookie); + if (flags & CON_NBCON) { + progress = nbcon_legacy_emit_next_record(con, handover, cookie, + !do_cond_resched); + printk_seq = nbcon_seq_read(con); + } else { + progress = console_emit_next_record(con, handover, cookie); + printk_seq = con->seq; + } /* * If a handover has occurred, the SRCU read lock @@ -2986,8 +3179,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove return false; /* Track the next of the highest seq flushed. */ - if (con->seq > *next_seq) - *next_seq = con->seq; + if (printk_seq > *next_seq) + *next_seq = printk_seq; if (!progress) continue; @@ -3010,19 +3203,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove return false; } -/** - * console_unlock - unblock the console subsystem from printing - * - * Releases the console_lock which the caller holds to block printing of - * the console subsystem. - * - * While the console_lock was held, console output may have been buffered - * by printk(). If this is the case, console_unlock(); emits - * the output prior to releasing the lock. - * - * console_unlock(); may be called from any context. - */ -void console_unlock(void) +static void console_flush_and_unlock(void) { bool do_cond_resched; bool handover; @@ -3066,6 +3247,32 @@ void console_unlock(void) */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } + +/** + * console_unlock - unblock the console subsystem from printing + * + * Releases the console_lock which the caller holds to block printing of + * the console subsystem. + * + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock(); emits + * the output prior to releasing the lock. + * + * console_unlock(); may be called from any context. + */ +void console_unlock(void) +{ + /* + * Forced threading relies on kthread and atomic consoles for + * printing. It never attempts to print from console_unlock(). + */ + if (force_printkthreads()) { + __console_unlock(); + return; + } + + console_flush_and_unlock(); +} EXPORT_SYMBOL(console_unlock); /** @@ -3211,7 +3418,10 @@ void console_flush_on_panic(enum con_flush_mode mode) if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); - console_flush_all(false, &next_seq, &handover); + nbcon_atomic_flush_pending(); + + if (printing_via_unlock) + console_flush_all(false, &next_seq, &handover); } /* @@ -3268,13 +3478,125 @@ EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { + short flags; + int cookie; + console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. The related + * printing context must be able to see it is enabled so that + * it is guaranteed to wake up and resume printing. 
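/*
 * The enable/synchronize/wake ordering used by console_start() above,
 * reduced to its generic shape. The example_dev structure and flag layout
 * are hypothetical; assumes <linux/srcu.h> and <linux/wait.h>.
 */
struct example_dev {
        unsigned long flags;            /* bit 0: enabled */
};

static void example_enable_and_wake(struct example_dev *dev,
                                    struct srcu_struct *srcu,
                                    struct wait_queue_head *wq)
{
        /* 1. Publish the new state. */
        set_bit(0, &dev->flags);

        /*
         * 2. Wait out every SRCU read-side section that may still be
         *    acting on the old state.
         */
        synchronize_srcu(srcu);

        /* 3. Any context woken now re-evaluates against the new state. */
        wake_up_interruptible(wq);
}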
+ */ + synchronize_srcu(&console_srcu); + + cookie = console_srcu_read_lock(); + flags = console_srcu_read_flags(console); + if (flags & CON_NBCON) + nbcon_kthread_wake(console); + else + wake_up_legacy_kthread(); + console_srcu_read_unlock(cookie); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); +#ifdef CONFIG_PRINTK +static bool printer_should_wake(void) +{ + bool available = false; + struct console *con; + int cookie; + + if (kthread_should_stop()) + return true; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; + + /* + * The legacy printer thread is only for legacy consoles, + * unless the nbcon console has no kthread printer. + */ + if ((flags & CON_NBCON) && con->kthread) + continue; + + if (!console_is_usable(con, flags, false)) + continue; + + if (flags & CON_NBCON) { + printk_seq = nbcon_seq_read(con); + } else { + /* + * It is safe to read @seq because only this + * thread context updates @seq. + */ + printk_seq = con->seq; + } + + if (prb_read_valid(prb, printk_seq, NULL)) { + available = true; + break; + } + } + console_srcu_read_unlock(cookie); + + return available; +} + +static int nbcon_legacy_kthread_func(void *unused) +{ + int error; + + for (;;) { + error = wait_event_interruptible(legacy_wait, printer_should_wake()); + + if (kthread_should_stop()) + break; + + if (error) + continue; + + console_lock(); + console_flush_and_unlock(); + } + + return 0; +} + +void nbcon_legacy_kthread_create(void) +{ + struct task_struct *kt; + + lockdep_assert_held(&console_mutex); + + if (!force_printkthreads()) + return; + + if (!printk_threads_enabled || nbcon_legacy_kthread) + return; + + kt = kthread_run(nbcon_legacy_kthread_func, NULL, "pr/legacy"); + if (IS_ERR(kt)) { + pr_err("unable to start legacy printing thread\n"); + return; + } + + nbcon_legacy_kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. + */ + sched_set_normal(nbcon_legacy_kthread, -20); +} +#endif /* CONFIG_PRINTK */ + static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) @@ -3373,19 +3695,21 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } -static void console_init_seq(struct console *newcon, bool bootcon_registered) +/* Return the starting sequence number for a newly registered console. */ +static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; + u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - newcon->seq = syslog_seq; + init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ - newcon->seq = prb_next_seq(prb); + init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered @@ -3406,7 +3730,7 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) * Flush all consoles and set the console to start at * the next unprinted sequence number. */ - if (!console_flush_all(true, &newcon->seq, &handover)) { + if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. 
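/*
 * Condensed form of the wait/wake contract between wake_up_legacy_kthread()
 * and the printer thread above. The example_* names are placeholders;
 * assumes <linux/kthread.h> and <linux/wait.h>.
 */
static bool example_work_pending(void);
static void example_do_work(void);
static DECLARE_WAIT_QUEUE_HEAD(example_wait);

static bool example_should_wake(void)
{
        /*
         * As in printer_should_wake(), the predicate must also report true
         * once a stop has been requested, so that kthread_stop() cannot
         * leave the thread sleeping forever.
         */
        return kthread_should_stop() || example_work_pending();
}

static int example_kthread_func(void *unused)
{
        for (;;) {
                int error = wait_event_interruptible(example_wait,
                                                     example_should_wake());

                if (kthread_should_stop())
                        break;
                if (error)      /* interrupted; simply re-check */
                        continue;

                example_do_work();
        }

        return 0;
}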
@@ -3419,19 +3743,30 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) if (handover) console_lock(); - newcon->seq = prb_next_seq(prb); + init_seq = prb_next_seq(prb); for_each_console(con) { - if ((con->flags & CON_BOOT) && - (con->flags & CON_ENABLED) && - con->seq < newcon->seq) { - newcon->seq = con->seq; + u64 seq; + + if (!(con->flags & CON_BOOT) || + !(con->flags & CON_ENABLED)) { + continue; } + + if (con->flags & CON_NBCON) + seq = nbcon_seq_read(con); + else + seq = con->seq; + + if (seq < init_seq) + init_seq = seq; } } console_unlock(); } } + + return init_seq; } #define console_first() \ @@ -3460,9 +3795,12 @@ static int unregister_console_locked(struct console *console); */ void register_console(struct console *newcon) { - struct console *con; + bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; + struct console *con; + unsigned long flags; + u64 init_seq; int err; console_list_lock(); @@ -3540,10 +3878,32 @@ void register_console(struct console *newcon) } newcon->dropped = 0; - console_init_seq(newcon, bootcon_registered); + init_seq = get_init_console_seq(newcon, bootcon_registered); - if (newcon->flags & CON_NBCON) - nbcon_init(newcon); + if (newcon->flags & CON_NBCON) { + have_nbcon_console = true; + nbcon_init(newcon, init_seq); + } else { + have_legacy_console = true; + newcon->seq = init_seq; + nbcon_legacy_kthread_create(); + } + + if (newcon->flags & CON_BOOT) + have_boot_console = true; + + /* + * If another context is actively using the hardware of this new + * console, it will not be aware of the nbcon synchronization. This + * is a risk that two contexts could access the hardware + * simultaneously if this new console is used for atomic printing + * and the other context is still using the hardware. + * + * Use the driver synchronization to ensure that the hardware is not + * in use while this new console transitions to being registered. + */ + if (use_device_lock) + newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the @@ -3569,6 +3929,10 @@ void register_console(struct console *newcon) * register_console() completes. */ + /* This new console is now registered. */ + if (use_device_lock) + newcon->device_unlock(newcon, flags); + console_sysfs_notify(); /* @@ -3597,6 +3961,13 @@ EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { + bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; + bool is_boot_con = (console->flags & CON_BOOT); + bool found_legacy_con = false; + bool found_nbcon_con = false; + bool found_boot_con = false; + unsigned long flags; + struct console *c; int res; lockdep_assert_console_list_lock_held(); @@ -3615,8 +3986,18 @@ static int unregister_console_locked(struct console *console) if (!console_is_registered_locked(console)) return -ENODEV; + /* + * Use the driver synchronization to ensure that the hardware is not + * in use while this console transitions to being unregistered. 
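/*
 * Sketch of how a console driver might back the device_lock()/device_unlock()
 * callbacks used above. The example_port structure is hypothetical; a real
 * serial driver would typically route this through its existing port-lock
 * helpers instead.
 */
struct example_port {
        spinlock_t              lock;
        struct console          con;
};

static void example_device_lock(struct console *con, unsigned long *flags)
{
        struct example_port *port = container_of(con, struct example_port, con);

        spin_lock_irqsave(&port->lock, *flags);
}

static void example_device_unlock(struct console *con, unsigned long flags)
{
        struct example_port *port = container_of(con, struct example_port, con);

        spin_unlock_irqrestore(&port->lock, flags);
}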
+ */ + if (use_device_lock) + console->device_lock(console, &flags); + hlist_del_init_rcu(&console->node); + if (use_device_lock) + console->device_unlock(console, flags); + /* * * If this isn't the last console and it has CON_CONSDEV set, we @@ -3644,6 +4025,42 @@ static int unregister_console_locked(struct console *console) if (console->exit) res = console->exit(console); + /* + * With this console gone, the global flags tracking registered + * console types may have changed. Update them. + */ + for_each_console(c) { + if (c->flags & CON_BOOT) + found_boot_con = true; + + if (c->flags & CON_NBCON) + found_nbcon_con = true; + else + found_legacy_con = true; + } + if (!found_boot_con) + have_boot_console = found_boot_con; + if (!found_legacy_con) + have_legacy_console = found_legacy_con; + if (!found_nbcon_con) + have_nbcon_console = found_nbcon_con; + + /* + * When the last boot console unregisters, start up the + * printing threads. + */ + if (is_boot_con && !have_boot_console) { + for_each_console(c) + nbcon_kthread_create(c); + } + +#ifdef CONFIG_PRINTK + if (!printing_via_unlock && nbcon_legacy_kthread) { + kthread_stop(nbcon_legacy_kthread); + nbcon_legacy_kthread = NULL; + } +#endif + return res; } @@ -3802,23 +4219,39 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre seq = prb_next_reserve_seq(prb); - /* Flush the consoles so that records up to @seq are printed. */ - console_lock(); - console_unlock(); + /* + * Flush the consoles so that records up to @seq are printed. + * Otherwise this function will just wait for the threaded printers + * to print up to @seq. + */ + if (printing_via_unlock) { + console_lock(); + console_unlock(); + } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; + bool use_console_lock = printing_via_unlock; + + /* + * Ensure the compiler does not optimize @use_console_lock to + * be @printing_via_unlock since the latter can change at any + * time. + */ + barrier(); diff = 0; - /* - * Hold the console_lock to guarantee safe access to - * console->seq. Releasing console_lock flushes more - * records in case @seq is still not printed on all - * usable consoles. - */ - console_lock(); + if (use_console_lock) { + /* + * Hold the console_lock to guarantee safe access to + * console->seq. Releasing console_lock flushes more + * records in case @seq is still not printed on all + * usable consoles. + */ + console_lock(); + } cookie = console_srcu_read_lock(); for_each_console_srcu(c) { @@ -3832,12 +4265,15 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * that they make forward progress, so only increment * @diff for usable consoles. */ - if (!console_is_usable(c)) + if (!console_is_usable(c, flags, true) && + !console_is_usable(c, flags, false)) { continue; + } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { + WARN_ON_ONCE(!use_console_lock); printk_seq = c->seq; } @@ -3849,7 +4285,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; - console_unlock(); + if (use_console_lock) + console_unlock(); /* Note: @diff is 0 if there are no usable consoles. 
*/ if (diff == 0 || remaining_jiffies == 0) @@ -3901,9 +4338,16 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); + if (force_printkthreads()) { + wake_up_legacy_kthread(); + } else { + /* + * If trylock fails, some other context + * will do the printing. + */ + if (console_trylock()) + console_unlock(); + } } if (pending & PRINTK_PENDING_WAKEUP) @@ -3919,6 +4363,7 @@ static void __wake_up_klogd(int val) return; preempt_disable(); + /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. @@ -3930,11 +4375,22 @@ static void __wake_up_klogd(int val) * * This pairs with devkmsg_read:A and syslog_print:A. */ - if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ - (val & PRINTK_PENDING_OUTPUT)) { + if (!wq_has_sleeper(&log_wait)) /* LMM(__wake_up_klogd:A) */ + val &= ~PRINTK_PENDING_WAKEUP; + + /* + * Simple read is safe. register_console() would flush a newly + * registered legacy console when writing the message about it + * being enabled. + */ + if (!printing_via_unlock) + val &= ~PRINTK_PENDING_OUTPUT; + + if (val) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } + preempt_enable(); } @@ -3976,6 +4432,7 @@ void defer_console_output(void) void printk_trigger_flush(void) { + nbcon_wake_threads(); defer_console_output(); } @@ -4299,18 +4756,19 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** - * console_replay_all - replay kernel log on consoles + * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * - * Context: Any context. + * Context: Any, except for NMI. */ -void console_replay_all(void) +void console_try_replay_all(void) { if (console_trylock()) { __console_rewind_all(); + nbcon_wake_threads(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 52626d0f1fa37..bd2a892deac1a 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -5,6 +5,8 @@ #include #include +#include +#include /* * Meta information about each stored message. diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 6d10927a07d83..f47fef7012652 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -26,6 +26,27 @@ void __printk_safe_exit(void) this_cpu_dec(printk_context); } +void __printk_deferred_enter(void) +{ + cant_migrate(); + __printk_safe_enter(); +} + +void __printk_deferred_exit(void) +{ + cant_migrate(); + __printk_safe_exit(); +} + +bool is_printk_deferred(void) +{ + /* + * The per-CPU variable @printk_context can be read safely in any + * context. The CPU migration always disabled when set. + */ + return (this_cpu_read(printk_context) || in_nmi()); +} + asmlinkage int vprintk(const char *fmt, va_list args) { #ifdef CONFIG_KGDB_KDB @@ -38,7 +59,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. 
*/ - if (this_cpu_read(printk_context) || in_nmi()) + if (is_printk_deferred()) return vprintk_deferred(fmt, args); /* No obstacles. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 807fbf6123a77..0fd17cc1f8fa0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2420,6 +2420,12 @@ static int rcutorture_booster_init(unsigned int cpu) WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#ifdef CONFIG_PREEMPT_RT + t = per_cpu(timersd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#endif } /* Don't allow time recalculation while creating a new task. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8a1d9c8bd9f74..6ece151d0aa2a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -7,6 +7,7 @@ * Authors: Paul E. McKenney */ +#include #include static void rcu_exp_handler(void *unused); @@ -571,6 +572,9 @@ static void synchronize_rcu_expedited_wait(void) return; if (rcu_stall_is_suppressed()) continue; + + nbcon_cpu_emergency_enter(); + j = jiffies; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); @@ -620,10 +624,14 @@ static void synchronize_rcu_expedited_wait(void) preempt_disable(); // For smp_processor_id() in dump_cpu_task(). dump_cpu_task(cpu); preempt_enable(); + nbcon_cpu_emergency_flush(); } rcu_exp_print_detail_task_stall_rnp(rnp); } jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); } } @@ -792,6 +800,7 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) */ touch_nmi_watchdog(); sched_show_task(t); + nbcon_cpu_emergency_flush(); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 460efecd077be..833a75167ac93 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,6 +7,7 @@ * Author: Paul E. McKenney */ +#include #include #include @@ -260,6 +261,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) */ touch_nmi_watchdog(); sched_show_task(t); + nbcon_cpu_emergency_flush(); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -523,6 +525,7 @@ static void print_cpu_stall_info(int cpu) falsepositive ? " (false positive?)" : ""); print_cpu_stat_info(cpu); + nbcon_cpu_emergency_flush(); } /* Complain about starvation of grace-period kthread. */ @@ -605,6 +608,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.rst for info on how to debug @@ -657,6 +662,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); rcu_force_quiescent_state(); /* Kick them all. */ @@ -677,6 +684,8 @@ static void print_cpu_stall(unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on ourselves... 
* See Documentation/RCU/stallwarn.rst for info on how to debug @@ -706,6 +715,8 @@ static void print_cpu_stall(unsigned long gps) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 59ce0841eb1fd..0d8f36c80fe08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -898,14 +898,15 @@ static inline void hrtick_rq_init(struct rq *rq) #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* - * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) { struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + + return !(fetch_or(&ti->flags, 1 << tif_bit) & _TIF_POLLING_NRFLAG); } /* @@ -922,7 +923,7 @@ static bool set_nr_if_polling(struct task_struct *p) do { if (!(val & _TIF_POLLING_NRFLAG)) return false; - if (val & _TIF_NEED_RESCHED) + if (val & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) return true; } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); @@ -930,9 +931,9 @@ static bool set_nr_if_polling(struct task_struct *p) } #else -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit) { - set_tsk_need_resched(p); + set_tsk_thread_flag(p, tif_bit); return true; } @@ -1037,28 +1038,47 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int lazy) { + int cpu, tif_bit = TIF_NEED_RESCHED + lazy; struct task_struct *curr = rq->curr; - int cpu; lockdep_assert_rq_held(rq); - if (test_tsk_need_resched(curr)) + if (unlikely(test_tsk_thread_flag(curr, tif_bit))) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_tsk_thread_flag(curr, tif_bit); + if (!lazy) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(curr, tif_bit)) { + if (!lazy) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, 0); +} + +void resched_curr_lazy(struct rq *rq) +{ + int lazy = IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && !sched_feat(FORCE_NEED_RESCHED) ? + TIF_NEED_RESCHED_LAZY_OFFSET : 0; + + if (lazy && unlikely(test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED))) + return; + + __resched_curr(rq, lazy); } void resched_cpu(int cpu) @@ -1153,7 +1173,7 @@ static void wake_up_idle_cpu(int cpu) * and testing of the above solutions didn't appear to report * much benefits. */ - if (set_nr_and_not_polling(rq->idle)) + if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -8913,6 +8933,21 @@ static inline void preempt_dynamic_init(void) { } #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ +/* + * task_is_pi_boosted - Check if task has been PI boosted. + * @p: Task to check. 
+ * + * Return true if task is subject to priority inheritance. + */ +bool task_is_pi_boosted(const struct task_struct *p) +{ + int prio = p->prio; + + if (!rt_prio(prio)) + return false; + return prio != p->normal_prio; +} + /** * yield - yield the current processor to other threads. * diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c1eb9a1afd13e..272078fa8f5a6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -333,6 +333,23 @@ static const struct file_operations sched_debug_fops = { .release = seq_release, }; +static ssize_t sched_hog_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long end = jiffies + 60 * HZ; + + for (; time_before(jiffies, end) && !signal_pending(current);) + cpu_relax(); + + return cnt; +} + +static const struct file_operations sched_hog_fops = { + .write = sched_hog_write, + .open = simple_open, + .llseek = default_llseek, +}; + static struct dentry *debugfs_sched; static __init int sched_init_debug(void) @@ -374,6 +391,8 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + debugfs_create_file("hog", 0200, debugfs_sched, NULL, &sched_hog_fops); + return 0; } late_initcall(sched_init_debug); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24dda708b6993..a143ecc3439dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -974,8 +974,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough. */ -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se, bool tick) { + struct rq *rq = rq_of(cfs_rq); + if ((s64)(se->vruntime - se->deadline) < 0) return; @@ -994,10 +996,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * The task has consumed its request, reschedule. */ - if (cfs_rq->nr_running > 1) { - resched_curr(rq_of(cfs_rq)); - clear_buddies(cfs_rq, se); + if (cfs_rq->nr_running < 2) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) || sched_feat(FORCE_NEED_RESCHED)) { + resched_curr(rq); + } else { + /* Did the task ignore the lazy reschedule request? */ + if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + else + resched_curr_lazy(rq); } + clear_buddies(cfs_rq, se); } #include "pelt.h" @@ -1153,7 +1164,7 @@ s64 update_curr_common(struct rq *rq) /* * Update the current task's runtime statistics. */ -static void update_curr(struct cfs_rq *cfs_rq) +static void __update_curr(struct cfs_rq *cfs_rq, bool tick) { struct sched_entity *curr = cfs_rq->curr; s64 delta_exec; @@ -1166,7 +1177,7 @@ static void update_curr(struct cfs_rq *cfs_rq) return; curr->vruntime += calc_delta_fair(delta_exec, curr); - update_deadline(cfs_rq, curr); + update_deadline(cfs_rq, curr, tick); update_min_vruntime(cfs_rq); if (entity_is_task(curr)) @@ -1175,6 +1186,11 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); } +static inline void update_curr(struct cfs_rq *cfs_rq) +{ + __update_curr(cfs_rq, false); +} + static void update_curr_fair(struct rq *rq) { update_curr(cfs_rq_of(&rq->curr->se)); @@ -5512,7 +5528,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Update run-time statistics of the 'current'. 
*/ - update_curr(cfs_rq); + __update_curr(cfs_rq, true); /* * Ensure that runnable average is periodically updated. @@ -5526,7 +5542,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } /* @@ -5672,7 +5688,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static __always_inline @@ -5932,7 +5948,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_curr(rq); + resched_curr_lazy(rq); } #ifdef CONFIG_SMP @@ -6647,7 +6663,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (task_current(rq, p)) - resched_curr(rq); + resched_curr_lazy(rq); return; } hrtick_start(rq, delta); @@ -8379,7 +8395,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ - if (test_tsk_need_resched(curr)) + if (need_resched()) return; /* Idle tasks are by definition preempted by non-idle tasks. */ @@ -8421,7 +8437,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); } #ifdef CONFIG_SMP @@ -12567,7 +12583,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) */ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) - resched_curr(rq); + resched_curr_lazy(rq); } /* @@ -12734,7 +12750,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (task_current(rq, p)) { if (p->prio > oldprio) - resched_curr(rq); + resched_curr_lazy(rq); } else wakeup_preempt(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 143f55df890b1..6de570ab30078 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -87,3 +87,5 @@ SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) SCHED_FEAT(HZ_BW, true) + +SCHED_FEAT(FORCE_NEED_RESCHED, false) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6135fbe83d68c..21b36fb784e43 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -57,8 +57,7 @@ static noinline int __cpuidle cpu_idle_poll(void) ct_cpuidle_enter(); raw_local_irq_enable(); - while (!tif_need_resched() && - (cpu_idle_force_poll || tick_check_broadcast_expired())) + while (!need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); raw_local_irq_disable(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aa4c1c874fa44..f38ffe14aad2d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2193,8 +2193,11 @@ static int rto_next_cpu(struct root_domain *rd) rd->rto_cpu = cpu; - if (cpu < nr_cpu_ids) + if (cpu < nr_cpu_ids) { + if (!has_pushable_tasks(cpu_rq(cpu))) + continue; return cpu; + } rd->rto_cpu = -1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef20c61004ebf..43332bdb7fbf0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2467,6 +2467,7 @@ extern void init_sched_fair_class(void); extern void reweight_task(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); 
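/*
 * Condensed restatement of the tick-time decision added to update_deadline()
 * above, assuming CONFIG_PREEMPT_BUILD_AUTO is enabled and the
 * FORCE_NEED_RESCHED feature is off. The example_ wrapper itself is
 * illustrative only.
 */
static inline void example_tick_preempt(struct rq *rq)
{
        if (test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY)) {
                /*
                 * The task ignored the lazy request for a whole tick:
                 * escalate to a real TIF_NEED_RESCHED.
                 */
                resched_curr(rq);
        } else {
                /*
                 * First request: set only the lazy bit, which is honoured
                 * on the return to user space but neither forces kernel
                 * preemption nor sends a reschedule IPI.
                 */
                resched_curr_lazy(rq);
        }
}

/*
 * The debugfs file added above appears intended as a test aid for exactly
 * this escalation: writing anything to /sys/kernel/debug/sched/hog
 * busy-loops in the kernel for up to a minute, so the lazy bit is
 * demonstrably ignored until the tick upgrades it.
 */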
+extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; diff --git a/kernel/softirq.c b/kernel/softirq.c index 02582017759a2..00e32e279fa95 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -248,6 +248,19 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) } EXPORT_SYMBOL(__local_bh_enable_ip); +void softirq_preempt(void) +{ + if (WARN_ON_ONCE(!preemptible())) + return; + + if (WARN_ON_ONCE(__this_cpu_read(softirq_ctrl.cnt) != SOFTIRQ_OFFSET)) + return; + + __local_bh_enable(SOFTIRQ_OFFSET, true); + /* preemption point */ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); +} + /* * Invoked from ksoftirqd_run() outside of the interrupt disabled section * to acquire the per CPU local lock for reentrancy protection. @@ -624,6 +637,24 @@ static inline void tick_irq_exit(void) #endif } +#ifdef CONFIG_PREEMPT_RT +DEFINE_PER_CPU(struct task_struct *, timersd); +DEFINE_PER_CPU(unsigned long, pending_timer_softirq); + +static void wake_timersd(void) +{ + struct task_struct *tsk = __this_cpu_read(timersd); + + if (tsk) + wake_up_process(tsk); +} + +#else + +static inline void wake_timersd(void) { } + +#endif + static inline void __irq_exit_rcu(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -636,6 +667,10 @@ static inline void __irq_exit_rcu(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() && + !(in_nmi() | in_hardirq())) + wake_timersd(); + tick_irq_exit(); } @@ -972,12 +1007,70 @@ static struct smp_hotplug_thread softirq_threads = { .thread_comm = "ksoftirqd/%u", }; +#ifdef CONFIG_PREEMPT_RT +static void timersd_setup(unsigned int cpu) +{ + sched_set_fifo_low(current); +} + +static int timersd_should_run(unsigned int cpu) +{ + return local_pending_timers(); +} + +static void run_timersd(unsigned int cpu) +{ + unsigned int timer_si; + + ksoftirqd_run_begin(); + + timer_si = local_pending_timers(); + __this_cpu_write(pending_timer_softirq, 0); + or_softirq_pending(timer_si); + + __do_softirq(); + + ksoftirqd_run_end(); +} + +static void raise_ktimers_thread(unsigned int nr) +{ + trace_softirq_raise(nr); + __this_cpu_or(pending_timer_softirq, 1 << nr); +} + +void raise_hrtimer_softirq(void) +{ + raise_ktimers_thread(HRTIMER_SOFTIRQ); +} + +void raise_timer_softirq(void) +{ + unsigned long flags; + + local_irq_save(flags); + raise_ktimers_thread(TIMER_SOFTIRQ); + wake_timersd(); + local_irq_restore(flags); +} + +static struct smp_hotplug_thread timer_threads = { + .store = &timersd, + .setup = timersd_setup, + .thread_should_run = timersd_should_run, + .thread_fn = run_timersd, + .thread_comm = "ktimers/%u", +}; +#endif + static __init int spawn_ksoftirqd(void) { cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); - +#ifdef CONFIG_PREEMPT_RT + BUG_ON(smpboot_register_percpu_thread(&timer_threads)); +#endif return 0; } early_initcall(spawn_ksoftirqd); diff --git a/kernel/task_work.c b/kernel/task_work.c index 95a7e1b7f1dab..8f9c30e732a0a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -1,10 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include +#include static struct callback_head work_exited; /* all we need is ->next == NULL */ +#ifdef CONFIG_IRQ_WORK +static void task_work_set_notify_irq(struct irq_work *entry) +{ + test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); +} +static 
DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = + IRQ_WORK_INIT_HARD(task_work_set_notify_irq); +#endif + /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -12,7 +23,7 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently @@ -24,6 +35,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. + * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the + * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of @@ -44,8 +57,15 @@ int task_work_add(struct task_struct *task, struct callback_head *work, { struct callback_head *head; - /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + if (notify == TWA_NMI_CURRENT) { + if (WARN_ON_ONCE(task != current)) + return -EINVAL; + if (!IS_ENABLED(CONFIG_IRQ_WORK)) + return -EINVAL; + } else { + /* record the work call stack in order to print it in KASAN reports */ + kasan_record_aux_stack(work); + } head = READ_ONCE(task->task_works); do { @@ -66,6 +86,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; +#ifdef CONFIG_IRQ_WORK + case TWA_NMI_CURRENT: + irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); + break; +#endif default: WARN_ON_ONCE(1); break; @@ -120,9 +145,9 @@ static bool task_work_func_match(struct callback_head *cb, void *data) } /** - * task_work_cancel - cancel a pending work added by task_work_add() - * @task: the task which should execute the work - * @func: identifies the work to remove + * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() + * @task: the task which should execute the func's work + * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. @@ -131,11 +156,35 @@ static bool task_work_func_match(struct callback_head *cb, void *data) * The found work or NULL if not found. */ struct callback_head * -task_work_cancel(struct task_struct *task, task_work_func_t func) +task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } +static bool task_work_match(struct callback_head *cb, void *data) +{ + return cb == data; +} + +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @cb: the callback to remove if queued + * + * Remove a callback from a task's queue if queued. + * + * RETURNS: + * True if the callback was queued and got cancelled, false otherwise. 
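/*
 * Minimal usage sketch of the pointer-based cancel API documented above.
 * The example_req structure and callbacks are hypothetical; assumes
 * <linux/task_work.h> and <linux/slab.h>.
 */
struct example_req {
        struct callback_head    twork;
};

static void example_twork_fn(struct callback_head *cb)
{
        struct example_req *req = container_of(cb, struct example_req, twork);

        /* Runs in task context, e.g. on the way back to user space. */
        kfree(req);
}

static int example_queue(struct example_req *req)
{
        init_task_work(&req->twork, example_twork_fn);

        /* Fails only if the task is already exiting. */
        return task_work_add(current, &req->twork, TWA_RESUME);
}

static void example_cancel(struct example_req *req)
{
        /*
         * True: the work was still queued and has been removed, so the
         * callback will not run and @req must be freed here. False: the
         * callback either already ran (and freed @req) or was never queued.
         */
        if (task_work_cancel(current, &req->twork))
                kfree(req);
}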
+ */ +bool task_work_cancel(struct task_struct *task, struct callback_head *cb) +{ + struct callback_head *ret; + + ret = task_work_cancel_match(task, task_work_match, cb); + + return ret == cb; +} + /** * task_work_run - execute the works added by task_work_add() * @@ -168,7 +217,7 @@ void task_work_run(void) if (!work) break; /* - * Synchronize with task_work_cancel(). It can not remove + * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b8ee320208d41..fd78166a2ebe6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1809,7 +1809,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_hrtimer_softirq(); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); @@ -1904,7 +1904,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; - raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raise_hrtimer_softirq(); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 71a792cd89362..ea234ff627769 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -859,7 +859,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & BIT(TIMER_SOFTIRQ); + return local_pending_timers() & BIT(TIMER_SOFTIRQ); } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 48288dd4a102f..d36c962472180 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1562,9 +1562,16 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) */ static void timer_sync_wait_running(struct timer_base *base) { - if (atomic_read(&base->timer_waiters)) { + bool need_preempt; + + need_preempt = task_is_pi_boosted(current); + if (need_preempt || atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); spin_unlock(&base->expiry_lock); + + if (need_preempt) + softirq_preempt(); + spin_lock(&base->expiry_lock); raw_spin_lock_irq(&base->lock); } @@ -2465,7 +2472,7 @@ static void run_local_timers(void) /* Raise the softirq only if required. 
*/ if (time_after_eq(jiffies, base->next_expiry) || (i == BASE_DEF && tmigr_requires_handle_remote())) { - raise_softirq(TIMER_SOFTIRQ); + raise_timer_softirq(); return; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 578a49ff5c32e..55f934127d28c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2519,6 +2519,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (tif_need_resched_lazy()) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d8b302d010830..4f58a196e14c1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -460,17 +460,29 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : bh_off ? 'b' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + !IS_ENABLED(CONFIG_TRACE_IRQFLAGS_SUPPORT) ? 'X' : '.'; - switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'B'; + break; case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'N'; break; + case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'L'; + break; + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: + need_resched = 'b'; + break; case TRACE_FLAG_NEED_RESCHED: need_resched = 'n'; break; + case TRACE_FLAG_NEED_RESCHED_LAZY: + need_resched = 'l'; + break; case TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'p'; break; diff --git a/localversion-rt b/localversion-rt new file mode 100644 index 0000000000000..08b3e75841adc --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ +-rt14 diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 36ae54f57bf57..a6d7f790cdda8 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -283,9 +283,10 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes, static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, u32 repeat) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int err = 0, act, ret, i, nframes = 0, batch_sz; struct xdp_frame **frames = xdp->frames; + struct bpf_redirect_info *ri; struct xdp_page_head *head; struct xdp_frame *frm; bool redirect = false; @@ -295,6 +296,8 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, batch_sz = min_t(u32, repeat, xdp->batch_size); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + ri = bpf_net_ctx_get_ri(); xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { @@ -359,6 +362,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, } xdp_clear_return_frame_no_direct(); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return err; } @@ -394,6 +398,7 @@ static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct 
bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; @@ -419,10 +424,14 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, do { run_ctx.prog_item = &item; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); + + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index bf30c50b56895..3c9f6538990ea 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -137,6 +137,7 @@ static inline bool is_pppoe_ipv6(const struct sk_buff *skb, #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) struct brnf_frag_data { + local_lock_t bh_lock; char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; u8 encap_size; u8 size; @@ -144,7 +145,9 @@ struct brnf_frag_data { __be16 vlan_proto; }; -static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); +static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static void nf_bridge_info_free(struct sk_buff *skb) { @@ -850,6 +853,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; + int ret; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; @@ -882,6 +886,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { @@ -897,7 +902,9 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); + ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); + return ret; } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { @@ -909,6 +916,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; @@ -916,8 +924,12 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - if (v6ops) - return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); + if (v6ops) { + ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); + return ret; + } + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/core/dev.c b/net/core/dev.c index 2b4819b610b8a..cf00b2f662d54 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -229,7 +229,7 @@ static inline void backlog_lock_irq_save(struct softnet_data *sd, { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else 
local_irq_save(*flags); } @@ -237,7 +237,7 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_disable(); } @@ -246,7 +246,7 @@ static inline void backlog_unlock_irq_restore(struct softnet_data *sd, { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_restore(*flags); } @@ -254,7 +254,7 @@ static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); - else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + else local_irq_enable(); } @@ -449,7 +449,9 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * queue in the local softnet handler. */ -DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { + .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), +}; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. @@ -3940,6 +3942,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } +#ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return __this_cpu_read(softnet_data.xmit.skip_txqueue); @@ -3950,6 +3953,19 @@ void netdev_xmit_skip_txqueue(bool skip) __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); + +#else +static bool netdev_xmit_txqueue_skipped(void) +{ + return current->net_xmit.skip_txqueue; +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + current->net_xmit.skip_txqueue = skip; +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS @@ -4029,10 +4045,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; + + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; @@ -4061,10 +4080,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; } *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4074,8 +4095,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -4085,11 +4108,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. 
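/*
 * The scoping pattern used by the hunks above, condensed: an on-stack
 * bpf_net_context is registered for one BH-disabled section so that
 * bpf_net_ctx_get_ri() resolves to stack-local redirect state instead of
 * the former global per-CPU bpf_redirect_info. The example_ caller is
 * illustrative only.
 */
static void example_run_bpf(void)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;

        local_bh_disable();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        /*
         * Any XDP/TC program invoked from here may call
         * bpf_net_ctx_get_ri(); the redirect state lives on this stack
         * for the lifetime of the section.
         */

        bpf_net_ctx_clear(bpf_net_ctx);
        local_bh_enable();
}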
*/ @@ -4105,10 +4131,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4118,8 +4146,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -5096,11 +5126,14 @@ static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; + if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { @@ -5114,11 +5147,14 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) generic_xdp_tx(*pskb, xdp_prog); break; } + bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } + bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: + bpf_net_ctx_clear(bpf_net_ctx); kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } @@ -5935,6 +5971,7 @@ static void flush_backlog(struct work_struct *work) } backlog_unlock_irq_enable(sd); + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); @@ -5942,6 +5979,7 @@ static void flush_backlog(struct work_struct *work) rps_input_queue_head_incr(sd); } } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); } @@ -6063,7 +6101,9 @@ static int process_backlog(struct napi_struct *napi, int quota) while (again) { struct sk_buff *skb; + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); @@ -6072,7 +6112,9 @@ static int process_backlog(struct napi_struct *napi, int quota) return work; } + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { @@ -6087,8 +6129,10 @@ static int process_backlog(struct napi_struct *napi, int quota) napi->state &= NAPIF_STATE_THREADED; again = false; } else { + local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); + local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } backlog_unlock_irq_enable(sd); } @@ -6301,6 +6345,7 @@ enum { static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; @@ -6318,6 +6363,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); @@ -6340,6 
+6386,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } @@ -6349,6 +6396,7 @@ static void __napi_busy_loop(unsigned int napi_id, { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; @@ -6367,6 +6415,7 @@ static void __napi_busy_loop(unsigned int napi_id, int work = 0; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -6397,6 +6446,7 @@ static void __napi_busy_loop(unsigned int napi_id, __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) @@ -6824,6 +6874,7 @@ static int napi_thread_wait(struct napi_struct *napi) static void napi_threaded_poll_loop(struct napi_struct *napi) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; unsigned long last_qs = jiffies; @@ -6832,6 +6883,8 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) void *have; local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; @@ -6847,6 +6900,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(sd); + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!repoll) @@ -6872,10 +6926,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); @@ -6928,7 +6984,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); -end:; +end: + bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { diff --git a/net/core/dev.h b/net/core/dev.h index b7b518bc2be55..4984dd9b334bc 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -150,6 +150,8 @@ struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 + +#ifndef CONFIG_PREEMPT_RT static inline bool dev_xmit_recursion(void) { return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > @@ -165,5 +167,21 @@ static inline void dev_xmit_recursion_dec(void) { __this_cpu_dec(softnet_data.xmit.recursion); } +#else +static inline bool dev_xmit_recursion(void) +{ + return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ + current->net_xmit.recursion--; +} +#endif #endif diff --git a/net/core/filter.c b/net/core/filter.c index 9933851c685e7..5002f4cd9f18e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1658,9 +1658,12 @@ struct 
bpf_scratchpad { __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; u8 buff[MAX_BPF_STACK]; }; + local_lock_t bh_lock; }; -static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) @@ -2021,6 +2024,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); u32 diff_size = from_size + to_size; int i, j = 0; + __wsum ret; /* This is quite flexible, some examples: * @@ -2034,12 +2038,15 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, diff_size > sizeof(sp->diff))) return -EINVAL; + local_lock_nested_bh(&bpf_sp.bh_lock); for (i = 0; i < from_size / sizeof(__be32); i++, j++) sp->diff[j] = ~from[i]; for (i = 0; i < to_size / sizeof(__be32); i++, j++) sp->diff[j] = to[i]; - return csum_partial(sp->diff, diff_size, seed); + ret = csum_partial(sp->diff, diff_size, seed); + local_unlock_nested_bh(&bpf_sp.bh_lock); + return ret; } static const struct bpf_func_proto bpf_csum_diff_proto = { @@ -2476,9 +2483,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); -EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); - static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2491,7 +2495,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev) int skb_do_redirect(struct sk_buff *skb) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; @@ -2524,7 +2528,7 @@ int skb_do_redirect(struct sk_buff *skb) BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; @@ -2545,7 +2549,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; @@ -2567,7 +2571,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; @@ -4273,50 +4277,50 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { */ void xdp_do_flush(void) { - __dev_flush(); - __cpu_map_flush(); - __xsk_map_flush(); + struct list_head *lh_map, *lh_dev, *lh_xsk; + + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) + __dev_flush(lh_dev); + if (lh_map) + __cpu_map_flush(lh_map); + if (lh_xsk) + __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi) { - bool ret; + struct list_head *lh_map, *lh_dev, *lh_xsk; + bool missed = false; - ret = dev_check_flush(); - ret |= cpu_map_check_flush(); - ret |= 
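
bpf_csum_diff() above now takes the scratchpad's nested-BH lock around its use of the per-CPU buffer. The same idiom recurs throughout this patch (napi_alloc_cache, tcp_sigpool, seg6_bpf_srh_states): per-CPU data that was implicitly serialized by running in BH context gains an embedded local_lock_t, and local_lock_nested_bh() is essentially a lockdep annotation on !PREEMPT_RT but a real per-CPU lock on RT. A generic sketch of the idiom, with my_scratch and my_use_scratch() made up for illustration:

#include <linux/local_lock.h>
#include <linux/minmax.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/types.h>

struct my_scratch {
	local_lock_t	bh_lock;
	u8		buf[256];
};

static DEFINE_PER_CPU(struct my_scratch, my_scratch) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Runs in softirq context / with BH disabled, like the csum helper above. */
static void my_use_scratch(const void *src, unsigned int len)
{
	struct my_scratch *s;

	local_lock_nested_bh(&my_scratch.bh_lock);
	s = this_cpu_ptr(&my_scratch);
	memcpy(s->buf, src, min_t(unsigned int, len, sizeof(s->buf)));
	/*
	 * Work on s->buf: on PREEMPT_RT other softirq-context users of
	 * this CPU's buffer are locked out; on !RT nothing changes at
	 * runtime and the lock only feeds lockdep.
	 */
	local_unlock_nested_bh(&my_scratch.bh_lock);
}
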
xsk_map_check_flush(); + bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); + if (lh_dev) { + __dev_flush(lh_dev); + missed = true; + } + if (lh_map) { + __cpu_map_flush(lh_map); + missed = true; + } + if (lh_xsk) { + __xsk_map_flush(lh_xsk); + missed = true; + } - WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n", + WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); } #endif -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); @@ -4388,7 +4392,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn - * down by bpf_clear_redirect_map() + * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; @@ -4433,7 +4437,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4447,7 +4451,7 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4464,7 +4468,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, enum bpf_map_type map_type, u32 map_id, u32 flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; @@ -4476,7 +4480,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn - * down by bpf_clear_redirect_map() + * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; @@ -4518,7 +4522,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; @@ -4554,7 +4558,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, 
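
With the flush lists moved into the bpf_net_context, xdp_do_flush() above drains only the lists that were actually used, and xdp_do_check_flushed() both flushes and warns when a driver left work pending after its NAPI poll. The driver-side contract is unchanged; roughly, with my_poll() as a hypothetical NAPI poll callback:

#include <linux/filter.h>
#include <linux/netdevice.h>

static int my_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	/*
	 * ... receive packets and run the XDP program; XDP_REDIRECT
	 * results are queued on the dev/cpumap/xskmap flush lists of
	 * the bpf_net_context set up by the NAPI core ...
	 */

	/* Drain the redirect flush lists before NAPI completes. */
	xdp_do_flush();

	if (work < budget)
		napi_complete_done(napi, work);

	return work;
}
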
flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; @@ -6455,6 +6459,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; + lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; @@ -6511,6 +6516,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, int hdroff = 0; int err; + lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) @@ -6587,6 +6593,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, int srhoff = 0; int ret; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; @@ -11040,7 +11047,6 @@ const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { }; const struct bpf_prog_ops lwt_seg6local_prog_ops = { - .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops cg_sock_verifier_ops = { diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 4a0797f0a154b..afb05f58b64c5 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -38,13 +38,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, struct dst_entry *dst, bool can_redirect) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int ret; - /* Migration disable and BH disable are needed to protect per-cpu - * redirect_info between BPF prog and skb_do_redirect(). + /* Disabling BH is needed to protect per-CPU bpf_redirect_info between + * BPF prog and skb_do_redirect(). */ - migrate_disable(); local_bh_disable(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -77,8 +78,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, break; } + bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); - migrate_enable(); return ret; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 466999a7515e6..5000394a4f51a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -277,6 +277,7 @@ static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) #endif struct napi_alloc_cache { + local_lock_t bh_lock; struct page_frag_cache page; struct page_frag_1k page_small; unsigned int skb_count; @@ -284,7 +285,9 @@ struct napi_alloc_cache { }; static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /* Double check that napi_get_frags() allocates skbs with * skb->head being backed by slab, not a page fragment. 
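
The run_lwt_bpf() change above, like the net/core/dev.c hunks earlier, follows one bracket pattern: place a struct bpf_net_context on the stack, register it with bpf_net_ctx_set() once BHs are disabled, and clear it on every exit path before local_bh_enable(). Redirect state and the flush lists then live on the caller's stack instead of in global per-CPU variables, which is why the explicit migrate_disable() bracket can go away. A minimal sketch, with my_rx_section() made up and the bpf_net_ctx_* helpers being the ones introduced elsewhere in this series:

#include <linux/filter.h>

static void my_rx_section(void)
{
	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;

	local_bh_disable();
	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

	/*
	 * Run XDP/TC programs here; skb_do_redirect(), xdp_do_redirect()
	 * and friends fetch their bpf_redirect_info via
	 * bpf_net_ctx_get_ri() from this on-stack context.
	 */

	bpf_net_ctx_clear(bpf_net_ctx);
	local_bh_enable();
}
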
@@ -306,11 +309,16 @@ void napi_get_frags_check(struct napi_struct *napi) void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + void *data; fragsz = SKB_DATA_ALIGN(fragsz); - return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + return data; + } EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -318,19 +326,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { void *data; - fragsz = SKB_DATA_ALIGN(fragsz); if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); + fragsz = SKB_DATA_ALIGN(fragsz); data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); } else { - struct napi_alloc_cache *nc; - local_bh_disable(); - nc = this_cpu_ptr(&napi_alloc_cache); - data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, - align_mask); + data = __napi_alloc_frag_align(fragsz, align_mask); local_bh_enable(); } return data; @@ -342,16 +346,20 @@ static struct sk_buff *napi_skb_cache_get(void) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC, NAPI_SKB_CACHE_BULK, nc->skb_cache); - if (unlikely(!nc->skb_count)) + if (unlikely(!nc->skb_count)) { + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; + } } skb = nc->skb_cache[--nc->skb_count]; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; @@ -744,9 +752,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, pfmemalloc = nc->pfmemalloc; } else { local_bh_disable(); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; + + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); } @@ -810,11 +822,11 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } - nc = this_cpu_ptr(&napi_alloc_cache); - if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + nc = this_cpu_ptr(&napi_alloc_cache); if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { /* we are artificially inflating the allocation size, but * that is not as bad as it may look like, as: @@ -836,6 +848,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) data = page_frag_alloc(&nc->page, len, gfp_mask); pfmemalloc = nc->page.pfmemalloc; } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) return NULL; @@ -1433,6 +1446,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) if (!kasan_mempool_poison_object(skb)) return; + local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { @@ -1444,6 +1458,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) diff --git 
a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 251a57cf58223..deb52d7d31b48 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -59,11 +59,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) * we complete the initialization. */ local_bh_disable(); - inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, &dccp_hashinfo); + inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e28075f0006e3..a70a3a16eea0b 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -92,13 +92,22 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, hlist_nulls_add_head_rcu(&tw->tw_node, list); } +static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, false); +} + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. + * + * The caller must not access @tw anymore after this function returns. */ -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo) +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -129,26 +138,33 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); + /* Step 2: Hash TW into tcp ehash chain */ inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock(lock); + /* Ensure above writes are committed into memory before updating the + * refcount. + * Provides ordering vs later refcount_inc(). + */ + smp_wmb(); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. * - one reference for timer. - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. * Also note that after this point, we lost our implicit reference * so we are not allowed to use tw anymore. 
*/ refcount_set(&tw->tw_refcnt, 3); + + inet_twsk_schedule(tw, timeo); + + spin_unlock(lock); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -192,7 +208,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); + timer_setup(&tw->tw_timer, tw_timer_handler, 0); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this @@ -217,7 +233,34 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { - if (del_timer_sync(&tw->tw_timer)) + struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; + spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + + /* inet_twsk_purge() walks over all sockets, including tw ones, + * and removes them via inet_twsk_deschedule_put() after a + * refcount_inc_not_zero(). + * + * inet_twsk_hashdance_schedule() must (re)init the refcount before + * arming the timer, i.e. inet_twsk_purge can obtain a reference to + * a twsk that did not yet schedule the timer. + * + * The ehash lock synchronizes these two: + * After acquiring the lock, the timer is always scheduled (else + * timer_shutdown returns false), because hashdance_schedule releases + * the ehash lock only after completing the timer initialization. + * + * Without grabbing the ehash lock, we get: + * 1) cpu x sets twsk refcount to 3 + * 2) cpu y bumps refcount to 4 + * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down + * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown + * -> timer refcount is never decremented. + */ + spin_lock(lock); + /* Makes sure hashdance_schedule() has completed */ + spin_unlock(lock); + + if (timer_shutdown_sync(&tw->tw_timer)) inet_twsk_kill(tw); inet_twsk_put(tw); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b710958393e64..40227a9643966 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -93,7 +93,9 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); +static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static u32 tcp_v4_init_seq(const struct sk_buff *skb) { @@ -885,7 +887,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); + sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? @@ -910,6 +914,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG @@ -1005,7 +1010,8 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? 
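
inet_twsk_deschedule_put() above pairs timer_shutdown_sync() with an empty ehash lock/unlock, using the lock purely as a barrier against a hashdance that is still arming the timer. The same idiom in isolation, with my_obj, my_publish() and my_cancel() as made-up names:

#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/timer.h>

struct my_obj {
	spinlock_t		lock;	/* stands in for the ehash bucket lock */
	struct timer_list	timer;	/* set up with timer_setup() at alloc time */
	refcount_t		ref;
};

/* Publisher: make the object findable and arm its timer under the lock. */
static void my_publish(struct my_obj *obj, unsigned long timeout)
{
	spin_lock(&obj->lock);
	refcount_set(&obj->ref, 2);	/* one ref for lookups, one for the timer */
	mod_timer(&obj->timer, jiffies + timeout);
	spin_unlock(&obj->lock);
}

/* Canceller: may run right after a lookup bumped obj->ref. */
static void my_cancel(struct my_obj *obj)
{
	/*
	 * Lock/unlock acts as a barrier: a concurrent my_publish() only
	 * drops the lock after the timer is armed, so timer_shutdown_sync()
	 * cannot race with a later mod_timer().
	 */
	spin_lock(&obj->lock);
	spin_unlock(&obj->lock);

	if (timer_shutdown_sync(&obj->timer))
		refcount_dec(&obj->ref);	/* timer was pending: drop its ref */
}
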
sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); @@ -1020,6 +1026,7 @@ static void tcp_v4_send_ack(const struct sock *sk, sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); } @@ -3620,7 +3627,7 @@ void __init tcp_v4_init(void) */ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; - per_cpu(ipv4_tcp_sk, cpu) = sk; + per_cpu(ipv4_tcp_sk.sock, cpu) = sk; } if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 538c06f95918d..47de6f3efc857 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -344,11 +344,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * we complete the initialization. */ local_bh_disable(); - inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo); + inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c index 8512cb09ebc09..d8a4f192873a2 100644 --- a/net/ipv4/tcp_sigpool.c +++ b/net/ipv4/tcp_sigpool.c @@ -10,7 +10,14 @@ #include static size_t __scratch_size; -static DEFINE_PER_CPU(void __rcu *, sigpool_scratch); +struct sigpool_scratch { + local_lock_t bh_lock; + void __rcu *pad; +}; + +static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; struct sigpool_entry { struct crypto_ahash *hash; @@ -72,7 +79,7 @@ static int sigpool_reserve_scratch(size_t size) break; } - old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), + old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), scratch, lockdep_is_held(&cpool_mutex)); if (!cpu_online(cpu) || !old_scratch) { kfree(old_scratch); @@ -93,7 +100,7 @@ static void sigpool_scratch_free(void) int cpu; for_each_possible_cpu(cpu) - kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), + kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), NULL, lockdep_is_held(&cpool_mutex))); __scratch_size = 0; } @@ -277,7 +284,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RC /* Pairs with tcp_sigpool_reserve_scratch(), scratch area is * valid (allocated) until tcp_sigpool_end(). 
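
The per-CPU TCP control socket above is converted with the same nested-BH idiom, except that the pointer and its lock travel together in a struct sock_bh_locked added elsewhere in this series; the shape below is an assumption based on how it is used here. The resulting access pattern, sketched with a made-up my_ctl_sk:

#include <linux/local_lock.h>
#include <linux/percpu.h>
#include <net/sock.h>

/* Assumed shape of the helper type used by the tcp_ipv4.c hunk above. */
struct sock_bh_locked {
	struct sock	*sock;
	local_lock_t	bh_lock;
};

static DEFINE_PER_CPU(struct sock_bh_locked, my_ctl_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static void my_send_ctl(struct net *net)
{
	struct sock *ctl_sk;

	local_bh_disable();
	local_lock_nested_bh(&my_ctl_sk.bh_lock);
	ctl_sk = this_cpu_read(my_ctl_sk.sock);
	sock_net_set(ctl_sk, net);
	/* ... build and transmit the control packet via ctl_sk ... */
	sock_net_set(ctl_sk, &init_net);
	local_unlock_nested_bh(&my_ctl_sk.bh_lock);
	local_bh_enable();
}
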
*/ - c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch)); + local_lock_nested_bh(&sigpool_scratch.bh_lock); + c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad)); return 0; } EXPORT_SYMBOL_GPL(tcp_sigpool_start); @@ -286,6 +294,7 @@ void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH) { struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req); + local_unlock_nested_bh(&sigpool_scratch.bh_lock); rcu_read_unlock_bh(); ahash_request_free(c->req); crypto_free_ahash(hash); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index c434940131b1d..c74705ead9849 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1380,7 +1380,9 @@ static int input_action_end_b6_encap(struct sk_buff *skb, return err; } -DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; bool seg6_bpf_has_valid_srh(struct sk_buff *skb) { @@ -1388,6 +1390,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; + lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return false; @@ -1408,8 +1411,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct seg6_bpf_srh_state *srh_state = - this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state *srh_state; struct ipv6_sr_hdr *srh; int ret; @@ -1420,10 +1422,14 @@ static int input_action_end_bpf(struct sk_buff *skb, } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - /* preempt_disable is needed to protect the per-CPU buffer srh_state, - * which is also accessed by the bpf_lwt_seg6_* helpers + /* The access to the per-CPU buffer srh_state is protected by running + * always in softirq context (with disabled BH). On PREEMPT_RT the + * required locking is provided by the following local_lock_nested_bh() + * statement. It is also accessed by the bpf_lwt_seg6_* helpers via + * bpf_prog_run_save_cb(). 
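
The lockdep_assert_held() calls added to the seg6 BPF helpers (and to seg6_bpf_has_valid_srh() above) turn the "caller holds the per-CPU lock" convention into something lockdep can verify. A small sketch with made-up names; callers take the lock exactly as input_action_end_bpf() does below:

#include <linux/local_lock.h>
#include <linux/lockdep.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct my_state {
	local_lock_t	bh_lock;
	bool		valid;
};

static DEFINE_PER_CPU(struct my_state, my_states) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Must be called with my_states.bh_lock held by the caller. */
static bool my_state_is_valid(void)
{
	struct my_state *st = this_cpu_ptr(&my_states);

	lockdep_assert_held(&st->bh_lock);
	return st->valid;
}
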
*/ - preempt_disable(); + local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock); + srh_state = this_cpu_ptr(&seg6_bpf_srh_states); srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; srh_state->valid = true; @@ -1446,15 +1452,15 @@ static int input_action_end_bpf(struct sk_buff *skb, if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); - preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: - preempt_enable(); + local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock); kfree_skb(skb); return -EINVAL; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7d1c0986f9bb3..7e16336044b2d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -35,8 +35,6 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) -static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); - void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -372,22 +370,23 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { - struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; err = xsk_rcv(xs, xdp); if (err) return err; - if (!xs->flush_node.prev) + if (!xs->flush_node.prev) { + struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); + list_add(&xs->flush_node, flush_list); + } return 0; } -void __xsk_map_flush(void) +void __xsk_map_flush(struct list_head *flush_list) { - struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { @@ -396,16 +395,6 @@ void __xsk_map_flush(void) } } -#ifdef CONFIG_DEBUG_NET -bool xsk_map_check_flush(void) -{ - if (list_empty(this_cpu_ptr(&xskmap_flush_list))) - return false; - __xsk_map_flush(); - return true; -} -#endif - void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { xskq_prod_submit_n(pool->cq, nb_entries); @@ -1772,7 +1761,7 @@ static struct pernet_operations xsk_net_ops = { static int __init xsk_init(void) { - int err, cpu; + int err; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) @@ -1790,8 +1779,6 @@ static int __init xsk_init(void) if (err) goto out_pernet; - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu)); return 0; out_pernet: diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 4bc3e9398ee3d..ab927a142f515 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1694,7 +1694,7 @@ long keyctl_session_to_parent(void) goto unlock; /* cancel an already pending keyring replacement */ - oldwork = task_work_cancel(parent, key_change_session_keyring); + oldwork = task_work_cancel_func(parent, key_change_session_keyring); /* the replacement session keyring is applied just prior to userspace * restarting */
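
The keyctl hunk above reflects an API rename from elsewhere in this series: matching a pending task_work item by its callback function is now spelled task_work_cancel_func(), and the call site is otherwise unchanged. A small usage sketch with made-up my_* names:

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/task_work.h>

struct my_work {
	struct callback_head	twork;
	/* payload ... */
};

static void my_twork_fn(struct callback_head *head)
{
	/* runs when the target task returns to user space */
	kfree(container_of(head, struct my_work, twork));
}

static int my_queue(struct task_struct *task, struct my_work *w)
{
	init_task_work(&w->twork, my_twork_fn);
	return task_work_add(task, &w->twork, TWA_RESUME);
}

static void my_cancel_pending(struct task_struct *task)
{
	struct callback_head *head;

	/* cancel by callback, as keyctl_session_to_parent() does above */
	head = task_work_cancel_func(task, my_twork_fn);
	if (head)
		kfree(container_of(head, struct my_work, twork));
}
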