diff --git a/fs/namespace.c b/fs/namespace.c
index 45571517b9c21..d02bd66933735 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -14,7 +14,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/namei.h>
-#include <linux/delay.h>
+#include <linux/hrtimer.h>
 #include <linux/security.h>
 #include <linux/cred.h>
 #include <linux/idr.h>
diff --git a/include/linux/delay.h b/include/linux/delay.h
index 02b37178b54f4..1d0e2ce6b6d9f 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -76,10 +76,4 @@ static inline void fsleep(unsigned long usecs)
 		msleep(DIV_ROUND_UP(usecs, 1000));
 }
 
-#ifdef CONFIG_PREEMPT_RT
-extern void cpu_chill(void);
-#else
-# define cpu_chill()	cpu_relax()
-#endif
-
 #endif /* defined(_LINUX_DELAY_H) */
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bb5e7b0a42746..e425a26a5ed88 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -540,4 +540,10 @@ int hrtimers_dead_cpu(unsigned int cpu);
 #define hrtimers_dead_cpu	NULL
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+extern void cpu_chill(void);
+#else
+# define cpu_chill()	cpu_relax()
+#endif
+
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index fa25113eda0dc..77d356fa8668e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1972,8 +1972,8 @@ config SHUFFLE_PAGE_ALLOCATOR
 	  Say Y if unsure.
 
 config SLUB_CPU_PARTIAL
-	default y
-	depends on SLUB && SMP && !PREEMPT_RT
+	default y if !PREEMPT_RT
+	depends on SLUB && SMP
 	bool "SLUB per cpu partial cache"
 	help
 	  Per cpu partial caches accelerate objects allocation and freeing
diff --git a/kernel/signal.c b/kernel/signal.c
index b87178eb0df69..e40ed99a62a17 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -405,20 +405,20 @@ void task_join_group_stop(struct task_struct *task)
 	task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
 }
 
-static inline struct sigqueue *get_task_cache(struct task_struct *t)
+static struct sigqueue *sigqueue_from_cache(struct task_struct *t)
 {
 	struct sigqueue *q = t->sigqueue_cache;
 
-	if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
-		return NULL;
-	return q;
+	if (q && cmpxchg(&t->sigqueue_cache, q, NULL) == q)
+		return q;
+	return NULL;
 }
 
-static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
+static bool sigqueue_add_cache(struct task_struct *t, struct sigqueue *q)
 {
-	if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
-		return 0;
-	return 1;
+	if (!t->sigqueue_cache && cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
+		return true;
+	return false;
 }
 
 /*
@@ -428,7 +428,7 @@ static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
  */
 static struct sigqueue *
 __sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
		    int override_rlimit, int fromslab)
-		    int override_rlimit, int fromslab)
+		    int override_rlimit, bool fromslab)
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
@@ -451,7 +451,7 @@ __sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
 	if (override_rlimit ||
 	    likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
 		if (!fromslab)
-			q = get_task_cache(t);
+			q = sigqueue_from_cache(t);
 		if (!q)
 			q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
@@ -474,7 +474,7 @@ static struct sigqueue *
 __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
 		 int override_rlimit)
 {
-	return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
+	return __sigqueue_do_alloc(sig, t, flags, override_rlimit, false);
 }
 
 static void __sigqueue_free(struct sigqueue *q)
@@ -486,7 +486,7 @@ static void __sigqueue_free(struct sigqueue *q)
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
-static void sigqueue_free_current(struct sigqueue *q)
+static void __sigqueue_cache_or_free(struct sigqueue *q)
 {
 	struct user_struct *up;
 
@@ -494,11 +494,10 @@ static void sigqueue_free_current(struct sigqueue *q)
 		return;
 
 	up = q->user;
-	if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
-		if (atomic_dec_and_test(&up->sigpending))
-			free_uid(up);
-	} else
-		__sigqueue_free(q);
+	if (atomic_dec_and_test(&up->sigpending))
+		free_uid(up);
+	if (!task_is_realtime(current) || !sigqueue_add_cache(current, q))
+		kmem_cache_free(sigqueue_cachep, q);
 }
 
 void flush_sigqueue(struct sigpending *queue)
@@ -523,7 +522,7 @@ void flush_task_sigqueue(struct task_struct *tsk)
 
 	flush_sigqueue(&tsk->pending);
 
-	q = get_task_cache(tsk);
+	q = sigqueue_from_cache(tsk);
 	if (q)
 		kmem_cache_free(sigqueue_cachep, q);
 }
@@ -652,7 +651,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i
 				(info->si_code == SI_TIMER) &&
 				(info->si_sys_private);
 
-		sigqueue_free_current(first);
+		__sigqueue_cache_or_free(first);
 	} else {
 		/*
 		 * Ok, it wasn't in the queue. This must be
@@ -1895,8 +1894,7 @@ EXPORT_SYMBOL(kill_pid);
  */
 struct sigqueue *sigqueue_alloc(void)
 {
-	/* Preallocated sigqueue objects always from the slabcache ! */
-	struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
+	struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, true);
 
 	if (q)
 		q->flags |= SIGQUEUE_PREALLOC;
diff --git a/localversion-rt b/localversion-rt
index 700c857efd9ba..22746d6390a42 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt8
+-rt9
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b1332e80dbec..36f314ae0e56a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -362,6 +362,8 @@ EXPORT_SYMBOL(nr_node_ids);
 EXPORT_SYMBOL(nr_online_nodes);
 #endif
 
+int page_group_by_mobility_disabled __read_mostly;
+
 struct pa_lock {
 	local_lock_t l;
 };
@@ -369,8 +371,6 @@ static DEFINE_PER_CPU(struct pa_lock, pa_lock) = {
 	.l	= INIT_LOCAL_LOCK(l),
 };
 
-int page_group_by_mobility_disabled __read_mostly;
-
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 /*
  * During boot we initialize deferred pages on-demand, as needed, but once
@@ -1352,7 +1352,7 @@ static inline void prefetch_buddy(struct page *page)
 }
 
 /*
- * Frees a number of pages which have been collected from the pcp lists.
+ * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free.
  *
@@ -1362,55 +1362,15 @@ static inline void prefetch_buddy(struct page *page)
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
-static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
-			       bool zone_retry)
-{
-	bool isolated_pageblocks;
-	struct page *page, *tmp;
-
-	spin_lock(&zone->lock);
-	isolated_pageblocks = has_isolate_pageblock(zone);
-
-	/*
-	 * Use safe version since after __free_one_page(),
-	 * page->lru.next will not point to original list.
-	 */
-	list_for_each_entry_safe(page, tmp, head, lru) {
-		int mt = get_pcppage_migratetype(page);
-
-		if (page_zone(page) != zone) {
-			/*
-			 * free_unref_page_list() sorts pages by zone. If we end
-			 * up with pages from a different NUMA nodes belonging
-			 * to the same ZONE index then we need to redo with the
-			 * correct ZONE pointer. Skip the page for now, redo it
-			 * on the next iteration.
-			 */
-			WARN_ON_ONCE(zone_retry == false);
-			if (zone_retry)
-				continue;
-		}
-
-		/* MIGRATE_ISOLATE page should not go to pcplists */
-		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
-		/* Pageblock could have been isolated meanwhile */
-		if (unlikely(isolated_pageblocks))
-			mt = get_pageblock_migratetype(page);
-
-		list_del(&page->lru);
-		__free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
-		trace_mm_page_pcpu_drain(page, 0, mt);
-	}
-	spin_unlock(&zone->lock);
-}
-
-static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
-			      struct list_head *dst)
+static void free_pcppages_bulk(struct zone *zone, int count,
+			       struct per_cpu_pages *pcp)
 {
 	int migratetype = 0;
 	int batch_free = 0;
 	int prefetch_nr = READ_ONCE(pcp->batch);
-	struct page *page;
+	bool isolated_pageblocks;
+	struct page *page, *tmp;
+	LIST_HEAD(head);
 
 	/*
 	 * Ensure proper count is passed which otherwise would stuck in the
@@ -1447,7 +1407,7 @@ static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
 			if (bulkfree_pcp_prepare(page))
 				continue;
 
-			list_add_tail(&page->lru, dst);
+			list_add_tail(&page->lru, &head);
 
 			/*
 			 * We are going to put the page back to the global
@@ -1464,6 +1424,26 @@ static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
 			}
 		} while (--count && --batch_free && !list_empty(list));
 	}
+
+	spin_lock(&zone->lock);
+	isolated_pageblocks = has_isolate_pageblock(zone);
+
+	/*
+	 * Use safe version since after __free_one_page(),
+	 * page->lru.next will not point to original list.
+	 */
+	list_for_each_entry_safe(page, tmp, &head, lru) {
+		int mt = get_pcppage_migratetype(page);
+		/* MIGRATE_ISOLATE page should not go to pcplists */
+		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+		/* Pageblock could have been isolated meanwhile */
+		if (unlikely(isolated_pageblocks))
+			mt = get_pageblock_migratetype(page);
+
+		__free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
+		trace_mm_page_pcpu_drain(page, 0, mt);
+	}
+	spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone,
@@ -2984,18 +2964,13 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
 	int to_drain, batch;
-	LIST_HEAD(dst);
 
 	local_lock_irqsave(&pa_lock.l, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
 	if (to_drain > 0)
-		isolate_pcp_pages(to_drain, pcp, &dst);
-
+		free_pcppages_bulk(zone, to_drain, pcp);
 	local_unlock_irqrestore(&pa_lock.l, flags);
-
-	if (to_drain > 0)
-		free_pcppages_bulk(zone, &dst, false);
 }
 #endif
 
@@ -3011,21 +2986,14 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 	unsigned long flags;
 	struct per_cpu_pageset *pset;
 	struct per_cpu_pages *pcp;
-	LIST_HEAD(dst);
-	int count;
 
 	local_lock_irqsave(&pa_lock.l, flags);
 	pset = per_cpu_ptr(zone->pageset, cpu);
 
 	pcp = &pset->pcp;
-	count = pcp->count;
-	if (count)
-		isolate_pcp_pages(count, pcp, &dst);
-
+	if (pcp->count)
+		free_pcppages_bulk(zone, pcp->count, pcp);
 	local_unlock_irqrestore(&pa_lock.l, flags);
-
-	if (count)
-		free_pcppages_bulk(zone, &dst, false);
 }
 
 /*
@@ -3245,8 +3213,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
 	return true;
 }
 
-static void free_unref_page_commit(struct page *page, unsigned long pfn,
-				   struct list_head *dst)
+static void free_unref_page_commit(struct page *page, unsigned long pfn)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
@@ -3275,7 +3242,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= READ_ONCE(pcp->high))
-		isolate_pcp_pages(READ_ONCE(pcp->batch), pcp, dst);
+		free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
 }
 
 /*
@@ -3285,17 +3252,13 @@ void free_unref_page(struct page *page)
 {
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
-	struct zone *zone = page_zone(page);
-	LIST_HEAD(dst);
 
 	if (!free_unref_page_prepare(page, pfn))
 		return;
 
 	local_lock_irqsave(&pa_lock.l, flags);
-	free_unref_page_commit(page, pfn, &dst);
+	free_unref_page_commit(page, pfn);
 	local_unlock_irqrestore(&pa_lock.l, flags);
-	if (!list_empty(&dst))
-		free_pcppages_bulk(zone, &dst, false);
 }
 
 /*
@@ -3306,11 +3269,6 @@ void free_unref_page_list(struct list_head *list)
 	struct page *page, *next;
 	unsigned long flags, pfn;
 	int batch_count = 0;
-	struct list_head dsts[__MAX_NR_ZONES];
-	int i;
-
-	for (i = 0; i < __MAX_NR_ZONES; i++)
-		INIT_LIST_HEAD(&dsts[i]);
 
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
@@ -3323,12 +3281,10 @@ void free_unref_page_list(struct list_head *list)
 	local_lock_irqsave(&pa_lock.l, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_private(page);
-		enum zone_type type;
 
 		set_page_private(page, 0);
 		trace_mm_page_free_batched(page);
-		type = page_zonenum(page);
-		free_unref_page_commit(page, pfn, &dsts[type]);
+		free_unref_page_commit(page, pfn);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
@@ -3341,21 +3297,6 @@ void free_unref_page_list(struct list_head *list)
 		}
 	}
 	local_unlock_irqrestore(&pa_lock.l, flags);
-
-	for (i = 0; i < __MAX_NR_ZONES; ) {
-		struct page *page;
-		struct zone *zone;
-
-		if (list_empty(&dsts[i])) {
-			i++;
-			continue;
-		}
-
-		page = list_first_entry(&dsts[i], struct page, lru);
-		zone = page_zone(page);
-
-		free_pcppages_bulk(zone, &dsts[i], true);
-	}
 }
 
 /*
@@ -3552,6 +3493,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	 * allocate greater than order-1 page units with __GFP_NOFAIL.
 	 */
 	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
 	local_lock_irqsave(&pa_lock.l, flags);
 	spin_lock(&zone->lock);
 
@@ -3574,6 +3516,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	spin_unlock(&zone->lock);
 	if (!page)
 		goto failed;
+
 	__mod_zone_freepage_state(zone, -(1 << order),
 				  get_pcppage_migratetype(page));
 
diff --git a/mm/slub.c b/mm/slub.c
index 570c7e32f4d63..0e317cbb8c255 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1499,12 +1499,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
 }
 #endif /* CONFIG_SLUB_DEBUG */
 
-struct slub_free_list {
-	raw_spinlock_t		lock;
-	struct list_head	list;
-};
-static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
-
 /*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
@@ -1860,16 +1854,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	__free_pages(page, order);
 }
 
-static void free_delayed(struct list_head *h)
-{
-	while (!list_empty(h)) {
-		struct page *page = list_first_entry(h, struct page, lru);
-
-		list_del(&page->lru);
-		__free_slab(page->slab_cache, page);
-	}
-}
-
 static void rcu_free_slab(struct rcu_head *h)
 {
 	struct page *page = container_of(h, struct page, rcu_head);
@@ -1881,22 +1865,33 @@ static void free_slab(struct kmem_cache *s, struct page *page)
 {
 	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
 		call_rcu(&page->rcu_head, rcu_free_slab);
-	} else if (irqs_disabled()) {
-		struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
-
-		raw_spin_lock(&f->lock);
-		list_add(&page->lru, &f->list);
-		raw_spin_unlock(&f->lock);
 	} else
 		__free_slab(s, page);
 }
 
+static void discard_slab_delayed(struct kmem_cache *s, struct page *page,
+				 struct list_head *delayed_free)
+{
+	dec_slabs_node(s, page_to_nid(page), page->objects);
+	list_add(&page->lru, delayed_free);
+}
+
 static void discard_slab(struct kmem_cache *s, struct page *page)
 {
 	dec_slabs_node(s, page_to_nid(page), page->objects);
 	free_slab(s, page);
 }
 
+static void discard_delayed(struct list_head *l)
+{
+	while (!list_empty(l)) {
+		struct page *page = list_first_entry(l, struct page, lru);
+
+		list_del(&page->lru);
+		__free_slab(page->slab_cache, page);
+	}
+}
+
 /*
 * Management of partially allocated slabs.
 */
@@ -1970,15 +1965,16 @@ static inline void *acquire_slab(struct kmem_cache *s,
 	WARN_ON(!freelist);
 	return freelist;
 }
-
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain,
+			    struct list_head *delayed_free);
 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
 
 /*
 * Try to allocate a partial slab from a specific node.
 */
 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
-				struct kmem_cache_cpu *c, gfp_t flags)
+			      struct kmem_cache_cpu *c, gfp_t flags,
+			      struct list_head *delayed_free)
 {
 	struct page *page, *page2;
 	void *object = NULL;
@@ -2011,7 +2007,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 			stat(s, ALLOC_FROM_PARTIAL);
 			object = t;
 		} else {
-			put_cpu_partial(s, page, 0);
+			put_cpu_partial(s, page, 0, delayed_free);
 			stat(s, CPU_PARTIAL_NODE);
 		}
 		if (!kmem_cache_has_cpu_partial(s)
@@ -2027,7 +2023,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 * Get a page from somewhere. Search in increasing NUMA distances.
 */
 static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
-		struct kmem_cache_cpu *c)
+			     struct kmem_cache_cpu *c,
+			     struct list_head *delayed_free)
 {
 #ifdef CONFIG_NUMA
 	struct zonelist *zonelist;
@@ -2069,7 +2066,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 
 			if (n && cpuset_zone_allowed(zone, flags) &&
 					n->nr_partial > s->min_partial) {
-				object = get_partial_node(s, n, c, flags);
+				object = get_partial_node(s, n, c, flags, delayed_free);
 				if (object) {
 					/*
 					 * Don't check read_mems_allowed_retry()
@@ -2091,7 +2088,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 * Get a partial page, lock it and return it.
 */
 static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
-		struct kmem_cache_cpu *c)
+		struct kmem_cache_cpu *c,
+		struct list_head *delayed_free)
 {
 	void *object;
 	int searchnode = node;
@@ -2099,11 +2097,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
 	if (node == NUMA_NO_NODE)
 		searchnode = numa_mem_id();
 
-	object = get_partial_node(s, get_node(s, searchnode), c, flags);
+	object = get_partial_node(s, get_node(s, searchnode), c, flags,
+				  delayed_free);
 	if (object || node != NUMA_NO_NODE)
 		return object;
 
-	return get_any_partial(s, flags, c);
+	return get_any_partial(s, flags, c, delayed_free);
 }
 
 #ifdef CONFIG_PREEMPTION
@@ -2179,7 +2178,8 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
 * Remove the cpu slab
 */
 static void deactivate_slab(struct kmem_cache *s, struct page *page,
-				void *freelist, struct kmem_cache_cpu *c)
+			    void *freelist, struct kmem_cache_cpu *c,
+			    struct list_head *delayed_free)
 {
 	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
 	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -2316,7 +2316,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
 		stat(s, DEACTIVATE_FULL);
 	else if (m == M_FREE) {
 		stat(s, DEACTIVATE_EMPTY);
-		discard_slab(s, page);
+		discard_slab_delayed(s, page, delayed_free);
 		stat(s, FREE_SLAB);
 	}
 
@@ -2331,8 +2331,8 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
 * for the cpu using c (or some other guarantee must be there
 * to guarantee no concurrent accesses).
 */
-static void unfreeze_partials(struct kmem_cache *s,
-		struct kmem_cache_cpu *c)
+static void unfreeze_partials(struct kmem_cache *s, struct kmem_cache_cpu *c,
+			      struct list_head *delayed_free)
 {
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	struct kmem_cache_node *n = NULL, *n2 = NULL;
@@ -2386,7 +2386,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 		discard_page = discard_page->next;
 
 		stat(s, DEACTIVATE_EMPTY);
-		discard_slab(s, page);
+		discard_slab_delayed(s, page, delayed_free);
 		stat(s, FREE_SLAB);
 	}
 #endif	/* CONFIG_SLUB_CPU_PARTIAL */
@@ -2399,7 +2399,8 @@ static void unfreeze_partials(struct kmem_cache *s,
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 */
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain,
+			    struct list_head *delayed_free)
 {
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	struct page *oldpage;
@@ -2416,21 +2417,15 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 			pobjects = oldpage->pobjects;
 			pages = oldpage->pages;
 			if (drain && pobjects > slub_cpu_partial(s)) {
-				struct slub_free_list *f;
 				unsigned long flags;
-				LIST_HEAD(tofree);
 				/*
 				 * partial array is full. Move the existing
 				 * set to the per node partial list.
 				 */
 				local_irq_save(flags);
-				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
-				f = this_cpu_ptr(&slub_free_list);
-				raw_spin_lock(&f->lock);
-				list_splice_init(&f->list, &tofree);
-				raw_spin_unlock(&f->lock);
+				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab),
+						  delayed_free);
 				local_irq_restore(flags);
-				free_delayed(&tofree);
 				oldpage = NULL;
 				pobjects = 0;
 				pages = 0;
@@ -2451,17 +2446,18 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		unsigned long flags;
 
 		local_irq_save(flags);
-		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab), delayed_free);
 		local_irq_restore(flags);
 	}
 	preempt_enable();
 #endif	/* CONFIG_SLUB_CPU_PARTIAL */
 }
 
-static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c,
+			      struct list_head *delayed_free)
 {
 	stat(s, CPUSLAB_FLUSH);
-	deactivate_slab(s, c->page, c->freelist, c);
+	deactivate_slab(s, c->page, c->freelist, c, delayed_free);
 
 	c->tid = next_tid(c->tid);
 }
@@ -2471,46 +2467,81 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 *
 * Called from IPI handler with interrupts disabled.
 */
-static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu,
+				    struct list_head *delayed_free)
 {
 	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 
 	if (c->page)
-		flush_slab(s, c);
+		flush_slab(s, c, delayed_free);
 
-	unfreeze_partials(s, c);
+	unfreeze_partials(s, c, delayed_free);
 }
 
-static void flush_cpu_slab(void *d)
-{
-	struct kmem_cache *s = d;
+struct slub_flush_work {
+	struct work_struct work;
+	struct kmem_cache *s;
+	bool skip;
+};
 
-	__flush_cpu_slab(s, smp_processor_id());
+static void flush_cpu_slab(struct work_struct *w)
+{
+	struct slub_flush_work *sfw;
+	LIST_HEAD(delayed_free);
+
+	sfw = container_of(w, struct slub_flush_work, work);
+
+	local_irq_disable();
+	__flush_cpu_slab(sfw->s, smp_processor_id(), &delayed_free);
+	local_irq_enable();
+
+	discard_delayed(&delayed_free);
 }
 
-static bool has_cpu_slab(int cpu, void *info)
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
 {
-	struct kmem_cache *s = info;
 	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 
 	return c->page || slub_percpu_partial(c);
 }
 
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
+
+static void flush_all_locked(struct kmem_cache *s)
+{
+	struct slub_flush_work *sfw;
+	unsigned int cpu;
+
+	mutex_lock(&flush_lock);
+
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+		if (!has_cpu_slab(cpu, s)) {
+			sfw->skip = true;
+			continue;
+		}
+		INIT_WORK(&sfw->work, flush_cpu_slab);
+		sfw->skip = false;
+		sfw->s = s;
+		schedule_work_on(cpu, &sfw->work);
+	}
+
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+		if (sfw->skip)
+			continue;
+		flush_work(&sfw->work);
+	}
+
+	mutex_unlock(&flush_lock);
+}
+
 static void flush_all(struct kmem_cache *s)
 {
-	LIST_HEAD(tofree);
-	int cpu;
-
-	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
-	for_each_online_cpu(cpu) {
-		struct slub_free_list *f;
-
-		f = &per_cpu(slub_free_list, cpu);
-		raw_spin_lock_irq(&f->lock);
-		list_splice_init(&f->list, &tofree);
-		raw_spin_unlock_irq(&f->lock);
-		free_delayed(&tofree);
-	}
+	cpus_read_lock();
+	flush_all_locked(s);
+	cpus_read_unlock();
 }
 
 /*
@@ -2521,13 +2552,15 @@ static int slub_cpu_dead(unsigned int cpu)
 {
 	struct kmem_cache *s;
 	unsigned long flags;
+	LIST_HEAD(delayed_free);
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
 		local_irq_save(flags);
-		__flush_cpu_slab(s, cpu);
+		__flush_cpu_slab(s, cpu, &delayed_free);
 		local_irq_restore(flags);
 	}
+	discard_delayed(&delayed_free);
 	mutex_unlock(&slab_mutex);
 	return 0;
 }
@@ -2611,7 +2644,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
 }
 
 static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
-			int node, struct kmem_cache_cpu **pc)
+			int node, struct kmem_cache_cpu **pc,
+			struct list_head *delayed_free)
 {
 	void *freelist;
 	struct kmem_cache_cpu *c = *pc;
@@ -2619,7 +2653,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
 
 	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
 
-	freelist = get_partial(s, flags, node, c);
+	freelist = get_partial(s, flags, node, c, delayed_free);
 	if (freelist)
 		return freelist;
 
@@ -2628,7 +2662,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
 	if (page) {
 		c = raw_cpu_ptr(s->cpu_slab);
 		if (c->page)
-			flush_slab(s, c);
+			flush_slab(s, c, delayed_free);
 
 		/*
 		 * No other reference to the page yet so we can
@@ -2708,9 +2742,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
 */
 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 			  unsigned long addr, struct kmem_cache_cpu *c,
-			  struct list_head *to_free)
+			  struct list_head *delayed_free)
 {
-	struct slub_free_list *f;
 	void *freelist;
 	struct page *page;
 
@@ -2739,7 +2772,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 			goto redo;
 		} else {
 			stat(s, ALLOC_NODE_MISMATCH);
-			deactivate_slab(s, page, c->freelist, c);
+			deactivate_slab(s, page, c->freelist, c, delayed_free);
 			goto new_slab;
 		}
 	}
@@ -2750,7 +2783,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	 * information when the page leaves the per-cpu allocator
 	 */
 	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
-		deactivate_slab(s, page, c->freelist, c);
+		deactivate_slab(s, page, c->freelist, c, delayed_free);
 		goto new_slab;
 	}
 
@@ -2778,13 +2811,6 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	VM_BUG_ON(!c->page->frozen);
 	c->freelist = get_freepointer(s, freelist);
 	c->tid = next_tid(c->tid);
-
-out:
-	f = this_cpu_ptr(&slub_free_list);
-	raw_spin_lock(&f->lock);
-	list_splice_init(&f->list, to_free);
-	raw_spin_unlock(&f->lock);
-
 	return freelist;
 
 new_slab:
@@ -2796,11 +2822,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto redo;
 	}
 
-	freelist = new_slab_objects(s, gfpflags, node, &c);
+	freelist = new_slab_objects(s, gfpflags, node, &c, delayed_free);
 
 	if (unlikely(!freelist)) {
 		slab_out_of_memory(s, gfpflags, node);
-		goto out;
+		return NULL;
 	}
 
 	page = c->page;
@@ -2812,8 +2838,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	    !alloc_debug_processing(s, page, freelist, addr))
 		goto new_slab;	/* Slab failed checks. Next slab needed */
 
-	deactivate_slab(s, page, get_freepointer(s, freelist), c);
-	goto out;
+	deactivate_slab(s, page, get_freepointer(s, freelist), c, delayed_free);
+	return freelist;
 }
 
 /*
@@ -2825,7 +2851,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
 	void *p;
 	unsigned long flags;
-	LIST_HEAD(tofree);
+	LIST_HEAD(delayed_free);
 
 	local_irq_save(flags);
 #ifdef CONFIG_PREEMPTION
@@ -2837,9 +2863,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	c = this_cpu_ptr(s->cpu_slab);
 #endif
 
-	p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
+	p = ___slab_alloc(s, gfpflags, node, addr, c, &delayed_free);
 	local_irq_restore(flags);
-	free_delayed(&tofree);
+	discard_delayed(&delayed_free);
 	return p;
 }
 
@@ -3094,11 +3120,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		 */
 		stat(s, FREE_FROZEN);
 	} else if (new.frozen) {
+		LIST_HEAD(delayed_free);
 		/*
 		 * If we just froze the page then put it onto the
 		 * per cpu partial list.
 		 */
-		put_cpu_partial(s, page, 1);
+		put_cpu_partial(s, page, 1, &delayed_free);
+		discard_delayed(&delayed_free);
 		stat(s, CPU_PARTIAL_FREE);
 	}
 
@@ -3340,9 +3368,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			  void **p)
 {
 	struct kmem_cache_cpu *c;
-	LIST_HEAD(to_free);
 	int i;
 	struct obj_cgroup *objcg = NULL;
+	LIST_HEAD(delayed_free);
 
 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP))
 		WARN_ON_ONCE(!preemptible() &&
@@ -3378,7 +3406,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			 * of re-populating per CPU c->freelist
 			 */
 			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
-					    _RET_IP_, c, &to_free);
+					    _RET_IP_, c, &delayed_free);
 			if (unlikely(!p[i]))
 				goto error;
 
@@ -3393,7 +3421,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	}
 	c->tid = next_tid(c->tid);
 	local_irq_enable();
-	free_delayed(&to_free);
+
+	discard_delayed(&delayed_free);
 
 	/* Clear memory outside IRQ disabled fastpath loop */
 	if (unlikely(slab_want_init_on_alloc(flags, s))) {
@@ -3408,7 +3437,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	return i;
 error:
 	local_irq_enable();
-	free_delayed(&to_free);
+	discard_delayed(&delayed_free);
 	slab_post_alloc_hook(s, objcg, flags, i, p);
 	__kmem_cache_free_bulk(s, i, p);
 	return 0;
@@ -3994,7 +4023,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	int node;
 	struct kmem_cache_node *n;
 
-	flush_all(s);
+	flush_all_locked(s);
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
 		free_partial(s, n);
@@ -4234,7 +4263,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	unsigned long flags;
 	int ret = 0;
 
-	flush_all(s);
+	flush_all_locked(s);
 	for_each_kmem_cache_node(s, node, n) {
 		INIT_LIST_HEAD(&discard);
 		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
@@ -4418,6 +4447,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
 	int node;
 	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
 	struct kmem_cache_node *n;
+	LIST_HEAD(delayed_free);
 
 	memcpy(s, static_cache, kmem_cache->object_size);
 
@@ -4426,7 +4456,8 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
 	 * up. Even if it weren't true, IRQs are not up so we couldn't fire
 	 * IPIs around.
 	 */
-	__flush_cpu_slab(s, smp_processor_id());
+	__flush_cpu_slab(s, smp_processor_id(), &delayed_free);
+	discard_delayed(&delayed_free);
 	for_each_kmem_cache_node(s, node, n) {
 		struct page *p;
 
@@ -4446,12 +4477,6 @@ void __init kmem_cache_init(void)
 {
 	static __initdata struct kmem_cache boot_kmem_cache,
 		boot_kmem_cache_node;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
-		INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
-	}
 
 	if (debug_guardpage_minorder())
 		slub_max_order = 0;
@@ -4712,6 +4737,9 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
 	struct location *l;
 	int order;
 
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && flags == GFP_ATOMIC)
+		return 0;
+
 	order = get_order(sizeof(struct location) * max);
 
 	l = (void *)__get_free_pages(flags, order);