percpu: optimize locking in pcpu_balance_workfn()

pcpu_balance_workfn() unconditionally calls pcpu_balance_free(), pcpu_reclaim_populated(), pcpu_balance_populated() and pcpu_balance_free() again.

Each call to pcpu_balance_free() and pcpu_reclaim_populated() will cause at least one acquisition of the pcpu_lock. So even if the balancing was scheduled because of a failed atomic allocation, pcpu_lock will be acquired at least 4 times. This obviously increases the contention on the pcpu_lock.

To optimize the scheme, let's grab the pcpu_lock on the upper level (in pcpu_balance_workfn()) and keep it generally locked for the whole duration of the scheduled work, but release it conditionally to perform any slow operations like chunk (de)population and creation of new chunks.

Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>

percpu: optimize locking in pcpu_balance_workfn()
pcpu_balance_workfn() unconditionally calls pcpu_balance_free(), pcpu_reclaim_populated(), pcpu_balance_populated() and pcpu_balance_free() again.

Each call to pcpu_balance_free() and pcpu_reclaim_populated() will cause at least one acquisition of the pcpu_lock. So even if the balancing was scheduled because of a failed atomic allocation, pcpu_lock will be acquired at least 4 times. This obviously increases the contention on the pcpu_lock.

To optimize the scheme, let's grab the pcpu_lock on the upper level (in pcpu_balance_workfn()) and keep it generally locked for the whole duration of the scheduled work, but release it conditionally to perform any slow operations like chunk (de)population and creation of new chunks.

Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
e4d77700 · Roman Gushchin · Dennis Zhou · 4829c791 · e4d77700
隐藏空白更改
内联并排

Showing 1 changed file with 29 additions and 12 deletions

mm/percpu.c mm/percpu.c +29 -12

未找到文件。
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1980,6 +1980,9 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 * If empty_only is %false, reclaim all fully free chunks regardless of the
 * number of populated pages.  Otherwise, only reclaim chunks that have no
 * populated pages.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
 */
 static void pcpu_balance_free(bool empty_only)
 {
@@ -1987,12 +1990,12 @@ static void pcpu_balance_free(bool empty_only)
 	struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
 	struct pcpu_chunk *chunk, *next;

+	lockdep_assert_held(&pcpu_lock);
+
 	/*
 	 * There's no reason to keep around multiple unused chunks and VM
 	 * areas can be scarce.  Destroy all free chunks except for one.
 	 */
-	spin_lock_irq(&pcpu_lock);
-
 	list_for_each_entry_safe(chunk, next, free_head, list) {
 		WARN_ON(chunk->immutable);

@@ -2004,8 +2007,10 @@ static void pcpu_balance_free(bool empty_only)
 			list_move(&chunk->list, &to_free);
 	}

-	spin_unlock_irq(&pcpu_lock);
+	if (list_empty(&to_free))
+		return;

+	spin_unlock_irq(&pcpu_lock);
 	list_for_each_entry_safe(chunk, next, &to_free, list) {
 		unsigned int rs, re;

@@ -2019,6 +2024,7 @@ static void pcpu_balance_free(bool empty_only)
 		pcpu_destroy_chunk(chunk);
 		cond_resched();
 	}
+	spin_lock_irq(&pcpu_lock);
 }

 /**
@@ -2029,6 +2035,9 @@ static void pcpu_balance_free(bool empty_only)
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
 */
 static void pcpu_balance_populated(void)
 {
@@ -2037,6 +2046,8 @@ static void pcpu_balance_populated(void)
 	struct pcpu_chunk *chunk;
 	int slot, nr_to_pop, ret;

+	lockdep_assert_held(&pcpu_lock);
+
 	/*
 	 * Ensure there are certain number of free populated pages for
 	 * atomic allocs.  Fill up from the most packed so that atomic
@@ -2064,13 +2075,11 @@ static void pcpu_balance_populated(void)
 		if (!nr_to_pop)
 			break;

-		spin_lock_irq(&pcpu_lock);
 		list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
 			nr_unpop = chunk->nr_pages - chunk->nr_populated;
 			if (nr_unpop)
 				break;
 		}
-		spin_unlock_irq(&pcpu_lock);

 		if (!nr_unpop)
 			continue;
@@ -2080,12 +2089,13 @@ static void pcpu_balance_populated(void)
 					     chunk->nr_pages) {
 			int nr = min_t(int, re - rs, nr_to_pop);

+			spin_unlock_irq(&pcpu_lock);
 			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+			cond_resched();
+			spin_lock_irq(&pcpu_lock);
 			if (!ret) {
 				nr_to_pop -= nr;
-				spin_lock_irq(&pcpu_lock);
 				pcpu_chunk_populated(chunk, rs, rs + nr);
-				spin_unlock_irq(&pcpu_lock);
 			} else {
 				nr_to_pop = 0;
 			}
@@ -2097,11 +2107,12 @@ static void pcpu_balance_populated(void)

 	if (nr_to_pop) {
 		/* ran out of chunks to populate, create a new one and retry */
+		spin_unlock_irq(&pcpu_lock);
 		chunk = pcpu_create_chunk(gfp);
+		cond_resched();
+		spin_lock_irq(&pcpu_lock);
 		if (chunk) {
-			spin_lock_irq(&pcpu_lock);
 			pcpu_chunk_relocate(chunk, -1);
-			spin_unlock_irq(&pcpu_lock);
 			goto retry_pop;
 		}
 	}
@@ -2117,6 +2128,10 @@ static void pcpu_balance_populated(void)
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ *
 */
 static void pcpu_reclaim_populated(void)
 {
@@ -2124,7 +2139,7 @@ static void pcpu_reclaim_populated(void)
 	struct pcpu_block_md *block;
 	int i, end;

-	spin_lock_irq(&pcpu_lock);
+	lockdep_assert_held(&pcpu_lock);

 restart:
 	/*
@@ -2190,8 +2205,6 @@ static void pcpu_reclaim_populated(void)
 			list_move(&chunk->list,
 				  &pcpu_chunk_lists[pcpu_sidelined_slot]);
 	}
-
-	spin_unlock_irq(&pcpu_lock);
 }

 /**
@@ -2212,10 +2225,14 @@ static void pcpu_balance_workfn(struct work_struct *work)
 	 * appropriate.
 	 */
 	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
+
 	pcpu_balance_free(false);
 	pcpu_reclaim_populated();
 	pcpu_balance_populated();
 	pcpu_balance_free(true);
+
+	spin_unlock_irq(&pcpu_lock);
 	mutex_unlock(&pcpu_alloc_mutex);
 }