From aa316fa10f0d125e2ff88c2cafa2c23ed9b96bb5 Mon Sep 17 00:00:00 2001
From: Wei Li <liwei391@huawei.com>
Date: Fri, 2 Jul 2021 11:12:53 +0800
Subject: [PATCH] arm64: clear_page: Add new implementation of clear_page() by
 STNP

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I3ZN72
CVE: NA

---------------------------

Currently, clear_page() clears the page through 'dc zva', but the page is
usually not used immediately, so the cache flush is in vain.

Add an optimized implementation of clear_page() using 'stnp' to improve
performance. It can be toggled with the boot parameter 'mm.use_clearpage_stnp'.

In the hugetlb clear test, we gained about 53.7% performance improvement:

Set mm.use_clearpage_stnp = 0          |  Set mm.use_clearpage_stnp = 1
[root@localhost liwei]# ./a.out 50 20  |  [root@localhost liwei]# ./a.out 50 20
size is 50 Gib, test times is 20       |  size is 50 Gib, test times is 20
test_time[0] : use 8.438046 sec        |  test_time[0] : use 3.722682 sec
test_time[1] : use 8.028493 sec        |  test_time[1] : use 3.640274 sec
test_time[2] : use 8.646547 sec        |  test_time[2] : use 4.095052 sec
test_time[3] : use 8.122490 sec        |  test_time[3] : use 3.998446 sec
test_time[4] : use 8.053038 sec        |  test_time[4] : use 4.084259 sec
test_time[5] : use 8.843512 sec        |  test_time[5] : use 3.933871 sec
test_time[6] : use 8.308906 sec        |  test_time[6] : use 3.934334 sec
test_time[7] : use 8.093817 sec        |  test_time[7] : use 3.869142 sec
test_time[8] : use 8.303504 sec        |  test_time[8] : use 3.902916 sec
test_time[9] : use 8.178336 sec        |  test_time[9] : use 3.541885 sec
test_time[10] : use 8.003625 sec       |  test_time[10] : use 3.595554 sec
test_time[11] : use 8.163807 sec       |  test_time[11] : use 3.583813 sec
test_time[12] : use 8.267464 sec       |  test_time[12] : use 3.863033 sec
test_time[13] : use 8.055326 sec       |  test_time[13] : use 3.770953 sec
test_time[14] : use 8.246986 sec       |  test_time[14] : use 3.808006 sec
test_time[15] : use 8.546992 sec       |  test_time[15] : use 3.653194 sec
test_time[16] : use 8.727256 sec       |  test_time[16] : use 3.722395 sec
test_time[17] : use 8.288951 sec       |  test_time[17] : use 3.683508 sec
test_time[18] : use 8.019322 sec       |  test_time[18] : use 4.253087 sec
test_time[19] : use 8.250685 sec       |  test_time[19] : use 4.082845 sec
hugetlb test end!                      |  hugetlb test end!

Signed-off-by: Wei Li <liwei391@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/kernel/cpufeature.c   | 34 ++++++++++++++++++++++++++++++++
 arch/arm64/lib/clear_page.S      | 21 ++++++++++++++++++++
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index e7d2bdc92acf..47f8fae7ecdb 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -68,7 +68,8 @@
 #define ARM64_WORKAROUND_1508412		58
 #define ARM64_HAS_MPAM				59
 #define ARM64_WORKAROUND_HISI_HIP08_RU_PREFETCH 60
+#define ARM64_CLEARPAGE_STNP			61
 
-#define ARM64_NCAPS				61
+#define ARM64_NCAPS				62
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d8e1bdb5abed..15b49edfe2c8 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1736,6 +1736,34 @@ cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap)
 	return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT);
 }
 
+static bool use_clearpage_stnp;
+/* Parse the boolean value of the "mm.use_clearpage_stnp" early parameter. */
+static int __init early_use_clearpage_stnp(char *p)
+{
+	return strtobool(p, &use_clearpage_stnp);
+}
+early_param("mm.use_clearpage_stnp", early_use_clearpage_stnp);
+
+static bool has_mor_nontemporal(const struct arm64_cpu_capabilities *entry)
+{
+	/*
+	 * CPUs whose non-temporal loads/stores obey memory ordering rules.
+	 * The MIDR from read_cpuid_id() is matched here; @entry is unused.
+	 */
+	static const struct midr_range cpus[] = {
+		MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
+		{},
+	};
+
+	return is_midr_in_range_list(read_cpuid_id(), cpus);
+}
+/* Match only when the boot param is set and the CPU is listed; @scope unused. */
+static bool can_clearpage_use_stnp(const struct arm64_cpu_capabilities *entry,
+				   int scope)
+{
+	return use_clearpage_stnp && has_mor_nontemporal(entry);
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -2158,6 +2186,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_mte,
 	},
 #endif /* CONFIG_ARM64_MTE */
+	{
+		.desc = "Clear Page by STNP",
+		.capability = ARM64_CLEARPAGE_STNP,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = can_clearpage_use_stnp,
+	},
 	{},
 };
 
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index 073acbf02a7c..39ad3ee1860e 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -7,6 +7,25 @@
 #include <linux/const.h>
 #include <asm/assembler.h>
 #include <asm/page.h>
+#include <asm/alternative.h>
+
+/*
+ * Clear page @dest with non-temporal pair stores (STNP), 64 bytes per pass
+ *
+ * Parameters:
+ *	x0 - dest
+ */
+SYM_FUNC_START(clear_page_stnp)
+	.align	6			// align the loop head to 2^6 = 64 bytes
+1:	stnp xzr, xzr, [x0]		// four 16-byte zero stores per iteration
+	stnp xzr, xzr, [x0, #0x10]
+	stnp xzr, xzr, [x0, #0x20]
+	stnp xzr, xzr, [x0, #0x30]
+	add	x0, x0, #0x40
+	tst	x0, #(PAGE_SIZE - 1)	// loop until x0 reaches a page boundary
+	b.ne	1b
+	ret
+SYM_FUNC_END(clear_page_stnp)
 
 /*
  * Clear page @dest
@@ -15,6 +34,8 @@
  *	x0 - dest
  */
 SYM_FUNC_START(clear_page)
+	ALTERNATIVE("nop", "b clear_page_stnp", ARM64_CLEARPAGE_STNP)
+
 	mrs	x1, dczid_el0
 	and	w1, w1, #0xf
 	mov	x2, #4
-- 
GitLab