Commit 936546dc authored by Heikki Linnakangas

Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86.

Eliminate the separate 'len' variable from the loops, and also use the
4-byte instruction. This shaves off a few more cycles. Even though this
routine, which uses the special SSE 4.2 instructions, is much faster than
a generic routine, it's still a hot spot, so let's make it as fast as
possible.
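
For illustration, here is a standalone sketch (hypothetical code, not part of
the commit) of the loop structure described above: a pointer is walked toward
a precomputed end pointer instead of decrementing 'len', with a 4-byte step
between the 8-byte loop and the final byte-at-a-time loop. It uses memcpy for
the unaligned loads where the committed code casts the pointer directly; build
with something like "gcc -O2 -msse4.2".

#include <nmmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
crc32c_sse42_sketch(uint32_t crc, const void *data, size_t len)
{
	const unsigned char *p = data;
	const unsigned char *pend = p + len;

#ifdef __x86_64__
	/* Eight bytes at a time; _mm_crc32_u64 exists only on x86-64. */
	while (p + 8 <= pend)
	{
		uint64_t	chunk;

		memcpy(&chunk, p, 8);	/* unaligned load */
		crc = (uint32_t) _mm_crc32_u64(crc, chunk);
		p += 8;
	}
#endif
	/* Four bytes at a time (the widest step available on 32-bit x86). */
	while (p + 4 <= pend)
	{
		uint32_t	chunk;

		memcpy(&chunk, p, 4);
		crc = _mm_crc32_u32(crc, chunk);
		p += 4;
	}
	/* Any remaining bytes, one at a time. */
	while (p < pend)
		crc = _mm_crc32_u8(crc, *p++);

	return crc;
}

int
main(void)
{
	const char *buf = "123456789";

	/* Standard CRC-32C check value for "123456789" is 0xE3069283. */
	printf("%08X\n", (unsigned) ~crc32c_sse42_sketch(~0U, buf, 9));
	return 0;
}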

Change the configure test to not test _mm_crc32_u64. That variant is only
available on the 64-bit x86-64 architecture, not on 32-bit x86. Modify
pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With
these changes, the SSE-accelerated CRC-32C implementation can also be used
on 32-bit x86 systems.
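
Concretely, the configure probe (see the configure hunks below) now compiles
and links roughly the following program, which needs only the 8-bit and 32-bit
intrinsics and therefore also succeeds when targeting 32-bit x86:

#include <nmmintrin.h>

int
main(void)
{
	unsigned int crc = 0;

	crc = _mm_crc32_u8(crc, 0);
	crc = _mm_crc32_u32(crc, 0);	/* the 8-byte _mm_crc32_u64 is deliberately not tested */
	return 0;
}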

This also fixes the 32-bit MSVC build.
Parent b73e7a07
@@ -476,12 +476,16 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
 # PGAC_SSE42_CRC32_INTRINSICS
 # -----------------------
-# Check if the compiler supports _mm_crc32_u8 and _mm_crc32_u64 intrinsics.
+# Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
+# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't
+# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
+# the other ones are, on x86-64 platforms)
 #
 # An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
 # intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42.
 AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
 [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=$1], [Ac_cachevar],
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
 [pgac_save_CFLAGS=$CFLAGS
 CFLAGS="$pgac_save_CFLAGS $1"
 ac_save_c_werror_flag=$ac_c_werror_flag
@@ -489,7 +493,7 @@ ac_c_werror_flag=yes
 AC_TRY_LINK([#include <nmmintrin.h>],
 [unsigned int crc = 0;
 crc = _mm_crc32_u8(crc, 0);
-crc = (unsigned int) _mm_crc32_u64(crc, 0);],
+crc = _mm_crc32_u32(crc, 0);],
 [Ac_cachevar=yes],
 [Ac_cachevar=no])
 ac_c_werror_flag=$ac_save_c_werror_flag
@@ -14172,8 +14172,8 @@ fi
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
 # with the default compiler flags. If not, check if adding the -msse4.2
 # flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
 if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
 $as_echo_n "(cached) " >&6
 else
@@ -14189,7 +14189,7 @@ main ()
 {
 unsigned int crc = 0;
 crc = _mm_crc32_u8(crc, 0);
-crc = (unsigned int) _mm_crc32_u64(crc, 0);
+crc = _mm_crc32_u32(crc, 0);
 ;
 return 0;
 }
@@ -14212,8 +14212,8 @@ if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
 fi
 
 if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
 if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
 $as_echo_n "(cached) " >&6
 else
@@ -14229,7 +14229,7 @@ main ()
 {
 unsigned int crc = 0;
 crc = _mm_crc32_u8(crc, 0);
-crc = (unsigned int) _mm_crc32_u64(crc, 0);
+crc = _mm_crc32_u32(crc, 0);
 ;
 return 0;
 }
@@ -22,30 +22,45 @@ pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
-	const uint64 *p8;
+	const unsigned char *pend = p + len;
 
 	/*
 	 * Process eight bytes of data at a time.
 	 *
-	 * NB: We do unaligned 8-byte accesses here. The Intel architecture
-	 * allows that, and performance testing didn't show any performance
-	 * gain from aligning the beginning address.
+	 * NB: We do unaligned accesses here. The Intel architecture allows that,
+	 * and performance testing didn't show any performance gain from aligning
+	 * the begin address.
 	 */
-	p8 = (const uint64 *) p;
-	while (len >= 8)
+#ifdef __x86_64__
+	while (p + 8 <= pend)
 	{
-		crc = (uint32) _mm_crc32_u64(crc, *p8++);
-		len -= 8;
+		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
+		p += 8;
 	}
 
+	/* Process remaining full four bytes if any */
+	if (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#else
 	/*
-	 * Handle any remaining bytes one at a time.
+	 * Process four bytes at a time. (The eight byte instruction is not
+	 * available on the 32-bit x86 architecture).
 	 */
-	p = (const unsigned char *) p8;
-	while (len > 0)
+	while (p + 4 <= pend)
 	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#endif /* __x86_64__ */
+
+	/* Process any remaining bytes one at a time. */
+	while (p < pend)
 	{
-		crc = _mm_crc32_u8(crc, *p++);
-		len--;
+		crc = _mm_crc32_u8(crc, *p);
+		p++;
 	}
 
 	return crc;
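
As a usage note (not part of this diff): callers do not normally invoke
pg_comp_crc32c_sse42() directly; they go through the CRC-32C macros in
src/include/port/pg_crc32c.h, which resolve to this routine when SSE 4.2
support is compiled in or selected at runtime. Roughly, with buf/buflen
standing in for the caller's data:

pg_crc32c	crc;

INIT_CRC32C(crc);				/* crc = 0xFFFFFFFF */
COMP_CRC32C(crc, buf, buflen);	/* may resolve to pg_comp_crc32c_sse42() */
FIN_CRC32C(crc);				/* crc ^= 0xFFFFFFFF */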