Commit 8125cad1 authored by Alexey Arno

dbms: Server: Performance improvements. [#METR-15210]

Parent: 79bad96d
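The diff below renames prepareScale to prepare, replaces the auto locals with explicitly typed pointers, drops the reinterpret_casts, and adds __restrict to the input and output pointers of the SIMD rounding kernels. As a minimal sketch, not the code in this commit, of what the __restrict qualifier buys: promising the compiler that source and destination do not alias lets it keep values in registers and auto-vectorize the loop without overlap checks. This assumes a compiler supporting the __restrict extension (GCC/Clang/MSVC); the helper name scale_noalias is hypothetical.

#include <cstddef>

// in and out are declared non-aliasing, so the compiler may vectorize freely.
void scale_noalias(const float * __restrict in, float * __restrict out,
                   float scale, size_t n)
{
    for (size_t i = 0; i < n; ++i)
        out[i] = in[i] * scale;
}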
@@ -295,13 +295,13 @@ namespace DB
struct FloatRoundingComputation<Float32, rounding_mode, PositiveScale>
: public BaseFloatRoundingComputation<Float32>
{
static inline void prepareScale(size_t scale, Scale & mm_scale)
static inline void prepare(size_t scale, Scale & mm_scale)
{
Float32 fscale = static_cast<Float32>(scale);
mm_scale = _mm_load1_ps(&fscale);
}
static inline void compute(const Float32 * in, const Scale & scale, Float32 * out)
static inline void compute(const Float32 * __restrict in, const Scale & scale, Float32 * __restrict out)
{
__m128 val = _mm_loadu_ps(in);
val = _mm_mul_ps(val, scale);
@@ -315,13 +315,13 @@ namespace DB
struct FloatRoundingComputation<Float32, rounding_mode, NegativeScale>
: public BaseFloatRoundingComputation<Float32>
{
static inline void prepareScale(size_t scale, Scale & mm_scale)
static inline void prepare(size_t scale, Scale & mm_scale)
{
Float32 fscale = static_cast<Float32>(scale);
mm_scale = _mm_load1_ps(&fscale);
}
static inline void compute(const Float32 * in, const Scale & scale, Float32 * out)
static inline void compute(const Float32 * __restrict in, const Scale & scale, Float32 * __restrict out)
{
__m128 val = _mm_loadu_ps(in);
@@ -379,11 +379,11 @@ namespace DB
struct FloatRoundingComputation<Float32, rounding_mode, ZeroScale>
: public BaseFloatRoundingComputation<Float32>
{
static inline void prepareScale(size_t scale, Scale & mm_scale)
static inline void prepare(size_t scale, Scale & mm_scale)
{
}
static inline void compute(const Float32 * in, const Scale & scale, Float32 * out)
static inline void compute(const Float32 * __restrict in, const Scale & scale, Float32 * __restrict out)
{
__m128 val = _mm_loadu_ps(in);
val = _mm_round_ps(val, rounding_mode);
@@ -395,13 +395,13 @@ namespace DB
struct FloatRoundingComputation<Float64, rounding_mode, PositiveScale>
: public BaseFloatRoundingComputation<Float64>
{
static inline void prepareScale(size_t scale, Scale & mm_scale)
static inline void prepare(size_t scale, Scale & mm_scale)
{
Float64 fscale = static_cast<Float64>(scale);
mm_scale = _mm_load1_pd(&fscale);
}
static inline void compute(const Float64 * in, const Scale & scale, Float64 * out)
static inline void compute(const Float64 * __restrict in, const Scale & scale, Float64 * __restrict out)
{
__m128d val = _mm_loadu_pd(in);
val = _mm_mul_pd(val, scale);
@@ -415,13 +415,13 @@ namespace DB
struct FloatRoundingComputation<Float64, rounding_mode, NegativeScale>
: public BaseFloatRoundingComputation<Float64>
{
static inline void prepareScale(size_t scale, Scale & mm_scale)
static inline void prepare(size_t scale, Scale & mm_scale)
{
Float64 fscale = static_cast<Float64>(scale);
mm_scale = _mm_load1_pd(&fscale);
}
static inline void compute(const Float64 * in, const Scale & scale, Float64 * out)
static inline void compute(const Float64 * __restrict in, const Scale & scale, Float64 * __restrict out)
{
__m128d val = _mm_loadu_pd(in);
@@ -479,11 +479,11 @@ namespace DB
struct FloatRoundingComputation<Float64, rounding_mode, ZeroScale>
: public BaseFloatRoundingComputation<Float64>
{
static inline void prepareScale(size_t scale, Scale & mm_scale)
static inline void prepare(size_t scale, Scale & mm_scale)
{
}
static inline void compute(const Float64 * in, const Scale & scale, Float64 * out)
static inline void compute(const Float64 * __restrict in, const Scale & scale, Float64 * __restrict out)
{
__m128d val = _mm_loadu_pd(in);
val = _mm_round_pd(val, rounding_mode);
@@ -512,11 +512,11 @@ namespace DB
{
auto divisor = Op::prepare(scale);
auto begin_in = &in[0];
auto end_in = begin_in + in.size();
auto p_out = &out[0];
const T* begin_in = &in[0];
const T* end_in = begin_in + in.size();
for (auto p_in = begin_in; p_in != end_in; ++p_in)
T* __restrict p_out = &out[0];
for (const T* __restrict p_in = begin_in; p_in != end_in; ++p_in)
{
*p_out = Op::compute(*p_in, divisor);
++p_out;
@@ -545,43 +545,45 @@ namespace DB
static inline void apply(const PODArray<T> & in, size_t scale, typename ColumnVector<T>::Container_t & out)
{
Scale mm_scale;
Op::prepareScale(scale, mm_scale);
Op::prepare(scale, mm_scale);
const size_t data_count = std::tuple_size<Data>();
auto begin_in = &in[0];
auto end_in = begin_in + in.size();
auto limit = end_in - (data_count - 1);
const T* begin_in = &in[0];
const T* end_in = begin_in + in.size();
auto begin_out = &out[0];
auto end_out = begin_out + out.size();
T* begin_out = &out[0];
const T* end_out = begin_out + out.size();
auto p_in = begin_in;
auto p_out = begin_out;
const T* limit = end_in - (data_count - 1);
const T* __restrict p_in = begin_in;
T* __restrict p_out = begin_out;
for (; p_in < limit; p_in += data_count)
{
Op::compute(reinterpret_cast<const T *>(p_in), mm_scale, reinterpret_cast<T *>(p_out));
Op::compute(p_in, mm_scale, p_out);
p_out += data_count;
}
if (p_in < end_in)
{
Data tmp{0};
auto begin_tmp = &tmp[0];
auto end_tmp = begin_tmp + data_count;
T* begin_tmp = &tmp[0];
const T* end_tmp = begin_tmp + data_count;
for (auto p_tmp = begin_tmp; (p_tmp != end_tmp) && (p_in != end_in); ++p_tmp)
for (T* __restrict p_tmp = begin_tmp; (p_tmp != end_tmp) && (p_in != end_in); ++p_tmp)
{
*p_tmp = *p_in;
++p_in;
}
Data res;
const T* begin_res = &res[0];
const T* end_res = begin_res + data_count;
Op::compute(reinterpret_cast<T *>(&tmp), mm_scale, reinterpret_cast<T *>(&res));
auto begin_res = &res[0];
auto end_res = begin_res + data_count;
for (auto p_res = begin_res; (p_res != end_res) && (p_out != end_out); ++p_res)
for (const T* __restrict p_res = begin_res; (p_res != end_res) && (p_out != end_out); ++p_res)
{
*p_out = *p_res;
++p_out;
@@ -596,7 +598,7 @@ namespace DB
else
{
Scale mm_scale;
Op::prepareScale(scale, mm_scale);
Op::prepare(scale, mm_scale);
Data tmp{0};
tmp[0] = val;
......
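The vectorized apply above walks the input in chunks of data_count elements and handles the tail by copying the leftover values into a zero-initialized temporary, rounding that temporary as one more chunk, and copying back only the valid results. Below is a minimal standalone sketch of the same pattern, assuming SSE4.1 and a hypothetical 4-float chunk; round_all is not a name from this commit.

#include <smmintrin.h>
#include <array>
#include <cstddef>

void round_all(const float * __restrict in, float * __restrict out, size_t n)
{
    constexpr size_t chunk = 4;
    size_t i = 0;
    // Full 4-wide chunks.
    for (; i + chunk <= n; i += chunk)
        _mm_storeu_ps(out + i, _mm_round_ps(_mm_loadu_ps(in + i), _MM_FROUND_TO_NEAREST_INT));
    // Tail: round a zero-padded temporary, then copy back only the valid lanes.
    if (i < n)
    {
        std::array<float, chunk> tmp{};
        for (size_t j = 0; i + j < n; ++j)
            tmp[j] = in[i + j];
        std::array<float, chunk> res;
        _mm_storeu_ps(res.data(), _mm_round_ps(_mm_loadu_ps(tmp.data()), _MM_FROUND_TO_NEAREST_INT));
        for (size_t j = 0; i + j < n; ++j)
            out[i + j] = res[j];
    }
}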
@@ -458,16 +458,16 @@
-13 -13 -13
-13 -13 -13
-13 -13 -13
-16 -6 -16
-16 -6 -16
-16 -6 -16
-16 -6 -16
-13 -6 -16
-13 -6 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-10 -20 -10
-10 -20 -10
-10 -20 -10
-10 -20 -10
-13 -20 -10
-13 -20 -10
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 0 0
0 0 0
-13 -13 -13
@@ -499,18 +499,18 @@
-13 -13 -13
-13 -13 -13
-13 -13 -13
-16 -6 -16
-16 -6 -16
-16 -6 -16
-16 -6 -16
-13 -6 -16
-13 -6 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-10 -20 -10
-10 -20 -10
-10 -20 -10
-10 -20 -10
-13 -20 -10
-13 -20 -10
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
-13 -13 -13
-13 -13 -13
-13 -13 -13
@@ -540,18 +540,18 @@
-13 -13 -13
-13 -13 -13
-13 -13 -13
-16 -6 -16
-16 -6 -16
-16 -6 -16
-16 -6 -16
-13 -6 -16
-13 -6 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-10 -20 -10
-10 -20 -10
-10 -20 -10
-10 -20 -10
-13 -20 -10
-13 -20 -10
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
-13 -13 -13
-13 -13 -13
-13 -13 -13
@@ -581,18 +581,18 @@
-13 -13 -13
-13 -13 -13
-13 -13 -13
-16 -6 -16
-16 -6 -16
-16 -6 -16
-16 -6 -16
-13 -6 -16
-13 -6 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-16 84 -16
-10 -20 -10
-10 -20 -10
-10 -20 -10
-10 -20 -10
-13 -20 -10
-13 -20 -10
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
-13 -13 -13
-13 -13 -13
-13 -13 -13
@@ -622,18 +622,18 @@
-13 -13 -13
-13 -13 -13
-13 -13 -13
0 0 0
0 0 0
0 0 0
0 0 0
-13 0 0
-13 0 0
0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
-10 -20 -10
-10 -20 -10
-10 -20 -10
-10 -20 -10
-13 -20 -10
-13 -20 -10
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
-13 -13 -13
-13 -13 -13
-13 -13 -13
@@ -663,18 +663,18 @@
-13 -13 -13
-13 -13 -13
-13 -13 -13
0 0 0
0 0 0
0 0 0
0 0 0
-13 0 0
-13 0 0
0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
0 0 0
-10 -20 -10
-10 -20 -10
-10 -20 -10
-10 -20 -10
-13 -20 -10
-13 -20 -10
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
0 -100 0
2.72 2.72 2.71
2.72 2.72 2.71
2.72 2.72 2.71
@@ -745,12 +745,12 @@
-3 -2 -3
-3 -2 -3
-3 -2 -3
0 0 0
0 0 0
0 0 0
0 0 0
-2.7 0 0
-2.7 0 0
0 -10 0
0 -10 0
0 -10 0
0 -10 0
-2.7 -10 0
-2.7 -10 0
0 0 0
0 0 0
0 0 0
......