• M
    include/linux/kernel.h: rewrite min3, max3 and clamp using min and max · 2e1d06e1
    Michal Nazarewicz 提交于
    It appears that gcc is better at optimising a double call to min and max
    rather than open coded min3 and max3.  This can be observed here:
    
        $ cat min-max.c
        #define min(x, y) ({				\
        	typeof(x) _min1 = (x);			\
        	typeof(y) _min2 = (y);			\
        	(void) (&_min1 == &_min2);		\
        	_min1 < _min2 ? _min1 : _min2; })
        #define min3(x, y, z) ({			\
        	typeof(x) _min1 = (x);			\
        	typeof(y) _min2 = (y);			\
        	typeof(z) _min3 = (z);			\
        	(void) (&_min1 == &_min2);		\
        	(void) (&_min1 == &_min3);		\
        	_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
        		(_min2 < _min3 ? _min2 : _min3); })
    
        int fmin3(int x, int y, int z) { return min3(x, y, z); }
        int fmin2(int x, int y, int z) { return min(min(x, y), z); }
    
        $ gcc -O2 -o min-max.s -S min-max.c; cat min-max.s
        	.file	"min-max.c"
        	.text
        	.p2align 4,,15
        	.globl	fmin3
        	.type	fmin3, @function
        fmin3:
        .LFB0:
        	.cfi_startproc
        	cmpl	%esi, %edi
        	jl	.L5
        	cmpl	%esi, %edx
        	movl	%esi, %eax
        	cmovle	%edx, %eax
        	ret
        	.p2align 4,,10
        	.p2align 3
        .L5:
        	cmpl	%edi, %edx
        	movl	%edi, %eax
        	cmovle	%edx, %eax
        	ret
        	.cfi_endproc
        .LFE0:
        	.size	fmin3, .-fmin3
        	.p2align 4,,15
        	.globl	fmin2
        	.type	fmin2, @function
        fmin2:
        .LFB1:
        	.cfi_startproc
        	cmpl	%edi, %esi
        	movl	%edx, %eax
        	cmovle	%esi, %edi
        	cmpl	%edx, %edi
        	cmovle	%edi, %eax
        	ret
        	.cfi_endproc
        .LFE1:
        	.size	fmin2, .-fmin2
        	.ident	"GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
        	.section	.note.GNU-stack,"",@progbits
    
    fmin3 function, which uses open-coded min3 macro, is compiled into total
    of ten instructions including a conditional branch, whereas fmin2
    function, which uses two calls to min2 macro, is compiled into six
    instructions with no branches.
    
    Similarly, open-coded clamp produces the same code as clamp using min and
    max macros, but the latter is much shorter:
    
        $ cat clamp.c
        #define clamp(val, min, max) ({			\
        	typeof(val) __val = (val);		\
        	typeof(min) __min = (min);		\
        	typeof(max) __max = (max);		\
        	(void) (&__val == &__min);		\
        	(void) (&__val == &__max);		\
        	__val = __val < __min ? __min: __val;	\
        	__val > __max ? __max: __val; })
        #define min(x, y) ({				\
        	typeof(x) _min1 = (x);			\
        	typeof(y) _min2 = (y);			\
        	(void) (&_min1 == &_min2);		\
        	_min1 < _min2 ? _min1 : _min2; })
        #define max(x, y) ({				\
        	typeof(x) _max1 = (x);			\
        	typeof(y) _max2 = (y);			\
        	(void) (&_max1 == &_max2);		\
        	_max1 > _max2 ? _max1 : _max2; })
    
        int fclamp(int v, int min, int max) { return clamp(v, min, max); }
        int fclampmm(int v, int min, int max) { return min(max(v, min), max); }
    
        $ gcc -O2 -o clamp.s -S clamp.c; cat clamp.s
        	.file	"clamp.c"
        	.text
        	.p2align 4,,15
        	.globl	fclamp
        	.type	fclamp, @function
        fclamp:
        .LFB0:
        	.cfi_startproc
        	cmpl	%edi, %esi
        	movl	%edx, %eax
        	cmovge	%esi, %edi
        	cmpl	%edx, %edi
        	cmovle	%edi, %eax
        	ret
        	.cfi_endproc
        .LFE0:
        	.size	fclamp, .-fclamp
        	.p2align 4,,15
        	.globl	fclampmm
        	.type	fclampmm, @function
        fclampmm:
        .LFB1:
        	.cfi_startproc
        	cmpl	%edi, %esi
        	cmovge	%esi, %edi
        	cmpl	%edi, %edx
        	movl	%edi, %eax
        	cmovle	%edx, %eax
        	ret
        	.cfi_endproc
        .LFE1:
        	.size	fclampmm, .-fclampmm
        	.ident	"GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
        	.section	.note.GNU-stack,"",@progbits
    
        Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
        gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
        Copyright (C) 2011 Free Software Foundation, Inc.
        This is free software; see the source for copying conditions.  There is NO
        warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    
        -rwx------ 1 mpn eng 51224656 Jun 17 14:15 vmlinux.before
        -rwx------ 1 mpn eng 51224608 Jun 17 13:57 vmlinux.after
    
    48 bytes reduction.  The do_fault_around was a few instruction shorter
    and as far as I can tell saved 12 bytes on the stack, i.e.:
    
        $ grep -e rsp -e pop -e push do_fault_around.*
        do_fault_around.before.s:push   %rbp
        do_fault_around.before.s:mov    %rsp,%rbp
        do_fault_around.before.s:push   %r13
        do_fault_around.before.s:push   %r12
        do_fault_around.before.s:push   %rbx
        do_fault_around.before.s:sub    $0x38,%rsp
        do_fault_around.before.s:add    $0x38,%rsp
        do_fault_around.before.s:pop    %rbx
        do_fault_around.before.s:pop    %r12
        do_fault_around.before.s:pop    %r13
        do_fault_around.before.s:pop    %rbp
    
        do_fault_around.after.s:push   %rbp
        do_fault_around.after.s:mov    %rsp,%rbp
        do_fault_around.after.s:push   %r12
        do_fault_around.after.s:push   %rbx
        do_fault_around.after.s:sub    $0x30,%rsp
        do_fault_around.after.s:add    $0x30,%rsp
        do_fault_around.after.s:pop    %rbx
        do_fault_around.after.s:pop    %r12
        do_fault_around.after.s:pop    %rbp
    
    or here side-by-side:
    
        Before                    After
        push   %rbp               push   %rbp
        mov    %rsp,%rbp          mov    %rsp,%rbp
        push   %r13
        push   %r12               push   %r12
        push   %rbx               push   %rbx
        sub    $0x38,%rsp         sub    $0x30,%rsp
        add    $0x38,%rsp         add    $0x30,%rsp
        pop    %rbx               pop    %rbx
        pop    %r12               pop    %r12
        pop    %r13
        pop    %rbp               pop    %rbp
    
    There are also fewer branches:
    
        $ grep ^j do_fault_around.*
        do_fault_around.before.s:jae    ffffffff812079b7
        do_fault_around.before.s:jmp    ffffffff812079c5
        do_fault_around.before.s:jmp    ffffffff81207a14
        do_fault_around.before.s:ja     ffffffff812079f9
        do_fault_around.before.s:jb     ffffffff81207a10
        do_fault_around.before.s:jmp    ffffffff81207a63
        do_fault_around.before.s:jne    ffffffff812079df
    
        do_fault_around.after.s:jmp    ffffffff812079fd
        do_fault_around.after.s:ja     ffffffff812079e2
        do_fault_around.after.s:jb     ffffffff812079f9
        do_fault_around.after.s:jmp    ffffffff81207a4c
        do_fault_around.after.s:jne    ffffffff812079c8
    
    And here's with allyesconfig on a different machine:
    
        $ uname -a; gcc --version; ls -l vmlinux.*
        Linux erwin 3.14.7-mn #54 SMP Sun Jun 15 11:25:08 CEST 2014 x86_64 AMD Phenom(tm) II X3 710 Processor AuthenticAMD GNU/Linux
        gcc (GCC) 4.8.3
        Copyright (C) 2013 Free Software Foundation, Inc.
        This is free software; see the source for copying conditions.  There is NO
        warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    
        -rwx------ 1 mpn eng 437027411 Jun 20 16:04 vmlinux.before
        -rwx------ 1 mpn eng 437026881 Jun 20 15:30 vmlinux.after
    
    530 bytes reduction.
    Signed-off-by: NMichal Nazarewicz <mina86@mina86.com>
    Signed-off-by: NHagen Paul Pfeifer <hagen@jauu.net>
    Acked-by: NSteven Rostedt <rostedt@goodmis.org>
    Cc: Hagen Paul Pfeifer <hagen@jauu.net>
    Cc: David Rientjes <rientjes@google.com>
    Cc: "Rustad, Mark D" <mark.d.rustad@intel.com>
    Signed-off-by: NAndrew Morton <akpm@linux-foundation.org>
    Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
    2e1d06e1
kernel.h 26.8 KB