• Y
    ARM64: enable GENERIC_FIND_FIRST_BIT · 98c5ec77
    Yury Norov 提交于
    ARM64 doesn't implement find_first_{,zero_}bit() in arch code and doesn't
    enable the generic implementation (GENERIC_FIND_FIRST_BIT) in its Kconfig.
    As a result, find_first_bit() falls back to find_next_bit(), which is less
    efficient:
    
    0000000000000000 <find_first_bit>:
       0:	aa0003e4 	mov	x4, x0
       4:	aa0103e0 	mov	x0, x1
       8:	b4000181 	cbz	x1, 38 <find_first_bit+0x38>
       c:	f9400083 	ldr	x3, [x4]
      10:	d2800802 	mov	x2, #0x40                  	// #64
      14:	91002084 	add	x4, x4, #0x8
      18:	b40000c3 	cbz	x3, 30 <find_first_bit+0x30>
      1c:	14000008 	b	3c <find_first_bit+0x3c>
      20:	f8408483 	ldr	x3, [x4], #8
      24:	91010045 	add	x5, x2, #0x40
      28:	b50000c3 	cbnz	x3, 40 <find_first_bit+0x40>
      2c:	aa0503e2 	mov	x2, x5
      30:	eb02001f 	cmp	x0, x2
      34:	54ffff68 	b.hi	20 <find_first_bit+0x20>  // b.pmore
      38:	d65f03c0 	ret
      3c:	d2800002 	mov	x2, #0x0                   	// #0
      40:	dac00063 	rbit	x3, x3
      44:	dac01063 	clz	x3, x3
      48:	8b020062 	add	x2, x3, x2
      4c:	eb02001f 	cmp	x0, x2
      50:	9a829000 	csel	x0, x0, x2, ls  // ls = plast
      54:	d65f03c0 	ret
    
      ...
    
    0000000000000118 <_find_next_bit.constprop.1>:
     118:	eb02007f 	cmp	x3, x2
     11c:	540002e2 	b.cs	178 <_find_next_bit.constprop.1+0x60>  // b.hs, b.nlast
     120:	d346fc66 	lsr	x6, x3, #6
     124:	f8667805 	ldr	x5, [x0, x6, lsl #3]
     128:	b4000061 	cbz	x1, 134 <_find_next_bit.constprop.1+0x1c>
     12c:	f8667826 	ldr	x6, [x1, x6, lsl #3]
     130:	8a0600a5 	and	x5, x5, x6
     134:	ca0400a6 	eor	x6, x5, x4
     138:	92800005 	mov	x5, #0xffffffffffffffff    	// #-1
     13c:	9ac320a5 	lsl	x5, x5, x3
     140:	927ae463 	and	x3, x3, #0xffffffffffffffc0
     144:	ea0600a5 	ands	x5, x5, x6
     148:	54000120 	b.eq	16c <_find_next_bit.constprop.1+0x54>  // b.none
     14c:	1400000e 	b	184 <_find_next_bit.constprop.1+0x6c>
     150:	d346fc66 	lsr	x6, x3, #6
     154:	f8667805 	ldr	x5, [x0, x6, lsl #3]
     158:	b4000061 	cbz	x1, 164 <_find_next_bit.constprop.1+0x4c>
     15c:	f8667826 	ldr	x6, [x1, x6, lsl #3]
     160:	8a0600a5 	and	x5, x5, x6
     164:	eb05009f 	cmp	x4, x5
     168:	540000c1 	b.ne	180 <_find_next_bit.constprop.1+0x68>  // b.any
     16c:	91010063 	add	x3, x3, #0x40
     170:	eb03005f 	cmp	x2, x3
     174:	54fffee8 	b.hi	150 <_find_next_bit.constprop.1+0x38>  // b.pmore
     178:	aa0203e0 	mov	x0, x2
     17c:	d65f03c0 	ret
     180:	ca050085 	eor	x5, x4, x5
     184:	dac000a5 	rbit	x5, x5
     188:	dac010a5 	clz	x5, x5
     18c:	8b0300a3 	add	x3, x5, x3
     190:	eb03005f 	cmp	x2, x3
     194:	9a839042 	csel	x2, x2, x3, ls  // ls = plast
     198:	aa0203e0 	mov	x0, x2
     19c:	d65f03c0 	ret
    
     ...
    
    0000000000000238 <find_next_bit>:
     238:	a9bf7bfd 	stp	x29, x30, [sp, #-16]!
     23c:	aa0203e3 	mov	x3, x2
     240:	d2800004 	mov	x4, #0x0                   	// #0
     244:	aa0103e2 	mov	x2, x1
     248:	910003fd 	mov	x29, sp
     24c:	d2800001 	mov	x1, #0x0                   	// #0
     250:	97ffffb2 	bl	118 <_find_next_bit.constprop.1>
     254:	a8c17bfd 	ldp	x29, x30, [sp], #16
     258:	d65f03c0 	ret
    
    Enabling find_{first,next}_bit() would also benefit for_each_{set,clear}_bit().
    On A-53, find_first_bit() is almost twice as fast as find_next_bit(), according
    to lib/find_bit_benchmark (thanks to Alexey for testing):
    
    GENERIC_FIND_FIRST_BIT=n:
    [7126084.948181] find_first_bit:               47389224 ns,  16357 iterations
    [7126085.032315] find_first_bit:               19048193 ns,    655 iterations
    
    GENERIC_FIND_FIRST_BIT=y:
    [   84.158068] find_first_bit:               27193319 ns,  16406 iterations
    [   84.233005] find_first_bit:               11082437 ns,    656 iterations
    
    GENERIC_FIND_FIRST_BIT=n bloats the kernel even though it disables generation
    of find_{first,next}_bit():
    
            yury:linux$ scripts/bloat-o-meter vmlinux vmlinux.ffb
            add/remove: 4/1 grow/shrink: 19/251 up/down: 564/-1692 (-1128)
            ...
    
    Overall, GENERIC_FIND_FIRST_BIT=n is harmful both in terms of performance and
    code size, and it's better to have GENERIC_FIND_FIRST_BIT enabled.
    Tested-by: Alexey Klimov <aklimov@redhat.com>
    Signed-off-by: Yury Norov <yury.norov@gmail.com>
    Acked-by: Will Deacon <will@kernel.org>
    Link: https://lore.kernel.org/r/20210225135700.1381396-2-yury.norov@gmail.com
    Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
    98c5ec77
Kconfig 63.7 KB