1. 16 12月, 2011 2 次提交
    • D
      x86_64, asm: Optimise fls(), ffs() and fls64() · ca3d30cc
      David Howells 提交于
      fls(N), ffs(N) and fls64(N) can be optimised on x86_64.  Currently they use a
      CMOV instruction after the BSR/BSF to set the destination register to -1 if the
      value to be scanned was 0 (in which case BSR/BSF set the Z flag).
      
      Instead, according to the AMD64 specification, we can make use of the fact that
      BSR/BSF doesn't modify its output register if its input is 0.  By preloading
      the output with -1 and incrementing the result, we achieve the desired result
      without the need for a conditional check.
      
      The Intel x86_64 specification, however, says that the result of BSR/BSF in
      such a case is undefined.  That said, when queried, one of the Intel CPU
      architects said that the behaviour on all Intel CPUs is that:
      
       (1) with BSRQ/BSFQ, the 64-bit destination register is written with its
           original value if the source is 0, thus, in essence, giving the effect we
           want.  And,
      
       (2) with BSRL/BSFL, the lower half of the 64-bit destination register is
           written with its original value if the source is 0, and the upper half is
           cleared, thus giving us the effect we want (we return a 4-byte int).
      
      Further, it was indicated that they (Intel) are unlikely to get away with
      changing the behaviour.
      
      It might be possible to optimise the 32-bit versions of these functions, but
      there's a lot more variation, and so the effective non-destructive property of
      BSRL/BSFL cannot be relied on.
      
      [ hpa: specifically, some 486 chips are known to NOT have this property. ]
      
      I have benchmarked these functions on my Core2 Duo test machine using the
      following program:
      
      	#include <stdlib.h>
      	#include <stdio.h>
      
      	#ifndef __x86_64__
      	#error
      	#endif
      
      	#define PAGE_SHIFT 12
      
      	typedef unsigned long long __u64, u64;
      	typedef unsigned int __u32, u32;
      	#define noinline	__attribute__((noinline))
      
      	static __always_inline int fls64(__u64 x)
      	{
      		long bitpos = -1;
      
      		asm("bsrq %1,%0"
      		    : "+r" (bitpos)
      		    : "rm" (x));
      		return bitpos + 1;
      	}
      
      	static inline unsigned long __fls(unsigned long word)
      	{
      		asm("bsr %1,%0"
      		    : "=r" (word)
      		    : "rm" (word));
      		return word;
      	}
      	static __always_inline int old_fls64(__u64 x)
      	{
      		if (x == 0)
      			return 0;
      		return __fls(x) + 1;
      	}
      
      	static noinline // __attribute__((const))
      	int old_get_order(unsigned long size)
      	{
      		int order;
      
      		size = (size - 1) >> (PAGE_SHIFT - 1);
      		order = -1;
      		do {
      			size >>= 1;
      			order++;
      		} while (size);
      		return order;
      	}
      
      	static inline __attribute__((const))
      	int get_order_old_fls64(unsigned long size)
      	{
      		int order;
      		size--;
      		size >>= PAGE_SHIFT;
      		order = old_fls64(size);
      		return order;
      	}
      
      	static inline __attribute__((const))
      	int get_order(unsigned long size)
      	{
      		int order;
      		size--;
      		size >>= PAGE_SHIFT;
      		order = fls64(size);
      		return order;
      	}
      
      	unsigned long prevent_optimise_out;
      
      	static noinline unsigned long test_old_get_order(void)
      	{
      		unsigned long n, total = 0;
      		long rep, loop;
      
      		for (rep = 1000000; rep > 0; rep--) {
      			for (loop = 0; loop <= 16384; loop += 4) {
      				n = 1UL << loop;
      				total += old_get_order(n);
      			}
      		}
      		return total;
      	}
      
      	static noinline unsigned long test_get_order_old_fls64(void)
      	{
      		unsigned long n, total = 0;
      		long rep, loop;
      
      		for (rep = 1000000; rep > 0; rep--) {
      			for (loop = 0; loop <= 16384; loop += 4) {
      				n = 1UL << loop;
      				total += get_order_old_fls64(n);
      			}
      		}
      		return total;
      	}
      
      	static noinline unsigned long test_get_order(void)
      	{
      		unsigned long n, total = 0;
      		long rep, loop;
      
      		for (rep = 1000000; rep > 0; rep--) {
      			for (loop = 0; loop <= 16384; loop += 4) {
      				n = 1UL << loop;
      				total += get_order(n);
      			}
      		}
      		return total;
      	}
      
      	int main(int argc, char **argv)
      	{
      		unsigned long total;
      
      		switch (argc) {
      		case 1:  total = test_old_get_order();		break;
      		case 2:  total = test_get_order_old_fls64();	break;
      		default: total = test_get_order();		break;
      		}
      		prevent_optimise_out = total;
      		return 0;
      	}
      
      This allows me to test the use of the old fls64() implementation and the new
      fls64() implementation and also to contrast these to the out-of-line loop-based
      implementation of get_order().  The results were:
      
      	warthog>time ./get_order
      	real    1m37.191s
      	user    1m36.313s
      	sys     0m0.861s
      	warthog>time ./get_order x
      	real    0m16.892s
      	user    0m16.586s
      	sys     0m0.287s
      	warthog>time ./get_order x x
      	real    0m7.731s
      	user    0m7.727s
      	sys     0m0.002s
      
      Using the current upstream fls64() as a basis for an inlined get_order() [the
      second result above] is much faster than using the current out-of-line
      loop-based get_order() [the first result above].
      
      Using my optimised inline fls64()-based get_order() [the third result above]
      is even faster still.
      
      [ hpa: changed the selection of 32 vs 64 bits to use CONFIG_X86_64
        instead of comparing BITS_PER_LONG, updated comments, rebased manually
        on top of 83d99df7 x86, bitops: Move fls64.h inside __KERNEL__ ]
      Signed-off-by: David Howells <dhowells@redhat.com>
      Link: http://lkml.kernel.org/r/20111213145654.14362.39868.stgit@warthog.procyon.org.uk
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
      ca3d30cc
    • H
      x86, bitops: Move fls64.h inside __KERNEL__ · 83d99df7
      H. Peter Anvin 提交于
      We would include <asm-generic/bitops/fls64.h> even without __KERNEL__,
      but that doesn't make sense, as:
      
      1. That file provides fls64(), but the corresponding function fls() is
         not exported to user space.
      2. The implementation of fls64.h uses kernel-only symbols.
      3. fls64.h is not exported to user space.
      
      This appears to have been a bug introduced in checkin:
      
      d57594c2 bitops: use __fls for fls64 on 64-bit archs
      
      Cc: Stephen Hemminger <shemminger@vyatta.com>
      Cc: Alexander van Heukelum <heukelum@mailshack.com>
      Cc: David Howells <dhowells@redhat.com>
      Signed-off-by: H. Peter Anvin <hpa@zytor.com>
      Link: http://lkml.kernel.org/r/4EEA77E1.6050009@zytor.com
      83d99df7
  2. 15 12月, 2011 2 次提交
    • J
      x86: Fix and improve percpu_cmpxchg{8,16}b_double() · cebef5be
      Jan Beulich 提交于
      They had several problems/shortcomings:
      
      Only the first memory operand was mentioned in the 2x32bit asm()
      operands, and 2x64-bit version had a memory clobber. The first
      allowed the compiler to not recognize the need to re-load the
      data in case it had it cached in some register, and the second
      was overly destructive.
      
      The memory operand in the 2x32-bit asm() was declared to only be
      an output.
      
      The types of the local copies of the old and new values were
      incorrect (as in other per-CPU ops, the types of the per-CPU
      variables accessed should be used here, to make sure the
      respective types are compatible).
      
      The __dummy variable was pointless (and needlessly initialized
      in the 2x32-bit case), given that local copies of the inputs
      already exist.
      
      The 2x64-bit variant forced the address of the first object into
      %rsi, even though this is needed only for the call to the
      emulation function. The real cmpxchg16b can operate on any
      memory operand.
      
      At once also change the return value type to what it really is -
      'bool'.
      Signed-off-by: Jan Beulich <jbeulich@suse.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Cc: David Howells <dhowells@redhat.com>
      Cc: Christoph Lameter <cl@linux.com>
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      Link: http://lkml.kernel.org/r/4EE86D6502000078000679FE@nat28.tlf.novell.com
      Signed-off-by: Ingo Molnar <mingo@elte.hu>
      cebef5be
    • J
      x86: Report cpb and eff_freq_ro flags correctly · 969df4b8
      Joerg Roedel 提交于
      Add the flags to get rid of the [9] and [10] feature names
      in cpuinfo's 'power management' fields and replace them with
      meaningful names.
      Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
      Link: http://lkml.kernel.org/r/1323875574-17881-1-git-send-email-joerg.roedel@amd.com
      Signed-off-by: Ingo Molnar <mingo@elte.hu>
      969df4b8
  3. 13 12月, 2011 1 次提交
  4. 07 12月, 2011 1 次提交
  5. 06 12月, 2011 8 次提交
  6. 05 12月, 2011 5 次提交
  7. 04 12月, 2011 1 次提交
    • K
      xen/pm_idle: Make pm_idle be default_idle under Xen. · e5fd47bf
      Konrad Rzeszutek Wilk 提交于
      The idea behind commit d91ee586 ("cpuidle: replace xen access to x86
      pm_idle and default_idle") was to have one call - disable_cpuidle()
      which would make pm_idle not be molested by other code.  It disallows
      cpuidle_idle_call to be set to pm_idle (which is excellent).
      
      But in the select_idle_routine() and idle_setup(), the pm_idle can still
      be set to either: amd_e400_idle, mwait_idle or default_idle.  This
      depends on some CPU flags (MWAIT) and in AMD case on the type of CPU.
      
      In case of mwait_idle we can hit some instances where the hypervisor
      (Amazon EC2 specifically) sets the MWAIT and we get:
      
        Brought up 2 CPUs
        invalid opcode: 0000 [#1] SMP
      
        Pid: 0, comm: swapper Not tainted 3.1.0-0.rc6.git0.3.fc16.x86_64 #1
        RIP: e030:[<ffffffff81015d1d>]  [<ffffffff81015d1d>] mwait_idle+0x6f/0xb4
        ...
        Call Trace:
         [<ffffffff8100e2ed>] cpu_idle+0xae/0xe8
         [<ffffffff8149ee78>] cpu_bringup_and_idle+0xe/0x10
        RIP  [<ffffffff81015d1d>] mwait_idle+0x6f/0xb4
         RSP <ffff8801d28ddf10>
      
      In the case of amd_e400_idle we don't get such spectacular crashes, but
      we do end up making an MSR access which is trapped in the hypervisor, and
      then follow it up with a yield hypercall.  Meaning we end up going to the
      hypervisor twice instead of just once.
      
      The previous behavior before v3.0 was that pm_idle was set to
      default_idle regardless of select_idle_routine/idle_setup.
      
      We want to do that, but only for one specific case: Xen.  This patch
      does that.
      
      Fixes RH BZ #739499 and Ubuntu #881076
      Reported-by: Stefan Bader <stefan.bader@canonical.com>
      Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
      e5fd47bf
  8. 03 12月, 2011 6 次提交
    • L
      Merge branch 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb · af968e29
      Linus Torvalds 提交于
      * 'usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb: (21 commits)
        usb: ftdi_sio: add PID for Propox ISPcable III
        Revert "xHCI: reset-on-resume quirk for NEC uPD720200"
        xHCI: fix bug in xhci_clear_command_ring()
        usb: gadget: fsl_udc: fix dequeuing a request in progress
        usb: fsl_mxc_udc.c: Remove compile-time dependency of MX35 SoC type
        usb: fsl_mxc_udc.c: Fix build issue by including missing header file
        USB: fsl_udc_core: use usb_endpoint_xfer_isoc to judge ISO XFER
        usb: udc: Fix gadget driver's speed check in various UDC drivers
        usb: gadget: fix g_serial regression
        usb: renesas_usbhs: fixup driver speed
        usb: renesas_usbhs: fixup gadget.dev.driver when udc_stop.
        usb: renesas_usbhs: fixup signal the driver that cable was disconnected
        usb: renesas_usbhs: fixup device_register timing
        usb: musb: PM: fix context save/restore in suspend/resume path
        USB: linux-cdc-acm.inf: add support for the acm_ms gadget
        EHCI : Fix a regression in the ISO scheduler
        xHCI: reset-on-resume quirk for NEC uPD720200
        USB: whci-hcd: fix endian conversion in qset_clear()
        USB: usb-storage: unusual_devs entry for Kingston DT 101 G2
        usb: option: add SIMCom SIM5218
        ...
      af968e29
    • L
      Merge branch 'staging-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging · f9143eae
      Linus Torvalds 提交于
      * 'staging-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging:
        Staging: comedi: fix integer overflow in do_insnlist_ioctl()
        Revert "Staging: comedi: integer overflow in do_insnlist_ioctl()"
        Staging: comedi: integer overflow in do_insnlist_ioctl()
        Staging: comedi: fix signal handling in read and write
        Staging: comedi: fix mmap_count
        staging: comedi: fix oops for USB DAQ devices.
        staging: comedi: usbduxsigma: Fixed wrong range for the analogue channel.
        staging:rts_pstor:Complete scanning_done variable
        staging: usbip: bugfix for deadlock
      f9143eae
    • L
      Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs · ffb8fb54
      Linus Torvalds 提交于
      * 'for-linus' of git://oss.sgi.com/xfs/xfs:
        xfs: fix attr2 vs large data fork assert
        xfs: force buffer writeback before blocking on the ilock in inode reclaim
        xfs: validate acl count
      ffb8fb54
    • L
      7ed89aed
    • L
      Merge branch 'drm-fixes' of git://people.freedesktop.org/~airlied/linux · c2b5adb4
      Linus Torvalds 提交于
      * 'drm-fixes' of git://people.freedesktop.org/~airlied/linux:
        vmwgfx: integer overflow in vmw_kms_update_layout_ioctl()
        drm/radeon/kms: fix 2D tiling CS support on EG/CM
        drm/radeon/kms: fix scanout of 2D tiled buffers on EG/CM
        drm: Fix lack of CRTC disable for drm_crtc_helper_set_config(.fb=NULL)
        drm/radeon/kms: add some new pci ids
        drm/radeon/kms: Skip ACPI call to ATIF when possible
        drm/radeon/kms: Hide debugging message
        drm/radeon/kms: add some loop timeouts in pageflip code
        drm/nv50/disp: silence compiler warning
        drm/nouveau: fix oopses caused by clear being called on unpopulated ttms
        drm/nouveau: Keep RAMIN heap within the channel.
        drm/nvd0/disp: fix sor dpms typo, preventing dpms on in some situations
        drm/nvc0/gr: fix TP init for transform feedback offset queries
        drm/nouveau: add dumb ioctl support
      c2b5adb4
    • L
      Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound · 0efebaa7
      Linus Torvalds 提交于
      * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound:
        ALSA: hda - Fix S3/S4 problem on machines with VREF-pin mute-LED
        ALSA: hda_intel - revert a quirk that affect VIA chipsets
        ALSA: hda - Avoid touching mute-VREF pin for IDT codecs
        firmware: Sigma: Fix endianess issues
        firmware: Sigma: Skip header during CRC generation
        firmware: Sigma: Prevent out of bounds memory access
        ALSA: usb-audio - Support for Roland GAIA SH-01 Synthesizer
        ASoC: Supply dcs_codes for newer WM1811 revisions
        ASoC: Error out if we can't generate a LRCLK at all for WM8994
        ASoC: Correct name of Speyside Main Speaker widget
        ASoC: skip resume of soc-audio devices without codecs
        ASoC: cs42l51: Fix off-by-one for reg_cache_size
        ASoC: drop support for PlayPaq with WM8510
        ASoC: mpc8610: tell the CS4270 codec that it's the master
        ASoC: cs4720: use snd_soc_cache_sync()
        ASoC: SAMSUNG: Fix build error
        ASoC: max9877: Update register if either val or val2 is changed
        ASoC: Fix wrong define for AD1836_ADC_WORD_OFFSET
      0efebaa7
  9. 02 12月, 2011 14 次提交