提交 · 3108864e2d70891d30d60af6af256a1b517d7078 · openeuler / raspberrypi-kernel

20 11月, 2008 1 次提交

由 Ulrich Drepper 提交于 11月 19, 2008

Introduce a new accept4() system call.  The addition of this system call
matches analogous changes in 2.6.27 (dup3(), eventfd2(), signalfd4(),
inotify_init1(), epoll_create1(), pipe2()) which added new system calls
that differed from analogous traditional system calls in adding a flags
argument that can be used to access additional functionality.

The accept4() system call is exactly the same as accept(), except that
it adds a flags bit-mask argument.  Two flags are initially implemented.
(Most of the new system calls in 2.6.27 also had both of these flags.)

SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled
for the new file descriptor returned by accept4().  This is a useful
security feature to avoid leaking information in a multithreaded
program where one thread is doing an accept() at the same time as
another thread is doing a fork() plus exec().  More details here:
(http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling",
Ulrich Drepper).

The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag
to be enabled on the new open file description created by accept4().
(This flag is merely a convenience, saving the use of additional calls
fcntl(F_GETFL) and fcntl(F_SETFL) to achieve the same result.)

Here's a test program.  Works on x86-32.  Should work on x86-64, but
I (mtk) don't have a system to hand to test with.

It tests accept4() with each of the four possible combinations of
SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies
that the appropriate flags are set on the file descriptor/open file
description returned by accept4().

I tested Ulrich's patch in this thread by applying against 2.6.28-rc2,
and it passes according to my test program.

/* test_accept4.c

  Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk
       <mtk.manpages@gmail.com>

  Licensed under the GNU GPLv2 or later.
*/
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

/* Default port number; main() uses it when no port is given on the
  command line */
#define PORT_NUM 33333

/* Report 'msg' together with the current errno text via perror(), then
  terminate the program with a failure status */
#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)

/**********************************************************************/

/* The following is what we need until glibc gets a wrapper for
  accept4() */

/* Flags for socket(), socketpair(), accept4().  If the system headers
  do not define them, fall back to the corresponding O_* constants
  from <fcntl.h>. */
#ifndef SOCK_CLOEXEC
#define SOCK_CLOEXEC    O_CLOEXEC
#endif
#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK   O_NONBLOCK
#endif

/* Select how the accept4() wrapper reaches the kernel.

  On i386 the wrapper goes through socketcall(), so record the accept4
  sub-call number (SYS_ACCEPT4) and enable that code path.

  Elsewhere the wrapper uses SYS_accept4 directly.  Prefer the value
  that <sys/syscall.h> provides when it is already defined (this avoids
  a conflicting macro redefinition and lets other architectures build);
  only fall back to the hard-coded x86-64 number when it is missing. */
#if defined(__i386__)
#define USE_SOCKETCALL 1
#define SYS_ACCEPT4 18
#elif !defined(SYS_accept4)
#if defined(__x86_64__)
#define SYS_accept4 288
#else
#error "Sorry -- don't know the syscall # on this architecture"
#endif
#endif

/* Local stand-in for a libc accept4() wrapper.

  Prints the requested flags (numerically and, when non-zero, by name)
  to stdout, then issues the system call: through socketcall() when
  USE_SOCKETCALL is set, otherwise through SYS_accept4 directly.
  Returns the value returned by syscall(). */
static int
accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags)
{
   const int want_cloexec = (flags & SOCK_CLOEXEC) != 0;
   const int want_nonblock = (flags & SOCK_NONBLOCK) != 0;

   printf("Calling accept4(): flags = %x", flags);
   if (flags != 0) {
       fputs(" (", stdout);
       if (want_cloexec)
           fputs("SOCK_CLOEXEC", stdout);
       /* Separator is only needed when both names are printed */
       if (want_cloexec && want_nonblock)
           fputs(" ", stdout);
       if (want_nonblock)
           fputs("SOCK_NONBLOCK", stdout);
       fputs(")", stdout);
   }
   fputs("\n", stdout);

#if USE_SOCKETCALL
   {
       /* socketcall() receives the call's arguments packed in an array */
       long args[6] = { fd, (long) sockaddr, (long) addrlen, flags };

       return syscall(SYS_socketcall, SYS_ACCEPT4, args);
   }
#else
   return syscall(SYS_accept4, fd, sockaddr, addrlen, flags);
#endif
}

/**********************************************************************/

/* Run a single accept4() test case against the listening socket 'lfd'.

  Connects a fresh client socket to 'conn_addr', accepts that connection
  with accept4() using 'closeonexec_flag | nonblock_flag', and then checks
  via fcntl() that FD_CLOEXEC and O_NONBLOCK are set on the accepted
  descriptor exactly when the corresponding SOCK_* flag was requested.

  Returns 1 if both checks pass, 0 if either check fails or accept4()
  itself fails.  Failures of socket(), connect() or fcntl() terminate
  the program via die(). */
static int
do_test(int lfd, struct sockaddr_in *conn_addr,
       int closeonexec_flag, int nonblock_flag)
{
   int connfd, acceptfd;
   int fdf, flf, fdf_pass, flf_pass;
   struct sockaddr_in claddr;
   socklen_t addrlen;

   printf("=======================================\n");

   /* Client side: create the connection that accept4() will pick up */
   connfd = socket(AF_INET, SOCK_STREAM, 0);
   if (connfd == -1)
       die("socket");
   if (connect(connfd, (struct sockaddr *) conn_addr,
               sizeof(struct sockaddr_in)) == -1)
       die("connect");

   addrlen = sizeof(struct sockaddr_in);
   acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen,
                      closeonexec_flag | nonblock_flag);
   if (acceptfd == -1) {
       /* Report the failure as a failed test rather than aborting */
       perror("accept4()");
       close(connfd);
       return 0;
   }

   /* File descriptor flags: FD_CLOEXEC must mirror SOCK_CLOEXEC */
   fdf = fcntl(acceptfd, F_GETFD);
   if (fdf == -1)
       die("fcntl:F_GETFD");
   fdf_pass = ((fdf & FD_CLOEXEC) != 0) ==
              ((closeonexec_flag & SOCK_CLOEXEC) != 0);
   printf("Close-on-exec flag is %sset (%s); ",
           (fdf & FD_CLOEXEC) ? "" : "not ",
           fdf_pass ? "OK" : "failed");

   /* File status flags: O_NONBLOCK must mirror SOCK_NONBLOCK */
   flf = fcntl(acceptfd, F_GETFL);
   if (flf == -1)
       die("fcntl:F_GETFL");
   flf_pass = ((flf & O_NONBLOCK) != 0) ==
              ((nonblock_flag & SOCK_NONBLOCK) != 0);
   printf("nonblock flag is %sset (%s)\n",
           (flf & O_NONBLOCK) ? "" : "not ",
           flf_pass ? "OK" : "failed");

   close(acceptfd);
   close(connfd);

   printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL");
   return fdf_pass && flf_pass;
}

/* Create an AF_INET stream socket with SO_REUSEADDR enabled, bind it to
  'port_num' on INADDR_ANY and put it into the listening state with a
  backlog of 5.  Returns the listening descriptor; any failing step
  terminates the program via die(). */
static int
create_listening_socket(int port_num)
{
   const int reuse = 1;
   struct sockaddr_in addr;
   int sock;

   sock = socket(AF_INET, SOCK_STREAM, 0);
   if (sock == -1)
       die("socket");

   if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse,
                  sizeof reuse) == -1)
       die("setsockopt");

   /* Wildcard address on the requested port, in network byte order */
   memset(&addr, 0, sizeof addr);
   addr.sin_family = AF_INET;
   addr.sin_port = htons(port_num);
   addr.sin_addr.s_addr = htonl(INADDR_ANY);

   if (bind(sock, (struct sockaddr *) &addr, sizeof addr) == -1)
       die("bind");

   if (listen(sock, 5) == -1)
       die("listen");

   return sock;
}

/* Test driver.  Takes an optional port number as argv[1] (PORT_NUM when
  absent), creates the listening socket, and runs do_test() for each of
  the four SOCK_CLOEXEC / SOCK_NONBLOCK combinations.  Every case is run
  even if an earlier one fails; the exit status is EXIT_SUCCESS only if
  all of them passed. */
int
main(int argc, char *argv[])
{
   /* { close-on-exec flag, nonblock flag } for each test case, in order */
   const int flag_sets[][2] = {
       { 0,            0 },
       { SOCK_CLOEXEC, 0 },
       { 0,            SOCK_NONBLOCK },
       { SOCK_CLOEXEC, SOCK_NONBLOCK },
   };
   struct sockaddr_in conn_addr;
   int all_ok = 1;
   int port_num = PORT_NUM;
   int lfd;
   size_t i;

   if (argc > 1)
       port_num = atoi(argv[1]);

   /* Clients connect to the listening socket over the loopback address */
   memset(&conn_addr, 0, sizeof conn_addr);
   conn_addr.sin_family = AF_INET;
   conn_addr.sin_port = htons(port_num);
   conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

   lfd = create_listening_socket(port_num);

   for (i = 0; i < sizeof flag_sets / sizeof flag_sets[0]; i++) {
       if (!do_test(lfd, &conn_addr, flag_sets[i][0], flag_sets[i][1]))
           all_ok = 0;
   }

   close(lfd);

   exit(all_ok ? EXIT_SUCCESS : EXIT_FAILURE);
}

[mtk.manpages@gmail.com: rewrote changelog, updated test program]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

de11defe

18 11月, 2008 7 次提交

x86: more general identifier for Phoenix BIOS · 0af40a4b

由 Philipp Kohlbecher 提交于 11月 16, 2008

Impact: widen the reach of the low-memory-protect DMI quirk

Phoenix BIOSes variously identify their vendor as "Phoenix Technologies,
LTD" or "Phoenix Technologies LTD" (without the comma.)

This patch makes the identification string in the bad_bios_dmi_table
more general (following a suggestion by Ingo Molnar), so that both
versions are handled.

Again, the patched file compiles cleanly and the patch has been tested
successfully on my machine.
Signed-off-by: Philipp Kohlbecher <xt28@gmx.de>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

0af40a4b

AMD IOMMU: check for next_bit also in unmapped area · 8501c45c

由 Joerg Roedel 提交于 11月 17, 2008

Impact: fix possible use of stale IO/TLB entries
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

8501c45c

AMD IOMMU: fix fullflush comparison length · 695b5676

由 Joerg Roedel 提交于 11月 17, 2008

Impact: fix comparison length for 'fullflush'
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

695b5676

AMD IOMMU: enable device isolation per default · 3ce1f93c

由 Joerg Roedel 提交于 11月 17, 2008

Impact: makes device isolation the default for AMD IOMMU

Some device drivers showed double-free bugs of DMA memory while testing
them with AMD IOMMU. If all devices share the same protection domain
this can lead to data corruption and data loss. Prevent this by putting
each device into its own protection domain per default.
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

3ce1f93c

AMD IOMMU: add parameter to disable device isolation · e5e1f606

由 Joerg Roedel 提交于 11月 17, 2008

Impact: add a new AMD IOMMU kernel command line parameter
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

e5e1f606

x86, PEBS/DS: fix code flow in ds_request() · 10db4ef7

由 Ingo Molnar 提交于 11月 18, 2008

this compiler warning:

  arch/x86/kernel/ds.c: In function 'ds_request':
  arch/x86/kernel/ds.c:368: warning: 'context' may be used uninitialized in this function

Shows that the code flow in ds_request() is buggy - it goes into
the unlock+release-context path even when the context is not allocated
yet.

First allocate the context, then do the other checks.

Also, take care with GFP allocations under the ds_lock spinlock.

Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

10db4ef7

x86: add rdtsc barrier to TSC sync check · 93ce99e8

由 Venki Pallipadi 提交于 11月 17, 2008

Impact: fix incorrectly marked unstable TSC clock

Patch (commit 0d12cdd5 "sched: improve sched_clock() performance") has
a regression on one of the test systems here.

With the patch, I see:

 checking TSC synchronization [CPU#0 -> CPU#1]:
 Measured 28 cycles TSC warp between CPUs, turning off TSC clock.
 Marking TSC unstable due to check_tsc_sync_source failed

Whereas, without the patch syncs pass fine on all CPUs:

 checking TSC synchronization [CPU#0 -> CPU#1]: passed.

Due to this, TSC is marked unstable, when it is not actually unstable.
This is because the syncs in check_tsc_warp() go away due to this commit.

As per the discussion on this thread, the correct way to fix this is to add
explicit syncs as below?
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

93ce99e8

16 11月, 2008 3 次提交

x86: fix es7000 compiling · d3c6aa1e

由 Yinghai Lu 提交于 11月 16, 2008

Impact: fix es7000 build

  CC      arch/x86/kernel/es7000_32.o
arch/x86/kernel/es7000_32.c: In function find_unisys_acpi_oem_table:
arch/x86/kernel/es7000_32.c:255: error: implicit declaration of function acpi_get_table_with_size
arch/x86/kernel/es7000_32.c:261: error: implicit declaration of function early_acpi_os_unmap_memory
arch/x86/kernel/es7000_32.c: In function unmap_unisys_acpi_oem_table:
arch/x86/kernel/es7000_32.c:277: error: implicit declaration of function __acpi_unmap_table
make[1]: *** [arch/x86/kernel/es7000_32.o] Error 1

we applied one patch out of order...

| commit a73aaedd
| Author: Yinghai Lu <yhlu.kernel@gmail.com>
| Date:   Sun Sep 14 02:33:14 2008 -0700
|
|    x86: check dsdt before find oem table for es7000, v2
|
|    v2: use __acpi_unmap_table()

that patch needs:

	x86: use early_ioremap in __acpi_map_table
	x86: always explicitly map acpi memory
	acpi: remove final __acpi_map_table mapping before setting acpi_gbl_permanent_mmap
	acpi/x86: introduce __acpi_map_table, v4

submitted to the ACPI tree but not upstream yet.

Fix the build for now; once those patches are applied, this change needs to be reverted.
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

d3c6aa1e

x86, bts: fix unlock problem in ds.c · d1f1e9c0

由 Markus Metzger 提交于 11月 15, 2008

Fix a problem where ds_request() returned an error without releasing the
ds lock.
Reported-by: Stephane Eranian <eranian@gmail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

d1f1e9c0

Revert "x86: blacklist DMAR on Intel G31/G33 chipsets" · 52168e60

由 David Woodhouse 提交于 11月 14, 2008

This reverts commit e51af663, which was
wrongly hoovered up and submitted about a month after a better fix had
already been merged.

The better fix is commit cbda1ba8
("PCI/iommu: blacklist DMAR on Intel G31/G33 chipsets"), where we do
this blacklisting based on the DMI identification for the offending
motherboard, since sometimes this chipset (or at least a chipset with
the same PCI ID) apparently _does_ actually have an IOMMU.
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

52168e60

13 11月, 2008 2 次提交

x86: make NUMA on 32-bit depend on EXPERIMENTAL again · 604d2055

由 Rafael J. Wysocki 提交于 11月 12, 2008

My previous patch to make CONFIG_NUMA on x86_32 depend on BROKEN
turned out to be unnecessary, after all, since the source of the
hibernation vs CONFIG_NUMA problem turned out to be the fact that
we didn't take the NUMA KVA remapping into account in the
hibernation code.
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

604d2055

x86, hibernate: fix breakage on x86_32 with CONFIG_NUMA set · 97a70e54

由 Rafael J. Wysocki 提交于 11月 12, 2008

Impact: fix crash during hibernation on 32-bit NUMA

The NUMA code on x86_32 creates special memory mapping that allows
each node's pgdat to be located in this node's memory.  For this
purpose it allocates a memory area at the end of each node's memory
and maps this area so that it is accessible with virtual addresses
belonging to low memory.  As a result, if there is high memory,
these NUMA-allocated areas are physically located in high memory,
although they are mapped to low memory addresses.

Our hibernation code does not take that into account and for this
reason hibernation fails on all x86_32 systems with CONFIG_NUMA=y and
with high memory present.  Fix this by adding a special mapping for
the NUMA-allocated memory areas to the temporary page tables created
during the last phase of resume.
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

97a70e54

12 11月, 2008 6 次提交

ACPI: pci_link: remove acpi_irq_balance_set() interface · 32836259

由 Bjorn Helgaas 提交于 11月 05, 2008

This removes the acpi_irq_balance_set() interface from the PCI
interrupt link driver.

x86 used acpi_irq_balance_set() to tell the PCI interrupt link
driver to configure links to minimize IRQ sharing.  But the link
driver can easily figure out whether to turn on IRQ balancing
based on the IRQ model (PIC/IOAPIC/etc), so we can get rid of
that external interface.

It's better for the driver to figure this out at init-time.  If
we set it externally via the x86 code, the interface reduces
modularity, and we depend on the fact that acpi_process_madt()
happens before we process the kernel command line.
Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Len Brown <len.brown@intel.com>

32836259

KVM: Fix pit memory leak if unable to allocate irq source id · e17d1dc0

由 Avi Kivity 提交于 11月 11, 2008

Reported-By: Daniel Marjamäki <danielm77@spray.se>
Signed-off-by: Avi Kivity <avi@qumranet.com>

e17d1dc0

KVM: VMX: Set IGMT bit in EPT entry · 928d4bf7

由 Sheng Yang 提交于 11月 06, 2008

There is a potential issue: when the guest uses its page tables without vmexits
while EPT is enabled, the guest would use the PAT/PCD/PWT bits to index the PAT MSR
for its memory, which would be inconsistent with the host side and would cause a
host MCE due to inconsistent cache attributes.

The patch sets the IGMT bit in the EPT entry to ignore the guest PAT and use WB as
the default memory type to protect the host (notice that all memory mapped by KVM
should be WB).
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

928d4bf7

KVM: Require the PCI subsystem · ca93e992

由 Avi Kivity 提交于 11月 04, 2008

PCI device assignment makes calls to pci code, so require it to be built
into the kernel.
Signed-off-by: Avi Kivity <avi@qumranet.com>

ca93e992

x86: KVM guest: fix section mismatch warning in kvmclock.c · a29a2af3

由 Rakib Mullick 提交于 10月 29, 2008

WARNING: arch/x86/kernel/built-in.o(.text+0x1722c): Section mismatch
in reference from the function kvm_setup_secondary_clock() to the
function .devinit.text:setup_secondary_APIC_clock()
The function kvm_setup_secondary_clock() references
the function __devinit setup_secondary_APIC_clock().
This is often because kvm_setup_secondary_clock lacks a __devinit
annotation or the annotation of setup_secondary_APIC_clock is wrong.
Signed-off-by: Md.Rakib H. Mullick <rakib.mullick@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@redhat.com>

a29a2af3

KVM: MMU: increase per-vcpu rmap cache alloc size · c41ef344

由 Marcelo Tosatti 提交于 10月 28, 2008

The page fault path can use two rmap_desc structures, if:

- walk_addr's dirty pte update allocates one rmap_desc.
- mmu_lock is dropped, sptes are zapped resulting in rmap_desc being
freed.
- fetch->mmu_set_spte allocates another rmap_desc.

Increase to 4 for safety.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

c41ef344

11 11月, 2008 5 次提交

x86, voyager: fix smp generic helper voyager breakage · 6cd10f8d

由 James Bottomley 提交于 11月 09, 2008

Impact: build/boot fix for x86/Voyager

This change:

| commit 3d442233
| Author: Jens Axboe <jens.axboe@oracle.com>
| Date:   Thu Jun 26 11:21:34 2008 +0200
|
|     Add generic helpers for arch IPI function calls

didn't wire up the voyager smp call function correctly, so do that
here.  Also make CONFIG_USE_GENERIC_SMP_HELPERS a def_bool y again,
since we now use the generic helpers for every x86 architecture.
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jens Axboe <Jens.Axboe@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

6cd10f8d

x86: Make NUMA on 32-bit depend on BROKEN · 4694516d

由 Rafael J. Wysocki 提交于 11月 10, 2008

While investigating the failure of hibernation on 32-bit x86 with
CONFIG_NUMA set, as described in this message
http://marc.info/?l=linux-kernel&m=122634118116226&w=4
I asked some people for help and I was told that it wasn't really
worth the effort, because CONFIG_NUMA was generally broken on 32-bit
x86 systems and it shouldn't be used in such configs.  For this
reason, make CONFIG_NUMA depend on BROKEN instead of EXPERIMENTAL on
x86-32.
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Pavel Machek <pavel@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

4694516d

x86: HPET: enter hpet_interrupt_handler with interrupts disabled · 5ceb1a04

由 Matt Fleming 提交于 11月 02, 2008

Some functions that may be called from this handler require that
interrupts are disabled. Also, combining IRQF_DISABLED and
IRQF_SHARED does not reliably disable interrupts in a handler, so
remove IRQF_SHARED from the irq flags (this irq is not shared anyway).
Signed-off-by: Matt Fleming <mjf@gentoo.org>
Cc: mingo@elte.hu
Cc: venkatesh.pallipadi@intel.com
Cc: "Will Newton" <will.newton@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

5ceb1a04

x86: HPET: read from HPET_Tn_CMP() not HPET_T0_CMP · 89d77a1e

由 Matt Fleming 提交于 11月 02, 2008

In hpet_next_event() we check that the value we just wrote to
HPET_Tn_CMP(timer) has reached the chip. Currently, we're checking that
the value we wrote to HPET_Tn_CMP(timer) is in HPET_T0_CMP, which, if
timer is anything other than timer 0, is likely to fail.
Signed-off-by: Matt Fleming <mjf@gentoo.org>
Cc: mingo@elte.hu
Cc: venkatesh.pallipadi@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

89d77a1e

x86: HPET: convert WARN_ON to WARN_ON_ONCE · 1de5b085

由 Matt Fleming 提交于 11月 02, 2008

It is possible to flood the console with call traces if the WARN_ON
condition is true because of the frequency with which this function is
called.
Signed-off-by: Matt Fleming <mjf@gentoo.org>
Cc: mingo@elte.hu
Cc: venkatesh.pallipadi@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

1de5b085

10 11月, 2008 1 次提交

x86: move iomap.h to the new include location · 30446461

由 Arjan van de Ven 提交于 11月 09, 2008

a new file was accidentally added to include/asm-x86;
move it to the new arch/x86/include/asm location
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>

30446461

09 11月, 2008 1 次提交

sched: optimize sched_clock() a bit · 7cbaef9c

由 Ingo Molnar 提交于 11月 08, 2008

sched_clock() uses cycles_2_ns() needlessly - which is an irq-disabling
variant of __cycles_2_ns().

Most of the time sched_clock() is called with irqs disabled already.
The few places that call it with irqs enabled need to be updated.
Signed-off-by: Ingo Molnar <mingo@elte.hu>

7cbaef9c

08 11月, 2008 2 次提交

sched: improve sched_clock() performance · 0d12cdd5

由 Ingo Molnar 提交于 11月 08, 2008

in scheduler-intense workloads native_read_tsc() overhead accounts for
20% of the system overhead:

 659567 system_call                              41222.9375
 686796 schedule                                 435.7843
 718382 __switch_to                              665.1685
 823875 switch_mm                                4526.7857
 1883122 native_read_tsc                          55385.9412
 9761990 total                                      2.8468

this is in large part due to the rdtsc_barrier() that is done before
and after reading the TSC.

But sched_clock() is not a precise clock in the GTOD sense, using such
barriers is completely pointless. So remove the barriers and only use
them in vget_cycles().

This improves lat_ctx performance by about 5%.
Signed-off-by: Ingo Molnar <mingo@elte.hu>

0d12cdd5

oprofile: Fix p6 counter overflow check · 7c64ade5

由 Andi Kleen 提交于 11月 07, 2008

Fix the counter overflow check for CPUs with counter width > 32

I had a similar change in a different patch that I didn't submit
and I didn't notice the problem earlier because it was always
tested together.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>

7c64ade5

07 11月, 2008 2 次提交

xen: make sure stray alias mappings are gone before pinning · d05fdf31

由 Jeremy Fitzhardinge 提交于 10月 28, 2008

Xen requires that all mappings of pagetable pages are read-only, so
that they can't be updated illegally.  As a result, if a page is being
turned into a pagetable page, we need to make sure all its mappings
are RO.

If the page had been used for ioremap or vmalloc, it may still have
left over mappings as a result of not having been lazily unmapped.
This change makes sure we explicitly mop them all up before pinning
the page.

Unlike aliases created by kmap, there can be vmalloc aliases even
for non-high pages, so we must do the flush unconditionally.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Linux Memory Management List <linux-mm@kvack.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

d05fdf31

x86, xen: fix use of pgd_page now that it really does return a page · 47cb2ed9

由 Jeremy Fitzhardinge 提交于 11月 06, 2008

Impact: fix 32-bit Xen guest boot crash

On 32-bit PAE, pud_page, for no good reason, didn't really return a
struct page *.  Since Jan Beulich's fix "i386/PAE: fix pud_page()",
pud_page does return a struct page *.

Because PAE has 3 pagetable levels, the pud level is folded into the
pgd level, so pgd_page() is the same as pud_page(), and now returns
a struct page *.  Update the xen/mmu.c code which uses pgd_page()
accordingly.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

47cb2ed9

06 11月, 2008 9 次提交

Revert "x86: default to reboot via ACPI" · 8d00450d

由 Eduardo Habkost 提交于 11月 04, 2008

This reverts commit c7ffa6c2.

the assumption of this change was that this would not break
any existing machine. Andrey Borzenkov reported troubles with
the ACPI reboot method: the system would hang on reboot, necessitating
a power cycle. Probably more systems are affected as well.

Also, there are patches queued up for v2.6.29 to disable virtualization
on emergency_restart() - which was the original motivation of
this change.
Reported-by: Andrey Borzenkov <arvidjaar@mail.ru>
Bisected-by: Andrey Borzenkov <arvidjaar@mail.ru>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Acked-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

8d00450d

x86: align DirectMap in /proc/meminfo · b9c3bfc2

由 Hugh Dickins 提交于 11月 06, 2008

Impact: right-align /proc/meminfo consistent with other fields

When the split-LRU patches added Inactive(anon) and Inactive(file) lines
to /proc/meminfo, all counts were moved two columns rightwards to fit in.
Now move x86's DirectMap lines two columns rightwards to line up.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

b9c3bfc2

AMD IOMMU: fix lazy IO/TLB flushing in unmap path · 80be308d

由 Joerg Roedel 提交于 11月 06, 2008

Lazy flushing needs to take care of the unmap path too which is not yet
implemented and leads to stale IO/TLB entries. This is fixed by this
patch.
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

80be308d

x86: add smp_mb() before sending INVALIDATE_TLB_VECTOR · d6f0f39b

由 Suresh Siddha 提交于 11月 04, 2008

Impact: fix rare x2apic hang

On x86, x2apic mode accesses for sending IPIs don't have serializing
semantics. If the IPI receiver refers (in lock-free fashion) to some
memory set up by the sender, the need for smp_mb() before sending the
IPI becomes critical in x2apic mode.

Add the smp_mb() in native_flush_tlb_others() before sending the IPI.
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

d6f0f39b

x86: remove VISWS and PARAVIRT around NR_IRQS puzzle · 7db282fa

由 Yinghai Lu 提交于 11月 05, 2008

Impact: fix warning message when PARAVIRT is set in config

Remove stale #ifdef components from our IRQ sizing logic.
x86/Voyager is the only holdout.
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

7db282fa

x86: mention ACPI in top-level Kconfig menu · da85f865

由 Bjorn Helgaas 提交于 11月 05, 2008

Impact: clarify menuconfig text

Mention ACPI in the top-level menu to give a clue as to where
it lives. This matches what ia64 does.
Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

da85f865

x86: size NR_IRQS on 32-bit systems the same way as 64-bit · 1b489768

由 Yinghai Lu 提交于 11月 04, 2008

Impact: make NR_IRQS big enough for system with lots of apic/pins

If lots of IO_APIC's are there (or can be there), size the same way
as 64-bit, depending on MAX_IO_APICS and NR_CPUS.

This fixes the boot problem reported by Ben Hutchings on a 32-bit
server with 5 IO-APICs and 240 IO-APIC pins.
Signed-off-by: Yinghai <yinghai@kernel.org>
Tested-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

1b489768

x86: don't allow nr_irqs > NR_IRQS · c78d0cf2

由 Ben Hutchings 提交于 11月 05, 2008

Impact: fix boot hang on 32-bit systems with more than 224 IO-APIC pins

On some 32-bit systems with a lot of IO-APICs probe_nr_irqs() can
return a value larger than NR_IRQS. This will lead to probe_irq_on()
overrunning the irq_desc array.

I hit this when running net-next-2.6 (close to 2.6.28-rc3) on a
Supermicro dual Xeon system. NR_IRQS is 224 but probe_nr_irqs() detects
5 IOAPICs and returns 240. Here are the log messages:

Tue Nov 4 16:53:47 2008 ACPI: IOAPIC (id[0x01] address[0xfec00000] gsi_base[0])
Tue Nov 4 16:53:47 2008 IOAPIC[0]: apic_id 1, version 32, address 0xfec00000, GSI 0-23
Tue Nov 4 16:53:47 2008 ACPI: IOAPIC (id[0x02] address[0xfec81000] gsi_base[24])
Tue Nov 4 16:53:47 2008 IOAPIC[1]: apic_id 2, version 32, address 0xfec81000, GSI 24-47
Tue Nov 4 16:53:47 2008 ACPI: IOAPIC (id[0x03] address[0xfec81400] gsi_base[48])
Tue Nov 4 16:53:47 2008 IOAPIC[2]: apic_id 3, version 32, address 0xfec81400, GSI 48-71
Tue Nov 4 16:53:47 2008 ACPI: IOAPIC (id[0x04] address[0xfec82000] gsi_base[72])
Tue Nov 4 16:53:47 2008 IOAPIC[3]: apic_id 4, version 32, address 0xfec82000, GSI 72-95
Tue Nov 4 16:53:47 2008 ACPI: IOAPIC (id[0x05] address[0xfec82400] gsi_base[96])
Tue Nov 4 16:53:47 2008 IOAPIC[4]: apic_id 5, version 32, address 0xfec82400, GSI 96-119
Tue Nov 4 16:53:47 2008 ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 high edge)
Tue Nov 4 16:53:47 2008 ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
Tue Nov 4 16:53:47 2008 Enabling APIC mode: Flat. Using 5 I/O APICs
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Acked-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

c78d0cf2

sched: re-tune balancing · 9fcd18c9

由 Ingo Molnar 提交于 11月 05, 2008

Impact: improve wakeup affinity on NUMA systems, tweak SMP systems

Given the fixes+tweaks to the wakeup-buddy code, re-tweak the domain
balancing defaults on NUMA and SMP systems.

Turn on SD_WAKE_AFFINE which was off on x86 NUMA - there's no reason
why we would not want to have wakeup affinity across nodes as well.
(we already do this in the standard NUMA template.)

lat_ctx on a NUMA box is particularly happy about this change:

before:

 |   phoenix:~/l> ./lat_ctx -s 0 2
 |   "size=0k ovr=2.60
 |   2 5.70

after:

 |   phoenix:~/l> ./lat_ctx -s 0 2
 |   "size=0k ovr=2.65
 |   2 2.07

a 2.75x speedup.

pipe-test is similarly happy about it too:

 |  phoenix:~/sched-tests> ./pipe-test
 |   18.26 usecs/loop.
 |   14.70 usecs/loop.
 |   14.38 usecs/loop.
 |   10.55 usecs/loop.              # +WAKE_AFFINE on domain0+domain1
 |   8.63 usecs/loop.
 |   8.59 usecs/loop.
 |   9.03 usecs/loop.
 |   8.94 usecs/loop.
 |   8.96 usecs/loop.
 |   8.63 usecs/loop.

Also:

 - disable SD_BALANCE_NEWIDLE on NUMA and SMP domains (keep it for siblings)
 - enable SD_WAKE_BALANCE on SMP domains

Sysbench+postgresql improves all around the board, quite significantly:

           .28-rc3-11474e2c  .28-rc3-11474e2c-tune
-------------------------------------------------
    1:             571              688    +17.08%
    2:            1236             1206    -2.55%
    4:            2381             2642    +9.89%
    8:            4958             5164    +3.99%
   16:            9580             9574    -0.07%
   32:            7128             8118    +12.20%
   64:            7342             8266    +11.18%
  128:            7342             8064    +8.95%
  256:            7519             7884    +4.62%
  512:            7350             7731    +4.93%
-------------------------------------------------
  SUM:           55412            59341    +6.62%

So it's a win both for the runup portion, the peak area and the tail.
Signed-off-by: Ingo Molnar <mingo@elte.hu>

9fcd18c9

04 11月, 2008 1 次提交

x86: don't use tsc_khz to calculate lpj if notsc is passed · 70de9a97

由 Alok Kataria 提交于 11月 03, 2008

Impact: fix udelay when "notsc" boot parameter is passed

With notsc passed on commandline, tsc may not be used for
udelays, make sure that we do not use tsc_khz to calculate
the lpj value in such cases.
Reported-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Alok N Kataria <akataria@vmware.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

70de9a97