1. 07 Jan, 2015 · 1 commit
  2. 31 Dec, 2014 · 1 commit
    • rcu: Make rcu_nmi_enter() handle nesting · 734d1680
      Committed by Paul E. McKenney
      The x86 architecture has multiple types of NMI-like interrupts: real
      NMIs, machine checks, and, for some values of NMI-like, debugging
      and breakpoint interrupts.  These interrupts can nest inside each
      other.  Andy Lutomirski is adding RCU support to these interrupts,
      so rcu_nmi_enter() and rcu_nmi_exit() must now correctly handle nesting.
      
      This commit therefore introduces nesting, using a clever NMI-coordination
      algorithm suggested by Andy.  The trick is to atomically increment
      ->dynticks (if needed) before manipulating ->dynticks_nmi_nesting on entry
      (and, accordingly, after on exit).  In addition, ->dynticks_nmi_nesting
      is incremented by one if ->dynticks was incremented and by two otherwise.
      This means that when rcu_nmi_exit() sees ->dynticks_nmi_nesting equal
      to one, it knows that ->dynticks must be atomically incremented.
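
      In C, the entry/exit pair described above looks roughly like the
      sketch below.  This is a minimal userspace rendering using C11
      atomics; the function names are illustrative and this is not the
      kernel's exact code:

      #include <assert.h>
      #include <stdatomic.h>

      static atomic_int dynticks;        /* odd = non-idle, even = idle */
      static int dynticks_nmi_nesting;   /* per-CPU in the real kernel  */

      #define BUSY_INCBY 2

      static void sketch_nmi_enter(void)
      {
      	int incby = BUSY_INCBY;

      	if (!(atomic_load(&dynticks) & 1)) {
      		/* Outermost NMI from idle: atomically leave idle first. */
      		atomic_fetch_add(&dynticks, 1);
      		incby = 1;	/* records that we own the ->dynticks increment */
      	}
      	dynticks_nmi_nesting += incby;
      }

      static void sketch_nmi_exit(void)
      {
      	assert(dynticks_nmi_nesting > 0);
      	if (dynticks_nmi_nesting != 1) {
      		/* Nested exit: just undo this level's increment. */
      		dynticks_nmi_nesting -= BUSY_INCBY;
      		return;
      	}
      	/* Outermost exit owns ->dynticks: make it even (idle) again. */
      	dynticks_nmi_nesting = 0;
      	atomic_fetch_add(&dynticks, 1);
      }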
      
      This NMI-coordination algorithm has been validated by the following
      Promela model:
      
      ------------------------------------------------------------------------
      
      /*
       * Promela model for Andy Lutomirski's suggested change to rcu_nmi_enter()
       * that allows nesting.
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License as published by
       * the Free Software Foundation; either version 2 of the License, or
       * (at your option) any later version.
       *
       * This program is distributed in the hope that it will be useful,
       * but WITHOUT ANY WARRANTY; without even the implied warranty of
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       * GNU General Public License for more details.
       *
       * You should have received a copy of the GNU General Public License
       * along with this program; if not, you can access it online at
       * http://www.gnu.org/licenses/gpl-2.0.html.
       *
       * Copyright IBM Corporation, 2014
       *
       * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
       */
      
      #define BUSY_INCBY 2

      byte dynticks_nmi_nesting = 0;
      byte dynticks = 0;
      
      /*
       * Promela version of rcu_nmi_enter().
       */
      inline rcu_nmi_enter()
      {
      	byte incby;
      	byte tmp;
      
      	incby = BUSY_INCBY;
      	assert(dynticks_nmi_nesting >= 0);
      	if
      	:: (dynticks & 1) == 0 ->
      		atomic {
      			dynticks = dynticks + 1;
      		}
      		assert((dynticks & 1) == 1);
      		incby = 1;
      	:: else ->
      		skip;
      	fi;
      	tmp = dynticks_nmi_nesting;
      	tmp = tmp + incby;
      	dynticks_nmi_nesting = tmp;
      	assert(dynticks_nmi_nesting >= 1);
      }
      
      /*
       * Promela version of rcu_nmi_exit().
       */
      inline rcu_nmi_exit()
      {
      	byte tmp;
      
      	assert(dynticks_nmi_nesting > 0);
      	assert((dynticks & 1) != 0);
      	if
      	:: dynticks_nmi_nesting != 1 ->
      		tmp = dynticks_nmi_nesting;
      		tmp = tmp - BUSY_INCBY;
      		dynticks_nmi_nesting = tmp;
      	:: else ->
      		dynticks_nmi_nesting = 0;
      		atomic {
      			dynticks = dynticks + 1;
      		}
      		assert((dynticks & 1) == 0);
      	fi;
      }
      
      /*
       * Base-level NMI runs non-atomically.  Crudely emulates process-level
       * dynticks-idle entry/exit.
       */
      proctype base_NMI()
      {
      	byte busy;
      
      	busy = 0;
      	do
      	::	/* Emulate base-level dynticks and not. */
      		if
      		:: 1 ->	atomic {
      				dynticks = dynticks + 1;
      			}
      			busy = 1;
      		:: 1 ->	skip;
      		fi;
      
      		/* Verify that we only sometimes have base-level dynticks. */
      		if
      		:: busy == 0 -> skip;
      		:: busy == 1 -> skip;
      		fi;
      
      		/* Model RCU's NMI entry and exit actions. */
      		rcu_nmi_enter();
      		assert((dynticks & 1) == 1);
      		rcu_nmi_exit();
      
      		/* Emulate re-entering base-level dynticks and not. */
      		if
      		:: !busy -> skip;
      		:: busy ->
      			atomic {
      				dynticks = dynticks + 1;
      			}
      			busy = 0;
      		fi;
      
      		/* We had better now be in dyntick-idle mode. */
      		assert((dynticks & 1) == 0);
      	od;
      }
      
      /*
       * Nested NMI runs atomically to emulate interrupting base_NMI().
       */
      proctype nested_NMI()
      {
      	do
      	::	/*
      		 * Use an atomic section to model a nested NMI.  This is
      		 * guaranteed to interleave into base_NMI() between a pair
      		 * of base_NMI() statements, just as a nested NMI would.
      		 */
      		atomic {
      			/* Verify that we only sometimes are in dynticks. */
      			if
      			:: (dynticks & 1) == 0 -> skip;
      			:: (dynticks & 1) == 1 -> skip;
      			fi;
      
      			/* Model RCU's NMI entry and exit actions. */
      			rcu_nmi_enter();
      			assert((dynticks & 1) == 1);
      			rcu_nmi_exit();
      		}
      	od;
      }
      
      init {
      	run base_NMI();
      	run nested_NMI();
      }
      
      ------------------------------------------------------------------------
      
      The following script can be used to run this model if placed in
      rcu_nmi.spin:
      
      ------------------------------------------------------------------------
      
      if ! spin -a rcu_nmi.spin
      then
      	echo Spin errors!!!
      	exit 1
      fi
      if ! cc -DSAFETY -o pan pan.c
      then
      	echo Compilation errors!!!
      	exit 1
      fi
      ./pan -m100000
      Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
      Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
  3. 20 Dec, 2014 · 1 commit
  4. 19 Dec, 2014 · 1 commit
    • tick/powerclamp: Remove tick_nohz_idle abuse · a5fd9733
      Committed by Thomas Gleixner
      commit 4dbd2771 "tick: export nohz tick idle symbols for module
      use" was merged via the thermal tree without an explicit ack from the
      relevant maintainers.
      
      The exports are abused by the intel powerclamp driver which implements
      a fake idle state from a sched FIFO task. This causes all kinds of
      wreckage in the NOHZ core code which rightfully assumes that
      tick_nohz_idle_enter/exit() are only called from the idle task itself.
      
      Recent changes in the NOHZ core lead to a failure of the powerclamp
      driver and now people try to hack completely broken and backwards
      workarounds into the NOHZ core code. This is completely unacceptable
      and just papers over the real problem. There are way more subtle
      issues lurking around the corner.
      
      The real solution is to fix the powerclamp driver by rewriting it with
      a sane concept, but that's beyond the scope of this.
      
      So the only solution for now is to remove the calls into the core NOHZ
      code from the powerclamp trainwreck along with the exports. 
      
      Fixes: d6d71ee4 "PM: Introduce Intel PowerClamp Driver"
      Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
      Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
      Cc: Viresh Kumar <viresh.kumar@linaro.org>
      Cc: Frederic Weisbecker <fweisbec@gmail.com>
      Cc: Fengguang Wu <fengguang.wu@intel.com>
      Cc: Frederic Weisbecker <frederic@kernel.org>
      Cc: Pan Jacob jun <jacob.jun.pan@intel.com>
      Cc: LKP <lkp@01.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Zhang Rui <rui.zhang@intel.com>
      Cc: stable@vger.kernel.org
      Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1412181110110.17382@nanos
      Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  5. 18 Dec, 2014 · 1 commit
    • param: do not set store func without write perm · b0a65b0c
      Committed by Kees Cook
      When a module_param is defined without DAC write permissions, it can
      still be changed at runtime and updated. Drivers using a 0444 permission
      may be surprised that these values can still be changed.
      
      For drivers that want to allow updates, any S_IW* flag will set the
      "store" function as before. Drivers without S_IW* flags will have the
      "store" function unset, enforcing a read-only value. Drivers that wish
      neither "store" nor "get" can continue to use "0" for perms to stay out
      of sysfs entirely.
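
      For example, a driver could now declare parameters like this
      (a hedged sketch; the parameter names are made up):

      #include <linux/module.h>
      #include <linux/moduleparam.h>

      static int major = 116;
      module_param(major, int, 0444);	/* read-only: "store" stays unset */

      static int slots = 8;
      module_param(slots, int, 0644);	/* S_IWUSR set: writable via sysfs */

      static int debug;
      module_param(debug, int, 0);	/* perms 0: no sysfs entry at all */

      MODULE_LICENSE("GPL");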
      
      Old behavior:
        # cd /sys/module/snd/parameters
        # ls -l
        total 0
        -r--r--r-- 1 root root 4096 Dec 11 13:55 cards_limit
        -r--r--r-- 1 root root 4096 Dec 11 13:55 major
        -r--r--r-- 1 root root 4096 Dec 11 13:55 slots
        # cat major
        116
        # echo -1 > major
        -bash: major: Permission denied
        # chmod u+w major
        # echo -1 > major
        # cat major
        -1
      
      New behavior:
        ...
        # chmod u+w major
        # echo -1 > major
        -bash: echo: write error: Input/output error
      Signed-off-by: Kees Cook <keescook@chromium.org>
      Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
  6. 15 Dec, 2014 · 2 commits
  7. 14 Dec, 2014 · 8 commits
  8. 13 Dec, 2014 · 2 commits
    • genirq: Prevent proc race against freeing of irq descriptors · c291ee62
      Committed by Thomas Gleixner
      Since the rework of the sparse interrupt code to actually free the
      unused interrupt descriptors there exists a race between the /proc
      interfaces to the irq subsystem and the code which frees the interrupt
      descriptor.
      
      CPU0				CPU1
      				show_interrupts()
      				  desc = irq_to_desc(X);
      free_desc(desc)
        remove_from_radix_tree();
        kfree(desc);
      				  raw_spinlock_irq(&desc->lock);
      
      /proc/interrupts is the only interface which can actively corrupt
      kernel memory via the lock access. /proc/stat can only read from freed
      memory. Extremely hard to trigger, but possible.
      
      The interfaces in /proc/irq/N/ are not affected by this because the
      removal of the proc file is serialized in procfs against concurrent
      readers/writers. The removal happens before the descriptor is freed.
      
      For architectures which have CONFIG_SPARSE_IRQ=n this is a non-issue
      as the descriptor is never freed. It's merely cleared out with the irq
      descriptor lock held. So any concurrent proc access will either see
      the old correct value or the cleared out ones.
      
      Protect the lookup and access to the irq descriptor in
      show_interrupts() with the sparse_irq_lock.
      
      Provide kstat_irqs_usr() which is protecting the lookup and access
      with sparse_irq_lock and switch /proc/stat to use it.
      
      Document the existing kstat_irqs interfaces so it's clear that the
      caller needs to take care about protection. The users of these
      interfaces are either not affected due to SPARSE_IRQ=n or already
      protected against removal.
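
      Based on that description, the protected lookup has roughly this
      shape (a sketch; treat the helper names as illustrative rather
      than the verified diff):

      unsigned int kstat_irqs_usr(unsigned int irq)
      {
      	unsigned int sum;

      	irq_lock_sparse();	/* keeps free_desc() from running */
      	sum = kstat_irqs(irq);	/* lookup + per-CPU sum now safe  */
      	irq_unlock_sparse();

      	return sum;
      }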
      
      Fixes: 1f5a5b87 "genirq: Implement a sane sparse_irq allocator"
      Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
      Cc: stable@vger.kernel.org
    • tracing / PM: Replace CONFIG_PM_RUNTIME with CONFIG_PM · 798bc6d8
      Committed by Rafael J. Wysocki
      After commit b2b49ccb (PM: Kconfig: Set PM_RUNTIME if PM_SLEEP is
      selected) PM_RUNTIME is always set if PM is set, so files that are
      built conditionally if CONFIG_PM_RUNTIME is set may now be built
      if CONFIG_PM is set.
      
      Replace CONFIG_PM_RUNTIME with CONFIG_PM in kernel/trace/Makefile
      for this reason.
      Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
      Acked-by: Steven Rostedt <rostedt@goodmis.org>
  9. 12 Dec, 2014 · 3 commits
    • userns: Correct the comment in map_write · 36476bea
      Committed by Eric W. Biederman
      It is important that all maps are less than PAGE_SIZE
      or else setting the last byte of the buffer to '0'
      could write off the end of the allocated storage.
      
      Correct the misleading comment.
      Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
    • userns: Allow setting gid_maps without privilege when setgroups is disabled · 66d2f338
      Committed by Eric W. Biederman
      Now that setgroups can be disabled and not re-enabled, setting gid_map
      without privilege can now be enabled when setgroups is disabled.
      
      This restores most of the functionality that was lost when unprivileged
      setting of gid_map was removed.  Applications that use this functionality
      will need to check to see if they use setgroups or init_groups, and if they
      don't they can be fixed by simply disabling setgroups before writing to
      gid_map.
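
      A minimal unprivileged sequence along these lines might look as
      follows (hedged example; error handling is trimmed):

      #define _GNU_SOURCE
      #include <fcntl.h>
      #include <sched.h>
      #include <stdio.h>
      #include <string.h>
      #include <unistd.h>

      static void write_file(const char *path, const char *buf)
      {
      	int fd = open(path, O_WRONLY);

      	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
      		perror(path);
      	if (fd >= 0)
      		close(fd);
      }

      int main(void)
      {
      	char map[64];
      	int gid = (int)getgid();

      	if (unshare(CLONE_NEWUSER))	/* new user namespace */
      		return 1;
      	write_file("/proc/self/setgroups", "deny"); /* must precede gid_map */
      	snprintf(map, sizeof(map), "0 %d 1", gid);
      	write_file("/proc/self/gid_map", map);
      	return 0;
      }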
      
      Cc: stable@vger.kernel.org
      Reviewed-by: Andy Lutomirski <luto@amacapital.net>
      Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
    • userns: Add a knob to disable setgroups on a per user namespace basis · 9cc46516
      Committed by Eric W. Biederman
      - Expose the knob to user space through a proc file /proc/<pid>/setgroups
      
        A value of "deny" means the setgroups system call is disabled in the
        current process's user namespace and cannot be enabled in the
        future in this user namespace.
      
        A value of "allow" means the setgroups system call is enabled.
      
      - Descendant user namespaces inherit the value of setgroups from
        their parents.
      
      - A proc file is used (instead of a sysctl) as sysctls currently do
        not allow checking the permissions at open time.
      
      - Writing to the proc file is restricted to before the gid_map
        for the user namespace is set.
      
        This ensures that disabling setgroups at a user namespace
        level will never remove the ability to call setgroups
        from a process that already has that ability.
      
        A process may opt in to the setgroups disable for itself by
        creating, entering and configuring a user namespace or by calling
        setns on an existing user namespace with setgroups disabled.
        Processes without privileges already cannot call setgroups, so this
        is a no-op.  Processes with privilege become processes without
        privilege when entering a user namespace and as with any other path
        to dropping privilege they would not have the ability to call
        setgroups.  So this remains within the bounds of what is possible
        without a knob to disable setgroups permanently in a user namespace.
      
      Cc: stable@vger.kernel.org
      Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
  10. 11 Dec, 2014 · 20 commits
    • printk: Do not disable preemption for accessing printk_func · 1fb8915b
      Committed by Steven Rostedt (Red Hat)
      As printk_func will either be the default function, or a per_cpu function
      for the current CPU, there's no reason to disable preemption to access
      it from printk. That's because if the printk_func is not the default
      then the caller had better have disabled preemption, as they were the one to
      change it.
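
      The resulting access pattern is roughly the following sketch (the
      shape of the idea, not necessarily the exact post-patch code):

      static DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;

      asmlinkage __visible int printk(const char *fmt, ...)
      {
      	printk_func_t vprintk_func;
      	va_list args;
      	int r;

      	va_start(args, fmt);
      	/* Plain this_cpu_read(): any non-default setter already
      	 * runs with preemption disabled, per the argument above. */
      	vprintk_func = this_cpu_read(printk_func);
      	r = vprintk_func(fmt, args);
      	va_end(args);

      	return r;
      }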
      
      Link: http://lkml.kernel.org/r/CA+55aFz5-_LKW4JHEBoWinN9_ouNcGRWAF2FUA35u46FRN-Kxw@mail.gmail.com
      Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
      Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
    • perf: Fix events installation during moving group · 9fc81d87
      Committed by Jiri Olsa
      We allow the PMU driver to change the CPU on which the event
      should be installed. This happened in patch:
      
        e2d37cd2 ("perf: Allow the PMU driver to choose the CPU on which to install events")
      
      This patch also forces all the group members to follow
      the currently opened event's CPU if the group happened
      to be moved.
      
      This and the change of event->cpu in perf_install_in_context()
      function introduced in:
      
        0cda4c02 ("perf: Introduce perf_pmu_migrate_context()")
      
      forces group members to change their event->cpu,
      if the currently-opened-event's PMU changed the cpu
      and there is a group move.
      
      The above behaviour causes a problem for breakpoint events,
      which use event->cpu to touch cpu specific data for
      breakpoints accounting. By changing event->cpu, some
      breakpoints slots were wrongly accounted for given
      cpu.
      
      Vince's perf fuzzer hit this issue and caused the following
      WARN on my setup:
      
         WARNING: CPU: 0 PID: 20214 at arch/x86/kernel/hw_breakpoint.c:119 arch_install_hw_breakpoint+0x142/0x150()
         Can't find any breakpoint slot
         [...]
      
      This patch changes the group moving code to keep the event's
      original cpu.
      Reported-by: Vince Weaver <vince@deater.net>
      Signed-off-by: Jiri Olsa <jolsa@redhat.com>
      Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
      Cc: Frederic Weisbecker <fweisbec@gmail.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Stephane Eranian <eranian@google.com>
      Cc: Vince Weaver <vince@deater.net>
      Cc: Yan, Zheng <zheng.z.yan@intel.com>
      Cc: <stable@vger.kernel.org>
      Link: http://lkml.kernel.org/r/1418243031-20367-3-git-send-email-jolsa@kernel.org
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
    • exit: pidns: fix/update the comments in zap_pid_ns_processes() · a53b8315
      Committed by Oleg Nesterov
      The comments in zap_pid_ns_processes() are not clear; we need to explain
      how this code actually works.
      
      1. "Ignore SIGCHLD" looks like optimization but it is not, we also
         need this for correctness.
      
      2. The comment above sys_wait4() could tell more.
      
         EXIT_ZOMBIE child is only possible if it has exited before we
         ignored SIGCHLD. Or if it is traced from the parent namespace,
         but in this case it will be reaped by the debugger after detach;
         sys_wait4() acts as a synchronization point.
      
      3. The comment about TASK_DEAD (EXIT_DEAD in fact) children is
         outdated. Contrary to what it says we do not need to make sure
         they all go away after 0a01f2cc "pidns: Make the pidns proc
         mount/umount logic obvious".
      
         At the same time, we do need to wait for nr_hashed==init_pids,
         but the reasons are quite different and not obvious: setns().
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: Pavel Emelyanov <xemul@parallels.com>
      Cc: Serge Hallyn <serge.hallyn@ubuntu.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting · 24c037eb
      Committed by Oleg Nesterov
      alloc_pid() does get_pid_ns() beforehand but forgets to put_pid_ns() if it
      fails because disable_pid_allocation() was called by the exiting
      child_reaper.
      
      We could simply move get_pid_ns() down to successful return, but this fix
      tries to be as trivial as possible.
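
      The pattern at issue, reduced to a self-contained userspace
      illustration (all names here are made up):

      #include <stdlib.h>

      struct ns { int refcount; };

      static void get_ns(struct ns *ns) { ns->refcount++; }
      static void put_ns(struct ns *ns) { ns->refcount--; }

      static void *alloc_obj(struct ns *ns)
      {
      	void *obj;

      	get_ns(ns);		/* reference taken up front */
      	obj = malloc(16);
      	if (!obj) {
      		put_ns(ns);	/* the previously missing release */
      		return NULL;
      	}
      	return obj;
      }

      int main(void)
      {
      	struct ns ns = { .refcount = 1 };

      	free(alloc_obj(&ns));
      	return 0;
      }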
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: Pavel Emelyanov <xemul@parallels.com>
      Cc: Serge Hallyn <serge.hallyn@ubuntu.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: exit_notify: re-use "dead" list to autoreap current · 6c66e7db
      Committed by Oleg Nesterov
      After the previous change we can add just the exiting EXIT_DEAD task to
      the "dead" list and remove another release_task(tsk).
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: call forget_original_parent() under tasklist_lock · 482a3767
      Committed by Oleg Nesterov
      Shift "release dead children" loop from forget_original_parent() to its
      caller, exit_notify().  It is safe to reap them even if our parent reaps
      us right after we drop tasklist_lock, those children no longer have any
      connection to the exiting task.
      
      And this allows us to avoid write_lock_irq(tasklist_lock) right after it
      was released by forget_original_parent(); we can simply call it with
      tasklist_lock held.
      
      While at it, move the comment about forget_original_parent() up to
      this function.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: avoid find_new_reaper() if no children · ad9e206a
      Committed by Oleg Nesterov
      Now that pid_ns logic was isolated we can change forget_original_parent()
      to return right after find_child_reaper() when father->children is empty;
      there is nothing to reparent in this case.
      
      In particular this avoids find_alive_thread() and this can help if the
      whole process exits and it has a lot of PF_EXITING threads at the start of
      the thread list; this can easily lead to O(nr_threads ** 2) iterations.
      
      Trivial test case (tested under KVM, 2 CPUs):
      
          #include <assert.h>
          #include <pthread.h>
          #include <signal.h>
          #include <stdlib.h>
          #include <sys/wait.h>
          #include <unistd.h>

          static void *tfunc(void *arg)
          {
              pause();
              return NULL;
          }
      
          static int child(unsigned int nt)
          {
              pthread_t pt;
      
              while (nt--)
                  assert(pthread_create(&pt, NULL, tfunc, NULL) == 0);
      
              pthread_kill(pt, SIGTRAP);
              pause();
              return 0;
          }
      
          int main(int argc, const char *argv[])
          {
              int stat;
              unsigned int nf = atoi(argv[1]);
              unsigned int nt = atoi(argv[2]);
      
              while (nf--) {
                  if (!fork())
                      return child(nt);
      
                  wait(&stat);
                  assert(stat == SIGTRAP);
              }
      
              return 0;
          }
      
      $ time ./test 16 16536 shows:
      
                    real        user         sys
          -    5m37.628s    0m4.437s    8m5.560s
          +    0m50.032s    0m7.130s    1m4.927s
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: introduce find_alive_thread() · c9dc05bf
      Committed by Oleg Nesterov
      Add the new simple helper to factor out the for_each_thread() code in
      find_child_reaper() and find_new_reaper().  It can also simplify the
      potential PF_EXITING -> exit_state change, plus perhaps we can change this
      code to take SIGNAL_GROUP_EXIT into account.
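
      Based on this description, the helper presumably has the following
      shape (a sketch, not the verified diff):

      static struct task_struct *find_alive_thread(struct task_struct *p)
      {
      	struct task_struct *t;

      	for_each_thread(p, t) {
      		if (!(t->flags & PF_EXITING))
      			return t;
      	}
      	return NULL;
      }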
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Kay Sievers <kay@vrfy.org>
      Cc: Lennart Poettering <lennart@poettering.net>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: introduce find_child_reaper() · 1109909c
      Committed by Oleg Nesterov
      find_new_reaper() does 2 completely different things.  Not only does it find a
      reaper, it also updates pid_ns->child_reaper or kills the whole namespace
      if the caller is ->child_reaper.
      
      Now that has_child_subreaper logic doesn't depend on child_reaper check we
      can move that pid_ns code into a separate helper.  IMHO this makes the
      code more clean, and this allows the next changes.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Kay Sievers <kay@vrfy.org>
      Cc: Lennart Poettering <lennart@poettering.net>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: document the ->has_child_subreaper checks · 175aed3f
      Committed by Oleg Nesterov
      Swap the "init_task" and same_thread_group() checks.  This way it is
      simpler to document these checks, and we can remove the link to the previous
      discussion on lkml.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Kay Sievers <kay@vrfy.org>
      Cc: Lennart Poettering <lennart@poettering.net>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper() · 3750ef97
      Committed by Oleg Nesterov
      Change find_new_reaper() to use for_each_thread() instead of deprecated
      while_each_thread().  We do not bother to check "thread != father" in the
      1st loop; we can rely on the PF_EXITING check.
      
      Note: this means a minor behavioural change: for_each_thread() starts
      from the group leader.  But this should be fine; nobody should make any
      assumption about do_wait(__WNOTHREAD) when it comes to reparented tasks.
      And this can avoid the pointless reparenting to a short-living thread,
      while zombie leaders are not that common.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Kay Sievers <kay@vrfy.org>
      Cc: Lennart Poettering <lennart@poettering.net>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting · 7d24e2df
      Committed by Oleg Nesterov
      find_new_reaper() assumes that "has_child_subreaper" logic is safe as
      long as we are not the exiting ->child_reaper and this is doubly wrong:
      
      1. In fact it is safe if "pid_ns->child_reaper == father"; there must
         be no children after zap_pid_ns_processes() returns, so it doesn't
         matter what we return in this case and even pid_ns->child_reaper is
         wrong otherwise: we can't reparent to ->child_reaper == current.
      
         This is not a bug, but this is confusing.
      
      2. It is not safe if we are not pid_ns->child_reaper but from the same
         thread group. We drop tasklist_lock before zap_pid_ns_processes(),
         so another thread can lock it and choose the new reaper from the
         upper namespace if has_child_subreaper == T, and this is obviously
         wrong.
      
         This is not that bad, zap_pid_ns_processes() won't return until the
         new reaper reaps all zombies, but this should be fixed anyway.
      
      We could change for_each_thread() loop to use ->exit_state instead of
      PF_EXITING which we had to use until 8aac6270, or we could change
      copy_signal() to check CLONE_NEWPID before setting has_child_subreaper,
      but lets change this code so that it is clear we can't look outside of
      our namespace, otherwise same_thread_group(reaper, child_reaper) check
      will look wrong and confusing anyway.
      
      We can simply start from "father" and fix the problem. We can't wrongly
      return a thread from the same thread group if ->is_child_subreaper == T;
      we know that all threads have PF_EXITING set.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Kay Sievers <kay@vrfy.org>
      Cc: Lennart Poettering <lennart@poettering.net>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting · 8a1296ae
      Committed by Oleg Nesterov
      The ->has_child_subreaper code in find_new_reaper() finds an alive "thread"
      but returns another "reaper" thread which can be dead.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Kay Sievers <kay@vrfy.org>
      Cc: Lennart Poettering <lennart@poettering.net>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: release_task: fix the comment about group leader accounting · 26e75b5c
      Committed by Oleg Nesterov
      Contrary to what the comment in __exit_signal() says we do account the
      group leader. Fix this and explain why.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: wait: drop tasklist_lock before psig->c* accounting · 986094df
      Committed by Oleg Nesterov
      wait_task_zombie() no longer needs tasklist_lock to accumulate the
      psig->c* counters; we can drop it right after cmpxchg(exit_state).
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: wait: don't use zombie->real_parent · f953ccd0
      Committed by Oleg Nesterov
      1. wait_task_zombie() uses p->real_parent to get psig/siglock. This is
         correct but needs tasklist_lock; ->real_parent can exit.
      
         We can use "current" instead. This is our natural child, its parent
         must be our sub-thread.
      
      2. Read psig/sig outside of ->siglock, ->signal is no longer protected
         by this lock.
      
      3. Fix the outdated comments about tasklist_lock. We can not race with
         __exit_signal(), the whole thread group is dead, nobody but us can
         call it.
      
         Also clarify the usage of ->stats_lock and ->siglock.
      
      Note: thread_group_cputime_adjusted() is sub-optimal in this case, we
      probably want to export cputime_adjust() to avoid thread_group_cputime().
      The comment says "all threads" but there are no other threads.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • exit: wait: cleanup the ptrace_reparented() checks · f6507f83
      Committed by Oleg Nesterov
      Now that EXIT_DEAD is the terminal state we can kill the "int traced"
      variable and check "state == EXIT_DEAD" instead to cleanup the code.  In
      particular, this way it is clear that the check obviously doesn't need
      tasklist_lock.
      
      Also fix the type of "unsigned long state"; "long" was always wrong
      although this doesn't matter because cmpxchg/xchg uses typeof(*ptr).
      
      [akpm@linux-foundation.org: don't make me google the C Operator Precedence table]
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Aaron Tomlin <atomlin@redhat.com>
      Cc: "Eric W. Biederman" <ebiederm@xmission.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Sterling Alexander <stalexan@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • usermodehelper: kill the kmod_thread_locker logic · 7f6def9f
      Committed by Oleg Nesterov
      Now that we do not call kernel_thread(CLONE_VFORK) from the worker
      thread, we can no longer deadlock if do_execve() in turn triggers
      another call_usermodehelper(), so we can remove the kmod_thread_locker
      code.
      
      Note: we should probably kill khelper_wq and simply use one of the
      global workqueues, say, system_unbound_wq; this special wq for umh buys
      nothing nowadays.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • usermodehelper: don't use CLONE_VFORK for ____call_usermodehelper() · 7117bc88
      Committed by Oleg Nesterov
      After "kernel/kmod: fix use-after-free of the sub_info structure",
      CLONE_VFORK in __call_usermodehelper() buys nothing; we rely on
      umh_complete() in ____call_usermodehelper() anyway.
      
      Remove it.  This also eliminates the unnecessary sleep/wakeup in the
      likely case, and this allows the next change.
      
      While at it, kill the "int wait" locals in ____call_usermodehelper() and
      __call_usermodehelper(), they can safely use sub_info->wait.
      Signed-off-by: Oleg Nesterov <oleg@redhat.com>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Oleg Nesterov <oleg@redhat.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    • printk: drop logbuf_cpu volatile qualifier · f099755d
      Committed by Alex Elder
      Pranith Kumar posted a patch which removed the "volatile"
      qualifier for the "logbuf_cpu" variable in vprintk_emit().
          https://lkml.org/lkml/2014/11/13/894
      In his patch, he used ACCESS_ONCE() for all references to
      that symbol to provide whatever protection was intended.
      
      There was some discussion that followed, and in the end Steven Rostedt
      concluded that not only was "volatile" not needed, neither was it
      required to use ACCESS_ONCE().  I offered an elaborate description that
      concluded Steven was right, and Pranith asked me to submit an
      alternative patch.  And this is it.
      
      The basic reason "volatile" is not needed is that "logbuf_cpu" has
      static storage duration, and vprintk_emit() is an exported
      interface.  This means that the value of logbuf_cpu must be read
      from memory the first time it is used in a particular call of
      vprintk_emit().  The variable's value is read only once in that
      function; when it's read, it'll be the copy from memory (or cache).
      
      In addition, the value of "logbuf_cpu" is only ever written under
      protection of a spinlock.  So the value that is read is the "real"
      value (and not an out-of-date cached one).  If its value is not
      UINT_MAX, it is the current CPU's processor id, and it will have
      been last written by the running CPU.
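
      The single-read pattern being defended reduces to something like
      this self-contained illustration (shape only; the recursion check
      is simplified):

      #include <limits.h>
      #include <stdio.h>

      static unsigned int logbuf_cpu = UINT_MAX;	/* no volatile needed */

      /* The value is read exactly once per call, so the compiler cannot
       * reuse a stale copy across separate calls into this externally
       * visible function. */
      void emit_like(unsigned int this_cpu)
      {
      	if (logbuf_cpu == this_cpu)
      		printf("printk recursion detected\n");
      }

      int main(void)
      {
      	emit_like(0U);
      	return 0;
      }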
      Signed-off-by: Alex Elder <elder@linaro.org>
      Reported-by: Pranith Kumar <bobby.prani@gmail.com>
      Suggested-by: Steven Rostedt <rostedt@goodmis.org>
      Reviewed-by: Jan Kara <jack@suse.cz>
      Cc: Petr Mladek <pmladek@suse.cz>
      Cc: Luis R. Rodriguez <mcgrof@suse.com>
      Cc: Joe Perches <joe@perches.com>
      Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
      Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>