1. 24 6月, 2020 1 次提交
  2. 01 12月, 2019 1 次提交
  3. 06 4月, 2019 1 次提交
  4. 10 9月, 2018 1 次提交
  5. 25 7月, 2018 1 次提交
  6. 13 6月, 2018 1 次提交
    • K
      treewide: kmalloc() -> kmalloc_array() · 6da2ec56
      Kees Cook 提交于
      The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
      patch replaces cases of:
      
              kmalloc(a * b, gfp)
      
      with:
              kmalloc_array(a, b, gfp)
      
      as well as handling cases of:
      
              kmalloc(a * b * c, gfp)
      
      with:
      
              kmalloc(array3_size(a, b, c), gfp)
      
      as it's slightly less ugly than:
      
              kmalloc_array(array_size(a, b), c, gfp)
      
      This does, however, attempt to ignore constant size factors like:
      
              kmalloc(4 * 1024, gfp)
      
      though any constants defined via macros get caught up in the conversion.
      
      Any factors with a sizeof() of "unsigned char", "char", and "u8" were
      dropped, since they're redundant.
      
      The tools/ directory was manually excluded, since it has its own
      implementation of kmalloc().
      
      The Coccinelle script used for this was:
      
      // Fix redundant parens around sizeof().
      @@
      type TYPE;
      expression THING, E;
      @@
      
      (
        kmalloc(
      -	(sizeof(TYPE)) * E
      +	sizeof(TYPE) * E
        , ...)
      |
        kmalloc(
      -	(sizeof(THING)) * E
      +	sizeof(THING) * E
        , ...)
      )
      
      // Drop single-byte sizes and redundant parens.
      @@
      expression COUNT;
      typedef u8;
      typedef __u8;
      @@
      
      (
        kmalloc(
      -	sizeof(u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(__u8) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(char) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(unsigned char) * (COUNT)
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(u8) * COUNT
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(__u8) * COUNT
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(char) * COUNT
      +	COUNT
        , ...)
      |
        kmalloc(
      -	sizeof(unsigned char) * COUNT
      +	COUNT
        , ...)
      )
      
      // 2-factor product with sizeof(type/expression) and identifier or constant.
      @@
      type TYPE;
      expression THING;
      identifier COUNT_ID;
      constant COUNT_CONST;
      @@
      
      (
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * (COUNT_ID)
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * COUNT_ID
      +	COUNT_ID, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * COUNT_CONST
      +	COUNT_CONST, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * (COUNT_ID)
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * COUNT_ID
      +	COUNT_ID, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * (COUNT_CONST)
      +	COUNT_CONST, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * COUNT_CONST
      +	COUNT_CONST, sizeof(THING)
        , ...)
      )
      
      // 2-factor product, only identifiers.
      @@
      identifier SIZE, COUNT;
      @@
      
      - kmalloc
      + kmalloc_array
        (
      -	SIZE * COUNT
      +	COUNT, SIZE
        , ...)
      
      // 3-factor product with 1 sizeof(type) or sizeof(expression), with
      // redundant parens removed.
      @@
      expression THING;
      identifier STRIDE, COUNT;
      type TYPE;
      @@
      
      (
        kmalloc(
      -	sizeof(TYPE) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(TYPE))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * (COUNT) * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * (COUNT) * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * COUNT * (STRIDE)
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      |
        kmalloc(
      -	sizeof(THING) * COUNT * STRIDE
      +	array3_size(COUNT, STRIDE, sizeof(THING))
        , ...)
      )
      
      // 3-factor product with 2 sizeof(variable), with redundant parens removed.
      @@
      expression THING1, THING2;
      identifier COUNT;
      type TYPE1, TYPE2;
      @@
      
      (
        kmalloc(
      -	sizeof(TYPE1) * sizeof(TYPE2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
        , ...)
      |
        kmalloc(
      -	sizeof(THING1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kmalloc(
      -	sizeof(THING1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(THING1), sizeof(THING2))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * COUNT
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      |
        kmalloc(
      -	sizeof(TYPE1) * sizeof(THING2) * (COUNT)
      +	array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
        , ...)
      )
      
      // 3-factor product, only identifiers, with redundant parens removed.
      @@
      identifier STRIDE, SIZE, COUNT;
      @@
      
      (
        kmalloc(
      -	(COUNT) * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	(COUNT) * (STRIDE) * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	(COUNT) * STRIDE * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	(COUNT) * (STRIDE) * (SIZE)
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      |
        kmalloc(
      -	COUNT * STRIDE * SIZE
      +	array3_size(COUNT, STRIDE, SIZE)
        , ...)
      )
      
      // Any remaining multi-factor products, first at least 3-factor products,
      // when they're not all constants...
      @@
      expression E1, E2, E3;
      constant C1, C2, C3;
      @@
      
      (
        kmalloc(C1 * C2 * C3, ...)
      |
        kmalloc(
      -	(E1) * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kmalloc(
      -	(E1) * (E2) * E3
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kmalloc(
      -	(E1) * (E2) * (E3)
      +	array3_size(E1, E2, E3)
        , ...)
      |
        kmalloc(
      -	E1 * E2 * E3
      +	array3_size(E1, E2, E3)
        , ...)
      )
      
      // And then all remaining 2 factors products when they're not all constants,
      // keeping sizeof() as the second factor argument.
      @@
      expression THING, E1, E2;
      type TYPE;
      constant C1, C2, C3;
      @@
      
      (
        kmalloc(sizeof(THING) * C2, ...)
      |
        kmalloc(sizeof(TYPE) * C2, ...)
      |
        kmalloc(C1 * C2 * C3, ...)
      |
        kmalloc(C1 * C2, ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * (E2)
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(TYPE) * E2
      +	E2, sizeof(TYPE)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * (E2)
      +	E2, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	sizeof(THING) * E2
      +	E2, sizeof(THING)
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	(E1) * E2
      +	E1, E2
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	(E1) * (E2)
      +	E1, E2
        , ...)
      |
      - kmalloc
      + kmalloc_array
        (
      -	E1 * E2
      +	E1, E2
        , ...)
      )
      Signed-off-by: Kees Cook <keescook@chromium.org>
      6da2ec56
  7. 25 5月, 2018 1 次提交
    • J
      sched/topology: Clarify root domain(s) debug string · bf5015a5
      Juri Lelli 提交于
      When scheduler debug is enabled, building scheduling domains outputs
      information about how the domains are laid out and to which root domain
      each CPU (or sets of CPUs) belongs, e.g.:
      
       CPU0 attaching sched-domain(s):
        domain-0: span=0-5 level=MC
         groups: 0:{ span=0 }, 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }
       CPU1 attaching sched-domain(s):
        domain-0: span=0-5 level=MC
         groups: 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }, 0:{ span=0 }
      
       [...]
      
       span: 0-5 (max cpu_capacity = 1024)
      
      The fact that the last line refers to the root domain of CPUs 0-5 is, however,
      not immediately obvious to me: one might wonder why span 0-5 is reported "again".
      
      Make it clearer by adding "root domain" to it, so as to end up with the
      following:
      
       CPU0 attaching sched-domain(s):
        domain-0: span=0-5 level=MC
         groups: 0:{ span=0 }, 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }
       CPU1 attaching sched-domain(s):
        domain-0: span=0-5 level=MC
         groups: 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }, 0:{ span=0 }
      
       [...]
      
       root domain span: 0-5 (max cpu_capacity = 1024)
      Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Patrick Bellasi <patrick.bellasi@arm.com>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Link: http://lkml.kernel.org/r/20180524152936.17611-1-juri.lelli@redhat.com
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      bf5015a5
  8. 04 3月, 2018 1 次提交
    • I
      sched/headers: Simplify and clean up header usage in the scheduler · 325ea10c
      Ingo Molnar 提交于
      Do the following cleanups and simplifications:
      
       - sched/sched.h already includes <asm/paravirt.h>, so no need to
         include it in sched/core.c again.
      
       - order the <linux/sched/*.h> headers alphabetically
      
       - add all <linux/sched/*.h> headers to kernel/sched/sched.h
      
       - remove all unnecessary includes from the .c files that
         are already included in kernel/sched/sched.h.
      
      Finally, make all scheduler .c files use a single common header:
      
        #include "sched.h"
      
      ... which now contains a union of the relied upon headers.
      
      This makes the various .c files easier to read and easier to handle.
      
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      325ea10c
  9. 03 3月, 2018 1 次提交
    • I
      sched: Clean up and harmonize the coding style of the scheduler code base · 97fb7a0a
      Ingo Molnar 提交于
      A good number of small style inconsistencies have accumulated
      in the scheduler core, so do a pass over them to harmonize
      all these details:
      
       - fix speling in comments,
      
       - use curly braces for multi-line statements,
      
       - remove unnecessary parentheses from integer literals,
      
       - capitalize consistently,
      
       - remove stray newlines,
      
       - add comments where necessary,
      
       - remove invalid/unnecessary comments,
      
       - align structure definitions and other data types vertically,
      
       - add missing newlines for increased readability,
      
       - fix vertical tabulation where it's misaligned,
      
       - harmonize preprocessor conditional block labeling
         and vertical alignment,
      
       - remove line-breaks where they uglify the code,
      
       - add newline after local variable definitions,
      
      No change in functionality:
      
        md5:
           1191fa0a890cfa8132156d2959d7e9e2  built-in.o.before.asm
           1191fa0a890cfa8132156d2959d7e9e2  built-in.o.after.asm
      
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      97fb7a0a
  10. 06 2月, 2018 1 次提交
  11. 02 11月, 2017 1 次提交
    • G
      License cleanup: add SPDX GPL-2.0 license identifier to files with no license · b2441318
      Greg Kroah-Hartman 提交于
      Many source files in the tree are missing licensing information, which
      makes it harder for compliance tools to determine the correct license.
      
      By default all files without license information are under the default
      license of the kernel, which is GPL version 2.
      
      Update the files which contain no license information with the 'GPL-2.0'
      SPDX license identifier.  The SPDX identifier is a legally binding
      shorthand, which can be used instead of the full boiler plate text.
      
      This patch is based on work done by Thomas Gleixner and Kate Stewart and
      Philippe Ombredanne.
      
      How this work was done:
      
      Patches were generated and checked against linux-4.14-rc6 for a subset of
      the use cases:
       - file had no licensing information in it,
       - file was a */uapi/* one with no licensing information in it,
       - file was a */uapi/* one with existing licensing information,
      
      Further patches will be generated in subsequent months to fix up cases
      where non-standard license headers were used, and references to license
      had to be inferred by heuristics based on keywords.
      
      The analysis to determine which SPDX License Identifier to be applied to
      a file was done in a spreadsheet of side by side results from the
      output of two independent scanners (ScanCode & Windriver) producing SPDX
      tag:value files created by Philippe Ombredanne.  Philippe prepared the
      base worksheet, and did an initial spot review of a few 1000 files.
      
      The 4.13 kernel was the starting point of the analysis with 60,537 files
      assessed.  Kate Stewart did a file by file comparison of the scanner
      results in the spreadsheet to determine which SPDX license identifier(s)
      to be applied to the file. She confirmed any determination that was not
      immediately clear with lawyers working with the Linux Foundation.
      
      Criteria used to select files for SPDX license identifier tagging was:
       - Files considered eligible had to be source code files.
       - Make and config files were included as candidates if they contained >5
         lines of source
       - File already had some variant of a license header in it (even if <5
         lines).
      
      All documentation files were explicitly excluded.
      
      The following heuristics were used to determine which SPDX license
      identifiers to apply.
      
       - when both scanners couldn't find any license traces, file was
         considered to have no license information in it, and the top level
         COPYING file license applied.
      
         For non */uapi/* files that summary was:
      
         SPDX license identifier                            # files
         ---------------------------------------------------|-------
         GPL-2.0                                              11139
      
         and resulted in the first patch in this series.
      
         If that file was a */uapi/* path one, it was "GPL-2.0 WITH
         Linux-syscall-note" otherwise it was "GPL-2.0".  Results of that was:
      
         SPDX license identifier                            # files
         ---------------------------------------------------|-------
         GPL-2.0 WITH Linux-syscall-note                        930
      
         and resulted in the second patch in this series.
      
       - if a file had some form of licensing information in it, and was one
         of the */uapi/* ones, it was denoted with the Linux-syscall-note if
         any GPL family license was found in the file or had no licensing in
         it (per prior point).  Results summary:
      
         SPDX license identifier                            # files
         ---------------------------------------------------|------
         GPL-2.0 WITH Linux-syscall-note                       270
         GPL-2.0+ WITH Linux-syscall-note                      169
         ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause)    21
         ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)    17
         LGPL-2.1+ WITH Linux-syscall-note                      15
         GPL-1.0+ WITH Linux-syscall-note                       14
         ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause)    5
         LGPL-2.0+ WITH Linux-syscall-note                       4
         LGPL-2.1 WITH Linux-syscall-note                        3
         ((GPL-2.0 WITH Linux-syscall-note) OR MIT)              3
         ((GPL-2.0 WITH Linux-syscall-note) AND MIT)             1
      
         and that resulted in the third patch in this series.
      
       - when the two scanners agreed on the detected license(s), that became
         the concluded license(s).
      
       - when there was disagreement between the two scanners (one detected a
         license but the other didn't, or they both detected different
         licenses) a manual inspection of the file occurred.
      
       - In most cases a manual inspection of the information in the file
         resulted in a clear resolution of the license that should apply (and
         which scanner probably needed to revisit its heuristics).
      
       - When it was not immediately clear, the license identifier was
         confirmed with lawyers working with the Linux Foundation.
      
       - If there was any question as to the appropriate license identifier,
         the file was flagged for further research and to be revisited later
         in time.
      
      In total, over 70 hours of logged manual review was done on the
      spreadsheet to determine the SPDX license identifiers to apply to the
      source files by Kate, Philippe, Thomas and, in some cases, confirmation
      by lawyers working with the Linux Foundation.
      
      Kate also obtained a third independent scan of the 4.13 code base from
      FOSSology, and compared selected files where the other two scanners
      disagreed against that SPDX file, to see if there were new insights.  The
      Windriver scanner is based on an older version of FOSSology in part, so
      they are related.
      
      Thomas did random spot checks in about 500 files from the spreadsheets
      for the uapi headers and agreed with SPDX license identifier in the
      files he inspected. For the non-uapi files Thomas did random spot checks
      in about 15000 files.
      
      In the initial set of patches against 4.14-rc6, 3 files were found to have
      copy/paste license identifier errors, and have been fixed to reflect the
      correct identifier.
      
      Additionally Philippe spent 10 hours this week doing a detailed manual
      inspection and review of the 12,461 patched files from the initial patch
      version early this week with:
       - a full scancode scan run, collecting the matched texts, detected
         license ids and scores
       - reviewing anything where there was a license detected (about 500+
         files) to ensure that the applied SPDX license was correct
       - reviewing anything where there was no detection but the patch license
         was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied
         SPDX license was correct
      
      This produced a worksheet with 20 files needing minor correction.  This
      worksheet was then exported into 3 different .csv files for the
      different types of files to be modified.
      
      These .csv files were then reviewed by Greg.  Thomas wrote a script to
      parse the csv files and add the proper SPDX tag to the file, in the
      format that the file expected.  This script was further refined by Greg
      based on the output to detect more types of files automatically and to
      distinguish between header and source .c files (which need different
      comment types.)  Finally Greg ran the script using the .csv files to
      generate the patches.
      Reviewed-by: Kate Stewart <kstewart@linuxfoundation.org>
      Reviewed-by: Philippe Ombredanne <pombredanne@nexb.com>
      Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      b2441318
  12. 27 10月, 2017 1 次提交
  13. 24 10月, 2017 1 次提交
    • R
      sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK · e22cdc3f
      Rakib Mullick 提交于
      cpulist_parse() uses nr_cpumask_bits as a limit to parse the
      passed buffer from kernel commandline. What nr_cpumask_bits
      represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option:
      
       - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as
         NR_CPUS, which might not represent the # of CPUs that really exist
         (default 64). So, there's a chance of a gap between nr_cpu_ids
         and NR_CPUS, which ultimately leads to invalid cpulist_parse()
         operation. For example, if isolcpus=9 is passed on an 8 cpu
         system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error
         that it's supposed to.
      
      This patch fixes this bug by finding the last CPU of the passed
      isolcpus= list and checking it against nr_cpu_ids.
      
      It also fixes the error message where the nr_cpu_ids should be
      nr_cpu_ids-1, since CPU numbering starts from 0.
      Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: adobriyan@gmail.com
      Cc: akpm@linux-foundation.org
      Cc: longman@redhat.com
      Cc: mka@chromium.org
      Cc: tj@kernel.org
      Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mullick@gmail.com
      [ Enhanced the changelog and the kernel message. ]
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      
       include/linux/cpumask.h |   16 ++++++++++++++++
       kernel/sched/topology.c |    4 ++--
       2 files changed, 18 insertions(+), 2 deletions(-)
      e22cdc3f
  14. 10 10月, 2017 3 次提交
    • S
      sched/rt: Simplify the IPI based RT balancing logic · 4bdced5c
      Steven Rostedt (Red Hat) 提交于
      When a CPU lowers its priority (schedules out a high priority task for a
      lower priority one), a check is made to see if any other CPU has overloaded
      RT tasks (more than one). It checks the rto_mask to determine this and if so
      it will request to pull one of those tasks to itself if the non running RT
      task is of higher priority than the new priority of the next task to run on
      the current CPU.
      
      When we deal with large number of CPUs, the original pull logic suffered
      from large lock contention on a single CPU run queue, which caused a huge
      latency across all CPUs. This was caused by only having one CPU having
      overloaded RT tasks and a bunch of other CPUs lowering their priority. To
      solve this issue, commit:
      
        b6366f04 ("sched/rt: Use IPI to trigger RT task push migration instead of pulling")
      
      changed the way to request a pull. Instead of grabbing the lock of the
      overloaded CPU's runqueue, it simply sent an IPI to that CPU to do the work.
      
      Although the IPI logic worked very well in removing the large latency build
      up, it still could suffer from a large number of IPIs being sent to a single
      CPU. On a 80 CPU box, I measured over 200us of processing IPIs. Worse yet,
      when I tested this on a 120 CPU box, with a stress test that had lots of
      RT tasks scheduling on all CPUs, it actually triggered the hard lockup
      detector! One CPU had so many IPIs sent to it, and due to the restart
      mechanism that is triggered when the source run queue has a priority status
      change, the CPU spent minutes! processing the IPIs.
      
      Thinking about this further, I realized there's no reason for each run queue
      to send its own IPI. As all CPUs with overloaded tasks must be scanned
      regardless if there's one or many CPUs lowering their priority, because
      there's no current way to find the CPU with the highest priority task that
      can schedule to one of these CPUs, there really only needs to be one IPI
      being sent around at a time.
      
      This greatly simplifies the code!
      
      The new approach is to have each root domain have its own irq work, as the
      rto_mask is per root domain. The root domain has the following fields
      attached to it:
      
        rto_push_work	 - the irq work to process each CPU set in rto_mask
        rto_lock	 - the lock to protect some of the other rto fields
        rto_loop_start - an atomic that keeps contention down on rto_lock
      		    the first CPU scheduling in a lower priority task
      		    is the one to kick off the process.
        rto_loop_next	 - an atomic that gets incremented for each CPU that
      		    schedules in a lower priority task.
        rto_loop	 - a variable protected by rto_lock that is used to
      		    compare against rto_loop_next
        rto_cpu	 - The cpu to send the next IPI to, also protected by
      		    the rto_lock.
      
      When a CPU schedules in a lower priority task and wants to make sure
      overloaded CPUs know about it, it increments rto_loop_next. Then it
      atomically sets rto_loop_start with a cmpxchg. If the old value is not "0",
      then it is done, as another CPU is kicking off the IPI loop. If the old
      value is "0", then it will take the rto_lock to synchronize with a possible
      IPI being sent around to the overloaded CPUs.
      
      If rto_cpu is greater than or equal to nr_cpu_ids, then there's either no
      IPI being sent around, or one is about to finish. Then rto_cpu is set to the
      first CPU in rto_mask and an IPI is sent to that CPU. If there's no CPUs set
      in rto_mask, then there's nothing to be done.
      
      When the CPU receives the IPI, it will first try to push any RT task that is
      queued on the CPU but can't run because a higher priority RT task is
      currently running on that CPU.
      
      Then it takes the rto_lock and looks for the next CPU in the rto_mask. If it
      finds one, it simply sends an IPI to that CPU and the process continues.
      
      If there's no more CPUs in the rto_mask, then rto_loop is compared with
      rto_loop_next. If they match, everything is done and the process is over. If
      they do not match, then a CPU scheduled in a lower priority task as the IPI
      was being passed around, and the process needs to start again. The first CPU
      in rto_mask is sent the IPI.
      
      This change removes this duplication of work in the IPI logic, and greatly
      lowers the latency caused by the IPIs. This removed the lockup happening on
      the 120 CPU machine. It also simplifies the code tremendously. What else
      could anyone ask for?
      
      Thanks to Peter Zijlstra for simplifying the rto_loop_start atomic logic and
      supplying me with the rto_start_trylock() and rto_start_unlock() helper
      functions.
      Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Clark Williams <williams@redhat.com>
      Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
      Cc: John Kacur <jkacur@redhat.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Scott Wood <swood@redhat.com>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Link: http://lkml.kernel.org/r/20170424114732.1aac6dc4@gandalf.local.home
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      4bdced5c
    • S
      sched/topology: Introduce NUMA identity node sched domain · 051f3ca0
      Suravee Suthikulpanit 提交于
      On an AMD Family17h-based (EPYC) system, a logical NUMA node can contain
      up to 8 cores (16 threads) with the following topology.
      
                   ----------------------------
               C0  | T0 T1 |    ||    | T0 T1 | C4
                   --------|    ||    |--------
               C1  | T0 T1 | L3 || L3 | T0 T1 | C5
                   --------|    ||    |--------
               C2  | T0 T1 | #0 || #1 | T0 T1 | C6
                   --------|    ||    |--------
               C3  | T0 T1 |    ||    | T0 T1 | C7
                   ----------------------------
      
      Here, there are 2 last-level (L3) caches per logical NUMA node.
      A socket can contain up to 4 NUMA nodes, and a system can support
      up to 2 sockets. With a full system configuration, the current scheduler
      creates 4 sched domains:
      
        domain0 SMT       (span a core)
        domain1 MC        (span a last-level-cache)
        domain2 NUMA      (span a socket: 4 nodes)
        domain3 NUMA      (span a system: 8 nodes)
      
      Note that there is no domain to represent cpus spanning a logical
      NUMA node.  With this hierarchy of sched domains, the scheduler does
      not balance properly in the following cases:
      
      Case1:
      
       When running 8 tasks, a properly balanced system should
       schedule a task per logical NUMA node. This is not the case for
       the current scheduler.
      
      Case2:
      
       In some cases, threads are scheduled on the same cpu, while other
       cpus are idle. This results in run-to-run inconsistency. For example:
      
        taskset -c 0-7 sysbench --num-threads=8 --test=cpu \
                                --cpu-max-prime=100000 run
      
      Total execution time ranges from 25.1s to 33.5s depending on threads
      placement, where 25.1s is when all 8 threads are balanced properly
      on 8 cpus.
      
      Introduce the NUMA identity node sched domain, which is based on how the
      SRAT/SLIT tables define a logical NUMA node. This results in the following
      hierarchy of sched domains on the same system described above.
      
        domain0 SMT       (span a core)
        domain1 MC        (span a last-level-cache)
        domain2 NODE      (span a logical NUMA node)
        domain3 NUMA      (span a socket: 4 nodes)
        domain4 NUMA      (span a system: 8 nodes)
      
      This fixes the improper load balancing cases mentioned above.
      Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: bp@suse.de
      Link: http://lkml.kernel.org/r/1504768805-46716-1-git-send-email-suravee.suthikulpanit@amd.com
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      051f3ca0
    • P
      sched/topology: Restore SD_PREFER_SIBLING on MC domains · ed4ad1ca
      Peter Zijlstra 提交于
      The normal x86_topology on NHM+ machines degenerates because the MC
      and CPU domains are of the same size, therefore MC inherits
      SD_PREFER_SIBLING from CPU (which then gets taken out). The result is
      that we'll spread tasks across the first NUMA level in order to
      maximize cache utilization.
      
      However, for the x86_numa_in_package_topology we lose the CPU domain,
      and we'll not have SD_PREFER_SIBLING set anywhere, giving a distinct
      difference in behaviour.
      
      Commit:
      
        8e7fbcbc ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs")
      
      made a fail by not preserving the SD_PREFER_SIBLING for the !power_saving
      case on both CPU and MC.
      
      Then commit:
      
        6956dc56 ("sched/numa: Add SD_PERFER_SIBLING to CPU domain")
      
      adds it back to the CPU but not MC.
      
      Restore that now, such that we get consistent spreading behaviour wrt
      L3 and NUMA.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      ed4ad1ca
  15. 12 9月, 2017 1 次提交
  16. 09 9月, 2017 1 次提交
  17. 25 8月, 2017 4 次提交
    • P
      sched/debug: Optimize sched_domain sysctl generation · bbdacdfe
      Peter Zijlstra 提交于
      Currently we unconditionally destroy all sysctl bits and regenerate
      them after we've rebuilt the domains (even if that rebuild is a
      no-op).
      
      And since we unconditionally (re)build the sysctl for all possible
      CPUs, onlining all CPUs gets us O(n^2) time. Instead change this to
      only rebuild the bits for CPUs we've actually installed new domains
      on.
      Reported-by: NOfer Levi(SW) <oferle@mellanox.com>
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      bbdacdfe
    • P
      sched/topology: Avoid pointless rebuild · 09e0dd8e
      Peter Zijlstra 提交于
      Fix partition_sched_domains() to try and preserve the existing machine
      wide domain instead of unconditionally destroying it. We do this by
      attempting to allocate the new single domain; only when that fails do
      we reuse the fallback_doms.
      
      When using fallback_doms we need to first destroy and then recreate
      because both the old and new could be backed by it.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Ofer Levi(SW) <oferle@mellanox.com>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Tejun Heo <tj@kernel.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: Vineet.Gupta1@synopsys.com <Vineet.Gupta1@synopsys.com>
      Cc: rusty@rustcorp.com.au <rusty@rustcorp.com.au>
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      09e0dd8e
    • P
      sched/topology: Improve comments · a090c4f2
      Peter Zijlstra 提交于
      Mike provided a better comment for destroy_sched_domain() ...
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      a090c4f2
    • S
      sched/topology: Fix memory leak in __sdt_alloc() · 213c5a45
      Shu Wang 提交于
      Found this issue by kmemleak: the 'sg' and 'sgc' pointers from
      __sdt_alloc() might be leaked, as each domain holds references to many
      groups, but destroy_sched_domain() only drops the first group's reference.
      
      Onlining and offlining a CPU can trigger this leak, and cause OOM.
      
      Reproducer for my 6 CPUs machine:
      
        while true
        do
            echo 0 > /sys/devices/system/cpu/cpu5/online;
            echo 1 > /sys/devices/system/cpu/cpu5/online;
        done
      
        unreferenced object 0xffff88007d772a80 (size 64):
          comm "cpuhp/5", pid 39, jiffies 4294719962 (age 35.251s)
          hex dump (first 32 bytes):
            c0 22 77 7d 00 88 ff ff 02 00 00 00 01 00 00 00  ."w}............
            40 2a 77 7d 00 88 ff ff 00 00 00 00 00 00 00 00  @*w}............
          backtrace:
            [<ffffffff8176525a>] kmemleak_alloc+0x4a/0xa0
            [<ffffffff8121efe1>] __kmalloc_node+0xf1/0x280
            [<ffffffff810d94a8>] build_sched_domains+0x1e8/0xf20
            [<ffffffff810da674>] partition_sched_domains+0x304/0x360
            [<ffffffff81139557>] cpuset_update_active_cpus+0x17/0x40
            [<ffffffff810bdb2e>] sched_cpu_activate+0xae/0xc0
            [<ffffffff810900e0>] cpuhp_invoke_callback+0x90/0x400
            [<ffffffff81090597>] cpuhp_up_callbacks+0x37/0xb0
            [<ffffffff81090887>] cpuhp_thread_fun+0xd7/0xf0
            [<ffffffff810b37e0>] smpboot_thread_fn+0x110/0x160
            [<ffffffff810af5d9>] kthread+0x109/0x140
            [<ffffffff81770e45>] ret_from_fork+0x25/0x30
            [<ffffffffffffffff>] 0xffffffffffffffff
      
        unreferenced object 0xffff88007d772a40 (size 64):
          comm "cpuhp/5", pid 39, jiffies 4294719962 (age 35.251s)
          hex dump (first 32 bytes):
            03 00 00 00 00 00 00 00 00 04 00 00 00 00 00 00  ................
            00 04 00 00 00 00 00 00 4f 3c fc ff 00 00 00 00  ........O<......
          backtrace:
            [<ffffffff8176525a>] kmemleak_alloc+0x4a/0xa0
            [<ffffffff8121efe1>] __kmalloc_node+0xf1/0x280
            [<ffffffff810da16d>] build_sched_domains+0xead/0xf20
            [<ffffffff810da674>] partition_sched_domains+0x304/0x360
            [<ffffffff81139557>] cpuset_update_active_cpus+0x17/0x40
            [<ffffffff810bdb2e>] sched_cpu_activate+0xae/0xc0
            [<ffffffff810900e0>] cpuhp_invoke_callback+0x90/0x400
            [<ffffffff81090597>] cpuhp_up_callbacks+0x37/0xb0
            [<ffffffff81090887>] cpuhp_thread_fun+0xd7/0xf0
            [<ffffffff810b37e0>] smpboot_thread_fn+0x110/0x160
            [<ffffffff810af5d9>] kthread+0x109/0x140
            [<ffffffff81770e45>] ret_from_fork+0x25/0x30
            [<ffffffffffffffff>] 0xffffffffffffffff
      Reported-by: Chunyu Hu <chuhu@redhat.com>
      Signed-off-by: Shu Wang <shuwang@redhat.com>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Acked-by: Chunyu Hu <chuhu@redhat.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: liwang@redhat.com
      Link: http://lkml.kernel.org/r/1502351536-9108-1-git-send-email-shuwang@redhat.com
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      213c5a45
  18. 10 8月, 2017 2 次提交
  19. 15 5月, 2017 16 次提交
    • P
      sched/topology: Rename sched_group_cpus() · ae4df9d6
      Peter Zijlstra 提交于
      There's a discrepancy in naming between the sched_domain and
      sched_group cpumask accessor. Since we're doing changes, fix it.
      
        $ git grep sched_group_cpus | wc -l
        28
        $ git grep sched_domain_span | wc -l
        38
      
      Suggests changing sched_group_cpus() into sched_group_span():
      
        for i  in `git grep -l sched_group_cpus`
        do
          sed -ie 's/sched_group_cpus/sched_group_span/g' $i
        done
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      ae4df9d6
    • P
      sched/topology: Rename sched_group_mask() · e5c14b1f
      Peter Zijlstra 提交于
      Since sched_group_mask() is now an independent cpumask (it no longer
      masks sched_group_cpus()), rename the thing.
      Suggested-by: NLauro Ramos Venancio <lvenanci@redhat.com>
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      e5c14b1f
    • P
      sched/topology: Simplify sched_group_mask() usage · af218122
      Peter Zijlstra 提交于
      While writing the comments, it occurred to me that:
      
        sg_cpus & sg_mask == sg_mask
      
      at least conceptually; the !overlap case sets the all 1s mask. If we
      correct that we can simplify things and directly use sg_mask.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      af218122
    • P
      sched/topology: Rewrite get_group() · 0c0e776a
      Peter Zijlstra 提交于
      We want to attain:
      
        sg_cpus() & sg_mask() == sg_mask()
      
      for this to be so we must initialize sg_mask() to sg_cpus() for the
      !overlap case (its currently cpumask_setall()).
      
      Since the code makes my head hurt bad, rewrite it into a simpler form,
      inspired by the now fixed overlap code.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      0c0e776a
    • P
      sched/topology: Add a few comments · 35a566e6
      Peter Zijlstra 提交于
      Try and describe what this code is about..
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      35a566e6
    • P
      sched/topology: Fix overlapping sched_group_capacity · 1676330e
      Peter Zijlstra 提交于
      When building the overlapping groups we need to attach a consistent
      sched_group_capacity structure. That is, all 'identical' sched_group's
      should have the _same_ sched_group_capacity.
      
      This can (once again) be demonstrated with a topology like:
      
        node   0   1   2   3
          0:  10  20  30  20
          1:  20  10  20  30
          2:  30  20  10  20
          3:  20  30  20  10
      
      But we need at least 2 CPUs per node for this to show up, after all,
      if there is only one CPU per node, our CPU @i is per definition a
      unique CPU that reaches this domain (aka balance-cpu).
      
      Given the above NUMA topo and 2 CPUs per node:
      
        [] CPU0 attaching sched-domain(s):
        []  domain-0: span=0,4 level=DIE
        []   groups: 0:{ span=0 }, 4:{ span=4 }
        []   domain-1: span=0-1,3-5,7 level=NUMA
        []    groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 }
        []    domain-2: span=0-7 level=NUMA
        []     groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 }
        [] CPU1 attaching sched-domain(s):
        []  domain-0: span=1,5 level=DIE
        []   groups: 1:{ span=1 }, 5:{ span=5 }
        []   domain-1: span=0-2,4-6 level=NUMA
        []    groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 4:{ span=0,4 mask=0,4 cap=2048 }
        []    domain-2: span=0-7 level=NUMA
        []     groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 }
      
      Observe how CPU0-domain1-group0 and CPU1-domain1-group4 are the
      'same' but have a different id (0 vs 4).
      
      To fix this, use the group balance CPU to select the SGC. This means
      we have to compute the full mask for each CPU and require a second
      temporary mask to store the group mask in (it otherwise lives in the
      SGC).
      
      The fixed topology looks like:
      
        [] CPU0 attaching sched-domain(s):
        []  domain-0: span=0,4 level=DIE
        []   groups: 0:{ span=0 }, 4:{ span=4 }
        []   domain-1: span=0-1,3-5,7 level=NUMA
        []    groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 }
        []    domain-2: span=0-7 level=NUMA
        []     groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 }
        [] CPU1 attaching sched-domain(s):
        []  domain-0: span=1,5 level=DIE
        []   groups: 1:{ span=1 }, 5:{ span=5 }
        []   domain-1: span=0-2,4-6 level=NUMA
        []    groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 0:{ span=0,4 mask=0,4 cap=2048 }
        []    domain-2: span=0-7 level=NUMA
        []     groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 }
      Debugged-by: NLauro Ramos Venancio <lvenanci@redhat.com>
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Fixes: e3589f6c ("sched: Allow for overlapping sched_domain spans")
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      1676330e
    • P
      sched/topology: Add sched_group_capacity debugging · 005f874d
      Peter Zijlstra 提交于
      Add sgc::id to easier spot domain construction issues.
      
      Take the opportunity to slightly rework the group printing, because
      adding more "(id: %d)" strings makes the entire thing very hard to
      read. Also the individual groups are very hard to separate, so add
      explicit visual grouping, which allows replacing all the "(%s: %d)"
      format things with shorter "%s=%d" variants.
      
      Then fix up some inconsistencies in surrounding prints for domains.
      
      The end result looks like:
      
        [] CPU0 attaching sched-domain(s):
        []  domain-0: span=0,4 level=DIE
        []   groups: 0:{ span=0 }, 4:{ span=4 }
        []   domain-1: span=0-1,3-5,7 level=NUMA
        []    groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 }
        []    domain-2: span=0-7 level=NUMA
        []     groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 }
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      005f874d
    • P
      sched/topology: Small cleanup · 8d5dc512
      Peter Zijlstra 提交于
      Move the allocation of topology specific cpumasks into the topology
      code.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      8d5dc512
    • P
      sched/topology: Fix overlapping sched_group_mask · 73bb059f
      Peter Zijlstra 提交于
      The point of sched_group_mask is to select those CPUs from
      sched_group_cpus that can actually arrive at this balance domain.
      
      The current code gets it wrong, as can be readily demonstrated with a
      topology like:
      
        node   0   1   2   3
          0:  10  20  30  20
          1:  20  10  20  30
          2:  30  20  10  20
          3:  20  30  20  10
      
      Where (for example) domain 1 on CPU1 ends up with a mask that includes
      CPU0:
      
        [] CPU1 attaching sched-domain:
        []  domain 0: span 0-2 level NUMA
        []   groups: 1 (mask: 1), 2, 0
        []   domain 1: span 0-3 level NUMA
        []    groups: 0-2 (mask: 0-2) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072)
      
      This causes sched_balance_cpu() to compute the wrong CPU and
      consequently should_we_balance() will terminate early resulting in
      missed load-balance opportunities.
      
      The fixed topology looks like:
      
        [] CPU1 attaching sched-domain:
        []  domain 0: span 0-2 level NUMA
        []   groups: 1 (mask: 1), 2, 0
        []   domain 1: span 0-3 level NUMA
        []    groups: 0-2 (mask: 1) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072)
      
      (note: this relies on OVERLAP domains to always have children, this is
       true because the regular topology domains are still here -- this is
       before degenerate trimming)
      Debugged-by: NLauro Ramos Venancio <lvenanci@redhat.com>
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Cc: stable@vger.kernel.org
      Fixes: e3589f6c ("sched: Allow for overlapping sched_domain spans")
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      73bb059f
    • P
      sched/topology: Remove FORCE_SD_OVERLAP · af85596c
      Peter Zijlstra 提交于
      It's an obsolete debug mechanism and future code wants to rely on
      properties this undermines.
      
      Namely, it would be good to assume that SD_OVERLAP domains have
      children, but if we build the entire hierarchy with SD_OVERLAP this is
      obviously false.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      af85596c
    • L
      sched/topology: Move comment about asymmetric node setups · c20e1ea4
      Lauro Ramos Venancio 提交于
      Signed-off-by: Lauro Ramos Venancio <lvenanci@redhat.com>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: lwang@redhat.com
      Cc: riel@redhat.com
      Link: http://lkml.kernel.org/r/1492717903-5195-4-git-send-email-lvenanci@redhat.com
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      c20e1ea4
    • L
      sched/topology: Optimize build_group_mask() · f32d782e
      Lauro Ramos Venancio 提交于
      The group mask is always used in intersection with the group CPUs. So,
      when building the group mask, we don't have to care about CPUs that are
      not part of the group.
      Signed-off-by: Lauro Ramos Venancio <lvenanci@redhat.com>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: lwang@redhat.com
      Cc: riel@redhat.com
      Link: http://lkml.kernel.org/r/1492717903-5195-2-git-send-email-lvenanci@redhat.com
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      f32d782e
    • P
      sched/topology: Verify the first group matches the child domain · a420b063
      Peter Zijlstra 提交于
      We want sched_groups to be sibling child domains (or individual CPUs
      when there are no child domains). Furthermore, since the first group
      of a domain should include the CPU of that domain, the first group of
      each domain should match the child domain.
      
      Verify this is indeed so.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      a420b063
    • P
      sched/debug: Print the scheduler topology group mask · b0151c25
      Peter Zijlstra 提交于
      In order to determine the balance_cpu (for should_we_balance()) we need
      the sched_group_mask() for overlapping domains.
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      b0151c25
    • P
      sched/topology: Simplify build_overlap_sched_groups() · 91eaed0d
      Peter Zijlstra 提交于
      Now that the first group will always be the previous domain of this
      @cpu this can be simplified.
      
      In fact, writing the code now removed should've been a big clue I was
      doing it wrong :/
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      91eaed0d
    • P
      sched/topology: Fix building of overlapping sched-groups · 0372dd27
      Peter Zijlstra 提交于
      When building the overlapping groups, we very obviously should start
      with the previous domain of _this_ @cpu, not CPU-0.
      
      This can be readily demonstrated with a topology like:
      
        node   0   1   2   3
          0:  10  20  30  20
          1:  20  10  20  30
          2:  30  20  10  20
          3:  20  30  20  10
      
      Where (for example) CPU1 ends up generating the following nonsensical groups:
      
        [] CPU1 attaching sched-domain:
        []  domain 0: span 0-2 level NUMA
        []   groups: 1 2 0
        []   domain 1: span 0-3 level NUMA
        []    groups: 1-3 (cpu_capacity = 3072) 0-1,3 (cpu_capacity = 3072)
      
      Where the fact that domain 1 doesn't include a group with span 0-2 is
      the obvious fail.
      
      With patch this looks like:
      
        [] CPU1 attaching sched-domain:
        []  domain 0: span 0-2 level NUMA
        []   groups: 1 0 2
        []   domain 1: span 0-3 level NUMA
        []    groups: 0-2 (cpu_capacity = 3072) 0,2-3 (cpu_capacity = 3072)
      Debugged-by: NLauro Ramos Venancio <lvenanci@redhat.com>
      Signed-off-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Mike Galbraith <efault@gmx.de>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: linux-kernel@vger.kernel.org
      Cc: stable@vger.kernel.org
      Fixes: e3589f6c ("sched: Allow for overlapping sched_domain spans")
      Signed-off-by: NIngo Molnar <mingo@kernel.org>
      0372dd27