1. 11 7月, 2018 1 次提交
  2. 06 6月, 2018 1 次提交
    • D
      vfs: change inode times to use struct timespec64 · 95582b00
      Deepa Dinamani 提交于
      struct timespec is not y2038 safe. Transition vfs to use
      y2038 safe struct timespec64 instead.
      
      The change was made with the help of the following cocinelle
      script. This catches about 80% of the changes.
      All the header file and logic changes are included in the
      first 5 rules. The rest are trivial substitutions.
      I avoid changing any of the function signatures or any other
      filesystem specific data structures to keep the patch simple
      for review.
      
      The script can be a little shorter by combining different cases.
      But, this version was sufficient for my usecase.
      
      virtual patch
      
      @ depends on patch @
      identifier now;
      @@
      - struct timespec
      + struct timespec64
        current_time ( ... )
        {
      - struct timespec now = current_kernel_time();
      + struct timespec64 now = current_kernel_time64();
        ...
      - return timespec_trunc(
      + return timespec64_trunc(
        ... );
        }
      
      @ depends on patch @
      identifier xtime;
      @@
       struct \( iattr \| inode \| kstat \) {
       ...
      -       struct timespec xtime;
      +       struct timespec64 xtime;
       ...
       }
      
      @ depends on patch @
      identifier t;
      @@
       struct inode_operations {
       ...
      int (*update_time) (...,
      -       struct timespec t,
      +       struct timespec64 t,
      ...);
       ...
       }
      
      @ depends on patch @
      identifier t;
      identifier fn_update_time =~ "update_time$";
      @@
       fn_update_time (...,
      - struct timespec *t,
      + struct timespec64 *t,
       ...) { ... }
      
      @ depends on patch @
      identifier t;
      @@
      lease_get_mtime( ... ,
      - struct timespec *t
      + struct timespec64 *t
        ) { ... }
      
      @te depends on patch forall@
      identifier ts;
      local idexpression struct inode *inode_node;
      identifier i_xtime =~ "^i_[acm]time$";
      identifier ia_xtime =~ "^ia_[acm]time$";
      identifier fn_update_time =~ "update_time$";
      identifier fn;
      expression e, E3;
      local idexpression struct inode *node1;
      local idexpression struct inode *node2;
      local idexpression struct iattr *attr1;
      local idexpression struct iattr *attr2;
      local idexpression struct iattr attr;
      identifier i_xtime1 =~ "^i_[acm]time$";
      identifier i_xtime2 =~ "^i_[acm]time$";
      identifier ia_xtime1 =~ "^ia_[acm]time$";
      identifier ia_xtime2 =~ "^ia_[acm]time$";
      @@
      (
      (
      - struct timespec ts;
      + struct timespec64 ts;
      |
      - struct timespec ts = current_time(inode_node);
      + struct timespec64 ts = current_time(inode_node);
      )
      
      <+... when != ts
      (
      - timespec_equal(&inode_node->i_xtime, &ts)
      + timespec64_equal(&inode_node->i_xtime, &ts)
      |
      - timespec_equal(&ts, &inode_node->i_xtime)
      + timespec64_equal(&ts, &inode_node->i_xtime)
      |
      - timespec_compare(&inode_node->i_xtime, &ts)
      + timespec64_compare(&inode_node->i_xtime, &ts)
      |
      - timespec_compare(&ts, &inode_node->i_xtime)
      + timespec64_compare(&ts, &inode_node->i_xtime)
      |
      ts = current_time(e)
      |
      fn_update_time(..., &ts,...)
      |
      inode_node->i_xtime = ts
      |
      node1->i_xtime = ts
      |
      ts = inode_node->i_xtime
      |
      <+... attr1->ia_xtime ...+> = ts
      |
      ts = attr1->ia_xtime
      |
      ts.tv_sec
      |
      ts.tv_nsec
      |
      btrfs_set_stack_timespec_sec(..., ts.tv_sec)
      |
      btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
      |
      - ts = timespec64_to_timespec(
      + ts =
      ...
      -)
      |
      - ts = ktime_to_timespec(
      + ts = ktime_to_timespec64(
      ...)
      |
      - ts = E3
      + ts = timespec_to_timespec64(E3)
      |
      - ktime_get_real_ts(&ts)
      + ktime_get_real_ts64(&ts)
      |
      fn(...,
      - ts
      + timespec64_to_timespec(ts)
      ,...)
      )
      ...+>
      (
      <... when != ts
      - return ts;
      + return timespec64_to_timespec(ts);
      ...>
      )
      |
      - timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
      + timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
      |
      - timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
      + timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
      |
      - timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
      + timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
      |
      node1->i_xtime1 =
      - timespec_trunc(attr1->ia_xtime1,
      + timespec64_trunc(attr1->ia_xtime1,
      ...)
      |
      - attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
      + attr1->ia_xtime1 =  timespec64_trunc(attr2->ia_xtime2,
      ...)
      |
      - ktime_get_real_ts(&attr1->ia_xtime1)
      + ktime_get_real_ts64(&attr1->ia_xtime1)
      |
      - ktime_get_real_ts(&attr.ia_xtime1)
      + ktime_get_real_ts64(&attr.ia_xtime1)
      )
      
      @ depends on patch @
      struct inode *node;
      struct iattr *attr;
      identifier fn;
      identifier i_xtime =~ "^i_[acm]time$";
      identifier ia_xtime =~ "^ia_[acm]time$";
      expression e;
      @@
      (
      - fn(node->i_xtime);
      + fn(timespec64_to_timespec(node->i_xtime));
      |
       fn(...,
      - node->i_xtime);
      + timespec64_to_timespec(node->i_xtime));
      |
      - e = fn(attr->ia_xtime);
      + e = fn(timespec64_to_timespec(attr->ia_xtime));
      )
      
      @ depends on patch forall @
      struct inode *node;
      struct iattr *attr;
      identifier i_xtime =~ "^i_[acm]time$";
      identifier ia_xtime =~ "^ia_[acm]time$";
      identifier fn;
      @@
      {
      + struct timespec ts;
      <+...
      (
      + ts = timespec64_to_timespec(node->i_xtime);
      fn (...,
      - &node->i_xtime,
      + &ts,
      ...);
      |
      + ts = timespec64_to_timespec(attr->ia_xtime);
      fn (...,
      - &attr->ia_xtime,
      + &ts,
      ...);
      )
      ...+>
      }
      
      @ depends on patch forall @
      struct inode *node;
      struct iattr *attr;
      struct kstat *stat;
      identifier ia_xtime =~ "^ia_[acm]time$";
      identifier i_xtime =~ "^i_[acm]time$";
      identifier xtime =~ "^[acm]time$";
      identifier fn, ret;
      @@
      {
      + struct timespec ts;
      <+...
      (
      + ts = timespec64_to_timespec(node->i_xtime);
      ret = fn (...,
      - &node->i_xtime,
      + &ts,
      ...);
      |
      + ts = timespec64_to_timespec(node->i_xtime);
      ret = fn (...,
      - &node->i_xtime);
      + &ts);
      |
      + ts = timespec64_to_timespec(attr->ia_xtime);
      ret = fn (...,
      - &attr->ia_xtime,
      + &ts,
      ...);
      |
      + ts = timespec64_to_timespec(attr->ia_xtime);
      ret = fn (...,
      - &attr->ia_xtime);
      + &ts);
      |
      + ts = timespec64_to_timespec(stat->xtime);
      ret = fn (...,
      - &stat->xtime);
      + &ts);
      )
      ...+>
      }
      
      @ depends on patch @
      struct inode *node;
      struct inode *node2;
      identifier i_xtime1 =~ "^i_[acm]time$";
      identifier i_xtime2 =~ "^i_[acm]time$";
      identifier i_xtime3 =~ "^i_[acm]time$";
      struct iattr *attrp;
      struct iattr *attrp2;
      struct iattr attr ;
      identifier ia_xtime1 =~ "^ia_[acm]time$";
      identifier ia_xtime2 =~ "^ia_[acm]time$";
      struct kstat *stat;
      struct kstat stat1;
      struct timespec64 ts;
      identifier xtime =~ "^[acmb]time$";
      expression e;
      @@
      (
      ( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1  ;
      |
       node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
      |
       node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
      |
       node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
      |
       stat->xtime = node2->i_xtime1;
      |
       stat1.xtime = node2->i_xtime1;
      |
      ( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1  ;
      |
      ( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
      |
      - e = node->i_xtime1;
      + e = timespec64_to_timespec( node->i_xtime1 );
      |
      - e = attrp->ia_xtime1;
      + e = timespec64_to_timespec( attrp->ia_xtime1 );
      |
      node->i_xtime1 = current_time(...);
      |
       node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
      - e;
      + timespec_to_timespec64(e);
      |
       node->i_xtime1 = node->i_xtime3 =
      - e;
      + timespec_to_timespec64(e);
      |
      - node->i_xtime1 = e;
      + node->i_xtime1 = timespec_to_timespec64(e);
      )
      Signed-off-by: NDeepa Dinamani <deepa.kernel@gmail.com>
      Cc: <anton@tuxera.com>
      Cc: <balbi@kernel.org>
      Cc: <bfields@fieldses.org>
      Cc: <darrick.wong@oracle.com>
      Cc: <dhowells@redhat.com>
      Cc: <dsterba@suse.com>
      Cc: <dwmw2@infradead.org>
      Cc: <hch@lst.de>
      Cc: <hirofumi@mail.parknet.co.jp>
      Cc: <hubcap@omnibond.com>
      Cc: <jack@suse.com>
      Cc: <jaegeuk@kernel.org>
      Cc: <jaharkes@cs.cmu.edu>
      Cc: <jslaby@suse.com>
      Cc: <keescook@chromium.org>
      Cc: <mark@fasheh.com>
      Cc: <miklos@szeredi.hu>
      Cc: <nico@linaro.org>
      Cc: <reiserfs-devel@vger.kernel.org>
      Cc: <richard@nod.at>
      Cc: <sage@redhat.com>
      Cc: <sfrench@samba.org>
      Cc: <swhiteho@redhat.com>
      Cc: <tj@kernel.org>
      Cc: <trond.myklebust@primarydata.com>
      Cc: <tytso@mit.edu>
      Cc: <viro@zeniv.linux.org.uk>
      95582b00
  3. 31 5月, 2018 3 次提交
  4. 29 5月, 2018 1 次提交
  5. 26 5月, 2018 2 次提交
  6. 21 5月, 2018 1 次提交
  7. 18 5月, 2018 1 次提交
  8. 14 5月, 2018 1 次提交
  9. 01 5月, 2018 1 次提交
    • K
      fasync: Fix deadlock between task-context and interrupt-context kill_fasync() · 7a107c0f
      Kirill Tkhai 提交于
      I observed the following deadlock between them:
      
      [task 1]                          [task 2]                         [task 3]
      kill_fasync()                     mm_update_next_owner()           copy_process()
       spin_lock_irqsave(&fa->fa_lock)   read_lock(&tasklist_lock)        write_lock_irq(&tasklist_lock)
        send_sigio()                    <IRQ>                             ...
         read_lock(&fown->lock)         kill_fasync()                     ...
          read_lock(&tasklist_lock)      spin_lock_irqsave(&fa->fa_lock)  ...
      
      Task 1 can't acquire read locked tasklist_lock, since there is
      already task 3 expressed its wish to take the lock exclusive.
      Task 2 holds the read locked lock, but it can't take the spin lock.
      
      Also, there is possible another deadlock (which I haven't observed):
      
      [task 1]                            [task 2]
      f_getown()                          kill_fasync()
       read_lock(&f_own->lock)             spin_lock_irqsave(&fa->fa_lock,)
       <IRQ>                               send_sigio()                     write_lock_irq(&f_own->lock)
        kill_fasync()                       read_lock(&fown->lock)
         spin_lock_irqsave(&fa->fa_lock,)
      
      Actually, we do not need exclusive fa->fa_lock in kill_fasync_rcu(),
      as it guarantees fa->fa_file->f_owner integrity only. It may seem,
      that it used to give a task a small possibility to receive two sequential
      signals, if there are two parallel kill_fasync() callers, and task
      handles the first signal fastly, but the behaviour won't become
      different, since there is exclusive sighand lock in do_send_sig_info().
      
      The patch converts fa_lock into rwlock_t, and this fixes two above
      deadlocks, as rwlock is allowed to be taken from interrupt handler
      by qrwlock design.
      Signed-off-by: NKirill Tkhai <ktkhai@virtuozzo.com>
      Signed-off-by: NJeff Layton <jlayton@redhat.com>
      7a107c0f
  10. 12 4月, 2018 1 次提交
  11. 10 4月, 2018 1 次提交
    • D
      vfs: Remove the const from dir_context::actor · a09acf4b
      David Howells 提交于
      Remove the const marking from the actor function pointer in the dir_context
      struct.  The const prevents the structure from being used as part of a
      kmalloc'd object as it makes the compiler require that the actor member be
      set at object initialisation time (or not at all), incuring something like
      the following error if you try and set it later:
      
      	fs/afs/dir.c:556:20: error: assignment of read-only member 'actor'
      
      Marking the member const like this adds very little in the way of sanity
      checking as the type checking system is likely to provide sufficient - and
      if not, the kernel is very likely to oops repeatably in this case.
      
      Fixes: ac6614b7 ("[readdir] constify ->actor")
      Signed-off-by: NDavid Howells <dhowells@redhat.com>
      Reviewed-by: NAl Viro <viro@zeniv.linux.org.uk>
      a09acf4b
  12. 31 3月, 2018 1 次提交
  13. 28 3月, 2018 1 次提交
  14. 23 3月, 2018 1 次提交
  15. 19 3月, 2018 2 次提交
    • M
      buffer.c: call thaw_super during emergency thaw · 08fdc8a0
      Mateusz Guzik 提交于
      There are 2 distinct freezing mechanisms - one operates on block
      devices and another one directly on super blocks. Both end up with the
      same result, but thaw of only one of these does not thaw the other.
      
      In particular fsfreeze --freeze uses the ioctl variant going to the
      super block. Since prior to this patch emergency thaw was not doing
      a relevant thaw, filesystems frozen with this method remained
      unaffected.
      
      The patch is a hack which adds blind unfreezing.
      
      In order to keep the super block write-locked the whole time the code
      is shuffled around and the newly introduced __iterate_supers is
      employed.
      Signed-off-by: NMateusz Guzik <mguzik@redhat.com>
      Signed-off-by: NAl Viro <viro@zeniv.linux.org.uk>
      08fdc8a0
    • R
      vfs: make sure struct filename->iname is word-aligned · 1c949843
      Rasmus Villemoes 提交于
      I noticed that offsetof(struct filename, iname) is actually 28 on 64
      bit platforms, so we always pass an unaligned pointer to
      strncpy_from_user. This is mostly a problem for those 64 bit platforms
      without HAVE_EFFICIENT_UNALIGNED_ACCESS, but even on x86_64, unaligned
      accesses carry a penalty.
      
      A user-space microbenchmark doing nothing but strncpy_from_user from the
      same (aligned) source string runs about 5% faster when the destination
      is aligned. That number increases to 20% when the string is long
      enough (~32 bytes) that we cross a cache line boundary - that's for
      example the case for about half the files a "git status" in a kernel
      tree ends up stat'ing.
      
      This won't make any real-life workloads 5%, or even 1%, faster, but path
      lookup is common enough that cutting even a few cycles should be
      worthwhile. So ensure we always pass an aligned destination pointer to
      strncpy_from_user. Instead of explicit padding, simply swap the refcnt
      and aname members, as suggested by Al Viro.
      Signed-off-by: NRasmus Villemoes <linux@rasmusvillemoes.dk>
      Signed-off-by: NAl Viro <viro@zeniv.linux.org.uk>
      1c949843
  16. 16 3月, 2018 1 次提交
    • E
      fs: Teach path_connected to handle nfs filesystems with multiple roots. · 95dd7758
      Eric W. Biederman 提交于
      On nfsv2 and nfsv3 the nfs server can export subsets of the same
      filesystem and report the same filesystem identifier, so that the nfs
      client can know they are the same filesystem.  The subsets can be from
      disjoint directory trees.  The nfsv2 and nfsv3 filesystems provides no
      way to find the common root of all directory trees exported form the
      server with the same filesystem identifier.
      
      The practical result is that in struct super s_root for nfs s_root is
      not necessarily the root of the filesystem.  The nfs mount code sets
      s_root to the root of the first subset of the nfs filesystem that the
      kernel mounts.
      
      This effects the dcache invalidation code in generic_shutdown_super
      currently called shrunk_dcache_for_umount and that code for years
      has gone through an additional list of dentries that might be dentry
      trees that need to be freed to accomodate nfs.
      
      When I wrote path_connected I did not realize nfs was so special, and
      it's hueristic for avoiding calling is_subdir can fail.
      
      The practical case where this fails is when there is a move of a
      directory from the subtree exposed by one nfs mount to the subtree
      exposed by another nfs mount.  This move can happen either locally or
      remotely.  With the remote case requiring that the move directory be cached
      before the move and that after the move someone walks the path
      to where the move directory now exists and in so doing causes the
      already cached directory to be moved in the dcache through the magic
      of d_splice_alias.
      
      If someone whose working directory is in the move directory or a
      subdirectory and now starts calling .. from the initial mount of nfs
      (where s_root == mnt_root), then path_connected as a heuristic will
      not bother with the is_subdir check.  As s_root really is not the root
      of the nfs filesystem this heuristic is wrong, and the path may
      actually not be connected and path_connected can fail.
      
      The is_subdir function might be cheap enough that we can call it
      unconditionally.  Verifying that will take some benchmarking and
      the result may not be the same on all kernels this fix needs
      to be backported to.  So I am avoiding that for now.
      
      Filesystems with snapshots such as nilfs and btrfs do something
      similar.  But as the directory tree of the snapshots are disjoint
      from one another and from the main directory tree rename won't move
      things between them and this problem will not occur.
      
      Cc: stable@vger.kernel.org
      Reported-by: NAl Viro <viro@ZenIV.linux.org.uk>
      Fixes: 397d425d ("vfs: Test for and handle paths that are unreachable from their mnt_root")
      Signed-off-by: N"Eric W. Biederman" <ebiederm@xmission.com>
      Signed-off-by: NAl Viro <viro@zeniv.linux.org.uk>
      95dd7758
  17. 13 3月, 2018 2 次提交
  18. 27 2月, 2018 1 次提交
  19. 29 1月, 2018 3 次提交
    • D
      xfs: only grab shared inode locks for source file during reflink · 01c2e13d
      Darrick J. Wong 提交于
      Reflink and dedupe operations remap blocks from a source file into a
      destination file.  The destination file needs exclusive locks on all
      levels because we're updating its block map, but the source file isn't
      undergoing any block map changes so we can use a shared lock.
      Signed-off-by: NDarrick J. Wong <darrick.wong@oracle.com>
      Reviewed-by: NChristoph Hellwig <hch@lst.de>
      01c2e13d
    • J
      fs: handle inode->i_version more efficiently · f02a9ad1
      Jeff Layton 提交于
      Since i_version is mostly treated as an opaque value, we can exploit that
      fact to avoid incrementing it when no one is watching. With that change,
      we can avoid incrementing the counter on writes, unless someone has
      queried for it since it was last incremented. If the a/c/mtime don't
      change, and the i_version hasn't changed, then there's no need to dirty
      the inode metadata on a write.
      
      Convert the i_version counter to an atomic64_t, and use the lowest order
      bit to hold a flag that will tell whether anyone has queried the value
      since it was last incremented.
      
      When we go to maybe increment it, we fetch the value and check the flag
      bit.  If it's clear then we don't need to do anything if the update
      isn't being forced.
      
      If we do need to update, then we increment the counter by 2, and clear
      the flag bit, and then use a CAS op to swap it into place. If that
      works, we return true. If it doesn't then do it again with the value
      that we fetch from the CAS operation.
      
      On the query side, if the flag is already set, then we just shift the
      value down by 1 bit and return it. Otherwise, we set the flag in our
      on-stack value and again use cmpxchg to swap it into place if it hasn't
      changed. If it has, then we use the value from the cmpxchg as the new
      "old" value and try again.
      
      This method allows us to avoid incrementing the counter on writes (and
      dirtying the metadata) under typical workloads. We only need to increment
      if it has been queried since it was last changed.
      Signed-off-by: NJeff Layton <jlayton@redhat.com>
      Reviewed-by: NJan Kara <jack@suse.cz>
      Acked-by: NDave Chinner <dchinner@redhat.com>
      Tested-by: NKrzysztof Kozlowski <krzk@kernel.org>
      f02a9ad1
    • J
      fs: new API for handling inode->i_version · ae5e165d
      Jeff Layton 提交于
      Add a documentation blob that explains what the i_version field is, how
      it is expected to work, and how it is currently implemented by various
      filesystems.
      
      We already have inode_inc_iversion. Add several other functions for
      manipulating and accessing the i_version counter. For now, the
      implementation is trivial and basically works the way that all of the
      open-coded i_version accesses work today.
      
      Future patches will convert existing users of i_version to use the new
      API, and then convert the backend implementation to do things more
      efficiently.
      Signed-off-by: NJeff Layton <jlayton@redhat.com>
      Reviewed-by: NJan Kara <jack@suse.cz>
      ae5e165d
  20. 26 1月, 2018 2 次提交
  21. 09 1月, 2018 1 次提交
    • D
      iomap: report collisions between directio and buffered writes to userspace · 5a9d929d
      Darrick J. Wong 提交于
      If two programs simultaneously try to write to the same part of a file
      via direct IO and buffered IO, there's a chance that the post-diowrite
      pagecache invalidation will fail on the dirty page.  When this happens,
      the dio write succeeded, which means that the page cache is no longer
      coherent with the disk!
      
      Programs are not supposed to mix IO types and this is a clear case of
      data corruption, so store an EIO which will be reflected to userspace
      during the next fsync.  Replace the WARN_ON with a ratelimited pr_crit
      so that the developers have /some/ kind of breadcrumb to track down the
      offending program(s) and file(s) involved.
      Signed-off-by: NDarrick J. Wong <darrick.wong@oracle.com>
      Reviewed-by: NLiu Bo <bo.li.liu@oracle.com>
      5a9d929d
  22. 06 1月, 2018 1 次提交
  23. 26 12月, 2017 1 次提交
    • N
      VFS: don't keep disconnected dentries on d_anon · f1ee6162
      NeilBrown 提交于
      The original purpose of the per-superblock d_anon list was to
      keep disconnected dentries in the cache between consecutive
      requests to the NFS server.  Dentries can be disconnected if
      a client holds a file open and repeatedly performs IO on it,
      and if the server drops the dentry, whether due to memory
      pressure, server restart, or "echo 3 > /proc/sys/vm/drop_caches".
      
      This purpose was thwarted by commit 75a6f82a ("freeing unlinked
      file indefinitely delayed") which caused disconnected dentries
      to be freed as soon as their refcount reached zero.
      
      This means that, when a dentry being used by nfsd gets disconnected, a
      new one needs to be allocated for every request (unless requests
      overlap).  As the dentry has no name, no parent, and no children,
      there is little of value to cache.  As small memory allocations are
      typically fast (from per-cpu free lists) this likely has little cost.
      
      This means that the original purpose of s_anon is no longer relevant:
      there is no longer any need to keep disconnected dentries on a list so
      they appear to be hashed.
      
      However, s_anon now has a new use.  When you mount an NFS filesystem,
      the dentry stored in s_root is just a placebo.  The "real" root dentry
      is allocated using d_obtain_root() and so it kept on the s_anon list.
      I don't know the reason for this, but suspect it related to NFSv4
      where a mount of "server:/some/path" require NFS to look up the root
      filehandle on the server, then walk down "/some" and "/path" to get
      the filehandle to mount.
      
      Whatever the reason, NFS depends on the s_anon list and on
      shrink_dcache_for_umount() pruning all dentries on this list.  So we
      cannot simply remove s_anon.
      
      We could just leave the code unchanged, but apart from that being
      potentially confusing, the (unfair) bit-spin-lock which protects
      s_anon can become a bottle neck when lots of disconnected dentries are
      being created.
      
      So this patch renames s_anon to s_roots, and stops storing
      disconnected dentries on the list.  Only dentries obtained with
      d_obtain_root() are now stored on this list.  There are many fewer of
      these (only NFS and NILFS2 use the call, and only during filesystem
      mount) so contention on the bit-lock will not be a problem.
      
      Possibly an alternate solution should be found for NFS and NILFS2, but
      that would require understanding their needs first.
      Signed-off-by: NNeilBrown <neilb@suse.com>
      Signed-off-by: NAl Viro <viro@zeniv.linux.org.uk>
      f1ee6162
  24. 30 11月, 2017 2 次提交
    • I
      autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored" · 5d38f049
      Ian Kent 提交于
      Commit 42f46148 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
      allowed the fstatat(2) system call to properly honor the AT_NO_AUTOMOUNT
      flag but introduced a semantic change.
      
      In order to honor AT_NO_AUTOMOUNT a semantic change was made to the
      negative dentry case for stat family system calls in follow_automount().
      
      This changed the unconditional triggering of an automount in this case
      to no longer be done and an error returned instead.
      
      This has caused more problems than I expected so reverting the change is
      needed.
      
      In a discussion with Neil Brown it was concluded that the automount(8)
      daemon can implement this change without kernel modifications.  So that
      will be done instead and the autofs module documentation updated with a
      description of the problem and what needs to be done by module users for
      this specific case.
      
      Link: http://lkml.kernel.org/r/151174730120.6162.3848002191530283984.stgit@pluto.themaw.net
      Fixes: 42f46148 ("autofs: fix AT_NO_AUTOMOUNT not being honored")
      Signed-off-by: NIan Kent <raven@themaw.net>
      Cc: Neil Brown <neilb@suse.com>
      Cc: Al Viro <viro@ZenIV.linux.org.uk>
      Cc: David Howells <dhowells@redhat.com>
      Cc: Colin Walters <walters@redhat.com>
      Cc: Ondrej Holy <oholy@redhat.com>
      Cc: <stable@vger.kernel.org>	[4.11+]
      Signed-off-by: NAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
      5d38f049
    • D
      mm: introduce get_user_pages_longterm · 2bb6d283
      Dan Williams 提交于
      Patch series "introduce get_user_pages_longterm()", v2.
      
      Here is a new get_user_pages api for cases where a driver intends to
      keep an elevated page count indefinitely.  This is distinct from usages
      like iov_iter_get_pages where the elevated page counts are transient.
      The iov_iter_get_pages cases immediately turn around and submit the
      pages to a device driver which will put_page when the i/o operation
      completes (under kernel control).
      
      In the longterm case userspace is responsible for dropping the page
      reference at some undefined point in the future.  This is untenable for
      filesystem-dax case where the filesystem is in control of the lifetime
      of the block / page and needs reasonable limits on how long it can wait
      for pages in a mapping to become idle.
      
      Fixing filesystems to actually wait for dax pages to be idle before
      blocks from a truncate/hole-punch operation are repurposed is saved for
      a later patch series.
      
      Also, allowing longterm registration of dax mappings is a future patch
      series that introduces a "map with lease" semantic where the kernel can
      revoke a lease and force userspace to drop its page references.
      
      I have also tagged these for -stable to purposely break cases that might
      assume that longterm memory registrations for filesystem-dax mappings
      were supported by the kernel.  The behavior regression this policy
      change implies is one of the reasons we maintain the "dax enabled.
      Warning: EXPERIMENTAL, use at your own risk" notification when mounting
      a filesystem in dax mode.
      
      It is worth noting the device-dax interface does not suffer the same
      constraints since it does not support file space management operations
      like hole-punch.
      
      This patch (of 4):
      
      Until there is a solution to the dma-to-dax vs truncate problem it is
      not safe to allow long standing memory registrations against
      filesytem-dax vmas.  Device-dax vmas do not have this problem and are
      explicitly allowed.
      
      This is temporary until a "memory registration with layout-lease"
      mechanism can be implemented for the affected sub-systems (RDMA and
      V4L2).
      
      [akpm@linux-foundation.org: use kcalloc()]
      Link: http://lkml.kernel.org/r/151068939435.7446.13560129395419350737.stgit@dwillia2-desk3.amr.corp.intel.com
      Fixes: 3565fce3 ("mm, x86: get_user_pages() for dax mappings")
      Signed-off-by: NDan Williams <dan.j.williams@intel.com>
      Suggested-by: NChristoph Hellwig <hch@lst.de>
      Cc: Doug Ledford <dledford@redhat.com>
      Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
      Cc: Inki Dae <inki.dae@samsung.com>
      Cc: Jan Kara <jack@suse.cz>
      Cc: Jason Gunthorpe <jgg@mellanox.com>
      Cc: Jeff Moyer <jmoyer@redhat.com>
      Cc: Joonyoung Shim <jy0922.shim@samsung.com>
      Cc: Kyungmin Park <kyungmin.park@samsung.com>
      Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
      Cc: Mel Gorman <mgorman@suse.de>
      Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
      Cc: Sean Hefty <sean.hefty@intel.com>
      Cc: Seung-Woo Kim <sw0312.kim@samsung.com>
      Cc: Vlastimil Babka <vbabka@suse.cz>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: NAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
      2bb6d283
  25. 28 11月, 2017 2 次提交
    • A
      ->poll() methods should return __poll_t · a3f8683b
      Al Viro 提交于
      The most common place to find POLL... bitmaps: return values
      of ->poll() and its subsystem counterparts.
      Signed-off-by: NAl Viro <viro@zeniv.linux.org.uk>
      a3f8683b
    • L
      Rename superblock flags (MS_xyz -> SB_xyz) · 1751e8a6
      Linus Torvalds 提交于
      This is a pure automated search-and-replace of the internal kernel
      superblock flags.
      
      The s_flags are now called SB_*, with the names and the values for the
      moment mirroring the MS_* flags that they're equivalent to.
      
      Note how the MS_xyz flags are the ones passed to the mount system call,
      while the SB_xyz flags are what we then use in sb->s_flags.
      
      The script to do this was:
      
          # places to look in; re security/*: it generally should *not* be
          # touched (that stuff parses mount(2) arguments directly), but
          # there are two places where we really deal with superblock flags.
          FILES="drivers/mtd drivers/staging/lustre fs ipc mm \
                  include/linux/fs.h include/uapi/linux/bfs_fs.h \
                  security/apparmor/apparmorfs.c security/apparmor/include/lib.h"
          # the list of MS_... constants
          SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \
                DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \
                POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \
                I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \
                ACTIVE NOUSER"
      
          SED_PROG=
          for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done
      
          # we want files that contain at least one of MS_...,
          # with fs/namespace.c and fs/pnode.c excluded.
          L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c')
      
          for f in $L; do sed -i $f $SED_PROG; done
      Requested-by: NAl Viro <viro@zeniv.linux.org.uk>
      Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
      1751e8a6
  26. 09 11月, 2017 1 次提交
  27. 06 11月, 2017 1 次提交
  28. 03 11月, 2017 1 次提交
    • D
      mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap flags · 1c972597
      Dan Williams 提交于
      The mmap(2) syscall suffers from the ABI anti-pattern of not validating
      unknown flags. However, proposals like MAP_SYNC need a mechanism to
      define new behavior that is known to fail on older kernels without the
      support. Define a new MAP_SHARED_VALIDATE flag pattern that is
      guaranteed to fail on all legacy mmap implementations.
      
      It is worth noting that the original proposal was for a standalone
      MAP_VALIDATE flag. However, when that  could not be supported by all
      archs Linus observed:
      
          I see why you *think* you want a bitmap. You think you want
          a bitmap because you want to make MAP_VALIDATE be part of MAP_SYNC
          etc, so that people can do
      
          ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED
      		    | MAP_SYNC, fd, 0);
      
          and "know" that MAP_SYNC actually takes.
      
          And I'm saying that whole wish is bogus. You're fundamentally
          depending on special semantics, just make it explicit. It's already
          not portable, so don't try to make it so.
      
          Rename that MAP_VALIDATE as MAP_SHARED_VALIDATE, make it have a value
          of 0x3, and make people do
      
          ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED_VALIDATE
      		    | MAP_SYNC, fd, 0);
      
          and then the kernel side is easier too (none of that random garbage
          playing games with looking at the "MAP_VALIDATE bit", but just another
          case statement in that map type thing.
      
          Boom. Done.
      
      Similar to ->fallocate() we also want the ability to validate the
      support for new flags on a per ->mmap() 'struct file_operations'
      instance basis.  Towards that end arrange for flags to be generically
      validated against a mmap_supported_flags exported by 'struct
      file_operations'. By default all existing flags are implicitly
      supported, but new flags require MAP_SHARED_VALIDATE and
      per-instance-opt-in.
      
      Cc: Jan Kara <jack@suse.cz>
      Cc: Arnd Bergmann <arnd@arndb.de>
      Cc: Andy Lutomirski <luto@kernel.org>
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Suggested-by: NChristoph Hellwig <hch@lst.de>
      Suggested-by: NLinus Torvalds <torvalds@linux-foundation.org>
      Reviewed-by: NRoss Zwisler <ross.zwisler@linux.intel.com>
      Signed-off-by: NDan Williams <dan.j.williams@intel.com>
      Signed-off-by: NJan Kara <jack@suse.cz>
      Signed-off-by: NDan Williams <dan.j.williams@intel.com>
      1c972597
  29. 02 11月, 2017 1 次提交
    • G
      License cleanup: add SPDX GPL-2.0 license identifier to files with no license · b2441318
      Greg Kroah-Hartman 提交于
      Many source files in the tree are missing licensing information, which
      makes it harder for compliance tools to determine the correct license.
      
      By default all files without license information are under the default
      license of the kernel, which is GPL version 2.
      
      Update the files which contain no license information with the 'GPL-2.0'
      SPDX license identifier.  The SPDX identifier is a legally binding
      shorthand, which can be used instead of the full boiler plate text.
      
      This patch is based on work done by Thomas Gleixner and Kate Stewart and
      Philippe Ombredanne.
      
      How this work was done:
      
      Patches were generated and checked against linux-4.14-rc6 for a subset of
      the use cases:
       - file had no licensing information it it.
       - file was a */uapi/* one with no licensing information in it,
       - file was a */uapi/* one with existing licensing information,
      
      Further patches will be generated in subsequent months to fix up cases
      where non-standard license headers were used, and references to license
      had to be inferred by heuristics based on keywords.
      
      The analysis to determine which SPDX License Identifier to be applied to
      a file was done in a spreadsheet of side by side results from of the
      output of two independent scanners (ScanCode & Windriver) producing SPDX
      tag:value files created by Philippe Ombredanne.  Philippe prepared the
      base worksheet, and did an initial spot review of a few 1000 files.
      
      The 4.13 kernel was the starting point of the analysis with 60,537 files
      assessed.  Kate Stewart did a file by file comparison of the scanner
      results in the spreadsheet to determine which SPDX license identifier(s)
      to be applied to the file. She confirmed any determination that was not
      immediately clear with lawyers working with the Linux Foundation.
      
      Criteria used to select files for SPDX license identifier tagging was:
       - Files considered eligible had to be source code files.
       - Make and config files were included as candidates if they contained >5
         lines of source
       - File already had some variant of a license header in it (even if <5
         lines).
      
      All documentation files were explicitly excluded.
      
      The following heuristics were used to determine which SPDX license
      identifiers to apply.
      
       - when both scanners couldn't find any license traces, file was
         considered to have no license information in it, and the top level
         COPYING file license applied.
      
         For non */uapi/* files that summary was:
      
         SPDX license identifier                            # files
         ---------------------------------------------------|-------
         GPL-2.0                                              11139
      
         and resulted in the first patch in this series.
      
         If that file was a */uapi/* path one, it was "GPL-2.0 WITH
         Linux-syscall-note" otherwise it was "GPL-2.0".  Results of that was:
      
         SPDX license identifier                            # files
         ---------------------------------------------------|-------
         GPL-2.0 WITH Linux-syscall-note                        930
      
         and resulted in the second patch in this series.
      
       - if a file had some form of licensing information in it, and was one
         of the */uapi/* ones, it was denoted with the Linux-syscall-note if
         any GPL family license was found in the file or had no licensing in
         it (per prior point).  Results summary:
      
         SPDX license identifier                            # files
         ---------------------------------------------------|------
         GPL-2.0 WITH Linux-syscall-note                       270
         GPL-2.0+ WITH Linux-syscall-note                      169
         ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause)    21
         ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)    17
         LGPL-2.1+ WITH Linux-syscall-note                      15
         GPL-1.0+ WITH Linux-syscall-note                       14
         ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause)    5
         LGPL-2.0+ WITH Linux-syscall-note                       4
         LGPL-2.1 WITH Linux-syscall-note                        3
         ((GPL-2.0 WITH Linux-syscall-note) OR MIT)              3
         ((GPL-2.0 WITH Linux-syscall-note) AND MIT)             1
      
         and that resulted in the third patch in this series.
      
       - when the two scanners agreed on the detected license(s), that became
         the concluded license(s).
      
       - when there was disagreement between the two scanners (one detected a
         license but the other didn't, or they both detected different
         licenses) a manual inspection of the file occurred.
      
       - In most cases a manual inspection of the information in the file
         resulted in a clear resolution of the license that should apply (and
         which scanner probably needed to revisit its heuristics).
      
       - When it was not immediately clear, the license identifier was
         confirmed with lawyers working with the Linux Foundation.
      
       - If there was any question as to the appropriate license identifier,
         the file was flagged for further research and to be revisited later
         in time.
      
      In total, over 70 hours of logged manual review was done on the
      spreadsheet to determine the SPDX license identifiers to apply to the
      source files by Kate, Philippe, Thomas and, in some cases, confirmation
      by lawyers working with the Linux Foundation.
      
      Kate also obtained a third independent scan of the 4.13 code base from
      FOSSology, and compared selected files where the other two scanners
      disagreed against that SPDX file, to see if there was new insights.  The
      Windriver scanner is based on an older version of FOSSology in part, so
      they are related.
      
      Thomas did random spot checks in about 500 files from the spreadsheets
      for the uapi headers and agreed with SPDX license identifier in the
      files he inspected. For the non-uapi files Thomas did random spot checks
      in about 15000 files.
      
      In initial set of patches against 4.14-rc6, 3 files were found to have
      copy/paste license identifier errors, and have been fixed to reflect the
      correct identifier.
      
      Additionally Philippe spent 10 hours this week doing a detailed manual
      inspection and review of the 12,461 patched files from the initial patch
      version early this week with:
       - a full scancode scan run, collecting the matched texts, detected
         license ids and scores
       - reviewing anything where there was a license detected (about 500+
         files) to ensure that the applied SPDX license was correct
       - reviewing anything where there was no detection but the patch license
         was not GPL-2.0 WITH Linux-syscall-note to ensure that the applied
         SPDX license was correct
      
      This produced a worksheet with 20 files needing minor correction.  This
      worksheet was then exported into 3 different .csv files for the
      different types of files to be modified.
      
      These .csv files were then reviewed by Greg.  Thomas wrote a script to
      parse the csv files and add the proper SPDX tag to the file, in the
      format that the file expected.  This script was further refined by Greg
      based on the output to detect more types of files automatically and to
      distinguish between header and source .c files (which need different
      comment types.)  Finally Greg ran the script using the .csv files to
      generate the patches.
      Reviewed-by: NKate Stewart <kstewart@linuxfoundation.org>
      Reviewed-by: NPhilippe Ombredanne <pombredanne@nexb.com>
      Reviewed-by: NThomas Gleixner <tglx@linutronix.de>
      Signed-off-by: NGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      b2441318
  30. 19 10月, 2017 1 次提交
    • E
      fs, fscrypt: add an S_ENCRYPTED inode flag · 2ee6a576
      Eric Biggers 提交于
      Introduce a flag S_ENCRYPTED which can be set in ->i_flags to indicate
      that the inode is encrypted using the fscrypt (fs/crypto/) mechanism.
      
      Checking this flag will give the same information that
      inode->i_sb->s_cop->is_encrypted(inode) currently does, but will be more
      efficient.  This will be useful for adding higher-level helper functions
      for filesystems to use.  For example we'll be able to replace this:
      
      	if (ext4_encrypted_inode(inode)) {
      		ret = fscrypt_get_encryption_info(inode);
      		if (ret)
      			return ret;
      		if (!fscrypt_has_encryption_key(inode))
      			return -ENOKEY;
      	}
      
      with this:
      
      	ret = fscrypt_require_key(inode);
      	if (ret)
      		return ret;
      
      ... since we'll be able to retain the fast path for unencrypted files as
      a single flag check, using an inline function.  This wasn't possible
      before because we'd have had to frequently call through the
      ->i_sb->s_cop->is_encrypted function pointer, even when the encryption
      support was disabled or not being used.
      
      Note: we don't define S_ENCRYPTED to 0 if CONFIG_FS_ENCRYPTION is
      disabled because we want to continue to return an error if an encrypted
      file is accessed without encryption support, rather than pretending that
      it is unencrypted.
      Reviewed-by: NChao Yu <yuchao0@huawei.com>
      Acked-by: NDave Chinner <dchinner@redhat.com>
      Signed-off-by: NEric Biggers <ebiggers@google.com>
      Signed-off-by: NTheodore Ts'o <tytso@mit.edu>
      2ee6a576