diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 1a5e8e893d7529e805ee016f6852a35423e128bf..bad0b24cb77398aac29fd871f5bb62eda298a2f8 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -364,6 +364,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) else vma->vm_ops = &gfs2_vm_ops_private; + vma->vm_flags |= VM_CAN_INVALIDATE; + gfs2_glock_dq_uninit(&i_gh); return error; diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c index 404b7cc9f8c4887803d10437c91783a18e655fa0..d5a98cbfebdc8356bfb39fadf7f02c806a0f18eb 100644 --- a/fs/gfs2/ops_vm.c +++ b/fs/gfs2/ops_vm.c @@ -138,6 +138,8 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area, if (alloc_required) { error = alloc_page_backing(ip, result); if (error) { + if (area->vm_flags & VM_CAN_INVALIDATE) + unlock_page(result); page_cache_release(result); result = NULL; goto out; diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 70a69115500f1603b589fb3d65e72acc5eb771db..5416673418b88ecfa3e382d0a9d85d670f127553 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -123,6 +123,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) return -EFBIG; vma->vm_ops = &ncp_file_mmap; + vma->vm_flags |= VM_CAN_INVALIDATE; file_accessed(file); return 0; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index d79aa12137d205868bbc7d868132dd421203e9d0..904f39ff5340374944519d0529201dd74117a928 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -226,6 +226,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); out: vma->vm_ops = &ocfs2_file_vm_ops; + vma->vm_flags |= VM_CAN_INVALIDATE; return 0; } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index cbcd40c8c2a0738afcd99c4dcf2542a2bf8cb8b1..92b2f225712f97e501846a087f417575221c1828 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -310,6 +310,7 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = 
&xfs_file_vm_ops; + vma->vm_flags |= VM_CAN_INVALIDATE; #ifdef CONFIG_XFS_DMAPI if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI) diff --git a/include/linux/mm.h b/include/linux/mm.h index a5c451816fdca003350570aa3139e2f32fbe45f9..ca9536a348c85a025c08144443f7307019c85aa5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -168,6 +168,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ +#define VM_CAN_INVALIDATE 0x08000000 /* The mapping may be invalidated, + * eg. truncate or invalidate_inode_*. + * In this case, do_no_page must + * return with the page locked. + */ + #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif diff --git a/mm/filemap.c b/mm/filemap.c index 5d5449f3d41c83810e002d7fc41cf0cb55b88536..462cda58a18e0f1863d73637c2165e722ff3f531 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1325,9 +1325,10 @@ struct page *filemap_nopage(struct vm_area_struct *area, unsigned long size, pgoff; int did_readaround = 0, majmin = VM_FAULT_MINOR; + BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE)); + pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; -retry_all: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (pgoff >= size) goto outside_data_content; @@ -1349,7 +1350,7 @@ struct page *filemap_nopage(struct vm_area_struct *area, * Do we have something in the page cache already? 
*/ retry_find: - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, pgoff); if (!page) { unsigned long ra_pages; @@ -1383,7 +1384,7 @@ struct page *filemap_nopage(struct vm_area_struct *area, start = pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); } - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, pgoff); if (!page) goto no_cached_page; } @@ -1392,13 +1393,19 @@ struct page *filemap_nopage(struct vm_area_struct *area, ra->mmap_hit++; /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. + * We have a locked page in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error. */ - if (!PageUptodate(page)) + if (unlikely(!PageUptodate(page))) goto page_not_uptodate; -success: + /* Must recheck i_size under page lock */ + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(pgoff >= size)) { + unlock_page(page); + goto outside_data_content; + } + /* * Found the page and have a reference on it. */ @@ -1440,6 +1447,7 @@ struct page *filemap_nopage(struct vm_area_struct *area, return NOPAGE_SIGBUS; page_not_uptodate: + /* IO error path */ if (!did_readaround) { majmin = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); @@ -1451,37 +1459,15 @@ struct page *filemap_nopage(struct vm_area_struct *area, * because there really aren't any performance issues here * and we need to check for errors. */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Somebody else successfully read it in? 
*/ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } ClearPageError(page); error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); + page_cache_release(page); + + if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - } - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ + /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(file, ra); - page_cache_release(page); return NOPAGE_SIGBUS; } EXPORT_SYMBOL(filemap_nopage); @@ -1674,6 +1660,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; + vma->vm_flags |= VM_CAN_INVALIDATE; return 0; } diff --git a/mm/memory.c b/mm/memory.c index 9c6ff7fffdc8cf653d1e04e717a6b3a8bb8fe1c9..e6c99f6b56493ea5448787404d7a2c8738c5fc79 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1831,6 +1831,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long restart_addr; int need_break; + /* + * files that support invalidating or truncating portions of the + * file from under mmaped areas must set the VM_CAN_INVALIDATE flag, and + * have their .nopage function return the page locked. 
+ */ + BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE)); + again: restart_addr = vma->vm_truncate_count; if (is_restart_addr(restart_addr) && start_addr < restart_addr) { @@ -1959,17 +1966,8 @@ void unmap_mapping_range(struct address_space *mapping, spin_lock(&mapping->i_mmap_lock); - /* serialize i_size write against truncate_count write */ - smp_wmb(); - /* Protect against page faults, and endless unmapping loops */ + /* Protect against endless unmapping loops */ mapping->truncate_count++; - /* - * For archs where spin_lock has inclusive semantics like ia64 - * this smp_mb() will prevent to read pagetable contents - * before the truncate_count increment is visible to - * other cpus. - */ - smp_mb(); if (unlikely(is_restart_addr(mapping->truncate_count))) { if (mapping->truncate_count == 0) reset_vma_truncate_counts(mapping); @@ -2008,8 +2006,18 @@ int vmtruncate(struct inode * inode, loff_t offset) if (IS_SWAPFILE(inode)) goto out_busy; i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for efficiency + * so that truncate_inode_pages does fewer single-page unmaps. However + * after this first call, and before truncate_inode_pages finishes, + * it is possible for private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second unmap_mapping_range + * call must be made for correctness. 
+ */ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); goto out_truncate; do_expand: @@ -2049,6 +2057,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); + unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); @@ -2206,7 +2215,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); - lazy_mmu_prot_update(pte); unlock: pte_unmap_unlock(page_table, ptl); out: @@ -2297,10 +2305,8 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int write_access) { spinlock_t *ptl; - struct page *new_page; - struct address_space *mapping = NULL; + struct page *page, *nopage_page; pte_t entry; - unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; struct page *dirty_page = NULL; @@ -2308,74 +2314,53 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - sequence = mapping->truncate_count; - smp_rmb(); /* serializes i_size against truncate_count */ - } -retry: - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); - /* - * No smp_rmb is needed here as long as there's a full - * spin_lock/unlock sequence inside the ->nopage callback - * (for the pagecache lookup) that acts as an implicit - * smp_mb() and prevents the i_size read to happen - * after the next truncate_count read. 
- */ - + nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* no page was available -- either SIGBUS, OOM or REFAULT */ - if (unlikely(new_page == NOPAGE_SIGBUS)) + if (unlikely(nopage_page == NOPAGE_SIGBUS)) return VM_FAULT_SIGBUS; - else if (unlikely(new_page == NOPAGE_OOM)) + else if (unlikely(nopage_page == NOPAGE_OOM)) return VM_FAULT_OOM; - else if (unlikely(new_page == NOPAGE_REFAULT)) + else if (unlikely(nopage_page == NOPAGE_REFAULT)) return VM_FAULT_MINOR; + BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page)); + /* + * For consistency in subsequent calls, make the nopage_page always + * locked. + */ + if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE))) + lock_page(nopage_page); + /* * Should we do an early C-O-W break? */ + page = nopage_page; if (write_access) { if (!(vma->vm_flags & VM_SHARED)) { - struct page *page; - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address, vma); - page_cache_release(new_page); - new_page = page; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out_error; + } + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!page) { + ret = VM_FAULT_OOM; + goto out_error; + } + copy_user_highpage(page, nopage_page, address, vma); anon = 1; - } else { /* if the page will be shareable, see if the backing * address space wants to know that the page is about * to become writable */ if (vma->vm_ops->page_mkwrite && - vma->vm_ops->page_mkwrite(vma, new_page) < 0 - ) { - page_cache_release(new_page); - return VM_FAULT_SIGBUS; + vma->vm_ops->page_mkwrite(vma, page) < 0) { + ret = VM_FAULT_SIGBUS; + goto out_error; } } } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - /* - * For a file-backed vma, someone could have truncated or otherwise - * invalidated this page. If unmap_mapping_range got called, - * retry getting the page. 
- */ - if (mapping && unlikely(sequence != mapping->truncate_count)) { - pte_unmap_unlock(page_table, ptl); - page_cache_release(new_page); - cond_resched(); - sequence = mapping->truncate_count; - smp_rmb(); - goto retry; - } /* * This silly early PAGE_DIRTY setting removes a race @@ -2388,43 +2373,51 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * handle that later. */ /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); + if (likely(pte_none(*page_table))) { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(new_page); - page_add_new_anon_rmap(new_page, vma, address); + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); - page_add_file_rmap(new_page); + page_add_file_rmap(page); if (write_access) { - dirty_page = new_page; + dirty_page = page; get_page(dirty_page); } } + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); } else { - /* One of our sibling threads was faster, back out. 
*/ - page_cache_release(new_page); - goto unlock; + if (anon) + page_cache_release(page); + else + anon = 1; /* not anon, but release nopage_page */ } - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); -unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { + +out: + unlock_page(nopage_page); + if (anon) + page_cache_release(nopage_page); + else if (dirty_page) { set_page_dirty_balance(dirty_page); put_page(dirty_page); } + return ret; -oom: - page_cache_release(new_page); - return VM_FAULT_OOM; + +out_error: + anon = 1; /* release nopage_page */ + goto out; } /* diff --git a/mm/shmem.c b/mm/shmem.c index 96fa79fb6ad37d483b816ec92804ecd2dfe0a825..5808fadd39445ff4b49917afcb3aa51f717d0f10 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -83,6 +83,7 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_NOPAGE, /* same as SGP_CACHE, return with page locked */ }; static int shmem_getpage(struct inode *inode, unsigned long idx, @@ -1289,8 +1290,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, } done: if (*pagep != filepage) { - unlock_page(filepage); *pagep = filepage; + if (sgp != SGP_NOPAGE) + unlock_page(filepage); + } return 0; @@ -1310,13 +1313,15 @@ static struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long idx; int error; + BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE)); + idx = (address - vma->vm_start) >> PAGE_SHIFT; idx += vma->vm_pgoff; idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return NOPAGE_SIGBUS; - error = shmem_getpage(inode, idx, &page, SGP_CACHE, type); + error = shmem_getpage(inode, idx, &page, SGP_NOPAGE, type); if (error) return (error == -ENOMEM)?
NOPAGE_OOM: NOPAGE_SIGBUS; @@ -1414,6 +1419,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_INVALIDATE; return 0; } @@ -2596,5 +2602,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_INVALIDATE; return 0; } diff --git a/mm/truncate.c b/mm/truncate.c index f47e46d1be3b1d9f42b1e83057bf696a3469df27..aed85f0b707f3ca4665835ced9b009ad07dfb836 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -192,6 +192,11 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page_index<index<index > next) next = page->index; next++; @@ -405,7 +415,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, break; } wait_on_page_writeback(page); - while (page_mapped(page)) { + if (page_mapped(page)) { if (!did_range_unmap) { /* * Zap the rest of the file in one hit. @@ -425,6 +435,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, PAGE_CACHE_SIZE, 0); } } + BUG_ON(page_mapped(page)); ret = do_launder_page(mapping, page); if (ret == 0 && !invalidate_complete_page2(mapping, page)) ret = -EIO;