diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 72acbb29d2cc68ca50161af90d3cbe042c9852aa..65616941a383a3304bbccdd50a1858b112195534 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -43,6 +43,16 @@ static DEFINE_SPINLOCK(hugetlb_lock); /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. + * + * The region data structures are protected by a combination of the mmap_sem + * and the hugetlb_instantion_mutex. To access or modify a region the caller + * must either hold the mmap_sem for write, or the mmap_sem for read and + * the hugetlb_instantiation mutex: + * + * down_write(&mm->mmap_sem); + * or + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); */ struct file_region { struct list_head link; @@ -165,6 +175,30 @@ static long region_truncate(struct list_head *head, long end) return chg; } +static long region_count(struct list_head *head, long f, long t) +{ + struct file_region *rg; + long chg = 0; + + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + int seg_from; + int seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + + return chg; +} + /* * Convert the address within this vma to the page offset within * the mapping, in base page units. @@ -187,9 +221,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); } -#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) -#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) +/* + * Flags for MAP_PRIVATE reservations. These are stored in the bottom + * bits of the reservation map pointer, which are always clear due to + * alignment. + */ +#define HPAGE_RESV_OWNER (1UL << 0) +#define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) + /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() @@ -199,6 +239,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. + * + * The private mapping reservation is represented in a subtly different + * manner to a shared mapping. A shared mapping has a region map associated + * with the underlying file, this region map represents the backing file + * pages which have ever had a reservation assigned which this persists even + * after the page is instantiated. A private mapping has a region map + * associated with the original mmap which is attached to all VMAs which + * reference it, this region map represents those offsets which have consumed + * reservation ie. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { @@ -211,22 +260,48 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) +struct resv_map { + struct kref refs; + struct list_head regions; +}; + +struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + if (!resv_map) + return NULL; + + kref_init(&resv_map->refs); + INIT_LIST_HEAD(&resv_map->regions); + + return resv_map; +} + +void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + + /* Clear out any active regions before we release the map. */ + region_truncate(&resv_map->regions, 0); + kfree(resv_map); +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return get_vma_private_data(vma) & ~HPAGE_RESV_MASK; + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); return 0; } -static void set_vma_resv_huge_pages(struct vm_area_struct *vma, - unsigned long reserve) +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - set_vma_private_data(vma, - (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve); + set_vma_private_data(vma, (get_vma_private_data(vma) & + HPAGE_RESV_MASK) | (unsigned long)map); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) @@ -253,19 +328,12 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ resv_huge_pages--; - } else { + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Only the process that called mmap() has reserves for * private mappings. */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - unsigned long flags, reserve; - resv_huge_pages--; - flags = (unsigned long)vma->vm_private_data & - HPAGE_RESV_MASK; - reserve = (unsigned long)vma->vm_private_data - 1; - vma->vm_private_data = (void *)(reserve | flags); - } + resv_huge_pages--; } } @@ -282,7 +350,7 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) return 0; - if (!vma_resv_huge_pages(vma)) + if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 0; return 1; } @@ -742,12 +810,19 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) return region_chg(&inode->i_mapping->private_list, idx, idx + 1); - } else { - if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; - } + } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + return 1; - return 0; + } else { + int err; + pgoff_t idx = vma_pagecache_offset(vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + err = region_chg(&reservations->regions, idx, idx + 1); + if (err < 0) + return err; + return 0; + } } static void vma_commit_reservation(struct vm_area_struct *vma, unsigned long addr) @@ -758,6 +833,13 @@ static void vma_commit_reservation(struct vm_area_struct *vma, if (vma->vm_flags & VM_SHARED) { pgoff_t idx = vma_pagecache_offset(vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); + + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + pgoff_t idx = vma_pagecache_offset(vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + /* Mark this page used in the map. */ + region_add(&reservations->regions, idx, idx + 1); } } @@ -1047,11 +1129,41 @@ static int hugetlb_acct_memory(long delta) return ret; } +static void hugetlb_vm_op_open(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + /* + * This new VMA should share its siblings reservation map if present. + * The VMA will only ever have a valid reservation map pointer where + * it is being copied for another still existing VMA. As that VMA + * has a reference to the reservation map it cannot dissappear until + * after this open call completes. It is therefore safe to take a + * new reference here without additional locking. + */ + if (reservations) + kref_get(&reservations->refs); +} + static void hugetlb_vm_op_close(struct vm_area_struct *vma) { - unsigned long reserve = vma_resv_huge_pages(vma); - if (reserve) - hugetlb_acct_memory(-reserve); + struct resv_map *reservations = vma_resv_map(vma); + unsigned long reserve; + unsigned long start; + unsigned long end; + + if (reservations) { + start = vma_pagecache_offset(vma, vma->vm_start); + end = vma_pagecache_offset(vma, vma->vm_end); + + reserve = (end - start) - + region_count(&reservations->regions, start, end); + + kref_put(&reservations->refs, resv_map_release); + + if (reserve) + hugetlb_acct_memory(-reserve); + } } /* @@ -1068,6 +1180,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, }; @@ -1617,8 +1730,13 @@ int hugetlb_reserve_pages(struct inode *inode, if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + chg = to - from; - set_vma_resv_huge_pages(vma, chg); + + set_vma_resv_map(vma, resv_map); set_vma_resv_flags(vma, HPAGE_RESV_OWNER); }