Patch from Dave McCracken

This patch should fix the remap_file_pages problem for object-based rmap.
I added a function that converts an object-based page to a pte_chain-based
page.  It's a little tricky because it has to preallocate all the pte_chains
it needs before taking the lock, since the page can't be left in an
intermediate state outside the lock.  Fortunately I know how many pte_chains
are needed in advance.

I also condensed some of the common code and added comments.  One side
effect is that I removed taking the page_table_lock during the pte lookups.
I've convinced myself that anyone else trying to touch those areas will
block on the locks I already hold.

It survives the abuse tests I've thrown at it.

 include/linux/rmap-locking.h |    2 
 mm/fremap.c                  |   15 +-
 mm/rmap.c                    |  266 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 224 insertions(+), 59 deletions(-)

diff -puN mm/fremap.c~objrmap-nonlinear-fixes mm/fremap.c
--- 25/mm/fremap.c~objrmap-nonlinear-fixes	2003-03-13 02:39:13.000000000 -0800
+++ 25-akpm/mm/fremap.c	2003-03-13 02:39:13.000000000 -0800
@@ -62,6 +62,16 @@ int install_page(struct mm_struct *mm, s
 
 	pte_chain = pte_chain_alloc(GFP_KERNEL);
 	if (!pte_chain)
 		goto err;
+
+	/*
+	 * Convert this page to anon for objrmap if it's nonlinear
+	 */
+	pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgidx += vma->vm_pgoff;
+	pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+	if (!PageAnon(page) && (page->index != pgidx))
+		page_convert_anon(page);
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
@@ -80,11 +90,6 @@ int install_page(struct mm_struct *mm, s
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, prot);
 	set_pte(pte, entry);
-	pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
-	pgidx += vma->vm_pgoff;
-	pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
-	if (page->index != pgidx)
-		SetPageAnon(page);
 	pte_chain = page_add_rmap(page, pte, pte_chain);
 	pte_unmap(pte);
 	if (flush)
diff -puN mm/rmap.c~objrmap-nonlinear-fixes mm/rmap.c
--- 25/mm/rmap.c~objrmap-nonlinear-fixes	2003-03-13 02:39:13.000000000 -0800
+++ 25-akpm/mm/rmap.c	2003-03-13 02:40:08.000000000 -0800
@@ -76,18 +76,21 @@ kmem_cache_t *pte_chain_cache;
  **/
 
 /**
- * page_referenced - test if the page was referenced
- * @page: the page to test
+ * find_pte - Find a pte pointer given a vma and a struct page.
+ * @vma: the vma to search
+ * @page: the page to find
  *
- * Quick test_and_clear_referenced for all mappings to a page,
- * returns the number of processes which referenced the page.
- * Caller needs to hold the pte_chain_lock.
+ * Determine if this page is mapped in this vma.  If it is, map and return
+ * the pte pointer associated with it.  Return NULL if the page is not
+ * mapped in this vma for any reason.
  *
- * If the page has a single-entry pte_chain, collapse that back to a PageDirect
- * representation. This way, it's only done under memory pressure.
+ * This is strictly an internal helper function for the object-based rmap
+ * functions.
+ *
+ * It is the caller's responsibility to unmap the pte if it is returned.
  */
-static inline int
-page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+static inline pte_t *
+find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -95,7 +98,6 @@ page_referenced_obj_one(struct vm_area_s
 	pte_t *pte;
 	unsigned long loffset;
 	unsigned long address;
-	int referenced = 0;
 
 	loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
 	if (loffset < vma->vm_pgoff)
@@ -106,17 +108,13 @@ page_referenced_obj_one(struct vm_area_s
 	if (address >= vma->vm_end)
 		goto out;
 
-	if (!spin_trylock(&mm->page_table_lock)) {
-		referenced = 1;
-		goto out;
-	}
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
-		goto out_unlock;
+		goto out;
 
 	pmd = pmd_offset(pgd, address);
 	if (!pmd_present(*pmd))
-		goto out_unlock;
+		goto out;
 
 	pte = pte_offset_map(pmd, address);
 	if (!pte_present(*pte))
@@ -125,18 +123,57 @@ page_referenced_obj_one(struct vm_area_s
 	if (page_to_pfn(page) != pte_pfn(*pte))
 		goto out_unmap;
 
-	if (ptep_test_and_clear_young(pte))
-		referenced++;
+	if (addr)
+		*addr = address;
+
+	return pte;
+
 out_unmap:
 	pte_unmap(pte);
+out:
+	return NULL;
+}
 
-out_unlock:
-	spin_unlock(&mm->page_table_lock);
+/**
+ * page_referenced_obj_one - referenced check for object-based rmap
+ * @vma: the vma to look in.
+ * @page: the page we're working on.
+ *
+ * Find a pte entry for a page/vma pair, then check and clear the referenced
+ * bit.
+ *
+ * This is strictly a helper function for page_referenced_obj.
+ */
+static int
+page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+	pte_t *pte;
+	int referenced = 0;
+
+	pte = find_pte(vma, page, NULL);
+	if (pte) {
+		if (ptep_test_and_clear_young(pte))
+			referenced++;
+		pte_unmap(pte);
+	}
 
-out:
 	return referenced;
 }
 
+/**
+ * page_referenced_obj - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag.  This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds.  It returns the number
+ * of references it found.
+ *
+ * This function is only called from page_referenced for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * assume a reference count of 1.
+ */
 static int
 page_referenced_obj(struct page *page)
 {
@@ -167,6 +204,17 @@ page_referenced_obj(struct page *page)
 	return referenced;
 }
 
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of processes which referenced the page.
+ * Caller needs to hold the pte_chain_lock.
+ *
+ * If the page has a single-entry pte_chain, collapse that back to a PageDirect
+ * representation. This way, it's only done under memory pressure.
+ */
 int page_referenced(struct page * page)
 {
 	struct pte_chain * pc;
@@ -245,6 +293,10 @@ page_add_rmap(struct page *page, pte_t *
 
 	pte_chain_lock(page);
 
+	/*
+	 * If this is an object-based page, just count it.  We can
+	 * find the mappings by walking the object vma chain for that object.
+	 */
 	if (!PageAnon(page)) {
 		if (!page->mapping)
 			BUG();
@@ -345,6 +397,10 @@ void page_remove_rmap(struct page * page
 
 	pte_chain_lock(page);
 
+	/*
+	 * If this is an object-based page, just uncount it.  We can
+	 * find the mappings by walking the object vma chain for that object.
+	 */
 	if (!PageAnon(page)) {
 		if (!page->mapping)
 			BUG();
@@ -422,46 +478,27 @@ out:
 	return;
 }
 
+/**
+ * try_to_unmap_obj_one - unmap a page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Determine whether a page is mapped in a given vma and unmap it if it's found.
+ *
+ * This function is strictly a helper function for try_to_unmap_obj.
+ */
 static inline int
 try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	pgd_t *pgd;
-	pmd_t *pmd;
+	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
-	unsigned long loffset;
-	unsigned long address;
 	int ret = SWAP_SUCCESS;
 
-	loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
-	if (loffset < vma->vm_pgoff)
-		goto out;
-
-	address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
-
-	if (address >= vma->vm_end)
+	pte = find_pte(vma, page, &address);
+	if (!pte)
 		goto out;
 
-	if (!spin_trylock(&mm->page_table_lock)) {
-		ret = SWAP_AGAIN;
-		goto out;
-	}
-	pgd = pgd_offset(mm, address);
-	if (!pgd_present(*pgd))
-		goto out_unlock;
-
-	pmd = pmd_offset(pgd, address);
-	if (!pmd_present(*pmd))
-		goto out_unlock;
-
-	pte = pte_offset_map(pmd, address);
-	if (!pte_present(*pte))
-		goto out_unmap;
-
-	if (page_to_pfn(page) != pte_pfn(*pte))
-		goto out_unmap;
-
 	if (vma->vm_flags & VM_LOCKED) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
@@ -484,13 +521,22 @@ try_to_unmap_obj_one(struct vm_area_stru
 out_unmap:
 	pte_unmap(pte);
 
-out_unlock:
-	spin_unlock(&mm->page_table_lock);
-
 out:
 	return ret;
 }
 
+/**
+ * try_to_unmap_obj - unmap a page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * return a temporary error.
+ */
 static int
 try_to_unmap_obj(struct page *page)
 {
@@ -648,6 +694,10 @@ int try_to_unmap(struct page * page)
 	if (!page->mapping)
 		BUG();
 
+	/*
+	 * If it's an object-based page, use the object vma chain to find all
+	 * the mappings.
+	 */
 	if (!PageAnon(page)) {
 		ret = try_to_unmap_obj(page);
 		goto out;
@@ -717,6 +767,114 @@ out:
 }
 
 /**
+ * page_convert_anon - Convert an object-based mapped page to pte_chain-based.
+ * @page: the page to convert
+ *
+ * Find all the mappings for an object-based page and convert them
+ * to 'anonymous', i.e. create a pte_chain and store all the pte pointers there.
+ *
+ * This function takes the address_space->i_shared_sem and the pte_chain_lock
+ * for the page.  It jumps through some hoops to preallocate the correct number
+ * of pte_chain structures to ensure that it can complete without releasing
+ * the lock.
+ */
+void page_convert_anon(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma;
+	struct pte_chain *pte_chain = NULL, *ptec;
+	pte_t *pte;
+	pte_addr_t pte_paddr;
+	int mapcount;
+	int index = 0;
+
+	if (PageAnon(page))
+		goto out;
+
+retry:
+	/*
+	 * Preallocate the pte_chains outside the lock.
+	 */
+	mapcount = page->pte.mapcount;
+	if (mapcount > 1) {
+		for (; index < mapcount; index += NRPTE) {
+			ptec = pte_chain_alloc(GFP_KERNEL);
+			ptec->next = pte_chain;
+			pte_chain = ptec;
+		}
+	}
+	down(&mapping->i_shared_sem);
+	pte_chain_lock(page);
+
+	/*
+	 * Check to make sure the number of mappings didn't change.  If they
+	 * did, either retry or free enough pte_chains to compensate.
+	 */
+	if (mapcount < page->pte.mapcount) {
+		pte_chain_unlock(page);
+		goto retry;
+	} else if ((mapcount > page->pte.mapcount) && (mapcount > 1)) {
+		mapcount = page->pte.mapcount;
+		while ((index - NRPTE) > mapcount) {
+			index -= NRPTE;
+			ptec = pte_chain->next;
+			pte_chain_free(pte_chain);
+			pte_chain = ptec;
+		}
+		if (mapcount <= 1)
+			pte_chain_free(pte_chain);
+	}
+	SetPageAnon(page);
+
+	if (mapcount == 0)
+		goto out;
+	else if (mapcount == 1) {
+		SetPageDirect(page);
+		page->pte.direct = 0;
+	} else
+		page->pte.chain = pte_chain;
+
+	index = NRPTE-1;
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			pte_paddr = ptep_to_paddr(pte);
+			pte_unmap(pte);
+			if (PageDirect(page)) {
+				page->pte.direct = pte_paddr;
+				goto out_unlock;
+			}
+			pte_chain->ptes[index] = pte_paddr;
+			if (!--index) {
+				pte_chain = pte_chain->next;
+				index = NRPTE-1;
+			}
+		}
+	}
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			pte_paddr = ptep_to_paddr(pte);
+			pte_unmap(pte);
+			if (PageDirect(page)) {
+				page->pte.direct = pte_paddr;
+				goto out_unlock;
+			}
+			pte_chain->ptes[index] = pte_paddr;
+			if (!--index) {
+				pte_chain = pte_chain->next;
+				index = NRPTE-1;
+			}
+		}
+	}
+out_unlock:
+	pte_chain_unlock(page);
+	up(&mapping->i_shared_sem);
+out:
+	return;
+}
+
+/**
  ** No more VM stuff below this comment, only pte_chain helper
  ** functions.
  **/
diff -puN include/linux/rmap-locking.h~objrmap-nonlinear-fixes include/linux/rmap-locking.h
--- 25/include/linux/rmap-locking.h~objrmap-nonlinear-fixes	2003-03-13 02:39:50.000000000 -0800
+++ 25-akpm/include/linux/rmap-locking.h	2003-03-13 02:40:18.000000000 -0800
@@ -45,3 +45,5 @@ static inline void pte_chain_free(struct
 	if (pte_chain)
 		__pte_chain_free(pte_chain);
 }
+
+void page_convert_anon(struct page *page);
_
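For anyone who wants to see the situation this patch handles, here is a
minimal userspace sketch (not part of the patch; the scratch file name and
page counts are arbitrary, and it assumes a kernel and libc that provide
remap_file_pages()).  It builds a linear MAP_SHARED file mapping and then
rearranges one page, so that page->index no longer matches the page's linear
position in the vma, which is exactly the case install_page() now hands to
page_convert_anon():

/*
 * Minimal sketch of a nonlinear mapping (not part of the patch).
 * Error handling is kept to a minimum.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("scratch.dat", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || ftruncate(fd, 4 * psz) < 0) {
		perror("setup");
		return 1;
	}

	/* Linear mapping of the first four pages of the file. */
	char *map = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Rearrange the mapping so the page at file page offset 3 appears
	 * at the start of the window.  The page's index in the file no
	 * longer matches its linear position in the vma, so the kernel
	 * must track it per-pte rather than through the object.
	 */
	if (remap_file_pages(map, psz, 0, 3, 0) < 0) {
		perror("remap_file_pages");
		return 1;
	}

	strcpy(map, "written through the nonlinear page");
	munmap(map, 4 * psz);
	close(fd);
	return 0;
}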