This is guaranteed to be nonfunctional. This is basically a first attempt to crowbar in Hugh's folio API and otherwise totally blow away the rather unsuccessful and non-performant homebrew attempt at the pte vectoring implementation. I'll be debugging this for a while since luserspace won't be able to run at all with this until it's fixed up properly for rmap and highpte. This thing has had a long history of exploding on contact with changes to surrounding code; for once, the explosion is actually a result of an in-progress functional improvement instead of pure merge crap. diff -prauN pgcl-2.6.0-test5-bk3-1/arch/i386/mm/highmem.c pgcl-2.6.0-test5-bk3-2/arch/i386/mm/highmem.c --- pgcl-2.6.0-test5-bk3-1/arch/i386/mm/highmem.c 2003-09-14 23:49:19.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/arch/i386/mm/highmem.c 2003-09-16 20:50:50.000000000 -0700 @@ -55,15 +55,18 @@ void *kmap_atomic(struct page *page, enu #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(pte[k])); #endif + if (pte_pfn(pte[k]) == pfn + k) + continue; + set_pte(&pte[k], pfn_pte(pfn + k, kmap_prot)); __flush_tlb_one(addr); } return (void *)vaddr; } +#ifdef CONFIG_DEBUG_HIGHMEM void kunmap_atomic(void *kvaddr, enum km_type type) { -#ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long)kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); unsigned long lo, hi; @@ -93,10 +96,10 @@ void kunmap_atomic(void *kvaddr, enum km pte_clear(&pte[k]); __flush_tlb_one(vaddr); } -#endif dec_preempt_count(); } +#endif struct page *kmap_atomic_to_page(void *ptr) { @@ -119,3 +122,39 @@ struct page *kmap_atomic_to_page(void *p return pte_page(*pte); } +void kmap_atomic_sg(pte_t *ptes[], pte_addr_t paddrs[], enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int k; + + inc_preempt_count(); + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_END) + PAGE_SIZE*idx; + + pgd = pgd_offset_k(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + for (k = 0; k < PAGE_MMUCOUNT; ++k, vaddr += MMUPAGE_SIZE) { + unsigned long pfn = paddrs[k]/MMUPAGE_SIZE; + + if (!paddrs[k]) { + ptes[k] = NULL; + continue; + } + ptes[k] = (pte_t *)(vaddr + ((u32)paddrs[k] & ~MMUPAGE_MASK)); + + if (pte_pfn(pte[k]) != pfn) { + set_pte(&pte[k], pfn_pte(pfn, kmap_prot)); + __flush_tlb_one(vaddr); + } + } +} + +void kunmap_atomic_sg(pte_t *ptes[], enum km_type type) +{ + dec_preempt_count(); +} diff -prauN pgcl-2.6.0-test5-bk3-1/include/asm-i386/highmem.h pgcl-2.6.0-test5-bk3-2/include/asm-i386/highmem.h --- pgcl-2.6.0-test5-bk3-1/include/asm-i386/highmem.h 2003-09-14 23:49:20.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/include/asm-i386/highmem.h 2003-09-16 20:50:56.000000000 -0700 @@ -40,7 +40,25 @@ void FASTCALL(kunmap_high(struct page *p void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); +void kmap_atomic_sg(pte_t *[], pte_addr_t [], enum km_type); + +#ifndef CONFIG_DEBUG_HIGHMEM +static inline void kunmap_atomic_sg(pte_t *[], enum km_type) +{ + dec_preempt_count(); +} + +static inline void kunmap_atomic(void *kvaddr, enum km_type) +{ + dec_preempt_count(); +} +#else +void kunmap_atomic_sg(pte_t *[], enum km_type); +void kunmap_atomic(void *, enum km_type); +#endif + +void *kmap_atomic_pfns(unsigned long [], enum km_type); +void kunmap_atomic_pfns(unsigned long [], enum km_type); struct page 
*kmap_atomic_to_page(void *ptr); #endif /* __KERNEL__ */ diff -prauN pgcl-2.6.0-test5-bk3-1/include/asm-i386/kmap_types.h pgcl-2.6.0-test5-bk3-2/include/asm-i386/kmap_types.h --- pgcl-2.6.0-test5-bk3-1/include/asm-i386/kmap_types.h 2003-09-08 12:50:18.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/include/asm-i386/kmap_types.h 2003-09-16 18:57:25.000000000 -0700 @@ -24,7 +24,8 @@ D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(14) KM_FOLIO, +D(15) KM_TYPE_NR }; #undef D diff -prauN pgcl-2.6.0-test5-bk3-1/include/asm-i386/mach-numaq/mach_apic.h pgcl-2.6.0-test5-bk3-2/include/asm-i386/mach-numaq/mach_apic.h --- pgcl-2.6.0-test5-bk3-1/include/asm-i386/mach-numaq/mach_apic.h 2003-09-08 12:50:02.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/include/asm-i386/mach-numaq/mach_apic.h 2003-09-15 03:10:00.000000000 -0700 @@ -37,6 +37,7 @@ static inline void init_apic_ldr(void) static inline void clustered_apic_check(void) { + nr_ioapics = min(2, nr_ioapics); printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "NUMA-Q", nr_ioapics); } diff -prauN pgcl-2.6.0-test5-bk3-1/include/asm-i386/pgtable.h pgcl-2.6.0-test5-bk3-2/include/asm-i386/pgtable.h --- pgcl-2.6.0-test5-bk3-1/include/asm-i386/pgtable.h 2003-09-14 23:49:20.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/include/asm-i386/pgtable.h 2003-09-16 19:10:31.000000000 -0700 @@ -320,6 +320,10 @@ static inline pte_t pte_modify(pte_t pte #define pte_unmap_nested(pte) do { } while (0) #endif +#define pte_offset_phys(pmd, addr) \ +( \ + (pte_addr_t)pmd_val(*(pmd)) + pte_index(addr)*sizeof(pte_t) \ +) #if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) typedef u32 pte_addr_t; #endif diff -prauN pgcl-2.6.0-test5-bk3-1/include/linux/folio.h pgcl-2.6.0-test5-bk3-2/include/linux/folio.h --- pgcl-2.6.0-test5-bk3-1/include/linux/folio.h 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-2/include/linux/folio.h 2003-09-16 20:06:52.000000000 -0700 @@ -0,0 +1,478 @@ +#ifndef _LINUX_FOLIO_H +#define _LINUX_FOLIO_H + +/* + * include/linux/folio.h by Hugh Dickins hugh@veritas.com 31may01 + * + * This header file is intended for inclusion in Linux kernel source + * mm/memory.c alone. It sequesters the "folio" functions for use by + * its fault handlers do_no_page(), do_anonymous_page(), do_swap_page() + * and do_wp_page(). On a standard system, these "folio" functions are + * trivial; but on a system with "large pages" i.e. PAGE_MMUSHIFT != 0, + * they manage the awkwardness of presenting small MMUPAGE_SIZE pages + * to user programs, from a kernel pool of large PAGE_SIZE pages. + * Shared file mappings present little problem, but without this folio + * treatment, private mappings might quickly degenerate into needing + * one PAGE_SIZE page to support each MMUPAGE_SIZE mapping. + */ + +#if PAGE_MMUSHIFT + +/* + * Test whether pte2 indicates the same page as pte1. + */ +static inline int pte_match(pte_t *pte1, pte_t *pte2) +{ + if (pte_none(*pte1)) + return pte_none(*pte2); + if (pte_present(*pte1)) { + if (!pte_present(*pte2)) + return 0; + return pte_page(*pte2) == pte_page(*pte1); + } + if (pte_none(*pte2) || pte_present(*pte2)) + return 0; + return pte_to_swp_entry(*pte2).val == pte_to_swp_entry(*pte1).val; +} + +/* + * Test whether nearby vma2 could ever share a private page with vma1. 
+ */ +static inline int vma_neighbourly(struct vm_area_struct *vma1, + struct vm_area_struct *vma2) +{ + if ((vma1->vm_flags | vma2->vm_flags) & VM_MAYSHARE) + return 0; + if ((((vma1->vm_start - vma2->vm_start) >> MMUPAGE_SHIFT) - + (vma1->vm_pgoff - vma2->vm_pgoff)) & (PAGE_MMUCOUNT-1)) + return 0; + return 1; +} + +#define NOPTE (~((pte_addr_t)0)) + +/* + * Prepare folio of page table pointers for the do_ fault handlers. + */ +static int prepare_folio(pte_addr_t folio[], struct vm_area_struct *vma, + unsigned long address, pte_addr_t ptep, int wide) +{ + struct vm_area_struct *vmp; + unsigned long suboffset; + unsigned long base, addr; + int subnr, ptenr; + int j, limit; + pgd_t *pgd; + pmd_t *pmd; + int reprep = 0; + + suboffset = vma_suboffset(vma, address); + base = (address - suboffset) & MMUPAGE_MASK; + subnr = suboffset >> MMUPAGE_SHIFT; + ptenr = (address & ~PMD_MASK) >> MMUPAGE_SHIFT; + + /* First approximation: set full vector of probable pteps + */ + ptep -= subnr*sizeof(pte_t); + for (j = 0; j < PAGE_MMUCOUNT; j++) + folio[j] = ptep + j*sizeof(pte_t); + j = 0; + + /* Second approximation: wipe pteps which don't belong to vma; + * but if wide, include neighbouring vmas perhaps sharing page. + */ + addr = base; + if (addr > TASK_SIZE) { /* wrapped */ + for (; addr > TASK_SIZE; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + } + if (addr < vma->vm_start) { + if (wide) { + for (vmp = find_vma(vma->vm_mm, addr); + vmp != vma; vmp = vmp->vm_next) { + for (; addr < vmp->vm_start; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + if (vma_neighbourly(vma, vmp)) { + j += (vmp->vm_end - addr) >> MMUPAGE_SHIFT; + addr = vmp->vm_end; + } + } + } + for (; addr < vma->vm_start; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + } + if (vma->vm_end < base + PAGE_SIZE) { + j = (vma->vm_end - base) >> MMUPAGE_SHIFT; + if (wide) { + addr = vma->vm_end; + for (vmp = vma->vm_next; vmp && + vmp->vm_start < base + PAGE_SIZE; vmp = vmp->vm_next) { + for (; addr < vmp->vm_start; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + if (vma_neighbourly(vma, vmp)) { + j += (vmp->vm_end - addr) >> MMUPAGE_SHIFT; + addr = vmp->vm_end; + } + } + } + for (; j < PAGE_MMUCOUNT; j++) + folio[j] = NOPTE; + } + + /* Third approximation: fix pteps to page table below or above. 
+ */ + if (subnr > ptenr) { + limit = subnr - ptenr; + for (j = 0; folio[j] == NOPTE; j++) + ; + if (j < limit) { + ptep = NOPTE; + pgd = pgd_offset(vma->vm_mm, base); + if (!pgd_none(*pgd) && !pgd_bad(*pgd)) { + pmd = pmd_offset(pgd, base); + if (!pmd_none(*pmd) && !pmd_bad(*pmd)) + ptep = pte_offset_phys(pmd, base); + } + if (ptep != NOPTE) { + for (; j < limit; j++) { + if (folio[j] != NOPTE) + folio[j] = ptep + j*sizeof(pte_t); + } + } else { + for (; j < limit; j++) + folio[j] = NOPTE; + reprep = 1; + } + } + } + if (ptenr > subnr + PTRS_PER_PTE - PAGE_MMUCOUNT) { + j = subnr + PTRS_PER_PTE - ptenr; + for (limit = PAGE_MMUCOUNT; folio[limit-1] == NOPTE; limit--) + ; + if (j < limit) { + ptep = NOPTE; + base += PAGE_SIZE; + pgd = pgd_offset(vma->vm_mm, base); + if (!pgd_none(*pgd) && !pgd_bad(*pgd)) { + pmd = pmd_offset(pgd, base); + if (!pmd_none(*pmd) && !pmd_bad(*pmd)) + ptep = pte_offset_phys(pmd, base); + } + if (ptep) { + ptep -= PAGE_MMUCOUNT*sizeof(pte_t); + for (; j < limit; j++) { + if (folio[j] != NOPTE) + folio[j] = ptep + j*sizeof(pte_t); + } + } else { + for (; j < limit; j++) + folio[j] = NOPTE; + reprep = 1; + } + } + } + return reprep; /* needs recall if page_table_lock dropped */ +} + +/* + * Check if the wide folio already has a private page allocated to it. + */ +static struct page *private_folio_page(pte_addr_t paddrs[], struct page *swap_page) +{ + pte_t *folio[PAGE_MMUCOUNT]; + unsigned long pfn; + struct page *page; + swp_entry_t entry; + pte_t swap_pte; + int fcount, pcount, scount, tcount; + int i, j; + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + for (j = PAGE_MMUCOUNT - 1; !folio[j]; j--) + ; + fcount = j + 1; + /* + * The easiest way to handle the do_swap_page() case is + * to make up one extra element on the end of the folio: + * typically all the folio entries will be swapped out, + * and we need one present page to make sense of them. + */ + if (swap_page) { + swap_pte = mk_pte(swap_page, PAGE_KERNEL); + folio[fcount] = &swap_pte; + fcount++; + } + + j = 0; + while (j < fcount) { + if (!folio[j] || !pte_present(*folio[j])) { + j++; + continue; + } + tcount = 1; + pfn = pte_pfn(*folio[j]); + page = pfn_valid(pfn) ? 
pfn_to_page(pfn) : NULL; + while (++j < fcount) { + if (!folio[j] || !pte_present(*folio[j])) + continue; + if (page && pfn_valid(pte_pfn(*folio[j])) && pte_page(*folio[j]) == page) + break; + tcount++; + } + if (!pfn_valid(pfn) || PageReserved(page)) + continue; + if (PageSwapCache(page)) { + if (page != swap_page) { + if (TestSetPageLocked(page)) + continue; + if (!PageSwapCache(page)) { + unlock_page(page); + continue; + } + } + entry.val = page->index; + pcount = page_count(page) - 1; /* omit swap cache */ + if (PagePrivate(page)) + pcount--; + scount = swap_count(page) - 1; /* omit swap cache */ + if (page != swap_page) + unlock_page(page); + if (pcount + scount > fcount) + continue; + } else { + if (page->mapping) + continue; + pcount = page_count(page); + if (PagePrivate(page)) + pcount--; + scount = 0; + } + pcount -= tcount; + if (j + pcount > fcount) + continue; + for (i = j + 1; pcount && i < fcount; i++) { + if (!folio[i] || !pte_present(*folio[i])) + continue; + if (pte_page(*folio[i]) == page) + pcount--; + } + if (pcount) + continue; + for (i = 0; scount && i < fcount; i++) { + if (!folio[i] || pte_present(*folio[i])) + continue; + if (pte_to_swp_entry(*folio[i]).val == entry.val) + scount--; + } + if (scount) + continue; + kunmap_atomic_sg(folio, KM_FOLIO); + return page; + } + kunmap_atomic_sg(folio, KM_FOLIO); + return NULL; +} + +/* + * Replace page just allocated by private folio page if it has one. + */ +static inline struct page *private_folio_page_xchg(pte_addr_t folio[], struct page *new_page) +{ + struct page *folio_page = private_folio_page(folio, NULL); + if (!folio_page) + return new_page; + page_cache_release(new_page); + page_cache_get(folio_page); + return folio_page; +} + +/* + * Limit folio to page table entries of this vma matching this *ptep. + */ +static void restrict_folio(pte_addr_t paddrs[], struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t *folio[PAGE_MMUCOUNT]; + unsigned long addr; + int j; + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + addr = address - vma_suboffset(vma, address); + for (j = 0; j < PAGE_MMUCOUNT; j++, addr += MMUPAGE_SIZE) { + if (!folio[j]) + continue; + if (addr < vma->vm_start || addr >= vma->vm_end || + !pte_match(folio[j], ptep)) + folio[j] = NULL; + } + kunmap_atomic_sg(folio, KM_FOLIO); +} + +/* + * Copy (or clear) folio of mmupages from src_page to dst_page. + */ +static void copy_folio(pte_addr_t paddrs[], struct page *dst_page, + struct page *src_page, unsigned long address) +{ + pte_t *folio[PAGE_MMUCOUNT]; + char *src, *dst; + unsigned int size; + unsigned int offset = 0; + int j = 0; + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + dst = kmap_atomic(dst_page, KM_USER0); + if (src_page != ZERO_PAGE(address)) + src = kmap_atomic(src_page, KM_USER1); + else + src = NULL; + while (j < PAGE_MMUCOUNT) { + if (!folio[j]) { + offset += MMUPAGE_SIZE; + j++; + continue; + } + size = MMUPAGE_SIZE; + while (++j < PAGE_MMUCOUNT) { + if (!folio[j]) + break; + size += MMUPAGE_SIZE; + } + /* We assume one long op is faster than several shorts. + * But ia64 sh sparc64 need to use clear/copy_user_page. + */ + if (src) + memcpy(dst + offset, src + offset, size); + else + memset(dst + offset, 0, size); + offset += size; + } + if (src) + kunmap_atomic(src, KM_USER1); + kunmap_atomic(dst, KM_USER0); + kunmap_atomic_sg(folio, KM_FOLIO); +} + +/* + * Update page table entries of the folio, counting how many done. 
+ */ +static inline unsigned long +set_folio_page(pte_addr_t paddrs[], struct page *page, pgprot_t prot, + unsigned long flags) +{ + pte_t *folio[PAGE_MMUCOUNT]; + unsigned long offset = 0, rss = 0, pfn = page_to_pfn(page); + int j; + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + for (j = 0; j < PAGE_MMUCOUNT; j++, offset += MMUPAGE_SIZE) { + if (!folio[j]) + continue; + set_pte(folio[j], + pte_modify(pfn_pte(pfn + j, prot), __pgprot(flags))); + rss++; + } + kunmap_atomic_sg(folio, KM_FOLIO); + return rss; +} + +/* + * Flush TLB entries for the folio (if ptes were present before). + */ +static inline void flush_folio(pte_addr_t folio[], struct vm_area_struct *vma, + unsigned long address) +{ + unsigned long start, end; + int j; + + start = (address - vma_suboffset(vma, address)) & MMUPAGE_MASK; + end = start + PAGE_SIZE; + for (j = 0; folio[j] == NOPTE; j++) + start += MMUPAGE_SIZE; + for (j = PAGE_MMUCOUNT - 1; folio[j] == NOPTE; j--) + end -= MMUPAGE_SIZE; + flush_tlb_range(vma, start, end); +} + +#define adjust_page_count(page, extra) \ + atomic_add((extra), &(page)->count) + +#else /* PAGE_MMUSHIFT 0 */ + +static inline int prepare_folio(pte_t *folio[], struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, int wide) +{ + folio[0] = ptep; + return 0; +} + +/* + * Calling convention different if !PAGE_MMUSHIFT: page always passed in + */ +static inline struct page *private_folio_page(pte_t *folio[], struct page *page) +{ + int doing_wp = pte_present(*folio[0]); + int count; + + if (PageReserved(page)) + return NULL; + if (PageSwapCache(page)) { + if (doing_wp) { + if (TryLockPage(page)) + return NULL; + if (!PageSwapCache(page)) { + UnlockPage(page); + return NULL; + } + } + count = page_count(page) + swap_count(page) - 3; + if (doing_wp) + UnlockPage(page); + else + count--; /* swap not yet freed */ + } else { + count = page_count(page) - 1; + } + if (PagePrivate(page)) + count--; + return count? NULL: page; +} + +#define private_folio_page_xchg(folio, new_page) \ + (new_page) + +#define restrict_folio(folio, vma, address, ptep) \ + do {} while (0) + +static inline void copy_folio(pte_t *folio[], struct page *dst_page, + struct page *src_page, unsigned long address) +{ + char *dst = kmap(dst_page); + if (src_page == ZERO_PAGE(address)) { + clear_user_page(dst, address); + } else { + copy_user_page(dst, kmap(src_page), address); + kunmap(src_page); + } + kunmap(dst_page); +} + +static inline unsigned long set_folio(pte_t *folio[], pte_t pte) +{ + set_pte(folio[0], pte); + return 1; +} + +#define flush_folio(folio, vma, address) \ + flush_tlb_page((vma), (address)) + +#define adjust_page_count(page, extra) \ + do {} while (0) + +#endif /* PAGE_MMUSHIFT 0 */ + +#endif /* _LINUX_FOLIO_H */ diff -prauN pgcl-2.6.0-test5-bk3-1/include/linux/swapops.h pgcl-2.6.0-test5-bk3-2/include/linux/swapops.h --- pgcl-2.6.0-test5-bk3-1/include/linux/swapops.h 2003-09-08 12:49:51.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/include/linux/swapops.h 2003-09-16 19:50:52.000000000 -0700 @@ -1,3 +1,5 @@ +#ifndef _LINUX_SWAPOPS_H +#define _LINUX_SWAPOPS_H /* * swapcache pages are stored in the swapper_space radix tree. 
We want to * get good packing density in that tree, so the index should be dense in @@ -68,3 +70,7 @@ static inline pte_t swp_entry_to_pte(swp BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } + +int swap_count(struct page *page); +void __swap_free(swp_entry_t entry, unsigned short count); +#endif /* _LINUX_SWAPOPS_H */ diff -prauN pgcl-2.6.0-test5-bk3-1/mm/memory.c pgcl-2.6.0-test5-bk3-2/mm/memory.c --- pgcl-2.6.0-test5-bk3-1/mm/memory.c 2003-09-15 00:57:34.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/mm/memory.c 2003-09-16 21:11:01.000000000 -0700 @@ -45,7 +45,8 @@ #include #include #include - +#include +#include #include #include #include @@ -69,31 +70,25 @@ struct page *highmem_start_page; * We special-case the C-O-W ZERO_PAGE, because it's such * a common occurrence (no need to read the page to know * that it's zero - better for the cache and memory subsystem). - * - * ptes are passed in here to figure out addresses etc. but there's no - * clear indication of how to hook this properly for non-i386 yet. */ -static inline void copy_cow_page(unsigned long src[], struct page *dst, - pte_t *ptes[]) +static inline void copy_cow_page(unsigned long src[], struct page *dst, pte_t *ptes[]) { int k; - char *dstaddr = kmap_atomic(dst, KM_USER0); - + char *srcaddr, *dstaddr; + dstaddr = kmap_atomic(dst, KM_USER0); + srcaddr = kmap_atomic_pfns(src, KM_USER1); for (k = 0; k < PAGE_MMUCOUNT; ++k) { if (!src[k] || !ptes[k]) continue; if (pfn_to_page(src[k]) == ZERO_PAGE((unsigned long)ptes[k])) memset(&dstaddr[MMUPAGE_SIZE*k], 0, MMUPAGE_SIZE); - else { - char *srcaddr = kmap_atomic(pfn_to_page(src[k]), - KM_USER1); + else memcpy(&dstaddr[MMUPAGE_SIZE*k], - &srcaddr[MMUPAGE_SIZE*(src[k]%PAGE_MMUCOUNT)], + &srcaddr[MMUPAGE_SIZE*k], MMUPAGE_SIZE); - kunmap_atomic(srcaddr, KM_USER1); - } } - kunmap_atomic(dstaddr, KM_USER0); + kunmap_atomic(dst, KM_USER0); + kunmap_atomic_pfns(src, KM_USER1); } /* @@ -167,32 +162,13 @@ pte_t * pte_alloc_map(struct mm_struct * /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. - * If we raced, we also need to drop all the reference - * counts originally taken with the intent of conferring - * them to all the pte entries spanned by the pte page. */ if (pmd_present(*pmd)) { if (PAGE_MMUCOUNT > 1) - atomic_sub(PAGE_MMUCOUNT-1, &new->count); + atomic_sub(PAGE_MMUCOUNT - 1, &new->count); pte_free(new); goto out; } -#if 0 - { - int k; - pmd_t *base; - unsigned long addr, __pmd = (unsigned long)pmd; - addr = address & ~(PAGE_MMUCOUNT*PMD_SIZE - 1); - base = pmd - ((__pmd/sizeof(pmd_t)) % PAGE_MMUCOUNT); - for (k = 0; k < PAGE_MMUCOUNT; ++k) - if (!pmd_none(base[k]) || pmd_present(base[k])) - printk(KERN_DEBUG - "redundant pmd instantiation " - "at vaddr 0x%lx pmd = 0x%p\n", - addr + PMD_SIZE*k, - &base[k]); - } -#endif pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } @@ -455,9 +431,9 @@ zap_pte_range(struct mmu_gather *tlb, pm } } } else { - pte_clear(ptep); if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(ptep); } } pte_unmap(ptep-1); @@ -562,10 +538,10 @@ int unmap_vmas(struct mmu_gather **tlbp, if (vma) { /* debug. killme. 
*/ if (end_addr <= vma->vm_start) - pr_debug("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n", + printk("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n", __FUNCTION__, end_addr, vma->vm_start); if (start_addr >= vma->vm_end) - pr_debug("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n", + printk("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n", __FUNCTION__, start_addr, vma->vm_end); } @@ -613,7 +589,7 @@ int unmap_vmas(struct mmu_gather **tlbp, zap_bytes = ZAP_BLOCK_SIZE; } if (vma->vm_next && vma->vm_next->vm_start < vma->vm_end) - pr_debug("%s: VMA list is not sorted correctly!\n", + printk("%s: VMA list is not sorted correctly!\n", __FUNCTION__); } return ret; @@ -673,11 +649,6 @@ unsigned long follow_page(struct mm_stru pmd = pmd_offset(pgd, address); if (pmd_none(*pmd)) goto out; - - /* - * hugetlb's still broken in pgcl; not difficult to fix, - * but an unnecessary distraction while it's in flux - */ #if 0 if (pmd_huge(*pmd)) return follow_huge_pmd(mm, address, pmd, write); @@ -713,27 +684,12 @@ out: * with IO-aperture pages for direct-IO. */ -static inline struct page *get_page_map(struct page *page) -{ - if (!pfn_valid(page_to_pfn(page))) - return NULL; - return page; -} - -static inline unsigned long get_pfn_map(unsigned long pfn) +unsigned long get_pfn_map(unsigned long pfn) { return pfn_valid(pfn) ? pfn : 0; } - -/* - * This puppy is handing back MMUPAGE_SIZE -sized slots. - * Callers need auditing. - * This function is a goddamn train wreck. Someone needs to - * janitor the idiot thing for mainline to at very least kill - * the #ifdef FIXADDR_START bullcrap. - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +int get_user_pages(task_t *task, struct mm_struct *mm, unsigned long start, int len, int write, int force, unsigned long *pfns, struct vm_area_struct **vmas) { @@ -812,18 +768,14 @@ int get_user_pages(struct task_struct *t spin_unlock(&mm->page_table_lock); switch (handle_mm_fault(mm,vma,start,write)) { case VM_FAULT_MINOR: - tsk->min_flt++; + task->min_flt++; break; case VM_FAULT_MAJOR: - tsk->maj_flt++; + task->maj_flt++; break; case VM_FAULT_SIGBUS: - if (!i) - pr_debug("get_user_pages(): VM_FAULT_SIGBUS\n"); return i ? i : -EFAULT; case VM_FAULT_OOM: - if (!i) - pr_debug("get_user_pages(): VM_FAULT_OOM\n"); return i ? 
i : -ENOMEM; default: BUG(); @@ -835,20 +787,24 @@ int get_user_pages(struct task_struct *t if (!pfns[i]) { spin_unlock(&mm->page_table_lock); while (i--) { - struct page *map; - map = pfn_to_page(pfns[i]); - page_cache_release(map); + if (pfns[i]) + page_cache_release(pfn_to_page(pfns[i])); } i = -EFAULT; - pr_debug("get_user_pages(): saw a zero pfn\n"); goto out; } + flush_dcache_page(pages[i]); if (1) { struct page *map; - map = pfn_to_page(pfns[i]); - flush_dcache_page(map); - if (!PageReserved(map)) - page_cache_get(map); + if (pfns[i]) + map = pfn_to_page(pfns[i]); + else + map = NULL; + if (map) { + flush_dcache_page(map); + if (!PageReserved(map)) + page_cache_get(map); + } } } if (vmas) @@ -860,8 +816,6 @@ int get_user_pages(struct task_struct *t spin_unlock(&mm->page_table_lock); } while(len); out: - if (i < 0) - pr_debug("get_user_pages() returning an error\n"); return i; } @@ -915,7 +869,8 @@ int zeromap_page_range(struct vm_area_st dir = pgd_offset(mm, address); flush_cache_range(vma, beg, end); - BUG_ON(address >= end); + if (address >= end) + BUG(); spin_lock(&mm->page_table_lock); do { @@ -995,7 +950,8 @@ int remap_page_range(struct vm_area_stru phys_addr -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); - BUG_ON(from >= end); + if (from >= end) + BUG(); spin_lock(&mm->page_table_lock); do { @@ -1032,317 +988,15 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, - struct page *new_page, unsigned long address, - pte_t *page_table, int subpfn) +static inline void +break_cow(struct vm_area_struct *vma, struct page *new_page, + unsigned long address, pte_t *page_table, int subpfn) { pte_t pte = pfn_pte(page_to_pfn(new_page) + subpfn, vma->vm_page_prot); flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(pte))); } -static pte_t *fill_anonymizable_ptevec(struct vm_area_struct *vma, - pmd_t *pmd, pte_t *page_table, - pte_t *ptes[], unsigned long addr, - int map_pte, int cow) -{ - unsigned long lo_vaddr, hi_vaddr, dn_vaddr, up_vaddr, - dn_subpfn, up_subpfn, rss = 0, loop; - pte_t *up_pte, *dn_pte; - - pr_debug("fill_anonymizable_ptevec() for addr 0x%lx\n", addr); - if (cow) - pr_debug("fill_anonymizable_ptevec() for COW fault\n"); - - lo_vaddr = max(addr & ~(PAGE_MMUCOUNT*PMD_SIZE - 1), vma->vm_start); - hi_vaddr = min(vma->vm_end, (addr + PAGE_MMUCOUNT*PMD_SIZE - 1) - & ~(PAGE_MMUCOUNT*PMD_SIZE - 1)); - dn_subpfn = 0; - up_subpfn = PAGE_MMUCOUNT - 1; - dn_vaddr = addr & MMUPAGE_MASK; - up_vaddr = MMUPAGE_ALIGN(addr + 1); - - pr_debug("vma->vm_start = 0x%lx, vma->vm_end = 0x%lx\n", - vma->vm_start, vma->vm_end); - pr_debug("lo_vaddr = 0x%lx, hi_vaddr = 0x%lx\n", lo_vaddr, hi_vaddr); - pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", dn_vaddr, up_vaddr); - pr_debug("starting PTE search loop\n"); - if (map_pte) - page_table = dn_pte = pte_offset_map(pmd, dn_vaddr); - else - dn_pte = page_table; - up_pte = dn_pte + 1; - - do { - loop = 0; - - if (dn_vaddr >= lo_vaddr && dn_subpfn <= up_subpfn) { - if (pte_none(*dn_pte) || (cow && pte_present(*dn_pte) && !pte_write(*dn_pte))) { - ptes[dn_subpfn] = dn_pte; - dn_subpfn++; - ++rss; - pr_debug("dn side grabbing pte\n"); - } else if (ptep_to_address(dn_pte)==(addr&MMUPAGE_MASK)) - pr_debug("dn badness, skipping needed pte\n"); - else - pr_debug("dn side legitimately skipping\n"); - dn_vaddr -= MMUPAGE_SIZE; - dn_pte--; - loop = 1; - } - - if 
(up_vaddr < hi_vaddr && up_subpfn > dn_subpfn) { - if (pte_none(*up_pte) || (cow && pte_present(*up_pte) && !pte_write(*up_pte))) { - ptes[up_subpfn] = up_pte; - up_subpfn--; - pr_debug("up side grabbing pte\n"); - ++rss; - } else if (ptep_to_address(up_pte)==(addr&MMUPAGE_MASK)) - pr_debug("up badness, skipping needed pte\n"); - else - pr_debug("up side legitimately skipping\n"); - up_vaddr += MMUPAGE_SIZE; - up_pte++; - loop = 1; - } - - pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", - dn_vaddr, up_vaddr); - pr_debug("dn_subpfn = 0x%lx, up_subpfn = 0x%lx\n", - dn_subpfn, up_subpfn); - } while ((up_vaddr < hi_vaddr || dn_vaddr >= lo_vaddr) && - loop && up_subpfn >= dn_subpfn); - - pr_debug("finishing PTE search loop\n"); - pr_debug("starting PTE instantiation loop\n"); - pr_debug("fill_anonymizable_ptevec() saw %lu ptes set\n", rss); - return page_table; -} - - -static int set_anon_ptes(struct vm_area_struct *vma, struct page *page, - unsigned long addr, pte_t *ptes[], - struct pte_chain **pte_chain, int write_access) -{ - unsigned long pfn, subpfn; - int rss; - - pfn = page_to_pfn(page); - rss = 0; - for (subpfn = 0; subpfn < PAGE_MMUCOUNT; ++subpfn) { - pte_t pte; - - pr_debug("subpfn = 0x%lx, ptep = 0x%p\n", subpfn, ptes[subpfn]); - - if (!ptes[subpfn]) { - pr_debug("pte empty\n"); - continue; - } else if (!pte_none(*ptes[subpfn])) { - pr_debug("pte non-none\n"); - continue; - } - - pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); - if (!write_access) { - pr_debug("setting pte to zero page\n"); - set_pte(ptes[subpfn], pte_wrprotect(pte)); - } else { - pr_debug("setting pte to newly zeroed anonymous page\n"); - BUG_ON(!*pte_chain); - pr_debug("setting pte for anonymous zeroed page\n"); - pr_debug("ptep = 0x%p, pte = 0x%Lx\n", - ptes[subpfn], (u64)pte_val(pte)); - set_pte(ptes[subpfn], pte_mkwrite(pte_mkdirty(pte))); - pr_debug("about to page_add_rmap()\n"); - *pte_chain = page_add_rmap_chained(page, ptes[subpfn], - *pte_chain); - pr_debug("about to update_mmu_cache()\n"); - update_mmu_cache(vma, addr, pte); - rss++; - pr_debug("about to page_cache_get()\n"); - } - pr_debug("falling through to next subpfn\n"); - } - return rss; -} - -static void get_cow_pages(struct vm_area_struct *vma, unsigned long pfns[], - pte_t *ptes[]) -{ - int k, rss = 0; - - pr_debug("get_cow_pages()\n"); - for (k = 0; k < PAGE_MMUCOUNT; ++k) { - if (!ptes[k]) { - pfns[k] = 0; - continue; - } - - if (pte_present(*ptes[k])) { - if (pte_write(*ptes[k])) - pr_debug("writable pte in get_cow_pages()!\n"); - pfns[k] = pte_pfn(*ptes[k]); - if (pfn_valid(pfns[k])) - page_cache_get(pfn_to_page(pfns[k])); - ++rss; - } else { - if (!pte_none(*ptes[k])) - pr_debug("bogus pte in get_cow_pages()\n"); - if (!vma->vm_file) { - pr_debug("saw no vma->vm_file, wiping pte\n"); - ptes[k] = NULL; - pfns[k] = 0; - } else { - struct page *page; - unsigned long offset, subpfn; - struct address_space *mapping; - - pr_debug("doing find_get_page()\n"); - - offset = vma->vm_pgoff + - (ptep_to_address(ptes[k]) - - vma->vm_start)/MMUPAGE_SIZE; - subpfn = offset % PAGE_MMUCOUNT; - offset /= PAGE_MMUCOUNT; - mapping = vma->vm_file->f_dentry->d_inode->i_mapping; - page = find_get_page(mapping, offset); - if (!page) { - ptes[k] = NULL; - pfns[k] = 0; - pr_debug("find_get_page() failed, wiping pte\n"); - } else { - pfns[k] = page_to_pfn(page) + subpfn; - ++rss; - } - } - } - } - pr_debug("return from get_cow_pages()\n"); - pr_debug("erm, get_cow_pages() saw %d ptes set!\n", rss); -} - -static void save_ptes(pte_t *ptes[], pte_t ptevals[]) -{ - int 
k, rss = 0; - pr_debug("save_ptes()\n"); - for (k = 0; k < PAGE_MMUCOUNT; ++k) { - if (!ptes[k]) - continue; - - ptevals[k] = *ptes[k]; - ptes[k] = (pte_t *)ptep_to_address(ptes[k]); - ++rss; - } - pr_debug("return from save_ptes()\n"); - pr_debug("erm, save_ptes() saw %d ptes set!\n", rss); -} - -static pte_t *reconstitute_ptes(pmd_t *pmd, pte_t *ptes[]) -{ - int j, k; - unsigned long addr; - - pr_debug("reconstitute_ptes()\n"); - - for (j = 0; j < PAGE_MMUCOUNT; ++j) - if (ptes[j]) - break; - - /* - * I don't know how to handle this - * this could get really fugly - */ - if (j >= PAGE_MMUCOUNT) - return NULL; - - addr = (unsigned long)ptes[j]; - ptes[j] = pte_offset_map(pmd, addr); - - for (k = j + 1; k < PAGE_MMUCOUNT; ++k) { - unsigned long vaddr; - - if (!ptes[k]) - continue; - - vaddr = (unsigned long)ptes[k]; - BUG_ON(vaddr == addr); - - if (vaddr < addr) - ptes[k] = ptes[j] - (addr - vaddr)/MMUPAGE_SIZE; - else - ptes[k] = ptes[j] + (vaddr - addr)/MMUPAGE_SIZE; - } - pr_debug("return 0x%p from reconstitute_ptes()\n", ptes[j]); - return ptes[j]; -} - -static int recheck_ptes(pte_t *ptes[], unsigned long pfns[], pte_t ptevals[]) -{ - int k, rss = 0; - - pr_debug("recheck_ptes()\n"); - for (k = 0; k < PAGE_MMUCOUNT; ++k) { - if (!ptes[k] || !pfns[k]) - continue; - - if (pte_same(*ptes[k], ptevals[k])) - ++rss; - else { - pr_debug("recheck_ptes() dropped racy pfn\n"); - if (pfn_valid(pfns[k])) - page_cache_release(pfn_to_page(pfns[k])); - ptes[k] = NULL; - pfns[k] = 0; - } - } - pr_debug("return %d from recheck_ptes()\n", rss); - return rss; -} - -static void release_pfns(unsigned long pfns[]) -{ - int k; - pr_debug("release_pfns()\n"); - for (k = 0; k < PAGE_MMUCOUNT; ++k) - if (pfns[k] && pfn_valid(pfns[k])) - page_cache_release(pfn_to_page(pfns[k])); - pr_debug("return from release_pfns()\n"); -} - -static struct pte_chain *move_mappings(struct vm_area_struct *vma, - struct page *new_page, - unsigned long pfns[], pte_t *ptes[], - struct pte_chain *pte_chain) -{ - unsigned long k; - - pr_debug("move_mappings()\n"); - for (k = 0; k < PAGE_MMUCOUNT; ++k) { - struct page *page; - int release; - - if (!ptes[k] || !pfns[k]) - continue; - - release = pte_present(*ptes[k]); - page = pfn_valid(pfns[k]) ? pfn_to_page(pfns[k]) : NULL; - if (page) { - if (PageReserved(page)) - ptep_to_mm(ptes[k])->rss++; - else - page_remove_rmap(page, ptes[k]); - } - break_cow(vma, new_page, ptep_to_address(ptes[k]), ptes[k], k); - pte_chain = page_add_rmap_chained(new_page,ptes[k],pte_chain); - /* nuke the pte's reference since we retargeted the pte */ - if (page && release && !PageReserved(page)) - page_cache_release(page); - } - - pr_debug("return 0x%p from move_mappings()\n", pte_chain); - return pte_chain; -} - /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -1369,12 +1023,8 @@ static int do_wp_page(struct mm_struct * struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); struct pte_chain *pte_chain = NULL; - pte_t ptevals[PAGE_MMUCOUNT]; - pte_t *ptes[PAGE_MMUCOUNT] = { [0 ... PAGE_MMUCOUNT-1] = NULL }; - unsigned long pfns[PAGE_MMUCOUNT] = { [0 ... 
PAGE_MMUCOUNT-1] = 0 }; - int rss, ret; - - pr_debug("do_wp_page() on addr 0x%lx\n", address); + pte_addr_t folio[PAGE_MMUCOUNT]; + int reprep, rss, ret; if (unlikely(!pfn_valid(pfn))) { /* @@ -1385,112 +1035,70 @@ static int do_wp_page(struct mm_struct * pte_unmap(page_table); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); - ret = VM_FAULT_OOM; - goto out_unlock; + goto oom; } old_page = pfn_to_page(pfn); - - if (!TestSetPageLocked(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - pr_debug("do_wp_page() reusing old page\n"); - /* - * XXX: this should sweep the pagetables to - * prefault all the pte's. This is free, take it. - */ - flush_cache_page(vma, address); - establish_pte(vma, address, page_table, - pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); - pte_unmap(page_table); - ret = VM_FAULT_MINOR; - goto out_unlock; - } + reprep = prepare_folio(folio, vma, address, ptep_to_paddr(page_table), 1); + new_page = private_folio_page(folio, PAGE_MMUSHIFT ? NULL : old_page); + if (new_page) { + page_cache_get(new_page); + goto got_page; } - /* fill the anonymizable ptes */ - pr_debug("calling fill_anonymizable_ptevec()\n"); - page_table = fill_anonymizable_ptevec(vma, pmd, page_table, ptes, - address, 0, 1); - /* get all the relevant pages */ - pr_debug("calling get_cow_pages()\n"); - get_cow_pages(vma, pfns, ptes); - - /* save all the ptes */ - pr_debug("calling save_ptes()\n"); - save_ptes(ptes, ptevals); + pte_unmap(page_table); - /* Ok, we need to copy. Oh, well.. */ + page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pr_debug("calling pte_chain_alloc_chained()\n"); - pte_chain = pte_chain_alloc_chained(GFP_KERNEL); - if (!pte_chain) { - release_pfns(pfns); - ret = VM_FAULT_OOM; - goto out; - } + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; new_page = alloc_page(GFP_HIGHUSER); - if (!new_page) { - release_pfns(pfns); - ret = VM_FAULT_OOM; - goto out; - } - /* copy all the pieces of pages */ - pr_debug("calling copy_cow_page()\n"); - copy_cow_page(pfns, new_page, ptes); + if (!new_page) + goto no_mem; /* * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); - /* reconstitute all the ptes - * page_table = pte_offset_map(pmd, address); - */ - pr_debug("calling reconstitute_ptes()\n"); - page_table = reconstitute_ptes(pmd, ptes); - if (!page_table) { - pr_debug("reconstitute_ptes() returned NULL, bailing out\n"); - page_cache_release(new_page); - release_pfns(pfns); - ret = VM_FAULT_MINOR; - goto out_unlock; - } - /* recheck all the ptes, dropping already set up pieces */ - pr_debug("calling recheck_ptes()\n"); - rss = recheck_ptes(ptes, pfns, ptevals); - if (!rss) { - pr_debug("recheck_ptes() returned 0, bailing out\n"); - page_cache_release(new_page); - release_pfns(pfns); - ret = VM_FAULT_MINOR; - goto out_unlock; + page_table = pte_offset_map(pmd, address); + if (reprep) + prepare_folio(folio, vma, address, ptep_to_paddr(page_table), 1); + new_page = private_folio_page_xchg(folio, new_page); +got_page: + restrict_folio(folio, vma, address, page_table); + if (new_page != old_page) { + copy_folio(folio, new_page, old_page, address); + flush_cache_page(vma, address); + } + rss = set_folio_page(folio, new_page, vma->vm_page_prot, _PAGE_DIRTY|_PAGE_RW); + if (new_page != old_page) { + adjust_page_count(new_page, rss - 1); + if (PageReserved(old_page)) + mm->rss += rss; + else + adjust_page_count(old_page, 1 - rss); } - pr_debug("calling move_mappings()\n"); - 
pte_chain = move_mappings(vma, new_page, pfns, ptes, pte_chain); - atomic_add(rss, &new_page->count); - - pr_debug("do_wp_page() returning VM_FAULT_MINOR\n"); pte_unmap(page_table); - - pr_debug("calling lru_cache_add_active(0x%p)\n", new_page); - lru_cache_add_active(new_page); - - /* release reference we acquired as well as pagetable references */ - pr_debug("calling release_pfns()\n"); - release_pfns(pfns); - page_cache_release(new_page); + flush_folio(folio, vma, address); + update_mmu_cache(vma, address, folio); + page_cache_release(old_page); ret = VM_FAULT_MINOR; -out_unlock: - spin_unlock(&mm->page_table_lock); + goto out; + +no_mem: + page_cache_release(old_page); +oom: + ret = VM_FAULT_OOM; out: - pte_chain_free_chained(pte_chain); + spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); return ret; } /* * Helper function for invalidate_mmap_range(). - * Both hba and hlen are page numbers in PAGE_SIZE units. + * Both hba and hlen are page numbers in MMUPAGE_SIZE units. * An hlen of zero blows away the entire portion file after hba. */ static void @@ -1529,24 +1137,24 @@ invalidate_mmap_range_list(struct list_h * page range in the underlying file. * @address_space: the address space containing mmaps to be invalidated. * @holebegin: byte in first page to invalidate, relative to the start of - * the underlying file. This will be rounded down to a PAGE_SIZE + * the underlying file. This will be rounded down to a MMUPAGE_SIZE * boundary. Note that this is different from vmtruncate(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded - * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * up to a MMUPAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. */ void invalidate_mmap_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { - unsigned long hba = holebegin >> PAGE_SHIFT; - unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long hba = holebegin >> MMUPAGE_SHIFT; + unsigned long hlen = (holelen + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = - (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + (holebegin + holelen + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; @@ -1635,24 +1243,19 @@ static int do_swap_page(struct mm_struct struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { - struct page *page; + struct page *page, *swap_page; + pte_addr_t folio[PAGE_MMUCOUNT+1]; swp_entry_t entry = pte_to_swp_entry(orig_pte); - pte_t pte; int rss, ret = VM_FAULT_MINOR; struct pte_chain *pte_chain = NULL; - unsigned long subpfn, flt_subpfn = swp_offset(entry) % PAGE_MMUCOUNT; - unsigned long pfn, lo_vaddr, hi_vaddr, vaddr; - - lo_vaddr = max(address & PAGE_MASK, vma->vm_start); - hi_vaddr = min(PAGE_ALIGN(address), vma->vm_end); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - page = lookup_swap_cache(entry); - if (!page) { + swap_page = lookup_swap_cache(entry); + if (!swap_page) { swapin_readahead(entry); - page = read_swap_cache_async(entry); - if (!page) { + swap_page = read_swap_cache_async(entry); + if (!swap_page) { /* * Back out if somebody else faulted in this pte while * we released the page table lock. 
@@ -1673,13 +1276,13 @@ static int do_swap_page(struct mm_struct inc_page_state(pgmajfault); } - mark_page_accessed(page); + mark_page_accessed(swap_page); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) { - ret = VM_FAULT_OOM; + ret = -ENOMEM; goto out; } - lock_page(page); + lock_page(swap_page); /* * Back out if somebody else faulted in this pte while we @@ -1687,101 +1290,50 @@ static int do_swap_page(struct mm_struct */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (!pte_same(*page_table, orig_pte)) { + if (pte_to_swp_entry(*page_table).val != entry.val) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); + unlock_page(swap_page); + page_cache_release(swap_page); ret = VM_FAULT_MINOR; goto out; } /* The page isn't present yet, go ahead with the fault. */ - /* - * Something is massively fscked here. - swap_free(entry); if (vm_swap_full()) - remove_exclusive_swap_page(page); - */ - - rss = 0; - vaddr = lo_vaddr; - page_table -= (address - lo_vaddr)/MMUPAGE_SIZE; + remove_exclusive_swap_page(swap_page); - flush_icache_page(vma, page); - - pfn = page_to_pfn(page); - - do { - /* already faulted in? less work for me */ - if (pte_present(*page_table)) - goto next; - - entry = pte_to_swp_entry(*page_table); - - if (!pte_none(*page_table) && - swp_offset(entry)/PAGE_MMUCOUNT == page->index) { - swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); - subpfn = swp_offset(entry) % PAGE_MMUCOUNT; - pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); - - } else if (pte_none(*page_table)) { - - subpfn = flt_subpfn + (vaddr - address)/MMUPAGE_SHIFT; - - /* it'd fall outside the page */ - if (subpfn >= PAGE_MMUCOUNT) - goto next; - - pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); - - /* !pte_none() && swp_offset()/PAGE_MMUCOUNT != page->index */ - } else - goto next; - - if (write_access && can_share_swap_page(page)) - pte = pte_mkdirty(pte_mkwrite(pte)); - - if (!pte_chain) - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = VM_FAULT_OOM; - spin_lock(&mm->page_table_lock); - mm->rss += rss; - spin_unlock(&mm->page_table_lock); - goto no_mem; - } - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, vaddr); + prepare_folio(folio, vma, address, ptep_to_paddr(page_table), write_access); + if (write_access) { + page = private_folio_page(folio, swap_page); + restrict_folio(folio, vma, address, page_table); + if (!page) { + page = swap_page; + write_access = 0; + } else if (page != swap_page) { + page_cache_get(page); + copy_folio(folio, page, swap_page, address); } + } else { + restrict_folio(folio, vma, address, page_table); + page = swap_page; + } - set_pte(page_table, pte); - ++rss; - pte_chain = page_add_rmap(page, page_table, pte_chain); -next: - vaddr += MMUPAGE_SIZE; - page_table++; - } while (vaddr < hi_vaddr); + flush_icache_page(vma, page); + rss = set_folio_page(folio, page, vma->vm_page_prot, write_access ? 
(_PAGE_DIRTY|_PAGE_RW) : 0); + pte_chain = page_add_rmap(page, page_table, pte_chain); + adjust_page_count(page, rss - 1); + mm->rss += rss; + __swap_free(entry, rss); + unlock_page(swap_page); + if (page != swap_page) + page_cache_release(swap_page); - unlock_page(page); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); - mm->rss += rss; - pte_unmap(page_table-1); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); -no_mem: - if (!page) - goto out; - if (!rss) - page_cache_release(page); - else if (rss > 1) - atomic_add(rss - 1, &page->count); out: pte_chain_free(pte_chain); return ret; @@ -1797,62 +1349,75 @@ do_anonymous_page(struct mm_struct *mm, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr) { - struct page *page = NULL; - struct pte_chain *pte_chain = NULL; - pte_t *ptes[PAGE_MMUCOUNT] = { [0 ... PAGE_MMUCOUNT-1] = NULL }; - int rss, ret = VM_FAULT_MINOR; + pte_addr_t folio[PAGE_MMUCOUNT]; + struct page *new_page, *page = ZERO_PAGE(addr); + struct pte_chain *pte_chain; + int ret; - if (write_access) - pr_debug("write fault on 0x%lx\n", addr); - else - pr_debug("read fault on 0x%lx\n", addr); - pr_debug("page_table = 0x%p\n", page_table); - - if (!write_access) - page = ZERO_PAGE(addr); - else { - if (!pte_chain) - pte_chain = pte_chain_alloc_chained(GFP_ATOMIC); + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - if (!pte_chain) { - pte_chain = pte_chain_alloc_chained(GFP_KERNEL); - if (!pte_chain) - return VM_FAULT_OOM; - } - page = alloc_page(GFP_HIGHUSER); - if (!page) { - pte_chain_free_chained(pte_chain); - return VM_FAULT_OOM; - } - clear_user_highpage(page, addr); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); } - if (write_access) - spin_lock(&mm->page_table_lock); - page_table = fill_anonymizable_ptevec(vma, pmd, page_table, ptes, - addr, write_access, 0); - rss = set_anon_ptes(vma, page, addr, ptes, &pte_chain, write_access); - pr_debug("doing pte_unmap(0x%p)\n", page_table); - pte_unmap(page_table); - pr_debug("adding %d to rss\n", rss); - mm->rss += rss; - spin_unlock(&mm->page_table_lock); - pr_debug("broke out of PTE instantiation loop\n"); + /* ..except if it's a write access */ if (write_access) { - if (rss) { - if (rss > 1) - atomic_add(rss - 1, &page->count); - pr_debug("adding page 0x%p to LRU\n", page); - lru_cache_add_active(page); - mark_page_accessed(page); - } else { - pr_debug("releasing page\n"); - page_cache_release(page); + /* Allocate our own private page. 
*/ + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + goto no_mem; + + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); + + if (!pte_none(*page_table)) { + pte_unmap(page_table); + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + ret = VM_FAULT_MINOR; + goto out; } + lru_cache_add_active(new_page); + mark_page_accessed(new_page); + } else + new_page = NULL; + + prepare_folio(folio, vma, addr, ptep_to_paddr(page_table), write_access); + if (write_access) { + int rss; + new_page = private_folio_page_xchg(folio, new_page); + restrict_folio(folio, vma, addr, page_table); + copy_folio(folio, new_page, page, addr); + page = new_page; + rss = set_folio_page(folio, page, vma->vm_page_prot, _PAGE_RW|_PAGE_DIRTY); + adjust_page_count(page, rss - 1); + mm->rss += rss; + } else { + restrict_folio(folio, vma, addr, page_table); + set_folio_page(folio, page, vma->vm_page_prot, 0); } - pr_debug("doing pte_chain_free()\n"); - pte_chain_free_chained(pte_chain); + /* ignores ZERO_PAGE */ + pte_chain = page_add_rmap(page, page_table, pte_chain); + pte_unmap(page_table); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + spin_unlock(&mm->page_table_lock); + ret = VM_FAULT_MINOR; + goto out; + +no_mem: + ret = VM_FAULT_OOM; +out: + pte_chain_free(pte_chain); return ret; } @@ -1872,16 +1437,12 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { - struct page * new_page; + struct page *page, *new_page = NULL; + pte_addr_t folio[PAGE_MMUCOUNT]; struct address_space *mapping = NULL; - pte_t entry; struct pte_chain *pte_chain; - int sequence = 0; - int ret; + int ret, rss, sequence = 0; - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, - pmd, write_access, address); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1891,7 +1452,7 @@ do_no_page(struct mm_struct *mm, struct } smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */ retry: - new_page = vma->vm_ops->nopage(vma, address & MMUPAGE_MASK, 0); + page = vma->vm_ops->nopage(vma, address, 0); /* no page was available -- either SIGBUS or OOM */ if (new_page == NOPAGE_SIGBUS) @@ -1907,17 +1468,16 @@ retry: * Should we do an early C-O-W break? */ if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); - goto oom; + if (page_count(page) > 1 || PageReserved(page)) { + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) { + page_cache_release(page); + goto oom; + } } - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - pr_debug("adding page 0x%p to LRU\n", page); - lru_cache_add_active(page); - new_page = page; } + lru_cache_add_active(page); + new_page = page; spin_lock(&mm->page_table_lock); /* @@ -1934,33 +1494,7 @@ retry: } page_table = pte_offset_map(pmd, address); - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if write_access is true, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. 
- * - * XXX: this should sweep pagetables and prefault - */ - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - unsigned long pfn; - if (!PageReserved(new_page)) - ++mm->rss; - flush_icache_page(vma, new_page); - pfn = page_to_pfn(new_page) - + vma_suboffset(vma, address)/MMUPAGE_SIZE; - entry = pfn_pte(pfn, vma->vm_page_prot); - if (write_access) - entry = pte_mkwrite(pte_mkdirty(entry)); - set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); - pte_unmap(page_table); - } else { + if (!pte_none(*page_table)) { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); page_cache_release(new_page); @@ -1969,8 +1503,25 @@ retry: goto out; } - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); + prepare_folio(folio, vma, address, ptep_to_paddr(page_table), !!new_page); + if (new_page) { + new_page = private_folio_page_xchg(folio, new_page); + restrict_folio(folio, vma, address, page_table); + copy_folio(folio, new_page, page, address); + page_cache_release(page); + page = new_page; + } else + restrict_folio(folio, vma, address, page_table); + + flush_icache_page(vma, page); + rss = set_folio_page(folio, page, vma->vm_page_prot, write_access ? (_PAGE_RW|_PAGE_DIRTY) : 0); + if (!PageReserved(page)) { + adjust_page_count(page, rss - 1); + mm->rss += rss; + } + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MAJOR; goto out; @@ -2043,8 +1594,6 @@ static inline int handle_pte_fault(struc { pte_t entry; - BUG_ON((unsigned long)pte > (unsigned long)(-PAGE_SIZE)); - entry = *pte; if (!pte_present(entry)) { /* @@ -2052,8 +1601,11 @@ static inline int handle_pte_fault(struc * and the PTE updates will not touch it later. So * drop the lock. 
*/ - if (pte_none(entry)) + if (pte_none(entry)) { + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, pte, pmd, write_access, address); return do_no_page(mm, vma, address, write_access, pte, pmd); + } if (pte_file(entry)) return do_file_page(mm, vma, address, write_access, pte, pmd); return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); @@ -2144,8 +1696,10 @@ int make_pages_present(unsigned long add vma = find_vma(current->mm, addr); write = (vma->vm_flags & VM_WRITE) != 0; - BUG_ON(addr >= end); - BUG_ON(end > vma->vm_end); + if (addr >= end) + BUG(); + if (end > vma->vm_end) + BUG(); len = (end+MMUPAGE_SIZE-1)/MMUPAGE_SIZE-addr/MMUPAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); diff -prauN pgcl-2.6.0-test5-bk3-1/mm/swapfile.c pgcl-2.6.0-test5-bk3-2/mm/swapfile.c --- pgcl-2.6.0-test5-bk3-1/mm/swapfile.c 2003-09-14 23:49:20.000000000 -0700 +++ pgcl-2.6.0-test5-bk3-2/mm/swapfile.c 2003-09-16 19:53:09.000000000 -0700 @@ -1645,3 +1645,97 @@ int valid_swaphandles(swp_entry_t entry, swap_device_unlock(swapdev); return ret; } + +int swap_count(struct page *page) +{ + struct swap_info_struct * p; + unsigned long offset, type; + swp_entry_t entry; + int retval = 0; + + entry.val = page->index; + if (!entry.val) + goto bad_entry; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = swp_offset(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_unused; + retval = p->swap_map[offset]; +out: + return retval; +bad_entry: + printk(KERN_ERR "swap_count: null entry!\n"); + goto out; +bad_file: + printk("Bad swap file entry %08lx\n", entry.val); + goto out; +bad_offset: + printk("Bad swap offset entry %08lx\n", entry.val); + goto out; +bad_unused: + printk("Unused swap offset entry in swap_count %08lx\n", entry.val); + goto out; +} + +void __swap_free(swp_entry_t entry, unsigned short count) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = swp_offset(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + if (p->swap_map[offset] < SWAP_MAP_MAX) { + if (p->swap_map[offset] < count) + goto bad_count; + if (!(p->swap_map[offset] -= count)) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + } + } + swap_device_unlock(p); + swap_list_unlock(); +out: + return; + +bad_nofile: + printk("swap_free: Trying to free nonexistent swap-page\n"); + goto out; +bad_device: + printk("swap_free: Trying to free swap from unused swap-device\n"); + goto out; +bad_offset: + printk("swap_free: offset exceeds max\n"); + goto out; +bad_free: + printk("VM: Bad swap entry %08lx\n", entry.val); + goto out; +bad_count: + swap_device_unlock(p); + swap_list_unlock(); + printk(KERN_ERR "VM: Bad count %hd current count %hd\n", count, p->swap_map[offset]); + goto out; +}