Implement Hugh's second heuristic: scan the neighboring PTEs on the same pagetable page for empty slots with which to map an anonymous PAGE_SIZE-sized page. (A standalone userspace sketch of the search loop follows the diff.)

--- pgcl-2.5.65-virgin/mm/memory.c	2003-03-21 04:26:35.000000000 -0800
+++ pgcl-2.5.65-1/mm/memory.c	2003-03-21 04:17:36.000000000 -0800
@@ -1360,97 +1360,200 @@ do_anonymous_page(struct mm_struct *mm,
 {
 	struct page *page = NULL;
 	struct pte_chain *pte_chain = NULL;
-	unsigned long vaddr, lo_vaddr, hi_vaddr;
-	unsigned long pfn;
-	int rss = 0, ret = VM_FAULT_MINOR;
+	unsigned long up_vaddr, dn_vaddr, lo_vaddr, hi_vaddr;
+	unsigned long pfn, subpfn, dn_subpfn, up_subpfn;
+	pte_t *ptes[PAGE_MMUCOUNT] = { [0 ... PAGE_MMUCOUNT-1] = NULL };
+	pte_t *up_pte, *dn_pte;
+	int rss, ret = VM_FAULT_MINOR;
+
+	if (write_access)
+		pr_debug("write fault on 0x%lx\n", addr);
+	else
+		pr_debug("read fault on 0x%lx\n", addr);
+	pr_debug("page_table = 0x%p\n", page_table);
 
-	lo_vaddr = max(addr & PAGE_MASK, vma->vm_start);
-	hi_vaddr = min(PAGE_ALIGN(addr), vma->vm_end);
-
-	if (!write_access) {
+	if (!write_access)
 		page = ZERO_PAGE(addr);
-		page_table -= (addr - lo_vaddr)/MMUPAGE_SIZE;
-	} else {
+	else {
 		if (!pte_chain)
 			pte_chain = pte_chain_alloc(GFP_ATOMIC);
 		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
 		if (!pte_chain) {
 			pte_chain = pte_chain_alloc(GFP_KERNEL);
-			if (!pte_chain) {
-				ret = VM_FAULT_OOM;
-				goto no_mem;
-			}
+			if (!pte_chain)
+				return VM_FAULT_OOM;
 		}
 		page = alloc_page(GFP_HIGHUSER);
 		if (!page) {
-			ret = VM_FAULT_OOM;
-			goto no_mem;
+			pte_chain_free(pte_chain);
+			return VM_FAULT_OOM;
 		}
 		clear_user_highpage(page, addr);
+	}
+
+	lo_vaddr = max(addr & PMD_MASK, vma->vm_start);
+	hi_vaddr = min((addr + PMD_SIZE - 1) & PMD_MASK, vma->vm_end);
+	dn_subpfn = 0;
+	up_subpfn = PAGE_MMUCOUNT - 1;
+	dn_vaddr = addr & MMUPAGE_MASK;
+	up_vaddr = MMUPAGE_ALIGN(addr + 1);
+
+	pr_debug("vma->vm_start = 0x%lx, vma->vm_end = 0x%lx\n", vma->vm_start, vma->vm_end);
+	pr_debug("lo_vaddr = 0x%lx, hi_vaddr = 0x%lx\n", lo_vaddr, hi_vaddr);
+	pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", dn_vaddr, up_vaddr);
+
+	if (write_access) {
+		pr_debug("about to take mm->page_table_lock\n");
+		if (spin_is_locked(&mm->page_table_lock))
+			printk("hmm, I see a deadlock coming\n");
 		spin_lock(&mm->page_table_lock);
-		page_table = pte_offset_map(pmd, lo_vaddr);
 	}
-	flush_page_to_ram(page);
-	/*
-	 * XXX: locks are dropped in the interior of the loop;
-	 * is an elevated reference count required to pin the page
-	 * while it's being operated on?
-	 */
-	pfn = page_to_pfn(page) + ((lo_vaddr/MMUPAGE_SIZE) % PAGE_MMUCOUNT);
-	vaddr = lo_vaddr;
-	rss = 0;
+	pr_debug("starting PTE search loop\n");
+	if (write_access)
+		page_table = dn_pte = pte_offset_map(pmd, dn_vaddr);
+	else
+		dn_pte = page_table;
+	up_pte = dn_pte + 1;
 
 	do {
-		if (!write_access && pte_none(*page_table)) {
-			pte_t entry = pte_wrprotect(pfn_pte(pfn, vma->vm_page_prot));
-			set_pte(page_table, entry);
+		if (up_vaddr < hi_vaddr && up_subpfn > dn_subpfn) {
+			if (pte_none(*up_pte)) {
+				ptes[up_subpfn] = up_pte;
+				up_subpfn--;
+			}
+			up_vaddr += MMUPAGE_SIZE;
+			up_pte++;
+		}
+
+		if (dn_vaddr >= lo_vaddr && dn_subpfn < up_subpfn) {
+			if (pte_none(*dn_pte)) {
+				ptes[dn_subpfn] = dn_pte;
+				dn_subpfn++;
+			}
+			dn_vaddr -= MMUPAGE_SIZE;
+			dn_pte--;
+		}
+		pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", dn_vaddr, up_vaddr);
+		pr_debug("dn_subpfn = 0x%lx, up_subpfn = 0x%lx\n", dn_subpfn, up_subpfn);
+	} while ((up_vaddr < hi_vaddr || dn_vaddr >= lo_vaddr) && up_subpfn > dn_subpfn);
+
+	pr_debug("finishing PTE search loop\n");
+	pr_debug("starting PTE instantiation loop\n");
+
+	pfn = page_to_pfn(page);
+	rss = 0;
+	for (subpfn = 0; subpfn < PAGE_MMUCOUNT; ++subpfn) {
+		pte_t pte;
+
+		pr_debug("subpfn = 0x%lx, ptep = 0x%p\n", subpfn, ptes[subpfn]);
+
+		if (!ptes[subpfn]) {
+			pr_debug("pte empty\n");
+			continue;
+		} else if (!pte_none(*ptes[subpfn])) {
+			pr_debug("pte non-none\n");
+			continue;
+		}
+
+		pte = pfn_pte(pfn + subpfn, vma->vm_page_prot);
+		if (!write_access) {
+			pr_debug("setting pte to zero page\n");
+			set_pte(ptes[subpfn], pte_wrprotect(pte));
 		} else {
+			pr_debug("setting pte to newly zeroed anonymous page\n");
 			if (!pte_chain)
 				pte_chain = pte_chain_alloc(GFP_ATOMIC);
 			if (!pte_chain) {
-				pte_unmap(page_table);
+				unsigned long vaddr, offset;
+				int k;
+
+				pr_debug("doing sleeping alloc of non-anonymous page\n");
+
+				/* fugly. wtf else can I do? */
+				vaddr = (unsigned long)ptes[subpfn];
+
+				pr_debug("pte vaddr = 0x%lx\n", vaddr);
+
+				/* this will not port to non-x86 */
+				vaddr -= fix_to_virt(KM_PTE0);
+
+				pr_debug("vaddr offset = 0x%lx\n", vaddr);
+
+				/* I wanted this but they may not be aligned */
+				/* vaddr &= PTRS_PER_PTE*sizeof(pte_t) - 1; */
+
+				vaddr /= sizeof(pte_t);
+
+				pr_debug("vaddr offset in ptes = 0x%lx\n", vaddr);
+
+				vaddr = (lo_vaddr & PMD_MASK) + vaddr * MMUPAGE_SIZE;
+
+				pr_debug("vaddr = 0x%lx\n", vaddr);
+
+				pte_unmap(ptes[subpfn]);
 				spin_unlock(&mm->page_table_lock);
 				pte_chain = pte_chain_alloc(GFP_KERNEL);
 				if (!pte_chain) {
+					pr_debug("going to out_oom\n");
 					ret = VM_FAULT_OOM;
-					spin_lock(&mm->page_table_lock);
-					mm->rss += rss;
-					spin_unlock(&mm->page_table_lock);
-					goto no_mem;
+					goto out_oom;
 				}
 				spin_lock(&mm->page_table_lock);
-				page_table = pte_offset_map(pmd, vaddr);
-			}
-			if (pte_none(*page_table)) {
-				pte_t entry = pte_mkwrite(pte_mkdirty(pfn_pte(pfn, vma->vm_page_prot)));
-				++rss;
-				set_pte(page_table, entry);
-				pte_chain = page_add_rmap(page, page_table, pte_chain);
+				page_table = pte_offset_map(pmd, vaddr);
+
+				/* is this safe from gcc? NFI */
+				if (page_table != ptes[subpfn]) {
+					pr_debug("(page_table) 0x%p != 0x%p (ptes[subpfn])\n", page_table, ptes[subpfn]);
+					offset = (unsigned long)
+						(page_table - ptes[subpfn]);
+					pr_debug("adjusting all ptes by offset 0x%lx\n",
+							offset);
+					for (k = subpfn; k < PAGE_MMUCOUNT; ++k) {
+						pr_debug("pte before 0x%p\n", ptes[k]);
+						ptes[k] += offset;
+						pr_debug("pte after 0x%p\n", ptes[k]);
+					}
+				}
+
+				/* check for races */
+				if (!pte_none(*ptes[subpfn])) {
+					pr_debug("raced, skipping PTE\n");
+					continue;
+				}
 			}
+			pr_debug("setting pte for anonymous zeroed page thing\n");
+			pr_debug("ptep = 0x%p, pte = 0x%Lx\n", ptes[subpfn], pte_val(pte));
+			set_pte(ptes[subpfn], pte_mkwrite(pte_mkdirty(pte)));
+			pr_debug("about to page_add_rmap()\n");
+			pte_chain = page_add_rmap(page, ptes[subpfn], pte_chain);
+			pr_debug("about to update_mmu_cache()\n");
+			update_mmu_cache(vma, addr, pte);
+			rss++;
+			pr_debug("about to page_cache_get()\n");
+			page_cache_get(page);
 		}
-		vaddr += MMUPAGE_SIZE;
-		pfn++;
-		page_table++;
-	} while (vaddr < hi_vaddr);
-
-	pte_unmap(page_table-1);
-	update_mmu_cache(vma, addr, entry);
+		pr_debug("falling through to next subpfn\n");
+	}
+	pr_debug("doing pte_unmap(0x%p)\n", page_table);
+	pte_unmap(page_table);
+	pr_debug("adding %d to rss\n", rss);
 	mm->rss += rss;
 	spin_unlock(&mm->page_table_lock);
-
-no_mem:
-	if (write_access && page) {
-		if (!rss)
-			page_cache_release(page);
-		else {
-			if (rss != 1)
-				atomic_add(rss - 1, &page->count);
+	pr_debug("broke out of PTE instantiation loop\n");
+out_oom:
+	pr_debug("at out_oom\n");
+	if (write_access) {
+		if (rss) {
+			pr_debug("adding page to LRU\n");
 			lru_cache_add_active(page);
 			mark_page_accessed(page);
 		}
+		pr_debug("releasing page\n");
+		page_cache_release(page);
 	}
-
+	pr_debug("doing pte_chain_free()\n");
 	pte_chain_free(pte_chain);
 	return ret;
 }
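
For reference, here is a minimal userspace sketch of the bidirectional search loop above, with the kernel context stripped out. The values (PAGE_MMUCOUNT of 8, a 64-slot pagetable page, which slots are preoccupied, and the faulting index) are illustrative assumptions only; a zero table entry stands in for pte_none(), and the vma/PMD clamping is reduced to the table bounds. It is not kernel code, just the control flow:

#include <stdio.h>

#define PAGE_MMUCOUNT	8	/* assumed: MMU subpages per PAGE_SIZE page */
#define PTRS_PER_PT	64	/* assumed: PTE slots on one pagetable page */

int main(void)
{
	int table[PTRS_PER_PT] = { 0 };	/* 0 stands in for pte_none() */
	int ptes[PAGE_MMUCOUNT];	/* slot index found for each subpfn */
	int dn_subpfn = 0, up_subpfn = PAGE_MMUCOUNT - 1;
	int fault = 29;			/* assumed faulting slot */
	int dn = fault, up = fault + 1;	/* down-cursor starts at the fault itself */
	int subpfn;

	for (subpfn = 0; subpfn < PAGE_MMUCOUNT; subpfn++)
		ptes[subpfn] = -1;	/* -1 stands in for the NULL initializer */

	table[27] = table[31] = 1;	/* pretend two neighbors are already mapped */

	/* same control flow as the patch's do/while search loop:
	 * the up-cursor fills ptes[] from the top, the down-cursor
	 * from the bottom, until the cursors or the bounds meet */
	do {
		if (up < PTRS_PER_PT && up_subpfn > dn_subpfn) {
			if (!table[up])
				ptes[up_subpfn--] = up;
			up++;
		}
		if (dn >= 0 && dn_subpfn < up_subpfn) {
			if (!table[dn])
				ptes[dn_subpfn++] = dn;
			dn--;
		}
	} while ((up < PTRS_PER_PT || dn >= 0) && up_subpfn > dn_subpfn);

	/* subpage pfn + subpfn would be mapped at each slot found */
	for (subpfn = 0; subpfn < PAGE_MMUCOUNT; subpfn++) {
		if (ptes[subpfn] < 0)
			printf("subpfn %d: no slot found\n", subpfn);
		else
			printf("subpfn %d -> pte slot %d\n", subpfn, ptes[subpfn]);
	}
	return 0;
}

Running this maps subpfns 0-2 to slots 29, 28, 26 and subpfns 4-7 to slots 34, 33, 32, 30: occupied neighbors are simply skipped, so the chosen slots need not be virtually contiguous, and when the two cursors meet one middle subpfn can be left without a slot (subpfn 3 here). The instantiation loop in the patch tolerates both cases through its !ptes[subpfn] check.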