diff -urpN pgcl-2.6.0-test5-bk3-4/arch/i386/kernel/process.c pgcl-2.6.0-test5-bk3-5/arch/i386/kernel/process.c --- pgcl-2.6.0-test5-bk3-4/arch/i386/kernel/process.c 2003-11-05 18:49:39.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/arch/i386/kernel/process.c 2003-11-06 21:23:33.000000000 -0800 @@ -606,7 +606,7 @@ asmlinkage int sys_execve(struct pt_regs int error; char * filename; - printk("sys_execve()\n"); + pr_debug("sys_execve()\n"); filename = getname((char __user *) regs.ebx); error = PTR_ERR(filename); @@ -623,7 +623,7 @@ asmlinkage int sys_execve(struct pt_regs } putname(filename); out: - printk("return from sys_execve()\n"); + pr_debug("return from sys_execve()\n"); return error; } diff -urpN pgcl-2.6.0-test5-bk3-4/arch/i386/mm/fault.c pgcl-2.6.0-test5-bk3-5/arch/i386/mm/fault.c --- pgcl-2.6.0-test5-bk3-4/arch/i386/mm/fault.c 2003-11-05 22:11:33.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/arch/i386/mm/fault.c 2003-11-07 15:56:33.000000000 -0800 @@ -4,8 +4,6 @@ * Copyright (C) 1995 Linus Torvalds */ -#define DEBUG - #include #include #include @@ -83,7 +81,7 @@ asmlinkage void do_page_fault(struct pt_ /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); - printk("%d: faulted on %lx, EIP = 0x%lx\n", + pr_debug("%d: faulted on %lx, EIP = 0x%lx\n", tsk->pid, address, regs->eip); @@ -116,17 +114,17 @@ asmlinkage void do_page_fault(struct pt_ if (1) { pgd_t *pgd = pgd_offset(mm, address); pmd_t *pmd = pmd_offset(pgd, address); - printk("%d: fault handled by PGD at vaddr %p, %Lx\n", + pr_debug("%d: fault handled by PGD at vaddr %p, %Lx\n", current->pid, pgd, (u64)pgd_val(*pgd)); - printk("%d: fault handled by PMD at vaddr %p, %Lx\n", + pr_debug("%d: fault handled by PMD at vaddr %p, %Lx\n", current->pid, pmd, (u64)pmd_val(*pmd)); if (pmd_present(*pmd)) { - printk("%d: fault will be handled by PTE at paddr %Lx\n", + pr_debug("%d: fault will be handled by PTE at paddr %Lx\n", current->pid, (u64)(pmd_val(*pmd) & MMUPAGE_MASK) + pte_index(address)*sizeof(pte_t)); } else - printk("pmd not present\n"); + pr_debug("pmd not present\n"); } /* * If we're in an interrupt, have no user context or are running in an @@ -135,7 +133,7 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto no_context; - printk("%d: about to down_read(&mm->mmap_sem)\n", current->pid); + pr_debug("%d: about to down_read(&mm->mmap_sem)\n", current->pid); down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -237,7 +235,7 @@ bad_area: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { -#ifdef DEBUG +#if 1 || defined(DEBUG) printk("user mode SIGSEGV, pid = %d, comm = %16s, EIP = %p, ESP = %p, CR2 = %p\n", current->pid, current->comm, (void *)regs->eip, (void *)regs->esp, (void *)address); spin_lock(&mm->page_table_lock); diff -urpN pgcl-2.6.0-test5-bk3-4/fs/exec.c pgcl-2.6.0-test5-bk3-5/fs/exec.c --- pgcl-2.6.0-test5-bk3-4/fs/exec.c 2003-11-06 20:55:53.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/fs/exec.c 2003-11-06 21:13:29.000000000 -0800 @@ -297,7 +297,7 @@ void put_dirty_page(task_t *task, struct page_pfn = page_to_pfn(page); - printk("%d: put_dirty_page: page = %p, start pfn = 0x%lx, min_subpfn = 0x%x, addr = 0x%lx, prot = %lx\n", + pr_debug("%d: put_dirty_page: page = %p, start pfn = 0x%lx, min_subpfn = 0x%x, addr = 0x%lx, prot = %lx\n", current->pid, page, page_pfn, min_subpfn, addr, pgprot_val(prot)); spin_lock(&mm->page_table_lock); @@ -327,14 +327,14 @@ void put_dirty_page(task_t *task, struct goto out; if (!pte_none(*pte)) { pte_unmap(pte); - printk("%d: put_dirty_page: 
skipping addr 0x%lx\n", + pr_debug("%d: put_dirty_page: skipping addr 0x%lx\n", current->pid, addr); continue; } pfn = page_pfn + subpfn; set_pte(pte, pte_mkdirty(pte_mkwrite(pfn_pte(pfn, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); - printk("%d: put_dirty_page translating 0x%lx to pfn 0x%lx\n", + pr_debug("%d: put_dirty_page translating 0x%lx to pfn 0x%lx\n", current->pid, addr, pfn); page_cache_get(page); pte_unmap(pte); @@ -1077,7 +1077,7 @@ int do_execve(char * filename, struct file *file; int retval; - printk("do_execve(%p, %p, %p, %p)\n", filename, argv, envp, regs); + pr_debug("do_execve(%p, %p, %p, %p)\n", filename, argv, envp, regs); sched_balance_exec(); @@ -1085,7 +1085,7 @@ int do_execve(char * filename, retval = PTR_ERR(file); if (IS_ERR(file)) { - printk("return 1 from do_execve()\n"); + pr_debug("return 1 from do_execve()\n"); return retval; } @@ -1143,7 +1143,7 @@ int do_execve(char * filename, /* execve success */ security_bprm_free(&bprm); - printk("return 2 from do_execve()\n"); + pr_debug("return 2 from do_execve()\n"); return retval; } @@ -1164,7 +1164,7 @@ out_file: fput(bprm.file); } - printk("return 3 from do_execve()\n"); + pr_debug("return 3 from do_execve()\n"); return retval; } diff -urpN pgcl-2.6.0-test5-bk3-4/include/asm-i386/tlb.h pgcl-2.6.0-test5-bk3-5/include/asm-i386/tlb.h --- pgcl-2.6.0-test5-bk3-4/include/asm-i386/tlb.h 2003-11-05 18:48:00.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/include/asm-i386/tlb.h 2003-11-07 21:52:02.000000000 -0800 @@ -17,10 +17,14 @@ #define GFP_PTE __GFP_PTE #endif +/* + * There are probably better ways to set these thresholds. + * The degenerate cases bother me. + */ #define PG_PTE PG_arch_1 -#define NR_PTE 128 +#define NR_PTE (128 > 8*PAGE_MMUCOUNT ? 128/PAGE_MMUCOUNT : 8) #define FREE_PTE_NR NR_PTE -#define NR_NONPTE 512 +#define NR_NONPTE (512 > 8*PAGE_MMUCOUNT ? 512/PAGE_MMUCOUNT : 8) #define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) #define PagePTE(page) test_bit(PG_PTE, &(page)->flags) @@ -30,7 +34,9 @@ /* * x86 doesn't need any special per-pte or - * per-vma handling.. + * per-vma handling... + * + * We do it anyway to cache pagetables with highpte. */ struct vm_area_struct; struct mmu_gather { @@ -94,6 +100,7 @@ static inline struct mmu_gather *tlb_gat struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); tlb->mm = mm; tlb->fullmm = flush; + tlb->freed = 0; put_cpu(); return tlb; } @@ -144,6 +151,7 @@ static inline void tlb_finish_mmu(struct tlb->mm->rss -= tlb->freed; else tlb->mm->rss = 0; + tlb->freed = 0; tlb_flush_mmu(tlb, start, end); } diff -urpN pgcl-2.6.0-test5-bk3-4/include/linux/folio.h pgcl-2.6.0-test5-bk3-5/include/linux/folio.h --- pgcl-2.6.0-test5-bk3-4/include/linux/folio.h 2003-11-06 20:30:13.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/include/linux/folio.h 2003-11-07 22:08:11.000000000 -0800 @@ -25,14 +25,30 @@ static /* inline */ int pte_match(pte_t { if (pte_none(*pte1)) return pte_none(*pte2); - if (pte_present(*pte1)) { + else if (pte_present(*pte1)) { + unsigned long pfn1, pfn2; + if (!pte_present(*pte2)) return 0; - return pte_page(*pte2) == pte_page(*pte1); - } - if (pte_none(*pte2) || pte_present(*pte2)) + + pfn1 = pte_pfn(*pte1); + pfn2 = pte_pfn(*pte2); + + /* if pfn1 is invalid, someone screwed up */ + if (!pfn_valid(pfn1) || !pfn_valid(pfn2)) + return 0; + /* + * We want to do the following with less overhead: + * return pfn_to_page(pfn1) == pfn_to_page(pfn2); + * If gcc doesn't turn this into a shift, it's time + * to start gunning down gcc hackers. 
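+	 * (Worked example, assuming PAGE_MMUCOUNT == 8, i.e. 4K mmupages
+	 * in a 32K page: pfn/PAGE_MMUCOUNT is just pfn >> 3, so pfns
+	 * 0x1001 and 0x1007 compare equal -- same struct page -- while
+	 * 0x1001 and 0x1008 do not.)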
+ */ + return pfn1/PAGE_MMUCOUNT == pfn2/PAGE_MMUCOUNT; + } else if (pte_none(*pte2) || pte_present(*pte2)) return 0; - return pte_to_swp_entry(*pte2).val == pte_to_swp_entry(*pte1).val; + else + return pte_to_swp_entry(*pte2).val/PAGE_MMUCOUNT + == pte_to_swp_entry(*pte1).val/PAGE_MMUCOUNT; } /* @@ -176,6 +192,9 @@ static int prepare_folio(pte_addr_t foli /* * Check if the wide folio already has a private page allocated to it. + * i.e. we're trying to see if we are the sole owners of the page we + * would otherwise COW in this folio so as to merely map it read/write + * ourselves without copying, allocating, freeing, etc. */ static struct page *private_folio_page(pte_addr_t paddrs[], struct page *swap_page) { @@ -187,13 +206,13 @@ static struct page *private_folio_page(p int fcount, pcount, scount, tcount; int i, j; - printk("%d: private_folio_page(%p, %p)\n", + pr_debug("%d: private_folio_page(%p, %p)\n", current->pid, paddrs, swap_page); kmap_atomic_sg(folio, paddrs, KM_FOLIO); for (j = PAGE_MMUCOUNT - 1; !folio[j]; j--) { - printk("%d: skipping %d\n", current->pid, j); + pr_debug("%d: skipping %d\n", current->pid, j); } fcount = j + 1; /* @@ -209,10 +228,10 @@ static struct page *private_folio_page(p } j = 0; - printk("%d: starting fcount = %d\n", current->pid, fcount); + pr_debug("%d: starting fcount = %d\n", current->pid, fcount); while (j < fcount) { if (!folio[j] || !pte_present(*folio[j])) { - printk("%d: skipping folio[%d] = %p (0x%Lx), " + pr_debug("%d: skipping folio[%d] = %p (0x%Lx), " "presence = %d\n", current->pid, j, @@ -243,7 +262,7 @@ static struct page *private_folio_page(p continue; } } - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; pcount = page_count(page) - 1; /* omit swap cache */ if (PagePrivate(page)) pcount--; @@ -266,7 +285,10 @@ static struct page *private_folio_page(p for (i = j + 1; pcount && i < fcount; i++) { if (!folio[i] || !pte_present(*folio[i])) continue; - if (pte_page(*folio[i]) == page) + pfn = pte_pfn(*folio[i]); + if (!pfn_valid(pfn)) + continue; + if (pfn_to_page(pfn) == page) pcount--; } if (pcount) @@ -274,17 +296,27 @@ static struct page *private_folio_page(p for (i = 0; scount && i < fcount; i++) { if (!folio[i] || pte_present(*folio[i])) continue; - if (pte_to_swp_entry(*folio[i]).val == entry.val) + /* + * entry.val % PAGE_MMUCOUNT represents the mmupage + * within the page; divide by PAGE_MMUCOUNT to see + * if they refer to the same swap entry; all indexing + * into ->swap_map[] is done with this scaling. 
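+			 * E.g., again assuming PAGE_MMUCOUNT == 8: ptes with
+			 * entry.val 0x153 and 0x155 both scale to 0x2a, so
+			 * they are two mmupages of the same swapped page and
+			 * share swap_map[0x2a]; entry.val 0x158 scales to
+			 * 0x2b and belongs to the next one.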
+ */ + if (pte_to_swp_entry(*folio[i]).val/PAGE_MMUCOUNT + == entry.val/PAGE_MMUCOUNT) scount--; } if (scount) continue; kunmap_atomic_sg(folio, KM_FOLIO); - printk("private_folio_page: page=%p, fcount=%d, pcount=%d, scount=%d, tcount=%d\n", page, fcount, pcount, scount, tcount); + pr_debug("private_folio_page() found a private page: " + "page=%p, fcount=%d, pcount=%d, scount=%d, " + "tcount=%d\n", + page, fcount, pcount, scount, tcount); return page; } kunmap_atomic_sg(folio, KM_FOLIO); - printk("%d: private_folio_page: page=NULL, j=%d, fcount=%d\n", + pr_debug("%d: private_folio_page: page=NULL, j=%d, fcount=%d\n", current->pid, j, fcount); return NULL; } @@ -312,7 +344,7 @@ static void restrict_folio(pte_addr_t pa unsigned long addr; int j; - printk("%d: restrict_folio(%p, %p = [0x%lx,0x%lx), 0x%lx, %p)\n", + pr_debug("%d: restrict_folio(%p, %p = [0x%lx,0x%lx), 0x%lx, %p)\n", current->pid, paddrs, vma, vma->vm_start, vma->vm_end, address, ptep); @@ -321,23 +353,23 @@ static void restrict_folio(pte_addr_t pa addr = address - vma_suboffset(vma, address); for (j = 0; j < PAGE_MMUCOUNT; j++, addr += MMUPAGE_SIZE) { if (!folio[j]) { - printk("%d: restrict_folio() saw folio[%d], addr 0x%lx NULL\n", + pr_debug("%d: restrict_folio() saw folio[%d], addr 0x%lx NULL\n", current->pid, j, addr); continue; } if (addr < vma->vm_start || addr >= vma->vm_end) { - printk("%d: restrict_folio() saw folio[%d], addr 0x%lx outside vma\n", + pr_debug("%d: restrict_folio() saw folio[%d], addr 0x%lx outside vma\n", current->pid, j, addr); folio[j] = NULL; paddrs[j] = NOPTE; } else if (!pte_match(folio[j], ptep)) { - printk("%d: restrict_folio() saw folio[%d], addr 0x%lx not match, folio[j] = 0x%Lx, ptep = 0x%Lx\n", + pr_debug("%d: restrict_folio() saw folio[%d], addr 0x%lx not match, folio[j] = 0x%Lx, ptep = 0x%Lx\n", current->pid, j, addr, (u64)pte_val(*folio[j]), (u64)pte_val(*ptep)); folio[j] = NULL; paddrs[j] = NOPTE; } else - printk("%d: restrict folio saw folio[%d], addr 0x%lx = 0x%Lx match with ptep = %Lx\n", + pr_debug("%d: restrict folio saw folio[%d], addr 0x%lx = 0x%Lx match with ptep = %Lx\n", current->pid, j, addr, (u64)pte_val(*folio[j]), (u64)pte_val(*ptep)); } @@ -395,11 +427,11 @@ static void copy_folio(pte_addr_t paddrs if (src_page != ZERO_PAGE(address)) { src = kmap_atomic(src_page, KM_USER1); src_pfn = page_to_pfn(src_page); - printk("%d: copying nonzero page\n", current->pid); + pr_debug("%d: copying nonzero page\n", current->pid); } else { src = NULL; src_pfn = 0; - printk("%d: zeroing out page\n", current->pid); + pr_debug("%d: zeroing out page\n", current->pid); } while (j < PAGE_MMUCOUNT) { if (!folio[j]) { @@ -417,14 +449,14 @@ static void copy_folio(pte_addr_t paddrs * But ia64 sh sparc64 need to use clear/copy_user_page. 
*/ if (src) { - printk("%d: copying %d mmupages from pfn " + pr_debug("%d: copying %d mmupages from pfn " "0x%lx to 0x%lx\n", current->pid, size/MMUPAGE_SIZE, src_pfn + offset/MMUPAGE_SIZE, dst_pfn + offset/MMUPAGE_SIZE); memcpy(dst + offset, src + offset, size); } else { - printk("%d: zeroing %d mmupages at pfn 0x%lx\n", + pr_debug("%d: zeroing %d mmupages at pfn 0x%lx\n", current->pid, size/MMUPAGE_SIZE, dst_pfn + offset/MMUPAGE_SIZE); memset(dst + offset, 0, size); @@ -459,7 +491,7 @@ set_folio_page(pte_addr_t paddrs[], stru old_pte = *folio[j]; set_pte(folio[j], pfn_pte(pfn + j, __pgprot(prot.pgprot | flags))); - printk("%d: translating vaddr 0x%lx to pfn 0x%lx, " + pr_debug("%d: translating vaddr 0x%lx to pfn 0x%lx, " "new pte = 0x%Lx, old pte = 0x%Lx\n", current->pid, ptep_to_address(folio[j]), pfn + j, diff -urpN pgcl-2.6.0-test5-bk3-4/include/linux/swap.h pgcl-2.6.0-test5-bk3-5/include/linux/swap.h --- pgcl-2.6.0-test5-bk3-4/include/linux/swap.h 2003-11-05 18:48:01.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/include/linux/swap.h 2003-11-07 00:34:17.000000000 -0800 @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff @@ -180,21 +182,29 @@ extern int vm_swappiness; /* linux/mm/rmap.c */ #ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, +#if 0 +#define RMAP_FASTCALL(x) FASTCALL(x) +#else +#define RMAP_FASTCALL(x) x +#endif + +int RMAP_FASTCALL(page_referenced(struct page *)); +struct pte_chain *RMAP_FASTCALL(page_add_rmap(struct page *, pte_t *, struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); -struct pte_chain *FASTCALL(pte_chain_alloc_chained(int)); -void FASTCALL(pte_chain_free_chained(struct pte_chain *)); -struct pte_chain *FASTCALL(page_add_rmap_chained(struct page *, pte_t *, +struct pte_chain *RMAP_FASTCALL(rmap_add_folio(struct page *, + pte_addr_t [], struct pte_chain *)); +void RMAP_FASTCALL(rmap_remove_folio(struct page *, pte_addr_t [])); +void RMAP_FASTCALL(page_remove_rmap(struct page *, pte_t *)); +int RMAP_FASTCALL(try_to_unmap(struct page *)); /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else #define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL + +/* people really need to make sure these macro-like things aren't abused */ +#define try_to_unmap(page) ({ SWAP_FAIL; }) #endif /* CONFIG_MMU */ /* return values of try_to_unmap */ diff -urpN pgcl-2.6.0-test5-bk3-4/kernel/fork.c pgcl-2.6.0-test5-bk3-5/kernel/fork.c --- pgcl-2.6.0-test5-bk3-4/kernel/fork.c 2003-11-05 18:56:59.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/kernel/fork.c 2003-11-06 21:14:31.000000000 -0800 @@ -1087,7 +1087,7 @@ long do_fork(unsigned long clone_flags, int trace = 0; long pid; - printk("%d: do_fork()\n", current->pid); + pr_debug("%d: do_fork()\n", current->pid); if (unlikely(current->ptrace)) { trace = fork_traceflag (clone_flags); @@ -1139,7 +1139,7 @@ long do_fork(unsigned long clone_flags, */ set_need_resched(); } - printk("%d: do_fork() = %ld\n", current->pid, pid); + pr_debug("%d: do_fork() = %ld\n", current->pid, pid); return pid; } diff -urpN pgcl-2.6.0-test5-bk3-4/mm/memory.c pgcl-2.6.0-test5-bk3-5/mm/memory.c --- pgcl-2.6.0-test5-bk3-4/mm/memory.c 2003-11-06 19:27:22.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/mm/memory.c 
2003-11-07 19:32:22.000000000 -0800 @@ -36,8 +36,6 @@ * (Gerhard.Wichert@pdb.siemens.de) */ -#define DEBUG - #include #include #include @@ -68,6 +66,8 @@ unsigned long num_physpages; void * high_memory; struct page *highmem_start_page; +struct pte_chain *rmap_add_folio(struct page *, pte_addr_t [], struct pte_chain *); + /* * We special-case the C-O-W ZERO_PAGE, because it's such * a common occurrence (no need to read the page to know @@ -1028,7 +1028,7 @@ static int do_wp_page(struct mm_struct * pte_addr_t folio[PAGE_MMUCOUNT+1]; int reprep, rss, ret; - printk("%d: do_wp_page(%p, %p, 0x%lx, %p, %p, %Lx\n", + pr_debug("%d: do_wp_page(%p, %p, 0x%lx, %p, %p, %Lx\n", current->pid, mm, vma, address, page_table, pmd, (u64)pte_val(pte)); if (unlikely(!pfn_valid(pfn))) { @@ -1046,7 +1046,7 @@ static int do_wp_page(struct mm_struct * reprep = prepare_folio(folio, vma, address, ptep_to_paddr(page_table), 1); new_page = private_folio_page(folio, PAGE_MMUSHIFT ? NULL : old_page); if (new_page) { - printk("%d: got private page\n", current->pid); + pr_debug("%d: got private page\n", current->pid); page_cache_get(new_page); goto got_page; } @@ -1060,7 +1060,7 @@ static int do_wp_page(struct mm_struct * if (!pte_chain) goto no_mem; new_page = alloc_page(GFP_HIGHUSER); - printk("%d: allocated page at 0x%lx\n", + pr_debug("%d: allocated page at 0x%lx\n", current->pid, page_to_pfn(new_page)); if (!new_page) @@ -1072,25 +1072,27 @@ static int do_wp_page(struct mm_struct * spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, pte)) { - printk("%d: pte changed, back out of fault\n", current->pid); + pr_debug("%d: pte changed, back out of fault\n", current->pid); page_cache_release(new_page); return VM_FAULT_MINOR; } if (reprep) { - printk("%d: reprepping folio\n", current->pid); + pr_debug("%d: reprepping folio\n", current->pid); prepare_folio(folio, vma, address, ptep_to_paddr(page_table), 1); } new_page = private_folio_page_xchg(folio, new_page); got_page: restrict_folio(folio, vma, address, page_table); if (new_page != old_page) { - printk("%d: copying folio\n", current->pid); + pr_debug("%d: copying folio\n", current->pid); copy_folio(folio, new_page, old_page, address); flush_cache_page(vma, address); + rmap_remove_folio(old_page, folio); } rss = set_folio_page(folio, new_page, vma->vm_page_prot, _PAGE_DIRTY|_PAGE_RW); if (new_page != old_page) { + pte_chain = rmap_add_folio(new_page, folio, pte_chain); adjust_page_count(new_page, rss - 1); if (PageReserved(old_page)) mm->rss += rss; @@ -1100,7 +1102,8 @@ got_page: pte_unmap(page_table); flush_folio(folio, vma, address); update_mmu_cache(vma, address, folio); - page_cache_release(old_page); + if (!PageReserved(old_page)) + page_cache_release(old_page); ret = VM_FAULT_MINOR; goto out; @@ -1111,7 +1114,7 @@ oom: out: spin_unlock(&mm->page_table_lock); pte_chain_free(pte_chain); - printk("%d: return from do_wp_page()\n", current->pid); + pr_debug("%d: return from do_wp_page()\n", current->pid); return ret; } @@ -1268,7 +1271,7 @@ static int do_swap_page(struct mm_struct int rss, ret = VM_FAULT_MINOR; struct pte_chain *pte_chain = NULL; - printk("%d: do_swap_page(%p, %p, %lx, %p, %p, %Lx, %d)\n", + pr_debug("%d: do_swap_page(%p, %p, %lx, %p, %p, %Lx, %d)\n", current->pid, mm, vma, address, page_table, pmd, (u64)pte_val(orig_pte), write_access); pte_unmap(page_table); @@ -1344,7 +1347,8 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); rss = set_folio_page(folio, page, vma->vm_page_prot, 
write_access ? (_PAGE_DIRTY|_PAGE_RW) : 0); - pte_chain = page_add_rmap(page, page_table, pte_chain); + /* pte_chain = page_add_rmap(page, page_table, pte_chain); */ + pte_chain = rmap_add_folio(page, folio, pte_chain); adjust_page_count(page, rss - 1); mm->rss += rss; __swap_free(entry, rss); @@ -1358,7 +1362,7 @@ static int do_swap_page(struct mm_struct spin_unlock(&mm->page_table_lock); out: pte_chain_free(pte_chain); - printk("%d: return from do_swap_page()\n", current->pid); + pr_debug("%d: return from do_swap_page()\n", current->pid); return ret; } @@ -1377,7 +1381,7 @@ do_anonymous_page(struct mm_struct *mm, struct pte_chain *pte_chain; int ret; - printk("%d: do_anonymous_page(%p, %p, %p, %p, %d, %lx)\n", + pr_debug("%d: do_anonymous_page(%p, %p, %p, %p, %d, %lx)\n", current->pid, mm, vma, page_table, pmd, write_access, addr); pte_chain = pte_chain_alloc(GFP_ATOMIC); @@ -1398,7 +1402,7 @@ do_anonymous_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); new_page = alloc_page(GFP_HIGHUSER); - printk("%d: allocated page at 0x%lx\n", + pr_debug("%d: allocated page at 0x%lx\n", current->pid, page_to_pfn(new_page)); if (!new_page) @@ -1434,7 +1438,8 @@ do_anonymous_page(struct mm_struct *mm, set_folio_page(folio, page, vma->vm_page_prot, 0); } /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); + /* pte_chain = page_add_rmap(page, page_table, pte_chain); */ + pte_chain = rmap_add_folio(page, folio, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ @@ -1447,7 +1452,7 @@ no_mem: ret = VM_FAULT_OOM; out: pte_chain_free(pte_chain); - printk("%d: return from do_anonymous_page()\n", current->pid); + pr_debug("%d: return from do_anonymous_page()\n", current->pid); return ret; } @@ -1473,7 +1478,7 @@ do_no_page(struct mm_struct *mm, struct struct pte_chain *pte_chain; int ret, rss, sequence = 0; - printk("%d: do_no_page(%p, %p, %lx, %d, %p, %p)\n", + pr_debug("%d: do_no_page(%p, %p, %lx, %d, %p, %p)\n", current->pid, mm, vma, address, write_access, page_table, pmd); pte_unmap(page_table); @@ -1506,7 +1511,7 @@ retry: if ((write_access && !(vma->vm_flags & VM_SHARED)) && (page_count(page) > 1 || PageReserved(page))) { new_page = alloc_page(GFP_HIGHUSER); - printk("%d: allocated page at 0x%lx\n", + pr_debug("%d: allocated page at 0x%lx\n", current->pid, page_to_pfn(new_page)); if (!new_page) { @@ -1553,6 +1558,7 @@ retry: flush_icache_page(vma, page); rss = set_folio_page(folio, page, vma->vm_page_prot, write_access ? 
(_PAGE_RW|_PAGE_DIRTY) : 0); + pte_chain = rmap_add_folio(page, folio, pte_chain); if (!PageReserved(page)) { adjust_page_count(page, rss - 1); mm->rss += rss; @@ -1568,7 +1574,7 @@ oom: ret = VM_FAULT_OOM; out: pte_chain_free(pte_chain); - printk("%d: return %d from do_no_page()\n", current->pid, ret); + pr_debug("%d: return %d from do_no_page()\n", current->pid, ret); return ret; } @@ -1583,7 +1589,7 @@ static int do_file_page(struct mm_struct unsigned long pgoff; int err; - printk("%d: do_file_page(%p, %p, %lx, %d, %p, %p)\n", + pr_debug("%d: do_file_page(%p, %p, %lx, %d, %p, %p)\n", current->pid, mm, vma, address, write_access, pte, pmd); BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); @@ -1603,7 +1609,7 @@ static int do_file_page(struct mm_struct spin_unlock(&mm->page_table_lock); err = vma->vm_ops->populate(vma, address & MMUPAGE_MASK, MMUPAGE_SIZE, vma->vm_page_prot, pgoff, 0); - printk("%d: return from do_file_page()\n", current->pid); + pr_debug("%d: return from do_file_page()\n", current->pid); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -1679,7 +1685,7 @@ int handle_mm_fault(struct mm_struct *mm address &= MMUPAGE_MASK; - printk("%d: handle_mm_fault(%p, %p = [%lx, %lx), %lx, %d)\n", + pr_debug("%d: handle_mm_fault(%p, %p = [%lx, %lx), %lx, %d)\n", current->pid, mm, vma, vma->vm_start, vma->vm_end, address, write_access); diff -urpN pgcl-2.6.0-test5-bk3-4/mm/page_alloc.c pgcl-2.6.0-test5-bk3-5/mm/page_alloc.c --- pgcl-2.6.0-test5-bk3-4/mm/page_alloc.c 2003-11-05 18:50:36.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/mm/page_alloc.c 2003-11-07 00:10:06.000000000 -0800 @@ -74,20 +74,23 @@ static void bad_page(const char *functio { printk("Bad page state for 0x%p (pfn=0x%lx) at %s\n", page, page_to_pfn(page), function); - printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", + printk("flags:0x%08lx mapping:%p mapped:%d count:%d private:0x%lx\n", page->flags, page->mapping, - page_mapped(page), page_count(page)); + page_mapped(page), page_count(page), page->private); printk("Backtrace:\n"); dump_stack(); printk("Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_direct | + 1 << PG_chainlock | 1 << PG_writeback); set_page_count(page, 0); page->mapping = NULL; + page->pte.direct = (pte_addr_t)NULL; } #ifndef CONFIG_HUGETLB_PAGE @@ -210,20 +213,67 @@ static inline void __free_pages_bulk (st list_add(&(base + page_idx)->list, &area->free_list); } +/* + * I ruined this goddamn thing performancewise, but I desperately need + * more accessible debugging info. 
+ */ static inline void free_pages_check(const char *function, struct page *page) { - if ( page_mapped(page) || - page->mapping != NULL || - page_count(page) != 0 || - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_writeback ))) + int bad = 0; + + if (unlikely(PageDirect(page))) { + printk("PageDirect(page)\n"); + bad = 1; + } + + if (unlikely(test_bit(PG_chainlock, &page->flags))) { + printk("pte_chain_lock(page) held!\n"); + bad = 1; + } + + if (unlikely(page_mapped(page))) { + printk("page_mapped(page)\n"); + bad = 1; + } + if (unlikely(page->mapping)) { + printk("page->mapping == %p != NULL\n", page->mapping); + bad = 1; + } + if (unlikely(page_count(page))) { + printk("page->count == %d != 0\n", page_count(page)); + bad = 1; + } + if (unlikely(PageLRU(page))) { + printk("PageLRU(page)\n"); + bad = 1; + } + if (unlikely(page->private)) { + printk("PagePrivate(page)\n"); + bad = 1; + } + if (unlikely(PageLocked(page))) { + printk("PageLocked(page)\n"); + bad = 1; + } + if (unlikely(PageActive(page))) { + printk("PageActive(page)\n"); + bad = 1; + } + if (unlikely(PageReclaim(page))) { + printk("PageReclaim(page)\n"); + bad = 1; + } + if (unlikely(PageSlab(page))) { + printk("PageSlab(page)\n"); + bad = 1; + } + if (unlikely(PageWriteback(page))) { + printk("PageWriteback(page)\n"); + bad = 1; + } + if (unlikely(bad)) bad_page(function, page); + if (PageDirty(page)) ClearPageDirty(page); } @@ -319,20 +369,24 @@ static inline void set_page_refs(struct */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->mapping || page_mapped(page) || PagePrivate(page) || (page->flags & ( - 1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_direct | + 1 << PG_chainlock | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); + page->flags &= ~( + 1<< PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_direct | 1 << PG_chainlock | + 1 << PG_checked | 1 << PG_mappedtodisk); page->private = 0; set_page_refs(page, order); } diff -urpN pgcl-2.6.0-test5-bk3-4/mm/rmap.c pgcl-2.6.0-test5-bk3-5/mm/rmap.c --- pgcl-2.6.0-test5-bk3-4/mm/rmap.c 2003-11-05 18:48:01.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/mm/rmap.c 2003-11-07 00:43:28.000000000 -0800 @@ -35,7 +35,13 @@ #include #include -/* #define DEBUG_RMAP */ +#define DEBUG_RMAP + +#ifdef DEBUG_RMAP +#define RMAP_BUG_ON(p) BUG_ON(p) +#else +#define RMAP_BUG_ON(p) do { } while (0) +#endif /* * Shared pages have a chain of pte_chain structures, used to locate @@ -45,8 +51,27 @@ * * We use an array of pte pointers in this structure to minimise cache misses * while traversing reverse maps. + * + * What we want here is + * NRPTE = (N*L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t) + * where N is the least such that NRPTE >= PAGE_MMUCOUNT. + * This looks hairier than it truly is. 
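+ * For instance, with 32-byte cachelines, 4-byte longs, an 8-byte
+ * pte_addr_t and PAGE_MMUCOUNT == 8 (all assumptions for the sake of
+ * the example), N comes out as 3 and NRPTE as (3*32 - 4)/8 = 11 >= 8,
+ * where the old single-cacheline pte_chain only held (32 - 4)/8 = 3.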
+ * + * Suppose we know that: + * (N*L1_CACHE_BYTES - sizeof(long))/sizeof(pte_addr_t) >= PAGE_MMUCOUNT + * We then have + * N*L1_CACHE_BYTES - sizeof(long) >= PAGE_MMUCOUNT*sizeof(pte_addr_t) + * and + * N*L1_CACHE_BYTES >= PAGE_MMUCOUNT*sizeof(pte_addr_t) + sizeof(long) + * and in turn + * N >= (PAGE_MMUCOUNT*sizeof(pte_addr_t)+sizeof(long)+L1_CACHE_BYTES-1) + * /L1_CACHE_BYTES */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) + +#define __NL1CB__ (PAGE_MMUCOUNT*sizeof(pte_addr_t) + sizeof(long)) +#define __NL1CL__ ((__NL1CB__ + L1_CACHE_BYTES - 1)/L1_CACHE_BYTES) +#define _NL1CL_ (__NL1CL__ > 0 ? __NL1CL__ : __NL1CL__) +#define NRPTE ((_NL1CL_*L1_CACHE_BYTES - sizeof(long))/sizeof(pte_addr_t)) /* * next_and_idx encodes both the address of the next pte_chain and the @@ -156,6 +181,24 @@ int page_referenced(struct page * page) return referenced; } +static inline int check_pte_paddr_present(struct page *page, pte_addr_t pte) +{ + struct pte_chain *chain; + + if (PageDirect(page)) + return page->pte.direct == pte; + + for (chain = page->pte.chain; chain; chain = pte_chain_next(chain)) { + int k; + for (k = pte_chain_idx(chain); k < NRPTE; ++k) { + if (chain->ptes[k] == pte) + return 1; + } + } + + return 0; +} + /** * page_add_rmap - add reverse mapping entry to a page * @page: the page to add the mapping to @@ -164,16 +207,15 @@ int page_referenced(struct page * page) * Add a new pte reverse mapping to a page. * The caller needs to hold the mm->page_table_lock. */ -struct pte_chain * -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) +static struct pte_chain * +RMAP_FASTCALL(__page_add_rmap(struct page *, pte_addr_t, struct pte_chain *)); + +static struct pte_chain * +__page_add_rmap(struct page *page, pte_addr_t pte_paddr, struct pte_chain *pte_chain) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); struct pte_chain *cur_pte_chain; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return pte_chain; - - pte_chain_lock(page); + RMAP_BUG_ON(check_pte_paddr_present(page, pte_paddr)); if (page->pte.direct == 0) { page->pte.direct = pte_paddr; @@ -206,21 +248,49 @@ page_add_rmap(struct page *page, pte_t * cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; cur_pte_chain->next_and_idx--; out: + return pte_chain; +} + +struct pte_chain *page_add_rmap(struct page *page, + pte_t *ptep, + struct pte_chain *pte_chain) +{ + pte_addr_t pte_paddr = ptep_to_paddr(ptep); + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return pte_chain; + + pte_chain_lock(page); + pte_chain = __page_add_rmap(page, pte_paddr, pte_chain); pte_chain_unlock(page); return pte_chain; } -struct pte_chain * -page_add_rmap_chained(struct page *page, pte_t *pte, struct pte_chain *pc) +/* + * Ultralame. The whole interaction with rmap needs rewriting anyway + * in order to reap an expected O(PAGE_MMUCOUNT) overhead reduction. + */ +struct pte_chain *rmap_add_folio(struct page *page, + pte_addr_t folio[], + struct pte_chain *pte_chain) { - struct pte_chain *rest = pte_chain_next(pc); - pc->next_and_idx = 0; - pc = page_add_rmap(page, pte, pc); - if (pc) { - pc->next_and_idx = pte_chain_encode(rest, 0); - rest = pc; + int k; + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return pte_chain; + + pte_chain_lock(page); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + /* + * I'd like to BUG_ON(!pte_chain) here, but we can + * consume the goddamn thing in __page_add_rmap() + * while still being able to accomplish insertions. 
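+		 * (Presumably safe because NRPTE >= PAGE_MMUCOUNT by
+		 * construction above: the empty block consumed on the
+		 * first overflow has room for every remaining folio
+		 * entry, so at most one pte_chain is eaten per call.)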
+ */ + if (folio[k]) + pte_chain = __page_add_rmap(page, folio[k], pte_chain); } - return rest; + pte_chain_unlock(page); + return pte_chain; } /** @@ -233,20 +303,12 @@ page_add_rmap_chained(struct page *page, * the page. * Caller needs to hold the mm->page_table_lock. */ -void page_remove_rmap(struct page *page, pte_t *ptep) +static void RMAP_FASTCALL(__page_remove_rmap(struct page *, pte_addr_t)); +static void __page_remove_rmap(struct page *page, pte_addr_t pte_paddr) { - pte_addr_t pte_paddr; struct pte_chain *pc; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; - - pte_paddr = ptep_to_paddr(ptep); - - pte_chain_lock(page); - - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + RMAP_BUG_ON(!check_pte_paddr_present(page, pte_paddr)); if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { @@ -287,11 +349,47 @@ void page_remove_rmap(struct page *page, } } out: + return; +} + +void page_remove_rmap(struct page *page, pte_t *ptep) +{ + pte_addr_t pte_paddr; + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + pte_paddr = ptep_to_paddr(ptep); + + pte_chain_lock(page); + if (page_mapped(page)) + __page_remove_rmap(page, pte_paddr); + else if (!page_mapped(page)) + dec_page_state(nr_mapped); + pte_chain_unlock(page); +} + +void rmap_remove_folio(struct page *page, pte_addr_t folio[]) +{ + int k; + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + pte_chain_lock(page); + + if (!page_mapped(page)) + goto out_unlock; + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (folio[k]) + __page_remove_rmap(page, folio[k]); + } + if (!page_mapped(page)) dec_page_state(nr_mapped); out_unlock: pte_chain_unlock(page); - return; } /** @@ -307,7 +405,7 @@ out_unlock: * pte_chain_lock shrink_list() * mm->page_table_lock try_to_unmap_one(), trylock */ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); +static int RMAP_FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); static int try_to_unmap_one(struct page * page, pte_addr_t paddr) { pte_t *ptep = rmap_ptep_map(paddr); @@ -536,34 +634,6 @@ struct pte_chain *pte_chain_alloc(int gf return ret; } -struct pte_chain *pte_chain_alloc_chained(int gfp_flags) -{ - int k; - struct pte_chain *ret = NULL; - - for (k = 0; k < PAGE_MMUCOUNT; ++k) { - struct pte_chain *tmp = pte_chain_alloc(gfp_flags); - if (tmp) { - tmp->next_and_idx = pte_chain_encode(ret, 0); - ret = tmp; - } else { - pte_chain_free_chained(ret); - return NULL; - } - } - return ret; -} - -void pte_chain_free_chained(struct pte_chain *pc) -{ - while (pc) { - struct pte_chain *next = pte_chain_next(pc); - pc->next_and_idx = 0; - __pte_chain_free(pc); - pc = next; - } -} - void __init pte_chain_init(void) { pte_chain_cache = kmem_cache_create( "pte_chain", diff -urpN pgcl-2.6.0-test5-bk3-4/mm/swapfile.c pgcl-2.6.0-test5-bk3-5/mm/swapfile.c --- pgcl-2.6.0-test5-bk3-4/mm/swapfile.c 2003-11-05 18:48:48.000000000 -0800 +++ pgcl-2.6.0-test5-bk3-5/mm/swapfile.c 2003-11-07 16:58:16.000000000 -0800 @@ -3,6 +3,7 @@ * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie + * Page clustering swap adjustments Nov 2003, William Irwin */ #include @@ -48,7 +49,22 @@ struct swap_info_struct swap_info[MAX_SW /* * returns offset into ->swap_map[] array, each entry of which - * tracks PAGE_SIZE (not MMUPAGE_SIZE) + * tracks PAGE_SIZE (not MMUPAGE_SIZE). + * Here comes the pain. Every usage of swp_offset() needs to be + * watched like a hawk. 
swp_offset(entry) % PAGE_MMUCOUNT tracks the + * MMUPAGE_SIZE -sized subblock within the PAGE_SIZE region swapped + * and is largely for the benefit of ptes, so that unaligned swapped + * areas can have their proper pieces of pages recovered from ptes. + * More advanced implementations may utilize rmap information to + * accomplish private_folio_page() and the like, at which time the + * swap layer will have no means of recovering sub-block information + * from virtual addresses. This is a serious concern, as incremental + * methods that would scatter pieces of a page to the four winds are + * required for scaling to very large values of PAGE_SIZE. + * + * The upshot is that indexing wildly off of ->swap_map[] without + * scaling the results of swp_offset() will hurt. Badly. I've seen bad + * swp_type() results here too; I may be in trouble. */ static inline int scan_swap_map(struct swap_info_struct *si) { @@ -164,7 +180,7 @@ out: static struct swap_info_struct * swap_info_get(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p = NULL; unsigned long offset, type; if (!entry.val) @@ -175,7 +191,7 @@ static struct swap_info_struct * swap_in "in swap_info_get()\n", type, nr_swapfiles); goto bad_nofile; } - p = & swap_info[type]; + p = &swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -197,18 +213,23 @@ static struct swap_info_struct * swap_in bad_free: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); WARN_ON(1); - goto out; + goto out_err; bad_offset: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); WARN_ON(1); - goto out; + goto out_err; bad_device: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); WARN_ON(1); - goto out; + goto out_err; bad_nofile: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); WARN_ON(1); +out_err: + printk("swap_free: type = %x, offset = 0x%lx, max = 0x%lx\n", + swp_type(entry), + swp_offset(entry), + p ? p->max*PAGE_MMUCOUNT : 0); /* dump pagetables */ #if 1 @@ -1653,7 +1674,7 @@ int swap_count(struct page *page) swp_entry_t entry; int retval = 0; - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; if (!entry.val) goto bad_entry; type = swp_type(entry); @@ -1661,11 +1682,11 @@ int swap_count(struct page *page) goto bad_file; p = type + swap_info; offset = swp_offset(entry); - if (offset >= p->max) + if (offset >= p->max*PAGE_MMUCOUNT) goto bad_offset; - if (!p->swap_map[offset]) + if (!p->swap_map[offset/PAGE_MMUCOUNT]) goto bad_unused; - retval = p->swap_map[offset]; + retval = p->swap_map[offset/PAGE_MMUCOUNT]; out: return retval; bad_entry: @@ -1697,18 +1718,18 @@ void __swap_free(swp_entry_t entry, unsi if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); - if (offset >= p->max) + if (offset >= p->max*PAGE_MMUCOUNT) goto bad_offset; - if (!p->swap_map[offset]) + if (!p->swap_map[offset/PAGE_MMUCOUNT]) goto bad_free; swap_list_lock(); if (p->prio > swap_info[swap_list.next].prio) swap_list.next = type; swap_device_lock(p); - if (p->swap_map[offset] < SWAP_MAP_MAX) { - if (p->swap_map[offset] < count) + if (p->swap_map[offset/PAGE_MMUCOUNT] < SWAP_MAP_MAX) { + if (p->swap_map[offset/PAGE_MMUCOUNT] < count) goto bad_count; - if (!(p->swap_map[offset] -= count)) { + if (!(p->swap_map[offset/PAGE_MMUCOUNT] -= count)) { if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit)
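A minimal sketch of the ->swap_map[] indexing convention the swapfile.c
hunks above depend on (illustrative only, not part of the patch; the
helper names are made up):

/*
 * swp_offset() counts MMUPAGE_SIZE units; ->swap_map[] counts
 * PAGE_SIZE units, so every swap_map index must be scaled down
 * by PAGE_MMUCOUNT.
 */
static inline unsigned long swap_map_slot(swp_entry_t entry)
{
	/* which PAGE_SIZE-sized swap block backs this pte */
	return swp_offset(entry) / PAGE_MMUCOUNT;
}

static inline unsigned long mmupage_within_page(swp_entry_t entry)
{
	/* which MMUPAGE_SIZE piece of that block the pte refers to */
	return swp_offset(entry) % PAGE_MMUCOUNT;
}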