Brute-force forward port to 2.5.69. Includes the pagetable fragmentation fixes originally done for 2.5.68. Unified handling of anonymizing faults still needs trivial error cases handled and so isn't included here.

 arch/i386/Kconfig                      |   12
 arch/i386/boot/setup.S                 |    5
 arch/i386/kernel/apic.c                |    4
 arch/i386/kernel/cpu/amd.c             |    2
 arch/i386/kernel/cpu/mtrr/amd.c        |   10
 arch/i386/kernel/cpu/mtrr/centaur.c    |   12
 arch/i386/kernel/cpu/mtrr/cyrix.c      |    4
 arch/i386/kernel/cpu/mtrr/generic.c    |   16 -
 arch/i386/kernel/cpu/mtrr/if.c         |   28 -
 arch/i386/kernel/cpu/mtrr/main.c       |   10
 arch/i386/kernel/entry.S               |    2
 arch/i386/kernel/head.S                |    3
 arch/i386/kernel/microcode.c           |    2
 arch/i386/kernel/mpparse.c             |    6
 arch/i386/kernel/numaq.c               |    2
 arch/i386/kernel/setup.c               |   18 -
 arch/i386/kernel/smpboot.c             |    2
 arch/i386/kernel/sys_i386.c            |    4
 arch/i386/kernel/sysenter.c            |    2
 arch/i386/kernel/traps.c               |    3
 arch/i386/lib/getuser.S                |    2
 arch/i386/mm/discontig.c               |   62 ++--
 arch/i386/mm/fault.c                   |  177 +++++++++--
 arch/i386/mm/highmem.c                 |   60 +++
 arch/i386/mm/init.c                    |  189 +++++-----
 arch/i386/mm/ioremap.c                 |   24 -
 arch/i386/mm/pageattr.c                |   36 +-
 arch/i386/mm/pgtable.c                 |  110 ++++---
 arch/i386/pci/i386.c                   |    2
 drivers/block/ll_rw_blk.c              |    2
 drivers/char/agp/backend.c             |    8
 drivers/char/agp/generic.c             |   12
 drivers/char/mem.c                     |   42 +-
 drivers/oprofile/buffer_sync.c         |    2
 drivers/scsi/qlogicisp.c               |    2
 drivers/scsi/sym53c8xx.c               |    6
 drivers/scsi/sym53c8xx_2/sym_glue.c    |    4
 drivers/scsi/sym53c8xx_comm.h          |    4
 fs/aio.c                               |   37 +-
 fs/binfmt_elf.c                        |   22 -
 fs/bio.c                               |   18 -
 fs/direct-io.c                         |   82 ++++-
 fs/exec.c                              |   62 ++--
 fs/ext2/dir.c                          |    8
 fs/file_table.c                        |    2
 fs/inode.c                             |    4
 fs/proc/base.c                         |   46 ++-
 fs/proc/proc_misc.c                    |    2
 fs/proc/task_mmu.c                     |    2
 include/asm-alpha/page.h               |    2
 include/asm-arm/page.h                 |    2
 include/asm-cris/page.h                |    2
 include/asm-generic/page.h             |   11
 include/asm-generic/rmap.h             |   59 +++
 include/asm-i386/dma-mapping.h         |    2
 include/asm-i386/fixmap.h              |   40 ++
 include/asm-i386/highmem.h             |   19 -
 include/asm-i386/io.h                  |    2
 include/asm-i386/io_apic.h             |    2
 include/asm-i386/mmzone.h              |   39 +-
 include/asm-i386/numaq.h               |    4
 include/asm-i386/page.h                |   50 ++-
 include/asm-i386/pci.h                 |    4
 include/asm-i386/pgalloc.h             |  101 ++++++
 include/asm-i386/pgtable-2level.h      |   11
 include/asm-i386/pgtable-3level.h      |   15
 include/asm-i386/pgtable.h             |   75 ++--
 include/asm-i386/rmap.h                |   11
 include/asm-i386/setup.h               |    8
 include/asm-i386/shmparam.h            |    2
 include/asm-i386/thread_info.h         |   10
 include/asm-i386/tlbflush.h            |   11
 include/asm-ia64/page.h                |    2
 include/asm-m68k/page.h                |    2
 include/asm-m68knommu/page.h           |    2
 include/asm-mips/page.h                |    2
 include/asm-mips64/page.h              |    2
 include/asm-parisc/page.h              |    2
 include/asm-ppc/page.h                 |    2
 include/asm-ppc64/page.h               |    2
 include/asm-s390/page.h                |    2
 include/asm-sh/page.h                  |    2
 include/asm-sparc/page.h               |    2
 include/asm-sparc64/page.h             |    2
 include/asm-v850/page.h                |    1
 include/asm-x86_64/page.h              |    2
 include/linux/aio.h                    |    4
 include/linux/binfmts.h                |   10
 include/linux/bio.h                    |    5
 include/linux/highmem.h                |   11
 include/linux/ide.h                    |    2
 include/linux/mm.h                     |   44 ++
 include/linux/mmzone.h                 |    2
 include/linux/pagemap.h                |   11
 include/linux/sched.h                  |    9
 include/linux/shm.h                    |    2
 include/linux/sunrpc/svc.h             |    3
 include/linux/swap.h                   |    6
 init/main.c                            |    5
 ipc/shm.c                              |   10
 kernel/fork.c                          |    9
 kernel/futex.c                         |   26 +
 kernel/ksyms.c                         |    1
 kernel/ptrace.c                        |   25 +
 mm/bootmem.c                           |  118 +++----
 mm/filemap.c                           |   38 +-
 mm/fremap.c                            |   11
 mm/highmem.c                           |   51 ++-
 mm/madvise.c                           |   10
 mm/memory.c                            |  505 +++++++++++++++++++++++++--------
 mm/mincore.c                           |   32 +-
 mm/mlock.c                             |   18 -
 mm/mmap.c                              |   94 +++---
 mm/mprotect.c                          |   12
 mm/mremap.c                            |   30 -
 mm/msync.c                             |    6
 mm/page-writeback.c                    |    4
 mm/page_alloc.c                        |   14
 mm/page_io.c                           |    4
 mm/rmap.c                              |    9
 mm/shmem.c                             |   58 +--
 mm/slab.c                              |    4
 mm/swap.c                              |    2
 mm/swap_state.c                        |   16 -
 mm/swapfile.c                          |  136 +++++++-
 mm/vcache.c                            |    2
 mm/vmalloc.c                           |  136 +++----
 mm/vmscan.c                            |    2
 net/ipv4/netfilter/ip_conntrack_core.c |    4
 net/ipv4/tcp.c                         |    4
 130 files changed, 2108 insertions(+), 1082 deletions(-)

diff -prauwN linux-2.5.69/arch/i386/Kconfig pgcl-2.5.69-3/arch/i386/Kconfig
--- linux-2.5.69/arch/i386/Kconfig	2003-05-04 16:53:02.000000000 -0700
+++ pgcl-2.5.69-3/arch/i386/Kconfig	2003-05-26 07:14:19.000000000 -0700
@@ -663,6 +663,18 @@ config X86_PAE
 	depends on HIGHMEM64G
 	default y
 
+config PAGE_CLUSTER
+	int "Page clustering factor"
+	default 3 if HIGHMEM64G
+	default 2 if HIGHMEM4G
+	default 1
+	help
+	  Select page clustering factor as a power of 2.
+	  Defaults and examples:
+	  3 => 32KB PAGE_SIZE
+	  2 => 16KB PAGE_SIZE
+	  1 => 8KB PAGE_SIZE
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation Support"
diff -prauwN linux-2.5.69/arch/i386/boot/setup.S pgcl-2.5.69-3/arch/i386/boot/setup.S
--- linux-2.5.69/arch/i386/boot/setup.S	2003-05-04 16:53:31.000000000 -0700
+++ pgcl-2.5.69-3/arch/i386/boot/setup.S	2003-05-26 07:14:19.000000000 -0700
@@ -58,6 +58,9 @@
 #include
 #include
 #include
+
+#define VMALLOC_START (-0xC0000000 - 128*1024*1024)
+#include
 #include
 
 /* Signature words to ensure LILO loaded us right */
@@ -162,7 +165,7 @@ cmd_line_ptr:	.long 0		# (Header versio
 					# can be located anywhere in
 					# low memory 0x10000 or higher.
 
-ramdisk_max:	.long MAXMEM-1		# (Header version 0x0203 or later)
+ramdisk_max:	.long __MAXMEM-1	# (Header version 0x0203 or later)
 					# The highest safe address for
 					# the contents of an initrd
diff -prauwN linux-2.5.69/arch/i386/kernel/apic.c pgcl-2.5.69-3/arch/i386/kernel/apic.c
--- linux-2.5.69/arch/i386/kernel/apic.c	2003-05-04 16:53:57.000000000 -0700
+++ pgcl-2.5.69-3/arch/i386/kernel/apic.c	2003-05-26 07:14:19.000000000 -0700
@@ -674,7 +674,7 @@ void __init init_apic_mappings(void)
 	 * one for the IO-APIC.
*/ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = (unsigned long) alloc_bootmem_pages(MMUPAGE_SIZE); apic_phys = __pa(apic_phys); } else apic_phys = mp_lapic_addr; @@ -706,7 +706,7 @@ void __init init_apic_mappings(void) } } else { fake_ioapic_page: - ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long) alloc_bootmem_pages(MMUPAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); diff -prauwN linux-2.5.69/arch/i386/kernel/cpu/amd.c pgcl-2.5.69-3/arch/i386/kernel/cpu/amd.c --- linux-2.5.69/arch/i386/kernel/cpu/amd.c 2003-05-04 16:53:42.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/cpu/amd.c 2003-05-26 07:14:19.000000000 -0700 @@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret"); static void __init init_amd(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); + int mbytes = num_physpages >> (20-MMUPAGE_SHIFT); int r; /* diff -prauwN linux-2.5.69/arch/i386/kernel/cpu/mtrr/amd.c pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/amd.c --- linux-2.5.69/arch/i386/kernel/cpu/mtrr/amd.c 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/amd.c 2003-05-26 07:14:19.000000000 -0700 @@ -16,7 +16,7 @@ amd_get_mtrr(unsigned int reg, unsigned if (reg == 1) low = high; /* The base masks off on the right alignment */ - *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *base = (low & 0xFFFE0000) >> MMUPAGE_SHIFT; *type = 0; if (low & 1) *type = MTRR_TYPE_UNCACHABLE; @@ -42,7 +42,7 @@ amd_get_mtrr(unsigned int reg, unsigned * *128K ... */ low = (~low) & 0x1FFFC; - *size = (low + 4) << (15 - PAGE_SHIFT); + *size = (low + 4) << (15 - MMUPAGE_SHIFT); return; } @@ -77,8 +77,8 @@ static void amd_set_mtrr(unsigned int re desired 111 1111 1111 1100 mask But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ - regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) - | (base << PAGE_SHIFT) | (type + 1); + regs[reg] = (-size >> (15 - MMUPAGE_SHIFT) & 0x0001FFFC) + | (base << MMUPAGE_SHIFT) | (type + 1); /* * The writeback rule is quite specific. See the manual. 
Its @@ -97,7 +97,7 @@ static int amd_validate_add_page(unsigne o Power of 2 block o base suitably aligned to the power */ - if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - MMUPAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; return 0; diff -prauwN linux-2.5.69/arch/i386/kernel/cpu/mtrr/centaur.c pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/centaur.c --- linux-2.5.69/arch/i386/kernel/cpu/mtrr/centaur.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/centaur.c 2003-05-26 07:14:19.000000000 -0700 @@ -51,8 +51,8 @@ static void centaur_get_mcr(unsigned int reg, unsigned long *base, unsigned int *size, mtrr_type * type) { - *base = centaur_mcr[reg].high >> PAGE_SHIFT; - *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *base = centaur_mcr[reg].high >> MMUPAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> MMUPAGE_SHIFT; *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; @@ -72,14 +72,14 @@ static void centaur_set_mcr(unsigned int /* Disable */ high = low = 0; } else { - high = base << PAGE_SHIFT; + high = base << MMUPAGE_SHIFT; if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ + low = -size << MMUPAGE_SHIFT | 0x1f; /* only support write-combining... */ else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << MMUPAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << MMUPAGE_SHIFT | 0x09; /* WWO,WC */ } } centaur_mcr[reg].high = high; diff -prauwN linux-2.5.69/arch/i386/kernel/cpu/mtrr/cyrix.c pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/cyrix.c --- linux-2.5.69/arch/i386/kernel/cpu/mtrr/cyrix.c 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/cyrix.c 2003-05-26 07:14:19.000000000 -0700 @@ -30,7 +30,7 @@ cyrix_get_arr(unsigned int reg, unsigned /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); shift = ((unsigned char *) base)[1] & 0x0f; - *base >>= PAGE_SHIFT; + *base >>= MMUPAGE_SHIFT; /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. @@ -203,7 +203,7 @@ static void cyrix_set_arr(unsigned int r prepare_set(); - base <<= PAGE_SHIFT; + base <<= MMUPAGE_SHIFT; setCx86(arr, ((unsigned char *) &base)[3]); setCx86(arr + 1, ((unsigned char *) &base)[2]); setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); diff -prauwN linux-2.5.69/arch/i386/kernel/cpu/mtrr/generic.c pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/generic.c --- linux-2.5.69/arch/i386/kernel/cpu/mtrr/generic.c 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/generic.c 2003-05-26 07:14:19.000000000 -0700 @@ -131,13 +131,13 @@ void generic_get_mtrr(unsigned int reg, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | mask_hi << (32 - MMUPAGE_SHIFT) + | mask_lo >> MMUPAGE_SHIFT; /* This works correctly if size is a power of two, i.e. a contiguous range. 
*/ *size = -mask_lo; - *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *base = base_hi << (32 - MMUPAGE_SHIFT) | base_lo >> MMUPAGE_SHIFT; *type = base_lo & 0xff; } @@ -317,10 +317,10 @@ static void generic_set_mtrr(unsigned in relevant mask register to disable a range. */ wrmsr(MTRRphysMask_MSR(reg), 0, 0); } else { - wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type, - (base & size_and_mask) >> (32 - PAGE_SHIFT)); - wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800, - (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + wrmsr(MTRRphysBase_MSR(reg), base << MMUPAGE_SHIFT | type, + (base & size_and_mask) >> (32 - MMUPAGE_SHIFT)); + wrmsr(MTRRphysMask_MSR(reg), -size << MMUPAGE_SHIFT | 0x800, + (-size & size_and_mask) >> (32 - MMUPAGE_SHIFT)); } post_set(); @@ -335,7 +335,7 @@ int generic_validate_add_page(unsigned l if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { - if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + if (base & ((1 << (22 - MMUPAGE_SHIFT)) - 1)) { printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } diff -prauwN linux-2.5.69/arch/i386/kernel/cpu/mtrr/if.c pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/if.c --- linux-2.5.69/arch/i386/kernel/cpu/mtrr/if.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/if.c 2003-05-26 07:14:19.000000000 -0700 @@ -33,10 +33,10 @@ mtrr_file_add(unsigned long base, unsign FILE_FCOUNT(file) = fcount; } if (!page) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) return -EINVAL; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; } reg = mtrr_add_page(base, size, type, 1); if (reg >= 0) @@ -52,10 +52,10 @@ mtrr_file_del(unsigned long base, unsign unsigned int *fcount = file->private_data; if (!page) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) return -EINVAL; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; } reg = mtrr_del_page(-1, base, size); if (reg < 0) @@ -119,8 +119,8 @@ mtrr_write(struct file *file, const char for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; err = mtrr_add_page((unsigned long) base, (unsigned long) size, i, 1); @@ -193,8 +193,8 @@ mtrr_ioctl(struct inode *inode, struct f || gentry.size == 0x100000) gentry.base = gentry.size = gentry.type = 0; else { - gentry.base <<= PAGE_SHIFT; - gentry.size <<= PAGE_SHIFT; + gentry.base <<= MMUPAGE_SHIFT; + gentry.size <<= MMUPAGE_SHIFT; gentry.type = type; } @@ -319,18 +319,18 @@ static int mtrr_seq_show(struct seq_file if (size == 0) usage_table[i] = 0; else { - if (size < (0x100000 >> PAGE_SHIFT)) { + if (size < (0x100000 >> MMUPAGE_SHIFT)) { /* less than 1MB */ factor = 'K'; - size <<= PAGE_SHIFT - 10; + size <<= MMUPAGE_SHIFT - 10; } else { factor = 'M'; - size >>= 20 - PAGE_SHIFT; + size >>= 20 - MMUPAGE_SHIFT; } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, + i, base, base >> (20 - MMUPAGE_SHIFT), size, factor, attrib_to_str(type), usage_table[i]); } } diff -prauwN linux-2.5.69/arch/i386/kernel/cpu/mtrr/main.c 
pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/main.c --- linux-2.5.69/arch/i386/kernel/cpu/mtrr/main.c 2003-05-04 16:53:00.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/cpu/mtrr/main.c 2003-05-26 07:14:19.000000000 -0700 @@ -410,12 +410,12 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) { printk("mtrr: size and base must be multiples of 4 kiB\n"); printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); return -EINVAL; } - return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + return mtrr_add_page(base >> MMUPAGE_SHIFT, size >> MMUPAGE_SHIFT, type, increment); } @@ -506,12 +506,12 @@ int mtrr_del_page(int reg, unsigned long int mtrr_del(int reg, unsigned long base, unsigned long size) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) { printk("mtrr: size and base must be multiples of 4 kiB\n"); printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); return -EINVAL; } - return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); + return mtrr_del_page(reg, base >> MMUPAGE_SHIFT, size >> MMUPAGE_SHIFT); } EXPORT_SYMBOL(mtrr_add); @@ -579,7 +579,7 @@ static int __init mtrr_init(void) u32 phys_addr; phys_addr = cpuid_eax(0x80000008) & 0xff; size_or_mask = - ~((1 << (phys_addr - PAGE_SHIFT)) - 1); + ~((1 << (phys_addr - MMUPAGE_SHIFT)) - 1); size_and_mask = ~size_or_mask & 0xfff00000; } /* Athlon MTRRs use an Intel-compatible interface for diff -prauwN linux-2.5.69/arch/i386/kernel/entry.S pgcl-2.5.69-3/arch/i386/kernel/entry.S --- linux-2.5.69/arch/i386/kernel/entry.S 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/entry.S 2003-05-26 07:14:19.000000000 -0700 @@ -160,7 +160,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + andl $~(THREAD_SIZE-1), %ebp # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp diff -prauwN linux-2.5.69/arch/i386/kernel/head.S pgcl-2.5.69-3/arch/i386/kernel/head.S --- linux-2.5.69/arch/i386/kernel/head.S 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/head.S 2003-05-26 07:14:19.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -prauwN linux-2.5.69/arch/i386/kernel/microcode.c pgcl-2.5.69-3/arch/i386/kernel/microcode.c --- linux-2.5.69/arch/i386/kernel/microcode.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/microcode.c 2003-05-26 07:14:19.000000000 -0700 @@ -319,7 +319,7 @@ static ssize_t microcode_write(struct fi sizeof(struct microcode)); return -EINVAL; } - if ((len >> PAGE_SHIFT) > num_physpages) { + if ((len >> MMUPAGE_SHIFT) > num_physpages) { printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); return -EINVAL; } diff -prauwN linux-2.5.69/arch/i386/kernel/mpparse.c pgcl-2.5.69-3/arch/i386/kernel/mpparse.c --- linux-2.5.69/arch/i386/kernel/mpparse.c 2003-05-04 16:53:09.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/mpparse.c 
2003-05-26 07:14:19.000000000 -0700 @@ -708,7 +708,7 @@ static int __init smp_scan_config (unsig smp_found_config = 1; printk(KERN_INFO "found SMP MP-table at %08lx\n", virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem(virt_to_phys(mpf), MMUPAGE_SIZE); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute @@ -719,8 +719,8 @@ static int __init smp_scan_config (unsig * PAGE_SIZE from mpg->mpf_physptr yields BUG() * in reserve_bootmem. */ - unsigned long size = PAGE_SIZE; - unsigned long end = max_low_pfn * PAGE_SIZE; + unsigned long size = MMUPAGE_SIZE; + unsigned long end = max_low_pfn * MMUPAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; reserve_bootmem(mpf->mpf_physptr, size); diff -prauwN linux-2.5.69/arch/i386/kernel/numaq.c pgcl-2.5.69-3/arch/i386/kernel/numaq.c --- linux-2.5.69/arch/i386/kernel/numaq.c 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/numaq.c 2003-05-26 07:14:19.000000000 -0700 @@ -33,7 +33,7 @@ /* These are needed before the pgdat's are created */ extern long node_start_pfn[], node_end_pfn[]; -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(addr) ((addr) << (20 - MMUPAGE_SHIFT)) /* * Function: smp_dump_qct() diff -prauwN linux-2.5.69/arch/i386/kernel/setup.c pgcl-2.5.69-3/arch/i386/kernel/setup.c --- linux-2.5.69/arch/i386/kernel/setup.c 2003-05-04 16:53:14.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/setup.c 2003-05-26 07:14:19.000000000 -0700 @@ -554,6 +554,8 @@ void __init find_max_pfn(void) continue; if (end > max_pfn) max_pfn = end; + + max_pfn &= ~(PAGE_MMUCOUNT - 1); } } @@ -564,6 +566,8 @@ unsigned long __init find_max_low_pfn(vo { unsigned long max_low_pfn; + printk("MAXMEM = %p\n", (void *)MAXMEM); + max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { if (highmem_pages == -1) @@ -677,10 +681,10 @@ static unsigned long __init setup_memory highstart_pfn = max_low_pfn; } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + (highend_pfn - highstart_pfn) >> (20 - MMUPAGE_SHIFT)); #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); + max_low_pfn >> (20 - MMUPAGE_SHIFT)); /* * Initialize the boot-time allocator (with low memory only): */ @@ -701,7 +705,7 @@ static unsigned long __init setup_memory * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem(0, PAGE_SIZE); + reserve_bootmem(0, MMUPAGE_SIZE); #ifdef CONFIG_SMP /* @@ -709,7 +713,7 @@ static unsigned long __init setup_memory * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); + reserve_bootmem(MMUPAGE_SIZE, MMUPAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP /* @@ -726,7 +730,7 @@ static unsigned long __init setup_memory #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { + if (INITRD_START + INITRD_SIZE <= PFN_PHYS(max_low_pfn)) { reserve_bootmem(INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? 
INITRD_START + PAGE_OFFSET : 0; @@ -736,7 +740,7 @@ static unsigned long __init setup_memory printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", INITRD_START + INITRD_SIZE, - max_low_pfn << PAGE_SHIFT); + PFN_PHYS(max_low_pfn)); initrd_start = 0; } } @@ -790,7 +794,7 @@ static void __init register_memory(unsig request_resource(&ioport_resource, standard_io_resources+i); /* Tell the PCI layer not to allocate too close to the RAM area.. */ - low_mem_size = ((max_low_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; + low_mem_size = ((max_low_pfn << MMUPAGE_SHIFT) + 0xfffff) & ~0xfffff; if (low_mem_size > pci_mem_start) pci_mem_start = low_mem_size; } diff -prauwN linux-2.5.69/arch/i386/kernel/smpboot.c pgcl-2.5.69-3/arch/i386/kernel/smpboot.c --- linux-2.5.69/arch/i386/kernel/smpboot.c 2003-05-04 16:53:14.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/smpboot.c 2003-05-26 07:14:19.000000000 -0700 @@ -100,7 +100,7 @@ static unsigned long __init setup_trampo */ void __init smp_alloc_memory(void) { - trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + trampoline_base = (void *) alloc_bootmem_low_pages(MMUPAGE_SIZE); /* * Has to be in very low memory so we can execute * real-mode AP code. diff -prauwN linux-2.5.69/arch/i386/kernel/sys_i386.c pgcl-2.5.69-3/arch/i386/kernel/sys_i386.c --- linux-2.5.69/arch/i386/kernel/sys_i386.c 2003-05-04 16:53:42.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/sys_i386.c 2003-05-26 07:14:19.000000000 -0700 @@ -97,10 +97,10 @@ asmlinkage int old_mmap(struct mmap_arg_ goto out; err = -EINVAL; - if (a.offset & ~PAGE_MASK) + if (a.offset & ~MMUPAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> MMUPAGE_SHIFT); out: return err; } diff -prauwN linux-2.5.69/arch/i386/kernel/sysenter.c pgcl-2.5.69-3/arch/i386/kernel/sysenter.c --- linux-2.5.69/arch/i386/kernel/sysenter.c 2003-05-04 16:53:57.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/sysenter.c 2003-05-26 07:14:19.000000000 -0700 @@ -34,7 +34,7 @@ struct fake_sep_struct { struct task_struct task; unsigned char trampoline[32] __attribute__((aligned(1024))); unsigned char stack[0]; -} __attribute__((aligned(8192))); +} __attribute__((aligned(THREAD_SIZE))); void enable_sep_cpu(void *info) { diff -prauwN linux-2.5.69/arch/i386/kernel/traps.c pgcl-2.5.69-3/arch/i386/kernel/traps.c --- linux-2.5.69/arch/i386/kernel/traps.c 2003-05-04 16:53:03.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/kernel/traps.c 2003-05-26 07:14:19.000000000 -0700 @@ -120,7 +120,7 @@ void show_trace_task(struct task_struct unsigned long esp = tsk->thread.esp; /* User space on another CPU? */ - if ((esp ^ (unsigned long)tsk->thread_info) & (PAGE_MASK<<1)) + if ((esp ^ (unsigned long)tsk->thread_info) & ~(THREAD_SIZE-1)) return; show_trace((unsigned long *)esp); } @@ -431,6 +431,7 @@ static void unknown_nmi_error(unsigned c reason, smp_processor_id()); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + dump_stack(); } static void default_do_nmi(struct pt_regs * regs) diff -prauwN linux-2.5.69/arch/i386/lib/getuser.S pgcl-2.5.69-3/arch/i386/lib/getuser.S --- linux-2.5.69/arch/i386/lib/getuser.S 2003-05-04 16:52:49.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/lib/getuser.S 2003-05-26 07:14:19.000000000 -0700 @@ -8,9 +8,9 @@ * return an error value in addition to the "real" * return value. 
*/ +#include #include - /* * __get_user_X * diff -prauwN linux-2.5.69/arch/i386/mm/discontig.c pgcl-2.5.69-3/arch/i386/mm/discontig.c --- linux-2.5.69/arch/i386/mm/discontig.c 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/mm/discontig.c 2003-05-26 07:14:19.000000000 -0700 @@ -71,8 +71,6 @@ extern unsigned long max_low_pfn; extern unsigned long totalram_pages; extern unsigned long totalhigh_pages; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; unsigned long node_remap_offset[MAX_NUMNODES]; @@ -129,7 +127,7 @@ static void __init allocate_pgdat(int ni if (nid) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn*MMUPAGE_SIZE)); min_low_pfn += PFN_UP(sizeof(pg_data_t)); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); } @@ -182,8 +180,8 @@ void __init remap_numa_kva(void) int node; for (node = 1; node < numnodes; ++node) { - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn< system_max_low_pfn) highstart_pfn = system_max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + (highend_pfn - highstart_pfn) >> (20 - MMUPAGE_SHIFT)); #endif system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages; printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); + system_max_low_pfn >> (20 - MMUPAGE_SHIFT)); printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", min_low_pfn, max_low_pfn, highstart_pfn); @@ -261,6 +260,11 @@ unsigned long __init setup_memory(void) (ulong) pfn_to_kaddr(highstart_pfn)); for (nid = 0; nid < numnodes; nid++) find_max_pfn_node(nid); + printk("vmallocspace = [0x%lx, 0x%lx)\n", + VMALLOC_START, VMALLOC_END); + printk("fixmapspace = [0x%lx, 0x%lx)\n", + FIXADDR_START, FIXADDR_TOP); + printk("MAXMEM = 0x%lx\n", MAXMEM); NODE_DATA(0)->bdata = &node0_bdata; @@ -277,21 +281,21 @@ unsigned long __init setup_memory(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); + reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, PFN_PHYS(min_low_pfn) + + bootmap_size - HIGH_MEMORY); /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem_node(NODE_DATA(0), 0, PAGE_SIZE); + reserve_bootmem_node(NODE_DATA(0), 0, MMUPAGE_SIZE); /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_bootmem_node(NODE_DATA(0), PAGE_SIZE, PAGE_SIZE); + reserve_bootmem_node(NODE_DATA(0), MMUPAGE_SIZE, MMUPAGE_SIZE); #ifdef CONFIG_ACPI_SLEEP /* @@ -307,7 +311,7 @@ unsigned long __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) { + if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << MMUPAGE_SHIFT)) { reserve_bootmem_node(NODE_DATA(0), INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? 
INITRD_START + PAGE_OFFSET : 0; @@ -317,7 +321,7 @@ unsigned long __init setup_memory(void) printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", INITRD_START + INITRD_SIZE, - system_max_low_pfn << PAGE_SHIFT); + system_max_low_pfn << MMUPAGE_SHIFT); initrd_start = 0; } } @@ -350,20 +354,20 @@ void __init zone_sizes_init(void) unsigned long start = node_start_pfn[nid]; unsigned long high = node_end_pfn[nid]; - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> MMUPAGE_SHIFT; if (start > low) { #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - start; + zones_size[ZONE_HIGHMEM] = (high - start) >> PAGE_MMUSHIFT; #endif } else { if (low < max_dma) - zones_size[ZONE_DMA] = low; + zones_size[ZONE_DMA] = low >> PAGE_MMUSHIFT; else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; + zones_size[ZONE_DMA] = max_dma >> PAGE_MMUSHIFT; + zones_size[ZONE_NORMAL] = (low - max_dma) >> PAGE_MMUSHIFT; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; + zones_size[ZONE_HIGHMEM] = (high - low) >> PAGE_MMUSHIFT; #endif } } @@ -403,10 +407,14 @@ void __init set_highmem_pages_init(int b zone_start_pfn = NODE_DATA(nid)->node_zones[ZONE_HIGHMEM].zone_start_pfn; printk("Initializing highpages for node %d\n", nid); - for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) { - one_highpage_init((struct page *)(zone_mem_map + node_pfn), - zone_start_pfn + node_pfn, bad_ppro); - } + + /* + * Note: zone->spanned_pages is in PAGE_SIZE units. + */ + for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) + one_highpage_init(&zone_mem_map[node_pfn], + zone_start_pfn + node_pfn*PAGE_MMUCOUNT, + bad_ppro); } totalram_pages += totalhigh_pages; #endif diff -prauwN linux-2.5.69/arch/i386/mm/fault.c pgcl-2.5.69-3/arch/i386/mm/fault.c --- linux-2.5.69/arch/i386/mm/fault.c 2003-05-04 16:52:48.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/mm/fault.c 2003-05-26 07:40:02.000000000 -0700 @@ -20,6 +20,8 @@ #include #include /* For unblank_screen() */ #include +#include /* for max_low_pfn */ +#include #include #include @@ -53,9 +55,9 @@ good_area: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; size--; - size += start & ~PAGE_MASK; - size >>= PAGE_SHIFT; - start &= PAGE_MASK; + size += start & ~MMUPAGE_MASK; + size >>= MMUPAGE_SHIFT; + start &= MMUPAGE_MASK; for (;;) { survive: @@ -73,7 +75,7 @@ good_area: if (!size) break; size--; - start += PAGE_SIZE; + start += MMUPAGE_SIZE; if (start < vma->vm_end) continue; vma = vma->vm_next; @@ -154,19 +156,22 @@ asmlinkage void do_page_fault(struct pt_ struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; - unsigned long page; int write; siginfo_t info; /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + pr_debug("faulted on %lx,", address); + /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); tsk = current; + pr_debug(" pid = %d\n", current->pid); + /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. 
@@ -185,7 +190,20 @@ asmlinkage void do_page_fault(struct pt_ mm = tsk->mm; info.si_code = SEGV_MAPERR; - + if (1) { + pgd_t *pgd = pgd_offset(mm, address); + pmd_t *pmd = pmd_offset(pgd, address); + pr_debug("fault handled by PGD at vaddr %p, %Lx\n", + pgd, (u64)pgd_val(*pgd)); + pr_debug("fault handled by PMD at vaddr %p, %Lx\n", + pmd, (u64)pmd_val(*pmd)); + if (pmd_present(*pmd)) { + pr_debug("fault will be handled by PTE at paddr %Lx\n", + (u64)(pmd_val(*pmd) & MMUPAGE_MASK) + + pte_index(address)*sizeof(pte_t)); + } else + pr_debug("pmd not present\n"); + } /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault.. @@ -196,12 +214,16 @@ asmlinkage void do_page_fault(struct pt_ down_read(&mm->mmap_sem); vma = find_vma(mm, address); - if (!vma) + if (!vma) { + pr_debug("no vma, goto bad_area\n"); goto bad_area; + } if (vma->vm_start <= address) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!(vma->vm_flags & VM_GROWSDOWN)) { + pr_debug("VM_GROWSDOWN not in vma->vm_flags, goto bad_area\n"); goto bad_area; + } if (error_code & 4) { /* * accessing the stack below %esp is always a bug. @@ -209,11 +231,15 @@ asmlinkage void do_page_fault(struct pt_ * pusha) doing post-decrement on the stack and that * doesn't show up until later.. */ - if (address + 32 < regs->esp) + if (address + 32 < regs->esp) { + pr_debug("postdecrement on stack, goto bad_area\n"); goto bad_area; } - if (expand_stack(vma, address)) + } + if (expand_stack(vma, address)) { + pr_debug("expand_stack() failed, goto bad_area\n"); goto bad_area; + } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. @@ -225,20 +251,25 @@ good_area: default: /* 3: write, present */ #ifdef TEST_VERIFY_AREA if (regs->cs == KERNEL_CS) - printk("WP fault at %08lx\n", regs->eip); + pr_debug("WP fault at %08lx\n", regs->eip); #endif /* fall through */ case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_WRITE)) { + pr_debug("vma not writable, goto bad_area\n"); goto bad_area; + } write++; break; case 1: /* read, present */ + pr_debug("NFI what happened, goto bad_area\n"); goto bad_area; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) { + pr_debug("vma not read/exec, goto bad_area\n"); goto bad_area; } + } survive: /* @@ -265,7 +296,7 @@ good_area: * Did it hit the DOS screen memory VA from vm86 mode? 
*/ if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + unsigned long bit = (address - 0xA0000) >> MMUPAGE_SHIFT; if (bit < 32) tsk->thread.screen_bitmap |= 1 << bit; } @@ -281,6 +312,45 @@ bad_area: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { + printk("user mode SIGSEGV, pid = %d, comm = %16s, EIP = %p, ESP = %p, CR2 = %p\n", + current->pid, current->comm, (void *)regs->eip, (void *)regs->esp, (void *)address); + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + printk("vma = [%lx,%lx) prot=%lx flags=%lx\n", + vma->vm_start, vma->vm_end, + vma->vm_page_prot.pgprot, vma->vm_flags); + + for (addr = vma->vm_start; addr < vma->vm_end; addr += MMUPAGE_SIZE) { + pgd_t *pgd = pgd_offset(mm, addr); + pmd_t *pmd; + pte_t *pte; + struct page *page; + void *mem; + + if (pgd_none(*pgd) || pgd_bad(*pgd)) + continue; + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + continue; + + pte = pte_offset_map(pmd, addr); + if (pte_none(*pte) || !pte_present(*pte) || + !pfn_valid(pte_pfn(*pte))) { + pte_unmap(pte); + continue; + } + page = pte_page(*pte); + mem = kmap_atomic(page, KM_USER0); + if (!memcmp(mem, page_address(ZERO_PAGE(0)), PAGE_SIZE)) + printk("page at 0x%lx zero!\n", addr); + kunmap_atomic(mem, KM_USER0); + pte_unmap(pte); + } + } + spin_unlock(&mm->page_table_lock); tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -288,6 +358,13 @@ bad_area: info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *)address; +#if 0 + if (current->pid >= 1024) { + while (1) { + schedule_timeout(HZ); + } + } +#endif force_sig_info(SIGSEGV, &info, tsk); return; } @@ -320,30 +397,53 @@ no_context: bust_spinlocks(1); - if (address < PAGE_SIZE) + if (address < MMUPAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); printk(" printing eip:\n"); printk("%08lx\n", regs->eip); - asm("movl %%cr3,%0":"=r" (page)); - page = ((unsigned long *) __va(page))[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); - /* - * We must not directly access the pte in the highpte - * case, the page table might be allocated in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. - */ -#ifndef CONFIG_HIGHPTE - if (page & 1) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); + { + unsigned long cr3; + pgd_t *pgd; + pmd_t *pmd; + char *fmt; + + asm("movl %%cr3,%0":"=r" (cr3)); + cr3 &= ~0x1f; /* lower 5 bits of %cr3 are flags */ + /* pgd's in lowmem, but only need to be < 4G (32-bit %cr3) */ + pgd = (pgd_t *)__va(cr3); + fmt = PTRS_PER_PMD > 1 ? 
KERN_ALERT "*pdpte = %Lx\n" : NULL; + if (PTRS_PER_PMD > 1) + printk(fmt, pgd_val(*pgd)); + + /* pmd's in lowmem, but can be anywhere (64-bit PDPTE) */ + pmd = pmd_offset(pgd, address); + if (PTRS_PER_PMD > 1) + fmt = KERN_ALERT "*pde = %Lx\n"; + else + fmt = KERN_ALERT "*pde = %08lx\n"; + printk(fmt, pmd_val(*pmd)); + + /* + * this is getting at what are potentially user + * PTE's with pte_offset_kernel(); it's mostly + * unsafe to try editing kernel PTE's at this + * point for kmap_atomic() so just drop out of it + * if pmd_val(*pmd)/MMUPAGE_SIZE > max_low_pfn + */ + + if (pmd_present(*pmd) && !pmd_large(*pmd) + && pmd_val(*pmd)/MMUPAGE_SIZE <= max_low_pfn) { + pte_t *pte = pte_offset_kernel(pmd, address); + if (PTRS_PER_PMD > 1) + fmt = KERN_ALERT "*pte = %Lx\n"; + else + fmt = KERN_ALERT "*pte = %08lx\n"; + printk(fmt, pte_val(*pte)); + } } -#endif die("Oops", regs, error_code); bust_spinlocks(0); do_exit(SIGKILL); @@ -371,6 +471,7 @@ do_sigbus: * Send a sigbus, regardless of whether we were in kernel * or user mode. */ + pr_debug("sending SIGBUS\n"); tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -399,23 +500,31 @@ vmalloc_fault: pmd_t *pmd, *pmd_k; pte_t *pte_k; + printk("took vmalloc fault on address %lx\n", address); + asm("movl %%cr3,%0":"=r" (pgd)); pgd = index + (pgd_t *)__va(pgd); pgd_k = init_mm.pgd + index; - if (!pgd_present(*pgd_k)) + if (!pgd_present(*pgd_k)) { + printk("missing pgd in vmalloc_fault()!\n"); goto no_context; + } set_pgd(pgd, *pgd_k); pmd = pmd_offset(pgd, address); pmd_k = pmd_offset(pgd_k, address); - if (!pmd_present(*pmd_k)) + if (!pmd_present(*pmd_k)) { + printk("missing pmd in vmalloc_fault()!\n"); goto no_context; + } set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) + if (!pte_present(*pte_k)) { + printk("missing pte in vmalloc_fault()!\n"); goto no_context; + } return; } } diff -prauwN linux-2.5.69/arch/i386/mm/highmem.c pgcl-2.5.69-3/arch/i386/mm/highmem.c --- linux-2.5.69/arch/i386/mm/highmem.c 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/mm/highmem.c 2003-05-26 07:14:19.000000000 -0700 @@ -28,20 +28,29 @@ void kunmap(struct page *page) void *kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; - unsigned long vaddr; + unsigned long vaddr, pfn; + int k; inc_preempt_count(); if (page < highmem_start_page) return page_address(page); idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + vaddr = __fix_to_virt(FIX_KMAP_END) + idx*PAGE_SIZE; + WARN_ON(vaddr > __fix_to_virt(FIX_KMAP_BEGIN)); + WARN_ON(vaddr < __fix_to_virt(FIX_KMAP_END)); + pfn = page_to_pfn(page); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long addr = vaddr + k*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); #if CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(*pte)); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); + set_pte(pte, pfn_pte(pfn + k, kmap_prot)); + __flush_tlb_one(addr); + } return (void*) vaddr; } @@ -49,23 +58,38 @@ void *kmap_atomic(struct page *page, enu void kunmap_atomic(void *kvaddr, enum km_type type) { #if CONFIG_DEBUG_HIGHMEM - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + unsigned long vaddr = (unsigned long) kvaddr & MMUPAGE_MASK; + /* enum fixed_addresses 
idx = type + KM_TYPE_NR*smp_processor_id(); */ + enum fixed_addresses idx = (vaddr - __fix_to_virt(FIX_KMAP_END))/PAGE_SIZE; + unsigned long lower_bound = __fix_to_virt(FIX_KMAP_END) + idx*PAGE_SIZE; + unsigned long upper_bound = lower_bound + PAGE_SIZE; + int k; if (vaddr < FIXADDR_START) { // FIXME dec_preempt_count(); return; } - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr < lower_bound || vaddr > upper_bound) { + printk("vaddr %lx outside [%lx,%lx)\n", vaddr, lower_bound, upper_bound); BUG(); + } /* * force other mappings to Oops if they'll try to access * this pte without first remap it */ - pte_clear(kmap_pte-idx); - __flush_tlb_one(vaddr); + vaddr = __fix_to_virt(FIX_KMAP_END) + idx*PAGE_SIZE; + WARN_ON(vaddr > __fix_to_virt(FIX_KMAP_BEGIN)); + WARN_ON(vaddr < __fix_to_virt(FIX_KMAP_END)); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long addr = vaddr + k*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_clear(pte); + __flush_tlb_one(addr); + } #endif dec_preempt_count(); @@ -73,14 +97,22 @@ void kunmap_atomic(void *kvaddr, enum km struct page *kmap_atomic_to_page(void *ptr) { - unsigned long idx, vaddr = (unsigned long)ptr; + unsigned long vaddr = (unsigned long)ptr; + pgd_t *pgd; + pmd_t *pmd; pte_t *pte; if (vaddr < FIXADDR_START) return virt_to_page(ptr); - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + pgd = pgd_offset_k(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + + /* + * unsigned long idx = virt_to_fix(vaddr); + * pte = &kmap_pte[idx*PAGE_MMUCOUNT]; + */ return pte_page(*pte); } diff -prauwN linux-2.5.69/arch/i386/mm/init.c pgcl-2.5.69-3/arch/i386/mm/init.c --- linux-2.5.69/arch/i386/mm/init.c 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/mm/init.c 2003-05-26 07:14:19.000000000 -0700 @@ -43,6 +43,7 @@ struct mmu_gather mmu_gathers[NR_CPUS]; unsigned long highstart_pfn, highend_pfn; +struct page *zero_page; static int do_test_wp_bit(void); @@ -56,7 +57,7 @@ static pmd_t * __init one_md_table_init( pmd_t *pmd_table; #if CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *) alloc_bootmem_low_pages(MMUPAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); if (pmd_table != pmd_offset(pgd, 0)) BUG(); @@ -74,7 +75,7 @@ static pmd_t * __init one_md_table_init( static pte_t * __init one_page_table_init(pmd_t *pmd) { if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(MMUPAGE_SIZE); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); if (page_table != pte_offset_kernel(pmd, 0)) BUG(); @@ -95,6 +96,12 @@ static pte_t * __init one_page_table_ini * NOTE: The pagetables are allocated contiguous on the physical space * so we can cache the place of the first one and move around without * checking the pgd every time. + * + * Something happened here and I'm not sure what. This might back the + * thing out (I think). I think it was just a rename so I won't care + * unless it burns me. 
+ * + * -- wli */ static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { @@ -111,7 +118,14 @@ static void __init page_table_range_init for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { if (pgd_none(*pgd)) one_md_table_init(pgd); + } + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = pmd_offset(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) @@ -180,8 +194,8 @@ static inline int page_is_ram(unsigned l * are not. Notably the 640->1Mb area. We need a sanity * check here. */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + addr = (e820.map[i].addr+MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT; + end = (e820.map[i].addr+e820.map[i].size) >> MMUPAGE_SHIFT; if ((pagenr >= addr) && (pagenr < end)) return 1; } @@ -189,37 +203,12 @@ static inline int page_is_ram(unsigned l } #if CONFIG_HIGHMEM -pte_t *kmap_pte; pgprot_t kmap_prot; - -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) - -void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} +#define kmap_init() do { kmap_prot = PAGE_KERNEL; } while (0) void __init permanent_kmaps_init(pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + page_table_range_init(PKMAP_BASE, PKMAP_BASE + PAGE_SIZE*LAST_PKMAP, pgd_base); } void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) @@ -238,7 +227,7 @@ void __init one_highpage_init(struct pag void __init set_highmem_pages_init(int bad_ppro) { int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) + for (pfn = highstart_pfn; pfn < highend_pfn; pfn += PAGE_MMUCOUNT) one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } @@ -305,6 +294,34 @@ static void __init pagetable_init (void) */ pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; #endif + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long addr = VMALLOC_START; + + do { + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd) || pgd_bad(*pgd)) { + addr += MMUPAGE_SIZE; + continue; + } + do { + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) { + addr += MMUPAGE_SIZE; + continue; + } + do { + pte = pte_offset_kernel(pmd, addr); + if (!pte_none(*pte) || pte_present(*pte)) { + printk("bad vmallocspace PTE at vaddr 0x%lx\n", addr); + } + addr += MMUPAGE_SIZE; + } while (addr < VMALLOC_END); + } while (addr < VMALLOC_END); + } while (addr < VMALLOC_END); + } } void zap_low_mappings (void) @@ -331,17 +348,17 @@ void __init zone_sizes_init(void) unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; unsigned int max_dma, high, low; - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> MMUPAGE_SHIFT; low = max_low_pfn; high = highend_pfn; if (low < max_dma) - zones_size[ZONE_DMA] = low; + zones_size[ZONE_DMA] = low >> PAGE_MMUSHIFT; else { - 
zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; + zones_size[ZONE_DMA] = max_dma >> PAGE_MMUSHIFT; + zones_size[ZONE_NORMAL] = (low - max_dma) >> PAGE_MMUSHIFT; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; + zones_size[ZONE_HIGHMEM] = (high - low) >> PAGE_MMUSHIFT; #endif } free_area_init(zones_size); @@ -372,7 +389,6 @@ void __init paging_init(void) set_in_cr4(X86_CR4_PAE); #endif __flush_tlb_all(); - kmap_init(); zone_sizes_init(); } @@ -418,6 +434,7 @@ static void __init set_max_mapnr_init(vo #else max_mapnr = num_physpages = max_low_pfn; #endif + max_mapnr /= PAGE_MMUCOUNT; } #define __free_all_bootmem() free_all_bootmem() #else @@ -425,11 +442,14 @@ static void __init set_max_mapnr_init(vo extern void set_max_mapnr_init(void); #endif /* !CONFIG_DISCONTIGMEM */ +/* + * Most of the reporting here needs doublechecking. + */ void __init mem_init(void) { extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; - int tmp; + int pfn; int bad_ppro; #ifndef CONFIG_DISCONTIGMEM @@ -439,36 +459,31 @@ void __init mem_init(void) bad_ppro = ppro_with_ram_bug(); -#ifdef CONFIG_HIGHMEM - /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); - BUG(); - } -#endif - set_max_mapnr_init(); #ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE); + high_memory = (void *) __va(highstart_pfn * MMUPAGE_SIZE); #else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + high_memory = (void *) __va(max_low_pfn * MMUPAGE_SIZE); #endif /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); + memset(empty_zero_page, 0, MMUPAGE_SIZE); /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); + zero_page = alloc_page(GFP_ATOMIC|GFP_DMA); + clear_page(page_address(zero_page)); + SetPageReserved(zero_page); + totalram_pages--; + reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) + for (pfn = 0; pfn < max_low_pfn; pfn += PAGE_MMUCOUNT) /* * Only count reserved RAM pages */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + if (page_is_ram(pfn) && PageReserved(pfn_to_page(pfn))) reservedpages++; set_highmem_pages_init(bad_ppro); @@ -479,13 +494,18 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), + num_physpages << (MMUPAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); + printk("MAXMEM=0x%lx\n", MAXMEM); + printk("vmalloc: start = 0x%lx, end = 0x%lx\n", + VMALLOC_START, VMALLOC_END); + printk("fixaddr: start = 0x%lx, end = 0x%lx\n", + FIXADDR_START, FIXADDR_TOP); #if CONFIG_X86_PAE if (!cpu_has_pae) @@ -505,20 +525,32 @@ void __init mem_init(void) #endif } -#if CONFIG_X86_PAE -struct kmem_cache_s *pae_pgd_cachep; +kmem_cache_t *pgd_cache; +kmem_cache_t *pmd_cache; void __init pgtable_cache_init(void) { - /* - * PAE pgds must be 16-byte aligned: - */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd 
SLAB cache"); + WARN_ON(!pgd_val(swapper_pg_dir[PTRS_PER_PGD-1])); + + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pmd_ctor, + NULL); + if (!pmd_cache) + panic("pgtable_cache_init(): cannot create pmd cache"); + } + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) + panic("pgtable_cache_init(): Cannot create pgd cache"); } -#endif /* * This function cannot be __init, since exceptions don't work in that @@ -549,28 +581,43 @@ static int do_test_wp_bit(void) void free_initmem(void) { - unsigned long addr; + unsigned long addr, freed = 0;; addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + addr = (addr + PAGE_SIZE - 1) & PAGE_MASK; + while(addr < (((unsigned long)(&__init_end)) & PAGE_MASK)) { ClearPageReserved(virt_to_page(addr)); set_page_count(virt_to_page(addr), 1); free_page(addr); totalram_pages++; + freed++; + addr += PAGE_SIZE; } - printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (&__init_end - &__init_begin) >> 10); + printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n", + freed*(PAGE_SIZE/1024)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < end) - printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { + unsigned long freed = 0; + + start = (start + PAGE_SIZE - 1) & PAGE_MASK; + end &= PAGE_MASK; + + if (start >= end) + return; + + while (start < end) { ClearPageReserved(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); totalram_pages++; + freed++; + start += PAGE_SIZE; } + + printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", + freed*(PAGE_SIZE/1024)); } #endif diff -prauwN linux-2.5.69/arch/i386/mm/ioremap.c pgcl-2.5.69-3/arch/i386/mm/ioremap.c --- linux-2.5.69/arch/i386/mm/ioremap.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/mm/ioremap.c 2003-05-26 07:14:19.000000000 -0700 @@ -30,7 +30,7 @@ static inline void remap_area_pte(pte_t end = PMD_SIZE; if (address >= end) BUG(); - pfn = phys_addr >> PAGE_SHIFT; + pfn = phys_addr >> MMUPAGE_SHIFT; do { if (!pte_none(*pte)) { printk("remap_area_pte: page already exists\n"); @@ -38,7 +38,7 @@ static inline void remap_area_pte(pte_t } set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | flags))); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pfn++; pte++; } while (address && (address < end)); @@ -196,7 +196,7 @@ void *ioremap_nocache (unsigned long phy if (phys_addr + size < virt_to_phys(high_memory)) { struct page *ppage = virt_to_page(__va(phys_addr)); - unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long npages = (size + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; BUG_ON(phys_addr+size > (unsigned long)high_memory); BUG_ON(phys_addr + size < phys_addr); @@ -216,7 +216,7 @@ void iounmap(void *addr) struct vm_struct *p; if (addr <= high_memory) return; - p = remove_vm_area((void *) (PAGE_MASK & (unsigned long) addr)); + p = remove_vm_area((void *) (MMUPAGE_MASK & (unsigned long) addr)); if (!p) { printk("__iounmap: bad address %p\n", addr); return; @@ -225,7 +225,7 @@ void iounmap(void *addr) unmap_vm_area(p); if (p->flags && p->phys_addr < virt_to_phys(high_memory)) { 
change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, + p->size >> MMUPAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); } @@ -252,14 +252,14 @@ void __init *bt_ioremap(unsigned long ph /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; + offset = phys_addr & ~MMUPAGE_MASK; + phys_addr &= MMUPAGE_MASK; + size = MMUPAGE_ALIGN(last_addr) - phys_addr; /* * Mappings have to fit in the FIX_BTMAP area. */ - nrpages = size >> PAGE_SHIFT; + nrpages = size >> MMUPAGE_SHIFT; if (nrpages > NR_FIX_BTMAPS) return NULL; @@ -269,7 +269,7 @@ void __init *bt_ioremap(unsigned long ph idx = FIX_BTMAP_BEGIN; while (nrpages > 0) { set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; + phys_addr += MMUPAGE_SIZE; --idx; --nrpages; } @@ -286,8 +286,8 @@ void __init bt_iounmap(void *addr, unsig virt_addr = (unsigned long)addr; if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + offset = virt_addr & ~MMUPAGE_MASK; + nrpages = MMUPAGE_ALIGN(offset + size - 1) >> MMUPAGE_SHIFT; idx = FIX_BTMAP_BEGIN; while (nrpages > 0) { diff -prauwN linux-2.5.69/arch/i386/mm/pageattr.c pgcl-2.5.69-3/arch/i386/mm/pageattr.c --- linux-2.5.69/arch/i386/mm/pageattr.c 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/mm/pageattr.c 2003-05-26 07:14:19.000000000 -0700 @@ -38,8 +38,8 @@ static struct page *split_large_page(uns address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + for (i = 0; i < PTRS_PER_PTE; i++, addr += MMUPAGE_SIZE) { + pbase[i] = pfn_pte(addr/MMUPAGE_SIZE, addr == address ? 
prot : PAGE_KERNEL); } return base; @@ -58,19 +58,22 @@ static void flush_kernel_map(void *dummy static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { + struct page *page; + unsigned long flags; + set_pte_atomic(kpte, pte); /* change init_mm */ -#ifndef CONFIG_X86_PAE - { - struct list_head *l; - spin_lock(&mmlist_lock); - list_for_each(l, &init_mm.mmlist) { - struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); - pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); + if (PTRS_PER_PMD > 1) + return; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pmd_t *pmd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pmd = pmd_offset(pgd, address); set_pte_atomic((pte_t *)pmd, pte); } - spin_unlock(&mmlist_lock); - } -#endif + spin_unlock_irqrestore(&pgd_lock, flags); } /* @@ -82,7 +85,7 @@ static inline void revert_page(struct pa pte_t *linear = (pte_t *) pmd_offset(pgd_offset(&init_mm, address), address); set_pmd_pte(linear, address, - pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, + pfn_pte((__pa(address) & LARGE_PAGE_MASK)/MMUPAGE_SIZE, PAGE_KERNEL_LARGE)); } @@ -94,15 +97,14 @@ __change_page_attr(struct page *page, pg struct page *kpte_page; #ifdef CONFIG_HIGHMEM - if (page >= highmem_start_page) - BUG(); + BUG_ON(page >= highmem_start_page); #endif address = (unsigned long)page_address(page); kpte = lookup_address(address); if (!kpte) return -EINVAL; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + kpte_page = virt_to_page(((unsigned long)kpte) & MMUPAGE_MASK); if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { if ((pte_val(*kpte) & _PAGE_PSE) == 0) { pte_t old = *kpte; @@ -159,6 +161,8 @@ int change_page_attr(struct page *page, struct page *fpage; int i; + numpages = (numpages + PAGE_MMUCOUNT - 1)& ~(PAGE_MMUCOUNT-1); + down_write(&init_mm.mmap_sem); for (i = 0; i < numpages; i++, page++) { fpage = NULL; diff -prauwN linux-2.5.69/arch/i386/mm/pgtable.c pgcl-2.5.69-3/arch/i386/mm/pgtable.c --- linux-2.5.69/arch/i386/mm/pgtable.c 2003-05-04 16:53:57.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/mm/pgtable.c 2003-05-26 07:40:02.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -98,10 +99,12 @@ void set_pmd_pfn(unsigned long vaddr, un if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ printk ("set_pmd_pfn: vaddr misaligned\n"); + printk ("vaddr = %lx, pfn = %lx\n", vaddr, pfn); return; /* BUG(); */ } - if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + if (pfn & (PMD_SIZE/MMUPAGE_SIZE-1)) { /* pfn is misaligned */ printk ("set_pmd_pfn: pfn misaligned\n"); + printk ("vaddr = %lx, pfn = %lx\n", vaddr, pfn); return; /* BUG(); */ } pgd = swapper_pg_dir + pgd_index(vaddr); @@ -122,11 +125,13 @@ void __set_fixmap (enum fixed_addresses { unsigned long address = __fix_to_virt(idx); + printk("__set_fixmap(%d,%lx)\n", idx, phys); + if (idx >= __end_of_fixed_addresses) { BUG(); return; } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + set_pte_pfn(address, phys >> MMUPAGE_SHIFT, flags); } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -151,61 +156,88 @@ struct page *pte_alloc_one(struct mm_str return pte; } -#if CONFIG_X86_PAE +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +{ + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); +} -pgd_t *pgd_alloc(struct mm_struct *mm) +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not 
needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * If the locking proves to be non-performant, a ticketing scheme with + * checks at dup_mmap(), exec(), and other mmlist addition points + * could be used. The locking scheme was chosen on the basis of + * manfred's recommendations and having no core impact whatsoever. + * -- wli + */ +spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(pgd_list); + +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) { - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); + unsigned long flags; - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, + if (PTRS_PER_PMD == 1) + spin_lock_irqsave(&pgd_lock, flags); + + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; + + if (PTRS_PER_PMD > 1) + return; + + list_add(&virt_to_page(pgd)->lru, &pgd_list); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); } -void pgd_free(pgd_t *pgd) +/* never called when PTRS_PER_PMD > 1 */ +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) { - int i; + unsigned long flags; /* can be called from interrupt context */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); + spin_lock_irqsave(&pgd_lock, flags); + list_del(&virt_to_page(pgd)->lru); + spin_unlock_irqrestore(&pgd_lock, flags); } -#else - pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); } return pgd; + +out_oom: + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pgd_cache, pgd); + return NULL; } void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); -} - -#endif /* CONFIG_X86_PAE */ + int i; + /* in the PAE case user pgd entries are overwritten before usage */ + if (PTRS_PER_PMD > 1) + for (i = 0; i < USER_PTRS_PER_PGD; ++i) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + /* in the non-PAE case, clear_page_tables() clears user pgd entries */ + kmem_cache_free(pgd_cache, pgd); +} diff -prauwN linux-2.5.69/arch/i386/pci/i386.c pgcl-2.5.69-3/arch/i386/pci/i386.c --- linux-2.5.69/arch/i386/pci/i386.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/arch/i386/pci/i386.c 2003-05-26 07:14:19.000000000 -0700 @@ 
-291,7 +291,7 @@ int pci_mmap_page_range(struct pci_dev * /* Write-combine setting is ignored, it is changed via the mtrr * interfaces on this platform. */ - if (remap_page_range(vma, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, + if (remap_page_range(vma, vma->vm_start, vma->vm_pgoff << MMUPAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot)) return -EAGAIN; diff -prauwN linux-2.5.69/drivers/block/ll_rw_blk.c pgcl-2.5.69-3/drivers/block/ll_rw_blk.c --- linux-2.5.69/drivers/block/ll_rw_blk.c 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/drivers/block/ll_rw_blk.c 2003-05-26 07:14:19.000000000 -0700 @@ -240,7 +240,7 @@ void blk_queue_make_request(request_queu **/ void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) { - unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; + unsigned long bounce_pfn = dma_addr >> MMUPAGE_SHIFT; unsigned long mb = dma_addr >> 20; static request_queue_t *last_q; diff -prauwN linux-2.5.69/drivers/char/agp/backend.c pgcl-2.5.69-3/drivers/char/agp/backend.c --- linux-2.5.69/drivers/char/agp/backend.c 2003-05-04 16:53:35.000000000 -0700 +++ pgcl-2.5.69-3/drivers/char/agp/backend.c 2003-05-26 07:14:19.000000000 -0700 @@ -89,7 +89,7 @@ static int agp_find_max (void) { long memory, index, result; - memory = (num_physpages << PAGE_SHIFT) >> 20; + memory = (num_physpages << MMUPAGE_SHIFT) >> 20; index = 1; while ((memory > maxes_table[index].mem) && (index < 8)) @@ -101,7 +101,7 @@ static int agp_find_max (void) (maxes_table[index].mem - maxes_table[index - 1].mem); printk(KERN_INFO PFX "Maximum main memory to use for agp memory: %ldM\n", result); - result = result << (20 - PAGE_SHIFT); + result = result << (20 - MMUPAGE_SHIFT); return result; } @@ -145,7 +145,7 @@ static int agp_backend_initialize(struct } got_gatt = 1; - agp_bridge->key_list = vmalloc(PAGE_SIZE * 4); + agp_bridge->key_list = vmalloc(MMUPAGE_SIZE * 4); if (agp_bridge->key_list == NULL) { printk(KERN_ERR PFX "error allocating memory for key lists.\n"); rc = -ENOMEM; @@ -154,7 +154,7 @@ static int agp_backend_initialize(struct got_keylist = 1; /* FIXME vmalloc'd memory not guaranteed contiguous */ - memset(agp_bridge->key_list, 0, PAGE_SIZE * 4); + memset(agp_bridge->key_list, 0, MMUPAGE_SIZE * 4); if (agp_bridge->configure()) { printk(KERN_ERR PFX "error configuring host chipset.\n"); diff -prauwN linux-2.5.69/drivers/char/agp/generic.c pgcl-2.5.69-3/drivers/char/agp/generic.c --- linux-2.5.69/drivers/char/agp/generic.c 2003-05-04 16:53:12.000000000 -0700 +++ pgcl-2.5.69-3/drivers/char/agp/generic.c 2003-05-26 07:14:19.000000000 -0700 @@ -84,7 +84,7 @@ agp_memory *agp_create_memory(int scratc kfree(new); return NULL; } - new->memory = vmalloc(PAGE_SIZE * scratch_pages); + new->memory = vmalloc(MMUPAGE_SIZE * scratch_pages); if (new->memory == NULL) { agp_free_key(new->key); @@ -119,7 +119,7 @@ void agp_free_memory(agp_memory * curr) kfree(curr); } -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define ENTRIES_PER_PAGE (MMUPAGE_SIZE / sizeof(unsigned long)) agp_memory *agp_allocate_memory(size_t page_count, u32 type) { @@ -515,7 +515,7 @@ int agp_generic_create_gatt_table(void) if (table == NULL) return -ENOMEM; - table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); + table_end = table + ((MMUPAGE_SIZE * (1 << page_order)) - 1); for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) SetPageReserved(page); @@ -524,7 +524,7 @@ int agp_generic_create_gatt_table(void) agp_gatt_table = (void *)table; CACHE_FLUSH(); agp_bridge->gatt_table = 
ioremap_nocache(virt_to_phys(table), - (PAGE_SIZE * (1 << page_order))); + (MMUPAGE_SIZE * (1 << page_order))); CACHE_FLUSH(); if (agp_bridge->gatt_table == NULL) { @@ -592,7 +592,7 @@ int agp_generic_free_gatt_table(void) iounmap(agp_bridge->gatt_table); table = (char *) agp_bridge->gatt_table_real; - table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); + table_end = table + ((MMUPAGE_SIZE * (1 << page_order)) - 1); for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) ClearPageReserved(page); @@ -632,7 +632,7 @@ int agp_generic_insert_memory(agp_memory break; } - num_entries -= agp_memory_reserved/PAGE_SIZE; + num_entries -= agp_memory_reserved/MMUPAGE_SIZE; if (num_entries < 0) num_entries = 0; if (type != 0 || mem->type != 0) { diff -prauwN linux-2.5.69/drivers/char/mem.c pgcl-2.5.69-3/drivers/char/mem.c --- linux-2.5.69/drivers/char/mem.c 2003-05-04 16:53:29.000000000 -0700 +++ pgcl-2.5.69-3/drivers/char/mem.c 2003-05-26 07:14:19.000000000 -0700 @@ -43,8 +43,8 @@ static ssize_t do_write_mem(struct file written = 0; #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (realp < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-realp; + if (realp < MMUPAGE_SIZE) { + unsigned long sz = MMUPAGE_SIZE-realp; if (sz > count) sz = count; /* Hmm. Do something? */ buf+=sz; @@ -80,8 +80,8 @@ static ssize_t read_mem(struct file * fi read = 0; #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-p; + if (p < MMUPAGE_SIZE) { + unsigned long sz = MMUPAGE_SIZE-p; if (sz > count) sz = count; if (sz > 0) { @@ -177,7 +177,7 @@ static inline int noncached_address(unsi static int mmap_mem(struct file * file, struct vm_area_struct * vma) { - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long offset = vma->vm_pgoff << MMUPAGE_SHIFT; /* * Accessing memory above the top the kernel knows about or @@ -223,8 +223,8 @@ static ssize_t read_kmem(struct file *fi #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && read > 0) { - size_t tmp = PAGE_SIZE - p; + if (p < MMUPAGE_SIZE && read > 0) { + size_t tmp = MMUPAGE_SIZE - p; if (tmp > read) tmp = read; if (clear_user(buf, tmp)) return -EFAULT; @@ -248,8 +248,8 @@ static ssize_t read_kmem(struct file *fi while (count > 0) { int len = count; - if (len > PAGE_SIZE) - len = PAGE_SIZE; + if (len > MMUPAGE_SIZE) + len = MMUPAGE_SIZE; len = vread(kbuf, (char *)p, len); if (!len) break; @@ -298,8 +298,8 @@ static ssize_t write_kmem(struct file * while (count > 0) { int len = count; - if (len > PAGE_SIZE) - len = PAGE_SIZE; + if (len > MMUPAGE_SIZE) + len = MMUPAGE_SIZE; if (len && copy_from_user(kbuf, buf, len)) { free_page((unsigned long)kbuf); return -EFAULT; @@ -409,12 +409,12 @@ static inline size_t read_zero_pagealign /* The shared case is hard. Let's do the conventional zeroing. */ do { - unsigned long unwritten = clear_user(buf, PAGE_SIZE); + unsigned long unwritten = clear_user(buf, MMUPAGE_SIZE); if (unwritten) - return size + unwritten - PAGE_SIZE; + return size + unwritten - MMUPAGE_SIZE; cond_resched(); - buf += PAGE_SIZE; - size -= PAGE_SIZE; + buf += MMUPAGE_SIZE; + size -= MMUPAGE_SIZE; } while (size); return size; @@ -437,23 +437,23 @@ static ssize_t read_zero(struct file * f left = count; /* do we want to be clever? 
Arbitrary cut-off */ - if (count >= PAGE_SIZE*4) { + if (count >= MMUPAGE_SIZE*4) { unsigned long partial; /* How much left of the page? */ - partial = (PAGE_SIZE-1) & -(unsigned long) buf; + partial = (MMUPAGE_SIZE-1) & -(unsigned long) buf; unwritten = clear_user(buf, partial); written = partial - unwritten; if (unwritten) goto out; left -= partial; buf += partial; - unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); - written += (left & PAGE_MASK) - unwritten; + unwritten = read_zero_pagealigned(buf, left & MMUPAGE_MASK); + written += (left & MMUPAGE_MASK) - unwritten; if (unwritten) goto out; - buf += left & PAGE_MASK; - left &= ~PAGE_MASK; + buf += left & MMUPAGE_MASK; + left &= ~MMUPAGE_MASK; } unwritten = clear_user(buf, left); written += left - unwritten; diff -prauwN linux-2.5.69/drivers/oprofile/buffer_sync.c pgcl-2.5.69-3/drivers/oprofile/buffer_sync.c --- linux-2.5.69/drivers/oprofile/buffer_sync.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/drivers/oprofile/buffer_sync.c 2003-05-26 07:14:19.000000000 -0700 @@ -241,7 +241,7 @@ static unsigned long lookup_dcookie(stru cookie = fast_get_dcookie(vma->vm_file->f_dentry, vma->vm_file->f_vfsmnt); - *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start; + *offset = MMUPAGE_SIZE*vma->vm_pgoff + addr - vma->vm_start; break; } out: diff -prauwN linux-2.5.69/drivers/scsi/qlogicisp.c pgcl-2.5.69-3/drivers/scsi/qlogicisp.c --- linux-2.5.69/drivers/scsi/qlogicisp.c 2003-05-04 16:53:55.000000000 -0700 +++ pgcl-2.5.69-3/drivers/scsi/qlogicisp.c 2003-05-26 07:14:19.000000000 -0700 @@ -1415,7 +1415,7 @@ static int isp1020_init(struct Scsi_Host if ((command & PCI_COMMAND_MEMORY) && ((mem_flags & 1) == 0)) { - mem_base = (u_long) ioremap(mem_base, PAGE_SIZE); + mem_base = (u_long) ioremap(mem_base, MMUPAGE_SIZE); if (!mem_base) { printk("qlogicisp : i/o remapping failed.\n"); goto out_release; diff -prauwN linux-2.5.69/drivers/scsi/sym53c8xx.c pgcl-2.5.69-3/drivers/scsi/sym53c8xx.c --- linux-2.5.69/drivers/scsi/sym53c8xx.c 2003-05-04 16:53:35.000000000 -0700 +++ pgcl-2.5.69-3/drivers/scsi/sym53c8xx.c 2003-05-26 07:14:19.000000000 -0700 @@ -686,7 +686,8 @@ spinlock_t sym53c8xx_lock = SPIN_LOCK_UN #ifndef SCSI_NCR_PCI_MEM_NOT_SUPPORTED static u_long __init remap_pci_mem(u_long base, u_long size) { - u_long page_base = ((u_long) base) & PAGE_MASK; + /* ioremap()/vmalloc() have MMUPAGE_SIZE granularity */ + u_long page_base = ((u_long) base) & MMUPAGE_MASK; u_long page_offs = ((u_long) base) - page_base; u_long page_remapped = (u_long) ioremap(page_base, page_offs+size); @@ -695,8 +696,9 @@ static u_long __init remap_pci_mem(u_lon static void __init unmap_pci_mem(u_long vaddr, u_long size) { + /* iounmap()/vfree() have MMUPAGE_SIZE granularity */ if (vaddr) - iounmap((void *) (vaddr & PAGE_MASK)); + iounmap((void *) (vaddr & MMUPAGE_MASK)); } #endif /* not def SCSI_NCR_PCI_MEM_NOT_SUPPORTED */ diff -prauwN linux-2.5.69/drivers/scsi/sym53c8xx_2/sym_glue.c pgcl-2.5.69-3/drivers/scsi/sym53c8xx_2/sym_glue.c --- linux-2.5.69/drivers/scsi/sym53c8xx_2/sym_glue.c 2003-05-04 16:53:57.000000000 -0700 +++ pgcl-2.5.69-3/drivers/scsi/sym53c8xx_2/sym_glue.c 2003-05-26 07:14:19.000000000 -0700 @@ -215,7 +215,7 @@ m_addr_t __vtobus(m_pool_ident_t dev_dma #ifndef SYM_OPT_NO_BUS_MEMORY_MAPPING static u_long __init pci_map_mem(u_long base, u_long size) { - u_long page_base = ((u_long) base) & PAGE_MASK; + u_long page_base = ((u_long) base) & MMUPAGE_MASK; u_long page_offs = ((u_long) base) - page_base; u_long page_remapped = (u_long) 
ioremap(page_base, page_offs+size); @@ -225,7 +225,7 @@ static u_long __init pci_map_mem(u_long static void __init pci_unmap_mem(u_long vaddr, u_long size) { if (vaddr) - iounmap((void *) (vaddr & PAGE_MASK)); + iounmap((void *) (vaddr & MMUPAGE_MASK)); } #endif diff -prauwN linux-2.5.69/drivers/scsi/sym53c8xx_comm.h pgcl-2.5.69-3/drivers/scsi/sym53c8xx_comm.h --- linux-2.5.69/drivers/scsi/sym53c8xx_comm.h 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/drivers/scsi/sym53c8xx_comm.h 2003-05-26 07:14:19.000000000 -0700 @@ -491,7 +491,7 @@ spinlock_t DRIVER_SMP_LOCK = SPIN_LOCK_U #ifndef SCSI_NCR_PCI_MEM_NOT_SUPPORTED static u_long __init remap_pci_mem(u_long base, u_long size) { - u_long page_base = ((u_long) base) & PAGE_MASK; + u_long page_base = ((u_long) base) & MMUPAGE_MASK; u_long page_offs = ((u_long) base) - page_base; u_long page_remapped = (u_long) ioremap(page_base, page_offs+size); @@ -501,7 +501,7 @@ static u_long __init remap_pci_mem(u_lon static void __init unmap_pci_mem(u_long vaddr, u_long size) { if (vaddr) - iounmap((void *) (vaddr & PAGE_MASK)); + iounmap((void *) (vaddr & MMUPAGE_MASK)); } #endif /* not def SCSI_NCR_PCI_MEM_NOT_SUPPORTED */ diff -prauwN linux-2.5.69/fs/aio.c pgcl-2.5.69-3/fs/aio.c --- linux-2.5.69/fs/aio.c 2003-05-04 16:53:14.000000000 -0700 +++ pgcl-2.5.69-3/fs/aio.c 2003-05-26 07:14:19.000000000 -0700 @@ -87,7 +87,7 @@ static void aio_free_ring(struct kioctx long i; for (i=0; inr_pages; i++) - put_page(info->ring_pages[i]); + put_page(pfn_to_page(info->ring_pages[i])); if (info->mmap_size) { down_write(&ctx->mm->mmap_sem); @@ -114,25 +114,25 @@ static int aio_setup_ring(struct kioctx size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + nr_pages = (size + MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT; if (nr_pages < 0) return -EINVAL; info->nr_pages = nr_pages; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + nr_events = (MMUPAGE_SIZE*nr_pages - sizeof(struct aio_ring))/sizeof(struct io_event); info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + info->ring_pages = kmalloc(sizeof(unsigned long)*nr_pages, GFP_KERNEL); if (!info->ring_pages) return -ENOMEM; - memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + memset(info->ring_pages, 0, sizeof(unsigned long)*nr_pages); } - info->mmap_size = nr_pages * PAGE_SIZE; + info->mmap_size = nr_pages*MMUPAGE_SIZE; dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap(NULL, 0, info->mmap_size, @@ -161,7 +161,8 @@ static int aio_setup_ring(struct kioctx info->nr = nr_events; /* trusted copy */ - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_USER0) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -178,15 +179,17 @@ static int aio_setup_ring(struct kioctx /* aio_ring_event: returns a pointer to the event at the given index from * kmap_atomic(, km). 
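Throughout the aio ring changes above, and again in the bio and ELF core-dump changes further down, get_user_pages() now hands back pfns rather than struct page pointers, and each pfn names a single mmupage inside a possibly larger kernel page. A minimal sketch of the recurring idiom, written as a hypothetical kernel-context helper that is not part of the patch:

static inline void *kmap_atomic_mmupage(unsigned long pfn, enum km_type km)
{
	/* the enclosing kernel page: &mem_map[pfn / PAGE_MMUCOUNT] */
	struct page *page = pfn_to_page(pfn);
	/* byte offset of this mmupage within that page */
	unsigned long suboff = (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE;

	return (char *)kmap_atomic(page, km) + suboff;
}

With PAGE_MMUCOUNT == 1 the offset is always zero and this collapses to a plain kmap_atomic(), which is why unconverted architectures are unaffected.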
Release the pointer with put_aio_ring_event(); */ -#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) -#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) +#define AIO_EVENTS_PER_PAGE (MMUPAGE_SIZE/sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((MMUPAGE_SIZE-sizeof(struct aio_ring))/sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) #define aio_ring_event(info, nr, km) ({ \ unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ + unsigned long pfn; \ + pfn = (info)->ring_pages[pos/AIO_EVENTS_PER_PAGE]; \ + __event = kmap_atomic(pfn_to_page(pfn), km); \ + __event += (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE; \ __event += pos % AIO_EVENTS_PER_PAGE; \ __event; \ }) @@ -194,7 +197,7 @@ static int aio_setup_ring(struct kioctx #define put_aio_ring_event(event, km) do { \ struct io_event *__event = (event); \ (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ + kunmap_atomic((void *)((unsigned long)__event & MMUPAGE_MASK), km); \ } while(0) /* ioctx_alloc @@ -400,7 +403,8 @@ static struct kiocb *__aio_get_req(struc * accept an event from this io. */ spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(ctx->ring_info.ring_pages[0]), KM_USER0) + + (ctx->ring_info.ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { list_add(&req->ki_list, &ctx->active_reqs); get_ioctx(ctx); @@ -664,8 +668,8 @@ int aio_complete(struct kiocb *iocb, lon */ spin_lock_irqsave(&ctx->ctx_lock, flags); - ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); - + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_IRQ1) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); tail = (tail + 1) % info->nr; @@ -720,7 +724,8 @@ static int aio_read_evt(struct kioctx *i unsigned long head; int ret = 0; - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_USER0) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; dprintk("in aio_read_evt h%lu t%lu m%lu\n", (unsigned long)ring->head, (unsigned long)ring->tail, (unsigned long)ring->nr); diff -prauwN linux-2.5.69/fs/binfmt_elf.c pgcl-2.5.69-3/fs/binfmt_elf.c --- linux-2.5.69/fs/binfmt_elf.c 2003-05-04 16:53:14.000000000 -0700 +++ pgcl-2.5.69-3/fs/binfmt_elf.c 2003-05-26 07:14:19.000000000 -0700 @@ -61,10 +61,10 @@ static int elf_core_dump(long signr, str #define elf_core_dump NULL #endif -#if ELF_EXEC_PAGESIZE > PAGE_SIZE +#if ELF_EXEC_PAGESIZE > MMUPAGE_SIZE # define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE #else -# define ELF_MIN_ALIGN PAGE_SIZE +# define ELF_MIN_ALIGN MMUPAGE_SIZE #endif #define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) @@ -781,9 +781,8 @@ static int load_elf_binary(struct linux_ and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ - /* N.B. Shouldn't the size here be PAGE_SIZE?? 
*/ down_write(¤t->mm->mmap_sem); - error = do_mmap(NULL, 0, 4096, PROT_READ | PROT_EXEC, + error = do_mmap(NULL, 0, MMUPAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); up_write(¤t->mm->mmap_sem); } @@ -1370,21 +1369,26 @@ static int elf_core_dump(long signr, str for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { + addr += MMUPAGE_SIZE) { struct page* page; + unsigned long pfn = 0; struct vm_area_struct *vma; if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); + &pfn, &vma) <= 0) { + DUMP_SEEK (file->f_pos + MMUPAGE_SIZE); } else { + page = pfn_to_page(pfn); if (page == ZERO_PAGE(addr)) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); + DUMP_SEEK (file->f_pos + MMUPAGE_SIZE); } else { void *kaddr; + unsigned long subpfn; + subpfn = pfn % PAGE_MMUCOUNT; flush_cache_page(vma, addr); kaddr = kmap(page); - DUMP_WRITE(kaddr, PAGE_SIZE); + kaddr += subpfn * MMUPAGE_SIZE; + DUMP_WRITE(kaddr, MMUPAGE_SIZE); kunmap(page); } page_cache_release(page); diff -prauwN linux-2.5.69/fs/bio.c pgcl-2.5.69-3/fs/bio.c --- linux-2.5.69/fs/bio.c 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/fs/bio.c 2003-05-26 07:15:41.000000000 -0700 @@ -438,12 +438,12 @@ static struct bio *__bio_map_user(struct unsigned long uaddr, unsigned int len, int write_to_vm) { - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; + unsigned long end = (uaddr + len + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; + unsigned long start = uaddr >> MMUPAGE_SHIFT; const int nr_pages = end - start; request_queue_t *q = bdev_get_queue(bdev); int ret, offset, i; - struct page **pages; + unsigned long *pages; struct bio *bio; /* @@ -457,7 +457,7 @@ static struct bio *__bio_map_user(struct if (!bio) return NULL; - pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc(nr_pages * sizeof(unsigned long), GFP_KERNEL); if (!pages) goto out; @@ -471,9 +471,11 @@ static struct bio *__bio_map_user(struct bio->bi_bdev = bdev; - offset = uaddr & ~PAGE_MASK; + offset = uaddr & ~MMUPAGE_MASK; for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; + unsigned int bytes = MMUPAGE_SIZE - offset; + int suboff = (pages[i] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; + struct page *pg = pfn_to_page(pages[i]); if (len <= 0) break; @@ -484,7 +486,7 @@ static struct bio *__bio_map_user(struct /* * sorry... */ - if (bio_add_page(bio, pages[i], bytes, offset) < bytes) + if (bio_add_page(bio, pg, bytes, offset + suboff) < bytes) break; len -= bytes; @@ -495,7 +497,7 @@ static struct bio *__bio_map_user(struct * release the pages we didn't map into the bio, if any */ while (i < nr_pages) - page_cache_release(pages[i++]); + page_cache_release(pfn_to_page(pages[i++])); kfree(pages); diff -prauwN linux-2.5.69/fs/direct-io.c pgcl-2.5.69-3/fs/direct-io.c --- linux-2.5.69/fs/direct-io.c 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/fs/direct-io.c 2003-05-26 07:14:19.000000000 -0700 @@ -35,7 +35,9 @@ /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure on the stack. But these are mmupages; this + * will _not_ even be able to see a whole PAGE_SIZE area if you make + * PAGE_MMUCOUNT > DIO_PAGES. */ #define DIO_PAGES 64 @@ -49,6 +51,20 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. 
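As a concrete illustration of the clustering that the reworked dio_get_page() below attempts (numbers assume CONFIG_PAGE_CLUSTER=2, i.e. four 4K mmupages per 16K kernel page; they are illustrative, not taken from the patch): suppose get_user_pages() filled dio->pages[] with pfns 0x1234, 0x1235, 0x1236, 0x1237, 0x2000. The first call returns pfn 0x1234 with pfoff_in_page = 0; the next three pfns are consecutive and still land in the same struct page, so they are consumed as well and page_size grows to 4*MMUPAGE_SIZE = 16K, after which three of the four per-pfn references taken by get_user_pages() are dropped. pfn 0x2000 belongs to a different page and is returned by the next call with page_size = 4K. Note also that DIO_PAGES now counts mmupages, so one get_user_pages() batch covers at most 64*4K = 256K of user buffer regardless of PAGE_SIZE.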
+ * + * XXX: + * Okay, I just broke this and I'm not sure how to put it back together. + * Basically the issue is that we're pointed at _pfn's_ only by + * get_user_pages() so the assumption of virtual contiguity doesn't even + * guarantee PAGE_SIZE -aligned physical contiguity. + * + * AFAICT the fixup is to "opportunistically" merge all this stuff together + * into PAGE_SIZE-aligned contiguous bits and either special-case or be + * able to handle the rest as they come. I've left this broken for now. + * I'm relatively fearful of eating stackspace to keep count of the number + * mmupages starting at a given pfn there are while merging. + * + * -- wli */ struct dio { @@ -100,7 +116,7 @@ struct dio { * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ - struct page *pages[DIO_PAGES]; /* page buffer */ + unsigned long pages[DIO_PAGES]; /* page buffer */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ int page_errors; /* errno from get_user_pages() */ @@ -155,7 +171,7 @@ static int dio_refill_pages(struct dio * */ if (dio->page_errors == 0) dio->page_errors = ret; - dio->pages[0] = ZERO_PAGE(dio->curr_user_address); + dio->pages[0] = page_to_pfn(ZERO_PAGE(dio->curr_user_address)); dio->head = 0; dio->tail = 1; ret = 0; @@ -163,7 +179,7 @@ static int dio_refill_pages(struct dio * } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; + dio->curr_user_address += ret * MMUPAGE_SIZE; dio->curr_page += ret; dio->head = 0; dio->tail = ret; @@ -179,8 +195,13 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. */ -static struct page *dio_get_page(struct dio *dio) +static struct page *dio_get_page(struct dio *dio, int *pfoff_in_page, int *page_size) { + int pg_size = MMUPAGE_SIZE; + int pfn, tpfn; + struct page *page; + int i = 0; + if (dio_pages_present(dio) == 0) { int ret; @@ -189,7 +210,33 @@ static struct page *dio_get_page(struct return ERR_PTR(ret); BUG_ON(dio_pages_present(dio) == 0); } - return dio->pages[dio->head++]; + + pfn = dio->pages[dio->head++]; + *pfoff_in_page = (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE; + + /* Try to cluster all pfns that belongs to this page together */ + tpfn = pfn + 1; + while (pg_size + *pfoff_in_page < PAGE_SIZE) { + if (dio->head == dio->tail) break; + if (tpfn != dio->pages[dio->head]) break; + tpfn++; + dio->head++; + pg_size += MMUPAGE_SIZE; + i++; + } + + page = pfn_to_page(pfn); + *page_size = pg_size; + + /* + * FIXME - get_user_pages got ref for each pfn, we need to drop + * the extra refs for this page + */ + while (i--) { + page_cache_release(page); + } + + return page; } /* @@ -293,8 +340,9 @@ static void dio_bio_submit(struct dio *d */ static void dio_cleanup(struct dio *dio) { + int a, b; while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + page_cache_release(dio_get_page(dio, &a, &b)); } /* @@ -686,22 +734,26 @@ static void dio_zero_block(struct dio *d static int do_direct_IO(struct dio *dio) { const unsigned blkbits = dio->blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; struct buffer_head *map_bh = &dio->map_bh; int ret = 0; + int page_size; + int pf_pgoff; /* The I/O can start at any block offset within the first page */ block_in_page = dio->first_block_in_page; while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + page = dio_get_page(dio, &pf_pgoff, &page_size); + if 
(IS_ERR(page)) { ret = PTR_ERR(page); goto out; } + blocks_per_page = page_size >> blkbits; while (block_in_page < blocks_per_page) { unsigned offset_in_page = block_in_page << blkbits; unsigned this_chunk_bytes; /* # of bytes mapped */ @@ -785,7 +837,7 @@ do_holes: * can add to this page */ this_chunk_blocks = dio->blocks_available; - u = (PAGE_SIZE - offset_in_page) >> blkbits; + u = (page_size - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = dio->final_block_in_request - dio->block_in_file; @@ -795,7 +847,7 @@ do_holes: BUG_ON(this_chunk_bytes == 0); dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, + ret = submit_page_section(dio, page, pf_pgoff + offset_in_page, this_chunk_bytes, dio->next_block_for_io); if (ret) { page_cache_release(page); @@ -882,7 +934,7 @@ direct_io_worker(int rw, struct kiocb *i bytes = iov[seg].iov_len; /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + dio->first_block_in_page = (user_addr & ~MMUPAGE_MASK) >> blkbits; dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits); /* Page fetching state */ @@ -891,11 +943,11 @@ direct_io_worker(int rw, struct kiocb *i dio->curr_page = 0; dio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { + if (user_addr & (MMUPAGE_SIZE-1)) { dio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + bytes -= MMUPAGE_SIZE - (user_addr & (MMUPAGE_SIZE - 1)); } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + dio->total_pages += (bytes + MMUPAGE_SIZE - 1) / MMUPAGE_SIZE; dio->curr_user_address = user_addr; ret = do_direct_IO(dio); diff -prauwN linux-2.5.69/fs/exec.c pgcl-2.5.69-3/fs/exec.c --- linux-2.5.69/fs/exec.c 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/fs/exec.c 2003-05-26 07:14:19.000000000 -0700 @@ -287,34 +287,33 @@ int copy_strings_kernel(int argc,char ** * * tsk->mmap_sem is held for writing. 
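A short worked example for the put_dirty_page()/setup_arg_pages() rework that follows (assuming CONFIG_PAGE_CLUSTER=2, so PAGE_MMUCOUNT = 4 and MMUPAGE_SIZE = 4K; the numbers are only illustrative): each MAX_ARG_PAGES slot is still one PAGE_SIZE page, but the stack vma is now laid out in mmupage units, so the vma may begin partway into the first argument page. If mpnt->vm_start sits 8K above stack_base, setup_arg_pages() passes min_subpfn = 8K/MMUPAGE_SIZE = 2, and put_dirty_page() installs ptes only for subpfns 2 and 3, i.e. only for the two mmupages of that page that actually fall inside the vma, at vaddr = address + 2*4K and address + 3*4K.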
*/ -void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address) +static void put_dirty_page(task_t *tsk, struct page *page, int min_subpfn, unsigned long address) { + unsigned long page_pfn, subpfn; + struct pte_chain *pte_chain; + + page_pfn = page_to_pfn(page); + + for (subpfn = min_subpfn; subpfn < PAGE_MMUCOUNT; ++subpfn) { pgd_t * pgd; pmd_t * pmd; pte_t * pte; - struct pte_chain *pte_chain; - - if (page_count(page) != 1) - printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); + unsigned long pfn, vaddr = address + subpfn*MMUPAGE_SIZE; - pgd = pgd_offset(tsk->mm, address); + pgd = pgd_offset(tsk->mm, vaddr); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) - goto out_sig; + goto out_nolock; spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pmd = pmd_alloc(tsk->mm, pgd, vaddr); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(tsk->mm, pmd, vaddr); if (!pte) goto out; - if (!pte_none(*pte)) { - pte_unmap(pte); - goto out; - } - lru_cache_add_active(page); - flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + pfn = page_pfn + subpfn; + set_pte(pte, pte_mkdirty(pte_mkwrite(pfn_pte(pfn, PAGE_COPY)))); + page_cache_get(page); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); tsk->mm->rss++; @@ -322,10 +321,15 @@ void put_dirty_page(struct task_struct * /* no need for flush_tlb */ pte_chain_free(pte_chain); + } + lru_cache_add_active(page); + flush_dcache_page(page); + + page_cache_release(page); /* want to add PAGE_MMUCOUNT-1 */ return; out: spin_unlock(&tsk->mm->page_table_lock); -out_sig: +out_nolock: __free_page(page); force_sig(SIGKILL, tsk); pte_chain_free(pte_chain); @@ -390,7 +394,8 @@ int setup_arg_pages(struct linux_binprm if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + /* must match the length of mpnt below */ + if (!vm_enough_memory((STACK_TOP-(MMUPAGE_MASK&(unsigned long)bprm->p))/MMUPAGE_SIZE)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } @@ -400,10 +405,9 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_mm = mm; #ifdef CONFIG_STACK_GROWSUP mpnt->vm_start = stack_base; - mpnt->vm_end = PAGE_MASK & - (PAGE_SIZE - 1 + (unsigned long) bprm->p); + mpnt->vm_end = MMUPAGE_ALIGN((unsigned long)bprm->p); #else - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; + mpnt->vm_start = MMUPAGE_MASK & (unsigned long)bprm->p; mpnt->vm_end = STACK_TOP; #endif mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7]; @@ -412,16 +416,23 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; INIT_LIST_HEAD(&mpnt->shared); - mpnt->vm_private_data = (void *) 0; + mpnt->vm_private_data = NULL; insert_vm_struct(mm, mpnt); - mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> MMUPAGE_SHIFT; } for (i = 0 ; i < MAX_ARG_PAGES ; i++) { struct page *page = bprm->page[i]; if (page) { + int min_subpfn; + + if (mpnt->vm_start <= stack_base) + min_subpfn = 0; + else + min_subpfn = (mpnt->vm_start - stack_base)/MMUPAGE_SIZE; + bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base); + put_dirty_page(current, page, min_subpfn, stack_base); } stack_base += PAGE_SIZE; } @@ -434,7 +445,7 @@ int setup_arg_pages(struct linux_binprm #else -#define put_dirty_page(tsk, page, address) +#define put_dirty_page(tsk, page, min_subpfn, address) #define 
setup_arg_pages(bprm) (0) static inline void free_arg_pages(struct linux_binprm *bprm) { @@ -1143,6 +1154,7 @@ out_file: allow_write_access(bprm.file); fput(bprm.file); } + return retval; } diff -prauwN linux-2.5.69/fs/ext2/dir.c pgcl-2.5.69-3/fs/ext2/dir.c --- linux-2.5.69/fs/ext2/dir.c 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/fs/ext2/dir.c 2003-05-26 07:14:19.000000000 -0700 @@ -432,15 +432,15 @@ int ext2_add_link (struct dentry *dentry struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned chunk_size = ext2_chunk_size(dir); - unsigned reclen = EXT2_DIR_REC_LEN(namelen); - unsigned short rec_len, name_len; + unsigned long chunk_size = ext2_chunk_size(dir); + unsigned long reclen = EXT2_DIR_REC_LEN(namelen); + unsigned long rec_len, name_len; struct page *page = NULL; ext2_dirent * de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; - unsigned from, to; + unsigned long from, to; int err; /* diff -prauwN linux-2.5.69/fs/file_table.c pgcl-2.5.69-3/fs/file_table.c --- linux-2.5.69/fs/file_table.c 2003-05-04 16:53:00.000000000 -0700 +++ pgcl-2.5.69-3/fs/file_table.c 2003-05-26 07:14:19.000000000 -0700 @@ -249,7 +249,7 @@ void __init files_init(unsigned long mem * Per default don't use more than 10% of our memory for files. */ - n = (mempages * (PAGE_SIZE / 1024)) / 10; + n = (mempages * (MMUPAGE_SIZE / 1024)) / 10; files_stat.max_files = n; if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; diff -prauwN linux-2.5.69/fs/inode.c pgcl-2.5.69-3/fs/inode.c --- linux-2.5.69/fs/inode.c 2003-05-04 16:53:56.000000000 -0700 +++ pgcl-2.5.69-3/fs/inode.c 2003-05-26 07:14:20.000000000 -0700 @@ -1254,7 +1254,11 @@ void __init inode_init(unsigned long mem for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) init_waitqueue_head(&i_wait_queue_heads[i].wqh); +#if PAGE_SHIFT <= 14 mempages >>= (14 - PAGE_SHIFT); +#else + mempages <<= PAGE_SHIFT - 14; +#endif mempages *= sizeof(struct hlist_head); for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) ; diff -prauwN linux-2.5.69/fs/proc/base.c pgcl-2.5.69-3/fs/proc/base.c --- linux-2.5.69/fs/proc/base.c 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/fs/proc/base.c 2003-05-26 07:14:20.000000000 -0700 @@ -32,6 +32,7 @@ #include #include #include +#include /* * For hysterical raisins we keep the same inumbers as in the old procfs. @@ -433,29 +434,37 @@ static ssize_t mem_read(struct file * fi size_t count, loff_t *ppos) { struct task_struct *task = proc_task(file->f_dentry->d_inode); - char *page; + char *kbuf; + struct page *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; - if (!MAY_PTRACE(task)) + if (0 && !MAY_PTRACE(task)) goto out; ret = -ENOMEM; - page = (char *)__get_free_page(GFP_USER); - if (!page) + page = alloc_page(GFP_HIGHUSER); + if (!page) { + printk("alloc_page() failed in mem_read()\n"); goto out; + } + kbuf = kmap(page); ret = 0; mm = get_task_mm(task); - if (!mm) + if (!mm) { + printk("get_task_mm() failed in mem_read()\n"); goto out_free; + } +#if 0 ret = -EIO; if (file->private_data != (void*)((long)current->self_exec_id)) goto out_put; +#endif ret = 0; @@ -463,14 +472,16 @@ static ssize_t mem_read(struct file * fi int this_len, retval; this_len = (count > PAGE_SIZE) ? 
PAGE_SIZE : count; - retval = access_process_vm(task, src, page, this_len, 0); + retval = access_process_vm(task, src, kbuf, this_len, 0); if (!retval) { + printk("access_process_vm() failed in mem_read()\n"); if (!ret) ret = -EIO; break; } - if (copy_to_user(buf, page, retval)) { + if (copy_to_user(buf, kbuf, retval)) { + printk("copy_to_user() failed in mem_read()\n"); ret = -EFAULT; break; } @@ -482,15 +493,17 @@ static ssize_t mem_read(struct file * fi } *ppos = src; -out_put: mmput(mm); out_free: - free_page((unsigned long) page); + kunmap(page); + __free_page(page); out: return ret; } +#if 0 #define mem_write NULL +#endif #ifndef mem_write /* This is a security hazard */ @@ -498,26 +511,28 @@ static ssize_t mem_write(struct file * f size_t count, loff_t *ppos) { int copied = 0; - char *page; + char *kbuf; + struct page *page; struct task_struct *task = proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; - if (!MAY_PTRACE(task)) + if (0 && !MAY_PTRACE(task)) return -ESRCH; - page = (char *)__get_free_page(GFP_USER); + page = alloc_page(GFP_HIGHUSER); if (!page) return -ENOMEM; + kbuf = kmap(page); while (count > 0) { int this_len, retval; this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (copy_from_user(kbuf, buf, this_len)) { copied = -EFAULT; break; } - retval = access_process_vm(task, dst, page, this_len, 1); + retval = access_process_vm(task, dst, kbuf, this_len, 1); if (!retval) { if (!copied) copied = -EIO; @@ -529,7 +544,8 @@ static ssize_t mem_write(struct file * f count -= retval; } *ppos = dst; - free_page((unsigned long) page); + kunmap(page); + __free_page(page); return copied; } #endif diff -prauwN linux-2.5.69/fs/proc/proc_misc.c pgcl-2.5.69-3/fs/proc/proc_misc.c --- linux-2.5.69/fs/proc/proc_misc.c 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/fs/proc/proc_misc.c 2003-05-26 07:14:20.000000000 -0700 @@ -241,7 +241,7 @@ static int meminfo_read_proc(char *page, K(ps.nr_writeback), K(ps.nr_mapped), K(ps.nr_slab), - K(committed), + committed << (MMUPAGE_SHIFT - 10), K(ps.nr_page_table_pages), vmtot, vmi.used, diff -prauwN linux-2.5.69/fs/proc/task_mmu.c pgcl-2.5.69-3/fs/proc/task_mmu.c --- linux-2.5.69/fs/proc/task_mmu.c 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/fs/proc/task_mmu.c 2003-05-26 07:14:20.000000000 -0700 @@ -56,7 +56,7 @@ int task_statm(struct mm_struct *mm, int *resident = mm->rss; for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int pages = (vma->vm_end - vma->vm_start) >> MMUPAGE_SHIFT; size += pages; if (is_vm_hugetlb_page(vma)) { diff -prauwN linux-2.5.69/include/asm-alpha/page.h pgcl-2.5.69-3/include/asm-alpha/page.h --- linux-2.5.69/include/asm-alpha/page.h 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-alpha/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -98,6 +98,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _ALPHA_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-arm/page.h pgcl-2.5.69-3/include/asm-arm/page.h --- linux-2.5.69/include/asm-arm/page.h 2003-05-04 16:53:03.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-arm/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -181,6 +181,8 @@ static inline int get_order(unsigned lon #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* 
__KERNEL__ */ #endif diff -prauwN linux-2.5.69/include/asm-cris/page.h pgcl-2.5.69-3/include/asm-cris/page.h --- linux-2.5.69/include/asm-cris/page.h 2003-05-04 16:53:09.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-cris/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -109,6 +109,8 @@ extern unsigned long dram_start, dram_en #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _CRIS_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-generic/page.h pgcl-2.5.69-3/include/asm-generic/page.h --- linux-2.5.69/include/asm-generic/page.h 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.5.69-3/include/asm-generic/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -0,0 +1,11 @@ +#ifndef _ASM_GENERIC_PAGE_H +#define _ASM_GENERIC_PAGE_H + +#define MMUPAGE_SHIFT PAGE_SHIFT +#define MMUPAGE_SIZE PAGE_SIZE +#define MMUPAGE_MASK PAGE_MASK +#define MMUPAGE_ALIGN(x) PAGE_ALIGN(x) +#define PAGE_MMUSHIFT 0 +#define PAGE_MMUCOUNT 1 + +#endif /* _ASM_GENERIC_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-generic/rmap.h pgcl-2.5.69-3/include/asm-generic/rmap.h --- linux-2.5.69/include/asm-generic/rmap.h 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-generic/rmap.h 2003-05-26 10:36:53.000000000 -0700 @@ -15,7 +15,7 @@ * offset of the page table entry within the page table page * * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE + * scalar pte_addr_t. The pfn of the pte's page is shifted left by MMUPAGE_SIZE * bits and is then ORed with the byte offset of the pte within its page. * * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for @@ -26,7 +26,15 @@ */ #include -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) +/* + * This looks bizarre, but it's actually meaningful. + */ +#define MMUPAGES_MAPPED_PER_PTE_PAGE (PTRS_PER_PTE * PAGE_MMUCOUNT) +#define VIRT_AREA_MAPPED_PER_PTE_PAGE \ + (MMUPAGES_MAPPED_PER_PTE_PAGE*MMUPAGE_SIZE) + +static inline void pgtable_add_rmap(struct page *page, struct mm_struct *mm, + unsigned long address) { #ifdef BROKEN_PPC_PTE_ALLOC_ONE /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ @@ -35,13 +43,38 @@ static inline void pgtable_add_rmap(stru if (!mem_init_done) return; #endif + + /* rmap's accounting is already set up */ + if (page->mapping) { + /* + * address is presumably large. if smaller, overflow traps + * the error; if larger, check the distance + */ + WARN_ON(address - page->index >= VIRT_AREA_MAPPED_PER_PTE_PAGE); + return; + } + page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + page->index = address & ~(VIRT_AREA_MAPPED_PER_PTE_PAGE - 1); inc_page_state(nr_page_table_pages); } static inline void pgtable_remove_rmap(struct page * page) { + /* we're not down to a unique reference */ + if (PAGE_MMUCOUNT > 1) { + if (atomic_read(&page->count) > 1) + return; + + /* + * A zero reference count should not be possible; + * put_page() should have freed the things outright + * so this essentially means use-after-free is happening. 
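To make the pte-page rmap arithmetic below concrete (assuming i386 non-PAE with CONFIG_PAGE_CLUSTER=2, so PTRS_PER_PTE = 1024, PAGE_MMUCOUNT = 4, MMUPAGE_SIZE = 4K): MMUPAGES_MAPPED_PER_PTE_PAGE = 1024*4 = 4096 pte slots per pte page, so one pte page maps VIRT_AREA_MAPPED_PER_PTE_PAGE = 4096*4K = 16MB of user virtual space. pgtable_add_rmap() therefore rounds page->index down to a 16MB boundary, and ptep_to_address() recovers the mapped address as page->index + MMUPAGE_SIZE * (slot number of ptep among those 4096), where the slot number is simply the pte's byte offset within its PAGE_SIZE-aligned mapping divided by sizeof(pte_t).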
+ */ + else + BUG_ON(atomic_read(&page->count) <= 0); + } + page->mapping = NULL; page->index = 0; dec_page_state(nr_page_table_pages); @@ -53,20 +86,28 @@ static inline struct mm_struct * ptep_to return (struct mm_struct *) page->mapping; } +/* + * I did a poor job of isolating this from the workaround + */ static inline unsigned long ptep_to_address(pte_t * ptep) { struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; + unsigned long swpage_voff = ((unsigned long)ptep)/sizeof(pte_t); + swpage_voff %= MMUPAGES_MAPPED_PER_PTE_PAGE; + return page->index + MMUPAGE_SIZE*swpage_voff; } #if CONFIG_HIGHPTE +/* + * Recover the subpfn and add in mmupage offset to remain independent + * of pte blocksize. + */ static inline pte_addr_t ptep_to_paddr(pte_t *ptep) { - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); + unsigned long pfn, subpfn, vaddr = (unsigned long)ptep; + subpfn = (vaddr/MMUPAGE_SIZE) & (PAGE_MMUCOUNT-1); /* Z/nZ vpfndiff */ + pfn = page_to_pfn(kmap_atomic_to_page(ptep)) + subpfn; + return (pte_addr_t)pfn*MMUPAGE_SIZE + (vaddr & ~MMUPAGE_MASK); } #else static inline pte_addr_t ptep_to_paddr(pte_t *ptep) diff -prauwN linux-2.5.69/include/asm-i386/dma-mapping.h pgcl-2.5.69-3/include/asm-i386/dma-mapping.h --- linux-2.5.69/include/asm-i386/dma-mapping.h 2003-05-04 16:53:57.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/dma-mapping.h 2003-05-26 07:14:20.000000000 -0700 @@ -51,7 +51,7 @@ dma_map_page(struct device *dev, struct size_t size, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); - return (dma_addr_t)(page_to_pfn(page)) * PAGE_SIZE + offset; + return (dma_addr_t)(page_to_pfn(page)) * MMUPAGE_SIZE + offset; } static inline void diff -prauwN linux-2.5.69/include/asm-i386/fixmap.h pgcl-2.5.69-3/include/asm-i386/fixmap.h --- linux-2.5.69/include/asm-i386/fixmap.h 2003-05-04 16:53:00.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/fixmap.h 2003-05-26 07:39:52.000000000 -0700 @@ -41,8 +41,29 @@ * TLB entries of such buffers will not be flushed across * task switches. */ + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +#define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) +#define LAST_PKMAP 1024 +#define LAST_PKMAP_MASK (LAST_PKMAP-1) + enum fixed_addresses { - FIX_HOLE, + /* + * leave a hole of exactly PAGE_SIZE at the top for CONFIG_HIGHMEM + * this makes things easier on core code; the math works out funny + */ + FIX_HOLE = PAGE_MMUCOUNT > 1 ? 
PAGE_MMUCOUNT - 1 : 0, +#ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+((KM_TYPE_NR*NR_CPUS+1)*PAGE_MMUCOUNT)-1, + FIX_PKMAP_BEGIN, + FIX_PKMAP_END = FIX_PKMAP_BEGIN + (LAST_PKMAP+1)*PAGE_MMUCOUNT - 1, +#endif FIX_VSYSCALL, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ @@ -63,10 +84,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_SUMMIT FIX_CYCLONE_TIMER, /*cyclone timer register*/ #endif -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif #ifdef CONFIG_ACPI_BOOT FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, @@ -101,11 +118,11 @@ extern void __set_fixmap (enum fixed_add * the start of the fixmap. */ #define FIXADDR_TOP (0xfffff000UL) -#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << MMUPAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << MMUPAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x) & MMUPAGE_MASK)) >> MMUPAGE_SHIFT) extern void __this_fixmap_does_not_exist(void); @@ -133,8 +150,13 @@ static inline unsigned long fix_to_virt( static inline unsigned long virt_to_fix(const unsigned long vaddr) { - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + if (vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START) { + printk("bad vaddr in virt_to_fix 0x%lx\n", vaddr); + BUG(); + } return __virt_to_fix(vaddr); } +#define PKMAP_BASE fix_to_virt(FIX_PKMAP_END) + #endif diff -prauwN linux-2.5.69/include/asm-i386/highmem.h pgcl-2.5.69-3/include/asm-i386/highmem.h --- linux-2.5.69/include/asm-i386/highmem.h 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/highmem.h 2003-05-26 07:14:20.000000000 -0700 @@ -34,23 +34,8 @@ extern pte_t *pkmap_page_table; extern void kmap_init(void); -/* - * Right now we initialize only a single pte table. It can be extended - * easily, subsequent pte tables have to be allocated in one physical - * chunk of RAM. - */ -#define PKMAP_BASE (0xff800000UL) -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif -#define LAST_PKMAP_MASK (LAST_PKMAP-1) -#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) -#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) - -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +void *FASTCALL(kmap_high(struct page *page)); +void FASTCALL(kunmap_high(struct page *page)); void *kmap(struct page *page); void kunmap(struct page *page); diff -prauwN linux-2.5.69/include/asm-i386/io.h pgcl-2.5.69-3/include/asm-i386/io.h --- linux-2.5.69/include/asm-i386/io.h 2003-05-04 16:53:57.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/io.h 2003-05-26 07:14:20.000000000 -0700 @@ -95,7 +95,7 @@ static inline void * phys_to_virt(unsign /* * Change "struct page" to physical address. 
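For the page_to_phys() change just below, and the matching dma_map_page() change earlier in asm-i386/dma-mapping.h: page_to_pfn() yields the first mmupage frame of the kernel page, so physical addresses are formed by shifting that pfn by MMUPAGE_SHIFT and adding the caller's byte offset. Illustrative numbers, assuming CONFIG_PAGE_CLUSTER=2: mem_map[10] covers pfns 40..43, page_to_phys() gives 40 << 12 = 0x28000, and an offset of 0x3500 into the 16K page addresses physical 0x2b500.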
*/ -#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << MMUPAGE_SHIFT) extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); diff -prauwN linux-2.5.69/include/asm-i386/io_apic.h pgcl-2.5.69-3/include/asm-i386/io_apic.h --- linux-2.5.69/include/asm-i386/io_apic.h 2003-05-04 16:53:35.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/io_apic.h 2003-05-26 07:14:20.000000000 -0700 @@ -17,7 +17,7 @@ #define IO_APIC_BASE(idx) \ ((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \ - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK))) + + (mp_ioapics[idx].mpc_apicaddr & ~MMUPAGE_MASK))) /* * The structure of the IO-APIC: diff -prauwN linux-2.5.69/include/asm-i386/mmzone.h pgcl-2.5.69-3/include/asm-i386/mmzone.h --- linux-2.5.69/include/asm-i386/mmzone.h 2003-05-04 16:53:28.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/mmzone.h 2003-05-26 07:14:20.000000000 -0700 @@ -22,18 +22,18 @@ extern struct pglist_data *node_data[]; #define alloc_bootmem_low(x) \ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, 0) #define alloc_bootmem_node(ignore, x) \ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, 0) #define node_size(nid) (node_data[nid]->node_size) -#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) +#define node_localnr(pfn, nid) (((pfn) - node_data[nid]->node_start_pfn) / PAGE_MMUCOUNT) /* * Following are macros that each numa implmentation must define. @@ -42,25 +42,41 @@ extern struct pglist_data *node_data[]; /* * Given a kernel address, find the home node of the underlying memory. */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> MMUPAGE_SHIFT) /* * Return a pointer to the node data for node n. */ #define NODE_DATA(nid) (node_data[nid]) +/* + * These names clash. I blame mbligh. + */ #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) + +/* + * pgdat->node_size is calculated from zone_sizes[], which is in + * units of PAGE_SIZE. I don't trust this. 
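The discontigmem arithmetic around here keeps two units in play: pfns always count MMUPAGE_SIZE frames, while mem_map/zone_mem_map indices count struct pages, so every conversion scales by PAGE_MMUCOUNT. A quick sanity check with CONFIG_PAGE_CLUSTER=2 and a node with node_start_pfn = 262144 and node_size = 65536 struct pages: node_end_pfn() below gives 262144 + 65536*4 = 524288, and a pfn of 300000 in that node has node_localnr() = (300000 - 262144)/4 = 9464, i.e. it lives in the node's mem_map slot 9464.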
+ */ #define node_end_pfn(nid) \ ({ \ pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_size; \ + __pgdat->node_start_pfn + __pgdat->node_size*PAGE_MMUCOUNT; \ }) #define local_mapnr(kvaddr) \ ({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ + unsigned long __pfn = __pa(kvaddr) >> MMUPAGE_SHIFT; \ + (__pfn - node_start_pfn(pfn_to_nid(__pfn)))/PAGE_MMUCOUNT; \ +}) + +#define local_pfn(pg) \ +({ \ + struct page *__pg = pg; \ + unsigned long __nr; \ + __nr = (unsigned long)(__pg - page_zone(__pg)->zone_mem_map); \ + __nr*PAGE_MMUCOUNT; \ }) #define kern_addr_valid(kaddr) \ @@ -81,10 +97,9 @@ extern struct pglist_data *node_data[]; ({ \ struct page *__page = pg; \ struct zone *__zone = page_zone(__page); \ - (unsigned long)(__page - __zone->zone_mem_map) \ - + __zone->zone_start_pfn; \ + local_pfn(__page) + __zone->zone_start_pfn; \ }) -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> MMUPAGE_SHIFT)) /* * pfn_valid should be made as fast as possible, and the current definition * is valid for machines that are NUMA, but still contiguous, which is what diff -prauwN linux-2.5.69/include/asm-i386/numaq.h pgcl-2.5.69-3/include/asm-i386/numaq.h --- linux-2.5.69/include/asm-i386/numaq.h 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/numaq.h 2003-05-26 07:14:20.000000000 -0700 @@ -28,7 +28,7 @@ #ifdef CONFIG_X86_NUMAQ -#define MAX_NUMNODES 8 +#define MAX_NUMNODES 16 extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() @@ -159,7 +159,7 @@ struct sys_cfg_data { static inline unsigned long *get_zholes_size(int nid) { - return 0; + return NULL; } #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff -prauwN linux-2.5.69/include/asm-i386/page.h pgcl-2.5.69-3/include/asm-i386/page.h --- linux-2.5.69/include/asm-i386/page.h 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -1,13 +1,36 @@ #ifndef _I386_PAGE_H #define _I386_PAGE_H -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 +#include /* for CONFIG_PAGE_CLUSTER */ + +/* + * One mmupage is represented by one Page Table Entry at the MMU level, + * and corresponds to one page at the user process level: its size is + * the same as param.h EXEC_PAGESIZE (for getpagesize(2) and mmap(2)). + */ +#define MMUPAGE_SHIFT 12 +#define MMUPAGE_SIZE (1 << MMUPAGE_SHIFT) +#define MMUPAGE_MASK (~(MMUPAGE_SIZE-1)) + +/* + * 2**N adjacent mmupages may be clustered to make up one kernel page. + * Reasonable and tested values for PAGE_MMUSHIFT are 0 (4k page), + * 1 (8k page), 2 (16k page), 3 (32k page). Higher values will not + * work without further changes e.g. to unsigned short b_size. + */ +#define PAGE_MMUSHIFT CONFIG_PAGE_CLUSTER +#define PAGE_MMUCOUNT (1 << PAGE_MMUSHIFT) + +/* + * One kernel page is represented by one struct page (see mm.h), + * and is the kernel's principal unit of memory allocation. 
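Plugging in one of the tested configurations makes the relationships explicit. With CONFIG_PAGE_CLUSTER=2: MMUPAGE_SHIFT = 12, so MMUPAGE_SIZE = 4096 (one pte, one getpagesize() page); PAGE_MMUSHIFT = 2, so PAGE_MMUCOUNT = 4; and PAGE_SHIFT (defined just below) = 14, so PAGE_SIZE = 16384, the allocation unit covered by one struct page. Consequently pfn_to_page(pfn) = &mem_map[pfn/4] and page_to_pfn(page) = 4*(page - mem_map): struct page N spans pfns 4N..4N+3 and physical bytes 16384*N..16384*(N+1)-1. Architectures that are not converted pick up the identity definitions from the new asm-generic/page.h earlier in the patch, where PAGE_MMUCOUNT is 1 and all of this collapses back to stock behaviour.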
+ */ +#define PAGE_SHIFT (PAGE_MMUSHIFT + MMUPAGE_SHIFT) #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) +#define LARGE_PAGE_SIZE (1 << PMD_SHIFT) #ifdef __KERNEL__ #ifndef __ASSEMBLY__ @@ -53,7 +76,7 @@ typedef struct { unsigned long pgd; } pg #define pte_val(x) ((x).pte_low) #define HPAGE_SHIFT 22 #endif -#define PTE_MASK PAGE_MASK +#define PTE_MASK MMUPAGE_MASK #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) @@ -76,6 +99,7 @@ typedef struct { unsigned long pgprot; } /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) +#define MMUPAGE_ALIGN(addr) (((addr)+MMUPAGE_SIZE-1)&MMUPAGE_MASK) /* * This handles the memory map.. We could make this a config @@ -123,18 +147,22 @@ static __inline__ int get_order(unsigned #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define __MAXMEM \ + ((VMALLOC_START-2*MMUPAGE_SIZE-__PAGE_OFFSET) & LARGE_PAGE_MASK) +#define MAXMEM \ + __pa((VMALLOC_START-2*MMUPAGE_SIZE) & LARGE_PAGE_MASK) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +#define pfn_to_kaddr(pfn) __va(MMUPAGE_SIZE*(pfn)) #ifndef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) +#define pfn_to_page(pfn) (&mem_map[(pfn)/PAGE_MMUCOUNT]) +#define page_to_mapnr(page) ((unsigned long)((page) - mem_map)) +#define page_to_pfn(page) (PAGE_MMUCOUNT*page_to_mapnr(page)) +#define pfn_valid(pfn) ((pfn) < max_mapnr*PAGE_MMUCOUNT) #endif /* !CONFIG_DISCONTIGMEM */ -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr)/MMUPAGE_SIZE) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr)/MMUPAGE_SIZE) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff -prauwN linux-2.5.69/include/asm-i386/pci.h pgcl-2.5.69-3/include/asm-i386/pci.h --- linux-2.5.69/include/asm-i386/pci.h 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/pci.h 2003-05-26 07:39:52.000000000 -0700 @@ -67,13 +67,13 @@ pci_dac_page_to_dma(struct pci_dev *pdev static __inline__ struct page * pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) { - return pfn_to_page(dma_addr >> PAGE_SHIFT); + return pfn_to_page(dma_addr >> MMUPAGE_SHIFT); } static __inline__ unsigned long pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) { - return (dma_addr & ~PAGE_MASK); + return dma_addr & ~PAGE_MASK; } static __inline__ void diff -prauwN linux-2.5.69/include/asm-i386/pgalloc.h pgcl-2.5.69-3/include/asm-i386/pgalloc.h --- linux-2.5.69/include/asm-i386/pgalloc.h 2003-05-04 16:53:00.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/pgalloc.h 2003-05-26 10:37:56.000000000 -0700 @@ -2,19 +2,89 @@ #define _I386_PGALLOC_H #include +#include #include #include #include #include /* for struct page */ +#include /* to make asm-generic/rmap.h happy */ +#include /* for pgtable_remove_rmap() */ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) -static inline void 
pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +static inline void pgtable_remove_rmap(struct page *); +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page) { - set_pmd(pmd, __pmd(_PAGE_TABLE + - ((unsigned long long)page_to_pfn(pte) << - (unsigned long long) PAGE_SHIFT))); + unsigned long pfn, pmd_off = (unsigned long)pmd; + int k; + + pmd_off = (pmd_off/sizeof(pmd_t)) % PAGE_MMUCOUNT; + pfn = page_to_pfn(page); + pmd -= pmd_off; + + if (PAGE_MMUCOUNT > 1) { + struct page *old_page = NULL; + + if (atomic_read(&page->count) != 1) { + WARN_ON(1); + printk(KERN_DEBUG "bad pte refcount = %d\n", + atomic_read(&page->count)); + } + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (pmd_present(pmd[k]) || !pmd_none(pmd[k])) { + if (old_page) + WARN_ON(old_page != pmd_page(pmd[k])); + else + old_page = pmd_page(pmd[k]); + } + } + + if (!old_page || old_page == page) + atomic_set(&page->count, PAGE_MMUCOUNT); + else { + /* + * old_page->index can legitimately be 0 + * but something's corrupt if it's mapping's wrong + */ + BUG_ON((struct mm_struct *)old_page->mapping != mm); + + /* + * errant callers can potentially do things + * out-of-order + */ + WARN_ON((struct mm_struct *)page->mapping != mm); + /* if (old_page->mapping != mm) + pgtable_add_rmap(page, mm, page->index); */ + pgtable_remove_rmap(page); + put_page(page); + atomic_set(&old_page->count, PAGE_MMUCOUNT); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long long pmdval; + pmdval = page_to_pfn(old_page) + k; + pmdval <<= MMUPAGE_SHIFT; + if (pmd_present(pmd[k]) || !pmd_none(pmd[k])) { + WARN_ON(old_page != pmd_page(pmd[k])); + continue; + } else + set_pmd(&pmd[k], __pmd(_PAGE_TABLE + pmdval)); + } + return; + } + } + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long long pmdval; + pmdval = (unsigned long long)(pfn + k) << MMUPAGE_SHIFT; + if (likely(pmd_none(pmd[k]) || !pmd_present(pmd[k]))) + set_pmd(&pmd[k], __pmd(_PAGE_TABLE + pmdval)); + else { + WARN_ON(1); + printk(KERN_DEBUG "pmdval=%Lx\n", (u64)pmd_val(pmd[k])); + put_page(page); /* a reference will be omitted */ + } + } } /* * Allocate and free page tables. 
@@ -31,13 +101,28 @@ static inline void pte_free_kernel(pte_t free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct page *page) { - __free_page(pte); + if (PAGE_MMUCOUNT == 1) + __free_page(page); + else + put_page(page); } -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +struct mmu_gather; +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + /* restore the reference count so later frees don't BUG() */ + if (PAGE_MMUCOUNT > 1) { + if (atomic_dec_and_test(&page->count)) + atomic_set(&page->count, 1); + else + return; + } + tlb_remove_page(tlb, page); +} /* * allocating and freeing a pmd is trivial: the 1-entry pmd is @@ -52,4 +137,6 @@ static inline void pte_free(struct page #define check_pgt_cache() do { } while (0) +#include + #endif /* _I386_PGALLOC_H */ diff -prauwN linux-2.5.69/include/asm-i386/pgtable-2level.h pgcl-2.5.69-3/include/asm-i386/pgtable-2level.h --- linux-2.5.69/include/asm-i386/pgtable-2level.h 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/pgtable-2level.h 2003-05-26 07:14:20.000000000 -0700 @@ -17,6 +17,7 @@ #define PTRS_PER_PTE 1024 +#ifndef __ASSEMBLY__ #define pte_ERROR(e) \ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) #define pmd_ERROR(e) \ @@ -49,7 +50,7 @@ static inline int pgd_present(pgd_t pgd) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) #define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +((unsigned long) __va(pgd_val(pgd) & MMUPAGE_MASK)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { @@ -59,9 +60,11 @@ static inline pmd_t * pmd_offset(pgd_t * #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_none(x) (!(x).pte_low) -#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) -#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)(((x).pte_low>>MMUPAGE_SHIFT))) +#define pfn_pte(pfn, prot) __pte(((pfn)<> PAGE_SHIFT) | - (pte.pte_high << (32 - PAGE_SHIFT)); + return (pte.pte_low >> MMUPAGE_SHIFT) | + (pte.pte_high << (32 - MMUPAGE_SHIFT)); } static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { pte_t pte; - pte.pte_high = page_nr >> (32 - PAGE_SHIFT); - pte.pte_low = (page_nr << PAGE_SHIFT) | pgprot_val(pgprot); + pte.pte_high = page_nr >> (32 - MMUPAGE_SHIFT); + pte.pte_low = (page_nr << MMUPAGE_SHIFT) | pgprot_val(pgprot); return pte; } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); + return __pmd(((unsigned long long)page_nr << MMUPAGE_SHIFT) | pgprot_val(pgprot)); } /* @@ -123,6 +124,6 @@ static inline pmd_t pfn_pmd(unsigned lon #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 -extern struct kmem_cache_s *pae_pgd_cachep; +#endif /* !__ASSEMBLY__ */ #endif /* _I386_PGTABLE_3LEVEL_H */ diff -prauwN linux-2.5.69/include/asm-i386/pgtable.h pgcl-2.5.69-3/include/asm-i386/pgtable.h --- linux-2.5.69/include/asm-i386/pgtable.h 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/pgtable.h 2003-05-26 07:40:02.000000000 -0700 @@ -21,15 +21,28 @@ #include #endif -extern pgd_t swapper_pg_dir[1024]; -extern void 
paging_init(void); +#include +#include +#include /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[1024]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +extern struct page *zero_page; +#define ZERO_PAGE(vaddr) (zero_page) +extern pgd_t swapper_pg_dir[1024]; +extern kmem_cache_t *pgd_cache; +extern kmem_cache_t *pmd_cache; +extern spinlock_t pgd_lock; +extern struct list_head pgd_list; + +void pmd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_dtor(void *, kmem_cache_t *, unsigned long); +void pgtable_cache_init(void); +void paging_init(void); #endif /* !__ASSEMBLY__ */ @@ -38,24 +51,10 @@ extern unsigned long empty_zero_page[102 * implements both the traditional 2-level x86 page tables and the * newer 3-level PAE-mode page tables. */ -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - #else # include - -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#endif #endif #define PMD_SIZE (1UL << PMD_SHIFT) @@ -82,15 +81,15 @@ extern void pgtable_cache_init(void); * The vmalloc() routines leaves a hole of 4kB between each vmalloced * area for the same reason. ;) */ -#define VMALLOC_OFFSET (8*1024*1024) -#define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ - ~(VMALLOC_OFFSET-1)) #define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif +#define VMALLOC_END (FIXADDR_START-2*MMUPAGE_SIZE) + +#define __VMALLOC_START (VMALLOC_END - VMALLOC_RESERVE - 2*MMUPAGE_SIZE) +#define VMALLOC_START \ + (high_memory \ + ? max(__VMALLOC_START, (unsigned long)high_memory) \ + : __VMALLOC_START \ + ) /* * The 4MB page is guessing.. 
Detailed in the infamous "Chapter H" @@ -183,7 +182,7 @@ extern unsigned long pg0[1024]; #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_bad(x) ((pmd_val(x) & (~MMUPAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) @@ -238,10 +237,10 @@ static inline pte_t pte_modify(pte_t pte #define page_pte(page) page_pte_prot(page, __pgprot(0)) #define pmd_page_kernel(pmd) \ -((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +((unsigned long) __va(pmd_val(pmd) & MMUPAGE_MASK)) #ifndef CONFIG_DISCONTIGMEM -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> MMUPAGE_SHIFT)) #endif /* !CONFIG_DISCONTIGMEM */ #define pmd_large(pmd) \ @@ -283,20 +282,32 @@ static inline pte_t pte_modify(pte_t pte * control the given virtual address */ #define pte_index(address) \ - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + (((address) >> MMUPAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) +( \ + (pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) +( \ + (pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else #define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) +( \ + (pte_t *)page_address(pmd_page(*(dir))) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) diff -prauwN linux-2.5.69/include/asm-i386/rmap.h pgcl-2.5.69-3/include/asm-i386/rmap.h --- linux-2.5.69/include/asm-i386/rmap.h 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/rmap.h 2003-05-26 07:14:20.000000000 -0700 @@ -5,10 +5,17 @@ #include #ifdef CONFIG_HIGHPTE +/* + * The byte offset needs to be relative to PAGE_SIZE, the pfn will be + * implicitly truncated to a PAGE_SIZE boundary, the mapping will be + * returned rounded downward, and will need compensation by adding in + * the paddr's offset within the PAGE_SIZE-aligned region to the vaddr + * returned from kmap_atomic(). 
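/*
 * Editorial worked example of the compensation described above (not part
 * of the patch); assumes MMUPAGE_SIZE = 4KB and PAGE_SIZE = 16KB:
 *
 *	pte_paddr = 0x12345678
 *	pfn  = pte_paddr / MMUPAGE_SIZE  = 0x12345
 *	off  = pte_paddr & ~PAGE_MASK    = 0x1678
 *
 * pfn_to_page(0x12345) yields the struct page spanning pfns
 * 0x12344..0x12347; kmap_atomic() maps that whole 16KB page, so adding
 * 'off' (the offset within the PAGE_SIZE-aligned region) lands exactly
 * on the pte at physical address 0x12345678.
 */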
+ */ static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) { - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; + unsigned long pfn = (unsigned long)(pte_paddr/MMUPAGE_SIZE); + unsigned long off = (unsigned long)pte_paddr & ~PAGE_MASK; return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); } diff -prauwN linux-2.5.69/include/asm-i386/setup.h pgcl-2.5.69-3/include/asm-i386/setup.h --- linux-2.5.69/include/asm-i386/setup.h 2003-05-04 16:53:33.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/setup.h 2003-05-26 07:14:20.000000000 -0700 @@ -6,15 +6,15 @@ #ifndef _i386_SETUP_H #define _i386_SETUP_H -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) +#define PFN_UP(x) (((x) + MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT) +#define PFN_DOWN(x) ((x) >> MMUPAGE_SHIFT) +#define PFN_PHYS(x) ((x) << MMUPAGE_SHIFT) /* * Reserved space for vmalloc and iomap - defined in asm/page.h */ #define MAXMEM_PFN PFN_DOWN(MAXMEM) -#define MAX_NONPAE_PFN (1 << 20) +#define MAX_NONPAE_PFN (1 << (32 - MMUPAGE_SHIFT)) /* * This is set up by the setup-routine at boot-time diff -prauwN linux-2.5.69/include/asm-i386/shmparam.h pgcl-2.5.69-3/include/asm-i386/shmparam.h --- linux-2.5.69/include/asm-i386/shmparam.h 2003-05-04 16:53:09.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/shmparam.h 2003-05-26 07:14:20.000000000 -0700 @@ -1,6 +1,6 @@ #ifndef _ASMI386_SHMPARAM_H #define _ASMI386_SHMPARAM_H -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ +#define SHMLBA MMUPAGE_SIZE /* attach addr a multiple of this */ #endif /* _ASMI386_SHMPARAM_H */ diff -prauwN linux-2.5.69/include/asm-i386/thread_info.h pgcl-2.5.69-3/include/asm-i386/thread_info.h --- linux-2.5.69/include/asm-i386/thread_info.h 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/thread_info.h 2003-05-26 07:14:20.000000000 -0700 @@ -53,6 +53,7 @@ struct thread_info { #endif #define PREEMPT_ACTIVE 0x4000000 +#define THREAD_SIZE (2*MMUPAGE_SIZE) /* * macros/functions for gaining access to the thread information structure @@ -81,14 +82,13 @@ struct thread_info { static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } /* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info() ((struct thread_info *) __get_free_pages(GFP_KERNEL,1)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 1) +#define alloc_thread_info() ((struct thread_info *) kmalloc(THREAD_SIZE, SLAB_KERNEL)) +#define free_thread_info(ti) kfree(ti) #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) @@ -96,7 +96,7 @@ static inline struct thread_info *curren /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $~(THREAD_SIZE-1), reg; \ andl %esp, reg #endif diff -prauwN linux-2.5.69/include/asm-i386/tlbflush.h pgcl-2.5.69-3/include/asm-i386/tlbflush.h --- linux-2.5.69/include/asm-i386/tlbflush.h 2003-05-04 16:53:42.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-i386/tlbflush.h 2003-05-26 07:14:20.000000000 -0700 @@ -92,8 +92,17 @@ static inline void flush_tlb_mm(struct m static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { + int k; if 
(vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long vaddr = addr + k*MMUPAGE_SIZE; + if (vaddr < vma->vm_start) + continue; + else if (vaddr >= vma->vm_end) + break; + else + __flush_tlb_one(vaddr); + } } static inline void flush_tlb_range(struct vm_area_struct *vma, diff -prauwN linux-2.5.69/include/asm-ia64/page.h pgcl-2.5.69-3/include/asm-ia64/page.h --- linux-2.5.69/include/asm-ia64/page.h 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-ia64/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -190,4 +190,6 @@ get_order (unsigned long size) (((current->thread.flags & IA64_THREAD_XSTACK) != 0) \ ? VM_EXEC : 0)) +#include + #endif /* _ASM_IA64_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-m68k/page.h pgcl-2.5.69-3/include/asm-m68k/page.h --- linux-2.5.69/include/asm-m68k/page.h 2003-05-04 16:53:57.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-m68k/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -192,6 +192,8 @@ static inline void *__va(unsigned long x #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _M68K_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-m68knommu/page.h pgcl-2.5.69-3/include/asm-m68knommu/page.h --- linux-2.5.69/include/asm-m68knommu/page.h 2003-05-04 16:53:14.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-m68knommu/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -94,6 +94,8 @@ extern unsigned long memory_end; #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) +#include + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff -prauwN linux-2.5.69/include/asm-mips/page.h pgcl-2.5.69-3/include/asm-mips/page.h --- linux-2.5.69/include/asm-mips/page.h 2003-05-04 16:53:12.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-mips/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -85,6 +85,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* defined (__KERNEL__) */ #endif /* __ASM_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-mips64/page.h pgcl-2.5.69-3/include/asm-mips64/page.h --- linux-2.5.69/include/asm-mips64/page.h 2003-05-04 16:52:48.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-mips64/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -80,6 +80,8 @@ typedef struct { unsigned long pgprot; } #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* defined (__KERNEL__) */ #endif /* _ASM_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-parisc/page.h pgcl-2.5.69-3/include/asm-parisc/page.h --- linux-2.5.69/include/asm-parisc/page.h 2003-05-04 16:53:07.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-parisc/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -112,6 +112,8 @@ extern int npmem_ranges; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PARISC_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-ppc/page.h pgcl-2.5.69-3/include/asm-ppc/page.h --- linux-2.5.69/include/asm-ppc/page.h 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-ppc/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -142,5 +142,7 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* 
__KERNEL__ */ #endif /* _PPC_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-ppc64/page.h pgcl-2.5.69-3/include/asm-ppc64/page.h --- linux-2.5.69/include/asm-ppc64/page.h 2003-05-04 16:52:49.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-ppc64/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -200,5 +200,7 @@ static inline int get_order(unsigned lon #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PPC64_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-s390/page.h pgcl-2.5.69-3/include/asm-s390/page.h --- linux-2.5.69/include/asm-s390/page.h 2003-05-04 16:53:03.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-s390/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -181,6 +181,8 @@ typedef struct { unsigned long pgd; } pg #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _S390_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-sh/page.h pgcl-2.5.69-3/include/asm-sh/page.h --- linux-2.5.69/include/asm-sh/page.h 2003-05-04 16:53:41.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-sh/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -106,6 +106,8 @@ static __inline__ int get_order(unsigned #endif +#include + #endif /* __KERNEL__ */ #endif /* __ASM_SH_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-sparc/page.h pgcl-2.5.69-3/include/asm-sparc/page.h --- linux-2.5.69/include/asm-sparc/page.h 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-sparc/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -171,6 +171,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _SPARC_PAGE_H */ diff -prauwN linux-2.5.69/include/asm-sparc64/page.h pgcl-2.5.69-3/include/asm-sparc64/page.h --- linux-2.5.69/include/asm-sparc64/page.h 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-sparc64/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -168,6 +168,8 @@ static __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* !(__KERNEL__) */ #endif /* !(_SPARC64_PAGE_H) */ diff -prauwN linux-2.5.69/include/asm-v850/page.h pgcl-2.5.69-3/include/asm-v850/page.h --- linux-2.5.69/include/asm-v850/page.h 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-v850/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -140,6 +140,7 @@ extern __inline__ int get_order (unsigne #define __pa(x) __virt_to_phys ((unsigned long)(x)) #define __va(x) ((void *)__phys_to_virt ((unsigned long)(x))) +#include #endif /* KERNEL */ diff -prauwN linux-2.5.69/include/asm-x86_64/page.h pgcl-2.5.69-3/include/asm-x86_64/page.h --- linux-2.5.69/include/asm-x86_64/page.h 2003-05-04 16:53:00.000000000 -0700 +++ pgcl-2.5.69-3/include/asm-x86_64/page.h 2003-05-26 07:14:20.000000000 -0700 @@ -122,6 +122,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _X86_64_PAGE_H */ diff -prauwN linux-2.5.69/include/linux/aio.h pgcl-2.5.69-3/include/linux/aio.h --- linux-2.5.69/include/linux/aio.h 2003-05-04 16:52:49.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/aio.h 2003-05-26 07:14:20.000000000 -0700 @@ -104,13 +104,13 @@ struct aio_ring_info { unsigned long mmap_base; unsigned long 
mmap_size; - struct page **ring_pages; + unsigned long *ring_pages; spinlock_t ring_lock; long nr_pages; unsigned nr, tail; - struct page *internal_pages[AIO_RING_PAGES]; + unsigned long internal_pages[AIO_RING_PAGES]; /* pfn's */ }; struct kioctx { diff -prauwN linux-2.5.69/include/linux/binfmts.h pgcl-2.5.69-3/include/linux/binfmts.h --- linux-2.5.69/include/linux/binfmts.h 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/binfmts.h 2003-05-26 07:14:20.000000000 -0700 @@ -2,6 +2,7 @@ #define _LINUX_BINFMTS_H #include +#include /* for PAGE_MMUCOUNT */ struct pt_regs; @@ -9,8 +10,15 @@ struct pt_regs; * MAX_ARG_PAGES defines the number of pages allocated for arguments * and envelope for the new program. 32 should suffice, this gives * a maximum env+arg of 128kB w/4KB pages! + * Now that PAGE_SIZE is a software construct and varies wildly, + * MAX_ARG_PAGES should represent a constant size of 128KB. When + * PAGE_SIZE exceeds that, we're in trouble. */ -#define MAX_ARG_PAGES 32 +#if PAGE_MMUCOUNT <= 32 +#define MAX_ARG_PAGES (32/PAGE_MMUCOUNT) +#else +#error PAGE_SIZE too large to enforce MAX_ARG_PAGES! +#endif /* sizeof(linux_binprm->buf) */ #define BINPRM_BUF_SIZE 128 diff -prauwN linux-2.5.69/include/linux/bio.h pgcl-2.5.69-3/include/linux/bio.h --- linux-2.5.69/include/linux/bio.h 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/bio.h 2003-05-26 07:14:20.000000000 -0700 @@ -242,15 +242,14 @@ extern inline char *bio_kmap_irq(struct local_irq_save(*flags); addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_SRC_IRQ); - if (addr & ~PAGE_MASK) - BUG(); + BUG_ON (addr & ~MMUPAGE_MASK); return (char *) addr + bio_offset(bio); } extern inline void bio_kunmap_irq(char *buffer, unsigned long *flags) { - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; + unsigned long ptr = (unsigned long) buffer & MMUPAGE_MASK; kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); local_irq_restore(*flags); diff -prauwN linux-2.5.69/include/linux/highmem.h pgcl-2.5.69-3/include/linux/highmem.h --- linux-2.5.69/include/linux/highmem.h 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/highmem.h 2003-05-26 07:14:20.000000000 -0700 @@ -81,6 +81,17 @@ static inline void copy_user_highpage(st kunmap_atomic(vto, KM_USER1); } +static inline void copy_user_mmupages(struct page *dst, struct page *src, int offset, int size) +{ + char *vfrom, *vto; + + vfrom = kmap_atomic(src, KM_USER0); + vto = kmap_atomic(dst, KM_USER1); + memcpy(&vto[offset], &vfrom[offset], size); + kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst, KM_USER1); +} + static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; diff -prauwN linux-2.5.69/include/linux/ide.h pgcl-2.5.69-3/include/linux/ide.h --- linux-2.5.69/include/linux/ide.h 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/ide.h 2003-05-26 07:14:20.000000000 -0700 @@ -225,7 +225,7 @@ typedef unsigned char byte; /* used ever * allowing each to have about 256 entries (8 bytes each) from this. 
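/*
 * Editorial note (not part of the patch): sizing the PRD table from
 * MMUPAGE_SIZE in the hunk below keeps it at 4096 / (2 * 8) = 256
 * entries regardless of CONFIG_PAGE_CLUSTER; sized from PAGE_SIZE it
 * would have grown with the cluster factor (e.g. 1024 entries at 16KB
 * kernel pages), breaking the "about 256 entries" expectation above.
 */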
*/ #define PRD_BYTES 8 -#define PRD_ENTRIES (PAGE_SIZE / (2 * PRD_BYTES)) +#define PRD_ENTRIES (MMUPAGE_SIZE / (2 * PRD_BYTES)) /* * Some more useful definitions diff -prauwN linux-2.5.69/include/linux/mm.h pgcl-2.5.69-3/include/linux/mm.h --- linux-2.5.69/include/linux/mm.h 2003-05-04 16:53:00.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/mm.h 2003-05-26 07:44:46.000000000 -0700 @@ -68,7 +68,7 @@ struct vm_area_struct { struct vm_operations_struct * vm_ops; /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + unsigned long vm_pgoff; /* Offset (within vm_file) in MMUPAGE_SIZE units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ @@ -163,7 +163,8 @@ struct page { atomic_t count; /* Usage count, see below. */ struct list_head list; /* ->mapping has some page lists. */ struct address_space *mapping; /* The inode (or ...) we belong to. */ - unsigned long index; /* Our offset within mapping. */ + unsigned long index; /* Our offset within mapping. + * in PAGE_CACHE_SIZE units. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ union { @@ -330,10 +331,19 @@ static inline void set_page_zone(struct page->flags |= zone_num << ZONE_SHIFT; } -static inline void * lowmem_page_address(struct page *page) -{ - return __va( ( (page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) << PAGE_SHIFT); -} + +#if 1 +#define lowmem_page_address(page) __va(page_to_pfn(page)*MMUPAGE_SIZE) +#else + #define lowmem_page_address(page) \ +({ \ + extern unsigned long max_low_pfn; \ + const unsigned long __lpa_pfn = page_to_pfn(page); \ + BUG_ON(max_low_pfn && __lpa_pfn > max_low_pfn); \ + BUG_ON(__lpa_pfn >= (~PAGE_OFFSET+1)/MMUPAGE_SIZE); \ + __va(__lpa_pfn*MMUPAGE_SIZE); \ +}) +#endif #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL @@ -416,7 +426,7 @@ extern int vmtruncate(struct inode * ino extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); +extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, int subpfn); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); @@ -424,7 +434,7 @@ extern long sys_remap_file_pages(unsigne int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + int len, int write, int force, unsigned long *pages, struct vm_area_struct **vmas); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); @@ -509,10 +519,10 @@ static inline unsigned long do_mmap(stru unsigned long flag, unsigned long offset) { unsigned long ret = -EINVAL; - if ((offset + PAGE_ALIGN(len)) < offset) + if ((offset + MMUPAGE_ALIGN(len)) < offset) goto out; - 
if (!(offset & ~PAGE_MASK)) - ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); + if (!(offset & ~MMUPAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> MMUPAGE_SHIFT); out: return ret; } @@ -594,8 +604,18 @@ extern struct vm_area_struct *find_exten extern unsigned int nr_used_zone_pages(void); +/* + * Return byte offset from start of page containing virtual address in + * vma, to start of mmupage containing it: 0 if PAGE_MMUSHIFT 0. + */ +static inline unsigned long vma_suboffset(struct vm_area_struct *vma, unsigned long address) +{ + return (address - vma->vm_start + MMUPAGE_SIZE * vma->vm_pgoff) + & (MMUPAGE_MASK - PAGE_MASK); +} + extern struct page * vmalloc_to_page(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, +unsigned long follow_page(struct mm_struct *mm, unsigned long address, int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); diff -prauwN linux-2.5.69/include/linux/mmzone.h pgcl-2.5.69-3/include/linux/mmzone.h --- linux-2.5.69/include/linux/mmzone.h 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/mmzone.h 2003-05-26 07:14:20.000000000 -0700 @@ -20,7 +20,7 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER -#define MAX_ORDER 11 +#define MAX_ORDER (11 - PAGE_MMUSHIFT) #else #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif diff -prauwN linux-2.5.69/include/linux/pagemap.h pgcl-2.5.69-3/include/linux/pagemap.h --- linux-2.5.69/include/linux/pagemap.h 2003-05-04 16:52:49.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/pagemap.h 2003-05-26 07:14:20.000000000 -0700 @@ -23,6 +23,9 @@ #define PAGE_CACHE_MASK PAGE_MASK #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) +#define PAGE_CACHE_MMUSHIFT (PAGE_CACHE_SHIFT - MMUPAGE_SHIFT) +#define PAGE_CACHE_MMUCOUNT (PAGE_CACHE_SIZE/MMUPAGE_SIZE) + #define page_cache_get(page) get_page(page) #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); @@ -188,8 +191,8 @@ static inline int fault_in_pages_writeab * If the page was already mapped, this will get a cache miss * for sure, so try to avoid doing it. 
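/*
 * Editorial example for the MMUPAGE_MASK test in the hunk below (not
 * part of the patch): faults are taken per 4KB mmupage, so a 100-byte
 * write starting at uaddr 0x0ffc must touch both ends:
 *
 *	uaddr & MMUPAGE_MASK = 0x0000
 *	end = 0x0ffc + 100 - 1 = 0x105f,  end & MMUPAGE_MASK = 0x1000
 *
 * the masked addresses differ, so __put_user(0, end) is issued even
 * though both bytes may lie within the same (larger) kernel page.
 */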
*/ - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) + if (((unsigned long)uaddr & MMUPAGE_MASK) != + ((unsigned long)end & MMUPAGE_MASK)) ret = __put_user(0, end); } return ret; @@ -204,8 +207,8 @@ static inline void fault_in_pages_readab if (ret == 0) { const char *end = uaddr + size - 1; - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) + if (((unsigned long)uaddr & MMUPAGE_MASK) != + ((unsigned long)end & MMUPAGE_MASK)) __get_user(c, (char *)end); } } diff -prauwN linux-2.5.69/include/linux/sched.h pgcl-2.5.69-3/include/linux/sched.h --- linux-2.5.69/include/linux/sched.h 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/sched.h 2003-05-26 07:14:20.000000000 -0700 @@ -195,7 +195,7 @@ struct mm_struct { unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long rss, total_vm, locked_vm; + unsigned long rss, total_vm, locked_vm; /* in MMUPAGE_SIZE units */ unsigned long def_flags; unsigned long cpu_vm_mask; unsigned long swap_address; @@ -599,12 +599,7 @@ static inline int capable(int cap) extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ -extern inline void FASTCALL(__mmdrop(struct mm_struct *)); -static inline void mmdrop(struct mm_struct * mm) -{ - if (atomic_dec_and_test(&mm->mm_count)) - __mmdrop(mm); -} +void mmdrop(struct mm_struct * mm); /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); diff -prauwN linux-2.5.69/include/linux/shm.h pgcl-2.5.69-3/include/linux/shm.h --- linux-2.5.69/include/linux/shm.h 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/shm.h 2003-05-26 07:14:20.000000000 -0700 @@ -12,7 +12,7 @@ #define SHMMAX 0x2000000 /* max shared seg size (bytes) */ #define SHMMIN 1 /* min shared seg size (bytes) */ #define SHMMNI 4096 /* max num of segs system wide */ -#define SHMALL (SHMMAX/PAGE_SIZE*(SHMMNI/16)) /* max shm system wide (pages) */ +#define SHMALL (SHMMAX/MMUPAGE_SIZE*(SHMMNI/16)) /* max shm system wide (mmupages) */ #define SHMSEG SHMMNI /* max shared segs per process */ #include diff -prauwN linux-2.5.69/include/linux/sunrpc/svc.h pgcl-2.5.69-3/include/linux/sunrpc/svc.h --- linux-2.5.69/include/linux/sunrpc/svc.h 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/sunrpc/svc.h 2003-05-26 07:14:20.000000000 -0700 @@ -73,7 +73,8 @@ struct svc_serv { * This assumes that the non-page part of an rpc reply will fit * in a page - NFSd ensures this. lockd also has no trouble. */ -#define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE + 1) + +#define RPCSVC_MAXPAGES (2+((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE+1)) static inline u32 svc_getu32(struct iovec *iov) { diff -prauwN linux-2.5.69/include/linux/swap.h pgcl-2.5.69-3/include/linux/swap.h --- linux-2.5.69/include/linux/swap.h 2003-05-04 16:52:49.000000000 -0700 +++ pgcl-2.5.69-3/include/linux/swap.h 2003-05-26 07:14:20.000000000 -0700 @@ -45,7 +45,7 @@ static inline int current_is_kswapd(void */ union swap_header { struct { - char reserved[PAGE_SIZE - 10]; + char reserved[MMUPAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { @@ -104,8 +104,8 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7fff -#define SWAP_MAP_BAD 0x8000 +#define SWAP_MAP_MAX 0xfffe +#define SWAP_MAP_BAD 0xffff /* * The in-memory structure used to track swap areas. 
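The swap.h hunk above ties the on-disk header layout to MMUPAGE_SIZE rather than the now-variable PAGE_SIZE. A minimal standalone sketch of that layout (illustration only; swap_magic_ex is a made-up name and 4KB mmupages are assumed):

#include <stdio.h>
#include <stddef.h>

#define MMUPAGE_SIZE 4096		/* assumed i386 mmupage size */

struct swap_magic_ex {			/* mirrors union swap_header's magic layout */
	char reserved[MMUPAGE_SIZE - 10];
	char magic[10];			/* "SWAP-SPACE" or "SWAPSPACE2" */
};

int main(void)
{
	/* the signature stays at byte 4086 of the device's first 4KB */
	printf("magic offset = %zu\n", offsetof(struct swap_magic_ex, magic));
	return 0;
}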
diff -prauwN linux-2.5.69/init/main.c pgcl-2.5.69-3/init/main.c --- linux-2.5.69/init/main.c 2003-05-04 16:53:03.000000000 -0700 +++ pgcl-2.5.69-3/init/main.c 2003-05-26 07:14:20.000000000 -0700 @@ -372,6 +372,7 @@ static void rest_init(void) cpu_idle(); } + /* * Activate the first processor. */ @@ -422,9 +423,9 @@ asmlinkage void __init start_kernel(void calibrate_delay(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && - initrd_start < min_low_pfn << PAGE_SHIFT) { + initrd_start < min_low_pfn << MMUPAGE_SHIFT) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " - "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); + "disabling it.\n",initrd_start,min_low_pfn << MMUPAGE_SHIFT); initrd_start = 0; } #endif diff -prauwN linux-2.5.69/ipc/shm.c pgcl-2.5.69-3/ipc/shm.c --- linux-2.5.69/ipc/shm.c 2003-05-04 16:53:28.000000000 -0700 +++ pgcl-2.5.69-3/ipc/shm.c 2003-05-26 07:23:28.000000000 -0700 @@ -110,7 +110,7 @@ static void shm_open (struct vm_area_str */ static void shm_destroy (struct shmid_kernel *shp) { - shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; + shm_tot -= (shp->shm_segsz + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; shm_rmid (shp->id); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) @@ -169,7 +169,7 @@ static int newseg (key_t key, int shmflg { int error; struct shmid_kernel *shp; - int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; + int numpages = (size + MMUPAGE_SIZE -1) >> MMUPAGE_SHIFT; struct file * file; char name[13]; int id; @@ -713,7 +713,7 @@ asmlinkage long sys_shmat (int shmid, ch * space left for the stack to grow (at least 4 pages). */ if (addr < current->mm->start_stack && - addr > current->mm->start_stack - size - PAGE_SIZE * 5) + addr > current->mm->start_stack - size - MMUPAGE_SIZE * 5) goto invalid; } @@ -771,7 +771,7 @@ asmlinkage long sys_shmdt(char *shmaddr) * otherwise it starts at this address with no hassles. */ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { + (vma->vm_start - addr)/MMUPAGE_SIZE == vma->vm_pgoff) { size = vma->vm_file->f_dentry->d_inode->i_size; @@ -799,7 +799,7 @@ asmlinkage long sys_shmdt(char *shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) + (vma->vm_start - addr)/MMUPAGE_SIZE == vma->vm_pgoff) do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); vma = next; diff -prauwN linux-2.5.69/kernel/fork.c pgcl-2.5.69-3/kernel/fork.c --- linux-2.5.69/kernel/fork.c 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/kernel/fork.c 2003-05-26 07:14:20.000000000 -0700 @@ -199,7 +199,7 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. */ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + max_threads = mempages / 8; /* * we need to allow at least 20 threads to boot a system */ @@ -279,7 +279,7 @@ static inline int dup_mmap(struct mm_str if(mpnt->vm_flags & VM_DONTCOPY) continue; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> MMUPAGE_SHIFT; if (!vm_enough_memory(len)) goto fail_nomem; charge += len; @@ -401,8 +401,11 @@ struct mm_struct * mm_alloc(void) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. 
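/*
 * Editorial note (not part of the patch): mmdrop() is made out of line
 * in the hunk below and now performs the atomic_dec_and_test() itself,
 * so callers keep the usual
 *
 *	mmdrop(mm);	/- frees the pgd and the mm on the last reference -/
 *
 * while sched.h (in the hunk above) drops the inline wrapper around the
 * old __mmdrop().
 */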
*/ -inline void __mmdrop(struct mm_struct *mm) +void mmdrop(struct mm_struct *mm) { + if (!atomic_dec_and_test(&mm->mm_count)) + return; + BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); diff -prauwN linux-2.5.69/kernel/futex.c pgcl-2.5.69-3/kernel/futex.c --- linux-2.5.69/kernel/futex.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/kernel/futex.c 2003-05-26 07:14:20.000000000 -0700 @@ -109,14 +109,15 @@ static inline void tell_waiter(struct fu static struct page *__pin_page(unsigned long addr) { struct mm_struct *mm = current->mm; - struct page *page, *tmp; + unsigned long pfn, tmp; int err; /* * Do a quick atomic lookup first - this is the fastpath. */ - page = follow_page(mm, addr, 0); - if (likely(page != NULL)) { + pfn = follow_page(mm, addr, 0); + if (likely(pfn != 0)) { + struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) get_page(page); return page; @@ -130,7 +131,7 @@ repeat_lookup: unlock_futex_mm(); down_read(&mm->mmap_sem); - err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL); + err = get_user_pages(current, mm, addr, 1, 0, 0, &pfn, NULL); up_read(&mm->mmap_sem); lock_futex_mm(); @@ -142,12 +143,18 @@ repeat_lookup: * check for races: */ tmp = follow_page(mm, addr, 0); - if (tmp != page) { + + /* + * XXX: this is weird, it can refer to a different pfn in the + * same page. Counts as a race in my book. + */ + if (tmp != pfn) { + struct page *page = pfn_to_page(pfn); put_page(page); goto repeat_lookup; } - return page; + return pfn ? pfn_to_page(pfn) : NULL; } static inline void unpin_page(struct page *page) @@ -173,6 +180,11 @@ static int futex_wake(unsigned long uadd return -EFAULT; } + /* + * XXX: I broke this. + * This needs to include a suboffset w/in the struct page's area. + * -- wli + */ head = hash_futex(page, offset); list_for_each_safe(i, next, head) { @@ -425,7 +437,7 @@ long do_futex(unsigned long uaddr, int o unsigned long pos_in_page; int ret; - pos_in_page = uaddr % PAGE_SIZE; + pos_in_page = uaddr % MMUPAGE_SIZE; /* Must be "naturally" aligned */ if (pos_in_page % sizeof(u32)) diff -prauwN linux-2.5.69/kernel/ksyms.c pgcl-2.5.69-3/kernel/ksyms.c --- linux-2.5.69/kernel/ksyms.c 2003-05-04 16:52:49.000000000 -0700 +++ pgcl-2.5.69-3/kernel/ksyms.c 2003-05-26 07:14:20.000000000 -0700 @@ -128,7 +128,6 @@ EXPORT_SYMBOL(kmap_high); EXPORT_SYMBOL(kunmap_high); EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(kmap_prot); -EXPORT_SYMBOL(kmap_pte); #endif #ifdef HASHED_PAGE_VIRTUAL EXPORT_SYMBOL(page_address); diff -prauwN linux-2.5.69/kernel/ptrace.c pgcl-2.5.69-3/kernel/ptrace.c --- linux-2.5.69/kernel/ptrace.c 2003-05-04 16:53:57.000000000 -0700 +++ pgcl-2.5.69-3/kernel/ptrace.c 2003-05-26 07:14:20.000000000 -0700 @@ -155,27 +155,32 @@ int access_process_vm(struct task_struct struct mm_struct *mm; struct vm_area_struct *vma; struct page *page; + unsigned long pfn = 0; void *old_buf = buf; mm = get_task_mm(tsk); - if (!mm) + if (!mm) { + printk("get_task_mm() failed in access_process_vm()\n"); return 0; + } down_read(&mm->mmap_sem); /* ignore errors, just check how much was sucessfully transfered */ while (len) { int bytes, ret, offset; + unsigned long dst_off; void *maddr; - ret = get_user_pages(current, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) + ret = get_user_pages(current, mm, addr, 1, write, 1, &pfn, &vma); + if (ret <= 0) { + printk("get_uesr_pages() failed in access_process_vm()\n"); break; + } bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; + offset 
= addr & ~MMUPAGE_MASK; + if (bytes > MMUPAGE_SIZE-offset) + bytes = MMUPAGE_SIZE-offset; flush_cache_page(vma, addr); @@ -185,12 +190,14 @@ int access_process_vm(struct task_struct * to handle this correctly. */ + page = pfn_to_page(pfn); maddr = kmap(page); + dst_off = (pfn % PAGE_MMUCOUNT)*MMUPAGE_SIZE; if (write) { - memcpy(maddr + offset, buf, bytes); + memcpy(maddr + offset + dst_off, buf, bytes); flush_icache_user_range(vma, page, addr, bytes); } else { - memcpy(buf, maddr + offset, bytes); + memcpy(buf, maddr + offset + dst_off, bytes); } kunmap(page); page_cache_release(page); diff -prauwN linux-2.5.69/mm/bootmem.c pgcl-2.5.69-3/mm/bootmem.c --- linux-2.5.69/mm/bootmem.c 2003-05-04 16:53:09.000000000 -0700 +++ pgcl-2.5.69-3/mm/bootmem.c 2003-05-26 07:14:20.000000000 -0700 @@ -33,10 +33,7 @@ unsigned long __init bootmem_bootmap_pag unsigned long mapsize; mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; - - return mapsize; + return (mapsize + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; } /* @@ -46,14 +43,17 @@ static unsigned long __init init_bootmem unsigned long mapstart, unsigned long start, unsigned long end) { bootmem_data_t *bdata = pgdat->bdata; - unsigned long mapsize = ((end - start)+7)/8; + unsigned long mapsize; pgdat->pgdat_next = pgdat_list; pgdat_list = pgdat; + /* round start down to simplify free_all_bootmem_core() */ + start &= ~(PAGE_MMUCOUNT - 1); + mapsize = ((end - start)+7)/8; mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); - bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); - bdata->node_boot_start = (start << PAGE_SHIFT); + bdata->node_bootmem_map = phys_to_virt(mapstart << MMUPAGE_SHIFT); + bdata->node_boot_start = (start << MMUPAGE_SHIFT); bdata->node_low_pfn = end; /* @@ -77,10 +77,10 @@ static void __init reserve_bootmem_core( * round up, partially reserved pages are considered * fully reserved. */ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; + unsigned long sidx = (addr - bdata->node_boot_start)/MMUPAGE_SIZE; unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + unsigned long end_pfn = (addr + size + MMUPAGE_SIZE-1)/MMUPAGE_SIZE; if (!size) BUG(); @@ -90,13 +90,11 @@ static void __init reserve_bootmem_core( BUG(); if (sidx >= eidx) BUG(); - if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) - BUG(); - if (end > bdata->node_low_pfn) + if (end_pfn > bdata->node_low_pfn) BUG(); for (i = sidx; i < eidx; i++) if (test_and_set_bit(i, bdata->node_bootmem_map)) - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); + printk("hm, page %08lx reserved twice.\n", i*MMUPAGE_SIZE); } static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) @@ -108,11 +106,11 @@ static void __init free_bootmem_core(boo * considered reserved. */ unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; + unsigned long eidx = (addr + size - bdata->node_boot_start)/MMUPAGE_SIZE; + unsigned long end_pfn = (addr + size)/MMUPAGE_SIZE; if (!size) BUG(); - if (end > bdata->node_low_pfn) + if (end_pfn > bdata->node_low_pfn) BUG(); if (addr < bdata->last_success) @@ -121,8 +119,8 @@ static void __init free_bootmem_core(boo /* * Round up the beginning of the address. 
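/*
 * Editorial worked example (not part of the patch; node_boot_start == 0
 * assumed): the bootmem bitmap is now one bit per 4KB mmupage, so
 * reserving size = 5000 bytes at addr = 0x3800 marks
 *
 *	sidx = 0x3800 / 4096                   = 3
 *	eidx = (0x3800 + 5000 + 4095) / 4096   = 5
 *
 * i.e. bits 3 and 4: partially covered mmupages count as fully
 * reserved, exactly as the old code did per PAGE_SIZE page.
 */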
*/ - start = (addr + PAGE_SIZE-1) / PAGE_SIZE; - sidx = start - (bdata->node_boot_start/PAGE_SIZE); + start = (addr + MMUPAGE_SIZE-1) / MMUPAGE_SIZE; + sidx = start - (bdata->node_boot_start/MMUPAGE_SIZE); for (i = sidx; i < eidx; i++) { if (!test_and_clear_bit(i, bdata->node_bootmem_map)) @@ -154,19 +152,19 @@ __alloc_bootmem_core(struct bootmem_data BUG_ON(!size); BUG_ON(align & (align-1)); - eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + eidx = bdata->node_low_pfn - (bdata->node_boot_start >> MMUPAGE_SHIFT); offset = 0; if (align && (bdata->node_boot_start & (align - 1UL)) != 0) offset = (align - (bdata->node_boot_start & (align - 1UL))); - offset >>= PAGE_SHIFT; + offset >>= MMUPAGE_SHIFT; /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. */ if (goal && (goal >= bdata->node_boot_start) && - ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { + ((goal >> MMUPAGE_SHIFT) < bdata->node_low_pfn)) { preferred = goal - bdata->node_boot_start; if (bdata->last_success >= preferred) @@ -174,10 +172,10 @@ __alloc_bootmem_core(struct bootmem_data } else preferred = 0; - preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; + preferred = ((preferred + align - 1) & ~(align - 1)) >> MMUPAGE_SHIFT; preferred += offset; - areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; - incr = align >> PAGE_SHIFT ? : 1; + areasize = (size+MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + incr = align >> MMUPAGE_SHIFT ? : 1; restart_scan: for (i = preferred; i < eidx; i += incr) { @@ -205,7 +203,7 @@ restart_scan: return NULL; found: - bdata->last_success = start << PAGE_SHIFT; + bdata->last_success = start << MMUPAGE_SHIFT; BUG_ON(start >= eidx); /* @@ -213,30 +211,30 @@ found: * of this allocation's buffer? If yes then we can 'merge' * the previous partial page with this allocation. 
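/*
 * Editorial worked example of the merge below (not part of the patch):
 * suppose the previous allocation ended at last_offset = 100 within
 * mmupage last_pos, and the next request is size = 200, align = 4:
 *
 *	offset         = (100 + 4 - 1) & ~(4 - 1) = 100
 *	remaining_size = 4096 - 100               = 3996  (>= 200)
 *
 * so the request is carved out of the same 4KB mmupage: last_pos is
 * unchanged and last_offset becomes 300.
 */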
*/ - if (align < PAGE_SIZE && + if (align < MMUPAGE_SIZE && bdata->last_offset && bdata->last_pos+1 == start) { offset = (bdata->last_offset+align-1) & ~(align-1); - BUG_ON(offset > PAGE_SIZE); - remaining_size = PAGE_SIZE-offset; + BUG_ON(offset > MMUPAGE_SIZE); + remaining_size = MMUPAGE_SIZE-offset; if (size < remaining_size) { areasize = 0; /* last_pos unchanged */ bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + ret = phys_to_virt(bdata->last_pos*MMUPAGE_SIZE + offset + bdata->node_boot_start); } else { remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + areasize = (remaining_size+MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + ret = phys_to_virt(bdata->last_pos*MMUPAGE_SIZE + offset + bdata->node_boot_start); bdata->last_pos = start+areasize-1; bdata->last_offset = remaining_size; } - bdata->last_offset &= ~PAGE_MASK; + bdata->last_offset &= ~MMUPAGE_MASK; } else { bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + bdata->last_offset = size & ~MMUPAGE_MASK; + ret = phys_to_virt(start * MMUPAGE_SIZE + bdata->node_boot_start); } /* @@ -253,49 +251,37 @@ static unsigned long __init free_all_boo { struct page *page = pgdat->node_mem_map; bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; - unsigned long idx; + unsigned long i, total = 0; + unsigned long idx, mapnr, node_low_mapnr; unsigned long *map; - if (!bdata->node_bootmem_map) BUG(); - - count = 0; - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + BUG_ON(!bdata->node_bootmem_map); map = bdata->node_bootmem_map; - for (i = 0; i < idx; ) { - unsigned long v = ~map[i / BITS_PER_LONG]; - if (v) { - unsigned long m; - for (m = 1; m && i < idx; m<<=1, page++, i++) { - if (v & m) { - count++; + i = 0; + idx = bdata->node_low_pfn - (bdata->node_boot_start >> MMUPAGE_SHIFT); + + node_low_mapnr = (bdata->node_low_pfn - bdata->node_boot_start/MMUPAGE_SIZE)/PAGE_MMUCOUNT; + for (mapnr = 0; mapnr < node_low_mapnr; ++mapnr) { + int k, should_free = 1; + for (k = 0; k < PAGE_MMUCOUNT; ++k) + if (test_bit(mapnr*PAGE_MMUCOUNT + k, map)) + should_free = 0; + if (should_free) { + page = &pgdat->node_mem_map[mapnr]; ClearPageReserved(page); set_page_count(page, 1); __free_page(page); + ++total; } } - } else { - i+=BITS_PER_LONG; - page += BITS_PER_LONG; - } - } - total += count; /* - * Now free the allocator bitmap itself, it's not - * needed anymore: - */ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); } - total += count; - bdata->node_bootmem_map = NULL; + * Leak the allocator bitmap; it's not worth saving. 
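/*
 * Editorial note on the loop above (not part of the patch): a struct
 * page is handed to the buddy allocator only if none of its
 * PAGE_MMUCOUNT bootmem bits is set; with 16KB kernel pages, mapnr 7 is
 * freed only when bits 28..31 are all clear. A single reserved 4KB
 * mmupage therefore keeps the whole 16KB page reserved, which is why
 * init_bootmem_core() rounds its start pfn down to a PAGE_MMUCOUNT
 * boundary in the earlier hunk.
 */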
+ */ + bdata->node_bootmem_map = NULL; + printk("bootmem: freed %lx pages in node %d\n", total, pgdat->node_id); return total; } diff -prauwN linux-2.5.69/mm/filemap.c pgcl-2.5.69-3/mm/filemap.c --- linux-2.5.69/mm/filemap.c 2003-05-04 16:53:09.000000000 -0700 +++ pgcl-2.5.69-3/mm/filemap.c 2003-05-26 07:14:20.000000000 -0700 @@ -946,8 +946,8 @@ struct page * filemap_nopage(struct vm_a unsigned long size, pgoff, endoff; int did_readahead; - pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; - endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + pgoff = (address - area->vm_start)/MMUPAGE_SIZE + area->vm_pgoff; + endoff = (area->vm_end - area->vm_start + MMUPAGE_SIZE - 1)/MMUPAGE_SIZE + area->vm_pgoff; retry_all: /* @@ -955,15 +955,15 @@ retry_all: * accessible.. */ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if ((pgoff >= size) && (area->vm_mm == current->mm)) + if ((pgoff/PAGE_CACHE_MMUCOUNT >= size) && (area->vm_mm == current->mm)) return NULL; /* * The "size" of the file, as far as mmap is concerned, isn't bigger * than the mapping */ - if (size > endoff) - size = endoff; + if (size > endoff/PAGE_CACHE_MMUCOUNT) + size = endoff/PAGE_CACHE_MMUCOUNT; did_readahead = 0; @@ -973,23 +973,23 @@ retry_all: */ if (VM_SequentialReadHint(area)) { did_readahead = 1; - page_cache_readahead(mapping, ra, file, pgoff); + page_cache_readahead(mapping, ra, file, pgoff/PAGE_CACHE_MMUCOUNT); } /* * If the offset is outside the mapping size we're off the end * of a privately mapped file, so we need to map a zero page. */ - if ((pgoff < size) && !VM_RandomReadHint(area)) { + if ((pgoff/PAGE_CACHE_MMUCOUNT < size) && !VM_RandomReadHint(area)) { did_readahead = 1; - page_cache_readaround(mapping, ra, file, pgoff); + page_cache_readaround(mapping, ra, file, pgoff/PAGE_CACHE_MMUCOUNT); } /* * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_get_page(mapping, pgoff/PAGE_CACHE_MMUCOUNT); if (!page) { if (did_readahead) { handle_ra_miss(mapping, ra, pgoff); @@ -1017,7 +1017,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, pgoff); + error = page_cache_read(file, pgoff/PAGE_CACHE_MMUCOUNT); /* * The page we want has now been added to the page cache. @@ -1105,7 +1105,7 @@ static struct page * filemap_getpage(str * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_get_page(mapping, pgoff/PAGE_CACHE_MMUCOUNT); if (!page) { if (nonblock) return NULL; @@ -1127,7 +1127,7 @@ success: return page; no_cached_page: - error = page_cache_read(file, pgoff); + error = page_cache_read(file, pgoff/PAGE_CACHE_MMUCOUNT); /* * The page we want has now been added to the page cache. 
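/*
 * Editorial example (not part of the patch), assuming 16KB kernel pages
 * (PAGE_CACHE_MMUCOUNT == 4): filemap_nopage()/filemap_getpage() now
 * compute pgoff in 4KB mmupage units, so a fault three mmupages into a
 * mapping with vm_pgoff == 0 gives
 *
 *	pgoff = 3,   page cache index = pgoff / 4 = 0
 *
 * i.e. find_get_page()/page_cache_read() still operate on whole 16KB
 * cache pages, while pgoff % 4 selects the 4KB sub-page (the 'subpfn'
 * handed to install_page() further down).
 */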
@@ -1218,26 +1218,30 @@ static int filemap_populate(struct vm_ar if (!nonblock) do_page_cache_readahead(mapping, vma->vm_file, - pgoff, len >> PAGE_CACHE_SHIFT); + pgoff/PAGE_MMUCOUNT, len >> PAGE_CACHE_SHIFT); repeat: size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) + if (pgoff + len/MMUPAGE_SIZE > size) return -EINVAL; page = filemap_getpage(file, pgoff, nonblock); if (!page && !nonblock) return -ENOMEM; if (page) { - err = install_page(mm, vma, addr, page, prot); + /* + * page caches bytes index*PAGE_SIZE to index*(PAGE_SIZE+1)-1 + * pgoff % PAGE_MMUCOUNT is the subpfn w/in the page + */ + err = install_page(mm, vma, addr, page, prot, pgoff % PAGE_MMUCOUNT); if (err) { page_cache_release(page); return err; } } - len -= PAGE_SIZE; - addr += PAGE_SIZE; + len -= MMUPAGE_SIZE; + addr += MMUPAGE_SIZE; pgoff++; if (len) goto repeat; diff -prauwN linux-2.5.69/mm/fremap.c pgcl-2.5.69-3/mm/fremap.c --- linux-2.5.69/mm/fremap.c 2003-05-04 16:52:49.000000000 -0700 +++ pgcl-2.5.69-3/mm/fremap.c 2003-05-26 07:14:20.000000000 -0700 @@ -53,13 +53,14 @@ static inline int zap_pte(struct mm_stru * previously existing mapping. */ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, struct page *page, pgprot_t prot) + unsigned long addr, struct page *page, pgprot_t prot, int subpfn) { int err = -ENOMEM, flush; pte_t *pte; pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pfn = page_to_pfn(page) + subpfn; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) @@ -79,7 +80,7 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); + set_pte(pte, pfn_pte(pfn, prot)); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); if (flush) @@ -127,8 +128,8 @@ long sys_remap_file_pages(unsigned long /* * Sanitize the syscall parameters: */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; + start = start & MMUPAGE_MASK; + size = size & MMUPAGE_MASK; /* Does the address range wrap, or is the span zero-sized? */ if (start + size <= start) @@ -136,7 +137,7 @@ long sys_remap_file_pages(unsigned long /* Can we represent this offset inside this architecture's pte's? 
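/*
 * Editorial note on install_page() above (not part of the patch): the
 * new 'subpfn' argument selects which 4KB mmupage of the cache page the
 * pte maps; the pte is built as
 *
 *	pfn_pte(page_to_pfn(page) + subpfn, prot)
 *
 * and callers such as filemap_populate() pass subpfn = pgoff %
 * PAGE_MMUCOUNT, while sys_remap_file_pages() now sanitizes start and
 * size to MMUPAGE_SIZE granularity.
 */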
*/ #if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) + if (pgoff + (size >> MMUPAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) return err; #endif diff -prauwN linux-2.5.69/mm/highmem.c pgcl-2.5.69-3/mm/highmem.c --- linux-2.5.69/mm/highmem.c 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/mm/highmem.c 2003-05-26 07:14:20.000000000 -0700 @@ -54,8 +54,6 @@ static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -pte_t * pkmap_page_table; - static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); static void flush_all_zero_pkmaps(void) @@ -65,6 +63,8 @@ static void flush_all_zero_pkmaps(void) flush_cache_all(); for (i = 0; i < LAST_PKMAP; i++) { + int j; + unsigned long vaddr = PKMAP_ADDR(i); struct page *page; /* @@ -78,8 +78,14 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - if (pte_none(pkmap_page_table[i])) - BUG(); + for (j = 0; j < PAGE_MMUCOUNT; ++j) { + unsigned long addr = vaddr + j*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + + BUG_ON(pte_none(*pte)); + } /* * Don't need an atomic fetch-and-clear op here; @@ -88,8 +94,20 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); - pte_clear(&pkmap_page_table[i]); + { + pgd_t *pgd = pgd_offset_k(vaddr); + pmd_t *pmd = pmd_offset(pgd, vaddr); + pte_t *pte = pte_offset_kernel(pmd, vaddr); + page = pte_page(*pte); + } + + for (j = 0; j < PAGE_MMUCOUNT; ++j) { + unsigned long addr = vaddr + j*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_clear(pte); + } set_page_address(page, NULL); } @@ -99,7 +117,7 @@ static void flush_all_zero_pkmaps(void) static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; - int count; + int k, count; start: count = LAST_PKMAP; @@ -137,7 +155,15 @@ start: } } vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + WARN_ON(vaddr > __fix_to_virt(FIX_PKMAP_BEGIN)); + WARN_ON(vaddr < __fix_to_virt(FIX_PKMAP_END)); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long addr = vaddr + k * MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + set_pte(pte, pfn_pte(page_to_pfn(page) + k, kmap_prot)); + } pkmap_count[last_pkmap_nr] = 1; set_page_address(page, (void *)vaddr); @@ -479,12 +505,19 @@ void check_highmem_ptes(void) preempt_disable(); for (type = 0; type < KM_TYPE_NR; type++) { + int k; idx = type + KM_TYPE_NR*smp_processor_id(); - if (!pte_none(*(kmap_pte-idx))) { + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long addr = __fix_to_virt(FIX_KMAP_END) + idx*PAGE_SIZE + k*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + if (!pte_none(*pte)) { printk("scheduling with KM_TYPE %d held!\n", type); BUG(); } } + } preempt_enable(); } #endif diff -prauwN linux-2.5.69/mm/madvise.c pgcl-2.5.69-3/mm/madvise.c --- linux-2.5.69/mm/madvise.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/mm/madvise.c 2003-05-26 07:14:20.000000000 -0700 @@ -60,10 +60,12 @@ static long madvise_willneed(struct vm_a if 
(!file) return -EBADF; - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + start = ((start - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + end = ((end - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; + start /= PAGE_MMUCOUNT; + end /= PAGE_MMUCOUNT; do_page_cache_readahead(file->f_dentry->d_inode->i_mapping, file, start, max_sane_readahead(end - start)); @@ -170,9 +172,9 @@ asmlinkage long sys_madvise(unsigned lon down_write(¤t->mm->mmap_sem); - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) goto out; - len = (len + ~PAGE_MASK) & PAGE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; diff -prauwN linux-2.5.69/mm/memory.c pgcl-2.5.69-3/mm/memory.c --- linux-2.5.69/mm/memory.c 2003-05-04 16:53:14.000000000 -0700 +++ pgcl-2.5.69-3/mm/memory.c 2003-05-26 09:39:31.000000000 -0700 @@ -150,11 +150,32 @@ pte_t * pte_alloc_map(struct mm_struct * /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. + * If we raced, we also need to drop all the reference + * counts originally taken with the intent of conferring + * them to all the pte entries spanned by the pte page. */ if (pmd_present(*pmd)) { + if (PAGE_MMUCOUNT > 1) + atomic_sub(PAGE_MMUCOUNT-1, &new->count); pte_free(new); goto out; } +#if 0 + { + int k; + pmd_t *base; + unsigned long addr, __pmd = (unsigned long)pmd; + addr = address & ~(PAGE_MMUCOUNT*PMD_SIZE - 1); + base = pmd - ((__pmd/sizeof(pmd_t)) % PAGE_MMUCOUNT); + for (k = 0; k < PAGE_MMUCOUNT; ++k) + if (!pmd_none(base[k]) || pmd_present(base[k])) + printk(KERN_DEBUG + "redundant pmd instantiation " + "at vaddr 0x%lx pmd = 0x%p\n", + addr + PMD_SIZE*k, + &base[k]); + } +#endif pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } @@ -347,7 +368,7 @@ skip_copy_pte_range: src_pte = pte_offset_map_nested(src_pmd, address); cont_copy_pte_range_noset: - address += PAGE_SIZE; + address += MMUPAGE_SIZE; if (address >= end) { pte_unmap_nested(src_pte); pte_unmap(dst_pte); @@ -393,8 +414,8 @@ zap_pte_range(struct mmu_gather *tlb, pm offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; - size &= PAGE_MASK; - for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + size &= MMUPAGE_MASK; + for (offset=0; offset < size; ptep++, offset += MMUPAGE_SIZE) { pte_t pte = *ptep; if (pte_none(pte)) continue; @@ -417,8 +438,14 @@ zap_pte_range(struct mmu_gather *tlb, pm } } } else { - if (!pte_file(pte)) + if (!pte_file(pte)) { + if (pte_to_swp_entry(pte).val == 0x8073756dUL) + printk(KERN_DEBUG "detected fsckup " + "early, leaking stuff to " + "work around it\n"); + else free_swap_and_cache(pte_to_swp_entry(pte)); + } pte_clear(ptep); } } @@ -474,12 +501,12 @@ void unmap_page_range(struct mmu_gather /* Dispose of an entire struct mmu_gather per rescheduling point */ #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE) +#define ZAP_BLOCK_SIZE (FREE_PTE_NR * MMUPAGE_SIZE) #endif /* For UP, 256 pages at a time gives nice low latency */ #if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) +#define ZAP_BLOCK_SIZE (256 * MMUPAGE_SIZE) #endif /* No preempt: go for the best straight-line efficiency */ @@ -524,10 +551,10 @@ int unmap_vmas(struct mmu_gather **tlbp, if (vma) { /* debug. killme. 
*/ if (end_addr <= vma->vm_start) - printk("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n", + pr_debug("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n", __FUNCTION__, end_addr, vma->vm_start); if (start_addr >= vma->vm_end) - printk("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n", + pr_debug("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n", __FUNCTION__, start_addr, vma->vm_end); } @@ -543,7 +570,7 @@ int unmap_vmas(struct mmu_gather **tlbp, continue; if (vma->vm_flags & VM_ACCOUNT) - *nr_accounted += (end - start) >> PAGE_SHIFT; + *nr_accounted += (end - start) >> MMUPAGE_SHIFT; ret++; while (start != end) { @@ -573,7 +600,7 @@ int unmap_vmas(struct mmu_gather **tlbp, zap_bytes = ZAP_BLOCK_SIZE; } if (vma->vm_next && vma->vm_next->vm_start < vma->vm_end) - printk("%s: VMA list is not sorted correctly!\n", + pr_debug("%s: VMA list is not sorted correctly!\n", __FUNCTION__); } return ret; @@ -612,18 +639,19 @@ void zap_page_range(struct vm_area_struc * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. */ -struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) +unsigned long follow_page(struct mm_struct *mm, unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; unsigned long pfn; - struct vm_area_struct *vma; +#if 0 + struct vm_area_struct *vma; vma = hugepage_vma(mm, address); if (vma) return follow_huge_addr(mm, vma, address, write); +#endif pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) @@ -632,8 +660,15 @@ follow_page(struct mm_struct *mm, unsign pmd = pmd_offset(pgd, address); if (pmd_none(*pmd)) goto out; + + /* + * hugetlb's still broken in pgcl; not difficult to fix, + * but an unnecessary distraction while it's in flux + */ +#if 0 if (pmd_huge(*pmd)) return follow_huge_pmd(mm, address, pmd, write); +#endif if (pmd_bad(*pmd)) goto out; @@ -647,12 +682,12 @@ follow_page(struct mm_struct *mm, unsign if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); if (pfn_valid(pfn)) - return pfn_to_page(pfn); + return pfn; /* pfn_to_page(pfn) */ } } out: - return NULL; + return 0; /* NULL */ } /* @@ -664,14 +699,26 @@ out: static inline struct page *get_page_map(struct page *page) { if (!pfn_valid(page_to_pfn(page))) - return 0; + return NULL; return page; } +static inline unsigned long get_pfn_map(unsigned long pfn) +{ + return pfn_valid(pfn) ? pfn : 0; +} + +/* + * This puppy is handing back MMUPAGE_SIZE -sized slots. + * Callers need auditing. + * This function is a goddamn train wreck. Someone needs to + * janitor the idiot thing for mainline to at very least kill + * the #ifdef FIXADDR_START bullcrap. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) + unsigned long *pfns, struct vm_area_struct **vmas) { int i; unsigned int flags; @@ -701,46 +748,48 @@ int get_user_pages(struct task_struct *t .vm_page_prot = PAGE_READONLY, .vm_flags = VM_READ | VM_EXEC, }; - unsigned long pg = start & PAGE_MASK; + unsigned long pg = start & MMUPAGE_MASK; pgd_t *pgd; pmd_t *pmd; pte_t *pte; pgd = pgd_offset_k(pg); if (!pgd) - return i ? : -EFAULT; + return i ? i : -EFAULT; pmd = pmd_offset(pgd, pg); if (!pmd) - return i ? : -EFAULT; + return i ? i : -EFAULT; pte = pte_offset_kernel(pmd, pg); if (!pte || !pte_present(*pte) || !pte_user(*pte) || !(write ? pte_write(*pte) : pte_read(*pte))) - return i ? 
: -EFAULT; - if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + return i ? i : -EFAULT; + if (pfns) { + pfns[i] = pte_pfn(*pte); + get_page(pfn_to_page(pfns[i])); } if (vmas) vmas[i] = &fixmap_vma; i++; - start += PAGE_SIZE; + start += MMUPAGE_SIZE; len--; continue; } #endif - if (!vma || (pages && (vma->vm_flags & VM_IO)) + if (!vma || (pfns && (vma->vm_flags & VM_IO)) || !(flags & vma->vm_flags)) - return i ? : -EFAULT; + return i ? i : -EFAULT; +#if 0 if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &len, i); continue; } +#endif spin_lock(&mm->page_table_lock); do { - struct page *map; - while (!(map = follow_page(mm, start, write))) { + unsigned long map_pfn; + while (!(map_pfn = follow_page(mm, start, write))) { spin_unlock(&mm->page_table_lock); switch (handle_mm_fault(mm,vma,start,write)) { case VM_FAULT_MINOR: @@ -750,36 +799,50 @@ int get_user_pages(struct task_struct *t tsk->maj_flt++; break; case VM_FAULT_SIGBUS: + if (!i) + printk("get_user_pages(): VM_FAULT_SIGBUS\n"); return i ? i : -EFAULT; case VM_FAULT_OOM: + if (!i) + printk("get_user_pages(): VM_FAULT_OOM\n"); return i ? i : -ENOMEM; default: BUG(); } spin_lock(&mm->page_table_lock); } - if (pages) { - pages[i] = get_page_map(map); - if (!pages[i]) { + if (pfns) { + pfns[i] = get_pfn_map(map_pfn); + if (!pfns[i]) { spin_unlock(&mm->page_table_lock); - while (i--) - page_cache_release(pages[i]); + while (i--) { + struct page *map; + map = pfn_to_page(pfns[i]); + page_cache_release(map); + } i = -EFAULT; + printk("get_user_pages(): saw a zero pfn\n"); goto out; } - flush_dcache_page(pages[i]); - if (!PageReserved(pages[i])) - page_cache_get(pages[i]); + if (1) { + struct page *map; + map = pfn_to_page(pfns[i]); + flush_dcache_page(map); + if (!PageReserved(map)) + page_cache_get(map); + } } if (vmas) vmas[i] = vma; i++; - start += PAGE_SIZE; + start += MMUPAGE_SIZE; len--; } while(len && start < vma->vm_end); spin_unlock(&mm->page_table_lock); } while(len); out: + if (i < 0) + pr_debug("get_user_pages() returning an error\n"); return i; } @@ -796,7 +859,7 @@ static void zeromap_pte_range(pte_t * pt pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); set_pte(pte, zero_pte); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); } @@ -832,8 +895,7 @@ int zeromap_page_range(struct vm_area_st dir = pgd_offset(mm, address); flush_cache_range(vma, beg, end); - if (address >= end) - BUG(); + BUG_ON(address >= end); spin_lock(&mm->page_table_lock); do { @@ -867,12 +929,12 @@ static inline void remap_pte_range(pte_t end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; - pfn = phys_addr >> PAGE_SHIFT; + pfn = phys_addr >> MMUPAGE_SHIFT; do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) set_pte(pte, pfn_pte(pfn, prot)); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pfn++; pte++; } while (address && (address < end)); @@ -913,8 +975,7 @@ int remap_page_range(struct vm_area_stru phys_addr -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); - if (from >= end) - BUG(); + BUG_ON(from >= end); spin_lock(&mm->page_table_lock); do { @@ -951,12 +1012,12 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void break_cow(struct vm_area_struct * 
vma, struct page * new_page, unsigned long address, pte_t *page_table, unsigned long subpfn) { + pte_t pte = pfn_pte(page_to_pfn(new_page) + subpfn, vma->vm_page_prot); invalidate_vcache(address, vma->vm_mm, new_page); flush_cache_page(vma, address); - establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(pte))); } /* @@ -1004,6 +1065,10 @@ static int do_wp_page(struct mm_struct * int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { + /* + * XXX: this should sweep the pagetables to + * prefault all the pte's. This is free, take it. + */ flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); @@ -1016,6 +1081,8 @@ static int do_wp_page(struct mm_struct * /* * Ok, we need to copy. Oh, well.. + * XXX: This needs to sweep the pagetables in an analogous + * manner to do_anonymous_page(). */ page_cache_get(old_page); spin_unlock(&mm->page_table_lock); @@ -1034,10 +1101,11 @@ static int do_wp_page(struct mm_struct * spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { + unsigned long subpfn = pfn & (PAGE_MMUCOUNT-1); if (PageReserved(old_page)) ++mm->rss; page_remove_rmap(old_page, page_table); - break_cow(vma, new_page, address, page_table); + break_cow(vma, new_page, address, page_table, subpfn); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1079,14 +1147,14 @@ static void vmtruncate_list(struct list_ } /* mapping wholly unaffected? */ - len = len >> PAGE_SHIFT; + len = len >> MMUPAGE_SHIFT; diff = pgoff - vma->vm_pgoff; if (diff >= len) continue; /* Ok, partially affected.. */ - start += diff << PAGE_SHIFT; - len = (len - diff) << PAGE_SHIFT; + start += diff << MMUPAGE_SHIFT; + len = (len - diff) << MMUPAGE_SHIFT; zap_page_range(vma, start, len); } } @@ -1112,7 +1180,7 @@ int vmtruncate(struct inode * inode, lof if (list_empty(&mapping->i_mmap) && list_empty(&mapping->i_mmap_shared)) goto out_unlock; - pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pgoff = (offset + MMUPAGE_SIZE - 1) / MMUPAGE_SIZE; if (!list_empty(&mapping->i_mmap)) vmtruncate_list(&mapping->i_mmap, pgoff); if (!list_empty(&mapping->i_mmap_shared)) @@ -1179,8 +1247,13 @@ static int do_swap_page(struct mm_struct struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; - int ret = VM_FAULT_MINOR; + int rss, ret = VM_FAULT_MINOR; struct pte_chain *pte_chain = NULL; + unsigned long subpfn, flt_subpfn = swp_offset(entry) % PAGE_MMUCOUNT; + unsigned long pfn, lo_vaddr, hi_vaddr, vaddr; + + lo_vaddr = max(address & PAGE_MASK, vma->vm_start); + hi_vaddr = min(PAGE_ALIGN(address), vma->vm_end); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1212,7 +1285,7 @@ static int do_swap_page(struct mm_struct mark_page_accessed(page); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) { - ret = -ENOMEM; + ret = VM_FAULT_OOM; goto out; } lock_page(page); @@ -1234,24 +1307,90 @@ static int do_swap_page(struct mm_struct /* The page isn't present yet, go ahead with the fault. */ + /* + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + */ + + rss = 0; + vaddr = lo_vaddr; + page_table -= (address - lo_vaddr)/MMUPAGE_SIZE; + + flush_icache_page(vma, page); + + pfn = page_to_pfn(page); + + do { + /* already faulted in? 
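+ * the fault sweeps every MMUPAGE_SIZE slot covered by this PAGE_SIZE + * page, so an earlier fault may have instantiated some ptes already;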
less work for me */ + if (pte_present(*page_table)) + goto next; + + entry = pte_to_swp_entry(*page_table); + + if (!pte_none(*page_table) && + swp_offset(entry)/PAGE_MMUCOUNT == page->index) { + swap_free(entry); if (vm_swap_full()) remove_exclusive_swap_page(page); + subpfn = swp_offset(entry) % PAGE_MMUCOUNT; + pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); + + } else if (pte_none(*page_table)) { + + subpfn = flt_subpfn + (vaddr - address)/MMUPAGE_SHIFT; + + /* it'd fall outside the page */ + if (subpfn >= PAGE_MMUCOUNT) + goto next; + + pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); + + /* !pte_none() && swp_offset()/PAGE_MMUCOUNT != page->index */ + } else + goto next; - mm->rss++; - pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); - flush_icache_page(vma, page); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + ret = VM_FAULT_OOM; + spin_lock(&mm->page_table_lock); + mm->rss += rss; + spin_unlock(&mm->page_table_lock); + goto no_mem; + } + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, vaddr); + } + set_pte(page_table, pte); + ++rss; pte_chain = page_add_rmap(page, page_table, pte_chain); +next: + vaddr += MMUPAGE_SIZE; + page_table++; + } while (vaddr < hi_vaddr); - /* No need to invalidate - it was non-present before */ + unlock_page(page); update_mmu_cache(vma, address, pte); - pte_unmap(page_table); + mm->rss += rss; + pte_unmap(page_table-1); spin_unlock(&mm->page_table_lock); +no_mem: + if (!page) + goto out; + if (!rss) + page_cache_release(page); + else if (rss > 1) + atomic_add(rss - 1, &page->count); out: pte_chain_free(pte_chain); return ret; @@ -1267,66 +1406,207 @@ do_anonymous_page(struct mm_struct *mm, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr) { - pte_t entry; - struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; - int ret; + struct page *page = NULL; + struct pte_chain *pte_chain = NULL; + unsigned long up_vaddr, dn_vaddr, lo_vaddr, hi_vaddr; + unsigned long pfn, subpfn, dn_subpfn, up_subpfn; + pte_t *ptes[PAGE_MMUCOUNT] = { [0 ... PAGE_MMUCOUNT-1] = NULL }; + pte_t *up_pte, *dn_pte; + int rss, ret = VM_FAULT_MINOR; + if (write_access) + pr_debug("write fault on 0x%lx\n", addr); + else + pr_debug("read fault on 0x%lx\n", addr); + pr_debug("page_table = 0x%p\n", page_table); + + if (!write_access) + page = ZERO_PAGE(addr); + else { + if (!pte_chain) pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + if (!pte_chain) { pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); + return VM_FAULT_OOM; + } + page = alloc_page(GFP_HIGHUSER); + if (!page) { + pte_chain_free(pte_chain); + return VM_FAULT_OOM; + } + clear_user_highpage(page, addr); } - /* Read-only mapping of ZERO_PAGE. 
*/ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + lo_vaddr = max(addr & ~(PAGE_MMUCOUNT*PMD_SIZE - 1), vma->vm_start); + hi_vaddr = min(vma->vm_end, (addr + PAGE_MMUCOUNT*PMD_SIZE - 1) + & ~(PAGE_MMUCOUNT*PMD_SIZE - 1)); + dn_subpfn = 0; + up_subpfn = PAGE_MMUCOUNT - 1; + dn_vaddr = addr & MMUPAGE_MASK; + up_vaddr = MMUPAGE_ALIGN(addr + 1); + + pr_debug("vma->vm_start = 0x%lx, vma->vm_end = 0x%lx\n", vma->vm_start, vma->vm_end); + pr_debug("lo_vaddr = 0x%lx, hi_vaddr = 0x%lx\n", lo_vaddr, hi_vaddr); + pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", dn_vaddr, up_vaddr); - /* ..except if it's a write access */ if (write_access) { - /* Allocate our own private page. */ - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pr_debug("about to take mm->page_table_lock\n"); + if (spin_is_locked(&mm->page_table_lock)) + printk("hmm, I see a deadlock coming\n"); + spin_lock(&mm->page_table_lock); + } - page = alloc_page(GFP_HIGHUSER); - if (!page) - goto no_mem; - clear_user_highpage(page, addr); + pr_debug("starting PTE search loop\n"); + if (write_access) + page_table = dn_pte = pte_offset_map(pmd, dn_vaddr); + else + dn_pte = page_table; + up_pte = dn_pte + 1; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); + do { + if (up_vaddr < hi_vaddr && up_subpfn > dn_subpfn) { + if (pte_none(*up_pte)) { + ptes[up_subpfn] = up_pte; + up_subpfn--; + } + up_vaddr += MMUPAGE_SIZE; + up_pte++; + } - if (!pte_none(*page_table)) { - pte_unmap(page_table); - page_cache_release(page); - spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; + if (dn_vaddr >= lo_vaddr && dn_subpfn < up_subpfn) { + if (pte_none(*dn_pte)) { + ptes[dn_subpfn] = dn_pte; + dn_subpfn++; } - mm->rss++; - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - lru_cache_add_active(page); - mark_page_accessed(page); + dn_vaddr -= MMUPAGE_SIZE; + dn_pte--; } + pr_debug("dn_vaddr = 0x%lx, up_vaddr = 0x%lx\n", dn_vaddr, up_vaddr); + pr_debug("dn_subpfn = 0x%lx, up_subpfn = 0x%lx\n", dn_subpfn, up_subpfn); + } while ((up_vaddr < hi_vaddr || dn_vaddr >= lo_vaddr) && up_subpfn > dn_subpfn); - set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); - pte_unmap(page_table); + pr_debug("finishing PTE search loop\n"); + pr_debug("starting PTE instantiation loop\n"); - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, entry); - spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; + pfn = page_to_pfn(page); + rss = 0; + for (subpfn = 0; subpfn < PAGE_MMUCOUNT; ++subpfn) { + pte_t pte; -no_mem: + pr_debug("subpfn = 0x%lx, ptep = 0x%p\n", subpfn, ptes[subpfn]); + + if (!ptes[subpfn]) { + pr_debug("pte empty\n"); + continue; + } else if (!pte_none(*ptes[subpfn])) { + pr_debug("pte non-none\n"); + continue; + } + + pte = pfn_pte(pfn + subpfn, vma->vm_page_prot); + if (!write_access) { + pr_debug("setting pte to zero page\n"); + set_pte(ptes[subpfn], pte_wrprotect(pte)); + } else { + pr_debug("setting pte to newly zeroed anonymous page\n"); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + unsigned long vaddr, offset; + int k; + + pr_debug("doing sleeping alloc of non-anonymous page\n"); + + /* fugly. wtf else can I do? 
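+ * recover the user virtual address from the kmapped pte pointer so + * pte_offset_map() can be redone after dropping the lock for the + * GFP_KERNEL allocation;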
*/ + vaddr = (unsigned long)ptes[subpfn]; + + pr_debug("pte vaddr = 0x%lx\n", vaddr); + + /* + * this computes the offset from the + * PAGE_SIZE-aligned kmap_atomic() aperture + * the PAGE_SIZE-sized pte pages end up + * mapping PAGE_MMUCOUNT*PMD_SIZE; hence... + */ + vaddr &= PAGE_MASK; + + pr_debug("vaddr offset = 0x%lx\n", vaddr); + + vaddr /= sizeof(pte_t); + + pr_debug("vaddr offset in ptes = 0x%lx\n", vaddr); + + vaddr = (lo_vaddr & ~(PAGE_MMUCOUNT*PMD_SIZE-1)) + + vaddr * MMUPAGE_SIZE; + + pr_debug("vaddr = 0x%lx\n", vaddr); + + pte_unmap(ptes[subpfn]); + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + pr_debug("going to out_oom\n"); ret = VM_FAULT_OOM; -out: + goto out_oom; + } + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, vaddr); + + /* is this safe from gcc? NFI */ + if (page_table != ptes[subpfn]) { + pr_debug("(page_table) 0x%p != 0x%p (ptes[subpfn])\n", page_table, ptes[subpfn]); + offset = (unsigned long) + (page_table - ptes[subpfn]); + pr_debug("adjusting all ptes by offset 0x%lx\n", + offset); + for (k = subpfn; k < PAGE_MMUCOUNT; ++k) { + pr_debug("pte before 0x%p\n", ptes[k]); + ptes[k] += offset; + pr_debug("pte after 0x%p\n", ptes[k]); + } + } + + /* check for races */ + if (!pte_none(*ptes[subpfn])) { + pr_debug("raced, skipping PTE\n"); + continue; + } + } + pr_debug("setting pte for anonymous zeroed page thing\n"); + pr_debug("ptep = 0x%p, pte = 0x%Lx\n", + ptes[subpfn], (u64)pte_val(pte)); + set_pte(ptes[subpfn], pte_mkwrite(pte_mkdirty(pte))); + pr_debug("about to page_add_rmap()\n"); + pte_chain = page_add_rmap(page, ptes[subpfn], pte_chain); + pr_debug("about to update_mmu_cache()\n"); + update_mmu_cache(vma, addr, pte); + rss++; + pr_debug("about to page_cache_get()\n"); + page_cache_get(page); + } + pr_debug("falling through to next subpfn\n"); + } + pr_debug("doing pte_unmap(0x%p)\n", page_table); + pte_unmap(page_table); + pr_debug("adding %d to rss\n", rss); + mm->rss += rss; + spin_unlock(&mm->page_table_lock); + pr_debug("broke out of PTE instantiation loop\n"); +out_oom: + pr_debug("at out_oom\n"); + if (write_access) { + if (rss) { + pr_debug("adding page to LRU\n"); + lru_cache_add_active(page); + mark_page_accessed(page); + } + pr_debug("releasing page\n"); + page_cache_release(page); + } + pr_debug("doing pte_chain_free()\n"); pte_chain_free(pte_chain); return ret; } @@ -1358,12 +1638,12 @@ do_no_page(struct mm_struct *mm, struct pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + new_page = vma->vm_ops->nopage(vma, address & MMUPAGE_MASK, 0); /* no page was available -- either SIGBUS or OOM */ if (new_page == NOPAGE_SIGBUS) return VM_FAULT_SIGBUS; - if (new_page == NOPAGE_OOM) + else if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; pte_chain = pte_chain_alloc(GFP_KERNEL); @@ -1397,12 +1677,17 @@ do_no_page(struct mm_struct *mm, struct * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. + * + * XXX: this should sweep pagetables and prefault */ /* Only go through if we didn't race with anybody else... 
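+ * the pfn installed below is offset by vma_suboffset()/MMUPAGE_SIZE + * so the pte maps the right mmupage within the new PAGE_SIZE page.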
*/ if (pte_none(*page_table)) { + unsigned long pfn; ++mm->rss; flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); + pfn = page_to_pfn(new_page) + + vma_suboffset(vma, address)/MMUPAGE_SIZE; + entry = pfn_pte(pfn, vma->vm_page_prot); if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); @@ -1456,7 +1741,7 @@ static int do_file_page(struct mm_struct pte_unmap(pte); spin_unlock(&mm->page_table_lock); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + err = vma->vm_ops->populate(vma, address & MMUPAGE_MASK, MMUPAGE_SIZE, vma->vm_page_prot, pgoff, 0); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -1590,11 +1875,9 @@ int make_pages_present(unsigned long add vma = find_vma(current->mm, addr); write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); + len = (end+MMUPAGE_SIZE-1)/MMUPAGE_SIZE-addr/MMUPAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); return ret == len ? 0 : -1; diff -prauwN linux-2.5.69/mm/mincore.c pgcl-2.5.69-3/mm/mincore.c --- linux-2.5.69/mm/mincore.c 2003-05-04 16:53:56.000000000 -0700 +++ pgcl-2.5.69-3/mm/mincore.c 2003-05-26 07:14:20.000000000 -0700 @@ -29,7 +29,7 @@ static unsigned char mincore_page(struct struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; struct page * page; - page = find_get_page(as, pgoff); + page = find_get_page(as, pgoff/PAGE_CACHE_MMUCOUNT); if (page) { present = PageUptodate(page); page_cache_release(page); @@ -42,41 +42,43 @@ static long mincore_vma(struct vm_area_s unsigned long start, unsigned long end, unsigned char * vec) { long error, i, remaining; - unsigned char * tmp; + unsigned char *kaddr; + struct page *page; error = -ENOMEM; if (!vma->vm_file) return error; - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + start = ((start - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + end = ((end - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; error = -EAGAIN; - tmp = (unsigned char *) __get_free_page(GFP_KERNEL); - if (!tmp) + page = alloc_page(GFP_HIGHUSER); + if (!page) return error; /* (end - start) is # of pages, and also # of bytes in "vec */ - remaining = (end - start), + remaining = end - start; error = 0; + kaddr = kmap_atomic(page, KM_USER0); for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { int j = 0; long thispiece = (remaining < PAGE_SIZE) ? 
remaining : PAGE_SIZE; while (j < thispiece) - tmp[j++] = mincore_page(vma, start++); + kaddr[j++] = mincore_page(vma, start++); - if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { + if (copy_to_user(vec + PAGE_SIZE * i, kaddr, thispiece)) { error = -EFAULT; break; } } - - free_page((unsigned long) tmp); + kunmap_atomic(kaddr, KM_USER0); + __free_page(page); return error; } @@ -116,15 +118,15 @@ asmlinkage long sys_mincore(unsigned lon down_read(¤t->mm->mmap_sem); - if (start & ~PAGE_CACHE_MASK) + if (start & ~MMUPAGE_MASK) goto out; - len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; error = -EFAULT; - if (!access_ok(VERIFY_WRITE, (unsigned long) vec, len >> PAGE_SHIFT)) + if (!access_ok(VERIFY_WRITE, (unsigned long) vec, len >> MMUPAGE_SHIFT)) goto out; error = 0; @@ -164,7 +166,7 @@ asmlinkage long sys_mincore(unsigned lon error = mincore_vma(vma, start, vma->vm_end, &vec[index]); if (error) goto out; - index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; + index += (vma->vm_end - start)/MMUPAGE_SIZE; start = vma->vm_end; vma = vma->vm_next; } diff -prauwN linux-2.5.69/mm/mlock.c pgcl-2.5.69-3/mm/mlock.c --- linux-2.5.69/mm/mlock.c 2003-05-04 16:53:01.000000000 -0700 +++ pgcl-2.5.69-3/mm/mlock.c 2003-05-26 07:14:20.000000000 -0700 @@ -37,7 +37,7 @@ static int mlock_fixup(struct vm_area_st /* * Keep track of amount of locked VM. */ - pages = (end - start) >> PAGE_SHIFT; + pages = (end - start) >> MMUPAGE_SHIFT; if (newflags & VM_LOCKED) { pages = -pages; make_pages_present(start, end); @@ -55,7 +55,7 @@ static int do_mlock(unsigned long start, if (on && !capable(CAP_IPC_LOCK)) return -EPERM; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; @@ -101,14 +101,14 @@ asmlinkage long sys_mlock(unsigned long int error = -ENOMEM; down_write(¤t->mm->mmap_sem); - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); - start &= PAGE_MASK; + len = MMUPAGE_ALIGN(len + (start & ~MMUPAGE_MASK)); + start &= MMUPAGE_MASK; - locked = len >> PAGE_SHIFT; + locked = len >> MMUPAGE_SHIFT; locked += current->mm->locked_vm; lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; + lock_limit >>= MMUPAGE_SHIFT; /* check against resource limits */ if (locked <= lock_limit) @@ -122,8 +122,8 @@ asmlinkage long sys_munlock(unsigned lon int ret; down_write(¤t->mm->mmap_sem); - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); - start &= PAGE_MASK; + len = MMUPAGE_ALIGN(len + (start & ~MMUPAGE_MASK)); + start &= MMUPAGE_MASK; ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); return ret; @@ -167,7 +167,7 @@ asmlinkage long sys_mlockall(int flags) goto out; lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; + lock_limit >>= MMUPAGE_SHIFT; ret = -ENOMEM; if (current->mm->total_vm <= lock_limit) diff -prauwN linux-2.5.69/mm/mmap.c pgcl-2.5.69-3/mm/mmap.c --- linux-2.5.69/mm/mmap.c 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/mm/mmap.c 2003-05-26 07:14:20.000000000 -0700 @@ -56,7 +56,8 @@ atomic_t vm_committed_space = ATOMIC_INI /* * Check that a process has enough memory to allocate a new virtual * mapping. 1 means there is enough memory for the allocation to - * succeed and 0 implies there is not. + * succeed and 0 implies there is not. the "pages" argument is in + * mmupages. * * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. 
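+ * the free page, swap and cache counts used below are kept in + * PAGE_SIZE units and are scaled by PAGE_MMUCOUNT before being + * compared against the mmupage count requested.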
See Documentation/vm/overcommit-acounting @@ -77,9 +78,9 @@ int vm_enough_memory(long pages) return 1; if (sysctl_overcommit_memory == 0) { - free = get_page_cache_size(); - free += nr_free_pages(); - free += nr_swap_pages; + free = get_page_cache_size()*PAGE_MMUCOUNT; + free += nr_free_pages()*PAGE_MMUCOUNT; + free += nr_swap_pages*PAGE_MMUCOUNT; /* * The code below doesn't account for free space in the @@ -89,9 +90,9 @@ int vm_enough_memory(long pages) * factors balance out... */ free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> - PAGE_SHIFT; + MMUPAGE_SHIFT; free += (inodes_stat.nr_unused * sizeof(struct inode)) >> - PAGE_SHIFT; + MMUPAGE_SHIFT; if (free > pages) return 1; @@ -99,14 +100,13 @@ int vm_enough_memory(long pages) return 0; } - allowed = totalram_pages * sysctl_overcommit_ratio / 100; - allowed += total_swap_pages; + allowed = totalram_pages*(sysctl_overcommit_ratio/100)*PAGE_MMUCOUNT; + allowed += total_swap_pages*PAGE_MMUCOUNT; if (atomic_read(&vm_committed_space) < allowed) return 1; vm_unacct_memory(pages); - return 0; } @@ -156,8 +156,8 @@ asmlinkage unsigned long sys_brk(unsigne if (brk < mm->end_code) goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); + newbrk = MMUPAGE_ALIGN(brk); + oldbrk = MMUPAGE_ALIGN(mm->brk); if (oldbrk == newbrk) goto set_brk; @@ -174,7 +174,7 @@ asmlinkage unsigned long sys_brk(unsigne goto out; /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + if (find_vma_intersection(mm, oldbrk, newbrk+MMUPAGE_SIZE)) goto out; /* Ok, looks good - let it rip. */ @@ -529,10 +529,10 @@ unsigned long do_mmap_pgoff(struct file if (len > TASK_SIZE) return -EINVAL; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (len >> MMUPAGE_SHIFT)) < pgoff) return -EINVAL; /* Too many mappings? */ @@ -543,7 +543,7 @@ unsigned long do_mmap_pgoff(struct file * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) return addr; /* Do simple checking here so the lower-level routines won't have @@ -560,7 +560,7 @@ unsigned long do_mmap_pgoff(struct file } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked = mm->locked_vm << MMUPAGE_SHIFT; locked += len; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; @@ -628,7 +628,7 @@ munmap_back: } /* Check against address space limit. 
*/ - if ((mm->total_vm << PAGE_SHIFT) + len + if ((mm->total_vm << MMUPAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; @@ -640,7 +640,7 @@ munmap_back: /* * Private writable mapping: check memory availability */ - charged = len >> PAGE_SHIFT; + charged = len >> MMUPAGE_SHIFT; if (!vm_enough_memory(charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; @@ -724,9 +724,9 @@ munmap_back: kmem_cache_free(vm_area_cachep, vma); } out: - mm->total_vm += len >> PAGE_SHIFT; + mm->total_vm += len >> MMUPAGE_SHIFT; if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; + mm->locked_vm += len >> MMUPAGE_SHIFT; make_pages_present(addr, addr + len); } if (flags & MAP_POPULATE) { @@ -759,7 +759,7 @@ unacct_error: * Ugly calling convention alert: * Return value with the low bits set means error value, * ie - * if (ret & ~PAGE_MASK) + * if (ret & ~MMUPAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. @@ -777,7 +777,7 @@ arch_get_unmapped_area(struct file *filp return -ENOMEM; if (addr) { - addr = PAGE_ALIGN(addr); + addr = MMUPAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -816,7 +816,7 @@ get_unmapped_area(struct file *file, uns if (addr > TASK_SIZE - len) return -ENOMEM; - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) return -EINVAL; if (file && is_file_hugepages(file)) { /* @@ -922,18 +922,20 @@ int expand_stack(struct vm_area_struct * { unsigned long grow; - if (!(vma->vm_flags & VM_GROWSUP)) + if (!(vma->vm_flags & VM_GROWSUP)) { + printk("bad vma flags in expand_stack()\n"); return -EFAULT; + } /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need to get * the spinlock only before relocating the vma range ourself. */ - address += 4 + PAGE_SIZE - 1; - address &= PAGE_MASK; + address += 4 + MMUPAGE_SIZE - 1; + address &= MMUPAGE_MASK; spin_lock(&vma->vm_mm->page_table_lock); - grow = (address - vma->vm_end) >> PAGE_SHIFT; + grow = (address - vma->vm_end) >> MMUPAGE_SHIFT; /* Overcommit.. */ if (!vm_enough_memory(grow)) { @@ -942,7 +944,7 @@ int expand_stack(struct vm_area_struct * } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + ((vma->vm_mm->total_vm + grow) << MMUPAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); @@ -961,7 +963,7 @@ find_extend_vma(struct mm_struct *mm, un { struct vm_area_struct *vma, *prev; - addr &= PAGE_MASK; + addr &= MMUPAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; @@ -985,9 +987,9 @@ int expand_stack(struct vm_area_struct * * is required to hold the mmap_sem in read mode. We need to get * the spinlock only before relocating the vma range ourself. */ - address &= PAGE_MASK; + address &= MMUPAGE_MASK; spin_lock(&vma->vm_mm->page_table_lock); - grow = (vma->vm_start - address) >> PAGE_SHIFT; + grow = (vma->vm_start - address) >> MMUPAGE_SHIFT; /* Overcommit.. 
*/ if (!vm_enough_memory(grow)) { @@ -996,7 +998,7 @@ int expand_stack(struct vm_area_struct * } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + ((vma->vm_mm->total_vm + grow) << MMUPAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); @@ -1017,7 +1019,7 @@ find_extend_vma(struct mm_struct * mm, u struct vm_area_struct * vma; unsigned long start; - addr &= PAGE_MASK; + addr &= MMUPAGE_MASK; vma = find_vma(mm,addr); if (!vma) return NULL; @@ -1109,9 +1111,9 @@ static void unmap_vma(struct mm_struct * { size_t len = area->vm_end - area->vm_start; - area->vm_mm->total_vm -= len >> PAGE_SHIFT; + area->vm_mm->total_vm -= len >> MMUPAGE_SHIFT; if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + area->vm_mm->locked_vm -= len >> MMUPAGE_SHIFT; /* * Is this a new hole at the lowest possible address? */ @@ -1216,11 +1218,11 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + vma->vm_pgoff += ((addr - new->vm_start) >> MMUPAGE_SHIFT); } else { vma->vm_end = addr; new->vm_start = addr; - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + new->vm_pgoff += ((addr - vma->vm_start) >> MMUPAGE_SHIFT); } if (new->vm_file) @@ -1243,10 +1245,10 @@ int do_munmap(struct mm_struct *mm, unsi unsigned long end; struct vm_area_struct *mpnt, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + if ((start & ~MMUPAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; - if ((len = PAGE_ALIGN(len)) == 0) + if ((len = MMUPAGE_ALIGN(len)) == 0) return -EINVAL; /* Find the first overlapping VMA */ @@ -1329,7 +1331,7 @@ unsigned long do_brk(unsigned long addr, unsigned long flags; struct rb_node ** rb_link, * rb_parent; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); if (!len) return addr; @@ -1337,7 +1339,7 @@ unsigned long do_brk(unsigned long addr, * mlock MCL_FUTURE? */ if (mm->def_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked = mm->locked_vm << MMUPAGE_SHIFT; locked += len; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; @@ -1355,14 +1357,14 @@ unsigned long do_brk(unsigned long addr, } /* Check against address space limits *after* clearing old maps... 
*/ - if ((mm->total_vm << PAGE_SHIFT) + len + if ((mm->total_vm << MMUPAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; if (mm->map_count > MAX_MAP_COUNT) return -ENOMEM; - if (!vm_enough_memory(len >> PAGE_SHIFT)) + if (!vm_enough_memory(len >> MMUPAGE_SHIFT)) return -ENOMEM; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; @@ -1377,7 +1379,7 @@ unsigned long do_brk(unsigned long addr, */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); + vm_unacct_memory(len >> MMUPAGE_SHIFT); return -ENOMEM; } @@ -1395,9 +1397,9 @@ unsigned long do_brk(unsigned long addr, vma_link(mm, vma, prev, rb_link, rb_parent); out: - mm->total_vm += len >> PAGE_SHIFT; + mm->total_vm += len >> MMUPAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; + mm->locked_vm += len >> MMUPAGE_SHIFT; make_pages_present(addr, addr + len); } return addr; diff -prauwN linux-2.5.69/mm/mprotect.c pgcl-2.5.69-3/mm/mprotect.c --- linux-2.5.69/mm/mprotect.c 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/mm/mprotect.c 2003-05-26 07:14:20.000000000 -0700 @@ -53,7 +53,7 @@ change_pte_range(pmd_t *pmd, unsigned lo entry = ptep_get_and_clear(pte); set_pte(pte, pte_modify(entry, newprot)); } - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); pte_unmap(pte - 1); @@ -174,9 +174,11 @@ mprotect_fixup(struct vm_area_struct *vm */ if (newflags & VM_WRITE) { if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { - charged = (end - start) >> PAGE_SHIFT; - if (!vm_enough_memory(charged)) + charged = (end - start) >> MMUPAGE_SHIFT; + if (!vm_enough_memory(charged)) { + printk("mprotect_fixup(): OOM\n"); return -ENOMEM; + } newflags |= VM_ACCOUNT; } } @@ -228,9 +230,9 @@ sys_mprotect(unsigned long start, size_t struct vm_area_struct * vma, * next, * prev; int error = -EINVAL; - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) return -EINVAL; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; diff -prauwN linux-2.5.69/mm/mremap.c pgcl-2.5.69-3/mm/mremap.c --- linux-2.5.69/mm/mremap.c 2003-05-04 16:53:31.000000000 -0700 +++ pgcl-2.5.69-3/mm/mremap.c 2003-05-26 07:14:20.000000000 -0700 @@ -162,7 +162,7 @@ static int move_page_tables(struct vm_ar * only a few pages.. This also makes error recovery easier. 
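+ * ptes now map MMUPAGE_SIZE units, so the copy loop and the error + * recovery path below both step by MMUPAGE_SIZE.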
*/ while (offset) { - offset -= PAGE_SIZE; + offset -= MMUPAGE_SIZE; if (move_one_page(vma, old_addr + offset, new_addr + offset)) goto oops_we_failed; } @@ -177,7 +177,7 @@ static int move_page_tables(struct vm_ar */ oops_we_failed: flush_cache_range(vma, new_addr, new_addr + len); - while ((offset += PAGE_SIZE) < len) + while ((offset += MMUPAGE_SIZE) < len) move_one_page(vma, new_addr + offset, old_addr + offset); zap_page_range(vma, new_addr, len); return -1; @@ -253,7 +253,7 @@ static unsigned long move_vma(struct vm_ INIT_LIST_HEAD(&new_vma->shared); new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; - new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_pgoff += (addr - vma->vm_start) >> MMUPAGE_SHIFT; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) @@ -287,9 +287,9 @@ static unsigned long move_vma(struct vm_ vma->vm_next->vm_flags |= VM_ACCOUNT; } - current->mm->total_vm += new_len >> PAGE_SHIFT; + current->mm->total_vm += new_len >> MMUPAGE_SHIFT; if (must_fault_in) { - current->mm->locked_vm += new_len >> PAGE_SHIFT; + current->mm->locked_vm += new_len >> MMUPAGE_SHIFT; make_pages_present(fault_in_start, fault_in_end); } return new_addr; @@ -318,15 +318,15 @@ unsigned long do_mremap(unsigned long ad if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) goto out; - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) goto out; - old_len = PAGE_ALIGN(old_len); - new_len = PAGE_ALIGN(new_len); + old_len = MMUPAGE_ALIGN(old_len); + new_len = MMUPAGE_ALIGN(new_len); /* new_addr is only valid if MREMAP_FIXED is specified */ if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) + if (new_addr & ~MMUPAGE_MASK) goto out; if (!(flags & MREMAP_MAYMOVE)) goto out; @@ -378,19 +378,19 @@ unsigned long do_mremap(unsigned long ad goto out; } if (vma->vm_flags & VM_LOCKED) { - unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + unsigned long locked = current->mm->locked_vm << MMUPAGE_SHIFT; locked += new_len - old_len; ret = -EAGAIN; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) goto out; } ret = -ENOMEM; - if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + if ((current->mm->total_vm << MMUPAGE_SHIFT) + (new_len - old_len) > current->rlim[RLIMIT_AS].rlim_cur) goto out; if (vma->vm_flags & VM_ACCOUNT) { - charged = (new_len - old_len) >> PAGE_SHIFT; + charged = (new_len - old_len) >> MMUPAGE_SHIFT; if (!vm_enough_memory(charged)) goto out_nc; } @@ -406,7 +406,7 @@ unsigned long do_mremap(unsigned long ad max_addr = vma->vm_next->vm_start; /* can we just expand the current mapping? 
*/ if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + int pages = (new_len - old_len) >> MMUPAGE_SHIFT; spin_lock(&vma->vm_mm->page_table_lock); vma->vm_end = addr + new_len; spin_unlock(&vma->vm_mm->page_table_lock); @@ -435,13 +435,13 @@ unsigned long do_mremap(unsigned long ad new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); ret = new_addr; - if (new_addr & ~PAGE_MASK) + if (new_addr & ~MMUPAGE_MASK) goto out; } ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: - if (ret & ~PAGE_MASK) + if (ret & ~MMUPAGE_MASK) vm_unacct_memory(charged); out_nc: return ret; diff -prauwN linux-2.5.69/mm/msync.c pgcl-2.5.69-3/mm/msync.c --- linux-2.5.69/mm/msync.c 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/mm/msync.c 2003-05-26 07:14:20.000000000 -0700 @@ -59,7 +59,7 @@ static int filemap_sync_pte_range(pmd_t error = 0; do { error |= filemap_sync_pte(pte, vma, address, flags); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); @@ -174,12 +174,12 @@ asmlinkage long sys_msync(unsigned long down_read(¤t->mm->mmap_sem); if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) goto out; if ((flags & MS_ASYNC) && (flags & MS_SYNC)) goto out; error = -ENOMEM; - len = (len + ~PAGE_MASK) & PAGE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; diff -prauwN linux-2.5.69/mm/page-writeback.c pgcl-2.5.69-3/mm/page-writeback.c --- linux-2.5.69/mm/page-writeback.c 2003-05-04 16:53:35.000000000 -0700 +++ pgcl-2.5.69-3/mm/page-writeback.c 2003-05-26 07:14:20.000000000 -0700 @@ -363,8 +363,8 @@ static void set_ratelimit(void) ratelimit_pages = total_pages / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; + if (ratelimit_pages * PAGE_CACHE_SIZE > PAGE_SIZE * 1024) + ratelimit_pages = (PAGE_SIZE * 1024) / PAGE_CACHE_SIZE; } static int diff -prauwN linux-2.5.69/mm/page_alloc.c pgcl-2.5.69-3/mm/page_alloc.c --- linux-2.5.69/mm/page_alloc.c 2003-05-04 16:53:01.000000000 -0700 +++ pgcl-2.5.69-3/mm/page_alloc.c 2003-05-26 07:39:37.000000000 -0700 @@ -57,7 +57,7 @@ static int zone_balance_max[MAX_NR_ZONES */ static int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages*PAGE_MMUCOUNT) return 1; if (page_to_pfn(page) < zone->zone_start_pfn) return 1; @@ -1173,9 +1173,9 @@ void __init memmap_init_zone(struct page #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. 
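+ * start_pfn is an mmupage frame number and each struct page spans + * PAGE_MMUCOUNT mmupages, so it advances by PAGE_MMUCOUNT per page.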
*/ if (zone != ZONE_HIGHMEM) - set_page_address(page, __va(start_pfn << PAGE_SHIFT)); + set_page_address(page, __va(start_pfn << MMUPAGE_SHIFT)); #endif - start_pfn++; + start_pfn += PAGE_MMUCOUNT; } } @@ -1194,7 +1194,7 @@ static void __init free_area_init_core(s unsigned long *zones_size, unsigned long *zholes_size) { unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-PAGE_MMUSHIFT-1); int cpu, nid = pgdat->node_id; struct page *lmem_map = pgdat->node_mem_map; unsigned long zone_start_pfn = pgdat->node_start_pfn; @@ -1253,7 +1253,7 @@ static void __init free_area_init_core(s INIT_LIST_HEAD(&pcp->list); } printk(" %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_names[j], realsize*PAGE_MMUCOUNT, batch); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); atomic_set(&zone->refill_counter, 0); @@ -1295,7 +1295,7 @@ static void __init free_area_init_core(s memmap_init(lmem_map, size, nid, j, zone_start_pfn); - zone_start_pfn += size; + zone_start_pfn += PAGE_MMUCOUNT*size; lmem_map += size; for (i = 0; ; i++) { @@ -1366,7 +1366,7 @@ struct pglist_data contig_page_data = { void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, &contig_page_data, NULL, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + __pa(PAGE_OFFSET) >> MMUPAGE_SHIFT, NULL); mem_map = contig_page_data.node_mem_map; } #endif diff -prauwN linux-2.5.69/mm/page_io.c pgcl-2.5.69-3/mm/page_io.c --- linux-2.5.69/mm/page_io.c 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/mm/page_io.c 2003-05-26 07:14:20.000000000 -0700 @@ -32,7 +32,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -152,7 +152,7 @@ int rw_swap_page_sync(int rw, swp_entry_ BUG_ON(page->mapping); page->mapping = &swapper_space; - page->index = entry.val; + page->index = entry.val/PAGE_MMUCOUNT; if (rw == READ) { ret = swap_readpage(NULL, page); diff -prauwN linux-2.5.69/mm/rmap.c pgcl-2.5.69-3/mm/rmap.c --- linux-2.5.69/mm/rmap.c 2003-05-04 16:53:41.000000000 -0700 +++ pgcl-2.5.69-3/mm/rmap.c 2003-05-26 07:14:20.000000000 -0700 @@ -337,7 +337,8 @@ static int try_to_unmap_one(struct page * Store the swap location in the pte. * See handle_pte_fault() ... */ - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->index*PAGE_MMUCOUNT + + (pte_pfn(pte) % PAGE_MMUCOUNT) }; swap_duplicate(entry); set_pte(ptep, swp_entry_to_pte(entry)); BUG_ON(pte_file(*ptep)); @@ -347,11 +348,11 @@ static int try_to_unmap_one(struct page * If a nonlinear mapping then store the file page offset * in the pte. 
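+ * the offset stored in a file pte is in mmupage units, so page->index + * (a PAGE_CACHE_SIZE-based index) is scaled by PAGE_MMUCOUNT first.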
*/ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; + pgidx = (address - vma->vm_start) >> MMUPAGE_SHIFT; pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + pgidx >>= PAGE_CACHE_SHIFT - MMUPAGE_SHIFT; if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); + set_pte(ptep, pgoff_to_pte(page->index*PAGE_MMUCOUNT)); BUG_ON(!pte_file(*ptep)); } } diff -prauwN linux-2.5.69/mm/shmem.c pgcl-2.5.69-3/mm/shmem.c --- linux-2.5.69/mm/shmem.c 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/mm/shmem.c 2003-05-26 07:14:20.000000000 -0700 @@ -47,7 +47,7 @@ #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) -#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) +#define VM_ACCT(size) (MMUPAGE_ALIGN(size)/MMUPAGE_SIZE) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 @@ -71,14 +71,14 @@ static inline struct page *shmem_dir_all /* * The above definition of ENTRIES_PER_PAGE, and the use of * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: - * might be reconsidered if it ever diverges from PAGE_SIZE. + * might be reconsidered if it ever diverges from MMUPAGE_SIZE. */ - return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); + return alloc_pages(gfp_mask, PAGE_CACHE_MMUSHIFT); } static inline void shmem_dir_free(struct page *page) { - __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); + __free_pages(page, PAGE_CACHE_MMUSHIFT); } static struct page **shmem_dir_map(struct page *page) @@ -297,7 +297,7 @@ static swp_entry_t *shmem_swp_alloc(stru static const swp_entry_t unswapped = {0}; if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) + (loff_t)index*PAGE_CACHE_SIZE >= inode->i_size) return ERR_PTR(-EINVAL); while (!(entry = shmem_swp_entry(info, index, &page))) { @@ -330,7 +330,7 @@ static swp_entry_t *shmem_swp_alloc(stru return ERR_PTR(-ENOMEM); } if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) { + (loff_t)index*PAGE_CACHE_SIZE >= inode->i_size) { entry = ERR_PTR(-EINVAL); break; } @@ -383,7 +383,7 @@ static void shmem_truncate(struct inode int freed; inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + idx = (inode->i_size + PAGE_CACHE_SIZE - 1)/PAGE_CACHE_SIZE; if (idx >= info->next_index) return; @@ -509,7 +509,7 @@ static int shmem_notify_change(struct de long change = 0; int error; - if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size <= SHMEM_MAX_BYTES)) { + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size <= SHMEM_MAX_BYTES) { /* * Account swap file usage based on new file size, * but just let vmtruncate fail on out-of-range sizes. @@ -527,9 +527,9 @@ static int shmem_notify_change(struct de * truncate_partial_page cannnot miss it were * it assigned to swap. 
*/ - if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + if (attr->ia_size % PAGE_CACHE_SIZE) { (void) shmem_getpage(inode, - attr->ia_size>>PAGE_CACHE_SHIFT, + attr->ia_size/PAGE_CACHE_SIZE, &page, SGP_READ); } } @@ -940,16 +940,14 @@ struct page *shmem_nopage(struct vm_area { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = NULL; - unsigned long idx; + unsigned long pgoff; int error; - idx = (address - vma->vm_start) >> PAGE_SHIFT; - idx += vma->vm_pgoff; - idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + pgoff = (address - vma->vm_start)/MMUPAGE_SIZE + vma->vm_pgoff; - error = shmem_getpage(inode, idx, &page, SGP_CACHE); + error = shmem_getpage(inode, pgoff/PAGE_CACHE_MMUCOUNT, &page, SGP_CACHE); if (error) - return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; + return error == -ENOMEM ? NOPAGE_OOM : NOPAGE_SIGBUS; mark_page_accessed(page); return page; @@ -964,8 +962,8 @@ static int shmem_populate(struct vm_area enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; unsigned long size; - size = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size) + size = (inode->i_size + MMUPAGE_SIZE - 1)/MMUPAGE_SIZE; + if (pgoff >= size || pgoff + len/MMUPAGE_SIZE > size) return -EINVAL; while ((long) len > 0) { @@ -974,19 +972,19 @@ static int shmem_populate(struct vm_area /* * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE */ - err = shmem_getpage(inode, pgoff, &page, sgp); + err = shmem_getpage(inode, pgoff/PAGE_CACHE_MMUCOUNT, &page, sgp); if (err) return err; if (page) { mark_page_accessed(page); - err = install_page(mm, vma, addr, page, prot); + err = install_page(mm, vma, addr, page, prot, pgoff % PAGE_CACHE_MMUCOUNT); if (err) { page_cache_release(page); return err; } } - len -= PAGE_SIZE; - addr += PAGE_SIZE; + len -= MMUPAGE_SIZE; + addr += MMUPAGE_SIZE; pgoff++; } return 0; @@ -1155,8 +1153,8 @@ shmem_file_write(struct file *file, cons char *kaddr; int left; - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; + offset = pos % PAGE_CACHE_SIZE; /* Within page */ + index = pos/PAGE_CACHE_SIZE; bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; @@ -1231,18 +1229,18 @@ static void do_shmem_file_read(struct fi struct address_space *mapping = inode->i_mapping; unsigned long index, offset; - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = *ppos/PAGE_CACHE_SIZE; + offset = *ppos % PAGE_CACHE_SIZE; for (;;) { struct page *page = NULL; unsigned long end_index, nr, ret; - end_index = inode->i_size >> PAGE_CACHE_SHIFT; + end_index = inode->i_size/PAGE_CACHE_SIZE; if (index > end_index) break; if (index == end_index) { - nr = inode->i_size & ~PAGE_CACHE_MASK; + nr = inode->i_size % PAGE_CACHE_SIZE; if (nr <= offset) break; } @@ -1259,9 +1257,9 @@ static void do_shmem_file_read(struct fi * are called without i_sem protection against truncate */ nr = PAGE_CACHE_SIZE; - end_index = inode->i_size >> PAGE_CACHE_SHIFT; + end_index = inode->i_size/PAGE_CACHE_SIZE; if (index == end_index) { - nr = inode->i_size & ~PAGE_CACHE_MASK; + nr = inode->i_size % PAGE_CACHE_SIZE; if (nr <= offset) { page_cache_release(page); break; diff -prauwN linux-2.5.69/mm/slab.c pgcl-2.5.69-3/mm/slab.c --- linux-2.5.69/mm/slab.c 2003-05-04 16:53:36.000000000 -0700 +++ pgcl-2.5.69-3/mm/slab.c 2003-05-26 07:14:20.000000000 -0700 @@ -604,7 +604,7 @@ void __init kmem_cache_sizes_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on machines with 
more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (num_physpages > (32 << 20) >> MMUPAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; while (sizes->cs_size) { @@ -966,7 +966,7 @@ kmem_cache_create (const char *name, siz align = L1_CACHE_BYTES; /* Determine if the slab management is 'on' or 'off' slab. */ - if (size >= (PAGE_SIZE>>3)) + if (size >= PAGE_SIZE/8 || ((flags & SLAB_MUST_HWCACHE_ALIGN) && size >= MMUPAGE_SIZE)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). diff -prauwN linux-2.5.69/mm/swap.c pgcl-2.5.69-3/mm/swap.c --- linux-2.5.69/mm/swap.c 2003-05-04 16:53:13.000000000 -0700 +++ pgcl-2.5.69-3/mm/swap.c 2003-05-26 07:14:20.000000000 -0700 @@ -379,7 +379,7 @@ void vm_acct_memory(long pages) */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = num_physpages >> (20 - MMUPAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff -prauwN linux-2.5.69/mm/swap_state.c pgcl-2.5.69-3/mm/swap_state.c --- linux-2.5.69/mm/swap_state.c 2003-05-04 16:53:14.000000000 -0700 +++ pgcl-2.5.69-3/mm/swap_state.c 2003-05-26 07:14:20.000000000 -0700 @@ -77,7 +77,7 @@ static int add_to_swap_cache(struct page INC_CACHE_INFO(noent_race); return -ENOENT; } - error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + error = add_to_page_cache(page, &swapper_space, entry.val/PAGE_MMUCOUNT, GFP_KERNEL); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. */ @@ -149,7 +149,7 @@ int add_to_swap(struct page * page) * Add it to the swap cache and mark it dirty */ err = add_to_page_cache(page, &swapper_space, - entry.val, GFP_ATOMIC); + entry.val/PAGE_MMUCOUNT, GFP_ATOMIC); if (pf_flags & PF_MEMALLOC) current->flags |= PF_MEMALLOC; @@ -188,7 +188,7 @@ void delete_from_swap_cache(struct page BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; spin_lock(&swapper_space.page_lock); __delete_from_swap_cache(page); @@ -206,10 +206,10 @@ int move_to_swap_cache(struct page *page spin_lock(&swapper_space.page_lock); spin_lock(&mapping->page_lock); - err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + err = radix_tree_insert(&swapper_space.page_tree, entry.val/PAGE_MMUCOUNT, page); if (!err) { __remove_from_page_cache(page); - ___add_to_page_cache(page, &swapper_space, entry.val); + ___add_to_page_cache(page, &swapper_space, entry.val/PAGE_MMUCOUNT); } spin_unlock(&mapping->page_lock); @@ -237,7 +237,7 @@ int move_from_swap_cache(struct page *pa BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; spin_lock(&swapper_space.page_lock); spin_lock(&mapping->page_lock); @@ -320,7 +320,7 @@ struct page * lookup_swap_cache(swp_entr { struct page *found; - found = find_get_page(&swapper_space, entry.val); + found = find_get_page(&swapper_space, entry.val/PAGE_MMUCOUNT); /* * Unsafe to assert PageSwapCache and mapping on page found: * if SMP nothing prevents swapoff from deleting this page from @@ -351,7 +351,7 @@ struct page * read_swap_cache_async(swp_ * that would confuse statistics: use find_get_page() * directly. 
*/ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(&swapper_space, entry.val/PAGE_MMUCOUNT); if (found_page) break; diff -prauwN linux-2.5.69/mm/swapfile.c pgcl-2.5.69-3/mm/swapfile.c --- linux-2.5.69/mm/swapfile.c 2003-05-04 16:53:07.000000000 -0700 +++ pgcl-2.5.69-3/mm/swapfile.c 2003-05-26 07:40:48.000000000 -0700 @@ -41,6 +41,10 @@ struct swap_info_struct swap_info[MAX_SW #define SWAPFILE_CLUSTER 256 +/* + * returns offset into ->swap_map[] array, each entry of which + * tracks PAGE_SIZE (not MMUPAGE_SIZE) + */ static inline int scan_swap_map(struct swap_info_struct *si) { unsigned long offset; @@ -127,7 +131,7 @@ swp_entry_t get_swap_page(void) offset = scan_swap_map(p); swap_device_unlock(p); if (offset) { - entry = swp_entry(type,offset); + entry = swp_entry(type, offset*PAGE_MMUCOUNT); type = swap_info[type].next; if (type < 0 || p->prio != swap_info[type].prio) { @@ -161,15 +165,23 @@ static struct swap_info_struct * swap_in if (!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + if (type >= nr_swapfiles) { + printk(KERN_ERR "bad type %lu beyond nr_swapfiles %u " + "in swap_info_get()\n", type, nr_swapfiles); goto bad_nofile; + } p = & swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); - if (offset >= p->max) + + /* + * offset returned by swp_offset() is in MMUPAGE_SIZE units, + * p->max is in PAGE_SIZE units + */ + if (offset >= p->max*PAGE_MMUCOUNT) goto bad_offset; - if (!p->swap_map[offset]) + if (!p->swap_map[offset/PAGE_MMUCOUNT]) goto bad_free; swap_list_lock(); if (p->prio > swap_info[swap_list.next].prio) @@ -179,15 +191,70 @@ static struct swap_info_struct * swap_in bad_free: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + WARN_ON(1); goto out; bad_offset: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + WARN_ON(1); goto out; bad_device: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + WARN_ON(1); goto out; bad_nofile: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + WARN_ON(1); + +/* dump pagetables */ +#if 1 + { + struct mm_struct *mm = current->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long vaddr; + + if (!mm) { + /* we're dead here anyway, but... 
*/ + printk(KERN_ERR "bug in free_swap_and_cache() " + "with no mm!\n"); + goto out_noscan; + } + + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += PGDIR_SIZE) { + pgd = pgd_offset(mm, vaddr); + printk(KERN_DEBUG "pgd for 0x%lx = 0x%Lx\n", + vaddr, (u64)pgd_val(*pgd)); + } + + if (PTRS_PER_PMD > 1) { + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += PMD_SIZE) { + pgd = pgd_offset(mm, vaddr); + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + pmd = pmd_offset(pgd, vaddr); + printk(KERN_DEBUG "pmd for 0x%lx = 0x%Lx\n", + vaddr, (u64)pmd_val(*pmd)); + } + } + + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += MMUPAGE_SIZE) { + pgd = pgd_offset(mm, vaddr); + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + pmd = pmd_offset(pgd, vaddr); + if (pmd_none(*pmd) || !pmd_present(*pmd)) + continue; + pte = pte_offset_map_nested(pmd, vaddr); + if (!pte_none(*pte) && pte_present(*pte)) + printk(KERN_DEBUG "pte for 0x%lx = 0x%Lx\n", + vaddr, (u64)pte_val(*pte)); + pte_unmap_nested(pte); + } +out_noscan: + ; + } +#endif out: return NULL; } @@ -198,6 +265,9 @@ static void swap_info_put(struct swap_in swap_list_unlock(); } +/* + * offset is entry.val/PAGE_MMUCOUNT + */ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { int count = p->swap_map[offset]; @@ -226,11 +296,12 @@ void swap_free(swp_entry_t entry) struct swap_info_struct * p; p = swap_info_get(entry); - if (p) { - swap_entry_free(p, swp_offset(entry)); + if (!p) + return; + + swap_entry_free(p, swp_offset(entry)/PAGE_MMUCOUNT); swap_info_put(p); } -} /* * Check if we're the only user of a swap page, @@ -242,11 +313,11 @@ static int exclusive_swap_page(struct pa struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ - if (p->swap_map[swp_offset(entry)] == 1) { + if (p->swap_map[swp_offset(entry)/PAGE_MMUCOUNT] == 1) { /* Recheck the page count with the pagecache lock held.. */ spin_lock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) @@ -310,14 +381,14 @@ int remove_exclusive_swap_page(struct pa if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; p = swap_info_get(entry); if (!p) return 0; /* Is the only swap cache user the cache itself? */ retval = 0; - if (p->swap_map[swp_offset(entry)] == 1) { + if (p->swap_map[swp_offset(entry)/PAGE_MMUCOUNT] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ spin_lock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { @@ -348,8 +419,8 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)/PAGE_MMUCOUNT) == 1) + page = find_trylock_page(&swapper_space, entry.val/PAGE_MMUCOUNT); swap_info_put(p); } if (page) { @@ -382,9 +453,12 @@ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { + unsigned long pfn; + /* vma_suboffset() would be meaningless; these are anonymous */ + pfn = page_to_pfn(page) + (entry.val % PAGE_MMUCOUNT); vma->vm_mm->rss++; get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + set_pte(dir, pte_mkold(pfn_pte(pfn, vma->vm_page_prot))); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } @@ -422,7 +496,7 @@ static int unuse_pmd(struct vm_area_stru pte_unmap(pte); return 1; } - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); pte_unmap(pte - 1); @@ -572,6 +646,9 @@ static int try_to_unuse(unsigned int typ * child immediately after parent. If we race with dup_mmap(), * we very much want to resolve parent before child, otherwise * we may miss some entries: using last mm would invert that. + * + * The whole of the preceding discussion is bogus now that + * physical scanning is in place. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -601,7 +678,7 @@ static int try_to_unuse(unsigned int typ * page and read the swap into it. */ swap_map = &si->swap_map[i]; - entry = swp_entry(type, i); + entry = swp_entry(type, i*PAGE_MMUCOUNT); page = read_swap_cache_async(entry); if (!page) { /* @@ -710,6 +787,10 @@ static int try_to_unuse(unsigned int typ * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. + * + * The whole of the preceding discussion is bogus given + * the new process capacities and there are probably + * resource leaks to fix up here. 
*/ if (*swap_map == SWAP_MAP_MAX) { swap_device_lock(si); @@ -786,6 +867,8 @@ sector_t map_swap_page(struct swap_info_ struct swap_extent *se = sis->curr_swap_extent; struct swap_extent *start_se = se; + offset /= PAGE_MMUCOUNT; + for ( ; ; ) { struct list_head *lh; @@ -988,7 +1071,7 @@ int page_queue_congested(struct page *pa bdi = page->mapping->backing_dev_info; if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->index*PAGE_MMUCOUNT }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); @@ -1353,20 +1436,20 @@ asmlinkage long sys_swapon(const char __ maxpages = swp_offset(swp_entry(0,~0UL)) - 1; if (maxpages > swap_header->info.last_page) maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; + p->highest_bit = maxpages/PAGE_MMUCOUNT - 1; error = -EINVAL; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + if (!(p->swap_map = vmalloc(maxpages/PAGE_MMUCOUNT * sizeof(short)))) { error = -ENOMEM; goto bad_swap; } error = 0; - memset(p->swap_map, 0, maxpages * sizeof(short)); + memset(p->swap_map, 0, maxpages/PAGE_MMUCOUNT * sizeof(short)); for (i=0; i<swap_header->info.nr_badpages; i++) { int page = swap_header->info.badpages[i]; if (page <= 0 || page >= swap_header->info.last_page) @@ -1374,14 +1457,14 @@ asmlinkage long sys_swapon(const char __ else p->swap_map[page] = SWAP_MAP_BAD; } - nr_good_pages = swap_header->info.last_page - + nr_good_pages = (swap_header->info.last_page - swap_header->info.nr_badpages - - 1 /* header page */; + 1)/PAGE_MMUCOUNT /* header page */; if (error) goto bad_swap; } - if (swapfilesize && maxpages > swapfilesize) { + if (swapfilesize && maxpages/PAGE_MMUCOUNT > swapfilesize) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); error = -EINVAL; @@ -1393,7 +1476,7 @@ asmlinkage long sys_swapon(const char __ goto bad_swap; } p->swap_map[0] = SWAP_MAP_BAD; - p->max = maxpages; + p->max = maxpages/PAGE_MMUCOUNT; p->pages = nr_good_pages; if (setup_swap_extents(p)) @@ -1488,7 +1571,7 @@ int swap_duplicate(swp_entry_t entry) if (type >= nr_swapfiles) goto bad_file; p = type + swap_info; - offset = swp_offset(entry); + offset = swp_offset(entry)/PAGE_MMUCOUNT; swap_device_lock(p); if (offset < p->max && p->swap_map[offset]) { @@ -1508,6 +1591,7 @@ out: bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + BUG(); goto out; } @@ -1529,7 +1613,7 @@ int valid_swaphandles(swp_entry_t entry, if (!page_cluster) /* no readahead */ return 0; - toff = (swp_offset(entry) >> page_cluster) << page_cluster; + toff = (swp_offset(entry)/PAGE_MMUCOUNT) & ~((1UL << page_cluster)-1); if (!toff) /* first page is swap header */ toff++, i--; *offset = toff; diff -prauwN linux-2.5.69/mm/vcache.c pgcl-2.5.69-3/mm/vcache.c --- linux-2.5.69/mm/vcache.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/mm/vcache.c 2003-05-26 07:14:20.000000000 -0700 @@ -34,7 +34,7 @@ void __attach_vcache(vcache_t *vcache, { struct list_head *hash_head; - address &= PAGE_MASK; + address &= MMUPAGE_MASK; vcache->address = address; vcache->mm = mm; vcache->callback = callback; diff -prauwN linux-2.5.69/mm/vmalloc.c pgcl-2.5.69-3/mm/vmalloc.c --- linux-2.5.69/mm/vmalloc.c 2003-05-04 16:53:32.000000000 -0700 +++ pgcl-2.5.69-3/mm/vmalloc.c 2003-05-26 07:36:49.000000000 -0700 @@ -44,15 +44,12 @@ static void unmap_area_pte(pmd_t *pmd, u end = PMD_SIZE; do
{ - pte_t page; - page = ptep_get_and_clear(pte); - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - if (pte_present(page)) - continue; + if (pte_present(*pte)) + pte_clear(pte); + else if (!pte_none(*pte)) printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n"); + pte++; + address += MMUPAGE_SIZE; } while (address < end); } @@ -83,57 +80,6 @@ static void unmap_area_pmd(pgd_t *dir, u } while (address < end); } -static int map_area_pte(pte_t *pte, unsigned long address, - unsigned long size, pgprot_t prot, - struct page ***pages) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - - do { - struct page *page = **pages; - - if (!pte_none(*pte)) - printk(KERN_ERR "alloc_area_pte: page already exists\n"); - if (!page) - return -ENOMEM; - - set_pte(pte, mk_pte(page, prot)); - address += PAGE_SIZE; - pte++; - (*pages)++; - } while (address < end); - return 0; -} - -static int map_area_pmd(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t prot, - struct page ***pages) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - - do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); - if (!pte) - return -ENOMEM; - if (map_area_pte(pte, address, end - address, prot, pages)) - return -ENOMEM; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - - return 0; -} - void unmap_vm_area(struct vm_struct *area) { unsigned long address = VMALLOC_VMADDR(area->addr); @@ -150,30 +96,48 @@ void unmap_vm_area(struct vm_struct *are flush_tlb_kernel_range(VMALLOC_VMADDR(area->addr), end); } +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long address = VMALLOC_VMADDR(area->addr); - unsigned long end = address + (area->size-PAGE_SIZE); - pgd_t *dir; + /* don't instantiate PTE's for the guard page */ + unsigned long end = address + area->size - MMUPAGE_SIZE; + unsigned long voffset = 0; + pgd_t *pgd; int err = 0; - dir = pgd_offset_k(address); + pgd = pgd_offset_k(address); spin_lock(&init_mm.page_table_lock); do { - pmd_t *pmd = pmd_alloc(&init_mm, dir, address); + pmd_t *pmd = pmd_alloc(&init_mm, pgd, address); if (!pmd) { err = -ENOMEM; - break; + goto out; } - if (map_area_pmd(pmd, address, end - address, prot, pages)) { + + do { + pte_t *pte = pte_alloc_kernel(&init_mm, pmd, address); + if (!pte) { err = -ENOMEM; - break; + goto out; } - - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - + do { + unsigned long pfn; + pfn = page_to_pfn((*pages)[voffset/PAGE_SIZE]); + pfn += (voffset/MMUPAGE_SIZE) % PAGE_MMUCOUNT; + set_pte(pte, pfn_pte(pfn, prot)); + ++pte; + address += MMUPAGE_SIZE; + voffset += MMUPAGE_SIZE; + } while (((unsigned long)pte & PTE_TABLE_MASK) && address < end); + ++pmd; + } while (((unsigned long)pmd & PMD_TABLE_MASK) && address < end); + ++pgd; + /* presumably address could wrap to 0, but I doubt it */ + } while (address && address < end); +out: spin_unlock(&init_mm.page_table_lock); flush_cache_all(); return err; @@ -202,7 +166,7 @@ struct vm_struct *get_vm_area(unsigned l /* * We always allocate a guard page. 
*/ - size += PAGE_SIZE; + size += MMUPAGE_SIZE; if (unlikely(!size)) { kfree (area); return NULL; @@ -231,6 +195,9 @@ found: area->phys_addr = 0; write_unlock(&vmlist_lock); + printk("vmalloc, returning [0x%p, 0x%p)\n", + area->addr, ((char *)area->addr) + area->size); + return area; out: @@ -273,7 +240,7 @@ void __vunmap(void *addr, int deallocate if (!addr) return; - if ((PAGE_SIZE-1) & (unsigned long)addr) { + if ((MMUPAGE_SIZE-1) & (unsigned long)addr) { printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); return; } @@ -291,8 +258,7 @@ void __vunmap(void *addr, int deallocate int i; for (i = 0; i < area->nr_pages; i++) { - if (unlikely(!area->pages[i])) - BUG(); + BUG_ON(unlikely(!area->pages[i])); __free_page(area->pages[i]); } @@ -351,10 +317,10 @@ void *vmap(struct page **pages, unsigned { struct vm_struct *area; - if (count > num_physpages) + if (PAGE_MMUCOUNT*count > num_physpages) return NULL; - area = get_vm_area((count << PAGE_SHIFT), flags); + area = get_vm_area(PAGE_SIZE*count, flags); if (!area) return NULL; if (map_vm_area(area, prot, &pages)) { @@ -382,16 +348,16 @@ void *__vmalloc(unsigned long size, int struct page **pages; unsigned int nr_pages, array_size, i; - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + size = MMUPAGE_ALIGN(size); + if (!size || (size >> MMUPAGE_SHIFT) > num_physpages) return NULL; area = get_vm_area(size, VM_ALLOC); if (!area) return NULL; - nr_pages = size >> PAGE_SHIFT; - array_size = (nr_pages * sizeof(struct page *)); + nr_pages = PAGE_ALIGN(size)/PAGE_SIZE; + array_size = nr_pages * sizeof(struct page *); area->nr_pages = nr_pages; area->pages = pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); @@ -462,7 +428,7 @@ long vread(char *buf, char *addr, unsign read_lock(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size - MMUPAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -472,7 +438,7 @@ long vread(char *buf, char *addr, unsign addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + tmp->size - MMUPAGE_SIZE - addr; do { if (count == 0) goto finished; @@ -500,7 +466,7 @@ long vwrite(char *buf, char *addr, unsig read_lock(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size - MMUPAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -509,7 +475,7 @@ long vwrite(char *buf, char *addr, unsig addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + tmp->size - MMUPAGE_SIZE - addr; do { if (count == 0) goto finished; diff -prauwN linux-2.5.69/mm/vmscan.c pgcl-2.5.69-3/mm/vmscan.c --- linux-2.5.69/mm/vmscan.c 2003-05-04 16:53:02.000000000 -0700 +++ pgcl-2.5.69-3/mm/vmscan.c 2003-05-26 07:14:20.000000000 -0700 @@ -398,7 +398,7 @@ shrink_list(struct list_head *page_list, #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->index }; + swp_entry_t swap = { .val = page->index*PAGE_MMUCOUNT }; __delete_from_swap_cache(page); spin_unlock(&mapping->page_lock); swap_free(swap); diff -prauwN linux-2.5.69/net/ipv4/netfilter/ip_conntrack_core.c pgcl-2.5.69-3/net/ipv4/netfilter/ip_conntrack_core.c --- linux-2.5.69/net/ipv4/netfilter/ip_conntrack_core.c 2003-05-04 16:52:48.000000000 -0700 +++ pgcl-2.5.69-3/net/ipv4/netfilter/ip_conntrack_core.c 2003-05-26 07:14:20.000000000 -0700 @@ -1416,9 +1416,9 @@ int 
__init ip_conntrack_init(void) ip_conntrack_htable_size = hashsize; } else { ip_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) + = (((num_physpages << MMUPAGE_SHIFT) / 16384) / sizeof(struct list_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (num_physpages > (1024 * 1024 * 1024 / MMUPAGE_SIZE)) ip_conntrack_htable_size = 8192; if (ip_conntrack_htable_size < 16) ip_conntrack_htable_size = 16; diff -prauwN linux-2.5.69/net/ipv4/tcp.c pgcl-2.5.69-3/net/ipv4/tcp.c --- linux-2.5.69/net/ipv4/tcp.c 2003-05-04 16:53:08.000000000 -0700 +++ pgcl-2.5.69-3/net/ipv4/tcp.c 2003-05-26 07:14:20.000000000 -0700 @@ -2609,9 +2609,9 @@ void __init tcp_init(void) * The methodology is similar to that of the buffer cache. */ if (num_physpages >= (128 * 1024)) - goal = num_physpages >> (21 - PAGE_SHIFT); + goal = num_physpages >> (21 - MMUPAGE_SHIFT); else - goal = num_physpages >> (23 - PAGE_SHIFT); + goal = num_physpages >> (23 - MMUPAGE_SHIFT); for (order = 0; (1UL << order) < goal; order++) ;
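
The shmem, swap and swapfile hunks above all apply the same unit conversion: vma offsets and swp_entry offsets are kept in MMUPAGE_SIZE units, while the pagecache radix tree and ->swap_map[] remain indexed per PAGE_SIZE, so every lookup divides by PAGE_MMUCOUNT (or PAGE_CACHE_MMUCOUNT) and the sub-page piece is recovered with a modulo. A minimal sketch of that arithmetic, using the constants this patch introduces; the helper functions themselves are illustrative and do not appear in the patch:

/*
 * Illustrative helpers only -- the patch open-codes these conversions.
 * PAGE_MMUCOUNT == PAGE_SIZE/MMUPAGE_SIZE; PAGE_CACHE_MMUCOUNT is the
 * pagecache analogue.  Both come from the pgcl headers in this patch.
 */
static inline unsigned long pgoff_to_cache_index(unsigned long pgoff)
{
	return pgoff/PAGE_CACHE_MMUCOUNT;	/* which struct page */
}

static inline unsigned long pgoff_to_suboffset(unsigned long pgoff)
{
	return pgoff % PAGE_CACHE_MMUCOUNT;	/* which MMU page within it */
}

static inline unsigned long swap_map_index(swp_entry_t entry)
{
	/* swp_offset() is MMUPAGE-granular, ->swap_map[] is PAGE-granular */
	return swp_offset(entry)/PAGE_MMUCOUNT;
}

This is the calculation shmem_nopage() and shmem_populate() perform before calling shmem_getpage() and install_page(), and the one swap_free(), swap_duplicate() and exclusive_swap_page() perform before touching ->swap_map[].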
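
The rewritten map_vm_area() loop and unuse_pte() share one further idea: a pte now names one specific MMU page inside a larger struct page, so the pfn written into the pte is the page's base pfn plus a sub-page index. A sketch of that computation, again assuming the PAGE_MMUCOUNT and MMUPAGE_SIZE definitions above; the wrapper function below is illustrative only:

/* Illustrative wrapper: point a pte at one MMUPAGE_SIZE slice of a page. */
static inline void set_pte_to_subpage(pte_t *pte, struct page *page,
				      unsigned long subpg, pgprot_t prot)
{
	unsigned long pfn = page_to_pfn(page) + (subpg % PAGE_MMUCOUNT);

	set_pte(pte, pfn_pte(pfn, prot));
}

In map_vm_area() the sub-page index is (voffset/MMUPAGE_SIZE) % PAGE_MMUCOUNT as the loop walks the area one MMU page at a time; in unuse_pte() it is entry.val % PAGE_MMUCOUNT, since the swap entry already encodes the MMUPAGE-granular offset of the slice being re-mapped.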