diff -prauN linux-2.6.0-test11/arch/i386/Kconfig pgcl-2.6.0-test11-1/arch/i386/Kconfig --- linux-2.6.0-test11/arch/i386/Kconfig 2003-11-26 12:43:07.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/Kconfig 2003-11-27 21:55:16.000000000 -0800 @@ -692,6 +692,18 @@ config X86_PAE depends on HIGHMEM64G default y +config PAGE_CLUSTER + int "Page clustering factor" + default 3 if HIGHMEM64G + default 2 if HIGHMEM4G + default 1 + help + Select page clustering factor as a power of 2. + Defaults and examples: + 3 => 32KB PAGE_SIZE + 2 => 16KB PAGE_SIZE + 1 => 8KB PAGE_SIZE + # Common NUMA Features config NUMA bool "Numa Memory Allocation Support" @@ -1217,6 +1229,18 @@ config FRAME_POINTER If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. +config EARLY_CONSOLE_3F8 + bool "Early printk() on serial I/O port 0x3F8" + +config EARLY_CONSOLE_3E8 + bool "Early printk() on serial I/O port 0x3E8" + +config EARLY_CONSOLE_VGA + bool "Early printk() on VGA text console" + +config EARLY_CONSOLE_BOCHS_E9_HACK + bool "Early printk() via the bochs 0xE9 hack" + config X86_EXTRA_IRQS bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -prauN linux-2.6.0-test11/arch/i386/boot/setup.S pgcl-2.6.0-test11-1/arch/i386/boot/setup.S --- linux-2.6.0-test11/arch/i386/boot/setup.S 2003-11-26 12:44:20.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/boot/setup.S 2003-11-27 21:55:16.000000000 -0800 @@ -58,6 +58,9 @@ #include #include #include + +#define VMALLOC_START (-0xC0000000 - 128*1024*1024) +#include #include /* Signature words to ensure LILO loaded us right */ @@ -162,7 +165,7 @@ cmd_line_ptr: .long 0 # (Header versio # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long MAXMEM-1 # (Header version 0x0203 or later) +ramdisk_max: .long __MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd diff -prauN linux-2.6.0-test11/arch/i386/kernel/acpi/boot.c pgcl-2.6.0-test11-1/arch/i386/kernel/acpi/boot.c --- linux-2.6.0-test11/arch/i386/kernel/acpi/boot.c 2003-11-26 12:45:28.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/acpi/boot.c 2003-11-27 21:55:16.000000000 -0800 @@ -73,8 +73,8 @@ char *__acpi_map_table(unsigned long phy if (phys + size < 8*1024*1024) return __va(phys); - offset = phys & (PAGE_SIZE - 1); - mapped_size = PAGE_SIZE - offset; + offset = phys & (MMUPAGE_SIZE - 1); + mapped_size = MMUPAGE_SIZE - offset; set_fixmap(FIX_ACPI_END, phys); base = fix_to_virt(FIX_ACPI_END); @@ -85,9 +85,9 @@ char *__acpi_map_table(unsigned long phy while (mapped_size < size) { if (--idx < FIX_ACPI_BEGIN) return 0; /* cannot handle this */ - phys += PAGE_SIZE; + phys += MMUPAGE_SIZE; set_fixmap(idx, phys); - mapped_size += PAGE_SIZE; + mapped_size += MMUPAGE_SIZE; } return ((unsigned char *) base + offset); diff -prauN linux-2.6.0-test11/arch/i386/kernel/apic.c pgcl-2.6.0-test11-1/arch/i386/kernel/apic.c --- linux-2.6.0-test11/arch/i386/kernel/apic.c 2003-11-26 12:46:07.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/apic.c 2003-11-27 21:55:16.000000000 -0800 @@ -715,7 +715,7 @@ void __init init_apic_mappings(void) * one for the IO-APIC. 
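The PAGE_CLUSTER option above chooses how many 4 KiB MMU pages make up one software page, and the rest of the diff systematically swaps PAGE_SIZE/PAGE_SHIFT for MMUPAGE_SIZE/MMUPAGE_SHIFT wherever the hardware page size is what actually matters. The following is a minimal standalone sketch of the arithmetic those conversions assume; the macro values are illustrative only (PAGE_CLUSTER=3 as in the Kconfig default for HIGHMEM64G), and the patch's real header definitions are not part of this excerpt.

/*
 * Sketch of the page clustering arithmetic assumed by the
 * PAGE_SHIFT -> MMUPAGE_SHIFT conversions in this patch.
 * Illustrative definitions only, not the patch's own headers.
 */
#include <stdio.h>

#define MMUPAGE_SHIFT	12			/* hardware (PTE) page: 4 KiB on i386 */
#define MMUPAGE_SIZE	(1UL << MMUPAGE_SHIFT)
#define MMUPAGE_MASK	(~(MMUPAGE_SIZE - 1))

#define PAGE_MMUSHIFT	3			/* example value of CONFIG_PAGE_CLUSTER */
#define PAGE_MMUCOUNT	(1UL << PAGE_MMUSHIFT)	/* MMU pages per struct page */

#define PAGE_SHIFT	(MMUPAGE_SHIFT + PAGE_MMUSHIFT)
#define PAGE_SIZE	(1UL << PAGE_SHIFT)	/* 32 KiB with PAGE_CLUSTER=3 */

int main(void)
{
	/* pfns count MMU pages; each struct page covers PAGE_MMUCOUNT pfns */
	unsigned long pfn = 0x12345;

	printf("PAGE_SIZE = %lu KiB\n", PAGE_SIZE >> 10);
	printf("pfn 0x%lx would fall in mem_map slot 0x%lx\n",
	       pfn, pfn / PAGE_MMUCOUNT);
	return 0;
}
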
*/ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = (unsigned long) alloc_bootmem_pages(MMUPAGE_SIZE); apic_phys = __pa(apic_phys); } else apic_phys = mp_lapic_addr; @@ -747,7 +747,7 @@ void __init init_apic_mappings(void) } } else { fake_ioapic_page: - ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long) alloc_bootmem_pages(MMUPAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); diff -prauN linux-2.6.0-test11/arch/i386/kernel/cpu/amd.c pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/amd.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/amd.c 2003-11-26 12:45:44.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/amd.c 2003-11-27 21:55:16.000000000 -0800 @@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret"); static void __init init_amd(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); + int mbytes = num_physpages >> (20-MMUPAGE_SHIFT); int r; /* diff -prauN linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/amd.c pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/amd.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/amd.c 2003-11-26 12:44:14.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/amd.c 2003-11-27 21:55:16.000000000 -0800 @@ -16,7 +16,7 @@ amd_get_mtrr(unsigned int reg, unsigned if (reg == 1) low = high; /* The base masks off on the right alignment */ - *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *base = (low & 0xFFFE0000) >> MMUPAGE_SHIFT; *type = 0; if (low & 1) *type = MTRR_TYPE_UNCACHABLE; @@ -42,7 +42,7 @@ amd_get_mtrr(unsigned int reg, unsigned * *128K ... */ low = (~low) & 0x1FFFC; - *size = (low + 4) << (15 - PAGE_SHIFT); + *size = (low + 4) << (15 - MMUPAGE_SHIFT); return; } @@ -77,8 +77,8 @@ static void amd_set_mtrr(unsigned int re desired 111 1111 1111 1100 mask But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ - regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) - | (base << PAGE_SHIFT) | (type + 1); + regs[reg] = (-size >> (15 - MMUPAGE_SHIFT) & 0x0001FFFC) + | (base << MMUPAGE_SHIFT) | (type + 1); /* * The writeback rule is quite specific. See the manual. 
Its @@ -97,7 +97,7 @@ static int amd_validate_add_page(unsigne o Power of 2 block o base suitably aligned to the power */ - if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - MMUPAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; return 0; diff -prauN linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/centaur.c pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/centaur.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/centaur.c 2003-11-26 12:43:28.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/centaur.c 2003-11-27 21:55:16.000000000 -0800 @@ -51,8 +51,8 @@ static void centaur_get_mcr(unsigned int reg, unsigned long *base, unsigned int *size, mtrr_type * type) { - *base = centaur_mcr[reg].high >> PAGE_SHIFT; - *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *base = centaur_mcr[reg].high >> MMUPAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> MMUPAGE_SHIFT; *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; @@ -72,14 +72,14 @@ static void centaur_set_mcr(unsigned int /* Disable */ high = low = 0; } else { - high = base << PAGE_SHIFT; + high = base << MMUPAGE_SHIFT; if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ + low = -size << MMUPAGE_SHIFT | 0x1f; /* only support write-combining... */ else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << MMUPAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << MMUPAGE_SHIFT | 0x09; /* WWO,WC */ } } centaur_mcr[reg].high = high; diff -prauN linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/cyrix.c pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/cyrix.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/cyrix.c 2003-11-26 12:44:18.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/cyrix.c 2003-11-27 21:55:16.000000000 -0800 @@ -30,7 +30,7 @@ cyrix_get_arr(unsigned int reg, unsigned /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); shift = ((unsigned char *) base)[1] & 0x0f; - *base >>= PAGE_SHIFT; + *base >>= MMUPAGE_SHIFT; /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. @@ -203,7 +203,7 @@ static void cyrix_set_arr(unsigned int r prepare_set(); - base <<= PAGE_SHIFT; + base <<= MMUPAGE_SHIFT; setCx86(arr, ((unsigned char *) &base)[3]); setCx86(arr + 1, ((unsigned char *) &base)[2]); setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); diff -prauN linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/generic.c pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/generic.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/generic.c 2003-11-26 12:44:23.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/generic.c 2003-11-27 21:55:16.000000000 -0800 @@ -132,13 +132,13 @@ void generic_get_mtrr(unsigned int reg, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | mask_hi << (32 - MMUPAGE_SHIFT) + | mask_lo >> MMUPAGE_SHIFT; /* This works correctly if size is a power of two, i.e. a contiguous range. 
*/ *size = -mask_lo; - *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *base = base_hi << (32 - MMUPAGE_SHIFT) | base_lo >> MMUPAGE_SHIFT; *type = base_lo & 0xff; } @@ -321,10 +321,10 @@ static void generic_set_mtrr(unsigned in relevant mask register to disable a range. */ wrmsr(MTRRphysMask_MSR(reg), 0, 0); } else { - wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type, - (base & size_and_mask) >> (32 - PAGE_SHIFT)); - wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800, - (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + wrmsr(MTRRphysBase_MSR(reg), base << MMUPAGE_SHIFT | type, + (base & size_and_mask) >> (32 - MMUPAGE_SHIFT)); + wrmsr(MTRRphysMask_MSR(reg), -size << MMUPAGE_SHIFT | 0x800, + (-size & size_and_mask) >> (32 - MMUPAGE_SHIFT)); } post_set(); @@ -339,7 +339,7 @@ int generic_validate_add_page(unsigned l if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { - if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + if (base & ((1 << (22 - MMUPAGE_SHIFT)) - 1)) { printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } diff -prauN linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/if.c pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/if.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/if.c 2003-11-26 12:43:25.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/if.c 2003-11-27 21:55:16.000000000 -0800 @@ -50,10 +50,10 @@ mtrr_file_add(unsigned long base, unsign FILE_FCOUNT(file) = fcount; } if (!page) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) return -EINVAL; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; } reg = mtrr_add_page(base, size, type, 1); if (reg >= 0) @@ -69,10 +69,10 @@ mtrr_file_del(unsigned long base, unsign unsigned int *fcount = FILE_FCOUNT(file); if (!page) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) return -EINVAL; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; } reg = mtrr_del_page(-1, base, size); if (reg < 0) @@ -136,8 +136,8 @@ mtrr_write(struct file *file, const char for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; + base >>= MMUPAGE_SHIFT; + size >>= MMUPAGE_SHIFT; err = mtrr_add_page((unsigned long) base, (unsigned long) size, i, 1); @@ -211,8 +211,8 @@ mtrr_ioctl(struct inode *inode, struct f || gentry.size == 0x100000) gentry.base = gentry.size = gentry.type = 0; else { - gentry.base <<= PAGE_SHIFT; - gentry.size <<= PAGE_SHIFT; + gentry.base <<= MMUPAGE_SHIFT; + gentry.size <<= MMUPAGE_SHIFT; gentry.type = type; } @@ -332,18 +332,18 @@ static int mtrr_seq_show(struct seq_file if (size == 0) usage_table[i] = 0; else { - if (size < (0x100000 >> PAGE_SHIFT)) { + if (size < (0x100000 >> MMUPAGE_SHIFT)) { /* less than 1MB */ factor = 'K'; - size <<= PAGE_SHIFT - 10; + size <<= MMUPAGE_SHIFT - 10; } else { factor = 'M'; - size >>= 20 - PAGE_SHIFT; + size >>= 20 - MMUPAGE_SHIFT; } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, + i, base, base >> (20 - MMUPAGE_SHIFT), size, factor, mtrr_attrib_to_str(type), usage_table[i]); } } diff -prauN 
linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/main.c pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/main.c --- linux-2.6.0-test11/arch/i386/kernel/cpu/mtrr/main.c 2003-11-26 12:42:55.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/cpu/mtrr/main.c 2003-11-27 21:55:16.000000000 -0800 @@ -419,12 +419,12 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) { printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n"); printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); return -EINVAL; } - return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + return mtrr_add_page(base >> MMUPAGE_SHIFT, size >> MMUPAGE_SHIFT, type, increment); } @@ -515,12 +515,12 @@ int mtrr_del_page(int reg, unsigned long int mtrr_del(int reg, unsigned long base, unsigned long size) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + if ((base & (MMUPAGE_SIZE - 1)) || (size & (MMUPAGE_SIZE - 1))) { printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n"); printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); return -EINVAL; } - return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); + return mtrr_del_page(reg, base >> MMUPAGE_SHIFT, size >> MMUPAGE_SHIFT); } EXPORT_SYMBOL(mtrr_add); @@ -640,7 +640,7 @@ static int __init mtrr_init(void) u32 phys_addr; phys_addr = cpuid_eax(0x80000008) & 0xff; size_or_mask = - ~((1 << (phys_addr - PAGE_SHIFT)) - 1); + ~((1 << (phys_addr - MMUPAGE_SHIFT)) - 1); size_and_mask = ~size_or_mask & 0xfff00000; } /* Athlon MTRRs use an Intel-compatible interface for diff -prauN linux-2.6.0-test11/arch/i386/kernel/entry.S pgcl-2.6.0-test11-1/arch/i386/kernel/entry.S --- linux-2.6.0-test11/arch/i386/kernel/entry.S 2003-11-26 12:43:26.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/entry.S 2003-11-27 21:55:16.000000000 -0800 @@ -162,7 +162,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + andl $~(THREAD_SIZE-1), %ebp # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -515,8 +515,8 @@ ENTRY(nmi) /* Do not access memory above the end of our stack page, * it might not exist. 
*/ - andl $0x1fff,%eax - cmpl $0x1fec,%eax + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax popl %eax jae nmi_stack_correct cmpl $sysenter_entry,12(%esp) diff -prauN linux-2.6.0-test11/arch/i386/kernel/head.S pgcl-2.6.0-test11-1/arch/i386/kernel/head.S --- linux-2.6.0-test11/arch/i386/kernel/head.S 2003-11-26 12:42:58.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/head.S 2003-11-27 21:55:16.000000000 -0800 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ @@ -344,6 +345,10 @@ ignore_int: movl %eax,%es pushl $int_msg call printk + call dump_stack +.give_up: + rep; nop + jmp .give_up popl %eax popl %ds popl %es diff -prauN linux-2.6.0-test11/arch/i386/kernel/i386_ksyms.c pgcl-2.6.0-test11-1/arch/i386/kernel/i386_ksyms.c --- linux-2.6.0-test11/arch/i386/kernel/i386_ksyms.c 2003-11-26 12:46:01.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/i386_ksyms.c 2003-11-27 21:55:16.000000000 -0800 @@ -200,7 +200,7 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kmap_atomic_to_pfn); #endif #ifdef CONFIG_EDD_MODULE diff -prauN linux-2.6.0-test11/arch/i386/kernel/microcode.c pgcl-2.6.0-test11-1/arch/i386/kernel/microcode.c --- linux-2.6.0-test11/arch/i386/kernel/microcode.c 2003-11-26 12:43:25.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/microcode.c 2003-11-27 22:01:25.000000000 -0800 @@ -432,7 +432,7 @@ static ssize_t microcode_write (struct f return -EINVAL; } - if ((len >> PAGE_SHIFT) > num_physpages) { + if ((len >> MMUPAGE_SHIFT) > num_physpages) { printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); return -EINVAL; } diff -prauN linux-2.6.0-test11/arch/i386/kernel/mpparse.c pgcl-2.6.0-test11-1/arch/i386/kernel/mpparse.c --- linux-2.6.0-test11/arch/i386/kernel/mpparse.c 2003-11-26 12:43:31.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/mpparse.c 2003-11-27 21:55:16.000000000 -0800 @@ -721,7 +721,7 @@ static int __init smp_scan_config (unsig smp_found_config = 1; printk(KERN_INFO "found SMP MP-table at %08lx\n", virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem(virt_to_phys(mpf), MMUPAGE_SIZE); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute @@ -732,8 +732,8 @@ static int __init smp_scan_config (unsig * PAGE_SIZE from mpg->mpf_physptr yields BUG() * in reserve_bootmem. 
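The entry.S and head.S hunks above stop hard-coding an 8 KiB kernel stack: the $-8192 and $0x1fff constants become expressions in THREAD_SIZE, so the thread_info lookup keeps working if the stack allocation size changes under page clustering. Below is a small userspace sketch of the same masking trick, with a made-up THREAD_SIZE value and stack pointer.

/*
 * C equivalent of the "andl $~(THREAD_SIZE-1), %ebp" change above:
 * thread_info sits at the base of the THREAD_SIZE-aligned stack.
 * THREAD_SIZE and the sample pointer are assumptions for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE 8192UL	/* whatever the kernel configuration makes it */

static uintptr_t thread_info_base(uintptr_t esp)
{
	return esp & ~(THREAD_SIZE - 1);
}

int main(void)
{
	uintptr_t esp = 0xc7d31f40UL;	/* made-up kernel stack pointer */

	printf("esp %#lx -> thread_info at %#lx\n",
	       (unsigned long)esp, (unsigned long)thread_info_base(esp));
	return 0;
}
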
*/ - unsigned long size = PAGE_SIZE; - unsigned long end = max_low_pfn * PAGE_SIZE; + unsigned long size = MMUPAGE_SIZE; + unsigned long end = max_low_pfn * MMUPAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; reserve_bootmem(mpf->mpf_physptr, size); diff -prauN linux-2.6.0-test11/arch/i386/kernel/numaq.c pgcl-2.6.0-test11-1/arch/i386/kernel/numaq.c --- linux-2.6.0-test11/arch/i386/kernel/numaq.c 2003-11-26 12:44:57.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/numaq.c 2003-11-27 21:55:16.000000000 -0800 @@ -30,10 +30,7 @@ #include #include -/* These are needed before the pgdat's are created */ -extern long node_start_pfn[], node_end_pfn[]; - -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(addr) ((addr) << (20 - MMUPAGE_SHIFT)) /* * Function: smp_dump_qct() @@ -60,8 +57,20 @@ static void __init smp_dump_qct(void) eq->hi_shrd_mem_start - eq->priv_mem_size); node_end_pfn[node] = MB_TO_PAGES( eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - } + printk("hi_shrd_mem_start[%d] = 0x%x\n" + "priv_mem_size[%d] = 0x%x\n" + "hi_shrd_mem_size[%d] = 0x%x\n" + "node_start_pfn[%d] = 0x%lx\n" + "node_end_pfn[%d] = 0x%lx\n", + node, eq->hi_shrd_mem_start, + node, eq->priv_mem_size, + node, eq->hi_shrd_mem_size, + node, node_start_pfn[node], + node, node_end_pfn[node]); + } else + printk("node %d not present\n", node); } + printk("%d nodes found in QCT\n", numnodes); } /* diff -prauN linux-2.6.0-test11/arch/i386/kernel/process.c pgcl-2.6.0-test11-1/arch/i386/kernel/process.c --- linux-2.6.0-test11/arch/i386/kernel/process.c 2003-11-26 12:42:37.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/process.c 2003-11-27 21:55:16.000000000 -0800 @@ -610,6 +610,8 @@ asmlinkage int sys_execve(struct pt_regs int error; char * filename; + pr_debug("sys_execve()\n"); + filename = getname((char __user *) regs.ebx); error = PTR_ERR(filename); if (IS_ERR(filename)) @@ -625,6 +627,7 @@ asmlinkage int sys_execve(struct pt_regs } putname(filename); out: + pr_debug("return from sys_execve()\n"); return error; } diff -prauN linux-2.6.0-test11/arch/i386/kernel/setup.c pgcl-2.6.0-test11-1/arch/i386/kernel/setup.c --- linux-2.6.0-test11/arch/i386/kernel/setup.c 2003-11-26 12:44:05.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/setup.c 2003-11-27 21:55:16.000000000 -0800 @@ -602,6 +602,8 @@ void __init find_max_pfn(void) continue; if (end > max_pfn) max_pfn = end; + + max_pfn &= ~(PAGE_MMUCOUNT - 1); } } @@ -612,6 +614,8 @@ unsigned long __init find_max_low_pfn(vo { unsigned long max_low_pfn; + printk("MAXMEM = %p\n", (void *)MAXMEM); + max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { if (highmem_pages == -1) @@ -725,10 +729,10 @@ static unsigned long __init setup_memory highstart_pfn = max_low_pfn; } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + (highend_pfn - highstart_pfn) >> (20 - MMUPAGE_SHIFT)); #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); + max_low_pfn >> (20 - MMUPAGE_SHIFT)); /* * Initialize the boot-time allocator (with low memory only): */ @@ -749,7 +753,7 @@ static unsigned long __init setup_memory * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. 
*/ - reserve_bootmem(0, PAGE_SIZE); + reserve_bootmem(0, MMUPAGE_SIZE); #ifdef CONFIG_SMP /* @@ -757,7 +761,7 @@ static unsigned long __init setup_memory * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); + reserve_bootmem(MMUPAGE_SIZE, MMUPAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP /* @@ -774,7 +778,7 @@ static unsigned long __init setup_memory #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { + if (INITRD_START + INITRD_SIZE <= PFN_PHYS(max_low_pfn)) { reserve_bootmem(INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0; @@ -784,7 +788,7 @@ static unsigned long __init setup_memory printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", INITRD_START + INITRD_SIZE, - max_low_pfn << PAGE_SHIFT); + PFN_PHYS(max_low_pfn)); initrd_start = 0; } } @@ -838,7 +842,7 @@ static void __init register_memory(unsig request_resource(&ioport_resource, standard_io_resources+i); /* Tell the PCI layer not to allocate too close to the RAM area.. */ - low_mem_size = ((max_low_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; + low_mem_size = ((max_low_pfn << MMUPAGE_SHIFT) + 0xfffff) & ~0xfffff; if (low_mem_size > pci_mem_start) pci_mem_start = low_mem_size; } diff -prauN linux-2.6.0-test11/arch/i386/kernel/smpboot.c pgcl-2.6.0-test11-1/arch/i386/kernel/smpboot.c --- linux-2.6.0-test11/arch/i386/kernel/smpboot.c 2003-11-26 12:44:06.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/smpboot.c 2003-11-27 21:55:16.000000000 -0800 @@ -100,7 +100,7 @@ static unsigned long __init setup_trampo */ void __init smp_alloc_memory(void) { - trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + trampoline_base = (void *) alloc_bootmem_low_pages(MMUPAGE_SIZE); /* * Has to be in very low memory so we can execute * real-mode AP code. 
diff -prauN linux-2.6.0-test11/arch/i386/kernel/srat.c pgcl-2.6.0-test11-1/arch/i386/kernel/srat.c --- linux-2.6.0-test11/arch/i386/kernel/srat.c 2003-11-26 12:43:25.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/srat.c 2003-11-27 21:55:16.000000000 -0800 @@ -24,23 +24,20 @@ * Send feedback to Pat Gaughen */ #include +#include #include #include #include #include #include +#include /* * proximity macros and definitions */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ +#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ + +static DECLARE_BITMAP(pxm_bitmap, MAX_PXM_DOMAINS); #define MAX_CHUNKS_PER_NODE 4 #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) @@ -57,10 +54,6 @@ static int num_memory_chunks; /* total static int zholes_size_init; static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES]; -extern unsigned long node_start_pfn[], node_end_pfn[]; - -extern void * boot_ioremap(unsigned long, unsigned long); - /* Identify CPU proximity domains */ static void __init parse_cpu_affinity_structure(char *p) { @@ -71,7 +64,7 @@ static void __init parse_cpu_affinity_st return; /* empty entry */ /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain); + set_bit(cpu_affinity->proximity_domain, pxm_bitmap); printk("CPU 0x%02X in proximity domain 0x%02X\n", cpu_affinity->apic_id, cpu_affinity->proximity_domain); @@ -94,7 +87,7 @@ static void __init parse_memory_affinity return; /* empty entry */ /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain); + set_bit(memory_affinity->proximity_domain, pxm_bitmap); /* calculate info for memory chunk structure */ paddr = memory_affinity->base_addr_hi; @@ -102,8 +95,8 @@ static void __init parse_memory_affinity size = memory_affinity->length_hi; size = (size << 32) | memory_affinity->length_lo; - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; + start_pfn = paddr >> MMUPAGE_SHIFT; + end_pfn = (paddr + size) >> MMUPAGE_SHIFT; pxm = memory_affinity->proximity_domain; @@ -140,25 +133,20 @@ static void __init parse_memory_affinity #if MAX_NR_ZONES != 3 #error "MAX_NR_ZONES != 3, chunk_to_zone requires review" #endif -/* Take a chunk of pages from page frame cstart to cend and count the number - * of pages in each zone, returned via zones[]. +/* + * Take a chunk of pages from page frame cstart to cend and count the number + * of pages in each zone, returned via zones[]. This has a hardcoded bias + * to round up; for uses other than holes, introduce a bias argument to + * round differently in each case. */ static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, unsigned long *zones) { - unsigned long max_dma; - extern unsigned long max_low_pfn; - + unsigned long rend, max_dma = __pa(MAX_DMA_ADDRESS)/MMUPAGE_SIZE; int z; - unsigned long rend; - - /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide - * similarly scoped information and should be handled in a consistant - * manner. 
- */ - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - /* Split the hole into the zones in which it falls. Repeatedly + /* + * Split the hole into the zones in which it falls. Repeatedly * take the segment in which the remaining hole starts, round it * to the end of that zone. */ @@ -176,7 +164,7 @@ static __init void chunk_to_zones(unsign z = ZONE_HIGHMEM; rend = cend; } - zones[z] += rend - cstart; + zones[z] += (rend - cstart + PAGE_MMUCOUNT - 1)/PAGE_MMUCOUNT; cstart = rend; } } @@ -192,9 +180,7 @@ static void __init initialize_physnode_m for (i = num_memory_chunks; --i >= 0; nmcp++) { for (pfn = nmcp->start_pfn; pfn <= nmcp->end_pfn; pfn += PAGES_PER_ELEMENT) - { - physnode_map[pfn / PAGES_PER_ELEMENT] = (int)nmcp->nid; - } + physnode_map[pfn/PAGES_PER_ELEMENT] = (int)nmcp->nid; } } @@ -250,7 +236,7 @@ static int __init acpi20_parse_srat(stru */ numnodes = 0; /* init total nodes in system */ for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { + if (test_bit(i, pxm_bitmap)) { pxm_to_nid_map[i] = numnodes; nid_to_pxm_map[numnodes] = i; node_set_online(numnodes); @@ -258,8 +244,7 @@ static int __init acpi20_parse_srat(stru } } - if (numnodes == 0) - BUG(); + BUG_ON(!numnodes); /* set cnode id in memory chunk structure */ for (i = 0; i < num_memory_chunks; i++) @@ -268,25 +253,24 @@ static int __init acpi20_parse_srat(stru initialize_physnode_map(); printk("pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk("%02X ", pxm_bitmap[i]); - } + for (i = 0; i < sizeof(pxm_bitmap); i++) + printk("%08X ", pxm_bitmap[i]); + printk("\n"); printk("Number of logical nodes in system = %d\n", numnodes); printk("Number of memory chunks in system = %d\n", num_memory_chunks); - for (j = 0; j < num_memory_chunks; j++){ + for (j = 0; j < num_memory_chunks; j++) printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, node_memory_chunk[j].nid, node_memory_chunk[j].start_pfn, node_memory_chunk[j].end_pfn); - } /*calculate node_start_pfn/node_end_pfn arrays*/ for (nid = 0; nid < numnodes; nid++) { int been_here_before = 0; - for (j = 0; j < num_memory_chunks; j++){ + for (j = 0; j < num_memory_chunks; j++) { if (node_memory_chunk[j].nid == nid) { if (been_here_before == 0) { node_start_pfn[nid] = node_memory_chunk[j].start_pfn; @@ -410,28 +394,28 @@ out_err: */ static void __init get_zholes_init(void) { - int nid; - int c; - int first; + int nid, c, first; unsigned long end = 0; for (nid = 0; nid < numnodes; nid++) { first = 1; - for (c = 0; c < num_memory_chunks; c++){ - if (node_memory_chunk[c].nid == nid) { - if (first) { - end = node_memory_chunk[c].end_pfn; - first = 0; - - } else { - /* Record any gap between this chunk - * and the previous chunk on this node - * against the zones it spans. - */ - chunk_to_zones(end, - node_memory_chunk[c].start_pfn, - &zholes_size[nid * MAX_NR_ZONES]); - } + for (c = 0; c < num_memory_chunks; c++) { + if (node_memory_chunk[c].nid != nid) + continue; + + /* + * Record any gap between this chunk and the + * previous chunk on this node against the zones + * it spans. Also, round up the sizes of holes + * subtracted out to PAGE_SIZE multiples. 
+ */ + if (!first) + chunk_to_zones(end, + node_memory_chunk[c].start_pfn, + &zholes_size[MAX_NR_ZONES*nid]); + else { + end = node_memory_chunk[c].end_pfn; + first = 0; } } } @@ -443,7 +427,7 @@ unsigned long * __init get_zholes_size(i zholes_size_init++; get_zholes_init(); } - if((nid >= numnodes) | (nid >= MAX_NUMNODES)) + if (nid < 0 || nid >= numnodes || nid >= MAX_NUMNODES) printk("%s: nid = %d is invalid. numnodes = %d", __FUNCTION__, nid, numnodes); return &zholes_size[nid * MAX_NR_ZONES]; diff -prauN linux-2.6.0-test11/arch/i386/kernel/sys_i386.c pgcl-2.6.0-test11-1/arch/i386/kernel/sys_i386.c --- linux-2.6.0-test11/arch/i386/kernel/sys_i386.c 2003-11-26 12:45:48.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/sys_i386.c 2003-11-27 21:55:16.000000000 -0800 @@ -97,10 +97,10 @@ asmlinkage int old_mmap(struct mmap_arg_ goto out; err = -EINVAL; - if (a.offset & ~PAGE_MASK) + if (a.offset & ~MMUPAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> MMUPAGE_SHIFT); out: return err; } diff -prauN linux-2.6.0-test11/arch/i386/kernel/traps.c pgcl-2.6.0-test11-1/arch/i386/kernel/traps.c --- linux-2.6.0-test11/arch/i386/kernel/traps.c 2003-11-26 12:43:09.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/kernel/traps.c 2003-11-27 21:55:16.000000000 -0800 @@ -119,7 +119,7 @@ void show_trace_task(struct task_struct unsigned long esp = tsk->thread.esp; /* User space on another CPU? */ - if ((esp ^ (unsigned long)tsk->thread_info) & (PAGE_MASK<<1)) + if ((esp ^ (unsigned long)tsk->thread_info) & ~(THREAD_SIZE-1)) return; show_trace(tsk, (unsigned long *)esp); } @@ -437,6 +437,7 @@ static void unknown_nmi_error(unsigned c reason, smp_processor_id()); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + dump_stack(); } static void default_do_nmi(struct pt_regs * regs) @@ -842,15 +843,20 @@ static void __init set_task_gate(unsigne void __init trap_init(void) { #ifdef CONFIG_EISA + printk("trap_init(): about to check for ISA bus\n"); if (isa_readl(0x0FFFD9) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { EISA_bus = 1; } + printk("trap_init(): survived checking for ISA bus\n"); #endif #ifdef CONFIG_X86_LOCAL_APIC + printk("trap_init(): about to init_apic_mappings()\n"); init_apic_mappings(); + printk("trap_init(): survived init_apic_mappings()\n"); #endif + printk("trap_init(): about to set IDT gates\n"); set_trap_gate(0,÷_error); set_intr_gate(1,&debug); set_intr_gate(2,&nmi); @@ -882,11 +888,16 @@ void __init trap_init(void) */ set_call_gate(&default_ldt[0],lcall7); set_call_gate(&default_ldt[4],lcall27); + printk("trap_init(): survived setting IDT gates\n"); /* * Should be a barrier for any external CPU state. */ + printk("trap_init(): about to cpu_init()\n"); cpu_init(); + printk("trap_init(): survived cpu_init()\n"); + printk("trap_init(): about to trap_init_hook()\n"); trap_init_hook(); + printk("trap_init(): survived trap_init_hook()\n"); } diff -prauN linux-2.6.0-test11/arch/i386/lib/getuser.S pgcl-2.6.0-test11-1/arch/i386/lib/getuser.S --- linux-2.6.0-test11/arch/i386/lib/getuser.S 2003-11-26 12:42:48.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/lib/getuser.S 2003-11-27 21:55:16.000000000 -0800 @@ -8,9 +8,9 @@ * return an error value in addition to the "real" * return value. 
*/ +#include #include - /* * __get_user_X * diff -prauN linux-2.6.0-test11/arch/i386/mm/Makefile pgcl-2.6.0-test11-1/arch/i386/mm/Makefile --- linux-2.6.0-test11/arch/i386/mm/Makefile 2003-11-26 12:43:26.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/Makefile 2003-11-27 21:55:16.000000000 -0800 @@ -2,7 +2,7 @@ # Makefile for the linux i386-specific parts of the memory manager. # -obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o +obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o tlb.o obj-$(CONFIG_DISCONTIGMEM) += discontig.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff -prauN linux-2.6.0-test11/arch/i386/mm/discontig.c pgcl-2.6.0-test11-1/arch/i386/mm/discontig.c --- linux-2.6.0-test11/arch/i386/mm/discontig.c 2003-11-26 12:44:20.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/discontig.c 2003-11-27 22:02:41.000000000 -0800 @@ -63,7 +63,6 @@ unsigned long node_end_pfn[MAX_NUMNODES] extern unsigned long find_max_low_pfn(void); extern void find_max_pfn(void); -extern void one_highpage_init(struct page *, int, int); extern struct e820map e820; extern char _end; @@ -72,8 +71,6 @@ extern unsigned long max_low_pfn; extern unsigned long totalram_pages; extern unsigned long totalhigh_pages; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; unsigned long node_remap_offset[MAX_NUMNODES]; @@ -137,7 +134,7 @@ static void __init allocate_pgdat(int ni if (nid) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn*MMUPAGE_SIZE)); min_low_pfn += PFN_UP(sizeof(pg_data_t)); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); } @@ -190,8 +187,8 @@ void __init remap_numa_kva(void) int node; for (node = 1; node < numnodes; ++node) { - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn< system_max_low_pfn) highstart_pfn = system_max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + (highend_pfn - highstart_pfn) >> (20 - MMUPAGE_SHIFT)); #endif system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages; printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); + system_max_low_pfn >> (20 - MMUPAGE_SHIFT)); printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", min_low_pfn, max_low_pfn, highstart_pfn); @@ -269,6 +267,14 @@ unsigned long __init setup_memory(void) (ulong) pfn_to_kaddr(highstart_pfn)); for (nid = 0; nid < numnodes; nid++) find_max_pfn_node(nid); + printk("vmallocspace = [0x%lx, 0x%lx)\n", + VMALLOC_START, VMALLOC_END); + printk("fixmapspace = [0x%lx, 0x%lx)\n", + FIXADDR_START, FIXADDR_TOP); + printk("MAXMEM = 0x%lx\n", MAXMEM); + for (nid = 0; nid < numnodes; ++nid) + printk("node %d at pfns [0x%lx, 0x%lx)\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); NODE_DATA(0)->bdata = &node0_bdata; @@ -285,21 +291,21 @@ unsigned long __init setup_memory(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. 
*/ - reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); + reserve_bootmem_node(NODE_DATA(0), HIGH_MEMORY, PFN_PHYS(min_low_pfn) + + bootmap_size - HIGH_MEMORY); /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem_node(NODE_DATA(0), 0, PAGE_SIZE); + reserve_bootmem_node(NODE_DATA(0), 0, MMUPAGE_SIZE); /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_bootmem_node(NODE_DATA(0), PAGE_SIZE, PAGE_SIZE); + reserve_bootmem_node(NODE_DATA(0), MMUPAGE_SIZE, MMUPAGE_SIZE); #ifdef CONFIG_ACPI_SLEEP /* @@ -315,7 +321,7 @@ unsigned long __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << PAGE_SHIFT)) { + if (INITRD_START + INITRD_SIZE <= (system_max_low_pfn << MMUPAGE_SHIFT)) { reserve_bootmem_node(NODE_DATA(0), INITRD_START, INITRD_SIZE); initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0; @@ -325,7 +331,7 @@ unsigned long __init setup_memory(void) printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", INITRD_START + INITRD_SIZE, - system_max_low_pfn << PAGE_SHIFT); + system_max_low_pfn << MMUPAGE_SHIFT); initrd_start = 0; } } @@ -358,23 +364,23 @@ void __init zone_sizes_init(void) unsigned long start = node_start_pfn[nid]; unsigned long high = node_end_pfn[nid]; - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> MMUPAGE_SHIFT; if (start > low) { #ifdef CONFIG_HIGHMEM BUG_ON(start > high); - zones_size[ZONE_HIGHMEM] = high - start; + zones_size[ZONE_HIGHMEM] = (high - start) >> PAGE_MMUSHIFT; #endif } else { if (low < max_dma) - zones_size[ZONE_DMA] = low; + zones_size[ZONE_DMA] = low >> PAGE_MMUSHIFT; else { BUG_ON(max_dma > low); BUG_ON(low > high); - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; + zones_size[ZONE_DMA] = max_dma >> PAGE_MMUSHIFT; + zones_size[ZONE_NORMAL] = (low - max_dma) >> PAGE_MMUSHIFT; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; + zones_size[ZONE_HIGHMEM] = (high - low) >> PAGE_MMUSHIFT; #endif } } @@ -414,9 +420,27 @@ void __init set_highmem_pages_init(int b zone_start_pfn = NODE_DATA(nid)->node_zones[ZONE_HIGHMEM].zone_start_pfn; printk("Initializing highpages for node %d\n", nid); + + /* + * Note: zone->spanned_pages is in PAGE_SIZE units. 
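As the note just above says, pfns and node boundaries stay in 4 KiB MMU-page units while zone sizes and mem_map indices count clustered PAGE_SIZE pages; that is why zone_sizes_init() above shifts by PAGE_MMUSHIFT and the highmem init below cross-checks pfn == zone_start_pfn + PAGE_MMUCOUNT*node_pfn. A worked example with made-up numbers (PAGE_CLUSTER=3, 896 MiB of lowmem) follows.

/*
 * Unit bookkeeping assumed by the zone sizing above: pfns count 4 KiB
 * MMU pages, zone sizes count PAGE_SIZE software pages.  All values
 * here are made up for illustration.
 */
#include <stdio.h>

#define PAGE_MMUSHIFT	3
#define PAGE_MMUCOUNT	(1UL << PAGE_MMUSHIFT)

int main(void)
{
	unsigned long max_low_pfn    = 0x38000;	/* 896 MiB in 4 KiB pfns */
	unsigned long zone_pages     = max_low_pfn >> PAGE_MMUSHIFT;
	unsigned long zone_start_pfn = max_low_pfn;	/* start of a highmem zone */
	unsigned long node_pfn       = 42;		/* index into that zone's mem_map */

	printf("lowmem: %lu clustered pages (expect 0x7000)\n", zone_pages);
	/* pfn backing mem_map[node_pfn], as checked in set_highmem_pages_init() */
	printf("mem_map[%lu] corresponds to pfn %#lx\n",
	       node_pfn, zone_start_pfn + PAGE_MMUCOUNT * node_pfn);
	return 0;
}
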
+ */ for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) { - one_highpage_init((struct page *)(zone_mem_map + node_pfn), - zone_start_pfn + node_pfn, bad_ppro); + if (page_to_pfn(&zone_mem_map[node_pfn]) + != zone_start_pfn + PAGE_MMUCOUNT*node_pfn) { + static int complained_once = 0; + if (!complained_once) { + complained_once = 1; + printk("mismatching page/pfn!!!\n" + "page = %p sees pfn 0x%lx " + "but pfn 0x%lx calculated\n", + &zone_mem_map[node_pfn], + page_to_pfn(&zone_mem_map[node_pfn]), + zone_start_pfn + PAGE_MMUCOUNT*node_pfn); + } + } + one_highpage_init(&zone_mem_map[node_pfn], + zone_start_pfn + PAGE_MMUCOUNT*node_pfn, + bad_ppro); } } totalram_pages += totalhigh_pages; diff -prauN linux-2.6.0-test11/arch/i386/mm/fault.c pgcl-2.6.0-test11-1/arch/i386/mm/fault.c --- linux-2.6.0-test11/arch/i386/mm/fault.c 2003-11-26 12:42:46.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/fault.c 2003-11-27 23:00:38.000000000 -0800 @@ -21,6 +21,8 @@ #include /* For unblank_screen() */ #include #include +#include /* for max_low_pfn */ +#include #include #include @@ -211,25 +213,45 @@ asmlinkage void do_invalid_op(struct pt_ */ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) { - struct task_struct *tsk; + struct task_struct *tsk = current; struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; - unsigned long page; int write; siginfo_t info; /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + pr_debug("%d: faulted on %lx, EIP = 0x%lx\n", + tsk->pid, + address, + regs->eip); + /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) local_irq_enable(); - tsk = current; + if (!tsk->pid) + dump_stack(); + mm = tsk->mm; info.si_code = SEGV_MAPERR; - + if (1) { + pgd_t *pgd = mm ? pgd_offset(mm, address) : pgd_offset_k(address); + pmd_t *pmd = pmd_offset(pgd, address); + pr_debug("%d: fault handled by PGD at vaddr %p, %Lx\n", + current->pid, pgd, (u64)pgd_val(*pgd)); + pr_debug("%d: fault handled by PMD at vaddr %p, %Lx\n", + current->pid, pmd, (u64)pmd_val(*pmd)); + if (pmd_present(*pmd)) { + pr_debug("%d: fault will be handled by PTE at paddr %Lx\n", + current->pid, + (u64)(pmd_val(*pmd) & MMUPAGE_MASK) + + pte_index(address)*sizeof(pte_t)); + } else + pr_debug("pmd not present\n"); + } /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. @@ -262,15 +284,20 @@ asmlinkage void do_page_fault(struct pt_ if (in_atomic() || !mm) goto bad_area_nosemaphore; + pr_debug("%d: about to down_read(&mm->mmap_sem)\n", current->pid); down_read(&mm->mmap_sem); vma = find_vma(mm, address); - if (!vma) + if (!vma) { + pr_debug("no vma, goto bad_area\n"); goto bad_area; + } if (vma->vm_start <= address) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!(vma->vm_flags & VM_GROWSDOWN)) { + pr_debug("VM_GROWSDOWN not in vma->vm_flags, goto bad_area\n"); goto bad_area; + } if (error_code & 4) { /* * accessing the stack below %esp is always a bug. @@ -278,11 +305,15 @@ asmlinkage void do_page_fault(struct pt_ * pusha) doing post-decrement on the stack and that * doesn't show up until later.. 
*/ - if (address + 32 < regs->esp) + if (address + 32 < regs->esp) { + pr_debug("postdecrement on stack, goto bad_area\n"); goto bad_area; } - if (expand_stack(vma, address)) + } + if (expand_stack(vma, address)) { + pr_debug("expand_stack() failed, goto bad_area\n"); goto bad_area; + } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. @@ -298,16 +329,21 @@ good_area: #endif /* fall through */ case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) + if (!(vma->vm_flags & VM_WRITE)) { + pr_debug("vma not writable, goto bad_area\n"); goto bad_area; + } write++; break; case 1: /* read, present */ + pr_debug("NFI what happened, goto bad_area\n"); goto bad_area; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) { + pr_debug("vma not read/exec, goto bad_area\n"); goto bad_area; } + } survive: /* @@ -334,7 +370,7 @@ good_area: * Did it hit the DOS screen memory VA from vm86 mode? */ if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + unsigned long bit = (address - 0xA0000) >> MMUPAGE_SHIFT; if (bit < 32) tsk->thread.screen_bitmap |= 1 << bit; } @@ -351,6 +387,56 @@ bad_area: bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { +#if 1 || defined(DEBUG) + printk("user mode SIGSEGV, pid = %d, comm = %16s, EIP = %p, ESP = %p, CR2 = %p\n", + current->pid, current->comm, (void *)regs->eip, (void *)regs->esp, (void *)address); + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long addr; + + printk("vma = [%lx,%lx) prot=%lx flags=%lx\n", + vma->vm_start, vma->vm_end, + vma->vm_page_prot.pgprot, vma->vm_flags); + + for (addr = vma->vm_start; addr < vma->vm_end; addr += MMUPAGE_SIZE) { + pgd_t *pgd = pgd_offset(mm, addr); + pmd_t *pmd; + pte_t *pte; + struct page *page; + void *mem; + + if (pgd_none(*pgd) || pgd_bad(*pgd)) + continue; + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + continue; + + pte = pte_offset_map(pmd, addr); + if (pte_none(*pte)) { + pte_unmap(pte); + continue; + } + + printk("pte for 0x%lx = 0x%Lx\n", + addr, + (u64)pte_val(*pte)); + + if (!pte_present(*pte) || + !pfn_valid(pte_pfn(*pte))) { + pte_unmap(pte); + continue; + } + page = pte_page(*pte); + mem = kmap_atomic(page, KM_USER0); + if (!memcmp(mem, page_address(ZERO_PAGE(0)), PAGE_SIZE)) + printk("page at 0x%lx zero!\n", addr); + kunmap_atomic(mem, KM_USER0); + pte_unmap(pte); + } + } + spin_unlock(&mm->page_table_lock); +#endif /* DEBUG */ /* * Valid to do another page fault here because this one came * from user space. @@ -405,30 +491,53 @@ no_context: bust_spinlocks(1); - if (address < PAGE_SIZE) + if (address < MMUPAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); printk(" printing eip:\n"); printk("%08lx\n", regs->eip); - asm("movl %%cr3,%0":"=r" (page)); - page = ((unsigned long *) __va(page))[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); - /* - * We must not directly access the pte in the highpte - * case, the page table might be allocated in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. 
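The fault-handler diagnostics in this file walk the page tables by hand (pgd_offset, pmd_offset, pte_offset_*), and under page clustering the pte index is taken at MMUPAGE granularity rather than PAGE granularity, which is why the earlier debug dump multiplies pte_index(address) by sizeof(pte_t) against the pmd's physical address. A toy example of the index arithmetic for the classic non-PAE i386 layout follows (values made up; the rewritten no_context dump below also handles PAE, where PTRS_PER_PMD > 1).

/*
 * Index arithmetic behind the pgd/pte walk in the fault-path dumps.
 * Non-PAE i386 layout assumed; purely illustrative.
 */
#include <stdio.h>

#define MMUPAGE_SHIFT	12
#define PGDIR_SHIFT	22
#define PTRS_PER_PTE	1024

int main(void)
{
	unsigned long address = 0xc0123456UL;

	unsigned long pgd_slot = address >> PGDIR_SHIFT;
	unsigned long pte_slot = (address >> MMUPAGE_SHIFT) & (PTRS_PER_PTE - 1);

	printf("address %#lx: pgd slot %lu, pte slot %lu, in-page offset %#lx\n",
	       address, pgd_slot, pte_slot,
	       address & ((1UL << MMUPAGE_SHIFT) - 1));
	return 0;
}
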
- */ -#ifndef CONFIG_HIGHPTE - if (page & 1) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); + { + unsigned long cr3; + pgd_t *pgd; + pmd_t *pmd; + char *fmt; + + asm("movl %%cr3,%0":"=r" (cr3)); + cr3 &= ~0x1f; /* lower 5 bits of %cr3 are flags */ + /* pgd's in lowmem, but only need to be < 4G (32-bit %cr3) */ + pgd = (pgd_t *)__va(cr3); + fmt = PTRS_PER_PMD > 1 ? KERN_ALERT "*pdpte = %Lx\n" : NULL; + if (PTRS_PER_PMD > 1) + printk(fmt, pgd_val(*pgd)); + + /* pmd's in lowmem, but can be anywhere (64-bit PDPTE) */ + pmd = pmd_offset(pgd, address); + if (PTRS_PER_PMD > 1) + fmt = KERN_ALERT "*pde = %Lx\n"; + else + fmt = KERN_ALERT "*pde = %08lx\n"; + printk(fmt, pmd_val(*pmd)); + + /* + * this is getting at what are potentially user + * PTE's with pte_offset_kernel(); it's mostly + * unsafe to try editing kernel PTE's at this + * point for kmap_atomic() so just drop out of it + * if pmd_val(*pmd)/MMUPAGE_SIZE > max_low_pfn + */ + + if (pmd_present(*pmd) && !pmd_large(*pmd) + && pmd_val(*pmd)/MMUPAGE_SIZE <= max_low_pfn) { + pte_t *pte = pte_offset_kernel(pmd, address); + if (PTRS_PER_PMD > 1) + fmt = KERN_ALERT "*pte = %Lx\n"; + else + fmt = KERN_ALERT "*pte = %08lx\n"; + printk(fmt, pte_val(*pte)); + } } -#endif die("Oops", regs, error_code); bust_spinlocks(0); do_exit(SIGKILL); @@ -440,6 +549,8 @@ no_context: out_of_memory: up_read(&mm->mmap_sem); if (tsk->pid == 1) { + printk("/sbin/init is OOM?\n"); + dump_stack(); yield(); down_read(&mm->mmap_sem); goto survive; @@ -460,6 +571,7 @@ do_sigbus: if (is_prefetch(regs, address)) return; + printk("sending SIGBUS\n"); tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -484,12 +596,22 @@ vmalloc_fault: pmd_t *pmd, *pmd_k; pte_t *pte_k; + if (address >= FIXADDR_START) { + printk(KERN_CRIT "took fault on fixmapspace 0x%lx!\n", + address); + goto no_context; + } + + printk("took vmalloc fault on address %lx\n", address); + asm("movl %%cr3,%0":"=r" (pgd)); pgd = index + (pgd_t *)__va(pgd); pgd_k = init_mm.pgd + index; - if (!pgd_present(*pgd_k)) + if (!pgd_present(*pgd_k)) { + printk(KERN_CRIT "missing pgd in vmalloc fault!\n"); goto no_context; + } /* * set_pgd(pgd, *pgd_k); here would be useless on PAE @@ -498,13 +620,17 @@ vmalloc_fault: pmd = pmd_offset(pgd, address); pmd_k = pmd_offset(pgd_k, address); - if (!pmd_present(*pmd_k)) + if (!pmd_present(*pmd_k)) { + printk(KERN_CRIT "missing pmd in vmalloc fault!\n"); goto no_context; + } set_pmd(pmd, *pmd_k); pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) + if (!pte_present(*pte_k)) { + printk(KERN_CRIT "missing pte in vmalloc fault!\n"); goto no_context; + } return; } } diff -prauN linux-2.6.0-test11/arch/i386/mm/highmem.c pgcl-2.6.0-test11-1/arch/i386/mm/highmem.c --- linux-2.6.0-test11/arch/i386/mm/highmem.c 2003-11-26 12:44:16.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/highmem.c 2003-11-27 21:55:16.000000000 -0800 @@ -1,4 +1,9 @@ #include +#include + +/* + * XXX: resurrect kmap_pte + */ void *kmap(struct page *page) { @@ -28,59 +33,139 @@ void kunmap(struct page *page) void *kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; - unsigned long vaddr; + unsigned long addr, vaddr, pfn; + int k; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; inc_preempt_count(); if (page < highmem_start_page) return page_address(page); idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = 
__fix_to_virt(FIX_KMAP_BEGIN + idx); + vaddr = __fix_to_virt(FIX_KMAP_END) + PAGE_SIZE*idx; + BUG_ON(vaddr > __fix_to_virt(FIX_KMAP_BEGIN)); + BUG_ON(vaddr < __fix_to_virt(FIX_KMAP_END)); + + pfn = page_to_pfn(page); + + addr = vaddr; + pgd = pgd_offset_k(addr); + pmd = pmd_offset(pgd, addr); + + /* barf on highmem-allocated pagetables */ + BUG_ON((pmd_val(*pmd) >> MMUPAGE_SHIFT) >= max_low_pfn); + + pte = pte_offset_kernel(pmd, addr); + + for (k = 0; k < PAGE_MMUCOUNT; ++k, addr += MMUPAGE_SIZE) { #ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(pte[k])); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); + BUG_ON(addr < vaddr); + BUG_ON(addr - vaddr >= PAGE_SIZE); + BUG_ON(!pfn_valid(pfn + k)); + if (pte_pfn(pte[k]) == pfn + k) + continue; - return (void*) vaddr; + set_pte(&pte[k], pfn_pte(pfn + k, kmap_prot)); + __flush_tlb_one(addr); + } + return (void *)vaddr; } +#ifdef CONFIG_DEBUG_HIGHMEM void kunmap_atomic(void *kvaddr, enum km_type type) { -#ifdef CONFIG_DEBUG_HIGHMEM - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + unsigned long vaddr = (unsigned long)kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + unsigned long lo, hi; + int k; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; if (vaddr < FIXADDR_START) { // FIXME dec_preempt_count(); return; } - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); + lo = __fix_to_virt(FIX_KMAP_END) + PAGE_SIZE*idx; + hi = lo + PAGE_SIZE; + + BUG_ON(vaddr < lo || vaddr > hi); /* * force other mappings to Oops if they'll try to access * this pte without first remap it */ - pte_clear(kmap_pte-idx); - __flush_tlb_one(vaddr); -#endif + pgd = pgd_offset_k(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + for (k = 0; k < PAGE_MMUCOUNT; ++k, vaddr += MMUPAGE_SIZE) { + pte_clear(&pte[k]); + __flush_tlb_one(vaddr); + } dec_preempt_count(); } +#endif -struct page *kmap_atomic_to_page(void *ptr) +unsigned long kmap_atomic_to_pfn(void *ptr) { - unsigned long idx, vaddr = (unsigned long)ptr; + unsigned long vaddr = (unsigned long)ptr; + pgd_t *pgd; + pmd_t *pmd; pte_t *pte; if (vaddr < FIXADDR_START) - return virt_to_page(ptr); + return __pa(vaddr)/MMUPAGE_SIZE; - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); + pgd = pgd_offset_k(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + + /* + * unsigned long idx = virt_to_fix(vaddr); + * pte = &kmap_pte[idx*PAGE_MMUCOUNT]; + */ + return pte_pfn(*pte); } +void kmap_atomic_sg(pte_t *ptes[], pte_addr_t paddrs[], enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr, base; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int k; + + inc_preempt_count(); + idx = type + KM_TYPE_NR*smp_processor_id(); + base = vaddr = __fix_to_virt(FIX_KMAP_END) + PAGE_SIZE*idx; + BUG_ON(vaddr > __fix_to_virt(FIX_KMAP_BEGIN)); + BUG_ON(vaddr < __fix_to_virt(FIX_KMAP_END)); + + pgd = pgd_offset_k(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + for (k = 0; k < PAGE_MMUCOUNT; ++k, vaddr += MMUPAGE_SIZE) { + unsigned long pfn = paddrs[k]/MMUPAGE_SIZE; + + if (!paddrs[k]) { + ptes[k] = NULL; + continue; + } + ptes[k] = (pte_t *)(vaddr + ((u32)paddrs[k] & ~MMUPAGE_MASK)); + BUG_ON(!pfn_valid(pfn)); + BUG_ON((u32)ptes[k] < base); + BUG_ON((u32)ptes[k] - base >= PAGE_SIZE); + + if (pte_pfn(pte[k]) != pfn) { + set_pte(&pte[k], pfn_pte(pfn, kmap_prot)); + 
__flush_tlb_one(vaddr); + } + } +} diff -prauN linux-2.6.0-test11/arch/i386/mm/init.c pgcl-2.6.0-test11-1/arch/i386/mm/init.c --- linux-2.6.0-test11/arch/i386/mm/init.c 2003-11-26 12:45:05.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/init.c 2003-11-27 22:10:55.000000000 -0800 @@ -40,8 +40,8 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; +struct page *zero_page; static int do_test_wp_bit(void); @@ -55,7 +55,7 @@ static pmd_t * __init one_md_table_init( pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *) alloc_bootmem_low_pages(MMUPAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); if (pmd_table != pmd_offset(pgd, 0)) BUG(); @@ -73,7 +73,7 @@ static pmd_t * __init one_md_table_init( static pte_t * __init one_page_table_init(pmd_t *pmd) { if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(MMUPAGE_SIZE); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); if (page_table != pte_offset_kernel(pmd, 0)) BUG(); @@ -94,6 +94,12 @@ static pte_t * __init one_page_table_ini * NOTE: The pagetables are allocated contiguous on the physical space * so we can cache the place of the first one and move around without * checking the pgd every time. + * + * Something happened here and I'm not sure what. This might back the + * thing out (I think). I think it was just a rename so I won't care + * unless it burns me. + * + * -- wli */ static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { @@ -110,7 +116,14 @@ static void __init page_table_range_init for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { if (pgd_none(*pgd)) one_md_table_init(pgd); + } + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = pmd_offset(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) @@ -168,6 +181,11 @@ static inline int page_kills_ppro(unsign static inline int page_is_ram(unsigned long pagenr) { int i; + unsigned long begin_pfn, end_pfn; + + /* check the whole range of pfn's spanned by the page */ + begin_pfn = pagenr & ~(PAGE_MMUCOUNT - 1); + end_pfn = begin_pfn + PAGE_MMUCOUNT - 1; for (i = 0; i < e820.nr_map; i++) { unsigned long addr, end; @@ -179,52 +197,27 @@ static inline int page_is_ram(unsigned l * are not. Notably the 640->1Mb area. We need a sanity * check here. 
*/ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) + addr = MMUPAGE_ALIGN(e820.map[i].addr)/MMUPAGE_SIZE; + end = (e820.map[i].addr+e820.map[i].size)/MMUPAGE_SIZE; + if (begin_pfn >= addr && end_pfn < end) return 1; } return 0; } #ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); -EXPORT_SYMBOL(kmap_pte); -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) - -void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} +#define kmap_init() do { kmap_prot = PAGE_KERNEL; } while (0) void __init permanent_kmaps_init(pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + page_table_range_init(PKMAP_BASE, PKMAP_BASE + PAGE_SIZE*LAST_PKMAP, pgd_base); } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +void __init one_highpage_init(struct page *page, unsigned long pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); @@ -240,7 +233,7 @@ void __init one_highpage_init(struct pag void __init set_highmem_pages_init(int bad_ppro) { int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) + for (pfn = highstart_pfn; pfn < highend_pfn; pfn += PAGE_MMUCOUNT) one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } @@ -307,6 +300,34 @@ static void __init pagetable_init (void) */ pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; #endif + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long addr = VMALLOC_START; + + do { + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd) || pgd_bad(*pgd)) { + addr += MMUPAGE_SIZE; + continue; + } + do { + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) { + addr += MMUPAGE_SIZE; + continue; + } + do { + pte = pte_offset_kernel(pmd, addr); + if (!pte_none(*pte) || pte_present(*pte)) { + printk("bad vmallocspace PTE at vaddr 0x%lx\n", addr); + } + addr += MMUPAGE_SIZE; + } while (addr < VMALLOC_END); + } while (addr < VMALLOC_END); + } while (addr < VMALLOC_END); + } } void zap_low_mappings (void) @@ -333,17 +354,17 @@ void __init zone_sizes_init(void) unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; unsigned int max_dma, high, low; - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> MMUPAGE_SHIFT; low = max_low_pfn; high = highend_pfn; if (low < max_dma) - zones_size[ZONE_DMA] = low; + zones_size[ZONE_DMA] = low >> PAGE_MMUSHIFT; else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; + zones_size[ZONE_DMA] = max_dma >> PAGE_MMUSHIFT; + zones_size[ZONE_NORMAL] = (low - max_dma) >> PAGE_MMUSHIFT; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; + zones_size[ZONE_HIGHMEM] = (high - low) >> PAGE_MMUSHIFT; #endif } free_area_init(zones_size); @@ -374,7 +395,6 @@ void __init paging_init(void) set_in_cr4(X86_CR4_PAE); #endif __flush_tlb_all(); - kmap_init(); zone_sizes_init(); } @@ -420,6 +440,7 @@ static void __init 
set_max_mapnr_init(vo #else max_mapnr = num_physpages = max_low_pfn; #endif + max_mapnr /= PAGE_MMUCOUNT; } #define __free_all_bootmem() free_all_bootmem() #else @@ -429,11 +450,14 @@ extern void set_max_mapnr_init(void); static struct kcore_list kcore_mem, kcore_vmalloc; +/* + * Most of the reporting here needs doublechecking. + */ void __init mem_init(void) { extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; - int tmp; + int pfn; int bad_ppro; #ifndef CONFIG_DISCONTIGMEM @@ -443,36 +467,30 @@ void __init mem_init(void) bad_ppro = ppro_with_ram_bug(); -#ifdef CONFIG_HIGHMEM - /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); - BUG(); - } -#endif - set_max_mapnr_init(); #ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE); + high_memory = (void *) __va(highstart_pfn * MMUPAGE_SIZE); #else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + high_memory = (void *) __va(max_low_pfn * MMUPAGE_SIZE); #endif - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); + zero_page = alloc_page(GFP_ATOMIC|GFP_DMA); + clear_page(page_address(zero_page)); + SetPageReserved(zero_page); + printk("zero_page at pfn 0x%lx\n", page_to_pfn(zero_page)); + tlb_init(); + totalram_pages--; + reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) + for (pfn = 0; pfn < max_low_pfn; pfn += PAGE_MMUCOUNT) /* * Only count reserved RAM pages */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + if (page_is_ram(pfn) && PageReserved(pfn_to_page(pfn))) reservedpages++; set_highmem_pages_init(bad_ppro); @@ -487,13 +505,36 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), + num_physpages << (MMUPAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10, (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); + printk("MAXMEM=0x%lx\n", MAXMEM); + printk("vmalloc: start = 0x%lx, end = 0x%lx\n", + VMALLOC_START, VMALLOC_END); + printk("fixaddr: start = 0x%lx, end = 0x%lx\n", + FIXADDR_START, FIXADDR_TOP); + +#ifdef CONFIG_HIGHMEM + printk("FIX_KMAP_END == %lx\n", __fix_to_virt(FIX_KMAP_END)); + if (__fix_to_virt(FIX_KMAP_END) % PAGE_SIZE) + printk(KERN_CRIT "kmap_atomic() area misaligned!\n"); + + printk("FIX_KMAP_BEGIN == %lx\n", __fix_to_virt(FIX_KMAP_BEGIN)); + if ((__fix_to_virt(FIX_KMAP_BEGIN) + MMUPAGE_SIZE) % PAGE_SIZE) + printk(KERN_CRIT "kmap_atomic() area misaligned!\n"); + + printk("FIX_PKMAP_END == %lx\n", __fix_to_virt(FIX_PKMAP_END)); + if (__fix_to_virt(FIX_PKMAP_END) % PAGE_SIZE) + printk(KERN_CRIT "kmap() area misaligned!\n"); + + printk("FIX_PKMAP_BEGIN == %lx\n", __fix_to_virt(FIX_PKMAP_BEGIN)); + if ((__fix_to_virt(FIX_PKMAP_BEGIN) + MMUPAGE_SIZE) % PAGE_SIZE) + printk(KERN_CRIT "kmap() area misaligned!\n"); +#endif #ifdef CONFIG_X86_PAE if (!cpu_has_pae) @@ -567,28 +608,43 @@ static int do_test_wp_bit(void) void free_initmem(void) { - unsigned long addr; + unsigned long addr, freed = 0; addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned 
long)(&__init_end); addr += PAGE_SIZE) { + addr = (addr + PAGE_SIZE - 1) & PAGE_MASK; + while (addr < (((unsigned long)(&__init_end)) & PAGE_MASK)) { ClearPageReserved(virt_to_page(addr)); set_page_count(virt_to_page(addr), 1); free_page(addr); totalram_pages++; + freed++; + addr += PAGE_SIZE; } - printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); + printk (KERN_INFO "Freeing unused kernel memory: %luk freed\n", + freed*(PAGE_SIZE/1024)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < end) - printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { + unsigned long freed = 0; + + start = (start + PAGE_SIZE - 1) & PAGE_MASK; + end &= PAGE_MASK; + + if (start >= end) + return; + + while (start < end) { ClearPageReserved(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); + freed++; totalram_pages++; + start += PAGE_SIZE; } + + printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", + freed*(PAGE_SIZE/1024)); } #endif diff -prauN linux-2.6.0-test11/arch/i386/mm/ioremap.c pgcl-2.6.0-test11-1/arch/i386/mm/ioremap.c --- linux-2.6.0-test11/arch/i386/mm/ioremap.c 2003-11-26 12:43:26.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/ioremap.c 2003-11-27 22:13:57.000000000 -0800 @@ -30,7 +30,7 @@ static inline void remap_area_pte(pte_t end = PMD_SIZE; if (address >= end) BUG(); - pfn = phys_addr >> PAGE_SHIFT; + pfn = phys_addr >> MMUPAGE_SHIFT; do { if (!pte_none(*pte)) { printk("remap_area_pte: page already exists\n"); @@ -38,7 +38,7 @@ static inline void remap_area_pte(pte_t } set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | flags))); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pfn++; pte++; } while (address && (address < end)); @@ -146,9 +146,9 @@ void * __ioremap(unsigned long phys_addr /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; + offset = phys_addr & ~MMUPAGE_MASK; + phys_addr &= MMUPAGE_MASK; + size = MMUPAGE_ALIGN(last_addr+1) - phys_addr; /* * Ok, go for it.. @@ -202,15 +202,15 @@ void *ioremap_nocache (unsigned long phy struct page *ppage = virt_to_page(__va(phys_addr)); unsigned long npages; - phys_addr &= PAGE_MASK; + phys_addr &= MMUPAGE_MASK; /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); + last_addr = MMUPAGE_ALIGN(last_addr); /* .. but that's ok, because modulo-2**n arithmetic will make * the page-aligned "last - first" come out right. 
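A worked example of the rounding the rewritten free_initmem() above now does, with hypothetical section boundaries: only whole PAGE_SIZE units between a rounded-up __init_begin and a rounded-down __init_end are given back, so the partial clustered pages at either end stay reserved.

#include <stdio.h>

#define PAGE_SHIFT	15	/* illustrative: 32KB pages, PAGE_CLUSTER=3 */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	/* hypothetical link-time bounds of the .init section */
	unsigned long init_begin = 0xc0352000, init_end = 0xc03a6000;
	unsigned long addr = (init_begin + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = init_end & PAGE_MASK;
	unsigned long freed = 0;

	for (; addr < end; addr += PAGE_SIZE)
		freed++;	/* one whole clustered page freed */

	printf("Freeing unused kernel memory: %luk freed\n",
		freed * (PAGE_SIZE / 1024));
	return 0;
}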
*/ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; + npages = (last_addr - phys_addr)/MMUPAGE_SIZE; if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { iounmap(p); @@ -227,7 +227,7 @@ void iounmap(void *addr) struct vm_struct *p; if (addr <= high_memory) return; - p = remove_vm_area((void *) (PAGE_MASK & (unsigned long) addr)); + p = remove_vm_area((void *) (MMUPAGE_MASK & (unsigned long) addr)); if (!p) { printk("__iounmap: bad address %p\n", addr); return; @@ -235,7 +235,7 @@ void iounmap(void *addr) if (p->flags && p->phys_addr < virt_to_phys(high_memory)) { change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, + p->size >> MMUPAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); } @@ -262,14 +262,14 @@ void __init *bt_ioremap(unsigned long ph /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; + offset = phys_addr & ~MMUPAGE_MASK; + phys_addr &= MMUPAGE_MASK; + size = MMUPAGE_ALIGN(last_addr) - phys_addr; /* * Mappings have to fit in the FIX_BTMAP area. */ - nrpages = size >> PAGE_SHIFT; + nrpages = size >> MMUPAGE_SHIFT; if (nrpages > NR_FIX_BTMAPS) return NULL; @@ -279,7 +279,7 @@ void __init *bt_ioremap(unsigned long ph idx = FIX_BTMAP_BEGIN; while (nrpages > 0) { set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; + phys_addr += MMUPAGE_SIZE; --idx; --nrpages; } @@ -296,8 +296,8 @@ void __init bt_iounmap(void *addr, unsig virt_addr = (unsigned long)addr; if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + offset = virt_addr & ~MMUPAGE_MASK; + nrpages = MMUPAGE_ALIGN(offset + size - 1) >> MMUPAGE_SHIFT; idx = FIX_BTMAP_BEGIN; while (nrpages > 0) { diff -prauN linux-2.6.0-test11/arch/i386/mm/pageattr.c pgcl-2.6.0-test11-1/arch/i386/mm/pageattr.c --- linux-2.6.0-test11/arch/i386/mm/pageattr.c 2003-11-26 12:43:41.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/pageattr.c 2003-11-27 21:55:16.000000000 -0800 @@ -47,8 +47,8 @@ static struct page *split_large_page(uns address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + for (i = 0; i < PTRS_PER_PTE; i++, addr += MMUPAGE_SIZE) { + pbase[i] = pfn_pte(addr/MMUPAGE_SIZE, addr == address ? 
prot : PAGE_KERNEL); } return base; @@ -76,11 +76,16 @@ static void set_pmd_pte(pte_t *kpte, uns spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pmd_t *pmd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - pmd = pmd_offset(pgd, address); - set_pte_atomic((pte_t *)pmd, pte); + int k; + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + pgd_t *pgd; + pmd_t *pmd; + pgd = (pgd_t *)page_address(page) + + PTRS_PER_PGD * k + + pgd_index(address); + pmd = pmd_offset(pgd, address); + set_pte_atomic((pte_t *)pmd, pte); + } } spin_unlock_irqrestore(&pgd_lock, flags); } @@ -94,7 +99,7 @@ static inline void revert_page(struct pa pte_t *linear = (pte_t *) pmd_offset(pgd_offset(&init_mm, address), address); set_pmd_pte(linear, address, - pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, + pfn_pte((__pa(address) & LARGE_PAGE_MASK)/MMUPAGE_SIZE, PAGE_KERNEL_LARGE)); } @@ -106,15 +111,14 @@ __change_page_attr(struct page *page, pg struct page *kpte_page; #ifdef CONFIG_HIGHMEM - if (page >= highmem_start_page) - BUG(); + BUG_ON(page >= highmem_start_page); #endif address = (unsigned long)page_address(page); kpte = lookup_address(address); if (!kpte) return -EINVAL; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + kpte_page = virt_to_page(((unsigned long)kpte) & MMUPAGE_MASK); if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { if ((pte_val(*kpte) & _PAGE_PSE) == 0) { pte_t old = *kpte; @@ -165,6 +169,8 @@ int change_page_attr(struct page *page, int i; unsigned long flags; + numpages = (numpages + PAGE_MMUCOUNT - 1) & ~(PAGE_MMUCOUNT - 1); + spin_lock_irqsave(&cpa_lock, flags); for (i = 0; i < numpages; i++, page++) { err = __change_page_attr(page, prot); diff -prauN linux-2.6.0-test11/arch/i386/mm/pgtable.c pgcl-2.6.0-test11-1/arch/i386/mm/pgtable.c --- linux-2.6.0-test11/arch/i386/mm/pgtable.c 2003-11-26 12:46:12.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/pgtable.c 2003-11-27 21:55:16.000000000 -0800 @@ -127,7 +127,7 @@ void __set_fixmap (enum fixed_addresses BUG(); return; } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + set_pte_pfn(address, phys >> MMUPAGE_SHIFT, flags); } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -138,20 +138,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; -} - void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) { memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); @@ -188,7 +174,19 @@ void pgd_ctor(void *pgd, kmem_cache_t *c if (PTRS_PER_PMD > 1) return; - list_add(&virt_to_page(pgd)->lru, &pgd_list); + /* + * When allocated, a page has a reference count of 1. + * This increases it fromthat to 2 on the first pgd_ctor() + * call to any part of a page. 
+ */ + if (PAGE_MMUCOUNT == 1) + list_add(&virt_to_page(pgd)->lru, &pgd_list); + else { + struct page *page = virt_to_page(pgd); + atomic_inc(&page->count); + if (atomic_read(&page->count) == 2) + list_add(&page->lru, &pgd_list); + } spin_unlock_irqrestore(&pgd_lock, flags); memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); } @@ -199,7 +197,14 @@ void pgd_dtor(void *pgd, kmem_cache_t *c unsigned long flags; /* can be called from interrupt context */ spin_lock_irqsave(&pgd_lock, flags); - list_del(&virt_to_page(pgd)->lru); + if (PAGE_MMUCOUNT == 1) + list_del(&virt_to_page(pgd)->lru); + else { + struct page *page = virt_to_page(pgd); + atomic_dec(&page->count); + if (atomic_read(&page->count) == 1) + list_del(&page->lru); + } spin_unlock_irqrestore(&pgd_lock, flags); } diff -prauN linux-2.6.0-test11/arch/i386/mm/tlb.c pgcl-2.6.0-test11-1/arch/i386/mm/tlb.c --- linux-2.6.0-test11/arch/i386/mm/tlb.c 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/mm/tlb.c 2003-11-27 21:55:16.000000000 -0800 @@ -0,0 +1,133 @@ +/* + * arch/i386/mm/tlb.c + * (C) June 2003 William Irwin, IBM + * Routines for pagetable caching and release. + */ +#include +#include +#include +#include +#include + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +void tlb_init(void) +{ + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} + +/* + * When an mmu_gather fills, we must flush the entire mm, in no + * small part because whole-mm flushes are the sole bulk TLB + * invalidation primitive on i386. + */ +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int count, zone = 0; + while (tlb->nr_pte_ready >= NR_PTE) { + BUG_ON(zone >= MAX_ZONE_ID); + if (!list_empty(&tlb->ready_list[zone])) { + BUG_ON(!zone_table[zone]); + free_pages_bulk(zone_table[zone], + tlb->ready_count[zone], + &tlb->ready_list[zone], + 0); + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + BUG_ON(tlb->nr_pte_ready < 0); + BUG_ON(!list_empty(&tlb->ready_list[zone])); + } + zone++; + } + for (count = 0; zone < MAX_ZONE_ID; ++zone) { + BUG_ON(tlb->ready_count[zone] < 0); + count += tlb->ready_count[zone]; + } + BUG_ON(count != tlb->nr_pte_ready); +} + +/* + * oddly declared in pgalloc.h; in general these are TLB-related pmd + * and pte twiddlings.
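The pgd_ctor()/pgd_dtor() changes above keep a pgd page on pgd_list only while at least one pgd inside it is live, using the page reference count as the membership count. A user-space model of that first-constructor-in, last-destructor-out pattern (the struct below is a stand-in, not a kernel type):

#include <assert.h>

/* Model of the bookkeeping: the backing page starts with a reference
 * count of 1, each live pgd adds one, and list membership tracks the
 * 1 -> 2 and 2 -> 1 transitions. */
struct pgd_page_model {
	int count;	/* stands in for page->count */
	int on_list;	/* stands in for lru membership in pgd_list */
};

static void model_pgd_ctor(struct pgd_page_model *p)
{
	if (++p->count == 2)	/* first pgd in this page */
		p->on_list = 1;
}

static void model_pgd_dtor(struct pgd_page_model *p)
{
	if (--p->count == 1)	/* last pgd in this page gone */
		p->on_list = 0;
}

int main(void)
{
	struct pgd_page_model page = { .count = 1, .on_list = 0 };

	model_pgd_ctor(&page);	/* pgd #0 */
	model_pgd_ctor(&page);	/* pgd #1 shares the page */
	assert(page.on_list);
	model_pgd_dtor(&page);
	assert(page.on_list);	/* one pgd still live */
	model_pgd_dtor(&page);
	assert(!page.on_list);
	return 0;
}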
+ */ +void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page) +{ + unsigned long pfn, pmd_off = (unsigned long)pmd; + int k; + + pmd_off = (pmd_off/sizeof(pmd_t)) % PAGE_MMUCOUNT; + pfn = page_to_pfn(page); + pmd -= pmd_off; + + if (PAGE_MMUCOUNT > 1) { + struct page *old_page = NULL; + + if (atomic_read(&page->count) != 1) { + WARN_ON(1); + printk(KERN_DEBUG "bad pte refcount = %d\n", + atomic_read(&page->count)); + } + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (pmd_present(pmd[k]) || !pmd_none(pmd[k])) { + if (old_page) + WARN_ON(old_page != pmd_page(pmd[k])); + else + old_page = pmd_page(pmd[k]); + } + } + + if (!old_page || old_page == page) + atomic_set(&page->count, PAGE_MMUCOUNT); + else { + /* + * old_page->index can legitimately be 0 + * but something's corrupt if it's mapping's wrong + */ + BUG_ON((struct mm_struct *)old_page->mapping != mm); + + /* + * errant callers can potentially do things + * out-of-order + */ + WARN_ON((struct mm_struct *)page->mapping != mm); + /* if (old_page->mapping != mm) + pgtable_add_rmap(page, mm, page->index); */ + pgtable_remove_rmap(page); + put_page(page); + atomic_set(&old_page->count, PAGE_MMUCOUNT); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long long pmdval; + pmdval = page_to_pfn(old_page) + k; + pmdval <<= MMUPAGE_SHIFT; + if (pmd_present(pmd[k]) || !pmd_none(pmd[k])) { + WARN_ON(old_page != pmd_page(pmd[k])); + continue; + } else + set_pmd(&pmd[k], __pmd(_PAGE_TABLE + pmdval)); + } + return; + } + } + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long long pmdval; + pmdval = (unsigned long long)(pfn + k) << MMUPAGE_SHIFT; + if (likely(pmd_none(pmd[k]) || !pmd_present(pmd[k]))) + set_pmd(&pmd[k], __pmd(_PAGE_TABLE + pmdval)); + else { + WARN_ON(1); + printk(KERN_DEBUG "pmdval=%Lx\n", (u64)pmd_val(pmd[k])); + put_page(page); /* a reference will be omitted */ + } + } +} diff -prauN linux-2.6.0-test11/arch/i386/pci/i386.c pgcl-2.6.0-test11-1/arch/i386/pci/i386.c --- linux-2.6.0-test11/arch/i386/pci/i386.c 2003-11-26 12:43:25.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/pci/i386.c 2003-11-27 21:55:16.000000000 -0800 @@ -291,7 +291,7 @@ int pci_mmap_page_range(struct pci_dev * /* Write-combine setting is ignored, it is changed via the mtrr * interfaces on this platform. 
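pmd_populate() above parcels one pte page out across PAGE_MMUCOUNT consecutive pmd slots, one MMUPAGE_SIZE slice per slot. A small worked example of the pmdval computation with an illustrative starting pfn; 0x067 is quoted as i386's _PAGE_TABLE bits (present, rw, user, accessed, dirty) as defined at the time:

#include <stdio.h>

#define MMUPAGE_SHIFT	12
#define PAGE_MMUCOUNT	8UL		/* illustrative: PAGE_CLUSTER=3 */
#define _PAGE_TABLE	0x067ULL	/* i386 present|rw|user|accessed|dirty */

int main(void)
{
	unsigned long pfn = 0x5678;	/* first mmupage of one pte page */
	unsigned long k;

	/* Each pmd slot maps one 4KB slice of the same pte page, exactly
	 * as the final loop of pmd_populate() computes pmdval. */
	for (k = 0; k < PAGE_MMUCOUNT; ++k) {
		unsigned long long pmdval =
			((unsigned long long)(pfn + k) << MMUPAGE_SHIFT)
			+ _PAGE_TABLE;
		printf("pmd[%lu] = %#llx\n", k, pmdval);
	}
	return 0;
}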
*/ - if (remap_page_range(vma, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, + if (remap_page_range(vma, vma->vm_start, vma->vm_pgoff << MMUPAGE_SHIFT, vma->vm_end - vma->vm_start, vma->vm_page_prot)) return -EAGAIN; diff -prauN linux-2.6.0-test11/arch/i386/pci/numa.c pgcl-2.6.0-test11-1/arch/i386/pci/numa.c --- linux-2.6.0-test11/arch/i386/pci/numa.c 2003-11-26 12:43:24.000000000 -0800 +++ pgcl-2.6.0-test11-1/arch/i386/pci/numa.c 2003-11-27 21:55:16.000000000 -0800 @@ -115,7 +115,7 @@ static int __init pci_numa_init(void) return 0; pci_root_bus = pcibios_scan_root(0); - if (numnodes > 1) { + if (0 && numnodes > 1) { for (quad = 1; quad < numnodes; ++quad) { printk("Scanning PCI bus %d for quad %d\n", QUADLOCAL2BUS(quad,0), quad); diff -prauN linux-2.6.0-test11/drivers/block/ll_rw_blk.c pgcl-2.6.0-test11-1/drivers/block/ll_rw_blk.c --- linux-2.6.0-test11/drivers/block/ll_rw_blk.c 2003-11-26 12:42:58.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/block/ll_rw_blk.c 2003-11-27 21:55:16.000000000 -0800 @@ -258,7 +258,7 @@ EXPORT_SYMBOL(blk_queue_make_request); **/ void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) { - unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; + unsigned long bounce_pfn = dma_addr >> MMUPAGE_SHIFT; unsigned long mb = dma_addr >> 20; static request_queue_t *last_q; diff -prauN linux-2.6.0-test11/drivers/char/Makefile pgcl-2.6.0-test11-1/drivers/char/Makefile --- linux-2.6.0-test11/drivers/char/Makefile 2003-11-26 12:45:39.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/char/Makefile 2003-11-27 21:57:19.000000000 -0800 @@ -7,7 +7,7 @@ # FONTMAPFILE = cp437.uni -obj-y += mem.o random.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o +obj-y += mem.o random.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o early_consoles.o obj-$(CONFIG_VT) += vt_ioctl.o vc_screen.o consolemap.o \ consolemap_deftbl.o selection.o keyboard.o diff -prauN linux-2.6.0-test11/drivers/char/agp/backend.c pgcl-2.6.0-test11-1/drivers/char/agp/backend.c --- linux-2.6.0-test11/drivers/char/agp/backend.c 2003-11-26 12:44:51.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/char/agp/backend.c 2003-11-27 21:55:16.000000000 -0800 @@ -106,10 +106,10 @@ static int agp_find_max(void) { long memory, index, result; -#if PAGE_SHIFT < 20 - memory = num_physpages >> (20 - PAGE_SHIFT); +#if MMUPAGE_SHIFT < 20 + memory = num_physpages >> (20 - MMUPAGE_SHIFT); #else - memory = num_physpages << (PAGE_SHIFT - 20); + memory = num_physpages << (MMUPAGE_SHIFT - 20); #endif index = 1; @@ -122,7 +122,7 @@ static int agp_find_max(void) (maxes_table[index].mem - maxes_table[index - 1].mem); printk(KERN_INFO PFX "Maximum main memory to use for agp memory: %ldM\n", result); - result = result << (20 - PAGE_SHIFT); + result = result << (20 - MMUPAGE_SHIFT); return result; } @@ -161,7 +161,7 @@ static int agp_backend_initialize(struct } got_gatt = 1; - bridge->key_list = vmalloc(PAGE_SIZE * 4); + bridge->key_list = vmalloc(4*MMUPAGE_SIZE); if (bridge->key_list == NULL) { printk(KERN_ERR PFX "error allocating memory for key lists.\n"); rc = -ENOMEM; @@ -170,7 +170,7 @@ static int agp_backend_initialize(struct got_keylist = 1; /* FIXME vmalloc'd memory not guaranteed contiguous */ - memset(bridge->key_list, 0, PAGE_SIZE * 4); + memset(bridge->key_list, 0, 4*MMUPAGE_SIZE); if (bridge->driver->configure()) { printk(KERN_ERR PFX "error configuring host chipset.\n"); diff -prauN linux-2.6.0-test11/drivers/char/agp/generic.c pgcl-2.6.0-test11-1/drivers/char/agp/generic.c --- linux-2.6.0-test11/drivers/char/agp/generic.c 2003-11-26 
12:43:34.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/char/agp/generic.c 2003-11-27 21:55:16.000000000 -0800 @@ -91,7 +91,7 @@ struct agp_memory *agp_create_memory(int kfree(new); return NULL; } - new->memory = vmalloc(PAGE_SIZE * scratch_pages); + new->memory = vmalloc(MMUPAGE_SIZE * scratch_pages); if (new->memory == NULL) { agp_free_key(new->key); @@ -136,7 +136,7 @@ void agp_free_memory(struct agp_memory * } EXPORT_SYMBOL(agp_free_memory); -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define ENTRIES_PER_PAGE (MMUPAGE_SIZE / sizeof(unsigned long)) /** * agp_allocate_memory - allocate a group of pages of a certain type. @@ -680,7 +680,7 @@ int agp_generic_create_gatt_table(void) if (table == NULL) return -ENOMEM; - table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); + table_end = table + ((MMUPAGE_SIZE * (1 << page_order)) - 1); for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) SetPageReserved(page); @@ -690,7 +690,7 @@ int agp_generic_create_gatt_table(void) agp_bridge->driver->cache_flush(); agp_bridge->gatt_table = ioremap_nocache(virt_to_phys(table), - (PAGE_SIZE * (1 << page_order))); + MMUPAGE_SIZE << page_order); agp_bridge->driver->cache_flush(); if (agp_bridge->gatt_table == NULL) { @@ -748,7 +748,7 @@ int agp_generic_free_gatt_table(void) iounmap(agp_bridge->gatt_table); table = (char *) agp_bridge->gatt_table_real; - table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); + table_end = table + ((MMUPAGE_SIZE * (1 << page_order)) - 1); for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) ClearPageReserved(page); @@ -796,7 +796,7 @@ int agp_generic_insert_memory(struct agp break; } - num_entries -= agp_memory_reserved/PAGE_SIZE; + num_entries -= agp_memory_reserved/MMUPAGE_SIZE; if (num_entries < 0) num_entries = 0; if (type != 0 || mem->type != 0) { diff -prauN linux-2.6.0-test11/drivers/char/early_consoles.c pgcl-2.6.0-test11-1/drivers/char/early_consoles.c --- linux-2.6.0-test11/drivers/char/early_consoles.c 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/char/early_consoles.c 2003-11-27 21:55:16.000000000 -0800 @@ -0,0 +1,385 @@ +/* + * Early console drivers. + * (C) Nov 2001, William Irwin, IBM + * + * These are low-level pseudodrivers to enable early console output + * to aid in debugging during early boot. + * + * They are crude, but hopefully effective. They rely on the fact + * that consoles are largely unused prior to the true console_init(), + * and that printk() uses the ->write callback and that callback + * only during its operation. + * + * Serial port routines are derived from Linux serial.c, and + * vga_putc() is derived from vsta, (C) Andrew Valencia. + */ + +#include +#include +#include +#include +#include + +/* + * I/O ports are not linearly mapped on all architectures. + * On IA64 in particular, port I/O is just reading/writing from + * an uncached address, but ioremap there requires ia64_io_base + * to be initialized, which does not happen until the middle of + * setup_arch(). So a port remapping macro is provided here. + * + * The IA64 case is not handled here, although the port remapping + * is demonstrated for the purposes of understanding its necessity. + * The IO_BASE is taken from Lion systems; in general, this varies. + * True handling for IA64 will be merged in given testing. 
+ */ + +#ifdef CONFIG_IA64 + +#define IO_BASE 0xC0000FFFFC000000UL +#define MK_PORT(port) ((char *)(IO_BASE|(((port)>>2)<<12)|((port) & 0xFFF))) + +#else + +/* + * This works for i386, but not everywhere. + * Other architectures with port I/O mapping needs will need to + * add to the preprocessor case analysis above. + */ + +#define MK_PORT(port) (port) + +#endif + +#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) + + +/* + * This serial output driver derived from the one appearing + * in serial.c + * + * It is a simple "bitbanging" style output routine, with + * initialization performed at every call. + */ + +#if defined(CONFIG_EARLY_CONSOLE_3F8) || defined(CONFIG_EARLY_CONSOLE_3E8) + +static inline __init void wait_for_readiness(unsigned port) +{ + unsigned retries; + unsigned char status; + + /* + * Wait for transmitter holding and shift registers to empty, + * which is required for output to succeed. If the retries are + * exceeded, this deliberately fails to ensure termination. + */ + for(retries = 0; retries < 65536; ++retries) { + status = inb(MK_PORT(port + 5)); + if((status & BOTH_EMPTY) == BOTH_EMPTY) + break; + } +} + +static void __init write_serial_io_port(unsigned port, + const char *s, + unsigned n) +{ + unsigned k; + + wait_for_readiness(port); + + /* + * Disable interrupts. + */ + outb(0x0, MK_PORT(port + 1)); + + /* + * Set the baud rate divisor's LSB. + */ + outb(0x83, MK_PORT(port + 3)); + + /* + * Set the baud rate divisor's MSB. + */ + outb(0x1, MK_PORT(port)); + + /* + * Set no parity, 8 bits, 1 stop bit, and select + * interrupt enable register. + */ + outb(0x3, MK_PORT(port + 3)); + + /* + * Set data terminal ready and request to send. + */ + + for(k = 0; k < n; ++k) { + wait_for_readiness(port); + outb(s[k], MK_PORT(port)); + if(s[k] == '\n') { + wait_for_readiness(port); + outb('\r', MK_PORT(port)); + } + } +} + +#endif /* CONFIG_EARLY_CONSOLE_3F8 || CONFIG_EARLY_CONSOLE_3E8 */ + + + +/* + * On Intel-derived architectures it is customary for onboard serial + * ports to have I/O ports at these two port addresses. + */ + +#ifdef CONFIG_EARLY_CONSOLE_3F8 +static void __init write_3F8(struct console *c, const char *s, unsigned n) +{ + write_serial_io_port(0x3F8, s, n); +} + +static struct console __initdata early_console_3F8 = +{ + write: write_3F8 +}; +#endif + +#ifdef CONFIG_EARLY_CONSOLE_3E8 +static void __init write_3E8(struct console *c, const char *s, unsigned n) +{ + write_serial_io_port(0x3E8, s, n); +} + +static struct console __initdata early_console_3E8 = +{ + write: write_3E8 +}; +#endif + + + + +/* + * This should work for a variety of Intel-derived architectures, + * as it is customary for VGA memory to reside in this address range. + * vga_putc() is derived from vsta sources, (C) Andrew Valencia. + * + * Several forms of functionality are intentionally omitted in the + * interest of robustness, in particular, cursor movement and cursor + * position determination. 
+ */ + +#ifdef CONFIG_EARLY_CONSOLE_VGA + +#define VGA_MAXCOL 80 +#define VGA_MAXROW 25 +#define VGA_SCRNSZ (VGA_MAXCOL * VGA_MAXROW) +#define VGA_REG_PORT 0x3D4 +#define VGA_VAL_PORT 0x3D5 +#define VGA_TEXT_BUFFER 0xB8000 + +#define VGA_CHAR(_row_, _col_) vga_mem[(_row_)*VGA_MAXCOL + (_col_)].c + +struct vga_char_desc +{ + unsigned char c; + unsigned char color; +}; + +static struct vga_char_desc * vga_mem __initdata = + (struct vga_char_desc *)(VGA_TEXT_BUFFER + PAGE_OFFSET); + +/* + * The characters displayed at a screen position can be discerned by + * reading from the corresponding memory location. This can be used + * to simulate scrolling movement. Line blanking is simulated by + * overwriting the displayed characters with the space character. + * + * In the interest of robustness, cursor movement is also omitted. + */ +static void __inline__ __init vga_scroll_up(void) +{ + unsigned k; + + for(k = 0; k < (VGA_SCRNSZ - VGA_MAXCOL); ++k) + vga_mem[k].c = vga_mem[k + VGA_MAXCOL].c; + + for(k = VGA_SCRNSZ - VGA_MAXCOL; k < VGA_SCRNSZ; ++k) + vga_mem[k].c = ' '; +} + +/* + * The screen position can actually be determined by port I/O, + * but in the interest of robustness, these are always initialized + * to the (0, 0) position. These position indices must always be + * strictly less than the bounds VGA_MAXROW and VGA_MAXCOL. + */ +static unsigned short __initdata row; +static unsigned short __initdata col; + + +/* + * Line advancement must preserve the invariant that the row and + * column indices are in-bounds. The semantics of this mean that + * when line advancement "beyond" the last line results in scrolling. + */ +static inline void __init vga_line_advance(void) +{ + ++row; + + if(row >= VGA_MAXROW) { + row = VGA_MAXROW - 1; + vga_scroll_up(); + } +} + + +/* + * Character advancement must once again preserve the in-bounds + * invariants, and in so doing line wrapping and advancement may occur. + */ +static inline void __init vga_char_advance(void) +{ + ++col; + + if(col >= VGA_MAXCOL) { + col = 0; + vga_line_advance(); + } +} + + +/* + * Derived from vsta sources (C) Andrew Valencia. + * Here the interpretation of several common special characters occurs, + * namely linefeeds, newlines, tabs, and backspaces. The position + * indices are updated using the vga_char_advance() and vga_line_advance() + * routines, and a vga_char_advance() is triggered on the printing of + * each ordinary character. The special characters have specialized + * position update semantics in order to be faithful to their customary + * cursor movement effects, although the cursor position is not updated. + */ +static void __init vga_putc(char c) +{ + unsigned k; + switch(c) { + case '\t': + for(k = 0; k < 8; ++k) { + VGA_CHAR(row, col) = ' '; + vga_char_advance(); + } + break; + + case '\r': + col = 0; + break; + + case '\n': + col = 0; + vga_line_advance(); + break; + + case '\b': + if(col > 0) { + --col; + VGA_CHAR(row, col) = ' '; + } + break; + + default: + VGA_CHAR(row, col) = c; + vga_char_advance(); + break; + } +} + + +/* + * write_vga(), given a NUL-terminated character array, writes + * characters to VGA space in bulk, and is the callback used for the + * driver structure. 
+ */ +static void __init write_vga(struct console *c, const char *s, unsigned n) +{ + unsigned k; + for(k = 0; k < n; ++k) + vga_putc(s[k]); +} + +static struct console __initdata early_console_vga = +{ + write: write_vga +}; + +#endif /* CONFIG_EARLY_CONSOLE_VGA */ + + +/* + * The bochs x86 simulator has an optional feature for enabling + * debugging output through a normally unused ISA I/O port. The + * protocol for communicating with the simulated device is simply + * using port I/O writes to write a stream of characters to the + * device, and these are then relayed by the simulator to the + * controlling terminal of the simulator process. + */ +#ifdef CONFIG_EARLY_CONSOLE_BOCHS_E9_HACK +static void __init write_bochs(struct console *c, const char *s, unsigned n) +{ + unsigned k; + + for(k = 0; k < n; ++k) + outb(s[k], MK_PORT(0xE9)); +} + +static struct console __initdata early_console_bochs = +{ + write: write_bochs +}; +#endif /* CONFIG_EARLY_CONSOLE_BOCHS_E9_HACK */ + + +/* + * In order to minimize the number of #ifdefs whch must + * appear in-line, this direct-mapped, NULL-terminated table + * of console entries is used to provide a configuration-independent + * structure which may be traversed to discover all of the available + * early console devices for registration and unregistration. + * + * This is the ugliest part of the code, thanks to #ifdef + */ +static struct console * __initdata early_console_table[] = + { +#ifdef CONFIG_EARLY_CONSOLE_3F8 + &early_console_3F8, +#endif +#ifdef CONFIG_EARLY_CONSOLE_3E8 + &early_console_3E8, +#endif +#ifdef CONFIG_EARLY_CONSOLE_VGA + &early_console_vga, +#endif +#ifdef CONFIG_EARLY_CONSOLE_BOCHS_E9_HACK + &early_console_bochs, +#endif + NULL + }; + + +/* + * The above implementations are quite far from complete console + * devices, but printk() only requires the ->write callback, so this is + * somewhat deceptive, but still cleaner than editing printk.c itself. + */ +void __init register_early_consoles(void) +{ + struct console **c = early_console_table; + while(*c) + register_console(*c++); +} + +void __init unregister_early_consoles(void) +{ + struct console **c = early_console_table; + while(*c) + unregister_console(*c++); +} diff -prauN linux-2.6.0-test11/drivers/char/mem.c pgcl-2.6.0-test11-1/drivers/char/mem.c --- linux-2.6.0-test11/drivers/char/mem.c 2003-11-26 12:44:12.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/char/mem.c 2003-11-27 21:55:16.000000000 -0800 @@ -103,8 +103,8 @@ static ssize_t do_write_mem(struct file written = 0; #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (realp < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-realp; + if (realp < MMUPAGE_SIZE) { + unsigned long sz = MMUPAGE_SIZE-realp; if (sz > count) sz = count; /* Hmm. Do something? */ buf+=sz; @@ -136,8 +136,8 @@ static ssize_t read_mem(struct file * fi read = 0; #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. 
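The early console drivers above can be this small because printk() only ever uses a registered console's ->write callback. A user-space model of that observation, with stdio standing in for the port and VGA pokes (the names below are stand-ins, not kernel symbols):

#include <stdio.h>
#include <string.h>

struct console_model {
	const char *name;
	void (*write)(struct console_model *, const char *, unsigned);
};

static void write_stdout(struct console_model *c, const char *s, unsigned n)
{
	fwrite(s, 1, n, stdout);	/* stands in for outb()/VGA writes */
}

static void early_printk_model(struct console_model *con, const char *msg)
{
	con->write(con, msg, strlen(msg));	/* all printk() really needs */
}

int main(void)
{
	struct console_model con = { .name = "early", .write = write_stdout };
	early_printk_model(&con, "early console model: hello\n");
	return 0;
}

In the patch the real drivers are swapped in and out around early boot via register_early_consoles() and unregister_early_consoles().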
*/ - if (p < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-p; + if (p < MMUPAGE_SIZE) { + unsigned long sz = MMUPAGE_SIZE-p; if (sz > count) sz = count; if (sz > 0) { @@ -169,7 +169,7 @@ static ssize_t write_mem(struct file * f static int mmap_mem(struct file * file, struct vm_area_struct * vma) { - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long offset = vma->vm_pgoff << MMUPAGE_SHIFT; int uncached; uncached = uncached_access(file, offset); @@ -214,8 +214,8 @@ static ssize_t read_kmem(struct file *fi #if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && read > 0) { - size_t tmp = PAGE_SIZE - p; + if (p < MMUPAGE_SIZE && read > 0) { + size_t tmp = MMUPAGE_SIZE - p; if (tmp > read) tmp = read; if (clear_user(buf, tmp)) return -EFAULT; @@ -239,8 +239,8 @@ static ssize_t read_kmem(struct file *fi while (count > 0) { int len = count; - if (len > PAGE_SIZE) - len = PAGE_SIZE; + if (len > MMUPAGE_SIZE) + len = MMUPAGE_SIZE; len = vread(kbuf, (char *)p, len); if (!len) break; @@ -289,8 +289,8 @@ static ssize_t write_kmem(struct file * while (count > 0) { int len = count; - if (len > PAGE_SIZE) - len = PAGE_SIZE; + if (len > MMUPAGE_SIZE) + len = MMUPAGE_SIZE; if (len && copy_from_user(kbuf, buf, len)) { free_page((unsigned long)kbuf); return -EFAULT; @@ -400,12 +400,12 @@ static inline size_t read_zero_pagealign /* The shared case is hard. Let's do the conventional zeroing. */ do { - unsigned long unwritten = clear_user(buf, PAGE_SIZE); + unsigned long unwritten = clear_user(buf, MMUPAGE_SIZE); if (unwritten) - return size + unwritten - PAGE_SIZE; + return size + unwritten - MMUPAGE_SIZE; cond_resched(); - buf += PAGE_SIZE; - size -= PAGE_SIZE; + buf += MMUPAGE_SIZE; + size -= MMUPAGE_SIZE; } while (size); return size; @@ -428,23 +428,23 @@ static ssize_t read_zero(struct file * f left = count; /* do we want to be clever? Arbitrary cut-off */ - if (count >= PAGE_SIZE*4) { + if (count >= MMUPAGE_SIZE*4) { unsigned long partial; /* How much left of the page? 
*/ - partial = (PAGE_SIZE-1) & -(unsigned long) buf; + partial = (MMUPAGE_SIZE-1) & -(unsigned long) buf; unwritten = clear_user(buf, partial); written = partial - unwritten; if (unwritten) goto out; left -= partial; buf += partial; - unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); - written += (left & PAGE_MASK) - unwritten; + unwritten = read_zero_pagealigned(buf, left & MMUPAGE_MASK); + written += (left & MMUPAGE_MASK) - unwritten; if (unwritten) goto out; - buf += left & PAGE_MASK; - left &= ~PAGE_MASK; + buf += left & MMUPAGE_MASK; + left &= ~MMUPAGE_MASK; } unwritten = clear_user(buf, left); written += left - unwritten; diff -prauN linux-2.6.0-test11/drivers/oprofile/buffer_sync.c pgcl-2.6.0-test11-1/drivers/oprofile/buffer_sync.c --- linux-2.6.0-test11/drivers/oprofile/buffer_sync.c 2003-11-26 12:43:27.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/oprofile/buffer_sync.c 2003-11-27 21:55:16.000000000 -0800 @@ -245,7 +245,7 @@ static unsigned long lookup_dcookie(stru cookie = fast_get_dcookie(vma->vm_file->f_dentry, vma->vm_file->f_vfsmnt); - *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start; + *offset = MMUPAGE_SIZE*vma->vm_pgoff + addr - vma->vm_start; break; } diff -prauN linux-2.6.0-test11/drivers/parisc/ccio-dma.c pgcl-2.6.0-test11-1/drivers/parisc/ccio-dma.c --- linux-2.6.0-test11/drivers/parisc/ccio-dma.c 2003-11-26 12:43:09.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/parisc/ccio-dma.c 2003-11-27 21:55:16.000000000 -0800 @@ -1404,7 +1404,7 @@ ccio_ioc_init(struct ioc *ioc) /* limit IOVA space size to 1MB-1GB */ - physmem = num_physpages << PAGE_SHIFT; + physmem = num_physpages << MMUPAGE_SHIFT; if(physmem < (ccio_mem_ratio * 1024 * 1024)) { iova_space_size = 1024 * 1024; #ifdef __LP64__ diff -prauN linux-2.6.0-test11/drivers/parisc/sba_iommu.c pgcl-2.6.0-test11-1/drivers/parisc/sba_iommu.c --- linux-2.6.0-test11/drivers/parisc/sba_iommu.c 2003-11-26 12:45:05.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/parisc/sba_iommu.c 2003-11-27 21:55:16.000000000 -0800 @@ -1566,7 +1566,7 @@ sba_ioc_init(struct parisc_device *sba, ** for DMA hints - ergo only 30 bits max. 
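The read_zero() path above splits the user buffer into an unaligned head, an MMUPAGE_MASK-aligned middle handled by read_zero_pagealigned(), and a tail; the head length comes from the (MMUPAGE_SIZE-1) & -buf idiom. A worked example with a hypothetical user address:

#include <stdio.h>

#define MMUPAGE_SIZE	4096UL
#define MMUPAGE_MASK	(~(MMUPAGE_SIZE - 1))

int main(void)
{
	unsigned long buf = 0x0804a123;	/* hypothetical user address */
	unsigned long count = 20000;
	unsigned long partial, middle, tail;

	/* bytes up to the next mmupage boundary */
	partial = (MMUPAGE_SIZE - 1) & -buf;
	middle = (count - partial) & MMUPAGE_MASK;
	tail = count - partial - middle;

	printf("head %lu, aligned middle %lu, tail %lu\n",
		partial, middle, tail);
	return 0;
}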
*/ - physmem = num_physpages << PAGE_SHIFT; + physmem = num_physpages << MMUPAGE_SHIFT; iova_space_size = (u32) (physmem/(sba_mem_ratio*global_ioc_cnt)); /* limit IOVA space size to 1MB-1GB */ diff -prauN linux-2.6.0-test11/drivers/scsi/qlogicisp.c pgcl-2.6.0-test11-1/drivers/scsi/qlogicisp.c --- linux-2.6.0-test11/drivers/scsi/qlogicisp.c 2003-11-26 12:45:50.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/scsi/qlogicisp.c 2003-11-27 21:55:16.000000000 -0800 @@ -1410,7 +1410,7 @@ static int isp1020_init(struct Scsi_Host if ((command & PCI_COMMAND_MEMORY) && ((mem_flags & 1) == 0)) { - mem_base = (u_long) ioremap(mem_base, PAGE_SIZE); + mem_base = (u_long) ioremap(mem_base, MMUPAGE_SIZE); if (!mem_base) { printk("qlogicisp : i/o remapping failed.\n"); goto out_release; diff -prauN linux-2.6.0-test11/drivers/usb/host/uhci-hcd.c pgcl-2.6.0-test11-1/drivers/usb/host/uhci-hcd.c --- linux-2.6.0-test11/drivers/usb/host/uhci-hcd.c 2003-11-26 12:45:30.000000000 -0800 +++ pgcl-2.6.0-test11-1/drivers/usb/host/uhci-hcd.c 2003-11-27 21:55:19.000000000 -0800 @@ -80,7 +80,7 @@ static int debug = 0; MODULE_PARM(debug, "i"); MODULE_PARM_DESC(debug, "Debug level"); static char *errbuf; -#define ERRBUF_LEN (PAGE_SIZE * 8) +#define ERRBUF_LEN (8 * MMUPAGE_SIZE) #include "uhci-hub.c" #include "uhci-debug.c" diff -prauN linux-2.6.0-test11/fs/aio.c pgcl-2.6.0-test11-1/fs/aio.c --- linux-2.6.0-test11/fs/aio.c 2003-11-26 12:43:52.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/aio.c 2003-11-27 21:55:19.000000000 -0800 @@ -86,7 +86,7 @@ static void aio_free_ring(struct kioctx long i; for (i=0; inr_pages; i++) - put_page(info->ring_pages[i]); + put_page(pfn_to_page(info->ring_pages[i])); if (info->mmap_size) { down_write(&ctx->mm->mmap_sem); @@ -113,25 +113,25 @@ static int aio_setup_ring(struct kioctx size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + nr_pages = (size + MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT; if (nr_pages < 0) return -EINVAL; info->nr_pages = nr_pages; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + nr_events = (MMUPAGE_SIZE*nr_pages - sizeof(struct aio_ring))/sizeof(struct io_event); info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); + info->ring_pages = kmalloc(sizeof(unsigned long)*nr_pages, GFP_KERNEL); if (!info->ring_pages) return -ENOMEM; - memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); + memset(info->ring_pages, 0, sizeof(unsigned long)*nr_pages); } - info->mmap_size = nr_pages * PAGE_SIZE; + info->mmap_size = nr_pages*MMUPAGE_SIZE; dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap(NULL, 0, info->mmap_size, @@ -160,7 +160,8 @@ static int aio_setup_ring(struct kioctx info->nr = nr_events; /* trusted copy */ - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_USER0) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -177,15 +178,17 @@ static int aio_setup_ring(struct kioctx /* aio_ring_event: returns a pointer to the event at the given index from * kmap_atomic(, km). 
Release the pointer with put_aio_ring_event(); */ -#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) -#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) +#define AIO_EVENTS_PER_PAGE (MMUPAGE_SIZE/sizeof(struct io_event)) +#define AIO_EVENTS_FIRST_PAGE ((MMUPAGE_SIZE-sizeof(struct aio_ring))/sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) #define aio_ring_event(info, nr, km) ({ \ unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ + unsigned long pfn; \ + pfn = (info)->ring_pages[pos/AIO_EVENTS_PER_PAGE]; \ + __event = kmap_atomic(pfn_to_page(pfn), km); \ + __event += (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE; \ __event += pos % AIO_EVENTS_PER_PAGE; \ __event; \ }) @@ -193,7 +196,7 @@ static int aio_setup_ring(struct kioctx #define put_aio_ring_event(event, km) do { \ struct io_event *__event = (event); \ (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ + kunmap_atomic((void *)((unsigned long)__event & MMUPAGE_MASK), km); \ } while(0) /* ioctx_alloc @@ -405,7 +408,8 @@ static struct kiocb *__aio_get_req(struc * accept an event from this io. */ spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(ctx->ring_info.ring_pages[0]), KM_USER0) + + (ctx->ring_info.ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { list_add(&req->ki_list, &ctx->active_reqs); get_ioctx(ctx); @@ -666,8 +670,8 @@ int aio_complete(struct kiocb *iocb, lon */ spin_lock_irqsave(&ctx->ctx_lock, flags); - ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); - + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_IRQ1) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); tail = (tail + 1) % info->nr; @@ -721,7 +725,8 @@ static int aio_read_evt(struct kioctx *i unsigned long head; int ret = 0; - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(pfn_to_page(info->ring_pages[0]), KM_USER0) + + (info->ring_pages[0] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; dprintk("in aio_read_evt h%lu t%lu m%lu\n", (unsigned long)ring->head, (unsigned long)ring->tail, (unsigned long)ring->nr); diff -prauN linux-2.6.0-test11/fs/binfmt_elf.c pgcl-2.6.0-test11-1/fs/binfmt_elf.c --- linux-2.6.0-test11/fs/binfmt_elf.c 2003-11-26 12:43:51.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/binfmt_elf.c 2003-11-27 21:55:19.000000000 -0800 @@ -62,10 +62,10 @@ static int elf_core_dump(long signr, str #define elf_core_dump NULL #endif -#if ELF_EXEC_PAGESIZE > PAGE_SIZE +#if ELF_EXEC_PAGESIZE > MMUPAGE_SIZE # define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE #else -# define ELF_MIN_ALIGN PAGE_SIZE +# define ELF_MIN_ALIGN MMUPAGE_SIZE #endif #define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) @@ -103,10 +103,12 @@ static void padzero(unsigned long elf_bs unsigned long nbyte; nbyte = ELF_PAGEOFFSET(elf_bss); + pr_debug("padzero(0x%lx) about to clear %lu bytes\n", elf_bss, nbyte); if (nbyte) { nbyte = ELF_MIN_ALIGN - nbyte; clear_user((void *) elf_bss, nbyte); } + pr_debug("padzero(0x%lx) survived clear %lu bytes\n", elf_bss, nbyte); } /* Let's use some macros to make this stack manipulation a litle clearer */ @@ -297,6 +299,8 @@ static unsigned long load_elf_interp(str unsigned long error = ~0UL; int retval, i, size; 
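The aio changes above index the ring by mmupage frame number rather than by struct page, so every kmap of ring_pages[i] has to add the sub-page offset by hand. A worked example of where a ring slot lands, with made-up pfn values, a slot index already adjusted the way aio_ring_event() adjusts it, and the 32-byte struct io_event size used on i386:

#include <stdio.h>

struct io_event { unsigned long long data[4]; };	/* 32 bytes, as on i386 */

#define MMUPAGE_SIZE		4096UL
#define PAGE_MMUCOUNT		8UL	/* illustrative: PAGE_CLUSTER=3 */
#define AIO_EVENTS_PER_PAGE	(MMUPAGE_SIZE / sizeof(struct io_event))

int main(void)
{
	unsigned long pos = 300;		/* hypothetical adjusted slot index */
	unsigned long idx = pos / AIO_EVENTS_PER_PAGE;
	unsigned long pfn = 0x1c2d0 + idx;	/* stands in for ring_pages[idx] */
	unsigned long suboff = (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE;
	unsigned long byte = suboff +
		(pos % AIO_EVENTS_PER_PAGE) * sizeof(struct io_event);

	printf("slot %lu -> ring_pages[%lu] = pfn %#lx, byte %lu into its kmapped page\n",
		pos, idx, pfn, byte);
	return 0;
}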
+ pr_debug("entering load_elf_interp()\n"); + /* First of all, some simple consistency checks */ if (interp_elf_ex->e_type != ET_EXEC && interp_elf_ex->e_type != ET_DYN) @@ -324,11 +328,15 @@ static unsigned long load_elf_interp(str if (!elf_phdata) goto out; + pr_debug("about to kernel_read() interpreter\n"); retval = kernel_read(interpreter,interp_elf_ex->e_phoff,(char *)elf_phdata,size); + pr_debug("survived kernel_read() of interpreter\n"); + error = retval; if (retval < 0) goto out_close; + pr_debug("about to loop over sections of interpreter\n"); eppnt = elf_phdata; for (i=0; ie_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { @@ -370,6 +378,7 @@ static unsigned long load_elf_interp(str last_bss = k; } } + pr_debug("survived looping over sections of interpreter\n"); /* * Now fill out the bss section. First pad the last page up @@ -380,9 +389,11 @@ static unsigned long load_elf_interp(str padzero(elf_bss); elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); /* What we have mapped so far */ + pr_debug("about to do_brk()\n"); /* Map the last of the bss segment */ if (last_bss > elf_bss) do_brk(elf_bss, last_bss - elf_bss); + pr_debug("survived do_brk()\n"); *interp_load_addr = load_addr; error = ((unsigned long) interp_elf_ex->e_entry) + load_addr; @@ -390,6 +401,7 @@ static unsigned long load_elf_interp(str out_close: kfree(elf_phdata); out: + pr_debug("leaving load_elf_interp()\n"); return error; } @@ -466,6 +478,8 @@ static int load_elf_binary(struct linux_ struct elfhdr interp_elf_ex; struct exec interp_ex; char passed_fileno[6]; + + pr_debug("about to load_elf_binary()\n"); /* Get the exec-header */ elf_ex = *((struct elfhdr *) bprm->buf); @@ -738,6 +752,7 @@ static int load_elf_binary(struct linux_ end_data += load_bias; if (elf_interpreter) { + pr_debug("loading interpreter\n"); if (interpreter_type == INTERPRETER_AOUT) elf_entry = load_aout_interp(&interp_ex, interpreter); @@ -758,17 +773,22 @@ static int load_elf_binary(struct linux_ goto out; } reloc_func_desc = interp_load_addr; + pr_debug("finished loading interpreter\n"); } else { + pr_debug("static executable, not loading interpreter\n"); elf_entry = elf_ex.e_entry; } + pr_debug("cleaning up load-time data\n"); kfree(elf_phdata); if (interpreter_type != INTERPRETER_AOUT) sys_close(elf_exec_fileno); set_binfmt(&elf_format); + pr_debug("finished cleaning up load-time data\n"); + pr_debug("setting up current process structure\n"); compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; create_elf_tables(bprm, &elf_ex, (interpreter_type == INTERPRETER_AOUT), @@ -781,24 +801,30 @@ static int load_elf_binary(struct linux_ current->mm->start_data = start_data; current->mm->end_data = end_data; current->mm->start_stack = bprm->p; + pr_debug("finished setting up current process structure\n"); /* Calling set_brk effectively mmaps the pages that we need * for the bss and break sections */ + pr_debug("about to set up the bss/break sections\n"); set_brk(elf_bss, elf_brk); + pr_debug("finished setting up the bss/break sections\n"); + pr_debug("about to pad the bss with zeros\n"); padzero(elf_bss); + pr_debug("survived padding the bss with zeros\n"); if (current->personality & MMAP_PAGE_ZERO) { + pr_debug("mapping page 0\n"); /* Why this, you ask??? Well SVr4 maps page 0 as read-only, and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ - /* N.B. Shouldn't the size here be PAGE_SIZE?? 
*/ down_write(¤t->mm->mmap_sem); - error = do_mmap(NULL, 0, 4096, PROT_READ | PROT_EXEC, + error = do_mmap(NULL, 0, MMUPAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); up_write(¤t->mm->mmap_sem); + pr_debug("survived mapping page 0\n"); } #ifdef ELF_PLAT_INIT @@ -812,9 +838,12 @@ static int load_elf_binary(struct linux_ * the regs structure is required as well as any relocations to the * function descriptor entries when executing dynamically links apps. */ + pr_debug("about to ELF_PLAT_INIT()\n"); ELF_PLAT_INIT(regs, reloc_func_desc); + pr_debug("survived ELF_PLAT_INIT()\n"); #endif + pr_debug("about to fiddle with threads and ptrace\n"); start_thread(regs, elf_entry, bprm->p); if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) @@ -822,8 +851,10 @@ static int load_elf_binary(struct linux_ else send_sig(SIGTRAP, current, 0); } + pr_debug("survived fiddling with threads and ptrace\n"); retval = 0; out: + pr_debug("leaving load_elf_binary()\n"); return retval; /* error cleanup */ @@ -837,6 +868,7 @@ out_free_file: sys_close(elf_exec_fileno); out_free_ph: kfree(elf_phdata); + pr_debug("survived error cleanups, exiting with error\n"); goto out; } @@ -1393,21 +1425,26 @@ static int elf_core_dump(long signr, str for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { + addr += MMUPAGE_SIZE) { struct page* page; + unsigned long pfn = 0; struct vm_area_struct *vma; if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); + &pfn, &vma) <= 0) { + DUMP_SEEK (file->f_pos + MMUPAGE_SIZE); } else { + page = pfn_to_page(pfn); if (page == ZERO_PAGE(addr)) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); + DUMP_SEEK (file->f_pos + MMUPAGE_SIZE); } else { void *kaddr; + unsigned long subpfn; + subpfn = pfn % PAGE_MMUCOUNT; flush_cache_page(vma, addr); kaddr = kmap(page); - DUMP_WRITE(kaddr, PAGE_SIZE); + kaddr += subpfn * MMUPAGE_SIZE; + DUMP_WRITE(kaddr, MMUPAGE_SIZE); kunmap(page); } page_cache_release(page); diff -prauN linux-2.6.0-test11/fs/bio.c pgcl-2.6.0-test11-1/fs/bio.c --- linux-2.6.0-test11/fs/bio.c 2003-11-26 12:44:27.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/bio.c 2003-11-27 21:55:19.000000000 -0800 @@ -364,12 +364,12 @@ static struct bio *__bio_map_user(struct unsigned long uaddr, unsigned int len, int write_to_vm) { - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; + unsigned long end = (uaddr + len + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; + unsigned long start = uaddr >> MMUPAGE_SHIFT; const int nr_pages = end - start; request_queue_t *q = bdev_get_queue(bdev); int ret, offset, i; - struct page **pages; + unsigned long *pages; struct bio *bio; /* @@ -383,7 +383,7 @@ static struct bio *__bio_map_user(struct if (!bio) return NULL; - pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc(nr_pages * sizeof(unsigned long), GFP_KERNEL); if (!pages) goto out; @@ -397,9 +397,11 @@ static struct bio *__bio_map_user(struct bio->bi_bdev = bdev; - offset = uaddr & ~PAGE_MASK; + offset = uaddr & ~MMUPAGE_MASK; for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; + unsigned int bytes = MMUPAGE_SIZE - offset; + int suboff = (pages[i] % PAGE_MMUCOUNT)*MMUPAGE_SIZE; + struct page *pg = pfn_to_page(pages[i]); if (len <= 0) break; @@ -410,7 +412,7 @@ static struct bio *__bio_map_user(struct /* * sorry... 
*/ - if (bio_add_page(bio, pages[i], bytes, offset) < bytes) + if (bio_add_page(bio, pg, bytes, offset + suboff) < bytes) break; len -= bytes; @@ -421,7 +423,7 @@ static struct bio *__bio_map_user(struct * release the pages we didn't map into the bio, if any */ while (i < nr_pages) - page_cache_release(pages[i++]); + page_cache_release(pfn_to_page(pages[i++])); kfree(pages); diff -prauN linux-2.6.0-test11/fs/dcache.c pgcl-2.6.0-test11-1/fs/dcache.c --- linux-2.6.0-test11/fs/dcache.c 2003-11-26 12:43:06.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/dcache.c 2003-11-27 21:55:19.000000000 -0800 @@ -1548,10 +1548,8 @@ out: static void __init dcache_init(unsigned long mempages) { - struct hlist_head *d; - unsigned long order; + int order, i; unsigned int nr_hash; - int i; /* * A constructor could be added for stable state like the lists, @@ -1571,42 +1569,28 @@ static void __init dcache_init(unsigned set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); -#if PAGE_SHIFT < 13 - mempages >>= (13 - PAGE_SHIFT); -#endif - mempages *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) - ; - - do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - d_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - d_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - d_hash_shift++; - - dentry_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (dentry_hashtable == NULL && --order >= 0); + mempages = (mempages*sizeof(struct hlist_head)) >> (13+PAGE_MMUSHIFT); + mempages = max(1UL, mempages); - printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); + for (order = fls(mempages)-1; order >= 0; --order) { + dentry_hashtable = (void *)__get_free_pages(GFP_ATOMIC, order); + if (dentry_hashtable) + break; + } if (!dentry_hashtable) panic("Failed to allocate dcache hash table\n"); - d = dentry_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(d); - d++; - i--; - } while (i); + nr_hash = (PAGE_SIZE << order)/sizeof(struct hlist_head); + d_hash_mask = nr_hash - 1; + d_hash_shift = fls(nr_hash) - 1; + + printk(KERN_INFO "Dentry cache hash table entries: %d " + "(order: %d, %ld bytes)\n", + nr_hash, order, PAGE_SIZE << order); + + for (i = 0; i < nr_hash; ++i) + INIT_HLIST_HEAD(&dentry_hashtable[i]); } /* SLAB cache for __getname() consumers */ diff -prauN linux-2.6.0-test11/fs/direct-io.c pgcl-2.6.0-test11-1/fs/direct-io.c --- linux-2.6.0-test11/fs/direct-io.c 2003-11-26 12:44:23.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/direct-io.c 2003-11-27 21:55:19.000000000 -0800 @@ -38,7 +38,9 @@ /* * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure on the stack. + * the size of a structure on the stack. But these are mmupages; this + * will _not_ even be able to see a whole PAGE_SIZE area if you make + * PAGE_MMUCOUNT > DIO_PAGES. */ #define DIO_PAGES 64 @@ -52,6 +54,20 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. + * + * XXX: + * Okay, I just broke this and I'm not sure how to put it back together. + * Basically the issue is that we're pointed at _pfn's_ only by + * get_user_pages() so the assumption of virtual contiguity doesn't even + * guarantee PAGE_SIZE -aligned physical contiguity. 
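To make the problem the XXX comment above describes concrete: get_user_pages() now hands back mmupage pfns, and consecutive pfns only share a struct page up to the next PAGE_MMUCOUNT boundary, so a virtually contiguous user buffer decomposes into runs. A rough user-space model of the run-merging dio_get_page() attempts, with made-up pfns:

#include <stdio.h>

#define PAGE_MMUCOUNT	8UL	/* illustrative: PAGE_CLUSTER=3 */

int main(void)
{
	/* pfns as get_user_pages() might return them for a virtually
	 * contiguous buffer: a short contiguous run, then a jump into a
	 * different clustered page. */
	unsigned long pfns[] = { 0x1a2c5, 0x1a2c6, 0x1a2c7, 0x30018 };
	unsigned long n = sizeof(pfns) / sizeof(pfns[0]);
	unsigned long i = 0;

	while (i < n) {
		unsigned long run = 1;
		/* merge while the next pfn is consecutive and does not
		 * start a new clustered page */
		while (i + run < n && pfns[i + run] == pfns[i] + run &&
		       (pfns[i] + run) % PAGE_MMUCOUNT != 0)
			run++;
		printf("pfn %#lx: %lu mmupage(s) in struct page %#lx\n",
			pfns[i], run, pfns[i] / PAGE_MMUCOUNT);
		i += run;
	}
	return 0;
}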
+ * + * AFAICT the fixup is to "opportunistically" merge all this stuff together + * into PAGE_SIZE-aligned contiguous bits and either special-case or be + * able to handle the rest as they come. I've left this broken for now. + * I'm relatively fearful of eating stackspace to keep count of the number + * mmupages starting at a given pfn there are while merging. + * + * -- wli */ struct dio { @@ -104,7 +120,7 @@ struct dio { * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ - struct page *pages[DIO_PAGES]; /* page buffer */ + unsigned long pages[DIO_PAGES]; /* page buffer */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ int page_errors; /* errno from get_user_pages() */ @@ -159,7 +175,7 @@ static int dio_refill_pages(struct dio * */ if (dio->page_errors == 0) dio->page_errors = ret; - dio->pages[0] = ZERO_PAGE(dio->curr_user_address); + dio->pages[0] = page_to_pfn(ZERO_PAGE(dio->curr_user_address)); dio->head = 0; dio->tail = 1; ret = 0; @@ -167,7 +183,7 @@ static int dio_refill_pages(struct dio * } if (ret >= 0) { - dio->curr_user_address += ret * PAGE_SIZE; + dio->curr_user_address += ret * MMUPAGE_SIZE; dio->curr_page += ret; dio->head = 0; dio->tail = ret; @@ -183,8 +199,13 @@ out: * decent number of pages, less frequently. To provide nicer use of the * L1 cache. */ -static struct page *dio_get_page(struct dio *dio) +static struct page *dio_get_page(struct dio *dio, int *pfoff_in_page, int *page_size) { + int pg_size = MMUPAGE_SIZE; + int pfn, tpfn; + struct page *page; + int i = 0; + if (dio_pages_present(dio) == 0) { int ret; @@ -193,7 +214,33 @@ static struct page *dio_get_page(struct return ERR_PTR(ret); BUG_ON(dio_pages_present(dio) == 0); } - return dio->pages[dio->head++]; + + pfn = dio->pages[dio->head++]; + *pfoff_in_page = (pfn % PAGE_MMUCOUNT) * MMUPAGE_SIZE; + + /* Try to cluster all pfns that belongs to this page together */ + tpfn = pfn + 1; + while (pg_size + *pfoff_in_page < PAGE_SIZE) { + if (dio->head == dio->tail) break; + if (tpfn != dio->pages[dio->head]) break; + tpfn++; + dio->head++; + pg_size += MMUPAGE_SIZE; + i++; + } + + page = pfn_to_page(pfn); + *page_size = pg_size; + + /* + * FIXME - get_user_pages got ref for each pfn, we need to drop + * the extra refs for this page + */ + while (i--) { + page_cache_release(page); + } + + return page; } /* @@ -311,8 +358,9 @@ static void dio_bio_submit(struct dio *d */ static void dio_cleanup(struct dio *dio) { + int a, b; while (dio_pages_present(dio)) - page_cache_release(dio_get_page(dio)); + page_cache_release(dio_get_page(dio, &a, &b)); } /* @@ -704,22 +752,26 @@ static void dio_zero_block(struct dio *d static int do_direct_IO(struct dio *dio) { const unsigned blkbits = dio->blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + unsigned blocks_per_page = PAGE_SIZE >> blkbits; struct page *page; unsigned block_in_page; struct buffer_head *map_bh = &dio->map_bh; int ret = 0; + int page_size; + int pf_pgoff; /* The I/O can start at any block offset within the first page */ block_in_page = dio->first_block_in_page; while (dio->block_in_file < dio->final_block_in_request) { - page = dio_get_page(dio); + page = dio_get_page(dio, &pf_pgoff, &page_size); + if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } + blocks_per_page = page_size >> blkbits; while (block_in_page < blocks_per_page) { unsigned offset_in_page = block_in_page << blkbits; unsigned this_chunk_bytes; /* # of bytes mapped */ @@ -803,7 +855,7 @@ do_holes: * can add to this 
page */ this_chunk_blocks = dio->blocks_available; - u = (PAGE_SIZE - offset_in_page) >> blkbits; + u = (page_size - offset_in_page) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = dio->final_block_in_request - dio->block_in_file; @@ -813,7 +865,7 @@ do_holes: BUG_ON(this_chunk_bytes == 0); dio->boundary = buffer_boundary(map_bh); - ret = submit_page_section(dio, page, offset_in_page, + ret = submit_page_section(dio, page, pf_pgoff + offset_in_page, this_chunk_bytes, dio->next_block_for_io); if (ret) { page_cache_release(page); @@ -902,7 +954,7 @@ direct_io_worker(int rw, struct kiocb *i bytes = iov[seg].iov_len; /* Index into the first page of the first block */ - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; + dio->first_block_in_page = (user_addr & ~MMUPAGE_MASK) >> blkbits; dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits); /* Page fetching state */ @@ -911,11 +963,11 @@ direct_io_worker(int rw, struct kiocb *i dio->curr_page = 0; dio->total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { + if (user_addr & (MMUPAGE_SIZE-1)) { dio->total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); + bytes -= MMUPAGE_SIZE - (user_addr & (MMUPAGE_SIZE - 1)); } - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; + dio->total_pages += (bytes + MMUPAGE_SIZE - 1) / MMUPAGE_SIZE; dio->curr_user_address = user_addr; ret = do_direct_IO(dio); diff -prauN linux-2.6.0-test11/fs/exec.c pgcl-2.6.0-test11-1/fs/exec.c --- linux-2.6.0-test11/fs/exec.c 2003-11-26 12:43:36.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/exec.c 2003-11-27 21:55:19.000000000 -0800 @@ -292,51 +292,72 @@ EXPORT_SYMBOL(copy_strings_kernel); * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * task->mm->mmap_sem is held for writing. 
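+ *
+ * Under page clustering the page passed in is a full PAGE_SIZE kernel
+ * page, so the routine now (roughly speaking) installs one pte per
+ * constituent mmupage instead of a single pte:
+ *
+ *	for (subpfn = min_subpfn; subpfn < PAGE_MMUCOUNT; subpfn++)
+ *		set_pte(pte, pte_mkdirty(pte_mkwrite(
+ *			pfn_pte(page_to_pfn(page) + subpfn, prot))));
+ *
+ * min_subpfn lets setup_arg_pages() skip the leading mmupages that
+ * fall below the start of the stack vma.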
*/ -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) +void put_dirty_page(task_t *task, struct page *page, int min_subpfn, + unsigned long addr, pgprot_t prot) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct pte_chain *pte_chain; - - if (page_count(page) != 1) - printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", - page, address); - - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); - if (!pmd) - goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); - if (!pte) - goto out; - if (!pte_none(*pte)) { + unsigned long page_pfn, subpfn; + struct pte_chain *pte_chain = NULL; + struct mm_struct *mm = task->mm; + + page_pfn = page_to_pfn(page); + + pr_debug("%d: put_dirty_page: page = %p, start pfn = 0x%lx, min_subpfn = 0x%x, addr = 0x%lx, prot = %lx\n", + current->pid, page, page_pfn, min_subpfn, addr, pgprot_val(prot)); + + spin_lock(&mm->page_table_lock); + + addr += MMUPAGE_SIZE * min_subpfn; + for (subpfn = min_subpfn; subpfn < PAGE_MMUCOUNT; ++subpfn, addr += MMUPAGE_SIZE) { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long pfn; + + pgd = pgd_offset(mm, addr); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto out_nolock; + spin_lock(&mm->page_table_lock); + } + pmd = pmd_alloc(mm, pgd, addr); + if (!pmd) + goto out; + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + goto out; + if (!pte_none(*pte)) { + pte_unmap(pte); + pr_debug("%d: put_dirty_page: skipping addr 0x%lx\n", + current->pid, addr); + continue; + } + pfn = page_pfn + subpfn; + set_pte(pte, pte_mkdirty(pte_mkwrite(pfn_pte(pfn, prot)))); + pte_chain = page_add_rmap(page, pte, pte_chain); + pr_debug("%d: put_dirty_page translating 0x%lx to pfn 0x%lx\n", + current->pid, addr, pfn); + page_cache_get(page); pte_unmap(pte); - goto out; + task->mm->rss++; } + spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); lru_cache_add_active(page); flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); - pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); - /* no need for flush_tlb */ - pte_chain_free(pte_chain); + page_cache_release(page); /* want to add PAGE_MMUCOUNT-1 */ return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: - __free_page(page); - force_sig(SIGKILL, tsk); + spin_unlock(&mm->page_table_lock); +out_nolock: + page_cache_release(page); + force_sig(SIGKILL, task); pte_chain_free(pte_chain); return; } @@ -396,7 +417,7 @@ int setup_arg_pages(struct linux_binprm #else stack_base = STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; mm->arg_start = bprm->p + stack_base; - arg_size = STACK_TOP - (PAGE_MASK & (unsigned long) mm->arg_start); + arg_size = STACK_TOP - (MMUPAGE_MASK & (unsigned long) mm->arg_start); #endif bprm->p += stack_base; @@ -408,7 +429,8 @@ int setup_arg_pages(struct linux_binprm if (!mpnt) return -ENOMEM; - if (security_vm_enough_memory(arg_size >> PAGE_SHIFT)) { + /* must match length of mpnt below */ + if (security_vm_enough_memory(arg_size >> MMUPAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } @@ -418,10 +440,10 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_mm = mm; #ifdef CONFIG_STACK_GROWSUP mpnt->vm_start = 
stack_base; - mpnt->vm_end = PAGE_MASK & - (PAGE_SIZE - 1 + (unsigned long) bprm->p); + mpnt->vm_end = MMUPAGE_ALIGN((unsigned long)bprm->p); #else - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; + /* mpnt->vm_start = MMUPAGE_MASK & (unsigned long)bprm->p; */ + mpnt->vm_start = PAGE_MASK & (unsigned long)bprm->p; mpnt->vm_end = STACK_TOP; #endif mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7]; @@ -430,16 +452,22 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; INIT_LIST_HEAD(&mpnt->shared); - mpnt->vm_private_data = (void *) 0; + mpnt->vm_private_data = NULL; insert_vm_struct(mm, mpnt); - mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> MMUPAGE_SHIFT; } for (i = 0 ; i < MAX_ARG_PAGES ; i++) { struct page *page = bprm->page[i]; if (page) { + int min_subpfn; + + if (mpnt->vm_start <= stack_base) + min_subpfn = 0; + else + min_subpfn = (mpnt->vm_start - stack_base)/MMUPAGE_SIZE; bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, + put_dirty_page(current, page, min_subpfn, stack_base, mpnt->vm_page_prot); } stack_base += PAGE_SIZE; @@ -1071,13 +1099,17 @@ int do_execve(char * filename, struct file *file; int retval; + pr_debug("do_execve(%p, %p, %p, %p)\n", filename, argv, envp, regs); + sched_balance_exec(); file = open_exec(filename); retval = PTR_ERR(file); - if (IS_ERR(file)) + if (IS_ERR(file)) { + pr_debug("return 1 from do_execve()\n"); return retval; + } bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); @@ -1133,6 +1165,7 @@ int do_execve(char * filename, /* execve success */ security_bprm_free(&bprm); + pr_debug("return 2 from do_execve()\n"); return retval; } @@ -1152,6 +1185,8 @@ out_file: allow_write_access(bprm.file); fput(bprm.file); } + + pr_debug("return 3 from do_execve()\n"); return retval; } diff -prauN linux-2.6.0-test11/fs/ext2/dir.c pgcl-2.6.0-test11-1/fs/ext2/dir.c --- linux-2.6.0-test11/fs/ext2/dir.c 2003-11-26 12:44:18.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/ext2/dir.c 2003-11-27 21:55:19.000000000 -0800 @@ -432,15 +432,15 @@ int ext2_add_link (struct dentry *dentry struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned chunk_size = ext2_chunk_size(dir); - unsigned reclen = EXT2_DIR_REC_LEN(namelen); - unsigned short rec_len, name_len; + unsigned long chunk_size = ext2_chunk_size(dir); + unsigned long reclen = EXT2_DIR_REC_LEN(namelen); + unsigned long rec_len, name_len; struct page *page = NULL; ext2_dirent * de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; - unsigned from, to; + unsigned long from, to; int err; /* diff -prauN linux-2.6.0-test11/fs/file_table.c pgcl-2.6.0-test11-1/fs/file_table.c --- linux-2.6.0-test11/fs/file_table.c 2003-11-26 12:42:55.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/file_table.c 2003-11-27 21:55:19.000000000 -0800 @@ -295,7 +295,7 @@ void __init files_init(unsigned long mem * Per default don't use more than 10% of our memory for files. 
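+ *
+ * mempages here is num_physpages, which under pgcl is counted in
+ * MMUPAGE units, so the scale factor below stays MMUPAGE_SIZE/1024 == 4
+ * regardless of the clustering factor; e.g. 256MB of RAM is 65536
+ * mmupages and still yields n = (65536 * 4) / 10, about 26000 files.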
*/ - n = (mempages * (PAGE_SIZE / 1024)) / 10; + n = (mempages * (MMUPAGE_SIZE / 1024)) / 10; files_stat.max_files = n; if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; diff -prauN linux-2.6.0-test11/fs/inode.c pgcl-2.6.0-test11-1/fs/inode.c --- linux-2.6.0-test11/fs/inode.c 2003-11-26 12:45:53.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/inode.c 2003-11-27 21:55:19.000000000 -0800 @@ -1332,48 +1332,34 @@ void wake_up_inode(struct inode *inode) */ void __init inode_init(unsigned long mempages) { - struct hlist_head *head; - unsigned long order; unsigned int nr_hash; - int i; + int order, i; for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) init_waitqueue_head(&i_wait_queue_heads[i].wqh); - mempages >>= (14 - PAGE_SHIFT); - mempages *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) - ; - - do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - i_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - i_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - i_hash_shift++; - - inode_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (inode_hashtable == NULL && --order >= 0); + mempages = (mempages*sizeof(struct hlist_head)) >> (14+PAGE_MMUSHIFT); + mempages = max(1UL, mempages); - printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); + for (order = fls(mempages)-1; order >= 0; --order) { + inode_hashtable = (void *)__get_free_pages(GFP_ATOMIC, order); + if (inode_hashtable) + break; + } if (!inode_hashtable) panic("Failed to allocate inode hash table\n"); - head = inode_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(head); - head++; - i--; - } while (i); + nr_hash = (PAGE_SIZE << order)/sizeof(struct hlist_head); + i_hash_mask = nr_hash - 1; + i_hash_shift = fls(nr_hash) - 1; + + printk(KERN_INFO "Inode-cache hash table entries: %d " + "(order: %d, %ld bytes)\n", + nr_hash, order, PAGE_SIZE << order); + + for (i = 0; i < nr_hash; ++i) + INIT_HLIST_HEAD(&inode_hashtable[i]); /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), diff -prauN linux-2.6.0-test11/fs/namespace.c pgcl-2.6.0-test11-1/fs/namespace.c --- linux-2.6.0-test11/fs/namespace.c 2003-11-26 12:44:41.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/namespace.c 2003-11-27 21:55:19.000000000 -0800 @@ -1117,8 +1117,6 @@ static void __init init_mount_tree(void) void __init mnt_init(unsigned long mempages) { - struct list_head *d; - unsigned long order; unsigned int nr_hash; int i; @@ -1127,9 +1125,7 @@ void __init mnt_init(unsigned long mempa if (!mnt_cache) panic("Cannot create vfsmount cache"); - order = 0; - mount_hashtable = (struct list_head *) - __get_free_pages(GFP_ATOMIC, order); + mount_hashtable = (void *)__get_free_page(GFP_ATOMIC); if (!mount_hashtable) panic("Failed to allocate mount hash table\n"); @@ -1138,32 +1134,21 @@ void __init mnt_init(unsigned long mempa * Find the power-of-two list-heads that can fit into the allocation.. * We don't guarantee that "sizeof(struct list_head)" is necessarily * a power-of-two. - */ - nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct list_head); - hash_bits = 0; - do { - hash_bits++; - } while ((nr_hash >> hash_bits) != 0); - hash_bits--; - - /* * Re-calculate the actual number of entries and the mask * from the number of bits we can fit. 
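+ *
+ * Worked example, assuming CONFIG_PAGE_CLUSTER=2 (16KB PAGE_SIZE) and
+ * an 8-byte struct list_head on i386:
+ *
+ *	nr_hash   = 16384 / 8     = 2048
+ *	hash_bits = fls(2048) - 1 = 11
+ *	nr_hash   = 1UL << 11     = 2048 entries in one kernel page
+ *
+ * (inode_init() above sizes its hash table the same way, just with an
+ * order > 0 allocation scaled by total memory.)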
*/ + nr_hash = PAGE_SIZE/sizeof(struct list_head); + hash_bits = fls(nr_hash) - 1; nr_hash = 1UL << hash_bits; - hash_mask = nr_hash-1; + hash_mask = nr_hash - 1; - printk("Mount-cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); + printk(KERN_INFO "Mount-cache hash table entries: %d " + "(order: %d, %ld bytes)\n", nr_hash, 0, PAGE_SIZE); /* And initialize the newly allocated array */ - d = mount_hashtable; - i = nr_hash; - do { - INIT_LIST_HEAD(d); - d++; - i--; - } while (i); + for (i = 0; i < nr_hash; ++i) + INIT_LIST_HEAD(&mount_hashtable[i]); + sysfs_init(); init_rootfs(); init_mount_tree(); diff -prauN linux-2.6.0-test11/fs/ntfs/malloc.h pgcl-2.6.0-test11-1/fs/ntfs/malloc.h --- linux-2.6.0-test11/fs/ntfs/malloc.h 2003-11-26 12:43:05.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/ntfs/malloc.h 2003-11-27 21:55:19.000000000 -0800 @@ -45,7 +45,7 @@ static inline void *ntfs_malloc_nofs(uns } BUG(); } - if (likely(size >> PAGE_SHIFT < num_physpages)) + if (likely((size >> MMUPAGE_SHIFT) < num_physpages)) return __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL); return NULL; } diff -prauN linux-2.6.0-test11/fs/proc/base.c pgcl-2.6.0-test11-1/fs/proc/base.c --- linux-2.6.0-test11/fs/proc/base.c 2003-11-26 12:44:31.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/proc/base.c 2003-11-27 21:55:19.000000000 -0800 @@ -32,6 +32,7 @@ #include #include #include +#include /* * For hysterical raisins we keep the same inumbers as in the old procfs. @@ -535,29 +536,37 @@ static ssize_t mem_read(struct file * fi size_t count, loff_t *ppos) { struct task_struct *task = proc_task(file->f_dentry->d_inode); - char *page; + char *kbuf; + struct page *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; - if (!MAY_PTRACE(task)) + if (0 && !MAY_PTRACE(task)) goto out; ret = -ENOMEM; - page = (char *)__get_free_page(GFP_USER); - if (!page) + page = alloc_page(GFP_HIGHUSER); + if (!page) { + printk("alloc_page() failed in mem_read()\n"); goto out; + } + kbuf = kmap(page); ret = 0; mm = get_task_mm(task); - if (!mm) + if (!mm) { + printk("get_task_mm() failed in mem_read()\n"); goto out_free; + } +#if 0 ret = -EIO; if (file->private_data != (void*)((long)current->self_exec_id)) goto out_put; +#endif ret = 0; @@ -565,14 +574,16 @@ static ssize_t mem_read(struct file * fi int this_len, retval; this_len = (count > PAGE_SIZE) ? 
PAGE_SIZE : count; - retval = access_process_vm(task, src, page, this_len, 0); + retval = access_process_vm(task, src, kbuf, this_len, 0); if (!retval) { + printk("access_process_vm() failed in mem_read()\n"); if (!ret) ret = -EIO; break; } - if (copy_to_user(buf, page, retval)) { + if (copy_to_user(buf, kbuf, retval)) { + printk("copy_to_user() failed in mem_read()\n"); ret = -EFAULT; break; } @@ -584,15 +595,17 @@ static ssize_t mem_read(struct file * fi } *ppos = src; -out_put: mmput(mm); out_free: - free_page((unsigned long) page); + kunmap(page); + __free_page(page); out: return ret; } +#if 0 #define mem_write NULL +#endif #ifndef mem_write /* This is a security hazard */ @@ -600,26 +613,28 @@ static ssize_t mem_write(struct file * f size_t count, loff_t *ppos) { int copied = 0; - char *page; + char *kbuf; + struct page *page; struct task_struct *task = proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; - if (!MAY_PTRACE(task)) + if (0 && !MAY_PTRACE(task)) return -ESRCH; - page = (char *)__get_free_page(GFP_USER); + page = alloc_page(GFP_HIGHUSER); if (!page) return -ENOMEM; + kbuf = kmap(page); while (count > 0) { int this_len, retval; this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (copy_from_user(kbuf, buf, this_len)) { copied = -EFAULT; break; } - retval = access_process_vm(task, dst, page, this_len, 1); + retval = access_process_vm(task, dst, kbuf, this_len, 1); if (!retval) { if (!copied) copied = -EIO; @@ -631,7 +646,8 @@ static ssize_t mem_write(struct file * f count -= retval; } *ppos = dst; - free_page((unsigned long) page); + kunmap(page); + __free_page(page); return copied; } #endif diff -prauN linux-2.6.0-test11/fs/proc/proc_misc.c pgcl-2.6.0-test11-1/fs/proc/proc_misc.c --- linux-2.6.0-test11/fs/proc/proc_misc.c 2003-11-26 12:43:07.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/proc/proc_misc.c 2003-11-27 21:55:19.000000000 -0800 @@ -224,7 +224,7 @@ static int meminfo_read_proc(char *page, K(ps.nr_writeback), K(ps.nr_mapped), K(ps.nr_slab), - K(committed), + committed << (MMUPAGE_SHIFT - 10), K(ps.nr_page_table_pages), vmtot, vmi.used, diff -prauN linux-2.6.0-test11/fs/proc/task_mmu.c pgcl-2.6.0-test11-1/fs/proc/task_mmu.c --- linux-2.6.0-test11/fs/proc/task_mmu.c 2003-11-26 12:43:07.000000000 -0800 +++ pgcl-2.6.0-test11-1/fs/proc/task_mmu.c 2003-11-27 21:55:19.000000000 -0800 @@ -34,9 +34,9 @@ char *task_mem(struct mm_struct *mm, cha "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n", - mm->total_vm << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - mm->rss << (PAGE_SHIFT-10), + mm->total_vm << (MMUPAGE_SHIFT-10), + mm->locked_vm << (MMUPAGE_SHIFT-10), + mm->rss << (MMUPAGE_SHIFT-10), data - stack, stack, exec - lib, lib); up_read(&mm->mmap_sem); @@ -45,7 +45,7 @@ char *task_mem(struct mm_struct *mm, cha unsigned long task_vsize(struct mm_struct *mm) { - return PAGE_SIZE * mm->total_vm; + return MMUPAGE_SIZE * mm->total_vm; } int task_statm(struct mm_struct *mm, int *shared, int *text, @@ -56,7 +56,7 @@ int task_statm(struct mm_struct *mm, int *resident = mm->rss; for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int pages = (vma->vm_end - vma->vm_start) >> MMUPAGE_SHIFT; size += pages; if (is_vm_hugetlb_page(vma)) { @@ -97,7 +97,7 @@ static int show_map(struct seq_file *m, flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 
's' : 'p', - map->vm_pgoff << PAGE_SHIFT, + map->vm_pgoff << MMUPAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (map->vm_file) { diff -prauN linux-2.6.0-test11/include/asm-alpha/page.h pgcl-2.6.0-test11-1/include/asm-alpha/page.h --- linux-2.6.0-test11/include/asm-alpha/page.h 2003-11-26 12:45:08.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-alpha/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -98,6 +98,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _ALPHA_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-arm/page.h pgcl-2.6.0-test11-1/include/asm-arm/page.h --- linux-2.6.0-test11/include/asm-arm/page.h 2003-11-26 12:43:09.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-arm/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -188,6 +188,8 @@ static inline int get_order(unsigned lon #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif diff -prauN linux-2.6.0-test11/include/asm-cris/page.h pgcl-2.6.0-test11-1/include/asm-cris/page.h --- linux-2.6.0-test11/include/asm-cris/page.h 2003-11-26 12:43:31.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-cris/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -91,6 +91,8 @@ static inline int get_order(unsigned lon #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _CRIS_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-generic/page.h pgcl-2.6.0-test11-1/include/asm-generic/page.h --- linux-2.6.0-test11/include/asm-generic/page.h 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-generic/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -0,0 +1,11 @@ +#ifndef _ASM_GENERIC_PAGE_H +#define _ASM_GENERIC_PAGE_H + +#define MMUPAGE_SHIFT PAGE_SHIFT +#define MMUPAGE_SIZE PAGE_SIZE +#define MMUPAGE_MASK PAGE_MASK +#define MMUPAGE_ALIGN(x) PAGE_ALIGN(x) +#define PAGE_MMUSHIFT 0 +#define PAGE_MMUCOUNT 1 + +#endif /* _ASM_GENERIC_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-generic/rmap.h pgcl-2.6.0-test11-1/include/asm-generic/rmap.h --- linux-2.6.0-test11/include/asm-generic/rmap.h 2003-11-26 12:44:29.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-generic/rmap.h 2003-11-27 21:55:19.000000000 -0800 @@ -15,7 +15,7 @@ * offset of the page table entry within the page table page * * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE + * scalar pte_addr_t. The pfn of the pte's page is shifted left by MMUPAGE_SIZE * bits and is then ORed with the byte offset of the pte within its page. * * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for @@ -26,7 +26,15 @@ */ #include -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) +/* + * This looks bizarre, but it's actually meaningful. + */ +#define MMUPAGES_MAPPED_PER_PTE_PAGE (PTRS_PER_PTE * PAGE_MMUCOUNT) +#define VIRT_AREA_MAPPED_PER_PTE_PAGE \ + (MMUPAGES_MAPPED_PER_PTE_PAGE*MMUPAGE_SIZE) + +static inline void pgtable_add_rmap(struct page *page, struct mm_struct *mm, + unsigned long address) { #ifdef BROKEN_PPC_PTE_ALLOC_ONE /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ @@ -35,44 +43,114 @@ static inline void pgtable_add_rmap(stru if (!mem_init_done) return; #endif + + /* rmap's accounting is already set up */ + if (page->mapping) { + /* + * address is presumably large. if smaller, overflow traps + * the error; if larger, check the distance + */ + WARN_ON(address - page->index >= VIRT_AREA_MAPPED_PER_PTE_PAGE); + return; + } + page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + page->index = address & ~(VIRT_AREA_MAPPED_PER_PTE_PAGE - 1); inc_page_state(nr_page_table_pages); } static inline void pgtable_remove_rmap(struct page * page) { + /* we're not down to a unique reference */ + if (PAGE_MMUCOUNT > 1) { + if (atomic_read(&page->count) > 1) + return; + + /* + * A zero reference count should not be possible; + * put_page() should have freed the things outright + * so this essentially means use-after-free is happening. + */ + else + BUG_ON(atomic_read(&page->count) <= 0); + } + page->mapping = NULL; page->index = 0; dec_page_state(nr_page_table_pages); } -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +#ifdef CONFIG_HIGHPTE +static inline struct mm_struct *pte_paddr_to_mm(pte_addr_t paddr) { - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; + return (struct mm_struct *)pfn_to_page(paddr/MMUPAGE_SIZE)->mapping; } -static inline unsigned long ptep_to_address(pte_t * ptep) +static inline struct mm_struct *ptep_to_mm(pte_t *pte) { - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; + pte_addr_t pfn = kmap_atomic_to_pfn(pte); + return pte_paddr_to_mm(MMUPAGE_SIZE*pfn); +} +#else +static inline struct mm_struct *pte_paddr_to_mm(pte_addr_t paddr) +{ + return (struct mm_struct *)virt_to_page(paddr)->mapping; } +static inline struct mm_struct *ptep_to_mm(pte_t *ptep) +{ + return pte_paddr_to_mm((pte_addr_t)ptep); +} +#endif + #ifdef CONFIG_HIGHPTE +static inline unsigned long ptep_to_address(pte_t *ptep) +{ + unsigned long kvaddr = (unsigned long)ptep; + unsigned long swpage_voff = kvaddr/sizeof(pte_t); + + if (1) { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long pfn; + struct page *page; + + pgd = pgd_offset_k(kvaddr); + pmd = pmd_offset(pgd, kvaddr); + pte = pte_offset_kernel(pmd, kvaddr); + pfn = pte_pfn(*pte); + page = pfn_to_page(pfn); + return page->index + PMD_SIZE*(pfn % PAGE_MMUCOUNT) + + MMUPAGE_SIZE*(swpage_voff % PTRS_PER_PTE); + } else { + struct page *page = pfn_to_page(kmap_atomic_to_pfn(ptep)); + + WARN_ON(kvaddr > (unsigned long)(-PAGE_SIZE)); + + swpage_voff %= MMUPAGES_MAPPED_PER_PTE_PAGE; + /* WARN_ON(swpage_voff != pfn - page_to_pfn(page)); */ + return page->index + MMUPAGE_SIZE*swpage_voff; + } +} + static inline pte_addr_t ptep_to_paddr(pte_t *ptep) { - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); + unsigned long pfn, vaddr = (unsigned long)ptep; + pfn = kmap_atomic_to_pfn(ptep); + return MMUPAGE_SIZE*((pte_addr_t)pfn) + (vaddr & ~MMUPAGE_MASK); } #else static inline pte_addr_t ptep_to_paddr(pte_t *ptep) { return (pte_addr_t)ptep; } + +static inline unsigned long ptep_to_address(pte_t *pte) +{ + unsigned long offset = ((unsigned long)pte & ~PAGE_MASK)/sizeof(pte_t); + return virt_to_page(pte)->index + MMUPAGE_SIZE*offset; +} #endif #ifndef CONFIG_HIGHPTE diff -prauN 
linux-2.6.0-test11/include/asm-generic/tlb.h pgcl-2.6.0-test11-1/include/asm-generic/tlb.h --- linux-2.6.0-test11/include/asm-generic/tlb.h 2003-11-26 12:42:37.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-generic/tlb.h 2003-11-27 21:55:19.000000000 -0800 @@ -46,6 +46,16 @@ struct mmu_gather { /* Users of the generic TLB shootdown code must declare this storage space. */ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +static inline struct mm_struct *tlb_mm(struct mmu_gather *tlb) +{ + return tlb->mm; +} + +static inline void tlb_inc_freed(struct mmu_gather *tlb) +{ + tlb->freed++; +} + /* tlb_gather_mmu * Return a pointer to an initialized struct mmu_gather. */ diff -prauN linux-2.6.0-test11/include/asm-i386/dma-mapping.h pgcl-2.6.0-test11-1/include/asm-i386/dma-mapping.h --- linux-2.6.0-test11/include/asm-i386/dma-mapping.h 2003-11-26 12:46:08.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/dma-mapping.h 2003-11-27 21:55:19.000000000 -0800 @@ -51,7 +51,7 @@ dma_map_page(struct device *dev, struct size_t size, enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); - return (dma_addr_t)(page_to_pfn(page)) * PAGE_SIZE + offset; + return (dma_addr_t)(page_to_pfn(page)) * MMUPAGE_SIZE + offset; } static inline void diff -prauN linux-2.6.0-test11/include/asm-i386/elf.h pgcl-2.6.0-test11-1/include/asm-i386/elf.h --- linux-2.6.0-test11/include/asm-i386/elf.h 2003-11-26 12:44:46.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/elf.h 2003-11-27 21:55:19.000000000 -0800 @@ -132,11 +132,17 @@ extern int dump_task_extended_fpu (struc #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) extern void __kernel_vsyscall; +/* + * Something in pgcl broke vsyscalls. Until that's tracked down, + * work around it with this: + */ +#if 0 #define ARCH_DLINFO \ do { \ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ } while (0) +#endif /* 0 */ /* * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out @@ -146,6 +152,7 @@ do { \ * Dumping its extra ELF program headers includes all the other information * a debugger needs to easily find how the vsyscall DSO was being used. */ +#if 0 #define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) #define ELF_CORE_WRITE_EXTRA_PHDRS \ do { \ @@ -181,7 +188,8 @@ do { \ PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ } \ } while (0) +#endif /* 0 */ -#endif +#endif /* __KERNEL__ */ #endif diff -prauN linux-2.6.0-test11/include/asm-i386/fixmap.h pgcl-2.6.0-test11-1/include/asm-i386/fixmap.h --- linux-2.6.0-test11/include/asm-i386/fixmap.h 2003-11-26 12:42:55.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/fixmap.h 2003-11-27 21:55:19.000000000 -0800 @@ -28,21 +28,55 @@ * addresses. The point is to have a constant address at * compile time, but to set the physical address only * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. + * from the end of virtual memory (-PAGE_SIZE) backwards. * Also this lets us do fail-safe vmalloc(), we * can guarantee that these special addresses and * vmalloc()-ed addresses never overlap. * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. + * These 'compile-time allocated' memory buffers are + * fixed-size MMUPAGE_SIZE-size pages. Use + * set_fixmap(idx, phys, prot) to associate physical memory with + * fixmap indices. 
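+ *
+ * Each fixmap index now covers MMUPAGE_SIZE rather than PAGE_SIZE of
+ * virtual space, i.e. roughly
+ *
+ *	__fix_to_virt(idx) == FIXADDR_TOP - idx*MMUPAGE_SIZE
+ *
+ * so adjacent slots stay 4KB apart whatever the clustering factor;
+ * ranges that must span whole kernel pages (FIX_KMAP_*, FIX_PKMAP_*
+ * below) are scaled up by PAGE_MMUCOUNT instead.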
* * TLB entries of such buffers will not be flushed across * task switches. + * + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +#define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) +#define LAST_PKMAP 1024 +#define LAST_PKMAP_MASK (LAST_PKMAP-1) + +/* + * FIXADDR stuff is used by highmem.c for kmapping, and various + * drivers for system devices for their io mappings. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + * + * leave a hole of exactly PAGE_SIZE at the top for CONFIG_HIGHMEM + * this makes things easier on core code; the math works out funny + * and I didn't care enough to conserve PAGE_SIZE - MMUPAGE_SIZE + * worth of virtualspace. */ +#define FIXADDR_TOP (-PAGE_SIZE) +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << MMUPAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << MMUPAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x) & MMUPAGE_MASK)) >> MMUPAGE_SHIFT) + enum fixed_addresses { - FIX_HOLE, +#ifdef CONFIG_HIGHMEM + /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_BEGIN = 1, + FIX_KMAP_END = FIX_KMAP_BEGIN+((KM_TYPE_NR*NR_CPUS+1)*PAGE_MMUCOUNT)-1, + FIX_PKMAP_BEGIN, + FIX_PKMAP_END = FIX_PKMAP_BEGIN + (LAST_PKMAP+1)*PAGE_MMUCOUNT - 1, +#endif FIX_VSYSCALL, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ @@ -63,10 +97,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_CYCLONE_TIMER FIX_CYCLONE_TIMER, /*cyclone timer register*/ #endif -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif #ifdef CONFIG_ACPI_BOOT FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, @@ -95,19 +125,6 @@ extern void __set_fixmap (enum fixed_add __set_fixmap(idx, 0, __pgprot(0)) /* - * used by vmalloc.c. - * - * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap. - */ -#define FIXADDR_TOP (0xfffff000UL) -#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) - -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) - -/* * This is the range that is readable by user mode, and things * acting like user mode such as get_user_pages. */ @@ -141,8 +158,13 @@ static inline unsigned long fix_to_virt( static inline unsigned long virt_to_fix(const unsigned long vaddr) { - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + if (vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START) { + printk("bad vaddr in virt_to_fix 0x%lx\n", vaddr); + BUG(); + } return __virt_to_fix(vaddr); } +#define PKMAP_BASE fix_to_virt(FIX_PKMAP_END) + #endif diff -prauN linux-2.6.0-test11/include/asm-i386/highmem.h pgcl-2.6.0-test11-1/include/asm-i386/highmem.h --- linux-2.6.0-test11/include/asm-i386/highmem.h 2003-11-26 12:42:57.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/highmem.h 2003-11-27 21:55:19.000000000 -0800 @@ -33,35 +33,35 @@ extern pte_t *kmap_pte; extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; -extern void kmap_init(void); +void one_highpage_init(struct page *, unsigned long, int); -/* - * Right now we initialize only a single pte table. 
It can be extended - * easily, subsequent pte tables have to be allocated in one physical - * chunk of RAM. - */ -#if NR_CPUS <= 32 -#define PKMAP_BASE (0xff800000UL) -#else -#define PKMAP_BASE (0xff600000UL) -#endif -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif -#define LAST_PKMAP_MASK (LAST_PKMAP-1) -#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) -#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) - -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +void kmap_init(void); +void *FASTCALL(kmap_high(struct page *page)); +void FASTCALL(kunmap_high(struct page *page)); void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -struct page *kmap_atomic_to_page(void *ptr); +void kmap_atomic_sg(pte_t *[], pte_addr_t [], enum km_type); + +#ifndef CONFIG_DEBUG_HIGHMEM +static inline void kunmap_atomic_sg(pte_t *ptes[], enum km_type type) +{ + dec_preempt_count(); +} + +static inline void kunmap_atomic(void *kvaddr, enum km_type type) +{ + dec_preempt_count(); +} +#else +void kunmap_atomic_sg(pte_t *[], enum km_type); +void kunmap_atomic(void *, enum km_type); +#endif + +void *kmap_atomic_pfns(unsigned long [], enum km_type); +void kunmap_atomic_pfns(unsigned long [], enum km_type); +unsigned long kmap_atomic_to_pfn(void *ptr); #define flush_cache_kmaps() do { } while (0) diff -prauN linux-2.6.0-test11/include/asm-i386/io.h pgcl-2.6.0-test11-1/include/asm-i386/io.h --- linux-2.6.0-test11/include/asm-i386/io.h 2003-11-26 12:46:03.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/io.h 2003-11-27 21:55:19.000000000 -0800 @@ -69,7 +69,7 @@ * this function */ -static inline unsigned long virt_to_phys(volatile void * address) +static inline unsigned long virt_to_phys(volatile void *address) { return __pa(address); } @@ -87,7 +87,7 @@ static inline unsigned long virt_to_phys * this function */ -static inline void * phys_to_virt(unsigned long address) +static inline void *phys_to_virt(unsigned long address) { return __va(address); } @@ -95,9 +95,9 @@ static inline void * phys_to_virt(unsign /* * Change "struct page" to physical address. */ -#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << MMUPAGE_SHIFT) -extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); +void *__ioremap(unsigned long offset, unsigned long size, unsigned long flags); /** * ioremap - map bus memory into CPU space @@ -111,21 +111,33 @@ extern void * __ioremap(unsigned long of * address. */ -static inline void * ioremap (unsigned long offset, unsigned long size) +static inline void *ioremap(unsigned long offset, unsigned long size) { return __ioremap(offset, size, 0); } -extern void * ioremap_nocache (unsigned long offset, unsigned long size); -extern void iounmap(void *addr); +void *ioremap_nocache(unsigned long offset, unsigned long size); +void iounmap(void *addr); + /* * bt_ioremap() and bt_iounmap() are for temporary early boot-time * mappings, before the real ioremap() is functional. * A boot-time mapping is currently limited to at most 16 pages. 
*/ -extern void *bt_ioremap(unsigned long offset, unsigned long size); -extern void bt_iounmap(void *addr, unsigned long size); +void *bt_ioremap(unsigned long offset, unsigned long size); +void bt_iounmap(void *addr, unsigned long size); + +#ifdef CONFIG_BOOT_IOREMAP +/* + * boot_ioremap() is an "even earlier" ioremap, primarily for use + * when the pagetable formats used during early boot differ from + * those used at runtime, e.g. PAE booting off of non-PAE pagetables. + * Don't use this unless you _really_ know what you're doing. + * -- wli + */ +void *boot_ioremap(unsigned long paddr, unsigned long size); +#endif /* CONFIG_BOOT_IOREMAP */ /* * ISA I/O bus memory addresses are 1:1 with the physical address. diff -prauN linux-2.6.0-test11/include/asm-i386/io_apic.h pgcl-2.6.0-test11-1/include/asm-i386/io_apic.h --- linux-2.6.0-test11/include/asm-i386/io_apic.h 2003-11-26 12:44:44.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/io_apic.h 2003-11-27 21:55:19.000000000 -0800 @@ -17,7 +17,7 @@ #define IO_APIC_BASE(idx) \ ((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \ - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK))) + + (mp_ioapics[idx].mpc_apicaddr & ~MMUPAGE_MASK))) /* * The structure of the IO-APIC: diff -prauN linux-2.6.0-test11/include/asm-i386/kmap_types.h pgcl-2.6.0-test11-1/include/asm-i386/kmap_types.h --- linux-2.6.0-test11/include/asm-i386/kmap_types.h 2003-11-26 12:44:56.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/kmap_types.h 2003-11-27 21:55:19.000000000 -0800 @@ -24,7 +24,8 @@ D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(14) KM_FOLIO, +D(15) KM_TYPE_NR }; #undef D diff -prauN linux-2.6.0-test11/include/asm-i386/mach-numaq/mach_apic.h pgcl-2.6.0-test11-1/include/asm-i386/mach-numaq/mach_apic.h --- linux-2.6.0-test11/include/asm-i386/mach-numaq/mach_apic.h 2003-11-26 12:43:40.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/mach-numaq/mach_apic.h 2003-11-27 21:55:19.000000000 -0800 @@ -39,6 +39,7 @@ static inline void init_apic_ldr(void) static inline void clustered_apic_check(void) { + nr_ioapics = min(2, nr_ioapics); printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "NUMA-Q", nr_ioapics); } diff -prauN linux-2.6.0-test11/include/asm-i386/mmzone.h pgcl-2.6.0-test11-1/include/asm-i386/mmzone.h --- linux-2.6.0-test11/include/asm-i386/mmzone.h 2003-11-26 12:44:10.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/mmzone.h 2003-11-27 21:55:19.000000000 -0800 @@ -11,6 +11,7 @@ #ifdef CONFIG_DISCONTIGMEM extern struct pglist_data *node_data[]; +extern unsigned long node_start_pfn[], node_end_pfn[]; /* * Following are macros that are specific to this numa platform. 
@@ -22,17 +23,17 @@ extern struct pglist_data *node_data[]; #define alloc_bootmem_low(x) \ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, 0) #define alloc_bootmem_node(ignore, x) \ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) + __alloc_bootmem_node(NODE_DATA(0), (x), MMUPAGE_SIZE, 0) -#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) +#define node_localnr(pfn, nid) (((pfn) - node_data[nid]->node_start_pfn)/PAGE_MMUCOUNT) /* * Following are macros that each numa implmentation must define. @@ -41,7 +42,7 @@ extern struct pglist_data *node_data[]; /* * Given a kernel address, find the home node of the underlying memory. */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> MMUPAGE_SHIFT) /* * Return a pointer to the node data for node n. @@ -53,13 +54,22 @@ extern struct pglist_data *node_data[]; #define node_end_pfn(nid) \ ({ \ pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ + __pgdat->node_start_pfn \ + + PAGE_MMUCOUNT*__pgdat->node_spanned_pages; \ }) #define local_mapnr(kvaddr) \ ({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ + unsigned long __pfn = __pa(kvaddr) >> MMUPAGE_SHIFT; \ + (__pfn - node_start_pfn(pfn_to_nid(__pfn)))/PAGE_MMUCOUNT; \ +}) + +#define local_pfn(pg) \ +({ \ + struct page *__pg = pg; \ + unsigned long __nr; \ + __nr = (unsigned long)(__pg - page_zone(__pg)->zone_mem_map); \ + PAGE_MMUCOUNT*__nr; \ }) #define kern_addr_valid(kaddr) \ @@ -80,10 +90,9 @@ extern struct pglist_data *node_data[]; ({ \ struct page *__page = pg; \ struct zone *__zone = page_zone(__page); \ - (unsigned long)(__page - __zone->zone_mem_map) \ - + __zone->zone_start_pfn; \ + local_pfn(__page) + __zone->zone_start_pfn; \ }) -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> MMUPAGE_SHIFT)) /* * pfn_valid should be made as fast as possible, and the current definition * is valid for machines that are NUMA, but still contiguous, which is what diff -prauN linux-2.6.0-test11/include/asm-i386/page.h pgcl-2.6.0-test11-1/include/asm-i386/page.h --- linux-2.6.0-test11/include/asm-i386/page.h 2003-11-26 12:43:09.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -1,13 +1,36 @@ #ifndef _I386_PAGE_H #define _I386_PAGE_H -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 +#include /* for CONFIG_PAGE_CLUSTER */ + +/* + * One mmupage is represented by one Page Table Entry at the MMU level, + * and corresponds to one page at the user process level: its size is + * the same as param.h EXEC_PAGESIZE (for getpagesize(2) and mmap(2)). 
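+ *
+ * For example, CONFIG_PAGE_CLUSTER=2 gives:
+ *
+ *	MMUPAGE_SHIFT = 12	MMUPAGE_SIZE = 4KB   (what userspace sees)
+ *	PAGE_MMUSHIFT = 2	PAGE_MMUCOUNT = 4
+ *	PAGE_SHIFT    = 14	PAGE_SIZE    = 16KB  (one struct page)
+ *
+ * so mem_map shrinks by a factor of PAGE_MMUCOUNT, pfns stay in 4KB
+ * units, and the two are related by
+ *
+ *	pfn_to_page(pfn)  == &mem_map[pfn / PAGE_MMUCOUNT]
+ *	page_to_pfn(page) == PAGE_MMUCOUNT * (page - mem_map)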
+ */ +#define MMUPAGE_SHIFT 12 +#define MMUPAGE_SIZE (1 << MMUPAGE_SHIFT) +#define MMUPAGE_MASK (~(MMUPAGE_SIZE-1)) + +/* + * 2**N adjacent mmupages may be clustered to make up one kernel page. + * Reasonable and tested values for PAGE_MMUSHIFT are 0 (4k page), + * 1 (8k page), 2 (16k page), 3 (32k page). Higher values will not + * work without further changes e.g. to unsigned short b_size. + */ +#define PAGE_MMUSHIFT CONFIG_PAGE_CLUSTER +#define PAGE_MMUCOUNT (1 << PAGE_MMUSHIFT) + +/* + * One kernel page is represented by one struct page (see mm.h), + * and is the kernel's principal unit of memory allocation. + */ +#define PAGE_SHIFT (PAGE_MMUSHIFT + MMUPAGE_SHIFT) #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) +#define LARGE_PAGE_SIZE (1 << PMD_SHIFT) #ifdef __KERNEL__ #ifndef __ASSEMBLY__ @@ -53,7 +76,7 @@ typedef struct { unsigned long pgd; } pg #define pte_val(x) ((x).pte_low) #define HPAGE_SHIFT 22 #endif -#define PTE_MASK PAGE_MASK +#define PTE_MASK MMUPAGE_MASK #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) @@ -76,6 +99,7 @@ typedef struct { unsigned long pgprot; } /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) +#define MMUPAGE_ALIGN(addr) (((addr)+MMUPAGE_SIZE-1)&MMUPAGE_MASK) /* * This handles the memory map.. We could make this a config @@ -123,18 +147,22 @@ static __inline__ int get_order(unsigned #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define __MAXMEM \ + ((VMALLOC_START-2*MMUPAGE_SIZE-__PAGE_OFFSET) & LARGE_PAGE_MASK) +#define MAXMEM \ + __pa((VMALLOC_START-2*MMUPAGE_SIZE) & LARGE_PAGE_MASK) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +#define pfn_to_kaddr(pfn) __va(MMUPAGE_SIZE*(pfn)) #ifndef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) +#define pfn_to_page(pfn) (&mem_map[(pfn)/PAGE_MMUCOUNT]) +#define page_to_mapnr(page) ((unsigned long)((page) - mem_map)) +#define page_to_pfn(page) (PAGE_MMUCOUNT*page_to_mapnr(page)) +#define pfn_valid(pfn) ((pfn) < max_mapnr*PAGE_MMUCOUNT) #endif /* !CONFIG_DISCONTIGMEM */ -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr)/MMUPAGE_SIZE) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr)/MMUPAGE_SIZE) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff -prauN linux-2.6.0-test11/include/asm-i386/pci.h pgcl-2.6.0-test11-1/include/asm-i386/pci.h --- linux-2.6.0-test11/include/asm-i386/pci.h 2003-11-26 12:43:40.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/pci.h 2003-11-27 21:55:19.000000000 -0800 @@ -69,13 +69,13 @@ pci_dac_page_to_dma(struct pci_dev *pdev static __inline__ struct page * pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) { - return pfn_to_page(dma_addr >> PAGE_SHIFT); + return pfn_to_page(dma_addr >> MMUPAGE_SHIFT); } static __inline__ unsigned long pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) { - return (dma_addr & 
~PAGE_MASK); + return dma_addr & ~PAGE_MASK; } static __inline__ void diff -prauN linux-2.6.0-test11/include/asm-i386/pgalloc.h pgcl-2.6.0-test11-1/include/asm-i386/pgalloc.h --- linux-2.6.0-test11/include/asm-i386/pgalloc.h 2003-11-26 12:42:55.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/pgalloc.h 2003-11-27 21:55:19.000000000 -0800 @@ -2,54 +2,101 @@ #define _I386_PGALLOC_H #include +#include #include #include #include #include /* for struct page */ +#include /* to make asm-generic/rmap.h happy */ +#include /* for pgtable_remove_rmap() */ +/* + * allocating and freeing a pmd is trivial: the 1-entry pmd is + * inside the pgd, so has no extra memory associated with it. + * (In the PAE case we free the pmds as part of the pgd.) + */ + +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_free(x) do { } while (0) +#define __pmd_free_tlb(tlb,x) do { } while (0) +#define pgd_populate(mm, pmd, pte) BUG() + +#define check_pgt_cache() do { } while (0) #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) -{ - set_pmd(pmd, __pmd(_PAGE_TABLE + - ((unsigned long long)page_to_pfn(pte) << - (unsigned long long) PAGE_SHIFT))); -} +struct mmu_gather; + /* * Allocate and free page tables. */ - -extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); - -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); +void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *page); +pgd_t *pgd_alloc(struct mm_struct *); +void pgd_free(pgd_t *pgd); +pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); static inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct page *page) { - __free_page(pte); + put_page(page); } +#include -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +static inline struct page *pte_alloc_fresh(void) +{ + struct page *page = alloc_page(GFP_PTE); + if (page) { + clear_highpage(page); + BUG_ON(PagePTE(page)); + SetPagePTE(page); + } + return page; +} -/* - * allocating and freeing a pmd is trivial: the 1-entry pmd is - * inside the pgd, so has no extra memory associated with it. - * (In the PAE case we free the pmds as part of the pgd.) 
- */ +static inline struct page *pte_alloc_ready(void) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + struct page *page; -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) -#define pgd_populate(mm, pmd, pte) BUG() + BUG_ON(tlb->nr_pte_ready < 0); + if (!tlb->nr_pte_ready) { + BUG_ON(tlb->nr_pte_active < 0); + BUG_ON(tlb->nr_nonpte < 0); + page = NULL; + } else { + int zone; + for (zone = MAX_ZONE_ID - 1; zone >= 0; --zone) { + if (!list_empty(&tlb->ready_list[zone])) + break; + } + + BUG_ON(zone < 0); + BUG_ON(list_empty(&tlb->ready_list[zone])); + + page = list_entry(tlb->ready_list[zone].next, struct page, list); + BUG_ON(PagePTE(page)); + SetPagePTE(page); + list_del(&page->list); + atomic_set(&page->count, 1); + tlb->ready_count[zone]--; + tlb->nr_pte_ready--; + BUG_ON(tlb->ready_count[zone] < 0); + BUG_ON(tlb->nr_pte_ready < 0); + } + put_cpu(); + return page; +} -#define check_pgt_cache() do { } while (0) +static inline struct page *pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page = pte_alloc_ready(); + return page ? page : pte_alloc_fresh(); +} #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.6.0-test11/include/asm-i386/pgtable-2level.h pgcl-2.6.0-test11-1/include/asm-i386/pgtable-2level.h --- linux-2.6.0-test11/include/asm-i386/pgtable-2level.h 2003-11-26 12:44:17.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/pgtable-2level.h 2003-11-27 21:55:19.000000000 -0800 @@ -17,6 +17,7 @@ #define PTRS_PER_PTE 1024 +#ifndef __ASSEMBLY__ #define pte_ERROR(e) \ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) #define pmd_ERROR(e) \ @@ -34,12 +35,32 @@ static inline int pgd_bad(pgd_t pgd) { static inline int pgd_present(pgd_t pgd) { return 1; } #define pgd_clear(xp) do { } while (0) +#if 0 +#define bad_pte_check(pteval) \ +({ \ + pte_t __pte_val__ = pteval; \ + int __bad_pte__ = !!((__pte_val__.pte_low >> 24) == 0x67); \ + if (__bad_pte__) { \ + WARN_ON(1); \ + printk("bad pte 0x%Lx!!\n", (u64)pte_val(__pte_val__)); \ + } \ + __bad_pte__; \ +}) +#else +#define bad_pte_check(pteval) ({ 0; }) +#endif + /* * Certain architectures need to do special things when PTEs * within a page table are directly modified. Thus, the following * hook is made available. 
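+ *
+ * In this tree set_pte() and friends additionally run bad_pte_check()
+ * on the value being installed; with the debug #if above enabled that
+ * warns on a known-bad bit pattern, otherwise it compiles away to
+ * ({ 0; }).  The value is first copied into a uniquely named local so
+ * the pteval argument is only evaluated once, e.g.
+ *
+ *	set_pte(ptep, mk_pte(page, prot));
+ *
+ * expands to a do/while block that evaluates mk_pte() a single time.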
*/ -#define set_pte(pteptr, pteval) (*(pteptr) = pteval) +#define set_pte(pteptr, pteval) \ +do { \ + pte_t __set_pte_val__ = pteval; \ + bad_pte_check(__set_pte_val__); \ + *(pteptr) = __set_pte_val__; \ +} while (0) #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) /* * (pmds are folded into pgds so this doesn't get actually called, @@ -49,19 +70,60 @@ static inline int pgd_present(pgd_t pgd) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) #define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +((unsigned long) __va(pgd_val(pgd) & MMUPAGE_MASK)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { return (pmd_t *) dir; } -#define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) -#define pte_same(a, b) ((a).pte_low == (b).pte_low) -#define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_none(x) (!(x).pte_low) -#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) -#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) + +#define ptep_get_and_clear(pteptr) \ +({ \ + pte_t *__ptep_get_and_clr_ptr__ = pteptr; \ + bad_pte_check(*__ptep_get_and_clr_ptr__); \ + __pte(xchg(&__ptep_get_and_clr_ptr__->pte_low, 0)); \ +}) + +#define pte_same(a, b) \ +({ \ + pte_t __pte_same_a__ = (a), __pte_same_b__ = (b); \ + bad_pte_check(__pte_same_a__); \ + bad_pte_check(__pte_same_b__); \ + __pte_same_a__.pte_low == __pte_same_b__.pte_low; \ +}) + +#define pte_page(pte) \ +({ \ + pte_t __pte_page_pte__ = pte; \ + bad_pte_check(__pte_page_pte__); \ + pfn_to_page(pte_pfn(__pte_page_pte__)); \ +}) + +#define pte_none(pte) \ +({ \ + pte_t __pte_none_pte__ = (pte); \ + bad_pte_check(__pte_none_pte__); \ + !__pte_none_pte__.pte_low; \ +}) + +#define pte_pfn(pte) \ +({ \ + pte_t __pte_pfn_pte__ = pte; \ + bad_pte_check(__pte_pfn_pte__); \ + __pte_pfn_pte__.pte_low/MMUPAGE_SIZE; \ +}) + +#define pfn_pte(pfn, prot) \ +({ \ + pte_t __pfn_pte_pte__; \ + __pfn_pte_pte__ = __pte((MMUPAGE_SIZE*(pfn))|pgprot_val(prot)); \ + bad_pte_check(__pfn_pte_pte__); \ + __pfn_pte_pte__; \ +}) + +#define pfn_pmd(pfn, prot) __pmd(((pfn)<> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 )) - -#define pgoff_to_pte(off) \ - ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE }) +#define __pte_to_pgoff(pte) \ +({ \ + pte_t ___pte_to_pgoff_pte___ = pte; \ + BUG_ON(!(___pte_to_pgoff_pte___.pte_low & ~_PAGE_FILE)); \ + ((___pte_to_pgoff_pte___.pte_low >> 1) & 0x1f) \ + | ((___pte_to_pgoff_pte___.pte_low >> 8) << 5); \ +}) + +#define __pgoff_to_pte(off) \ +({ \ + unsigned long __pgoff_to_pte_pteval__; \ + unsigned long __pgoff_to_pte_off__ = off; \ + __pgoff_to_pte_pteval__ = ((__pgoff_to_pte_off__ & 0x1f) << 1) \ + | ((__pgoff_to_pte_off__ >> 5) << 8); \ + BUG_ON(!__pgoff_to_pte_pteval__); \ + (pte_t) { __pgoff_to_pte_pteval__ | _PAGE_FILE }; \ +}) + +#define pte_to_pgoff(pte) \ +({ \ + pte_t __pte_to_pgoff_pte__ = pte; \ + bad_pte_check(__pte_to_pgoff_pte__); \ + __pte_to_pgoff(__pte_to_pgoff_pte__); \ +}) + +#define pgoff_to_pte(off) \ +({ \ + pte_t __pgoff_to_pte_pte__ = __pgoff_to_pte(off); \ + bad_pte_check(__pgoff_to_pte_pte__); \ + __pgoff_to_pte_pte__; \ +}) #endif /* _I386_PGTABLE_2LEVEL_H */ diff -prauN linux-2.6.0-test11/include/asm-i386/pgtable-3level.h pgcl-2.6.0-test11-1/include/asm-i386/pgtable-3level.h --- linux-2.6.0-test11/include/asm-i386/pgtable-3level.h 2003-11-26 12:45:20.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/pgtable-3level.h 
2003-11-27 21:55:19.000000000 -0800 @@ -33,6 +33,9 @@ #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e)) +#define bad_pte_check(x) ({ 0; }) + +#ifndef __ASSEMBLY__ static inline int pgd_none(pgd_t pgd) { return 0; } static inline int pgd_bad(pgd_t pgd) { return 0; } static inline int pgd_present(pgd_t pgd) { return 1; } @@ -65,7 +68,7 @@ static inline void set_pte(pte_t *ptep, static inline void pgd_clear (pgd_t * pgd) { } #define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +((unsigned long) __va(pgd_val(pgd) & MMUPAGE_MASK)) /* Find an entry in the second-level page table.. */ #define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ @@ -97,22 +100,22 @@ static inline int pte_none(pte_t pte) static inline unsigned long pte_pfn(pte_t pte) { - return (pte.pte_low >> PAGE_SHIFT) | - (pte.pte_high << (32 - PAGE_SHIFT)); + return (pte.pte_low >> MMUPAGE_SHIFT) | + (pte.pte_high << (32 - MMUPAGE_SHIFT)); } static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { pte_t pte; - pte.pte_high = page_nr >> (32 - PAGE_SHIFT); - pte.pte_low = (page_nr << PAGE_SHIFT) | pgprot_val(pgprot); + pte.pte_high = page_nr >> (32 - MMUPAGE_SHIFT); + pte.pte_low = (page_nr << MMUPAGE_SHIFT) | pgprot_val(pgprot); return pte; } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); + return __pmd(((unsigned long long)page_nr << MMUPAGE_SHIFT) | pgprot_val(pgprot)); } /* @@ -121,6 +124,9 @@ static inline pmd_t pfn_pmd(unsigned lon */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) +#endif /* __ASSEMBLY__ */ + + #define PTE_FILE_MAX_BITS 32 #endif /* _I386_PGTABLE_3LEVEL_H */ diff -prauN linux-2.6.0-test11/include/asm-i386/pgtable.h pgcl-2.6.0-test11-1/include/asm-i386/pgtable.h --- linux-2.6.0-test11/include/asm-i386/pgtable.h 2003-11-26 12:44:59.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/pgtable.h 2003-11-27 22:21:05.000000000 -0800 @@ -29,8 +29,9 @@ * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) zero_page extern unsigned long empty_zero_page[1024]; +extern struct page *zero_page; extern pgd_t swapper_pg_dir[1024]; extern kmem_cache_t *pgd_cache; extern kmem_cache_t *pmd_cache; @@ -50,13 +51,11 @@ void paging_init(void); * implements both the traditional 2-level x86 page tables and the * newer 3-level PAE-mode page tables. */ -#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include #else # include #endif -#endif #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) @@ -82,14 +81,13 @@ void paging_init(void); * The vmalloc() routines leaves a hole of 4kB between each vmalloced * area for the same reason. ;) */ -#define VMALLOC_OFFSET (8*1024*1024) -#define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ - ~(VMALLOC_OFFSET-1)) -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif +#define VMALLOC_END (FIXADDR_START-2*MMUPAGE_SIZE) +#define __VMALLOC_START (VMALLOC_END - VMALLOC_RESERVE - 2*MMUPAGE_SIZE) +#define VMALLOC_START \ + (high_memory \ + ? max(__VMALLOC_START, (unsigned long)high_memory) \ + : __VMALLOC_START \ + ) /* * The 4MB page is guessing.. 
Detailed in the infamous "Chapter H" @@ -176,13 +174,17 @@ extern unsigned long __PAGE_KERNEL; /* page table for 0-4MB for everybody */ extern unsigned long pg0[1024]; -#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) +#define pte_clear(xp) \ +do { \ + pte_t *__pte_clear_pte__ = xp; \ + bad_pte_check(*__pte_clear_pte__); \ + set_pte(__pte_clear_pte__, __pte(0)); \ +} while (0) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_bad(x) ((pmd_val(x) & (~MMUPAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) @@ -191,33 +193,89 @@ extern unsigned long pg0[1024]; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } -static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } -static inline int pte_exec(pte_t pte) { return (pte).pte_low & _PAGE_USER; } -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } +#define pte_chkflag(pte, func, flag) \ +({ \ + pte_t __pte_chkflag_##func##_pte__ = pte; \ + bad_pte_check(__pte_chkflag_##func##_pte__); \ + !!(__pte_chkflag_##func##_pte__.pte_low & (flag)); \ +}) + +#define pte_present(pte) pte_chkflag(pte, present, _PAGE_PRESENT|_PAGE_PROTNONE) +#define pte_user(pte) pte_chkflag(pte, user, _PAGE_USER) +#define pte_read(pte) pte_chkflag(pte, read, _PAGE_USER) +#define pte_exec(pte) pte_chkflag(pte, exec, _PAGE_USER) + +#define pte_dirty(pte) pte_chkflag(pte, dirty, _PAGE_DIRTY) +#define pte_young(pte) pte_chkflag(pte, young, _PAGE_ACCESSED) +#define pte_write(pte) pte_chkflag(pte, write, _PAGE_RW) /* * The following only works if pte_present() is not true.
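The pte_chkflag() template above (and the pte_clrflag()/pte_setflag() variants that follow) replaces the old per-flag inline functions with GCC statement expressions so that every accessor picks up a bad_pte_check() hook. A minimal userspace model of the same pattern, with stand-in flag values and the ##func## name pasting and bad_pte_check() hooks dropped for brevity (this is not the kernel's code):

#include <assert.h>
#include <stdio.h>

typedef struct { unsigned long pte_low; } pte_t;

/* stand-in flag bits, not the kernel's values */
#define _PAGE_RW    0x002
#define _PAGE_DIRTY 0x040

#define pte_chkflag(pte, flag) ({ pte_t __p = (pte); !!(__p.pte_low & (flag)); })
#define pte_setflag(pte, flag) ({ pte_t __p = (pte); __p.pte_low |= (flag); __p; })
#define pte_clrflag(pte, flag) ({ pte_t __p = (pte); __p.pte_low &= ~(flag); __p; })

#define pte_write(pte)     pte_chkflag(pte, _PAGE_RW)
#define pte_mkdirty(pte)   pte_setflag(pte, _PAGE_DIRTY)
#define pte_wrprotect(pte) pte_clrflag(pte, _PAGE_RW)

int main(void)
{
	pte_t pte = { _PAGE_RW };

	assert(pte_write(pte));
	pte = pte_mkdirty(pte);		/* returns a new value, like the kernel macros */
	pte = pte_wrprotect(pte);
	assert(!pte_write(pte) && (pte.pte_low & _PAGE_DIRTY));
	printf("pte_low = %#lx\n", pte.pte_low);
	return 0;
}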
*/ -static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; } +#define pte_file(pte) pte_chkflag(pte, file, _PAGE_FILE) + + +#define pte_clrflag(pte, func, flag) \ +({ \ + pte_t __pte_clrflag_##func##_pte__ = pte; \ + bad_pte_check(__pte_clrflag_##func##_pte__); \ + __pte_clrflag_##func##_pte__.pte_low &= ~(flag); \ + __pte_clrflag_##func##_pte__; \ +}) + +#define pte_rdprotect(pte) pte_clrflag(pte, rdprotect, _PAGE_USER) +#define pte_exprotect(pte) pte_clrflag(pte, exprotect, _PAGE_USER) +#define pte_mkclean(pte) pte_clrflag(pte, mkclean, _PAGE_DIRTY) +#define pte_mkold(pte) pte_clrflag(pte, mkold, _PAGE_ACCESSED) +#define pte_wrprotect(pte) pte_clrflag(pte, wrprotect, _PAGE_RW) + +#ifndef DEBUG_PGCL +#define PGCL_BUG_ON(p) do { } while (0) +#else +#define PGCL_BUG_ON(p) BUG_ON(p) +#endif -static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } -static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; } -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; } -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; } -static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } -static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } - -static inline int ptep_test_and_clear_dirty(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } -static inline int ptep_test_and_clear_young(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } -static inline void ptep_set_wrprotect(pte_t *ptep) { clear_bit(_PAGE_BIT_RW, &ptep->pte_low); } -static inline void ptep_mkdirty(pte_t *ptep) { set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } +#define pte_setflag(pte, func, flag) \ +({ \ + pte_t __pte_setflag_##func##_pte__ = pte; \ + PGCL_BUG_ON(pte_none(__pte_setflag_##func##_pte__)); \ + bad_pte_check(__pte_setflag_##func##_pte__); \ + __pte_setflag_##func##_pte__.pte_low |= (flag); \ + __pte_setflag_##func##_pte__; \ +}) + +#define pte_mkread(pte) pte_setflag(pte, mkread, _PAGE_USER) +#define pte_mkexec(pte) pte_setflag(pte, mkexec, _PAGE_USER) +#define pte_mkdirty(pte) pte_setflag(pte, mkdirty, _PAGE_DIRTY) +#define pte_mkyoung(pte) pte_setflag(pte, mkyoung, _PAGE_ACCESSED) +#define pte_mkwrite(pte) pte_setflag(pte, mkwrite, _PAGE_RW) + +#define pte_testclrbit(pte, func, bit) \ +({ \ + pte_t *__pte_testclrbit_##func##_pte__ = pte; \ + PGCL_BUG_ON(pte_none(*__pte_testclrbit_##func##_pte__)); \ + bad_pte_check(*__pte_testclrbit_##func##_pte__); \ + test_and_clear_bit(bit, &__pte_testclrbit_##func##_pte__->pte_low);\ +}) + +#define ptep_test_and_clear_dirty(pte) pte_testclrbit(pte, dirty, _PAGE_BIT_DIRTY) +#define ptep_test_and_clear_young(pte) pte_testclrbit(pte, young, _PAGE_BIT_ACCESSED) + +#define ptep_set_wrprotect(pte) \ +({ \ + pte_t *__pte_wrprotect_pte__ = pte; \ + bad_pte_check(*__pte_wrprotect_pte__); \ + clear_bit(_PAGE_BIT_RW, &__pte_wrprotect_pte__->pte_low); \ +}) + + +#define ptep_mkdirty(pte) \ +({ \ + pte_t *__ptep_mkdirty_pte__ = pte; \ + bad_pte_check(*__ptep_mkdirty_pte__); \ + 
PGCL_BUG_ON(pte_none(*__ptep_mkdirty_pte__)); \ + set_bit(_PAGE_BIT_DIRTY, &__ptep_mkdirty_pte__->pte_low); \ +}) /* * Macro to mark a page protection value as "uncacheable". On processors which do not support @@ -234,20 +292,23 @@ static inline void ptep_mkdirty(pte_t *p #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) #define mk_pte_huge(entry) ((entry).pte_low |= _PAGE_PRESENT | _PAGE_PSE) -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - pte.pte_low &= _PAGE_CHG_MASK; - pte.pte_low |= pgprot_val(newprot); - return pte; -} - -#define page_pte(page) page_pte_prot(page, __pgprot(0)) +#define pte_modify(pte, prot) \ +({ \ + pte_t __pte_modify_pte__ = pte; \ + bad_pte_check(__pte_modify_pte__); \ + __pte_modify_pte__.pte_low &= _PAGE_CHG_MASK; \ + bad_pte_check(__pte_modify_pte__); \ + PGCL_BUG_ON(pgprot_val(prot) && pte_none(__pte_modify_pte__)); \ + __pte_modify_pte__.pte_low |= pgprot_val(prot); \ + bad_pte_check(__pte_modify_pte__); \ + __pte_modify_pte__; \ +}) #define pmd_page_kernel(pmd) \ -((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +((unsigned long) __va(pmd_val(pmd) & MMUPAGE_MASK)) #ifndef CONFIG_DISCONTIGMEM -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> MMUPAGE_SHIFT)) #endif /* !CONFIG_DISCONTIGMEM */ #define pmd_large(pmd) \ @@ -289,35 +350,52 @@ static inline pte_t pte_modify(pte_t pte * control the given virtual address */ #define pte_index(address) \ - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + (((address) >> MMUPAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) +( \ + (pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) +( \ + (pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else #define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) +( \ + (pte_t *)page_address(pmd_page(*(dir))) \ + + (PTRS_PER_PTE*((pmd_val(*(dir))/MMUPAGE_SIZE)%PAGE_MMUCOUNT)\ + + pte_index(address)) \ +) #define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) #endif -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; +#ifdef CONFIG_HIGHPTE +#define pte_offset_phys(pmd, addr) \ +({ \ + (pte_addr_t)(pmd_val(*(pmd)) & MMUPAGE_MASK) \ + + pte_index(addr)*sizeof(pte_t); \ +}) +#else +#define pte_offset_phys(pmd, addr) \ + ((pte_addr_t)pte_offset_kernel(pmd, addr)) #endif -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) +#if defined(CONFIG_HIGHMEM64G) && defined(CONFIG_HIGHPTE) typedef u64 pte_addr_t; -#endif - -#if !defined(CONFIG_HIGHPTE) -typedef pte_t *pte_addr_t; +#else +typedef u32 pte_addr_t; #endif /* @@ -330,8 +408,20 @@ typedef pte_t *pte_addr_t; #define __swp_type(x) (((x).val >> 1) & 0x1f) #define __swp_offset(x) ((x).val >> 8) #define __swp_entry(type, offset) 
((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + +#define __pte_to_swp_entry(pte) \ +({ \ + pte_t __pte_to_swp_entry_pte__ = (pte); \ + bad_pte_check(__pte_to_swp_entry_pte__); \ + (swp_entry_t) { __pte_to_swp_entry_pte__.pte_low }; \ +}) + +#define __swp_entry_to_pte(x) \ +({ \ + pte_t ___swp_entry_to_pte__ = (pte_t) { (x).val }; \ + bad_pte_check(___swp_entry_to_pte__); \ + ___swp_entry_to_pte__; \ +}) #endif /* !__ASSEMBLY__ */ diff -prauN linux-2.6.0-test11/include/asm-i386/rmap.h pgcl-2.6.0-test11-1/include/asm-i386/rmap.h --- linux-2.6.0-test11/include/asm-i386/rmap.h 2003-11-26 12:44:30.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/rmap.h 2003-11-27 21:55:19.000000000 -0800 @@ -5,10 +5,17 @@ #include #ifdef CONFIG_HIGHPTE +/* + * The byte offset needs to be relative to PAGE_SIZE, the pfn will be + * implicitly truncated to a PAGE_SIZE boundary, the mapping will be + * returned rounded downward, and will need compensation by adding in + * the paddr's offset within the PAGE_SIZE-aligned region to the vaddr + * returned from kmap_atomic(). + */ static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) { - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; + unsigned long pfn = (unsigned long)(pte_paddr/MMUPAGE_SIZE); + unsigned long off = (unsigned long)pte_paddr & ~PAGE_MASK; return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); } diff -prauN linux-2.6.0-test11/include/asm-i386/setup.h pgcl-2.6.0-test11-1/include/asm-i386/setup.h --- linux-2.6.0-test11/include/asm-i386/setup.h 2003-11-26 12:44:41.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/setup.h 2003-11-27 21:55:19.000000000 -0800 @@ -6,15 +6,15 @@ #ifndef _i386_SETUP_H #define _i386_SETUP_H -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) +#define PFN_UP(x) (((x) + MMUPAGE_SIZE-1) >> MMUPAGE_SHIFT) +#define PFN_DOWN(x) ((x) >> MMUPAGE_SHIFT) +#define PFN_PHYS(x) ((x) << MMUPAGE_SHIFT) /* * Reserved space for vmalloc and iomap - defined in asm/page.h */ #define MAXMEM_PFN PFN_DOWN(MAXMEM) -#define MAX_NONPAE_PFN (1 << 20) +#define MAX_NONPAE_PFN (1 << (32 - MMUPAGE_SHIFT)) /* * This is set up by the setup-routine at boot-time diff -prauN linux-2.6.0-test11/include/asm-i386/shmparam.h pgcl-2.6.0-test11-1/include/asm-i386/shmparam.h --- linux-2.6.0-test11/include/asm-i386/shmparam.h 2003-11-26 12:43:33.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/shmparam.h 2003-11-27 21:55:19.000000000 -0800 @@ -1,6 +1,6 @@ #ifndef _ASMI386_SHMPARAM_H #define _ASMI386_SHMPARAM_H -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ +#define SHMLBA MMUPAGE_SIZE /* attach addr a multiple of this */ #endif /* _ASMI386_SHMPARAM_H */ diff -prauN linux-2.6.0-test11/include/asm-i386/thread_info.h pgcl-2.6.0-test11-1/include/asm-i386/thread_info.h --- linux-2.6.0-test11/include/asm-i386/thread_info.h 2003-11-26 12:43:06.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/thread_info.h 2003-11-27 21:55:19.000000000 -0800 @@ -53,6 +53,8 @@ struct thread_info { #endif #define PREEMPT_ACTIVE 0x4000000 +#define THREAD_SIZE (2*MMUPAGE_SIZE) +#define INIT_THREAD_SIZE THREAD_SIZE /* * macros/functions for gaining access to the thread information structure @@ -81,12 +83,11 @@ struct thread_info { static inline struct 
thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } /* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) #define alloc_thread_info(task) ((struct thread_info *)kmalloc(THREAD_SIZE, GFP_KERNEL)) #define free_thread_info(info) kfree(info) #define get_thread_info(ti) get_task_struct((ti)->task) @@ -96,7 +97,7 @@ static inline struct thread_info *curren /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $~(THREAD_SIZE-1), reg; \ andl %esp, reg #endif diff -prauN linux-2.6.0-test11/include/asm-i386/tlb.h pgcl-2.6.0-test11-1/include/asm-i386/tlb.h --- linux-2.6.0-test11/include/asm-i386/tlb.h 2003-11-26 12:43:30.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/tlb.h 2003-11-27 21:55:19.000000000 -0800 @@ -1,20 +1,210 @@ #ifndef _I386_TLB_H #define _I386_TLB_H +#include +#include +#include +#include +#include +#include +#include +#include + +#define __GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#ifdef CONFIG_HIGHMEM +#define GFP_PTE (__GFP_PTE|__GFP_HIGHMEM) +#else +#define GFP_PTE __GFP_PTE +#endif + /* - * x86 doesn't need any special per-pte or - * per-vma handling.. + * There are probably better ways to set these thresholds. + * The degenerate cases bother me. */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +#define PG_PTE PG_arch_1 +#define NR_PTE (128 > 8*PAGE_MMUCOUNT ? 128/PAGE_MMUCOUNT : 8) +#define FREE_PTE_NR NR_PTE +#define NR_NONPTE (512 > 8*PAGE_MMUCOUNT ? 512/PAGE_MMUCOUNT : 8) +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) /* - * .. because we flush the whole mm when it - * fills up. + * x86 doesn't need any special per-pte or + * per-vma handling... + * + * We do it anyway to cache pagetables with highpte. 
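The thread_info changes above stop hard-coding -8192 and instead mask the stack pointer with ~(THREAD_SIZE - 1), so the same code keeps working now that THREAD_SIZE is expressed in MMUPAGE_SIZE units. A rough userspace illustration of the masking trick, assuming THREAD_SIZE = 8192 and using posix_memalign() to stand in for the kernel's aligned stack allocation:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define THREAD_SIZE 8192UL	/* 2 * MMUPAGE_SIZE with 4KB mmupages */

struct thread_info { int cpu; };	/* toy stand-in */

int main(void)
{
	void *stack;

	/* the kernel gets this alignment from its stack allocator */
	if (posix_memalign(&stack, THREAD_SIZE, THREAD_SIZE))
		return 1;

	struct thread_info *ti = stack;	/* lives at the base of the stack area */
	ti->cpu = 3;

	/* any address inside the stack recovers the base by masking */
	uintptr_t sp = (uintptr_t)stack + 5000;
	struct thread_info *found = (struct thread_info *)(sp & ~(THREAD_SIZE - 1));

	assert(found == ti && found->cpu == 3);
	printf("recovered thread_info at %p\n", (void *)found);
	free(stack);
	return 0;
}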
*/ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) +struct vm_area_struct; +struct mmu_gather { + struct mm_struct *mm; -#include + /* number of active ptes needing a TLB flush before reuse */ + int nr_pte_active; -#endif + /* whether some ptes were unmapped */ + unsigned int need_flush; + + /* non-zero means full mm flush */ + unsigned int fullmm; + + /* number freed for RSS adjustment */ + unsigned long freed; + + /* number of ready ptes */ + int nr_pte_ready; + + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + + int nr_nonpte; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + +void tlb_flush_ready(struct mmu_gather *tlb); +void tlb_init(void); + +static inline void tlb_start_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + +static inline void tlb_end_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + +static inline void tlb_inc_freed(struct mmu_gather *tlb) +{ + tlb->freed++; +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + tlb->need_flush = 1; +} + +static inline void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); +} + +static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, + unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + tlb->freed = 0; + put_cpu(); + return tlb; +} + +static inline void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, + unsigned long end) +{ + int zone; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) { + BUG_ON(tlb->nr_nonpte < 0); + BUG_ON(tlb->nr_pte_active < 0); + BUG_ON(tlb->nr_pte_ready < 0); + return; + } + + tlb->need_flush = 0; + tlb_flush(tlb); + BUG_ON(tlb->nr_nonpte < 0); + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (list_empty(&tlb->active_list[zone])) { + BUG_ON(tlb->active_count[zone]); + continue; + } + + list_splice_init(&tlb->active_list[zone], + &tlb->ready_list[zone]); + BUG_ON(tlb->active_count[zone] < 0); + BUG_ON(tlb->ready_count[zone] < 0); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) + tlb_flush_ready(tlb); +} + +static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, + unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss -= tlb->freed; + else + tlb->mm->rss = 0; + tlb->freed = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline void tlb_remove_nonpte_page(struct mmu_gather *tlb, + struct page *page) +{ + BUG_ON(tlb->nr_nonpte >= NR_NONPTE); + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte == NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); +} + +static inline void tlb_remove_pte_page(struct mmu_gather *tlb, + struct page *page) +{ + int zone; + + if (!atomic_dec_and_test(&page->count)) + return; + + zone = PageZoneID(page); + ClearPagePTE(page); + BUG_ON(tlb->nr_pte_active < 0); + BUG_ON(tlb->active_count[zone] < 0); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + 
tlb_remove_nonpte_page(tlb, page); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, + unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline struct mm_struct *tlb_mm(struct mmu_gather *tlb) +{ + return tlb->mm; +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.6.0-test11/include/asm-i386/tlbflush.h pgcl-2.6.0-test11-1/include/asm-i386/tlbflush.h --- linux-2.6.0-test11/include/asm-i386/tlbflush.h 2003-11-26 12:45:46.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-i386/tlbflush.h 2003-11-27 21:55:19.000000000 -0800 @@ -92,8 +92,17 @@ static inline void flush_tlb_mm(struct m static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { + int k; if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long vaddr = addr + k*MMUPAGE_SIZE; + if (vaddr < vma->vm_start) + continue; + else if (vaddr >= vma->vm_end) + break; + else + __flush_tlb_one(vaddr); + } } static inline void flush_tlb_range(struct vm_area_struct *vma, @@ -110,10 +119,10 @@ static inline void flush_tlb_range(struc #define local_flush_tlb() \ __flush_tlb() -extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); -extern void flush_tlb_mm(struct mm_struct *); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); +void flush_tlb_all(void); +void flush_tlb_current_task(void); +void flush_tlb_mm(struct mm_struct *); +void flush_tlb_page(struct vm_area_struct *, unsigned long); #define flush_tlb() flush_tlb_current_task() diff -prauN linux-2.6.0-test11/include/asm-ia64/page.h pgcl-2.6.0-test11-1/include/asm-ia64/page.h --- linux-2.6.0-test11/include/asm-ia64/page.h 2003-11-26 12:43:05.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-ia64/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -201,4 +201,6 @@ get_order (unsigned long size) (((current->thread.flags & IA64_THREAD_XSTACK) != 0) \ ? 
VM_EXEC : 0)) +#include + #endif /* _ASM_IA64_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-m68k/page.h pgcl-2.6.0-test11-1/include/asm-m68k/page.h --- linux-2.6.0-test11/include/asm-m68k/page.h 2003-11-26 12:46:08.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-m68k/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -192,6 +192,8 @@ static inline void *__va(unsigned long x #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _M68K_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-m68knommu/page.h pgcl-2.6.0-test11-1/include/asm-m68knommu/page.h --- linux-2.6.0-test11/include/asm-m68knommu/page.h 2003-11-26 12:43:49.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-m68knommu/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -94,6 +94,8 @@ extern unsigned long memory_end; #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) +#include + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff -prauN linux-2.6.0-test11/include/asm-mips/page.h pgcl-2.6.0-test11-1/include/asm-mips/page.h --- linux-2.6.0-test11/include/asm-mips/page.h 2003-11-26 12:43:35.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-mips/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -126,6 +126,8 @@ static __inline__ int get_order(unsigned #define UNCAC_ADDR(addr) ((addr) - PAGE_OFFSET + UNCAC_BASE) #define CAC_ADDR(addr) ((addr) - UNCAC_BASE + PAGE_OFFSET) +#include + #endif /* defined (__KERNEL__) */ #endif /* _ASM_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-parisc/page.h pgcl-2.6.0-test11-1/include/asm-parisc/page.h --- linux-2.6.0-test11/include/asm-parisc/page.h 2003-11-26 12:43:24.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-parisc/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -113,6 +113,8 @@ extern int npmem_ranges; #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PARISC_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-ppc/page.h pgcl-2.6.0-test11-1/include/asm-ppc/page.h --- linux-2.6.0-test11/include/asm-ppc/page.h 2003-11-26 12:43:27.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-ppc/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -162,5 +162,7 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PPC_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-ppc64/page.h pgcl-2.6.0-test11-1/include/asm-ppc64/page.h --- linux-2.6.0-test11/include/asm-ppc64/page.h 2003-11-26 12:42:49.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-ppc64/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -237,5 +237,7 @@ static inline int get_order(unsigned lon #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _PPC64_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-s390/page.h pgcl-2.6.0-test11-1/include/asm-s390/page.h --- linux-2.6.0-test11/include/asm-s390/page.h 2003-11-26 12:43:09.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-s390/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -181,6 +181,8 @@ typedef struct { unsigned long pgd; } pg #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _S390_PAGE_H */ diff 
-prauN linux-2.6.0-test11/include/asm-sh/page.h pgcl-2.6.0-test11-1/include/asm-sh/page.h --- linux-2.6.0-test11/include/asm-sh/page.h 2003-11-26 12:45:33.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-sh/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -118,6 +118,8 @@ static __inline__ int get_order(unsigned #endif +#include + #endif /* __KERNEL__ */ #endif /* __ASM_SH_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-sparc/page.h pgcl-2.6.0-test11-1/include/asm-sparc/page.h --- linux-2.6.0-test11/include/asm-sparc/page.h 2003-11-26 12:43:27.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-sparc/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -171,6 +171,8 @@ extern __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* __KERNEL__ */ #endif /* _SPARC_PAGE_H */ diff -prauN linux-2.6.0-test11/include/asm-sparc64/page.h pgcl-2.6.0-test11-1/include/asm-sparc64/page.h --- linux-2.6.0-test11/include/asm-sparc64/page.h 2003-11-26 12:43:41.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-sparc64/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -174,6 +174,8 @@ static __inline__ int get_order(unsigned #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#include + #endif /* !(__KERNEL__) */ #endif /* !(_SPARC64_PAGE_H) */ diff -prauN linux-2.6.0-test11/include/asm-v850/page.h pgcl-2.6.0-test11-1/include/asm-v850/page.h --- linux-2.6.0-test11/include/asm-v850/page.h 2003-11-26 12:44:29.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-v850/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -140,6 +140,7 @@ extern __inline__ int get_order (unsigne #define __pa(x) __virt_to_phys ((unsigned long)(x)) #define __va(x) ((void *)__phys_to_virt ((unsigned long)(x))) +#include #endif /* KERNEL */ diff -prauN linux-2.6.0-test11/include/asm-x86_64/page.h pgcl-2.6.0-test11-1/include/asm-x86_64/page.h --- linux-2.6.0-test11/include/asm-x86_64/page.h 2003-11-26 12:42:55.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/asm-x86_64/page.h 2003-11-27 21:55:19.000000000 -0800 @@ -138,6 +138,8 @@ extern __inline__ int get_order(unsigned (test_thread_flag(TIF_IA32) ? vm_stack_flags32 : vm_stack_flags) +#include + #endif /* __KERNEL__ */ #endif /* _X86_64_PAGE_H */ diff -prauN linux-2.6.0-test11/include/linux/aio.h pgcl-2.6.0-test11-1/include/linux/aio.h --- linux-2.6.0-test11/include/linux/aio.h 2003-11-26 12:42:51.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/aio.h 2003-11-27 21:55:19.000000000 -0800 @@ -104,13 +104,13 @@ struct aio_ring_info { unsigned long mmap_base; unsigned long mmap_size; - struct page **ring_pages; + unsigned long *ring_pages; spinlock_t ring_lock; long nr_pages; unsigned nr, tail; - struct page *internal_pages[AIO_RING_PAGES]; + unsigned long internal_pages[AIO_RING_PAGES]; /* pfn's */ }; struct kioctx { diff -prauN linux-2.6.0-test11/include/linux/binfmts.h pgcl-2.6.0-test11-1/include/linux/binfmts.h --- linux-2.6.0-test11/include/linux/binfmts.h 2003-11-26 12:44:15.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/binfmts.h 2003-11-27 21:55:19.000000000 -0800 @@ -2,6 +2,7 @@ #define _LINUX_BINFMTS_H #include +#include /* for PAGE_MMUCOUNT */ struct pt_regs; @@ -9,8 +10,16 @@ struct pt_regs; * MAX_ARG_PAGES defines the number of pages allocated for arguments * and envelope for the new program. 32 should suffice, this gives * a maximum env+arg of 128kB w/4KB pages! 
+ * Now that PAGE_SIZE is a software construct and varies wildly, + * MAX_ARG_PAGES should represent a constant size of 128KB. When + * PAGE_SIZE exceeds that, we're in trouble. */ -#define MAX_ARG_PAGES 32 +#if PAGE_MMUCOUNT <= 32 +#define MAX_ARG_PAGES (32/PAGE_MMUCOUNT) +#else +/* #error PAGE_SIZE too large to enforce MAX_ARG_PAGES! */ +#define MAX_ARG_PAGES 1 +#endif /* sizeof(linux_binprm->buf) */ #define BINPRM_BUF_SIZE 128 diff -prauN linux-2.6.0-test11/include/linux/bio.h pgcl-2.6.0-test11-1/include/linux/bio.h --- linux-2.6.0-test11/include/linux/bio.h 2003-11-26 12:45:10.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/bio.h 2003-11-27 21:55:19.000000000 -0800 @@ -257,26 +257,20 @@ extern void bio_check_pages_dirty(struct */ extern inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) { - unsigned long addr; + char *addr; /* * might not be a highmem page, but the preempt/irq count * balancing is a lot nicer this way */ local_irq_save(*flags); - addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); - - if (addr & ~PAGE_MASK) - BUG(); - - return (char *) addr + bvec->bv_offset; + addr = (char *)kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); + return addr + bvec->bv_offset; } extern inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) { - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - - kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); + kunmap_atomic(buffer, KM_BIO_SRC_IRQ); local_irq_restore(*flags); } diff -prauN linux-2.6.0-test11/include/linux/folio.h pgcl-2.6.0-test11-1/include/linux/folio.h --- linux-2.6.0-test11/include/linux/folio.h 1969-12-31 16:00:00.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/folio.h 2003-11-27 21:55:19.000000000 -0800 @@ -0,0 +1,679 @@ +#ifndef _LINUX_FOLIO_H +#define _LINUX_FOLIO_H + +/* + * include/linux/folio.h by Hugh Dickins hugh@veritas.com 31may01 + * + * This header file is intended for inclusion in Linux kernel source + * mm/memory.c alone. It sequesters the "folio" functions for use by + * its fault handlers do_no_page(), do_anonymous_page(), do_swap_page() + * and do_wp_page(). On a standard system, these "folio" functions are + * trivial; but on a system with "large pages" i.e. PAGE_MMUSHIFT != 0, + * they manage the awkwardness of presenting small MMUPAGE_SIZE pages + * to user programs, from a kernel pool of large PAGE_SIZE pages. + * Shared file mappings present little problem, but without this folio + * treatment, private mappings might quickly degenerate into needing + * one PAGE_SIZE page to support each MMUPAGE_SIZE mapping. + */ + +#if PAGE_MMUSHIFT + +#define bad_paddr_check(__paddr__) do { } while (0) + +/* + * Test whether pte2 indicates the same page as pte1. + */ +static /* inline */ int pte_match(pte_t *pte1, pte_t *pte2) +{ + if (pte_none(*pte1)) + return pte_none(*pte2); + else if (pte_present(*pte1)) { + unsigned long pfn1, pfn2; + + if (!pte_present(*pte2)) + return 0; + + pfn1 = pte_pfn(*pte1); + pfn2 = pte_pfn(*pte2); + + /* if pfn1 is invalid, someone screwed up */ + if (!pfn_valid(pfn1) || !pfn_valid(pfn2)) + return 0; + /* + * We want to do the following with less overhead: + * return pfn_to_page(pfn1) == pfn_to_page(pfn2); + * If gcc doesn't turn this into a shift, it's time + * to start gunning down gcc hackers. 
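pte_match() compares pfns folded by PAGE_MMUCOUNT: with a power-of-two clustering factor, two mmupage frame numbers land in the same PAGE_SIZE page exactly when their quotients agree. A tiny sketch of that test, assuming PAGE_MMUCOUNT = 8 (32KB pages over 4KB mmupages):

#include <assert.h>
#include <stdio.h>

#define PAGE_MMUCOUNT 8UL	/* assumed: 32KB PAGE_SIZE / 4KB MMUPAGE_SIZE */

/* same PAGE_SIZE page <=> same quotient; gcc turns the divide into a shift */
static int same_page(unsigned long pfn1, unsigned long pfn2)
{
	return pfn1 / PAGE_MMUCOUNT == pfn2 / PAGE_MMUCOUNT;
}

int main(void)
{
	assert(same_page(0x1000, 0x1007));	/* both in page 0x200 */
	assert(!same_page(0x1007, 0x1008));	/* crosses a page boundary */
	printf("pfn 0x1007 belongs to page index %lu\n", 0x1007UL / PAGE_MMUCOUNT);
	return 0;
}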
+ */ + return pfn1/PAGE_MMUCOUNT == pfn2/PAGE_MMUCOUNT; + } else if (pte_none(*pte2) || pte_present(*pte2)) + return 0; + else + return pte_to_swp_entry(*pte2).val/PAGE_MMUCOUNT + == pte_to_swp_entry(*pte1).val/PAGE_MMUCOUNT; +} + +/* + * Test whether nearby vma2 could ever share a private page with vma1. + */ +static /* inline */ int vma_neighbourly(struct vm_area_struct *vma1, + struct vm_area_struct *vma2) +{ + if ((vma1->vm_flags | vma2->vm_flags) & VM_MAYSHARE) + return 0; + if ((((vma1->vm_start - vma2->vm_start) >> MMUPAGE_SHIFT) - + (vma1->vm_pgoff - vma2->vm_pgoff)) & (PAGE_MMUCOUNT-1)) + return 0; + return 1; +} + +#define NOPTE (((pte_addr_t)0)) + +/* + * Prepare folio of page table pointers for the do_ fault handlers. + */ +static int prepare_folio(pte_addr_t folio[], struct vm_area_struct *vma, + unsigned long address, pte_addr_t ptep, int wide) +{ + struct vm_area_struct *vmp; + unsigned long suboffset; + unsigned long base, addr; + int subnr, ptenr; + int j, limit; + pgd_t *pgd; + pmd_t *pmd; + int reprep = 0; + + bad_paddr_check(ptep); + + suboffset = vma_suboffset(vma, address); + base = (address - suboffset) & MMUPAGE_MASK; + subnr = suboffset >> MMUPAGE_SHIFT; + ptenr = (address & ~PMD_MASK) >> MMUPAGE_SHIFT; + + /* First approximation: set full vector of probable pteps + */ + ptep -= subnr*sizeof(pte_t); + for (j = 0; j < PAGE_MMUCOUNT; j++) { + folio[j] = ptep + j*sizeof(pte_t); + bad_paddr_check(folio[j]); + } + j = 0; + + /* Second approximation: wipe pteps which don't belong to vma; + * but if wide, include neighbouring vmas perhaps sharing page. + */ + addr = base; + if (addr > TASK_SIZE) { /* wrapped */ + for (; addr > TASK_SIZE; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + } + if (addr < vma->vm_start) { + if (wide) { + for (vmp = find_vma(vma->vm_mm, addr); + vmp != vma; vmp = vmp->vm_next) { + for (; addr < vmp->vm_start; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + if (vma_neighbourly(vma, vmp)) { + j += (vmp->vm_end - addr) >> MMUPAGE_SHIFT; + addr = vmp->vm_end; + } + } + } + for (; addr < vma->vm_start; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + } + if (vma->vm_end < base + PAGE_SIZE) { + j = (vma->vm_end - base) >> MMUPAGE_SHIFT; + if (wide) { + addr = vma->vm_end; + for (vmp = vma->vm_next; vmp && + vmp->vm_start < base + PAGE_SIZE; vmp = vmp->vm_next) { + for (; addr < vmp->vm_start; addr += MMUPAGE_SIZE, j++) + folio[j] = NOPTE; + if (vma_neighbourly(vma, vmp)) { + j += (vmp->vm_end - addr) >> MMUPAGE_SHIFT; + addr = vmp->vm_end; + } + } + } + for (; j < PAGE_MMUCOUNT; j++) + folio[j] = NOPTE; + } + + /* Third approximation: fix pteps to page table below or above. 
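prepare_folio()'s first approximation steps back from the faulting address to the PAGE_SIZE-aligned folio base and assigns one candidate pte per mmupage slot. A hedged sketch of that arithmetic, reusing the vma_suboffset() formula from the mm.h hunk further down and entirely made-up example addresses (MMUPAGE_SIZE = 4KB, PAGE_MMUCOUNT = 8 assumed):

#include <stdio.h>

#define MMUPAGE_SHIFT 12
#define MMUPAGE_SIZE  (1UL << MMUPAGE_SHIFT)
#define MMUPAGE_MASK  (~(MMUPAGE_SIZE - 1))
#define PAGE_MMUCOUNT 8UL
#define PAGE_SIZE     (MMUPAGE_SIZE * PAGE_MMUCOUNT)
#define PAGE_MASK     (~(PAGE_SIZE - 1))

int main(void)
{
	/* example vma and fault address */
	unsigned long vm_start = 0x08050000, vm_pgoff = 3, address = 0x08052123;

	/* offset of the faulting mmupage within its PAGE_SIZE-sized folio */
	unsigned long suboffset = (address - vm_start + MMUPAGE_SIZE * vm_pgoff)
				  & (MMUPAGE_MASK - PAGE_MASK);
	unsigned long base  = (address - suboffset) & MMUPAGE_MASK;
	unsigned long subnr = suboffset >> MMUPAGE_SHIFT;

	printf("suboffset=%#lx base=%#lx subnr=%lu\n", suboffset, base, subnr);

	/* first approximation: one candidate mmupage per folio slot */
	for (unsigned long j = 0; j < PAGE_MMUCOUNT; j++)
		printf("folio[%lu] covers vaddr %#lx\n", j, base + j * MMUPAGE_SIZE);
	return 0;
}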
+ */ + if (subnr > ptenr) { + limit = subnr - ptenr; + for (j = 0; folio[j] == NOPTE; j++) + ; + if (j < limit) { + ptep = NOPTE; + pgd = pgd_offset(vma->vm_mm, base); + if (!pgd_none(*pgd) && !pgd_bad(*pgd)) { + pmd = pmd_offset(pgd, base); + if (!pmd_none(*pmd) && !pmd_bad(*pmd)) + ptep = pte_offset_phys(pmd, base); + } + if (ptep != NOPTE) { + for (; j < limit; j++) { + if (folio[j] != NOPTE) { + folio[j] = ptep + j*sizeof(pte_t); + bad_paddr_check(folio[j]); + } + } + } else { + for (; j < limit; j++) + folio[j] = NOPTE; + reprep = 1; + } + } + } + if (ptenr > subnr + PTRS_PER_PTE - PAGE_MMUCOUNT) { + j = subnr + PTRS_PER_PTE - ptenr; + for (limit = PAGE_MMUCOUNT; folio[limit-1] == NOPTE; limit--) + ; + if (j < limit) { + ptep = NOPTE; + base += PAGE_SIZE; + pgd = pgd_offset(vma->vm_mm, base); + if (!pgd_none(*pgd) && !pgd_bad(*pgd)) { + pmd = pmd_offset(pgd, base); + if (!pmd_none(*pmd) && !pmd_bad(*pmd)) + ptep = pte_offset_phys(pmd, base); + } + if (ptep) { + ptep -= PAGE_MMUCOUNT*sizeof(pte_t); + for (; j < limit; j++) { + if (folio[j] != NOPTE) { + folio[j] = ptep + j*sizeof(pte_t); + bad_paddr_check(folio[j]); + } + } + } else { + for (; j < limit; j++) + folio[j] = NOPTE; + reprep = 1; + } + } + } + return reprep; /* needs recall if page_table_lock dropped */ +} + +/* + * Check if the wide folio already has a private page allocated to it. + * i.e. we're trying to see if we are the sole owners of the page we + * would otherwise COW in this folio so as to merely map it read/write + * ourselves without copying, allocating, freeing, etc. + */ +static struct page *private_folio_page(pte_addr_t paddrs[], struct page *swap_page) +{ + pte_t *folio[PAGE_MMUCOUNT+1]; + unsigned long pfn; + struct page *page; + swp_entry_t entry; + pte_t swap_pte; + int fcount, pcount, scount, tcount; + int i, j; + + pr_debug("%d: private_folio_page(%p, %p)\n", + current->pid, paddrs, swap_page); + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + for (j = PAGE_MMUCOUNT - 1; !folio[j]; j--) + pr_debug("%d: skipping %d\n", current->pid, j); + fcount = j + 1; + /* + * The easiest way to handle the do_swap_page() case is + * to make up one extra element on the end of the folio: + * typically all the folio entries will be swapped out, + * and we need one present page to make sense of them. + */ + if (swap_page) { + swap_pte = mk_pte(swap_page, PAGE_KERNEL); + folio[fcount] = &swap_pte; + pr_debug("%d: putting swap_pte = 0x%p at fcount %d\n", + current->pid, &swap_pte, fcount); + fcount++; + } + + j = 0; + pr_debug("%d: starting fcount = %d\n", current->pid, fcount); + while (j < fcount) { + pr_debug("%d: folio[%d] = %p\n", current->pid, j, folio[j]); + if (!folio[j] || !pte_present(*folio[j])) { + pr_debug("%d: skipping folio[%d] = %p (0x%Lx), " + "presence = %d\n", + current->pid, + j, + folio[j], + folio[j] ? (u64)pte_val(*folio[j]) : 0, + folio[j] ? 
!!pte_present(*folio[j]) : 0); + j++; + continue; + } + tcount = 1; + pfn = pte_pfn(*folio[j]); + if (!pfn_valid(pfn)) { + j++; + continue; + } + page = pfn_to_page(pfn); + if (PageReserved(page)) { + j++; + continue; + } + while (++j < fcount) { + if (!folio[j] || !pte_present(*folio[j])) + continue; + else if (!pfn_valid(pte_pfn(*folio[j]))) + break; + else if (pte_page(*folio[j]) != page) + break; + else + tcount++; + } + if (PageSwapCache(page)) { + if (page != swap_page) { + if (TestSetPageLocked(page)) + continue; + if (!PageSwapCache(page)) { + unlock_page(page); + continue; + } + } + entry.val = page->index*PAGE_MMUCOUNT; + pcount = page_count(page) - 1; /* omit swap cache */ + if (PagePrivate(page)) /* omit bh's */ + pcount--; + scount = swap_count(page) - 1; /* omit swap cache */ + if (page != swap_page) + unlock_page(page); + if (pcount + scount > fcount) + continue; + } else { + /* Morton pages? */ + if (page->mapping) + continue; + pcount = page_count(page); + if (PagePrivate(page)) + pcount--; + scount = 0; + } + pcount -= tcount; + if (j + pcount > fcount) + continue; + + /* nuke the rest of the pte refcounts beyond tcount */ + for (i = j + 1; pcount && i < fcount; i++) { + if (!folio[i] || !pte_present(*folio[i])) + continue; + pfn = pte_pfn(*folio[i]); + if (!pfn_valid(pfn)) + continue; + if (pfn_to_page(pfn) == page) + pcount--; + } + if (pcount) + continue; + + /* do not map swapcache RW if others also have swap ptes */ + for (i = 0; scount && i < fcount; i++) { + if (!folio[i]) + continue; + else if (pte_present(*folio[i])) + continue; + else if (pte_file(*folio[i])) + continue; + /* + * entry.val % PAGE_MMUCOUNT represents the mmupage + * within the page; divide by PAGE_MMUCOUNT to see + * if they refer to the same swap entry; all indexing + * into ->swap_map[] is done with this scaling. + */ + if (pte_to_swp_entry(*folio[i]).val/PAGE_MMUCOUNT + == entry.val/PAGE_MMUCOUNT) + scount--; + } + if (scount) + continue; + kunmap_atomic_sg(folio, KM_FOLIO); + pr_debug("private_folio_page() found a private page: " + "page=%p, fcount=%d, pcount=%d, scount=%d, " + "tcount=%d\n", + page, fcount, pcount, scount, tcount); +#ifdef DEBUG_RMAP + WARN_ON(!page_truly_private(page, current->mm)); +#endif + return page; + } + kunmap_atomic_sg(folio, KM_FOLIO); + pr_debug("%d: private_folio_page: page=NULL, j=%d, fcount=%d\n", + current->pid, j, fcount); + return NULL; +} + +/* + * Replace page just allocated by private folio page if it has one. + */ +static /* inline */ struct page *private_folio_page_xchg(pte_addr_t folio[], struct page *new_page) +{ + struct page *folio_page = private_folio_page(folio, NULL); + if (!folio_page) + return new_page; + page_cache_release(new_page); + page_cache_get(folio_page); + return folio_page; +} + +/* + * Limit folio to page table entries of this vma matching this *ptep. 
+ */ +static void restrict_folio(pte_addr_t paddrs[], struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t *folio[PAGE_MMUCOUNT+1]; + unsigned long addr; + int j; + + pr_debug("%d: restrict_folio(%p, %p = [0x%lx,0x%lx), 0x%lx, %p)\n", + current->pid, paddrs, vma, vma->vm_start, vma->vm_end, + address, ptep); + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + addr = address - vma_suboffset(vma, address); + for (j = 0; j < PAGE_MMUCOUNT; j++, addr += MMUPAGE_SIZE) { + if (!folio[j]) { + pr_debug("%d: restrict_folio() saw folio[%d], addr 0x%lx NULL\n", + current->pid, j, addr); + continue; + } + if (addr < vma->vm_start || addr >= vma->vm_end) { + pr_debug("%d: restrict_folio() saw folio[%d], addr 0x%lx outside vma\n", + current->pid, j, addr); + folio[j] = NULL; + paddrs[j] = NOPTE; + } else if (!pte_match(folio[j], ptep)) { + pr_debug("%d: restrict_folio() saw folio[%d], addr 0x%lx not match, folio[j] = 0x%Lx, ptep = 0x%Lx\n", + current->pid, j, addr, + (u64)pte_val(*folio[j]), (u64)pte_val(*ptep)); + folio[j] = NULL; + paddrs[j] = NOPTE; + } else + pr_debug("%d: restrict folio saw folio[%d], addr 0x%lx = 0x%Lx match with ptep = %Lx\n", + current->pid, j, addr, + (u64)pte_val(*folio[j]), (u64)pte_val(*ptep)); + } + kunmap_atomic_sg(folio, KM_FOLIO); +} + +/* + * Copy (or clear) folio of mmupages from src_page to dst_page. + * + * There's something fragile about hugh's copy_folio() wrt. + * alignment of a folio. + */ +#if 0 +static void copy_folio(pte_addr_t paddrs[], struct page *dst_page, + struct page *src_page, unsigned long address) +{ + pte_t *folio[PAGE_MMUCOUNT]; + unsigned long pfns[PAGE_MMUCOUNT]; + char *src, *dst; + int k; + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (!folio[k]) + pfns[k] = 0; + else { + pfns[k] = pte_pfn(*folio[k]); + if (pfn_to_page(pfns[k]) == ZERO_PAGE(address)) + pfns[k] = 0; + } + } + kunmap_atomic_sg(folio, KM_FOLIO); + src = kmap_atomic_pfns(pfns, KM_USER0); + dst = kmap_atomic(dst_page, KM_USER1) + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (pfns[k]) + memcpy(&dst[MMUPAGE_SIZE*k], &src[MMUPAGE_SIZE*k], MMUPAGE_SIZE); + else + memset(&dst[MMUPAGE_SIZE*k], 0, MMUPAGE_SIZE); + } + kunmap_atomic(dst, KM_USER1); + kunmap_atomic_pfns(pfns, KM_USER0); +} +#else +static void copy_folio(pte_addr_t paddrs[], struct page *dst_page, + struct page *src_page, unsigned long address) +{ + pte_t *folio[PAGE_MMUCOUNT+1]; + char *src, *dst; + unsigned int size, offset = 0; + unsigned long src_pfn, dst_pfn; + int j = 0; + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + dst = kmap_atomic(dst_page, KM_USER0); + dst_pfn = page_to_pfn(dst_page); + if (src_page != ZERO_PAGE(address)) { + src = kmap_atomic(src_page, KM_USER1); + src_pfn = page_to_pfn(src_page); + pr_debug("%d: copying nonzero page\n", current->pid); + } else { + src = NULL; + src_pfn = 0; + pr_debug("%d: zeroing out page\n", current->pid); + } + while (j < PAGE_MMUCOUNT) { + if (!folio[j]) { + offset += MMUPAGE_SIZE; + j++; + continue; + } + size = MMUPAGE_SIZE; + while (++j < PAGE_MMUCOUNT) { + if (!folio[j]) + break; + size += MMUPAGE_SIZE; + } + /* We assume one long op is faster than several shorts. + * But ia64 sh sparc64 need to use clear/copy_user_page. 
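The second copy_folio() above merges adjacent present slots so that each run of mmupages is covered by a single memcpy() or memset() rather than PAGE_MMUCOUNT separate 4KB operations. A simplified userspace model of the run-merging loop, where presence flags stand in for non-NULL folio entries and the sizes are scaled down so the output stays readable:

#include <stdio.h>
#include <string.h>

#define MMUPAGE_SIZE  16UL	/* scaled down from 4096 for the demo */
#define PAGE_MMUCOUNT 8

int main(void)
{
	char src[MMUPAGE_SIZE * PAGE_MMUCOUNT], dst[sizeof(src)];
	int present[PAGE_MMUCOUNT] = { 1, 1, 0, 0, 1, 1, 1, 0 };

	memset(src, 'S', sizeof(src));
	memset(dst, '.', sizeof(dst));

	unsigned long offset = 0;
	int j = 0;
	while (j < PAGE_MMUCOUNT) {
		if (!present[j]) {		/* slot not mapped: skip it */
			offset += MMUPAGE_SIZE;
			j++;
			continue;
		}
		unsigned long size = MMUPAGE_SIZE;
		while (++j < PAGE_MMUCOUNT && present[j])
			size += MMUPAGE_SIZE;	/* extend the run */
		/* one long copy per run, as copy_folio() does */
		memcpy(dst + offset, src + offset, size);
		printf("copied %lu bytes at offset %lu\n", size, offset);
		offset += size;
	}
	return 0;
}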
+ */ + if (src) { + pr_debug("%d: copying %d mmupages from pfn " + "0x%lx to 0x%lx\n", + current->pid, size/MMUPAGE_SIZE, + src_pfn + offset/MMUPAGE_SIZE, + dst_pfn + offset/MMUPAGE_SIZE); + memcpy(dst + offset, src + offset, size); + } else { + pr_debug("%d: zeroing %d mmupages at pfn 0x%lx\n", + current->pid, size/MMUPAGE_SIZE, + dst_pfn + offset/MMUPAGE_SIZE); + memset(dst + offset, 0, size); + } + offset += size; + } + if (src) + kunmap_atomic(src, KM_USER1); + kunmap_atomic(dst, KM_USER0); + kunmap_atomic_sg(folio, KM_FOLIO); +} +#endif + +/* + * Update page table entries of the folio, counting how many done. + */ +static /* inline */ unsigned long set_folio_page(pte_addr_t paddrs[], + struct page *page, + pgprot_t prot, + unsigned long flags) +{ + pte_t *folio[PAGE_MMUCOUNT+1]; + unsigned long rss = 0, pfn = page_to_pfn(page); + int j; + + kmap_atomic_sg(folio, paddrs, KM_FOLIO); + + for (j = 0; j < PAGE_MMUCOUNT; j++) { + pte_t old_pte; + + if (!folio[j]) + continue; + old_pte = *folio[j]; + set_pte(folio[j], + pfn_pte(pfn + j, __pgprot(pgprot_val(prot) | flags))); + pr_debug("%d: translating vaddr 0x%lx to pfn 0x%lx, " + "new pte = 0x%Lx, old pte = 0x%Lx\n", + current->pid, + ptep_to_address(folio[j]), pfn + j, + (u64)pte_val(*folio[j]), + (u64)pte_val(old_pte)); + rss++; + } + kunmap_atomic_sg(folio, KM_FOLIO); + return rss; +} + +/* + * Flush TLB entries for the folio (if ptes were present before). + */ +static /* inline */ void flush_folio(pte_addr_t folio[], + struct vm_area_struct *vma, + unsigned long address) +{ + unsigned long start, end; + int j; + + start = (address - vma_suboffset(vma, address)) & MMUPAGE_MASK; + end = start + PAGE_SIZE; + for (j = 0; folio[j] == NOPTE; j++) + start += MMUPAGE_SIZE; + for (j = PAGE_MMUCOUNT - 1; folio[j] == NOPTE; j--) + end -= MMUPAGE_SIZE; + flush_tlb_range(vma, start, end); +} + +#ifdef DEBUG_RMAP +static inline void __adjust_page_count(const char *file, + int line, + const char *func, + struct page *page, + int count) +{ + pr_debug("%d: adjust_page_count(0x%lx, %d) in %s, %s:%d with " + "count %d (expected %d)\n", + current->pid, + page_to_pfn(page), + count, + func, + file, + line, + page_count(page), + page_count_expected(page)); + + BUG_ON(page_count(page) + count <= 0); + atomic_add(count, &page->count); +} +#else +static inline void __adjust_page_count(const char *file, + int line, + const char *func, + struct page *page, + int count) +{ + BUG_ON(page_count(page) + count <= 0); + atomic_add(count, &page->count); +} +#endif + +#define adjust_page_count(page, count) \ + __adjust_page_count(__FILE__, __LINE__, __FUNCTION__, page, count) + +#else /* PAGE_MMUSHIFT 0 */ + +static /* inline */ int prepare_folio(pte_t *folio[], struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, int wide) +{ + folio[0] = ptep; + return 0; +} + +/* + * Calling convention different if !PAGE_MMUSHIFT: page always passed in + */ +static /* inline */ struct page *private_folio_page(pte_t *folio[], struct page *page) +{ + int doing_wp = pte_present(*folio[0]); + int count; + + if (PageReserved(page)) + return NULL; + if (PageSwapCache(page)) { + if (doing_wp) { + if (TryLockPage(page)) + return NULL; + if (!PageSwapCache(page)) { + UnlockPage(page); + return NULL; + } + } + count = page_count(page) + swap_count(page) - 3; + if (doing_wp) + UnlockPage(page); + else + count--; /* swap not yet freed */ + } else { + count = page_count(page) - 1; + } + if (PagePrivate(page)) + count--; + return count? 
NULL: page; +} + +#define private_folio_page_xchg(folio, new_page) \ + (new_page) + +#define restrict_folio(folio, vma, address, ptep) \ + do {} while (0) + +static /* inline */ void copy_folio(pte_t *folio[], struct page *dst_page, + struct page *src_page, unsigned long address) +{ + char *dst = kmap(dst_page); + if (src_page == ZERO_PAGE(address)) { + clear_user_page(dst, address); + } else { + copy_user_page(dst, kmap(src_page), address); + kunmap(src_page); + } + kunmap(dst_page); +} + +static /* inline */ unsigned long set_folio(pte_t *folio[], pte_t pte) +{ + set_pte(folio[0], pte); + return 1; +} + +static inline void flush_folio(pte_addr_t folio[], + struct vm_area_struct *vma, + unsigned long addr) +{ + flush_tlb_page(vma, addr); +} + +static inline void adjust_page_count(struct page *page, int count) +{ + BUG_ON(page_count(page) <= 0); + BUG_ON(count); +} + +#endif /* PAGE_MMUSHIFT 0 */ + +#endif /* _LINUX_FOLIO_H */ diff -prauN linux-2.6.0-test11/include/linux/gfp.h pgcl-2.6.0-test11-1/include/linux/gfp.h --- linux-2.6.0-test11/include/linux/gfp.h 2003-11-26 12:43:26.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/gfp.h 2003-11-27 21:55:19.000000000 -0800 @@ -77,8 +77,9 @@ static inline struct page * alloc_pages_ #define alloc_page(gfp_mask) \ alloc_pages_node(numa_node_id(), gfp_mask, 0) -extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); -extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); +unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -prauN linux-2.6.0-test11/include/linux/highmem.h pgcl-2.6.0-test11-1/include/linux/highmem.h --- linux-2.6.0-test11/include/linux/highmem.h 2003-11-26 12:45:04.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/highmem.h 2003-11-27 21:55:19.000000000 -0800 @@ -31,6 +31,11 @@ static inline void *kmap(struct page *pa #define kmap_atomic(page, idx) page_address(page) #define kunmap_atomic(addr, idx) do { } while (0) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) +#define kmap_atomic_to_pfn(ptr) (__pa(ptr)/MMUPAGE_SIZE) + +#define kmap_atomic_sg(ptes, addrs, type) \ + memcpy(ptes,addrs,PAGE_MMUCOUNT*sizeof(pte_t)) +#define kunmap_atomic_sg(ptes, type) do { } while (0) #endif /* CONFIG_HIGHMEM */ @@ -76,6 +81,17 @@ static inline void copy_user_highpage(st kunmap_atomic(vto, KM_USER1); } +static inline void copy_user_mmupages(struct page *dst, struct page *src, int offset, int size) +{ + char *vfrom, *vto; + + vfrom = kmap_atomic(src, KM_USER0); + vto = kmap_atomic(dst, KM_USER1); + memcpy(&vto[offset], &vfrom[offset], size); + kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst, KM_USER1); +} + static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; diff -prauN linux-2.6.0-test11/include/linux/ide.h pgcl-2.6.0-test11-1/include/linux/ide.h --- linux-2.6.0-test11/include/linux/ide.h 2003-11-26 12:43:36.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/ide.h 2003-11-27 21:55:19.000000000 -0800 @@ -227,7 +227,7 @@ typedef unsigned char byte; /* used ever * allowing each to have about 256 entries (8 bytes each) from this. 
*/ #define PRD_BYTES 8 -#define PRD_ENTRIES (PAGE_SIZE / (2 * PRD_BYTES)) +#define PRD_ENTRIES (MMUPAGE_SIZE / (2 * PRD_BYTES)) /* * Some more useful definitions diff -prauN linux-2.6.0-test11/include/linux/kernel.h pgcl-2.6.0-test11-1/include/linux/kernel.h --- linux-2.6.0-test11/include/linux/kernel.h 2003-11-26 12:42:43.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/kernel.h 2003-11-27 21:55:19.000000000 -0800 @@ -84,6 +84,9 @@ extern unsigned long long memparse(char extern int kernel_text_address(unsigned long addr); extern int session_of_pgrp(int pgrp); +extern void register_early_consoles(void); +extern void unregister_early_consoles(void); + asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))); diff -prauN linux-2.6.0-test11/include/linux/mm.h pgcl-2.6.0-test11-1/include/linux/mm.h --- linux-2.6.0-test11/include/linux/mm.h 2003-11-26 12:42:55.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/mm.h 2003-11-27 22:15:53.000000000 -0800 @@ -73,7 +73,7 @@ struct vm_area_struct { struct vm_operations_struct * vm_ops; /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + unsigned long vm_pgoff; /* Offset (within vm_file) in MMUPAGE_SIZE units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ @@ -173,7 +173,8 @@ struct page { atomic_t count; /* Usage count, see below. */ struct list_head list; /* ->mapping has some page lists. */ struct address_space *mapping; /* The inode (or ...) we belong to. */ - unsigned long index; /* Our offset within mapping. */ + unsigned long index; /* Our offset within mapping. + * in PAGE_CACHE_SIZE units. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ union { @@ -199,6 +200,11 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -206,6 +212,32 @@ struct page { #include /* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. + */ +#define NODE_SHIFT 4 +#define ZONE_SHIFT (BITS_PER_LONG - 8) + +struct zone; +extern struct zone *zone_table[]; + +static inline struct zone *page_zone(struct page *page) +{ + return zone_table[page->flags >> ZONE_SHIFT]; +} + +static inline void set_page_zone(struct page *page, unsigned long zone_num) +{ + page->flags &= ~(~0UL << ZONE_SHIFT); + page->flags |= zone_num << ZONE_SHIFT; +} + +#ifdef DEBUG_RMAP +struct mm_struct; +int page_count_expected(struct page *); +int page_truly_private(struct page *, struct mm_struct *); +#endif +/* * Methods to modify the page usage count. * * What counts for a page usage: @@ -217,11 +249,32 @@ struct page { * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. 
*/ -#define put_page_testzero(p) \ +#define __put_page_testzero(p) \ ({ \ - BUG_ON(page_count(p) == 0); \ + BUG_ON(page_count(p) <= 0); \ atomic_dec_and_test(&(p)->count); \ }) +#ifdef DEBUG_RMAP +#define _put_page_testzero(file, line, func, p) \ +({ \ + struct page *__page__ = p; \ + pr_debug("%d: put_page_testzero(0x%lx) in %s, %s:%d with " \ + "count %d (expected %d)\n", \ + current->pid, \ + page_to_pfn(__page__), \ + func, \ + file, \ + line, \ + page_count(__page__), \ + page_count_expected(__page__)); \ + __put_page_testzero(__page__); \ +}) +#else +#define _put_page_testzero(file, line, func, p) __put_page_testzero(p) +#endif + +#define put_page_testzero(p) \ + _put_page_testzero(__FILE__, __LINE__, __FUNCTION__, p) #define page_count(p) atomic_read(&(p)->count) #define set_page_count(p,v) atomic_set(&(p)->count, v) @@ -231,14 +284,14 @@ extern void FASTCALL(__page_cache_releas #ifdef CONFIG_HUGETLB_PAGE -static inline void get_page(struct page *page) +static inline void __get_page(struct page *page) { if (PageCompound(page)) page = (struct page *)page->lru.next; atomic_inc(&page->count); } -static inline void put_page(struct page *page) +static inline void ___put_page(struct page *page) { if (PageCompound(page)) { page = (struct page *)page->lru.next; @@ -257,20 +310,69 @@ static inline void put_page(struct page #else /* CONFIG_HUGETLB_PAGE */ -static inline void get_page(struct page *page) +static inline void ___put_page(struct page *page) { - atomic_inc(&page->count); + if (!PageReserved(page) && __put_page_testzero(page)) + __page_cache_release(page); } -static inline void put_page(struct page *page) +static inline void __get_page(struct page *page) { - if (!PageReserved(page) && put_page_testzero(page)) - __page_cache_release(page); + atomic_inc(&page->count); } #endif /* CONFIG_HUGETLB_PAGE */ /* + * Wrapper layer for deciding (at compile-time) + * whether to do printk's or not. + */ + +#ifdef DEBUG_RMAP +#define _put_page(file, line, func, p) \ +do { \ + struct page *__page__ = p; \ + pr_debug("%d: put_page(0x%lx) in %s, %s:%d with " \ + "count %d (expected %d)\n", \ + current->pid, \ + page_to_pfn(__page__), \ + func, \ + file, \ + line, \ + page_count(__page__), \ + page_count_expected(__page__)); \ + ___put_page(__page__); \ +} while (0) +#else +#define _put_page(file, line, func, p) ___put_page(p) +#endif + +#ifdef DEBUG_RMAP +#define _get_page(file, line, func, p) \ +do { \ + struct page *__page__ = p; \ + pr_debug("%d: get_page(0x%lx) in %s, %s:%d with " \ + "count %d (expected %d)\n", \ + current->pid, \ + page_to_pfn(__page__), \ + func, \ + file, \ + line, \ + page_count(__page__), \ + page_count_expected(__page__)); \ + __get_page(__page__); \ +} while (0) +#else +#define _get_page(file, line, func, p) __get_page(p) +#endif + +/* + * Wrapper layer for grabbing __FILE__, __LINE__, etc. + */ +#define get_page(p) _get_page(__FILE__, __LINE__, __FUNCTION__, p) +#define put_page(p) _put_page(__FILE__, __LINE__, __FUNCTION__, p) + +/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have @@ -319,34 +421,9 @@ static inline void put_page(struct page * to swap space and (later) to be read back into memory. */ -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. 
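The page_zone()/set_page_zone() helpers hoisted earlier in mm.h keep the zone number in the top eight bits of page->flags (ZONE_SHIFT = BITS_PER_LONG - 8), which is also what tlb.h's PageZoneID() reads back. A toy model of the packing, assuming 32-bit longs and a lookup by index rather than through zone_table[]:

#include <assert.h>
#include <stdio.h>

#define BITS_PER_LONG 32		/* assumed for the example */
#define ZONE_SHIFT (BITS_PER_LONG - 8)

struct page { unsigned long flags; };

static void set_page_zone(struct page *page, unsigned long zone_num)
{
	page->flags &= ~(~0UL << ZONE_SHIFT);	/* clear the zone bits */
	page->flags |= zone_num << ZONE_SHIFT;	/* install the new zone */
}

static unsigned long page_zone_id(struct page *page)
{
	return page->flags >> ZONE_SHIFT;
}

int main(void)
{
	struct page page = { .flags = 0x5 };	/* pretend some PG_* bits are set */

	set_page_zone(&page, 2);		/* e.g. the third entry in zone_table[] */
	assert(page_zone_id(&page) == 2);
	assert(page.flags & 0x5);		/* low flag bits untouched */
	printf("flags = %#lx\n", page.flags);
	return 0;
}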
- */ -#define ZONE_SHIFT (BITS_PER_LONG - 8) - -struct zone; -extern struct zone *zone_table[]; - -static inline struct zone *page_zone(struct page *page) -{ - return zone_table[page->flags >> ZONE_SHIFT]; -} - -static inline void set_page_zone(struct page *page, unsigned long zone_num) -{ - page->flags &= ~(~0UL << ZONE_SHIFT); - page->flags |= zone_num << ZONE_SHIFT; -} - -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - static inline void *lowmem_page_address(struct page *page) { - return __va(page_to_pfn(page) << PAGE_SHIFT); + return __va(page_to_pfn(page) << MMUPAGE_SHIFT); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) @@ -430,18 +507,19 @@ extern int vmtruncate(struct inode * ino extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); +int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, int subpfn); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); extern long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); -void put_dirty_page(struct task_struct *tsk, struct page *page, +void put_dirty_page(task_t *task, struct page *page, int min_subpfn, unsigned long address, pgprot_t prot); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +int get_user_pages(task_t *task, struct mm_struct *mm, unsigned long start, + int len, int write, int force, unsigned long *mmupages, + struct vm_area_struct **vmas); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); @@ -526,10 +604,10 @@ static inline unsigned long do_mmap(stru unsigned long flag, unsigned long offset) { unsigned long ret = -EINVAL; - if ((offset + PAGE_ALIGN(len)) < offset) + if ((offset + MMUPAGE_ALIGN(len)) < offset) goto out; - if (!(offset & ~PAGE_MASK)) - ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); + if (!(offset & ~MMUPAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> MMUPAGE_SHIFT); out: return ret; } @@ -609,8 +687,18 @@ extern struct vm_area_struct *find_exten extern unsigned int nr_used_zone_pages(void); +/* + * Return byte offset from start of page containing virtual address in + * vma, to start of mmupage containing it: 0 if PAGE_MMUSHIFT 0. 
+ */ +static inline unsigned long vma_suboffset(struct vm_area_struct *vma, unsigned long address) +{ + return (address - vma->vm_start + MMUPAGE_SIZE * vma->vm_pgoff) + & (MMUPAGE_MASK - PAGE_MASK); +} + extern struct page * vmalloc_to_page(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, +unsigned long follow_page(struct mm_struct *mm, unsigned long address, int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); diff -prauN linux-2.6.0-test11/include/linux/mmzone.h pgcl-2.6.0-test11-1/include/linux/mmzone.h --- linux-2.6.0-test11/include/linux/mmzone.h 2003-11-26 12:44:20.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/mmzone.h 2003-11-27 21:55:19.000000000 -0800 @@ -15,7 +15,7 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER -#define MAX_ORDER 11 +#define MAX_ORDER (11 - PAGE_MMUSHIFT) #else #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif diff -prauN linux-2.6.0-test11/include/linux/pagemap.h pgcl-2.6.0-test11-1/include/linux/pagemap.h --- linux-2.6.0-test11/include/linux/pagemap.h 2003-11-26 12:42:49.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/pagemap.h 2003-11-27 21:55:19.000000000 -0800 @@ -46,8 +46,52 @@ static inline void mapping_set_gfp_mask( #define PAGE_CACHE_MASK PAGE_MASK #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) -#define page_cache_get(page) get_page(page) -#define page_cache_release(page) put_page(page) +#define PAGE_CACHE_MMUSHIFT (PAGE_CACHE_SHIFT - MMUPAGE_SHIFT) +#define PAGE_CACHE_MMUCOUNT (PAGE_CACHE_SIZE/MMUPAGE_SIZE) + +#ifdef DEBUG_RMAP +#define __page_cache_get__(file, line, func, pg) \ +do { \ + struct page *__page__ = pg; \ + pr_debug("%d: page_cache_get(0x%lx) in %s, %s:%d with " \ + "count %d (expected %d)\n", \ + current->pid, \ + page_to_pfn(__page__), \ + func, \ + file, \ + line, \ + page_count(__page__), \ + page_count_expected(__page__)); \ + __get_page(__page__); \ +} while (0) +#else +#define __page_cache_get__(file, line, func, p) __get_page(p) +#endif + +#ifdef DEBUG_RMAP +#define __page_cache_release__(file, line, func, pg) \ +do { \ + struct page *__page__ = pg; \ + pr_debug("%d: page_cache_release(0x%lx) in %s, %s:%d with " \ + "count %d (expected %d)\n", \ + current->pid, \ + page_to_pfn(__page__), \ + func, \ + file, \ + line, \ + page_count(__page__), \ + page_count_expected(__page__)); \ + ___put_page(__page__); \ +} while (0) +#else +#define __page_cache_release__(file, line, func, p) ___put_page(p) +#endif + +#define page_cache_get(page) \ + __page_cache_get__(__FILE__, __LINE__, __FUNCTION__, page) +#define page_cache_release(page) \ + __page_cache_release__(__FILE__, __LINE__, __FUNCTION__, page) + void release_pages(struct page **pages, int nr, int cold); static inline struct page *page_cache_alloc(struct address_space *x) @@ -197,40 +241,27 @@ extern void end_page_writeback(struct pa */ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { - int ret; + int ret = 0; + unsigned long addr, end = (unsigned long)uaddr + size - 1; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ - ret = __put_user(0, uaddr); - if (ret == 0) { - char __user *end = uaddr + size - 1; - - /* - * If the page was already mapped, this will get a cache miss - * for sure, so try to avoid doing it. 
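/*
 * Worked example (illustration only, not part of the patch): the unit
 * relationships assumed by vma_suboffset(), the PAGE_CACHE_MMU* macros
 * and the MAX_ORDER change above, for a hypothetical PAGE_CLUSTER=2
 * build on i386 (4KB hardware pte, 16KB struct page).
 */
#include <assert.h>

#define MMUPAGE_SHIFT   12                       /* hardware pte: 4KB   */
#define PAGE_MMUSHIFT   2                        /* clustering factor   */
#define PAGE_SHIFT      (MMUPAGE_SHIFT + PAGE_MMUSHIFT)
#define MMUPAGE_SIZE    (1UL << MMUPAGE_SHIFT)
#define PAGE_SIZE       (1UL << PAGE_SHIFT)      /* 16KB struct page    */
#define PAGE_MMUCOUNT   (PAGE_SIZE / MMUPAGE_SIZE)
#define MMUPAGE_MASK    (~(MMUPAGE_SIZE - 1))
#define PAGE_MASK       (~(PAGE_SIZE - 1))

int main(void)
{
        /* vma starting at 0x08048000 with vm_pgoff = 3 (in mmupages),
         * faulting virtual address 0x0804a123: */
        unsigned long vm_start = 0x08048000, vm_pgoff = 3;
        unsigned long address  = 0x0804a123;
        unsigned long suboff   = (address - vm_start + MMUPAGE_SIZE * vm_pgoff)
                                        & (MMUPAGE_MASK - PAGE_MASK);

        assert(PAGE_MMUCOUNT == 4);
        /* the fault hits file mmupage 5 = cache page 1, sub-mmupage 1,
         * i.e. 0x1000 bytes into its 16KB page */
        assert(suboff == 0x1000);

        /* MAX_ORDER = 11 - PAGE_MMUSHIFT keeps the largest buddy
         * allocation the same number of bytes as an unclustered kernel */
        assert((PAGE_SIZE << (11 - PAGE_MMUSHIFT)) == (MMUPAGE_SIZE << 11));
        return 0;
}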
- */ - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) - ret = __put_user(0, end); - } + for (addr = (unsigned long)uaddr; addr <= MMUPAGE_ALIGN(end); addr += MMUPAGE_SIZE) + if ((ret = __put_user(0, (char *)min(addr, end))) != 0) + break; + return ret; } static inline void fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; - int ret; + unsigned long addr, end = (unsigned long)uaddr + size - 1; - ret = __get_user(c, (char *)uaddr); - if (ret == 0) { - const char __user *end = uaddr + size - 1; - - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) - __get_user(c, (char *)end); - } + for (addr = (unsigned long)uaddr; addr <= MMUPAGE_ALIGN(end); addr += MMUPAGE_SIZE) + __get_user(c, (char *)min(addr, end)); } #endif /* _LINUX_PAGEMAP_H */ diff -prauN linux-2.6.0-test11/include/linux/sched.h pgcl-2.6.0-test11-1/include/linux/sched.h --- linux-2.6.0-test11/include/linux/sched.h 2003-11-26 12:42:58.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/sched.h 2003-11-27 21:55:19.000000000 -0800 @@ -202,7 +202,7 @@ struct mm_struct { unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long rss, total_vm, locked_vm; + unsigned long rss, total_vm, locked_vm; /* in MMUPAGE_SIZE units */ unsigned long def_flags; cpumask_t cpu_vm_mask; unsigned long swap_address; @@ -670,12 +670,7 @@ static inline int capable(int cap) extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ -extern inline void FASTCALL(__mmdrop(struct mm_struct *)); -static inline void mmdrop(struct mm_struct * mm) -{ - if (atomic_dec_and_test(&mm->mm_count)) - __mmdrop(mm); -} +void mmdrop(struct mm_struct * mm); /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); diff -prauN linux-2.6.0-test11/include/linux/shm.h pgcl-2.6.0-test11-1/include/linux/shm.h --- linux-2.6.0-test11/include/linux/shm.h 2003-11-26 12:44:58.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/shm.h 2003-11-27 21:55:19.000000000 -0800 @@ -12,7 +12,7 @@ #define SHMMAX 0x2000000 /* max shared seg size (bytes) */ #define SHMMIN 1 /* min shared seg size (bytes) */ #define SHMMNI 4096 /* max num of segs system wide */ -#define SHMALL (SHMMAX/PAGE_SIZE*(SHMMNI/16)) /* max shm system wide (pages) */ +#define SHMALL (SHMMAX/MMUPAGE_SIZE*(SHMMNI/16)) /* max shm system wide (mmupages) */ #define SHMSEG SHMMNI /* max shared segs per process */ #include diff -prauN linux-2.6.0-test11/include/linux/swap.h pgcl-2.6.0-test11-1/include/linux/swap.h --- linux-2.6.0-test11/include/linux/swap.h 2003-11-26 12:42:52.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/swap.h 2003-11-27 21:55:19.000000000 -0800 @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff @@ -44,7 +46,7 @@ static inline int current_is_kswapd(void */ union swap_header { struct { - char reserved[PAGE_SIZE - 10]; + char reserved[MMUPAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { @@ -111,8 +113,8 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7fff -#define SWAP_MAP_BAD 0x8000 +#define SWAP_MAP_MAX 0xfffe +#define SWAP_MAP_BAD 0xffff /* * The in-memory structure used to track swap areas. 
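/*
 * Illustration only: which addresses the rewritten fault_in_pages_*
 * helpers probe.  The old code touched the first and last byte of the
 * range (at most two PAGE_SIZE pages); with clustering a buffer can
 * span several hardware ptes inside one PAGE_SIZE page, so the loop
 * touches one byte per 4KB mmupage instead.
 */
#include <stdio.h>

#define MMUPAGE_SHIFT    12
#define MMUPAGE_SIZE     (1UL << MMUPAGE_SHIFT)
#define MMUPAGE_ALIGN(a) (((a) + MMUPAGE_SIZE - 1) & ~(MMUPAGE_SIZE - 1))
#define MIN(a, b)        ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long uaddr = 0x1ff0, size = 0x2020;
        unsigned long addr, end = uaddr + size - 1;     /* end = 0x400f */

        for (addr = uaddr; addr <= MMUPAGE_ALIGN(end); addr += MMUPAGE_SIZE)
                printf("probe byte at 0x%lx\n", MIN(addr, end));
        /* prints 0x1ff0, 0x2ff0, 0x3ff0, 0x400f -- one byte in each of
         * the four 4KB mmupages the buffer covers */
        return 0;
}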
@@ -179,17 +181,29 @@ extern int vm_swappiness; /* linux/mm/rmap.c */ #ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, +#if 0 +#define RMAP_FASTCALL(x) FASTCALL(x) +#else +#define RMAP_FASTCALL(x) x +#endif + +int RMAP_FASTCALL(page_referenced(struct page *)); +struct pte_chain *RMAP_FASTCALL(page_add_rmap(struct page *, pte_t *, struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); +struct pte_chain *RMAP_FASTCALL(rmap_add_folio(struct page *, + pte_addr_t [], + struct pte_chain *)); +void RMAP_FASTCALL(rmap_remove_folio(struct page *, pte_addr_t [])); +void RMAP_FASTCALL(page_remove_rmap(struct page *, pte_t *)); +int RMAP_FASTCALL(try_to_unmap(struct page *)); /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else #define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL + +/* people really need to make sure these macro-like things aren't abused */ +#define try_to_unmap(page) ({ SWAP_FAIL; }) #endif /* CONFIG_MMU */ /* return values of try_to_unmap */ diff -prauN linux-2.6.0-test11/include/linux/swapops.h pgcl-2.6.0-test11-1/include/linux/swapops.h --- linux-2.6.0-test11/include/linux/swapops.h 2003-11-26 12:42:44.000000000 -0800 +++ pgcl-2.6.0-test11-1/include/linux/swapops.h 2003-11-27 21:55:19.000000000 -0800 @@ -1,3 +1,5 @@ +#ifndef _LINUX_SWAPOPS_H +#define _LINUX_SWAPOPS_H /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in @@ -68,3 +70,7 @@ static inline pte_t swp_entry_to_pte(swp BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } + +int swap_count(struct page *page); +void __swap_free(swp_entry_t entry, unsigned short count); +#endif /* _LINUX_SWAPOPS_H */ diff -prauN linux-2.6.0-test11/init/main.c pgcl-2.6.0-test11-1/init/main.c --- linux-2.6.0-test11/init/main.c 2003-11-26 12:43:09.000000000 -0800 +++ pgcl-2.6.0-test11-1/init/main.c 2003-11-27 21:55:19.000000000 -0800 @@ -365,6 +365,9 @@ static void __init smp_init(void) #endif +extern void register_early_consoles(void); +extern void unregister_early_consoles(void); + /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to @@ -379,6 +382,7 @@ static void rest_init(void) cpu_idle(); } + /* * Activate the first processor. */ @@ -393,6 +397,7 @@ asmlinkage void __init start_kernel(void * enable them */ lock_kernel(); + register_early_consoles(); printk(linux_banner); setup_arch(&command_line); setup_per_zone_pages_min(); @@ -410,7 +415,9 @@ asmlinkage void __init start_kernel(void parse_args("Booting kernel", command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); + printk("survived parse_args(), calling trap_init()\n"); trap_init(); + printk("survived trap_init(), calling rcu_init()\n"); rcu_init(); init_IRQ(); pidhash_init(); @@ -423,14 +430,15 @@ asmlinkage void __init start_kernel(void * we've done PCI setups etc, and console_init() must be aware of * this. But we do want output early, in case something goes wrong. 
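/*
 * Sketch only: register_early_consoles() is called before the first
 * printk() and unregister_early_consoles() just before console_init();
 * their implementation is not in this hunk.  A minimal console of the
 * sort such a helper could register -- assuming an 8250-style UART
 * already initialized by the firmware at I/O port 0x3F8 -- might look
 * roughly like this.
 */
#include <linux/console.h>
#include <asm/io.h>

#define EARLY_UART 0x3f8

static void early_serial_write(struct console *con, const char *s, unsigned n)
{
        while (n--) {
                while (!(inb(EARLY_UART + 5) & 0x20))   /* wait for THR empty */
                        ;
                outb(*s++, EARLY_UART);
        }
}

static struct console early_serial_console = {
        .name   = "early",
        .write  = early_serial_write,
        .flags  = CON_ENABLED,
        .index  = -1,
};

void register_early_consoles(void)
{
        register_console(&early_serial_console);
}

void unregister_early_consoles(void)
{
        unregister_console(&early_serial_console);
}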
*/ + unregister_early_consoles(); console_init(); profile_init(); local_irq_enable(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && - initrd_start < min_low_pfn << PAGE_SHIFT) { + initrd_start < min_low_pfn << MMUPAGE_SHIFT) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " - "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); + "disabling it.\n",initrd_start,min_low_pfn << MMUPAGE_SHIFT); initrd_start = 0; } #endif @@ -578,14 +586,18 @@ static int init(void * unused) smp_init(); do_basic_setup(); + printk("about to prepare_namespace()\n"); prepare_namespace(); + printk("return from prepare_namespace()\n"); /* * Ok, we have completed the initial bootup, and * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ + printk("about to free_initmem()\n"); free_initmem(); + printk("return from free_initmem()\n"); unlock_kernel(); system_running = 1; @@ -602,6 +614,7 @@ static int init(void * unused) * trying to recover a really broken machine. */ + printk("about to execve(\"/sbin/init\")\n"); if (execute_command) run_init_process(execute_command); diff -prauN linux-2.6.0-test11/ipc/shm.c pgcl-2.6.0-test11-1/ipc/shm.c --- linux-2.6.0-test11/ipc/shm.c 2003-11-26 12:44:10.000000000 -0800 +++ pgcl-2.6.0-test11-1/ipc/shm.c 2003-11-27 21:55:19.000000000 -0800 @@ -110,7 +110,7 @@ static void shm_open (struct vm_area_str */ static void shm_destroy (struct shmid_kernel *shp) { - shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; + shm_tot -= (shp->shm_segsz + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; shm_rmid (shp->id); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) @@ -169,7 +169,7 @@ static int newseg (key_t key, int shmflg { int error; struct shmid_kernel *shp; - int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; + int numpages = (size + MMUPAGE_SIZE -1) >> MMUPAGE_SHIFT; struct file * file; char name[13]; int id; @@ -717,7 +717,7 @@ long sys_shmat(int shmid, char __user *s * space left for the stack to grow (at least 4 pages). */ if (addr < current->mm->start_stack && - addr > current->mm->start_stack - size - PAGE_SIZE * 5) + addr > current->mm->start_stack - size - MMUPAGE_SIZE * 5) goto invalid; } @@ -775,7 +775,7 @@ asmlinkage long sys_shmdt(char __user *s * otherwise it starts at this address with no hassles. */ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { + (vma->vm_start - addr)/MMUPAGE_SIZE == vma->vm_pgoff) { size = vma->vm_file->f_dentry->d_inode->i_size; @@ -803,7 +803,7 @@ asmlinkage long sys_shmdt(char __user *s /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) && - (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) + (vma->vm_start - addr)/MMUPAGE_SIZE == vma->vm_pgoff) do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); vma = next; diff -prauN linux-2.6.0-test11/kernel/fork.c pgcl-2.6.0-test11-1/kernel/fork.c --- linux-2.6.0-test11/kernel/fork.c 2003-11-26 12:42:58.000000000 -0800 +++ pgcl-2.6.0-test11-1/kernel/fork.c 2003-11-27 21:55:19.000000000 -0800 @@ -196,7 +196,7 @@ void __init fork_init(unsigned long memp * value: the thread structures can take up at most half * of memory. 
*/ - max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + max_threads = mempages / 8; /* * we need to allow at least 20 threads to boot a system */ @@ -269,7 +269,7 @@ static inline int dup_mmap(struct mm_str if(mpnt->vm_flags & VM_DONTCOPY) continue; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> MMUPAGE_SHIFT; if (security_vm_enough_memory(len)) goto fail_nomem; charge += len; @@ -391,8 +391,11 @@ struct mm_struct * mm_alloc(void) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ -inline void __mmdrop(struct mm_struct *mm) +void mmdrop(struct mm_struct *mm) { + if (!atomic_dec_and_test(&mm->mm_count)) + return; + BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); @@ -1120,6 +1123,8 @@ long do_fork(unsigned long clone_flags, int trace = 0; long pid; + pr_debug("%d: do_fork()\n", current->pid); + if (unlikely(current->ptrace)) { trace = fork_traceflag (clone_flags); if (trace) @@ -1170,6 +1175,7 @@ long do_fork(unsigned long clone_flags, */ set_need_resched(); } + pr_debug("%d: do_fork() = %ld\n", current->pid, pid); return pid; } diff -prauN linux-2.6.0-test11/kernel/futex.c pgcl-2.6.0-test11-1/kernel/futex.c --- linux-2.6.0-test11/kernel/futex.c 2003-11-26 12:43:26.000000000 -0800 +++ pgcl-2.6.0-test11-1/kernel/futex.c 2003-11-27 21:55:19.000000000 -0800 @@ -141,13 +141,13 @@ static int get_futex_key(unsigned long u { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct page *page; + unsigned long pfn; int err; /* * The futex address must be "naturally" aligned. */ - key->both.offset = uaddr % PAGE_SIZE; + key->both.offset = uaddr % MMUPAGE_SIZE; if (unlikely((key->both.offset % sizeof(u32)) != 0)) return -EINVAL; uaddr -= key->both.offset; @@ -187,7 +187,7 @@ static int get_futex_key(unsigned long u key->shared.inode = vma->vm_file->f_dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) + key->shared.pgoff = (((uaddr - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff); return 0; } @@ -203,10 +203,11 @@ static int get_futex_key(unsigned long u * Do a quick atomic lookup first - this is the fastpath. */ spin_lock(¤t->mm->page_table_lock); - page = follow_page(mm, uaddr, 0); - if (likely(page != NULL)) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pfn = follow_page(mm, uaddr, 0); + if (likely(pfn != 0)) { + struct page *page = pfn_to_page(pfn); + key->shared.pgoff = (page->index << PAGE_CACHE_MMUSHIFT) + + (pfn % PAGE_MMUCOUNT); spin_unlock(¤t->mm->page_table_lock); return 0; } @@ -215,10 +216,11 @@ static int get_futex_key(unsigned long u /* * Do it the general way. 
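/*
 * Illustration only: follow_page() and get_user_pages() now return pfns
 * rather than struct page pointers, because the interesting unit is the
 * hardware pte, which may point into the middle of a PAGE_SIZE page.
 * This is how a caller such as get_futex_key() above decomposes the
 * result, assuming struct page boundaries are PAGE_MMUCOUNT-aligned in
 * pfn space (hypothetical PAGE_CLUSTER=2 numbers).
 */
#include <assert.h>

#define PAGE_MMUCOUNT        4
#define PAGE_CACHE_MMUSHIFT  2          /* log2(PAGE_MMUCOUNT) */

int main(void)
{
        unsigned long pfn = 0x1236;     /* returned by follow_page()    */
        unsigned long page_index = 7;   /* page->index of the 16KB page */

        unsigned long first_pfn = pfn & ~(unsigned long)(PAGE_MMUCOUNT - 1);
        unsigned long subpfn    = pfn % PAGE_MMUCOUNT;
        unsigned long key_pgoff = (page_index << PAGE_CACHE_MMUSHIFT) + subpfn;

        assert(first_pfn == 0x1234);    /* struct page covers 0x1234..0x1237 */
        assert(subpfn == 2);            /* third mmupage of that page        */
        assert(key_pgoff == 30);        /* futex key, in mmupage units       */
        return 0;
}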
*/ - err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); + err = get_user_pages(current, mm, uaddr, 1, 0, 0, &pfn, NULL); if (err >= 0) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct page *page = pfn_to_page(pfn); + key->shared.pgoff = (page->index << PAGE_CACHE_MMUSHIFT) + + (pfn % PAGE_MMUCOUNT); put_page(page); return 0; } diff -prauN linux-2.6.0-test11/kernel/params.c pgcl-2.6.0-test11-1/kernel/params.c --- linux-2.6.0-test11/kernel/params.c 2003-11-26 12:45:21.000000000 -0800 +++ pgcl-2.6.0-test11-1/kernel/params.c 2003-11-27 21:55:21.000000000 -0800 @@ -112,13 +112,20 @@ int parse_args(const char *name, unsigned num, int (*unknown)(char *param, char *val)) { + int k; char *param, *val; + printk("parsing args, trying to print the string\n"); + printk("args=\"%s\"\n", args); + DEBUGP("Parsing ARGS: %s\n", args); + printk("survived parsing args\n"); + k = 0; while (*args) { int ret; + printk("parse_args(): entering iteration %d\n", k); args = next_arg(args, ¶m, &val); ret = parse_one(param, val, params, num, unknown); switch (ret) { @@ -139,8 +146,11 @@ int parse_args(const char *name, name, val ?: "", param); return ret; } + printk("parse_args(): exiting iteration %d\n", k); + ++k; } + printk("parse_args(): iterated %d times\n", k); /* All parsed OK. */ return 0; } diff -prauN linux-2.6.0-test11/kernel/pid.c pgcl-2.6.0-test11-1/kernel/pid.c --- linux-2.6.0-test11/kernel/pid.c 2003-11-26 12:44:21.000000000 -0800 +++ pgcl-2.6.0-test11-1/kernel/pid.c 2003-11-27 21:55:21.000000000 -0800 @@ -271,7 +271,7 @@ void switch_exec_pids(task_t *leader, ta void __init pidhash_init(void) { int i, j, pidhash_size; - unsigned long megabytes = max_pfn >> (20 - PAGE_SHIFT); + unsigned long megabytes = max_pfn >> (20 - MMUPAGE_SHIFT); pidhash_shift = max(4, fls(megabytes * 4)); pidhash_shift = min(12, pidhash_shift); diff -prauN linux-2.6.0-test11/kernel/printk.c pgcl-2.6.0-test11-1/kernel/printk.c --- linux-2.6.0-test11/kernel/printk.c 2003-11-26 12:46:04.000000000 -0800 +++ pgcl-2.6.0-test11-1/kernel/printk.c 2003-11-27 21:55:21.000000000 -0800 @@ -481,7 +481,7 @@ asmlinkage int printk(const char *fmt, . log_level_unknown = 1; } - if (!cpu_online(smp_processor_id())) { + if (0 && !cpu_online(smp_processor_id())) { /* * Some console drivers may assume that per-cpu resources have * been allocated. 
So don't allow them to be called by this diff -prauN linux-2.6.0-test11/kernel/ptrace.c pgcl-2.6.0-test11-1/kernel/ptrace.c --- linux-2.6.0-test11/kernel/ptrace.c 2003-11-26 12:46:09.000000000 -0800 +++ pgcl-2.6.0-test11-1/kernel/ptrace.c 2003-11-27 22:23:46.000000000 -0800 @@ -156,38 +156,45 @@ int access_process_vm(struct task_struct struct mm_struct *mm; struct vm_area_struct *vma; struct page *page; + unsigned long pfn = 0; void *old_buf = buf; mm = get_task_mm(tsk); - if (!mm) + if (!mm) { + printk("get_task_mm() failed in access_process_vm()\n"); return 0; + } down_read(&mm->mmap_sem); /* ignore errors, just check how much was sucessfully transfered */ while (len) { int bytes, ret, offset; + unsigned long dst_off; void *maddr; - ret = get_user_pages(current, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) + ret = get_user_pages(current, mm, addr, 1, write, 1, &pfn, &vma); + if (ret <= 0) { + pr_debug("get_user_pages() failed in access_process_vm()\n"); break; + } bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; + offset = addr & ~MMUPAGE_MASK; + if (bytes > MMUPAGE_SIZE-offset) + bytes = MMUPAGE_SIZE-offset; flush_cache_page(vma, addr); + page = pfn_to_page(pfn); maddr = kmap(page); + dst_off = MMUPAGE_SIZE*(pfn % PAGE_MMUCOUNT); if (write) { copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); + maddr + offset + dst_off, buf, bytes); set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); + buf, maddr + offset + dst_off, bytes); } kunmap(page); page_cache_release(page); diff -prauN linux-2.6.0-test11/mm/bootmem.c pgcl-2.6.0-test11-1/mm/bootmem.c --- linux-2.6.0-test11/mm/bootmem.c 2003-11-26 12:43:30.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/bootmem.c 2003-11-27 21:55:21.000000000 -0800 @@ -33,10 +33,7 @@ unsigned long __init bootmem_bootmap_pag unsigned long mapsize; mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; - - return mapsize; + return (mapsize + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; } /* @@ -51,9 +48,10 @@ static unsigned long __init init_bootmem pgdat->pgdat_next = pgdat_list; pgdat_list = pgdat; + start &= ~(PAGE_MMUCOUNT - 1); mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); - bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); - bdata->node_boot_start = (start << PAGE_SHIFT); + bdata->node_bootmem_map = phys_to_virt(mapstart << MMUPAGE_SHIFT); + bdata->node_boot_start = (start << MMUPAGE_SHIFT); bdata->node_low_pfn = end; /* @@ -77,22 +75,20 @@ static void __init reserve_bootmem_core( * round up, partially reserved pages are considered * fully reserved. 
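/*
 * Illustration only: the offset arithmetic used by access_process_vm()
 * above.  kmap() maps the whole PAGE_SIZE cluster, so the byte to copy
 * sits dst_off + offset bytes into the mapping, where dst_off selects
 * the mmupage within the page and offset the byte within the mmupage
 * (hypothetical PAGE_CLUSTER=2 numbers).
 */
#include <assert.h>

#define MMUPAGE_SHIFT  12
#define MMUPAGE_SIZE   (1UL << MMUPAGE_SHIFT)
#define MMUPAGE_MASK   (~(MMUPAGE_SIZE - 1))
#define PAGE_MMUCOUNT  4

int main(void)
{
        unsigned long addr = 0xbfffe123;   /* user address being accessed */
        unsigned long pfn  = 0x1236;       /* from get_user_pages()       */

        unsigned long offset  = addr & ~MMUPAGE_MASK;
        unsigned long dst_off = MMUPAGE_SIZE * (pfn % PAGE_MMUCOUNT);

        assert(offset  == 0x123);
        assert(dst_off == 0x2000);         /* third mmupage of the page   */
        /* the copy targets maddr + dst_off + offset == maddr + 0x2123    */
        return 0;
}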
*/ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; + unsigned long sidx = (addr - bdata->node_boot_start)/MMUPAGE_SIZE; unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + unsigned long end_pfn = (addr + size + MMUPAGE_SIZE-1)/MMUPAGE_SIZE; if (!size) BUG(); if (sidx >= eidx) BUG(); - if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) - BUG(); - if (end > bdata->node_low_pfn) + if (end_pfn > bdata->node_low_pfn) BUG(); for (i = sidx; i < eidx; i++) if (test_and_set_bit(i, bdata->node_bootmem_map)) - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); + printk("hm, page %08lx reserved twice.\n", i*MMUPAGE_SIZE); } static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) @@ -104,11 +100,11 @@ static void __init free_bootmem_core(boo * considered reserved. */ unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; + unsigned long eidx = (addr + size - bdata->node_boot_start)/MMUPAGE_SIZE; + unsigned long end_pfn = (addr + size)/MMUPAGE_SIZE; if (!size) BUG(); - if (end > bdata->node_low_pfn) + if (end_pfn > bdata->node_low_pfn) BUG(); if (addr < bdata->last_success) @@ -117,8 +113,8 @@ static void __init free_bootmem_core(boo /* * Round up the beginning of the address. */ - start = (addr + PAGE_SIZE-1) / PAGE_SIZE; - sidx = start - (bdata->node_boot_start/PAGE_SIZE); + start = (addr + MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + sidx = start - bdata->node_boot_start/MMUPAGE_SIZE; for (i = sidx; i < eidx; i++) { if (!test_and_clear_bit(i, bdata->node_bootmem_map)) @@ -154,19 +150,19 @@ __alloc_bootmem_core(struct bootmem_data } BUG_ON(align & (align-1)); - eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + eidx = bdata->node_low_pfn - bdata->node_boot_start/MMUPAGE_SIZE; offset = 0; if (align && (bdata->node_boot_start & (align - 1UL)) != 0) offset = (align - (bdata->node_boot_start & (align - 1UL))); - offset >>= PAGE_SHIFT; + offset >>= MMUPAGE_SHIFT; /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. */ if (goal && (goal >= bdata->node_boot_start) && - ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { + ((goal >> MMUPAGE_SHIFT) < bdata->node_low_pfn)) { preferred = goal - bdata->node_boot_start; if (bdata->last_success >= preferred) @@ -174,10 +170,10 @@ __alloc_bootmem_core(struct bootmem_data } else preferred = 0; - preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; + preferred = ((preferred + align - 1) & ~(align - 1)) >> MMUPAGE_SHIFT; preferred += offset; - areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; - incr = align >> PAGE_SHIFT ? : 1; + areasize = (size + MMUPAGE_SIZE - 1)/MMUPAGE_SIZE; + incr = align >> MMUPAGE_SHIFT ? : 1; restart_scan: for (i = preferred; i < eidx; i += incr) { @@ -205,7 +201,7 @@ restart_scan: return NULL; found: - bdata->last_success = start << PAGE_SHIFT; + bdata->last_success = start << MMUPAGE_SHIFT; BUG_ON(start >= eidx); /* @@ -213,30 +209,30 @@ found: * of this allocation's buffer? If yes then we can 'merge' * the previous partial page with this allocation. 
*/ - if (align < PAGE_SIZE && + if (align < MMUPAGE_SIZE && bdata->last_offset && bdata->last_pos+1 == start) { offset = (bdata->last_offset+align-1) & ~(align-1); - BUG_ON(offset > PAGE_SIZE); - remaining_size = PAGE_SIZE-offset; + BUG_ON(offset > MMUPAGE_SIZE); + remaining_size = MMUPAGE_SIZE - offset; if (size < remaining_size) { areasize = 0; /* last_pos unchanged */ bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + ret = phys_to_virt(bdata->last_pos*MMUPAGE_SIZE + offset + bdata->node_boot_start); } else { remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + + areasize = (remaining_size+MMUPAGE_SIZE-1)/MMUPAGE_SIZE; + ret = phys_to_virt(bdata->last_pos*MMUPAGE_SIZE + offset + bdata->node_boot_start); bdata->last_pos = start+areasize-1; bdata->last_offset = remaining_size; } - bdata->last_offset &= ~PAGE_MASK; + bdata->last_offset &= ~MMUPAGE_MASK; } else { bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + bdata->last_offset = size & ~MMUPAGE_MASK; + ret = phys_to_virt(start * MMUPAGE_SIZE + bdata->node_boot_start); } /* @@ -253,51 +249,35 @@ static unsigned long __init free_all_boo { struct page *page; bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; - unsigned long idx; + unsigned long i, total = 0; + unsigned long idx, mapnr, node_low_mapnr; unsigned long *map; - if (!bdata->node_bootmem_map) BUG(); + BUG_ON(!bdata->node_bootmem_map); - count = 0; - /* first extant page of the node */ - page = virt_to_page(phys_to_virt(bdata->node_boot_start)); - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + i = 0; + idx = bdata->node_low_pfn - bdata->node_boot_start/MMUPAGE_SIZE; + node_low_mapnr = idx/PAGE_MMUCOUNT; map = bdata->node_bootmem_map; - for (i = 0; i < idx; ) { - unsigned long v = ~map[i / BITS_PER_LONG]; - if (v) { - unsigned long m; - for (m = 1; m && i < idx; m<<=1, page++, i++) { - if (v & m) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - } - } else { - i+=BITS_PER_LONG; - page += BITS_PER_LONG; + /* first extant page of the node */ + for (mapnr = 0; mapnr < node_low_mapnr; ++mapnr) { + int k, should_free = 1; + for (k = 0; k < PAGE_MMUCOUNT; ++k) + if (test_bit(PAGE_MMUCOUNT*mapnr +k, map)) + should_free = 0; + if (should_free) { + page = &pgdat->node_mem_map[mapnr]; + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + ++total; } } - total += count; /* - * Now free the allocator bitmap itself, it's not - * needed anymore: + * Leak the allocator bitmap; it's not worth saving. 
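/*
 * Illustration only: the bootmem bitmap is now one bit per mmupage, and
 * the rewritten free_all_bootmem_core() above hands a struct page to
 * the buddy allocator only when none of its PAGE_MMUCOUNT mmupages is
 * reserved.  A standalone model of that rule (test_bit replaced by a
 * local stand-in so the sketch compiles by itself):
 */
#include <assert.h>

#define PAGE_MMUCOUNT 4

static int test_bit_stub(int nr, const unsigned long *map)
{
        return (map[nr / (8 * sizeof(long))] >> (nr % (8 * sizeof(long)))) & 1;
}

static int page_is_freeable(unsigned long mapnr, const unsigned long *map)
{
        int k;

        for (k = 0; k < PAGE_MMUCOUNT; k++)
                if (test_bit_stub(PAGE_MMUCOUNT * mapnr + k, map))
                        return 0;       /* partially reserved: keep it */
        return 1;
}

int main(void)
{
        unsigned long map[1] = { 0x10 };        /* only mmupage 4 reserved */

        assert(page_is_freeable(0, map) == 1);  /* mmupages 0..3 all clear */
        assert(page_is_freeable(1, map) == 0);  /* mmupage 4 is reserved   */
        return 0;
}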
*/ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - total += count; bdata->node_bootmem_map = NULL; - return total; } diff -prauN linux-2.6.0-test11/mm/filemap.c pgcl-2.6.0-test11-1/mm/filemap.c --- linux-2.6.0-test11/mm/filemap.c 2003-11-26 12:43:33.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/filemap.c 2003-11-27 21:55:21.000000000 -0800 @@ -998,14 +998,15 @@ struct page * filemap_nopage(struct vm_a struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; struct page *page; - unsigned long size, pgoff, endoff; + unsigned long size, pgoff, endoff, idx; int did_readaround = 0; - pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; - endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + pgoff = (address - area->vm_start)/MMUPAGE_SIZE + area->vm_pgoff; + endoff = (area->vm_end - area->vm_start)/MMUPAGE_SIZE + area->vm_pgoff; + idx = pgoff/PAGE_CACHE_MMUCOUNT; retry_all: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size = (i_size_read(inode) + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; if (pgoff >= size) goto outside_data_content; @@ -1027,16 +1028,17 @@ retry_all: * For sequential accesses, we use the generic readahead logic. */ if (VM_SequentialReadHint(area)) - page_cache_readahead(mapping, ra, file, pgoff); + page_cache_readahead(mapping, ra, file, idx); /* * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_get_page(mapping, idx); if (!page) { + unsigned long ra_idx = idx & ~(MMAP_READAROUND - 1); if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); + handle_ra_miss(mapping, ra, ra_idx); goto no_cached_page; } ra->mmap_miss++; @@ -1084,7 +1086,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, pgoff); + error = page_cache_read(file, idx); /* * The page we want has now been added to the page cache. @@ -1174,7 +1176,7 @@ static struct page * filemap_getpage(str * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_get_page(mapping, pgoff/PAGE_CACHE_MMUCOUNT); if (!page) { if (nonblock) return NULL; @@ -1196,7 +1198,7 @@ success: return page; no_cached_page: - error = page_cache_read(file, pgoff); + error = page_cache_read(file, pgoff/PAGE_CACHE_MMUCOUNT); /* * The page we want has now been added to the page cache. 
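/*
 * Illustration only: the two offsets filemap_nopage() above juggles.
 * vm_pgoff and pgoff are now in mmupage units, while the page cache is
 * still indexed in PAGE_CACHE_SIZE (= PAGE_SIZE) pages, hence
 * idx = pgoff / PAGE_CACHE_MMUCOUNT (hypothetical PAGE_CLUSTER=2).
 */
#include <assert.h>

#define MMUPAGE_SIZE          4096UL
#define PAGE_CACHE_MMUCOUNT   4       /* PAGE_CACHE_SIZE / MMUPAGE_SIZE */

int main(void)
{
        unsigned long vm_start = 0x40020000, vm_pgoff = 6; /* mmupages  */
        unsigned long address  = 0x40023000;               /* the fault */

        unsigned long pgoff = (address - vm_start) / MMUPAGE_SIZE + vm_pgoff;
        unsigned long idx   = pgoff / PAGE_CACHE_MMUCOUNT;

        assert(pgoff == 9);   /* 10th mmupage of the file              */
        assert(idx == 2);     /* lives in the 3rd 16KB page-cache page */
        return 0;
}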
@@ -1280,25 +1282,25 @@ static int filemap_populate(struct vm_ar struct file *file = vma->vm_file; struct address_space *mapping = file->f_dentry->d_inode->i_mapping; struct inode *inode = mapping->host; - unsigned long size; + unsigned long size, idx = pgoff/PAGE_CACHE_MMUCOUNT; struct mm_struct *mm = vma->vm_mm; struct page *page; int err; if (!nonblock) force_page_cache_readahead(mapping, vma->vm_file, - pgoff, len >> PAGE_CACHE_SHIFT); + idx, len >> PAGE_CACHE_SHIFT); repeat: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) + size = (i_size_read(inode) + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; + if (pgoff + (len + MMUPAGE_SIZE - 1)/MMUPAGE_SIZE > size) return -EINVAL; - page = filemap_getpage(file, pgoff, nonblock); + page = filemap_getpage(file, idx, nonblock); if (!page && !nonblock) return -ENOMEM; if (page) { - err = install_page(mm, vma, addr, page, prot); + err = install_page(mm, vma, addr, page, prot, pgoff % PAGE_MMUCOUNT); if (err) { page_cache_release(page); return err; @@ -1309,9 +1311,8 @@ repeat: * in the pte. */ unsigned long pgidx; - pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx = (addr - vma->vm_start) >> MMUPAGE_SHIFT; pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; if (pgoff != pgidx) { err = install_file_pte(mm, vma, addr, pgoff, prot); if (err) @@ -1319,8 +1320,8 @@ repeat: } } - len -= PAGE_SIZE; - addr += PAGE_SIZE; + len -= MMUPAGE_SIZE; + addr += MMUPAGE_SIZE; pgoff++; if (len) goto repeat; diff -prauN linux-2.6.0-test11/mm/fremap.c pgcl-2.6.0-test11-1/mm/fremap.c --- linux-2.6.0-test11/mm/fremap.c 2003-11-26 12:42:50.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/fremap.c 2003-11-27 21:55:21.000000000 -0800 @@ -55,7 +55,7 @@ static inline int zap_pte(struct mm_stru * previously existing mapping. */ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, struct page *page, pgprot_t prot) + unsigned long addr, struct page *page, pgprot_t prot, int subpfn) { int err = -ENOMEM, flush; pte_t *pte; @@ -63,6 +63,7 @@ int install_page(struct mm_struct *mm, s pmd_t *pmd; pte_t pte_val; struct pte_chain *pte_chain; + unsigned long pfn = page_to_pfn(page) + subpfn; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) @@ -82,7 +83,7 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); + set_pte(pte, pfn_pte(pfn, prot)); pte_chain = page_add_rmap(page, pte, pte_chain); pte_val = *pte; pte_unmap(pte); @@ -174,8 +175,8 @@ long sys_remap_file_pages(unsigned long /* * Sanitize the syscall parameters: */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; + start = start & MMUPAGE_MASK; + size = size & MMUPAGE_MASK; /* Does the address range wrap, or is the span zero-sized? */ if (start + size <= start) @@ -183,7 +184,7 @@ long sys_remap_file_pages(unsigned long /* Can we represent this offset inside this architecture's pte's? 
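/*
 * Illustration only: with clustering one page-cache page can back up to
 * PAGE_MMUCOUNT distinct ptes, so install_page() now takes the sub-pfn
 * explicitly and builds the pte with pfn_pte(page_to_pfn(page) + subpfn,
 * prot); filemap_populate() above derives subpfn from the mmupage-
 * granular file offset (hypothetical PAGE_CLUSTER=2 numbers).
 */
#include <assert.h>

#define PAGE_MMUCOUNT 4

int main(void)
{
        unsigned long page_first_pfn = 0x5000;  /* page_to_pfn(page) */
        unsigned long pgoff;

        for (pgoff = 8; pgoff < 12; pgoff++) {
                unsigned long subpfn  = pgoff % PAGE_MMUCOUNT;
                unsigned long pte_pfn = page_first_pfn + subpfn;

                /* mmupages 8..11 of the file all live in cache page 2,
                 * but each pte points at its own pfn 0x5000..0x5003 */
                assert(pgoff / PAGE_MMUCOUNT == 2);
                assert(pte_pfn == page_first_pfn + (pgoff - 8));
        }
        return 0;
}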
*/ #if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) + if (pgoff + (size >> MMUPAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) return err; #endif diff -prauN linux-2.6.0-test11/mm/highmem.c pgcl-2.6.0-test11-1/mm/highmem.c --- linux-2.6.0-test11/mm/highmem.c 2003-11-26 12:44:31.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/highmem.c 2003-11-27 21:55:21.000000000 -0800 @@ -56,8 +56,6 @@ static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -pte_t * pkmap_page_table; - static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); static void flush_all_zero_pkmaps(void) @@ -67,6 +65,8 @@ static void flush_all_zero_pkmaps(void) flush_cache_kmaps(); for (i = 0; i < LAST_PKMAP; i++) { + int j; + unsigned long vaddr = PKMAP_ADDR(i); struct page *page; /* @@ -80,8 +80,14 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - if (pte_none(pkmap_page_table[i])) - BUG(); + for (j = 0; j < PAGE_MMUCOUNT; ++j) { + unsigned long addr = vaddr + j*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + + BUG_ON(pte_none(*pte)); + } /* * Don't need an atomic fetch-and-clear op here; @@ -90,8 +96,20 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); - pte_clear(&pkmap_page_table[i]); + { + pgd_t *pgd = pgd_offset_k(vaddr); + pmd_t *pmd = pmd_offset(pgd, vaddr); + pte_t *pte = pte_offset_kernel(pmd, vaddr); + page = pte_page(*pte); + } + + for (j = 0; j < PAGE_MMUCOUNT; ++j) { + unsigned long addr = vaddr + j*MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_clear(pte); + } set_page_address(page, NULL); } @@ -101,7 +119,7 @@ static void flush_all_zero_pkmaps(void) static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; - int count; + int k, count; start: count = LAST_PKMAP; @@ -139,7 +157,15 @@ start: } } vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + WARN_ON(vaddr > __fix_to_virt(FIX_PKMAP_BEGIN)); + WARN_ON(vaddr < __fix_to_virt(FIX_PKMAP_END)); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + unsigned long addr = vaddr + k * MMUPAGE_SIZE; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd = pmd_offset(pgd, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + set_pte(pte, pfn_pte(page_to_pfn(page) + k, kmap_prot)); + } pkmap_count[last_pkmap_nr] = 1; set_page_address(page, (void *)vaddr); diff -prauN linux-2.6.0-test11/mm/madvise.c pgcl-2.6.0-test11-1/mm/madvise.c --- linux-2.6.0-test11/mm/madvise.c 2003-11-26 12:43:26.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/madvise.c 2003-11-27 21:55:21.000000000 -0800 @@ -60,10 +60,12 @@ static long madvise_willneed(struct vm_a if (!file) return -EBADF; - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + start = ((start - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + end = ((end - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; + start /= PAGE_MMUCOUNT; + end /= PAGE_MMUCOUNT; force_page_cache_readahead(file->f_dentry->d_inode->i_mapping, file, start, max_sane_readahead(end - start)); @@ -170,9 +172,9 @@ asmlinkage 
long sys_madvise(unsigned lon down_write(¤t->mm->mmap_sem); - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) goto out; - len = (len + ~PAGE_MASK) & PAGE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; diff -prauN linux-2.6.0-test11/mm/memory.c pgcl-2.6.0-test11-1/mm/memory.c --- linux-2.6.0-test11/mm/memory.c 2003-11-26 12:43:52.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/memory.c 2003-11-27 22:46:24.000000000 -0800 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -52,8 +53,8 @@ #include #include #include - -#include +#include +#include #ifndef CONFIG_DISCONTIGMEM /* use the per-pgdat data instead for discontigmem - mbligh */ @@ -72,19 +73,7 @@ EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(high_memory); -/* - * We special-case the C-O-W ZERO_PAGE, because it's such - * a common occurrence (no need to read the page to know - * that it's zero - better for the cache and memory subsystem). - */ -static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) -{ - if (from == ZERO_PAGE(address)) { - clear_user_highpage(to, address); - return; - } - copy_user_highpage(to, from, address); -} +struct pte_chain *rmap_add_folio(struct page *, pte_addr_t [], struct pte_chain *); /* * Note: this doesn't free the actual pages themselves. That @@ -134,7 +123,7 @@ static inline void free_one_pgd(struct m */ void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) { - pgd_t * page_dir = tlb->mm->pgd; + pgd_t * page_dir = tlb_mm(tlb)->pgd; page_dir += first; do { @@ -159,6 +148,8 @@ pte_t * pte_alloc_map(struct mm_struct * * entry, as somebody else could have populated it.. */ if (pmd_present(*pmd)) { + if (PAGE_MMUCOUNT > 1) + atomic_sub(PAGE_MMUCOUNT - 1, &new->count); pte_free(new); goto out; } @@ -354,7 +345,7 @@ skip_copy_pte_range: src_pte = pte_offset_map_nested(src_pmd, address); cont_copy_pte_range_noset: - address += PAGE_SIZE; + address += MMUPAGE_SIZE; if (address >= end) { pte_unmap_nested(src_pte); pte_unmap(dst_pte); @@ -400,8 +391,8 @@ zap_pte_range(struct mmu_gather *tlb, pm offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; - size &= PAGE_MASK; - for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + size &= MMUPAGE_MASK; + for (offset=0; offset < size; ptep++, offset += MMUPAGE_SIZE) { pte_t pte = *ptep; if (pte_none(pte)) continue; @@ -418,7 +409,7 @@ zap_pte_range(struct mmu_gather *tlb, pm if (page->mapping && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); - tlb->freed++; + tlb_inc_freed(tlb); page_remove_rmap(page, ptep); tlb_remove_page(tlb, page); } @@ -481,12 +472,12 @@ void unmap_page_range(struct mmu_gather /* Dispose of an entire struct mmu_gather per rescheduling point */ #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE) +#define ZAP_BLOCK_SIZE (FREE_PTE_NR * MMUPAGE_SIZE) #endif /* For UP, 256 pages at a time gives nice low latency */ #if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) +#define ZAP_BLOCK_SIZE (256 * MMUPAGE_SIZE) #endif /* No preempt: go for the best straight-line efficiency */ @@ -550,7 +541,7 @@ int unmap_vmas(struct mmu_gather **tlbp, continue; if (vma->vm_flags & VM_ACCOUNT) - *nr_accounted += (end - start) >> PAGE_SHIFT; + *nr_accounted += (end - start) >> MMUPAGE_SHIFT; ret++; while (start != end) { @@ -572,7 +563,9 @@ int unmap_vmas(struct 
mmu_gather **tlbp, if ((long)zap_bytes > 0) continue; if (need_resched()) { - tlb_finish_mmu(*tlbp, tlb_start, start); + tlb_finish_mmu(*tlbp, + tlb_start_valid ? tlb_start : 0, + start); cond_resched_lock(&mm->page_table_lock); *tlbp = tlb_gather_mmu(mm, 0); tlb_start_valid = 0; @@ -619,18 +612,19 @@ void zap_page_range(struct vm_area_struc * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. */ -struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) +unsigned long follow_page(struct mm_struct *mm, unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; unsigned long pfn; - struct vm_area_struct *vma; +#if 0 + struct vm_area_struct *vma; vma = hugepage_vma(mm, address); if (vma) return follow_huge_addr(mm, vma, address, write); +#endif pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) @@ -639,8 +633,10 @@ follow_page(struct mm_struct *mm, unsign pmd = pmd_offset(pgd, address); if (pmd_none(*pmd)) goto out; +#if 0 if (pmd_huge(*pmd)) return follow_huge_pmd(mm, address, pmd, write); +#endif if (pmd_bad(*pmd)) goto out; @@ -657,13 +653,13 @@ follow_page(struct mm_struct *mm, unsign struct page *page = pfn_to_page(pfn); mark_page_accessed(page); - return page; + return pfn; } } } out: - return NULL; + return 0; } /* @@ -672,17 +668,14 @@ out: * with IO-aperture pages for direct-IO. */ -static inline struct page *get_page_map(struct page *page) +unsigned long get_pfn_map(unsigned long pfn) { - if (!pfn_valid(page_to_pfn(page))) - return 0; - return page; + return pfn_valid(pfn) ? pfn : 0; } - -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +int get_user_pages(task_t *task, struct mm_struct *mm, unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) + unsigned long *pfns, struct vm_area_struct **vmas) { int i; unsigned int flags; @@ -713,7 +706,7 @@ int get_user_pages(struct task_struct *t .vm_page_prot = PAGE_READONLY, .vm_flags = VM_READ | VM_EXEC, }; - unsigned long pg = start & PAGE_MASK; + unsigned long pg = start & MMUPAGE_MASK; pgd_t *pgd; pmd_t *pmd; pte_t *pte; @@ -728,39 +721,41 @@ int get_user_pages(struct task_struct *t pte = pte_offset_kernel(pmd, pg); if (!pte || !pte_present(*pte)) return i ? : -EFAULT; - if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + if (pfns) { + pfns[i] = pte_pfn(*pte); + get_page(pfn_to_page(pfns[i])); } if (vmas) vmas[i] = &fixmap_vma; i++; - start += PAGE_SIZE; + start += MMUPAGE_SIZE; len--; continue; } #endif - if (!vma || (pages && (vma->vm_flags & VM_IO)) + if (!vma || (pfns && (vma->vm_flags & VM_IO)) || !(flags & vma->vm_flags)) return i ? : -EFAULT; +#if 0 if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &len, i); continue; } +#endif spin_lock(&mm->page_table_lock); do { - struct page *map; - while (!(map = follow_page(mm, start, write))) { + unsigned long map_pfn; + while (!(map_pfn = follow_page(mm, start, write))) { spin_unlock(&mm->page_table_lock); switch (handle_mm_fault(mm,vma,start,write)) { case VM_FAULT_MINOR: - tsk->min_flt++; + task->min_flt++; break; case VM_FAULT_MAJOR: - tsk->maj_flt++; + task->maj_flt++; break; case VM_FAULT_SIGBUS: return i ? 
i : -EFAULT; @@ -771,23 +766,35 @@ int get_user_pages(struct task_struct *t } spin_lock(&mm->page_table_lock); } - if (pages) { - pages[i] = get_page_map(map); - if (!pages[i]) { + if (pfns) { + pfns[i] = get_pfn_map(map_pfn); + if (!pfns[i]) { spin_unlock(&mm->page_table_lock); - while (i--) - page_cache_release(pages[i]); + while (i--) { + if (pfns[i]) + page_cache_release(pfn_to_page(pfns[i])); + } i = -EFAULT; goto out; } flush_dcache_page(pages[i]); - if (!PageReserved(pages[i])) - page_cache_get(pages[i]); + if (1) { + struct page *map; + if (pfns[i]) + map = pfn_to_page(pfns[i]); + else + map = NULL; + if (map) { + flush_dcache_page(map); + if (!PageReserved(map)) + page_cache_get(map); + } + } } if (vmas) vmas[i] = vma; i++; - start += PAGE_SIZE; + start += MMUPAGE_SIZE; len--; } while(len && start < vma->vm_end); spin_unlock(&mm->page_table_lock); @@ -811,7 +818,7 @@ static void zeromap_pte_range(pte_t * pt pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); set_pte(pte, zero_pte); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); } @@ -883,12 +890,12 @@ static inline void remap_pte_range(pte_t end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; - pfn = phys_addr >> PAGE_SHIFT; + pfn = phys_addr >> MMUPAGE_SHIFT; do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) set_pte(pte, pfn_pte(pfn, prot)); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pfn++; pte++; } while (address && (address < end)); @@ -969,11 +976,13 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void +break_cow(struct vm_area_struct *vma, struct page *new_page, + unsigned long address, pte_t *page_table, int subpfn) { + pte_t pte = pfn_pte(page_to_pfn(new_page) + subpfn, vma->vm_page_prot); flush_cache_page(vma, address); - establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(pte))); } /* @@ -1001,7 +1010,12 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain; + struct pte_chain *pte_chain = NULL; + pte_addr_t folio[PAGE_MMUCOUNT+1] = { [0 ... PAGE_MMUCOUNT] = 0 }; + int reprep, rss, ret; + + pr_debug("%d: do_wp_page(%p, %p, 0x%lx, %p, %p, %Lx\n", + current->pid, mm, vma, address, page_table, pmd, (u64)pte_val(pte)); if (unlikely(!pfn_valid(pfn))) { /* @@ -1016,68 +1030,89 @@ static int do_wp_page(struct mm_struct * return VM_FAULT_OOM; } old_page = pfn_to_page(pfn); + reprep = prepare_folio(folio, vma, address, ptep_to_paddr(page_table), 1); + new_page = private_folio_page(folio, PAGE_MMUSHIFT ? 
NULL : old_page); - if (!TestSetPageLocked(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - flush_cache_page(vma, address); - establish_pte(vma, address, page_table, - pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR; - } + if (new_page) { + pr_debug("%d: got private page\n", current->pid); + page_cache_get(new_page); + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) + panic("Could not allocate pte_chain in %s, %s:%d\n", + __FUNCTION__, __FILE__, __LINE__); + goto got_page; } - pte_unmap(page_table); - /* - * Ok, we need to copy. Oh, well.. - */ - page_cache_get(old_page); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_pte_chain; + if (!pte_chain) { + spin_lock(&mm->page_table_lock); + goto oom; + } + new_page = alloc_page(GFP_HIGHUSER); - if (!new_page) - goto no_new_page; - copy_cow_page(old_page,new_page,address); + pr_debug("%d: allocated page at 0x%lx\n", + current->pid, + page_to_pfn(new_page)); + if (!new_page) { + spin_lock(&mm->page_table_lock); + goto oom; + } /* * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (pte_same(*page_table, pte)) { - if (PageReserved(old_page)) - ++mm->rss; - page_remove_rmap(old_page, page_table); - break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); - lru_cache_add_active(new_page); + if (!pte_same(*page_table, pte)) { + pr_debug("%d: pte changed, back out of fault\n", current->pid); + page_cache_release(new_page); + return VM_FAULT_MINOR; + } - /* Free the old page.. */ - new_page = old_page; + if (reprep) { + pr_debug("%d: reprepping folio\n", current->pid); + prepare_folio(folio, vma, address, ptep_to_paddr(page_table), 1); + } + new_page = private_folio_page_xchg(folio, new_page); +got_page: + restrict_folio(folio, vma, address, page_table); + if (new_page != old_page) { + pr_debug("%d: copying folio\n", current->pid); + copy_folio(folio, new_page, old_page, address); + flush_cache_page(vma, address); + rmap_remove_folio(old_page, folio); + } + rss = set_folio_page(folio, new_page, vma->vm_page_prot, _PAGE_DIRTY|_PAGE_RW); + if (new_page != old_page) { + pte_chain = rmap_add_folio(new_page, folio, pte_chain); + adjust_page_count(new_page, rss - 1); + if (PageReserved(old_page)) + mm->rss += rss; + else + adjust_page_count(old_page, 1 - rss); } pte_unmap(page_table); - page_cache_release(new_page); - page_cache_release(old_page); + flush_folio(folio, vma, address); + update_mmu_cache(vma, address, folio); + if (!PageReserved(old_page)) + page_cache_release(old_page); + ret = VM_FAULT_MINOR; + goto out; +oom: + ret = VM_FAULT_OOM; +out: spin_unlock(&mm->page_table_lock); pte_chain_free(pte_chain); - return VM_FAULT_MINOR; - -no_new_page: - pte_chain_free(pte_chain); -no_pte_chain: - page_cache_release(old_page); - return VM_FAULT_OOM; + pr_debug("%d: return from do_wp_page()\n", current->pid); + return ret; } /* * Helper function for invalidate_mmap_range(). - * Both hba and hlen are page numbers in PAGE_SIZE units. + * Both hba and hlen are page numbers in MMUPAGE_SIZE units. * An hlen of zero blows away the entire portion file after hba. 
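/*
 * Reader's note on the folio arrays used by do_wp_page() above.  The
 * helper definitions are not in this hunk; the flow below is inferred
 * from the call sites and is only a sketch:
 *
 *   pte_addr_t folio[PAGE_MMUCOUNT + 1];  zero-terminated list of the
 *       pte slots in this pmd that map into the same PAGE_SIZE page as
 *       the faulting address.
 *
 *   prepare_folio()     - collect those pte addresses (redone if the
 *                         page_table_lock was dropped in between).
 *   restrict_folio()    - drop entries the vma or existing ptes exclude.
 *   private_folio_page()/private_folio_page_xchg()
 *                       - pick (or swap in) a page this mm can own.
 *   copy_folio()        - copy only the mmupages named by the folio.
 *   set_folio_page()    - write one pte per remaining entry and return
 *                         how many were instantiated ("rss").
 *   rmap_add_folio()/rmap_remove_folio()
 *                       - batch the reverse-map updates.
 *   adjust_page_count() - fix the refcount by (rss - 1), since a single
 *                         page_cache_get() preceded the batch.
 *
 * A toy model of that refcount bookkeeping (illustration only):
 */
#include <assert.h>

int main(void)
{
        int page_count = 1;        /* after page_cache_get(new_page)      */
        int rss = 3;               /* ptes set_folio_page() filled in     */

        page_count += rss - 1;     /* adjust_page_count(new_page, rss-1)  */
        assert(page_count == rss); /* one reference per mapping pte       */
        return 0;
}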
*/ static void @@ -1099,14 +1134,14 @@ invalidate_mmap_range_list(struct list_h list_for_each(curr, head) { vp = list_entry(curr, struct vm_area_struct, shared); vba = vp->vm_pgoff; - vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1; + vea = vba + ((vp->vm_end - vp->vm_start) >> MMUPAGE_SHIFT) - 1; if (hea < vba || vea < hba) continue; /* Mapping disjoint from hole. */ zba = (hba <= vba) ? vba : hba; zea = (vea <= hea) ? vea : hea; zap_page_range(vp, - ((zba - vba) << PAGE_SHIFT) + vp->vm_start, - (zea - zba + 1) << PAGE_SHIFT); + ((zba - vba) << MMUPAGE_SHIFT) + vp->vm_start, + (zea - zba + 1) << MMUPAGE_SHIFT); } } @@ -1116,24 +1151,24 @@ invalidate_mmap_range_list(struct list_h * page range in the underlying file. * @address_space: the address space containing mmaps to be invalidated. * @holebegin: byte in first page to invalidate, relative to the start of - * the underlying file. This will be rounded down to a PAGE_SIZE + * the underlying file. This will be rounded down to a MMUPAGE_SIZE * boundary. Note that this is different from vmtruncate(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded - * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * up to a MMUPAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. */ void invalidate_mmap_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { - unsigned long hba = holebegin >> PAGE_SHIFT; - unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long hba = holebegin >> MMUPAGE_SHIFT; + unsigned long hlen = (holelen + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = - (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + (holebegin + holelen + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; @@ -1165,7 +1200,7 @@ int vmtruncate(struct inode * inode, lof if (inode->i_size < offset) goto do_expand; i_size_write(inode, offset); - invalidate_mmap_range(mapping, offset + PAGE_SIZE - 1, 0); + invalidate_mmap_range(mapping, offset + MMUPAGE_SIZE - 1, 0); truncate_inode_pages(mapping, offset); goto out_truncate; @@ -1224,19 +1259,22 @@ static int do_swap_page(struct mm_struct struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { - struct page *page; + struct page *page, *swap_page; + pte_addr_t folio[PAGE_MMUCOUNT+1] = { [0 ... PAGE_MMUCOUNT] = 0 }; swp_entry_t entry = pte_to_swp_entry(orig_pte); - pte_t pte; - int ret = VM_FAULT_MINOR; + int rss, ret = VM_FAULT_MINOR; struct pte_chain *pte_chain = NULL; + pr_debug("%d: do_swap_page(%p, %p, %lx, %p, %p, %Lx, %d)\n", + current->pid, mm, vma, address, page_table, pmd, (u64)pte_val(orig_pte), write_access); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - page = lookup_swap_cache(entry); - if (!page) { + swap_page = lookup_swap_cache(entry); + if (!swap_page) { swapin_readahead(entry); - page = read_swap_cache_async(entry); - if (!page) { + swap_page = read_swap_cache_async(entry); + if (!swap_page) { /* * Back out if somebody else faulted in this pte while * we released the page table lock. 
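/*
 * Illustration only: the hole arithmetic above, now in mmupage units.
 * Truncating a file to 0x5123 bytes must keep the partial mmupage that
 * still contains data, so vmtruncate() passes offset + MMUPAGE_SIZE - 1
 * and invalidation starts at the next mmupage boundary.
 */
#include <assert.h>

#define MMUPAGE_SHIFT 12
#define MMUPAGE_SIZE  (1ULL << MMUPAGE_SHIFT)

int main(void)
{
        unsigned long long offset    = 0x5123;                  /* new i_size */
        unsigned long long holebegin = offset + MMUPAGE_SIZE - 1;
        unsigned long long holelen   = 0;                       /* to EOF     */

        unsigned long hba  = holebegin >> MMUPAGE_SHIFT;
        unsigned long hlen = (holelen + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT;

        assert(hba == 6);   /* mmupage 5 holds the tail and is kept       */
        assert(hlen == 0);  /* 0 means "blow away everything after hba"   */
        return 0;
}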
@@ -1257,13 +1295,13 @@ static int do_swap_page(struct mm_struct inc_page_state(pgmajfault); } - mark_page_accessed(page); + mark_page_accessed(swap_page); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) { - ret = -ENOMEM; + ret = VM_FAULT_OOM; goto out; } - lock_page(page); + lock_page(swap_page); /* * Back out if somebody else faulted in this pte while we @@ -1271,30 +1309,45 @@ static int do_swap_page(struct mm_struct */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (!pte_same(*page_table, orig_pte)) { + if (pte_to_swp_entry(*page_table).val != entry.val) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); + unlock_page(swap_page); + page_cache_release(swap_page); ret = VM_FAULT_MINOR; goto out; } /* The page isn't present yet, go ahead with the fault. */ - swap_free(entry); if (vm_swap_full()) - remove_exclusive_swap_page(page); + remove_exclusive_swap_page(swap_page); - mm->rss++; - pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) - pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); + prepare_folio(folio, vma, address, ptep_to_paddr(page_table), write_access); + if (write_access) { + page = private_folio_page(folio, swap_page); + restrict_folio(folio, vma, address, page_table); + if (!page) { + page = swap_page; + write_access = 0; + } else if (page != swap_page) { + page_cache_get(page); + copy_folio(folio, page, swap_page, address); + } + } else { + restrict_folio(folio, vma, address, page_table); + page = swap_page; + } flush_icache_page(vma, page); - set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + rss = set_folio_page(folio, page, vma->vm_page_prot, write_access ? (_PAGE_DIRTY|_PAGE_RW) : 0); + pte_chain = rmap_add_folio(page, folio, pte_chain); + adjust_page_count(page, rss - 1); + mm->rss += rss; + __swap_free(entry, rss); + unlock_page(swap_page); + if (page != swap_page) + page_cache_release(swap_page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -1302,6 +1355,7 @@ static int do_swap_page(struct mm_struct spin_unlock(&mm->page_table_lock); out: pte_chain_free(pte_chain); + pr_debug("%d: return from do_swap_page()\n", current->pid); return ret; } @@ -1315,11 +1369,14 @@ do_anonymous_page(struct mm_struct *mm, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr) { - pte_t entry; - struct page * page = ZERO_PAGE(addr); + pte_addr_t folio[PAGE_MMUCOUNT+1] = { [0 ... PAGE_MMUCOUNT] = 0 }; + struct page *new_page, *page = ZERO_PAGE(addr); struct pte_chain *pte_chain; int ret; + pr_debug("%d: do_anonymous_page(%p, %p, %p, %p, %d, %lx)\n", + current->pid, mm, vma, page_table, pmd, write_access, addr); + pte_chain = pte_chain_alloc(GFP_ATOMIC); if (!pte_chain) { pte_unmap(page_table); @@ -1331,39 +1388,50 @@ do_anonymous_page(struct mm_struct *mm, page_table = pte_offset_map(pmd, addr); } - /* Read-only mapping of ZERO_PAGE. */ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); - /* ..except if it's a write access */ if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - page = alloc_page(GFP_HIGHUSER); - if (!page) + new_page = alloc_page(GFP_HIGHUSER); + pr_debug("%d: allocated page at 0x%lx\n", + current->pid, + page_to_pfn(new_page)); + if (!new_page) goto no_mem; - clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { pte_unmap(page_table); - page_cache_release(page); + page_cache_release(new_page); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; goto out; } - mm->rss++; - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - lru_cache_add_active(page); - mark_page_accessed(page); - } + lru_cache_add_active(new_page); + mark_page_accessed(new_page); + } else + new_page = NULL; - set_pte(page_table, entry); + prepare_folio(folio, vma, addr, ptep_to_paddr(page_table), write_access); + if (write_access) { + int rss; + new_page = private_folio_page_xchg(folio, new_page); + restrict_folio(folio, vma, addr, page_table); + copy_folio(folio, new_page, page, addr); + page = new_page; + rss = set_folio_page(folio, page, vma->vm_page_prot, _PAGE_RW|_PAGE_DIRTY); + adjust_page_count(page, rss - 1); + mm->rss += rss; + } else { + restrict_folio(folio, vma, addr, page_table); + set_folio_page(folio, page, vma->vm_page_prot, 0); + } /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); + pte_chain = rmap_add_folio(page, folio, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ @@ -1376,6 +1444,7 @@ no_mem: ret = VM_FAULT_OOM; out: pte_chain_free(pte_chain); + pr_debug("%d: return from do_anonymous_page()\n", current->pid); return ret; } @@ -1395,16 +1464,15 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { - struct page * new_page; + struct page *page, *new_page; + pte_addr_t folio[PAGE_MMUCOUNT+1] = { [0 ... PAGE_MMUCOUNT] = 0 }; struct address_space *mapping = NULL; - pte_t entry; - struct pte_chain *pte_chain; - int sequence = 0; - int ret; + struct pte_chain *pte_chain = NULL; + int ret, rss, sequence = 0; + + pr_debug("%d: do_no_page(%p, %p, %lx, %d, %p, %p)\n", + current->pid, mm, vma, address, write_access, page_table, pmd); - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, - pmd, write_access, address); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1414,32 +1482,39 @@ do_no_page(struct mm_struct *mm, struct } smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */ retry: - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + page = vma->vm_ops->nopage(vma, address, 0); /* no page was available -- either SIGBUS or OOM */ - if (new_page == NOPAGE_SIGBUS) + if (page == NOPAGE_SIGBUS) { + printk("%d: return VM_FAULT_SIGBUS from do_no_page()\n", current->pid); return VM_FAULT_SIGBUS; - if (new_page == NOPAGE_OOM) + } else if (page == NOPAGE_OOM) { + printk("%d: return VM_FAULT_OOM from do_no_page()\n", current->pid); return VM_FAULT_OOM; + } - pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto oom; /* * Should we do an early C-O-W break? 
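/*
 * Illustration only (the exact folio construction lives in
 * prepare_folio()/restrict_folio(), which are not in this hunk):
 * assuming the folio covers the PAGE_SIZE-aligned virtual window around
 * the fault, clipped to the vma, a single anonymous write fault in
 * do_anonymous_page() above can instantiate several ptes at once, and
 * mm->rss grows by set_folio_page()'s return value rather than by one.
 */
#include <assert.h>

#define MMUPAGE_SIZE 4096UL
#define PAGE_SIZE    (4 * MMUPAGE_SIZE)     /* PAGE_CLUSTER=2 example */
#define PAGE_MASK    (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long vm_start = 0x0804f000, vm_end = 0x08057000;
        unsigned long addr = 0x08051840;            /* write fault     */
        unsigned long lo, hi, rss;

        lo = addr & PAGE_MASK;                      /* 0x08050000      */
        hi = lo + PAGE_SIZE;                        /* 0x08054000      */
        if (lo < vm_start)
                lo = vm_start;
        if (hi > vm_end)
                hi = vm_end;

        rss = (hi - lo) / MMUPAGE_SIZE;
        assert(rss == 4);   /* mm->rss += 4 for this one fault */
        return 0;
}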
*/ - if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if ((write_access && !(vma->vm_flags & VM_SHARED)) && + (page_count(page) > 1 || PageReserved(page))) { + new_page = alloc_page(GFP_HIGHUSER); + pr_debug("%d: allocated page at 0x%lx\n", + current->pid, + page_to_pfn(new_page)); + if (!new_page) { + if (!PageReserved(page)) + page_cache_release(page); goto oom; } - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - lru_cache_add_active(page); - new_page = page; - } + lru_cache_add_active(new_page); + } else + new_page = NULL; spin_lock(&mm->page_table_lock); /* @@ -1451,34 +1526,14 @@ retry: (unlikely(sequence != atomic_read(&mapping->truncate_count)))) { sequence = atomic_read(&mapping->truncate_count); spin_unlock(&mm->page_table_lock); - page_cache_release(new_page); + if (new_page) + page_cache_release(new_page); pte_chain_free(pte_chain); goto retry; } page_table = pte_offset_map(pmd, address); - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if write_access is true, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. - */ - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - ++mm->rss; - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); - if (write_access) - entry = pte_mkwrite(pte_mkdirty(entry)); - set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); - pte_unmap(page_table); - } else { + if (!pte_none(*page_table)) { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); page_cache_release(new_page); @@ -1487,8 +1542,33 @@ retry: goto out; } - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); + prepare_folio(folio, vma, address, ptep_to_paddr(page_table), !!new_page); + if (new_page) { + new_page = private_folio_page_xchg(folio, new_page); + restrict_folio(folio, vma, address, page_table); + copy_folio(folio, new_page, page, address); + if (!PageReserved(page)) + page_cache_release(page); + page = new_page; + } else + restrict_folio(folio, vma, address, page_table); + + flush_icache_page(vma, page); + rss = set_folio_page(folio, page, vma->vm_page_prot, write_access ? (_PAGE_RW|_PAGE_DIRTY) : 0); + BUG_ON(!pte_chain); + if (!PageReserved(page)) { + /* + * Ignores PageReserved() internally but best to dodge + * the function call overhead. 
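+ * rmap_add_folio() records a reverse mapping for every non-zero slot
+ * in folio[], so one fault on a PAGE_SIZE page may add up to
+ * PAGE_MMUCOUNT pte_chain entries.  adjust_page_count(page, rss - 1)
+ * then raises the refcount to one per installed pte; the reference
+ * from ->nopage() or alloc_page() covers the first.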
+ */ + pte_chain = rmap_add_folio(page, folio, pte_chain); + adjust_page_count(page, rss - 1); + mm->rss += rss; + } + + /* No need to invalidate - it was non-present before */ + pte_unmap(page_table); + update_mmu_cache(vma, address, pte); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MAJOR; goto out; @@ -1496,6 +1576,7 @@ oom: ret = VM_FAULT_OOM; out: pte_chain_free(pte_chain); + pr_debug("%d: return %d from do_no_page()\n", current->pid, ret); return ret; } @@ -1510,6 +1591,9 @@ static int do_file_page(struct mm_struct unsigned long pgoff; int err; + pr_debug("%d: do_file_page(%p, %p, %lx, %d, %p, %p)\n", + current->pid, mm, vma, address, write_access, pte, pmd); + BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); /* * Fall back to the linear mapping if the fs does not support @@ -1526,7 +1610,8 @@ static int do_file_page(struct mm_struct pte_unmap(pte); spin_unlock(&mm->page_table_lock); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + err = vma->vm_ops->populate(vma, address & MMUPAGE_MASK, MMUPAGE_SIZE, vma->vm_page_prot, pgoff, 0); + pr_debug("%d: return from do_file_page()\n", current->pid); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -1568,8 +1653,11 @@ static inline int handle_pte_fault(struc * and the PTE updates will not touch it later. So * drop the lock. */ - if (pte_none(entry)) + if (pte_none(entry)) { + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, pte, pmd, write_access, address); return do_no_page(mm, vma, address, write_access, pte, pmd); + } if (pte_file(entry)) return do_file_page(mm, vma, address, write_access, pte, pmd); return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); @@ -1597,6 +1685,12 @@ int handle_mm_fault(struct mm_struct *mm pgd_t *pgd; pmd_t *pmd; + address &= MMUPAGE_MASK; + + pr_debug("%d: handle_mm_fault(%p, %p = [%lx, %lx), %lx, %d)\n", + current->pid, mm, vma, vma->vm_start, vma->vm_end, + address, write_access); + __set_current_state(TASK_RUNNING); pgd = pgd_offset(mm, address); @@ -1664,7 +1758,7 @@ int make_pages_present(unsigned long add BUG(); if (end > vma->vm_end) BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + len = (end+MMUPAGE_SIZE-1)/MMUPAGE_SIZE-addr/MMUPAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); if (ret < 0) diff -prauN linux-2.6.0-test11/mm/mincore.c pgcl-2.6.0-test11-1/mm/mincore.c --- linux-2.6.0-test11/mm/mincore.c 2003-11-26 12:45:52.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/mincore.c 2003-11-27 21:55:21.000000000 -0800 @@ -29,7 +29,7 @@ static unsigned char mincore_page(struct struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; struct page * page; - page = find_get_page(as, pgoff); + page = find_get_page(as, pgoff/PAGE_CACHE_MMUCOUNT); if (page) { present = PageUptodate(page); page_cache_release(page); @@ -42,41 +42,43 @@ static long mincore_vma(struct vm_area_s unsigned long start, unsigned long end, unsigned char __user * vec) { long error, i, remaining; - unsigned char * tmp; + unsigned char *kaddr; + struct page *page; error = -ENOMEM; if (!vma->vm_file) return error; - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + start = ((start - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + end = ((end - vma->vm_start) >> MMUPAGE_SHIFT) + vma->vm_pgoff; error = -EAGAIN; - tmp = (unsigned char *) __get_free_page(GFP_KERNEL); - 
if (!tmp) + page = alloc_page(GFP_HIGHUSER); + if (!page) return error; /* (end - start) is # of pages, and also # of bytes in "vec */ - remaining = (end - start), + remaining = end - start; error = 0; + kaddr = kmap_atomic(page, KM_USER0); for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { int j = 0; long thispiece = (remaining < PAGE_SIZE) ? remaining : PAGE_SIZE; while (j < thispiece) - tmp[j++] = mincore_page(vma, start++); + kaddr[j++] = mincore_page(vma, start++); - if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { + if (copy_to_user(vec + PAGE_SIZE * i, kaddr, thispiece)) { error = -EFAULT; break; } } - - free_page((unsigned long) tmp); + kunmap_atomic(kaddr, KM_USER0); + __free_page(page); return error; } @@ -116,15 +118,15 @@ asmlinkage long sys_mincore(unsigned lon down_read(¤t->mm->mmap_sem); - if (start & ~PAGE_CACHE_MASK) + if (start & ~MMUPAGE_MASK) goto out; - len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; error = -EFAULT; - if (!access_ok(VERIFY_WRITE, (unsigned long) vec, len >> PAGE_SHIFT)) + if (!access_ok(VERIFY_WRITE, (unsigned long) vec, len >> MMUPAGE_SHIFT)) goto out; error = 0; @@ -164,7 +166,7 @@ asmlinkage long sys_mincore(unsigned lon error = mincore_vma(vma, start, vma->vm_end, &vec[index]); if (error) goto out; - index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; + index += (vma->vm_end - start)/MMUPAGE_SIZE; start = vma->vm_end; vma = vma->vm_next; } diff -prauN linux-2.6.0-test11/mm/mlock.c pgcl-2.6.0-test11-1/mm/mlock.c --- linux-2.6.0-test11/mm/mlock.c 2003-11-26 12:42:56.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/mlock.c 2003-11-27 21:55:21.000000000 -0800 @@ -40,7 +40,7 @@ static int mlock_fixup(struct vm_area_st /* * Keep track of amount of locked VM. 
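 * Locked VM is accounted in MMUPAGE_SIZE units here, as total_vm is
 * elsewhere in this patch, hence the MMUPAGE_SHIFT below.  E.g. with
 * the usual 4K MMU page an mlock() of 64K contributes 16 to
 * locked_vm regardless of the page clustering factor.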
*/ - pages = (end - start) >> PAGE_SHIFT; + pages = (end - start) >> MMUPAGE_SHIFT; if (newflags & VM_LOCKED) { pages = -pages; ret = make_pages_present(start, end); @@ -59,7 +59,7 @@ static int do_mlock(unsigned long start, if (on && !capable(CAP_IPC_LOCK)) return -EPERM; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; @@ -105,14 +105,14 @@ asmlinkage long sys_mlock(unsigned long int error = -ENOMEM; down_write(¤t->mm->mmap_sem); - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); - start &= PAGE_MASK; + len = MMUPAGE_ALIGN(len + (start & ~MMUPAGE_MASK)); + start &= MMUPAGE_MASK; - locked = len >> PAGE_SHIFT; + locked = len >> MMUPAGE_SHIFT; locked += current->mm->locked_vm; lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; + lock_limit >>= MMUPAGE_SHIFT; /* check against resource limits */ if (locked <= lock_limit) @@ -126,8 +126,8 @@ asmlinkage long sys_munlock(unsigned lon int ret; down_write(¤t->mm->mmap_sem); - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); - start &= PAGE_MASK; + len = MMUPAGE_ALIGN(len + (start & ~MMUPAGE_MASK)); + start &= MMUPAGE_MASK; ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); return ret; @@ -171,7 +171,7 @@ asmlinkage long sys_mlockall(int flags) goto out; lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; + lock_limit >>= MMUPAGE_SHIFT; ret = -ENOMEM; if (current->mm->total_vm <= lock_limit) diff -prauN linux-2.6.0-test11/mm/mmap.c pgcl-2.6.0-test11-1/mm/mmap.c --- linux-2.6.0-test11/mm/mmap.c 2003-11-26 12:44:31.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/mmap.c 2003-11-27 22:49:10.000000000 -0800 @@ -104,8 +104,8 @@ asmlinkage unsigned long sys_brk(unsigne if (brk < mm->end_code) goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); + newbrk = MMUPAGE_ALIGN(brk); + oldbrk = MMUPAGE_ALIGN(mm->brk); if (oldbrk == newbrk) goto set_brk; @@ -122,7 +122,7 @@ asmlinkage unsigned long sys_brk(unsigne goto out; /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + if (find_vma_intersection(mm, oldbrk, newbrk+MMUPAGE_SIZE)) goto out; /* Ok, looks good - let it rip. */ @@ -354,7 +354,7 @@ can_vma_merge_after(struct vm_area_struc if (!file) return 1; /* anon mapping */ - vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + vma_size = (vma->vm_end - vma->vm_start) >> MMUPAGE_SHIFT; if (vma->vm_pgoff + vma_size == vm_pgoff) return 1; } @@ -412,7 +412,7 @@ static int vma_merge(struct mm_struct *m next = prev->vm_next; if (next && prev->vm_end == next->vm_start && can_vma_merge_before(next, vm_flags, file, - pgoff, (end - addr) >> PAGE_SHIFT)) { + pgoff, (end - addr) >> MMUPAGE_SHIFT)) { prev->vm_end = next->vm_end; __vma_unlink(mm, next, prev); __remove_shared_vm_struct(next, inode); @@ -439,14 +439,14 @@ static int vma_merge(struct mm_struct *m if (prev) { merge_next: if (!can_vma_merge_before(prev, vm_flags, file, - pgoff, (end - addr) >> PAGE_SHIFT)) + pgoff, (end - addr) >> MMUPAGE_SHIFT)) return 0; if (end == prev->vm_start) { if (file) down(i_shared_sem); spin_lock(lock); prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; + prev->vm_pgoff -= (end - addr) >> MMUPAGE_SHIFT; spin_unlock(lock); if (file) up(i_shared_sem); @@ -481,12 +481,12 @@ unsigned long do_mmap_pgoff(struct file return addr; /* Careful about overflows.. */ - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); if (!len || len > TASK_SIZE) return -EINVAL; /* offset overflow? 
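 * pgoff here (and vm_pgoff generally) is kept in MMUPAGE_SIZE units
 * in this patch; a page cache index is only derived at lookup time by
 * dividing by PAGE_CACHE_MMUCOUNT (see mincore_page() above and
 * shmem_nopage() below), so the length is converted with
 * len/MMUPAGE_SIZE.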
*/ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if (pgoff + len/MMUPAGE_SIZE < pgoff) return -EINVAL; /* Too many mappings? */ @@ -497,7 +497,7 @@ unsigned long do_mmap_pgoff(struct file * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) return addr; /* Do simple checking here so the lower-level routines won't have @@ -514,7 +514,7 @@ unsigned long do_mmap_pgoff(struct file } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked = mm->locked_vm << MMUPAGE_SHIFT; locked += len; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; @@ -582,7 +582,7 @@ munmap_back: } /* Check against address space limit. */ - if ((mm->total_vm << PAGE_SHIFT) + len + if ((mm->total_vm << MMUPAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; @@ -594,7 +594,7 @@ munmap_back: /* * Private writable mapping: check memory availability */ - charged = len >> PAGE_SHIFT; + charged = len >> MMUPAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; @@ -679,9 +679,9 @@ munmap_back: kmem_cache_free(vm_area_cachep, vma); } out: - mm->total_vm += len >> PAGE_SHIFT; + mm->total_vm += len >> MMUPAGE_SHIFT; if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; + mm->locked_vm += len >> MMUPAGE_SHIFT; make_pages_present(addr, addr + len); } if (flags & MAP_POPULATE) { @@ -716,7 +716,7 @@ EXPORT_SYMBOL(do_mmap_pgoff); * Ugly calling convention alert: * Return value with the low bits set means error value, * ie - * if (ret & ~PAGE_MASK) + * if (ret & ~MMUPAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. @@ -734,7 +734,7 @@ arch_get_unmapped_area(struct file *filp return -ENOMEM; if (addr) { - addr = PAGE_ALIGN(addr); + addr = MMUPAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -781,7 +781,7 @@ get_unmapped_area(struct file *file, uns if (addr > TASK_SIZE - len) return -ENOMEM; - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) return -EINVAL; if (file && is_file_hugepages(file)) { /* @@ -891,18 +891,20 @@ int expand_stack(struct vm_area_struct * { unsigned long grow; - if (!(vma->vm_flags & VM_GROWSUP)) + if (!(vma->vm_flags & VM_GROWSUP)) { + printk("bad vma flags in expand_stack()\n"); return -EFAULT; + } /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need to get * the spinlock only before relocating the vma range ourself. */ - address += 4 + PAGE_SIZE - 1; - address &= PAGE_MASK; + address += 4 + MMUPAGE_SIZE - 1; + address &= MMUPAGE_MASK; spin_lock(&vma->vm_mm->page_table_lock); - grow = (address - vma->vm_end) >> PAGE_SHIFT; + grow = (address - vma->vm_end) >> MMUPAGE_SHIFT; /* Overcommit.. 
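 * grow was computed above in MMUPAGE_SIZE units, so the overcommit
 * charge and the RLIMIT_AS comparison below (total_vm + grow, scaled
 * by MMUPAGE_SHIFT) stay consistent with the rest of the patch's VM
 * accounting.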
*/ if (security_vm_enough_memory(grow)) { @@ -911,7 +913,7 @@ int expand_stack(struct vm_area_struct * } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + ((vma->vm_mm->total_vm + grow) << MMUPAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); @@ -930,7 +932,7 @@ find_extend_vma(struct mm_struct *mm, un { struct vm_area_struct *vma, *prev; - addr &= PAGE_MASK; + addr &= MMUPAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; @@ -954,9 +956,10 @@ int expand_stack(struct vm_area_struct * * is required to hold the mmap_sem in read mode. We need to get * the spinlock only before relocating the vma range ourself. */ + /* address &= MMUPAGE_MASK; */ address &= PAGE_MASK; spin_lock(&vma->vm_mm->page_table_lock); - grow = (vma->vm_start - address) >> PAGE_SHIFT; + grow = (vma->vm_start - address) >> MMUPAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { @@ -965,7 +968,7 @@ int expand_stack(struct vm_area_struct * } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + ((vma->vm_mm->total_vm + grow) << MMUPAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); @@ -986,7 +989,7 @@ find_extend_vma(struct mm_struct * mm, u struct vm_area_struct * vma; unsigned long start; - addr &= PAGE_MASK; + addr &= MMUPAGE_MASK; vma = find_vma(mm,addr); if (!vma) return NULL; @@ -1023,7 +1026,7 @@ static void free_pgtables(struct mmu_gat unsigned long first = start & PGDIR_MASK; unsigned long last = end + PGDIR_SIZE - 1; unsigned long start_index, end_index; - struct mm_struct *mm = tlb->mm; + struct mm_struct *mm = tlb_mm(tlb); if (!prev) { prev = mm->mmap; @@ -1078,9 +1081,9 @@ static void unmap_vma(struct mm_struct * { size_t len = area->vm_end - area->vm_start; - area->vm_mm->total_vm -= len >> PAGE_SHIFT; + area->vm_mm->total_vm -= len >> MMUPAGE_SHIFT; if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + area->vm_mm->locked_vm -= len >> MMUPAGE_SHIFT; /* * Is this a new hole at the lowest possible address? */ @@ -1187,7 +1190,7 @@ int split_vma(struct mm_struct * mm, str new->vm_end = addr; else { new->vm_start = addr; - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + new->vm_pgoff += ((addr - vma->vm_start) >> MMUPAGE_SHIFT); } if (new->vm_file) @@ -1205,7 +1208,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + vma->vm_pgoff += ((addr - new->vm_start) >> MMUPAGE_SHIFT); } else vma->vm_end = addr; @@ -1228,10 +1231,10 @@ int do_munmap(struct mm_struct *mm, unsi unsigned long end; struct vm_area_struct *mpnt, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + if ((start & ~MMUPAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; - if ((len = PAGE_ALIGN(len)) == 0) + if ((len = MMUPAGE_ALIGN(len)) == 0) return -EINVAL; /* Find the first overlapping VMA */ @@ -1316,7 +1319,7 @@ unsigned long do_brk(unsigned long addr, unsigned long flags; struct rb_node ** rb_link, * rb_parent; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); if (!len) return addr; @@ -1327,7 +1330,7 @@ unsigned long do_brk(unsigned long addr, * mlock MCL_FUTURE? 
*/ if (mm->def_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked = mm->locked_vm << MMUPAGE_SHIFT; locked += len; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; @@ -1345,14 +1348,14 @@ unsigned long do_brk(unsigned long addr, } /* Check against address space limits *after* clearing old maps... */ - if ((mm->total_vm << PAGE_SHIFT) + len + if ((mm->total_vm << MMUPAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; if (mm->map_count > MAX_MAP_COUNT) return -ENOMEM; - if (security_vm_enough_memory(len >> PAGE_SHIFT)) + if (security_vm_enough_memory(len >> MMUPAGE_SHIFT)) return -ENOMEM; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; @@ -1367,7 +1370,7 @@ unsigned long do_brk(unsigned long addr, */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); + vm_unacct_memory(len >> MMUPAGE_SHIFT); return -ENOMEM; } @@ -1385,9 +1388,9 @@ unsigned long do_brk(unsigned long addr, vma_link(mm, vma, prev, rb_link, rb_parent); out: - mm->total_vm += len >> PAGE_SHIFT; + mm->total_vm += len >> MMUPAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; + mm->locked_vm += len >> MMUPAGE_SHIFT; make_pages_present(addr, addr + len); } return addr; diff -prauN linux-2.6.0-test11/mm/mprotect.c pgcl-2.6.0-test11-1/mm/mprotect.c --- linux-2.6.0-test11/mm/mprotect.c 2003-11-26 12:43:38.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/mprotect.c 2003-11-27 21:55:21.000000000 -0800 @@ -53,7 +53,7 @@ change_pte_range(pmd_t *pmd, unsigned lo entry = ptep_get_and_clear(pte); set_pte(pte, pte_modify(entry, newprot)); } - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); pte_unmap(pte - 1); @@ -174,7 +174,7 @@ mprotect_fixup(struct vm_area_struct *vm */ if (newflags & VM_WRITE) { if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { - charged = (end - start) >> PAGE_SHIFT; + charged = (end - start) >> MMUPAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; newflags |= VM_ACCOUNT; @@ -232,9 +232,9 @@ sys_mprotect(unsigned long start, size_t if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) return -EINVAL; - len = PAGE_ALIGN(len); + len = MMUPAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; diff -prauN linux-2.6.0-test11/mm/mremap.c pgcl-2.6.0-test11-1/mm/mremap.c --- linux-2.6.0-test11/mm/mremap.c 2003-11-26 12:44:19.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/mremap.c 2003-11-27 21:55:21.000000000 -0800 @@ -159,7 +159,7 @@ static int move_page_tables(struct vm_ar * only a few pages.. This also makes error recovery easier. 
*/ while (offset) { - offset -= PAGE_SIZE; + offset -= MMUPAGE_SIZE; if (move_one_page(vma, old_addr + offset, new_addr + offset)) goto oops_we_failed; } @@ -174,7 +174,7 @@ static int move_page_tables(struct vm_ar */ oops_we_failed: flush_cache_range(vma, new_addr, new_addr + len); - while ((offset += PAGE_SIZE) < len) + while ((offset += MMUPAGE_SIZE) < len) move_one_page(vma, new_addr + offset, old_addr + offset); zap_page_range(vma, new_addr, len); return -1; @@ -248,7 +248,7 @@ static unsigned long move_vma(struct vm_ INIT_LIST_HEAD(&new_vma->shared); new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; - new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_pgoff += (addr - vma->vm_start) >> MMUPAGE_SHIFT; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) @@ -276,9 +276,9 @@ static unsigned long move_vma(struct vm_ vma->vm_next->vm_flags |= VM_ACCOUNT; } - current->mm->total_vm += new_len >> PAGE_SHIFT; + current->mm->total_vm += new_len >> MMUPAGE_SHIFT; if (vm_locked) { - current->mm->locked_vm += new_len >> PAGE_SHIFT; + current->mm->locked_vm += new_len >> MMUPAGE_SHIFT; if (new_len > old_len) make_pages_present(new_addr + old_len, new_addr + new_len); @@ -309,15 +309,15 @@ unsigned long do_mremap(unsigned long ad if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) goto out; - if (addr & ~PAGE_MASK) + if (addr & ~MMUPAGE_MASK) goto out; - old_len = PAGE_ALIGN(old_len); - new_len = PAGE_ALIGN(new_len); + old_len = MMUPAGE_ALIGN(old_len); + new_len = MMUPAGE_ALIGN(new_len); /* new_addr is only valid if MREMAP_FIXED is specified */ if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) + if (new_addr & ~MMUPAGE_MASK) goto out; if (!(flags & MREMAP_MAYMOVE)) goto out; @@ -369,19 +369,19 @@ unsigned long do_mremap(unsigned long ad goto out; } if (vma->vm_flags & VM_LOCKED) { - unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + unsigned long locked = current->mm->locked_vm << MMUPAGE_SHIFT; locked += new_len - old_len; ret = -EAGAIN; if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) goto out; } ret = -ENOMEM; - if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + if ((current->mm->total_vm << MMUPAGE_SHIFT) + (new_len - old_len) > current->rlim[RLIMIT_AS].rlim_cur) goto out; if (vma->vm_flags & VM_ACCOUNT) { - charged = (new_len - old_len) >> PAGE_SHIFT; + charged = (new_len - old_len) >> MMUPAGE_SHIFT; if (security_vm_enough_memory(charged)) goto out_nc; } @@ -397,7 +397,7 @@ unsigned long do_mremap(unsigned long ad max_addr = vma->vm_next->vm_start; /* can we just expand the current mapping? 
*/ if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + int pages = (new_len - old_len) >> MMUPAGE_SHIFT; spin_lock(&vma->vm_mm->page_table_lock); vma->vm_end = addr + new_len; spin_unlock(&vma->vm_mm->page_table_lock); @@ -426,13 +426,13 @@ unsigned long do_mremap(unsigned long ad new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); ret = new_addr; - if (new_addr & ~PAGE_MASK) + if (new_addr & ~MMUPAGE_MASK) goto out; } ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: - if (ret & ~PAGE_MASK) + if (ret & ~MMUPAGE_MASK) vm_unacct_memory(charged); out_nc: return ret; diff -prauN linux-2.6.0-test11/mm/msync.c pgcl-2.6.0-test11-1/mm/msync.c --- linux-2.6.0-test11/mm/msync.c 2003-11-26 12:43:36.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/msync.c 2003-11-27 21:55:21.000000000 -0800 @@ -59,7 +59,7 @@ static int filemap_sync_pte_range(pmd_t error = 0; do { error |= filemap_sync_pte(pte, vma, address, flags); - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); @@ -174,12 +174,12 @@ asmlinkage long sys_msync(unsigned long down_read(¤t->mm->mmap_sem); if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; - if (start & ~PAGE_MASK) + if (start & ~MMUPAGE_MASK) goto out; if ((flags & MS_ASYNC) && (flags & MS_SYNC)) goto out; error = -ENOMEM; - len = (len + ~PAGE_MASK) & PAGE_MASK; + len = (len + ~MMUPAGE_MASK) & MMUPAGE_MASK; end = start + len; if (end < start) goto out; diff -prauN linux-2.6.0-test11/mm/page-writeback.c pgcl-2.6.0-test11-1/mm/page-writeback.c --- linux-2.6.0-test11/mm/page-writeback.c 2003-11-26 12:44:45.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/page-writeback.c 2003-11-27 21:55:21.000000000 -0800 @@ -392,8 +392,8 @@ static void set_ratelimit(void) ratelimit_pages = total_pages / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; + if (ratelimit_pages * PAGE_CACHE_SIZE > PAGE_SIZE * 1024) + ratelimit_pages = (PAGE_SIZE * 1024) / PAGE_CACHE_SIZE; } static int diff -prauN linux-2.6.0-test11/mm/page_alloc.c pgcl-2.6.0-test11-1/mm/page_alloc.c --- linux-2.6.0-test11/mm/page_alloc.c 2003-11-26 12:42:56.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/page_alloc.c 2003-11-27 21:55:21.000000000 -0800 @@ -61,7 +61,7 @@ int min_free_kbytes = 1024; */ static int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages*PAGE_MMUCOUNT) return 1; if (page_to_pfn(page) < zone->zone_start_pfn) return 1; @@ -72,21 +72,25 @@ static int bad_range(struct zone *zone, static void bad_page(const char *function, struct page *page) { - printk("Bad page state at %s\n", function); - printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", + printk("Bad page state for 0x%p (pfn=0x%lx) at %s\n", + page, page_to_pfn(page), function); + printk("flags:0x%08lx mapping:%p mapped:%d count:%d private:0x%lx\n", page->flags, page->mapping, - page_mapped(page), page_count(page)); + page_mapped(page), page_count(page), page->private); printk("Backtrace:\n"); dump_stack(); printk("Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_direct | + 1 << PG_chainlock | 1 << 
PG_writeback); set_page_count(page, 0); page->mapping = NULL; + page->pte.direct = (pte_addr_t)NULL; } #ifndef CONFIG_HUGETLB_PAGE @@ -209,20 +213,67 @@ static inline void __free_pages_bulk (st list_add(&(base + page_idx)->list, &area->free_list); } +/* + * I ruined this goddamn thing performancewise, but I desperately need + * more accessible debugging info. + */ static inline void free_pages_check(const char *function, struct page *page) { - if ( page_mapped(page) || - page->mapping != NULL || - page_count(page) != 0 || - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_writeback ))) + int bad = 0; + + if (unlikely(PageDirect(page))) { + printk("PageDirect(page)\n"); + bad = 1; + } + + if (unlikely(test_bit(PG_chainlock, &page->flags))) { + printk("pte_chain_lock(page) held!\n"); + bad = 1; + } + + if (unlikely(page_mapped(page))) { + printk("page_mapped(page)\n"); + bad = 1; + } + if (unlikely(page->mapping)) { + printk("page->mapping == %p != NULL\n", page->mapping); + bad = 1; + } + if (unlikely(page_count(page))) { + printk("page->count == %d != 0\n", page_count(page)); + bad = 1; + } + if (unlikely(PageLRU(page))) { + printk("PageLRU(page)\n"); + bad = 1; + } + if (unlikely(page->private)) { + printk("PagePrivate(page)\n"); + bad = 1; + } + if (unlikely(PageLocked(page))) { + printk("PageLocked(page)\n"); + bad = 1; + } + if (unlikely(PageActive(page))) { + printk("PageActive(page)\n"); + bad = 1; + } + if (unlikely(PageReclaim(page))) { + printk("PageReclaim(page)\n"); + bad = 1; + } + if (unlikely(PageSlab(page))) { + printk("PageSlab(page)\n"); + bad = 1; + } + if (unlikely(PageWriteback(page))) { + printk("PageWriteback(page)\n"); + bad = 1; + } + if (unlikely(bad)) bad_page(function, page); + if (PageDirty(page)) ClearPageDirty(page); } @@ -238,7 +289,7 @@ static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int +int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { @@ -318,20 +369,24 @@ static inline void set_page_refs(struct */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->mapping || page_mapped(page) || PagePrivate(page) || (page->flags & ( - 1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_direct | + 1 << PG_chainlock | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); + page->flags &= ~( + 1<< PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_direct | 1 << PG_chainlock | + 1 << PG_checked | 1 << PG_mappedtodisk); page->private = 0; set_page_refs(page, order); } @@ -1217,9 +1272,9 @@ void __init memmap_init_zone(struct page #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. 
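 * With page clustering a pfn counts MMUPAGE_SIZE frames while mem_map
 * keeps one struct page per PAGE_SIZE, so adjacent struct pages are
 * PAGE_MMUCOUNT pfns apart: start_pfn steps by PAGE_MMUCOUNT below,
 * and bad_range() above scales spanned_pages by PAGE_MMUCOUNT for the
 * same reason.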
*/ if (zone != ZONE_HIGHMEM) - set_page_address(page, __va(start_pfn << PAGE_SHIFT)); + set_page_address(page, __va(start_pfn << MMUPAGE_SHIFT)); #endif - start_pfn++; + start_pfn += PAGE_MMUCOUNT; } } @@ -1238,7 +1293,7 @@ static void __init free_area_init_core(s unsigned long *zones_size, unsigned long *zholes_size) { unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-PAGE_MMUSHIFT-1); int cpu, nid = pgdat->node_id; struct page *lmem_map = pgdat->node_mem_map; unsigned long zone_start_pfn = pgdat->node_start_pfn; @@ -1296,7 +1351,7 @@ static void __init free_area_init_core(s INIT_LIST_HEAD(&pcp->list); } printk(" %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_names[j], realsize*PAGE_MMUCOUNT, batch); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); atomic_set(&zone->refill_counter, 0); @@ -1329,7 +1384,7 @@ static void __init free_area_init_core(s memmap_init(lmem_map, size, nid, j, zone_start_pfn); - zone_start_pfn += size; + zone_start_pfn += PAGE_MMUCOUNT*size; lmem_map += size; for (i = 0; ; i++) { @@ -1402,7 +1457,7 @@ EXPORT_SYMBOL(contig_page_data); void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, &contig_page_data, NULL, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + __pa(PAGE_OFFSET) >> MMUPAGE_SHIFT, NULL); mem_map = contig_page_data.node_mem_map; } #endif @@ -1517,6 +1572,14 @@ static void *vmstat_start(struct seq_fil if (!ps) return ERR_PTR(-ENOMEM); get_full_page_state(ps); + if (PAGE_MMUCOUNT > 1) { + ps->nr_dirty *= PAGE_MMUCOUNT; + ps->nr_writeback *= PAGE_MMUCOUNT; + ps->nr_unstable *= PAGE_MMUCOUNT; + ps->nr_page_table_pages *= PAGE_MMUCOUNT; + ps->nr_mapped *= PAGE_MMUCOUNT; + ps->nr_slab *= PAGE_MMUCOUNT; + } ps->pgpgin /= 2; /* sectors -> kbytes */ ps->pgpgout /= 2; return (unsigned long *)ps + *pos; diff -prauN linux-2.6.0-test11/mm/page_io.c pgcl-2.6.0-test11-1/mm/page_io.c --- linux-2.6.0-test11/mm/page_io.c 2003-11-26 12:43:05.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/page_io.c 2003-11-27 21:55:21.000000000 -0800 @@ -32,7 +32,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -103,7 +103,7 @@ int swap_writepage(struct page *page, st ret = -ENOMEM; goto out; } - inc_page_state(pswpout); + mod_page_state(pswpout, PAGE_MMUCOUNT); SetPageWriteback(page); unlock_page(page); submit_bio(WRITE, bio); @@ -124,7 +124,7 @@ int swap_readpage(struct file *file, str ret = -ENOMEM; goto out; } - inc_page_state(pswpin); + mod_page_state(pswpin, PAGE_MMUCOUNT); submit_bio(READ, bio); out: return ret; @@ -152,7 +152,7 @@ int rw_swap_page_sync(int rw, swp_entry_ BUG_ON(page->mapping); page->mapping = &swapper_space; - page->index = entry.val; + page->index = entry.val/PAGE_MMUCOUNT; if (rw == READ) { ret = swap_readpage(NULL, page); diff -prauN linux-2.6.0-test11/mm/rmap.c pgcl-2.6.0-test11-1/mm/rmap.c --- linux-2.6.0-test11/mm/rmap.c 2003-11-26 12:45:36.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/rmap.c 2003-11-27 21:55:21.000000000 -0800 @@ -37,6 +37,12 @@ /* #define DEBUG_RMAP */ +#ifdef DEBUG_RMAP +#define RMAP_BUG_ON(p) BUG_ON(p) +#else +#define RMAP_BUG_ON(p) do { } while (0) +#endif + /* * Shared pages have a chain of pte_chain structures, used to locate * all the 
mappings to this page. We only need a pointer to the pte @@ -45,8 +51,37 @@ * * We use an array of pte pointers in this structure to minimise cache misses * while traversing reverse maps. + * + * What we want here is + * NRPTE = (N*L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t) + * where N is the least such that NRPTE >= PAGE_MMUCOUNT. + * This looks hairier than it truly is. + * + * Suppose we know that: + * (N*L1_CACHE_BYTES - sizeof(long))/sizeof(pte_addr_t) >= PAGE_MMUCOUNT + * We then have + * N*L1_CACHE_BYTES - sizeof(long) >= PAGE_MMUCOUNT*sizeof(pte_addr_t) + * and + * N*L1_CACHE_BYTES >= PAGE_MMUCOUNT*sizeof(pte_addr_t) + sizeof(long) + * and in turn + * N >= (PAGE_MMUCOUNT*sizeof(pte_addr_t)+sizeof(long)+L1_CACHE_BYTES-1) + * /L1_CACHE_BYTES + * + * In order for the pte_chain_next_and_idx() etc. algorithms to work, + * NRPTE must be of the form 2**N - 1. */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) + +#if 0 +#define __NL1CB__ (PAGE_MMUCOUNT*sizeof(pte_addr_t) + sizeof(long)) +#define __NL1CL__ ((__NL1CB__ + L1_CACHE_BYTES - 1)/L1_CACHE_BYTES) +#define _NL1CL_ (__NL1CL__ > 0 ? __NL1CL__ : 1) +#define NRPTE ((_NL1CL_*L1_CACHE_BYTES - sizeof(long))/sizeof(pte_addr_t)) +#else +#define _NRPTE0 (2*PAGE_MMUCOUNT*sizeof(pte_addr_t)) +#define _NRPTE1 (L1_CACHE_BYTES) +#define _NRPTE2 (_NRPTE0 > _NRPTE1 ? _NRPTE0 : _NRPTE1) +#define NRPTE ((_NRPTE2 - sizeof(long))/sizeof(pte_addr_t)) +#endif /* * next_and_idx encodes both the address of the next pte_chain and the @@ -101,6 +136,78 @@ pte_chain_encode(struct pte_chain *pte_c ** VM stuff below this comment **/ +#ifdef DEBUG_RMAP +/** + * page_truly_private - check if this mm is the only one mapping page + * @page: the page to check + * @mm: the mm to check + * + * This finds whether the page is private to the mm. + */ +int page_truly_private(struct page *page, struct mm_struct *mm) +{ + int ret = 0; + + pte_chain_lock(page); + if (PageDirect(page) && page->pte.direct) { + ret = mm == pte_paddr_to_mm(page->pte.direct); + goto out; + } else { + struct pte_chain *chain = page->pte.chain; + while (chain) { + int k; + + for (k = NRPTE - 1; k >= 0; --k) { + if (!chain->ptes[k]) + break; + if (pte_paddr_to_mm(chain->ptes[k]) != mm) + goto out; + } + chain = pte_chain_next(chain); + } + } + ret = 1; +out: pte_chain_unlock(page); + return ret; +} + +/** + * page_count_expected - find the expected refcount + * @page: the page to check + * + * This finds the expected reference count from page attributes. 
+ */ +int page_count_expected(struct page *page) +{ + int count = 0; + + pte_chain_lock(page); + if (PageDirect(page) && page->pte.direct) + ++count; + else { + struct pte_chain *chain = page->pte.chain; + + while (chain) { + int k; + + for (k = NRPTE - 1; k >= 0; --k) { + if (!chain->ptes[k]) + break; + else + ++count; + } + chain = pte_chain_next(chain); + } + } + if (page->mapping) + ++count; + if (PagePrivate(page)) + ++count; + pte_chain_unlock(page); + return count; +} +#endif + /** * page_referenced - test if the page was referenced * @page: the page to test @@ -156,6 +263,24 @@ int page_referenced(struct page * page) return referenced; } +static inline int check_pte_paddr_present(struct page *page, pte_addr_t pte) +{ + struct pte_chain *chain; + + if (PageDirect(page)) + return page->pte.direct == pte; + + for (chain = page->pte.chain; chain; chain = pte_chain_next(chain)) { + int k; + for (k = pte_chain_idx(chain); k < NRPTE; ++k) { + if (chain->ptes[k] == pte) + return 1; + } + } + + return 0; +} + /** * page_add_rmap - add reverse mapping entry to a page * @page: the page to add the mapping to @@ -164,16 +289,15 @@ int page_referenced(struct page * page) * Add a new pte reverse mapping to a page. * The caller needs to hold the mm->page_table_lock. */ -struct pte_chain * -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) +static struct pte_chain * +RMAP_FASTCALL(__page_add_rmap(struct page *, pte_addr_t, struct pte_chain *)); + +static struct pte_chain * +__page_add_rmap(struct page *page, pte_addr_t pte_paddr, struct pte_chain *pte_chain) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); struct pte_chain *cur_pte_chain; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return pte_chain; - - pte_chain_lock(page); + RMAP_BUG_ON(check_pte_paddr_present(page, pte_paddr)); if (page->pte.direct == 0) { page->pte.direct = pte_paddr; @@ -185,6 +309,7 @@ page_add_rmap(struct page *page, pte_t * if (PageDirect(page)) { /* Convert a direct pointer into a pte_chain */ ClearPageDirect(page); + BUG_ON(!pte_chain); pte_chain->ptes[NRPTE-1] = page->pte.direct; pte_chain->ptes[NRPTE-2] = pte_paddr; pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); @@ -195,6 +320,7 @@ page_add_rmap(struct page *page, pte_t * } cur_pte_chain = page->pte.chain; + BUG_ON(!cur_pte_chain); if (cur_pte_chain->ptes[0]) { /* It's full */ pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, NRPTE - 1); @@ -206,6 +332,50 @@ page_add_rmap(struct page *page, pte_t * cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; cur_pte_chain->next_and_idx--; out: + return pte_chain; +} + +struct pte_chain *page_add_rmap(struct page *page, + pte_t *ptep, + struct pte_chain *pte_chain) +{ + pte_addr_t pte_paddr = ptep_to_paddr(ptep); + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return pte_chain; + + pte_chain_lock(page); + pte_chain = __page_add_rmap(page, pte_paddr, pte_chain); + pte_chain_unlock(page); + return pte_chain; +} + +/* + * Ultralame. The whole interaction with rmap needs rewriting anyway + * in order to reap an expected O(PAGE_MMUCOUNT) overhead reduction. 
+ */ +struct pte_chain *rmap_add_folio(struct page *page, + pte_addr_t folio[], + struct pte_chain *pte_chain) +{ + int k; + + BUG_ON(!page); + BUG_ON(!pte_chain); + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return pte_chain; + + pte_chain_lock(page); + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (!folio[k]) + continue; + /* + * I'd like to BUG_ON(!pte_chain) here, but we can + * consume the goddamn thing in __page_add_rmap() + * while still being able to accomplish insertions. + */ + pte_chain = __page_add_rmap(page, folio[k], pte_chain); + } pte_chain_unlock(page); return pte_chain; } @@ -220,18 +390,12 @@ out: * the page. * Caller needs to hold the mm->page_table_lock. */ -void page_remove_rmap(struct page *page, pte_t *ptep) +static void RMAP_FASTCALL(__page_remove_rmap(struct page *, pte_addr_t)); +static void __page_remove_rmap(struct page *page, pte_addr_t pte_paddr) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); struct pte_chain *pc; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; - - pte_chain_lock(page); - - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + RMAP_BUG_ON(!check_pte_paddr_present(page, pte_paddr)); if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { @@ -241,9 +405,10 @@ void page_remove_rmap(struct page *page, } } else { struct pte_chain *start = page->pte.chain; - struct pte_chain *next; + struct pte_chain *next, *orig_start; int victim_i = -1; + orig_start = start; for (pc = start; pc; pc = next) { int i; @@ -271,11 +436,46 @@ void page_remove_rmap(struct page *page, } } out: + return; +} + +void page_remove_rmap(struct page *page, pte_t *ptep) +{ + pte_addr_t pte_paddr; + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + pte_paddr = ptep_to_paddr(ptep); + + pte_chain_lock(page); + __page_remove_rmap(page, pte_paddr); + if (!page_mapped(page)) + dec_page_state(nr_mapped); + pte_chain_unlock(page); +} + +void rmap_remove_folio(struct page *page, pte_addr_t folio[]) +{ + int k; + + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + pte_chain_lock(page); + + if (!page_mapped(page)) + goto out_unlock; + + for (k = 0; k < PAGE_MMUCOUNT; ++k) { + if (folio[k]) + __page_remove_rmap(page, folio[k]); + } + if (!page_mapped(page)) dec_page_state(nr_mapped); out_unlock: pte_chain_unlock(page); - return; } /** @@ -291,7 +491,7 @@ out_unlock: * pte_chain_lock shrink_list() * mm->page_table_lock try_to_unmap_one(), trylock */ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); +static int RMAP_FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); static int try_to_unmap_one(struct page * page, pte_addr_t paddr) { pte_t *ptep = rmap_ptep_map(paddr); @@ -337,7 +537,8 @@ static int try_to_unmap_one(struct page * Store the swap location in the pte. * See handle_pte_fault() ... */ - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->index*PAGE_CACHE_MMUCOUNT + + (pte_pfn(pte)%PAGE_CACHE_MMUCOUNT) }; swap_duplicate(entry); set_pte(ptep, swp_entry_to_pte(entry)); BUG_ON(pte_file(*ptep)); @@ -347,11 +548,12 @@ static int try_to_unmap_one(struct page * If a nonlinear mapping then store the file page offset * in the pte. 
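 * The offset stored is in MMUPAGE_SIZE units: the page cache index
 * scaled by PAGE_CACHE_MMUCOUNT plus the sub-page picked out by the
 * pte's pfn, mirroring the swap entry layout used just above.  As a
 * worked example, with PAGE_CACHE_MMUCOUNT of 4 the third MMU-sized
 * piece of cache index 10 is encoded as 4*10 + 2 = 42.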
*/ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; - pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); + pgidx = (address - vma->vm_start)/MMUPAGE_SIZE + vma->vm_pgoff; + if (page->index != pgidx/PAGE_CACHE_MMUCOUNT) { + unsigned long f_pgoff; + f_pgoff = PAGE_CACHE_MMUCOUNT*page->index + + (pte_pfn(pte) % PAGE_CACHE_MMUCOUNT); + set_pte(ptep, pgoff_to_pte(f_pgoff)); BUG_ON(!pte_file(*ptep)); } } @@ -484,6 +686,7 @@ void __pte_chain_free(struct pte_chain * pte_chain->next_and_idx = 0; if (*pte_chainp) kmem_cache_free(pte_chain_cache, *pte_chainp); + *pte_chainp = pte_chain; put_cpu_var(local_pte_chain); } @@ -514,6 +717,7 @@ struct pte_chain *pte_chain_alloc(int gf put_cpu_var(local_pte_chain); ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); } + return ret; } diff -prauN linux-2.6.0-test11/mm/shmem.c pgcl-2.6.0-test11-1/mm/shmem.c --- linux-2.6.0-test11/mm/shmem.c 2003-11-26 12:43:41.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/shmem.c 2003-11-27 21:55:21.000000000 -0800 @@ -50,7 +50,7 @@ #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) -#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) +#define VM_ACCT(size) (MMUPAGE_ALIGN(size)/MMUPAGE_SIZE) /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ #define SHMEM_PAGEIN VM_READ @@ -78,14 +78,14 @@ static inline struct page *shmem_dir_all /* * The above definition of ENTRIES_PER_PAGE, and the use of * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: - * might be reconsidered if it ever diverges from PAGE_SIZE. + * might be reconsidered if it ever diverges from MMUPAGE_SIZE. */ - return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); + return alloc_pages(gfp_mask, PAGE_CACHE_MMUSHIFT); } static inline void shmem_dir_free(struct page *page) { - __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); + __free_pages(page, PAGE_CACHE_MMUSHIFT); } static struct page **shmem_dir_map(struct page *page) @@ -283,7 +283,7 @@ static void shmem_swp_set(struct shmem_i entry->val = value; info->swapped += incdec; if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) - kmap_atomic_to_page(entry)->nr_swapped += incdec; + pfn_to_page(kmap_atomic_to_pfn(entry))->nr_swapped += incdec; } /* @@ -389,7 +389,7 @@ static void shmem_truncate(struct inode int freed; inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + idx = (inode->i_size + PAGE_CACHE_SIZE - 1)/PAGE_CACHE_SIZE; if (idx >= info->next_index) return; @@ -519,7 +519,7 @@ static int shmem_notify_change(struct de long change = 0; int error; - if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size <= SHMEM_MAX_BYTES)) { + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size <= SHMEM_MAX_BYTES) { /* * Account swap file usage based on new file size, * but just let vmtruncate fail on out-of-range sizes. @@ -537,9 +537,9 @@ static int shmem_notify_change(struct de * truncate_partial_page cannnot miss it were * it assigned to swap. 
*/ - if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + if (attr->ia_size % PAGE_CACHE_SIZE) { (void) shmem_getpage(inode, - attr->ia_size>>PAGE_CACHE_SHIFT, + attr->ia_size/PAGE_CACHE_SIZE, &page, SGP_READ); } /* @@ -973,16 +973,14 @@ struct page *shmem_nopage(struct vm_area { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = NULL; - unsigned long idx; + unsigned long pgoff; int error; - idx = (address - vma->vm_start) >> PAGE_SHIFT; - idx += vma->vm_pgoff; - idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + pgoff = (address - vma->vm_start)/MMUPAGE_SIZE + vma->vm_pgoff; - error = shmem_getpage(inode, idx, &page, SGP_CACHE); + error = shmem_getpage(inode, pgoff/PAGE_CACHE_MMUCOUNT, &page, SGP_CACHE); if (error) - return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; + return error == -ENOMEM ? NOPAGE_OOM : NOPAGE_SIGBUS; mark_page_accessed(page); return page; @@ -997,8 +995,8 @@ static int shmem_populate(struct vm_area enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; unsigned long size; - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size) + size = (i_size_read(inode) + MMUPAGE_SIZE - 1) >> MMUPAGE_SHIFT; + if (pgoff >= size || pgoff + len/MMUPAGE_SIZE > size) return -EINVAL; while ((long) len > 0) { @@ -1007,12 +1005,12 @@ static int shmem_populate(struct vm_area /* * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE */ - err = shmem_getpage(inode, pgoff, &page, sgp); + err = shmem_getpage(inode, pgoff/PAGE_CACHE_MMUCOUNT, &page, sgp); if (err) return err; if (page) { mark_page_accessed(page); - err = install_page(mm, vma, addr, page, prot); + err = install_page(mm, vma, addr, page, prot, pgoff % PAGE_CACHE_MMUCOUNT); if (err) { page_cache_release(page); return err; @@ -1023,9 +1021,8 @@ static int shmem_populate(struct vm_area * offset in the pte. */ unsigned long pgidx; - pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx = (addr - vma->vm_start) >> MMUPAGE_SHIFT; pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; if (pgoff != pgidx) { err = install_file_pte(mm, vma, addr, pgoff, prot); if (err) @@ -1033,8 +1030,8 @@ static int shmem_populate(struct vm_area } } - len -= PAGE_SIZE; - addr += PAGE_SIZE; + len -= MMUPAGE_SIZE; + addr += MMUPAGE_SIZE; pgoff++; } return 0; @@ -1202,8 +1199,8 @@ shmem_file_write(struct file *file, cons char *kaddr; int left; - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; + offset = pos % PAGE_CACHE_SIZE; /* Within page */ + index = pos/PAGE_CACHE_SIZE; bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; diff -prauN linux-2.6.0-test11/mm/slab.c pgcl-2.6.0-test11-1/mm/slab.c --- linux-2.6.0-test11/mm/slab.c 2003-11-26 12:45:08.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/slab.c 2003-11-27 21:55:21.000000000 -0800 @@ -666,7 +666,7 @@ void __init kmem_cache_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on machines with more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (num_physpages > (32 << 20) >> MMUPAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; @@ -1123,7 +1123,7 @@ kmem_cache_create (const char *name, siz align = L1_CACHE_BYTES; /* Determine if the slab management is 'on' or 'off' slab. */ - if (size >= (PAGE_SIZE>>3)) + if (size >= PAGE_SIZE/8 || ((flags & SLAB_MUST_HWCACHE_ALIGN) && size >= MMUPAGE_SIZE)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). 
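The rmap and swapfile changes that follow all lean on the same sub-page
arithmetic: a pte or a swap entry names one MMUPAGE_SIZE piece of a
PAGE_SIZE page, selected by an index taken modulo PAGE_MMUCOUNT.  A
minimal sketch of that arithmetic follows, for orientation only; the
helper names are invented here, and the patch itself open-codes these
expressions (see e.g. unuse_pte() and try_to_unmap_one() below).

/* Hypothetical helpers, not part of the patch. */
static inline unsigned long mmupage_pfn(struct page *page, unsigned long sub)
{
	/* pfn of one MMUPAGE_SIZE piece of a PAGE_SIZE page */
	return page_to_pfn(page) + (sub % PAGE_MMUCOUNT);
}

static inline unsigned long swp_subpage(swp_entry_t entry)
{
	/* which MMUPAGE_SIZE piece of the swapped page this entry names */
	return swp_offset(entry) % PAGE_MMUCOUNT;
}
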
diff -prauN linux-2.6.0-test11/mm/swap.c pgcl-2.6.0-test11-1/mm/swap.c --- linux-2.6.0-test11/mm/swap.c 2003-11-26 12:43:37.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/swap.c 2003-11-27 21:55:21.000000000 -0800 @@ -407,7 +407,7 @@ EXPORT_SYMBOL(percpu_counter_mod); */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = num_physpages >> (20 - MMUPAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff -prauN linux-2.6.0-test11/mm/swap_state.c pgcl-2.6.0-test11-1/mm/swap_state.c --- linux-2.6.0-test11/mm/swap_state.c 2003-11-26 12:43:48.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/swap_state.c 2003-11-27 21:55:21.000000000 -0800 @@ -69,7 +69,7 @@ static int add_to_swap_cache(struct page INC_CACHE_INFO(noent_race); return -ENOENT; } - error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + error = add_to_page_cache(page, &swapper_space, entry.val/PAGE_MMUCOUNT, GFP_KERNEL); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. */ @@ -141,7 +141,7 @@ int add_to_swap(struct page * page) * Add it to the swap cache and mark it dirty */ err = add_to_page_cache(page, &swapper_space, - entry.val, GFP_ATOMIC); + entry.val/PAGE_MMUCOUNT, GFP_ATOMIC); if (pf_flags & PF_MEMALLOC) current->flags |= PF_MEMALLOC; @@ -180,7 +180,7 @@ void delete_from_swap_cache(struct page BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; spin_lock(&swapper_space.page_lock); __delete_from_swap_cache(page); @@ -198,10 +198,10 @@ int move_to_swap_cache(struct page *page spin_lock(&swapper_space.page_lock); spin_lock(&mapping->page_lock); - err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + err = radix_tree_insert(&swapper_space.page_tree, entry.val/PAGE_MMUCOUNT, page); if (!err) { __remove_from_page_cache(page); - ___add_to_page_cache(page, &swapper_space, entry.val); + ___add_to_page_cache(page, &swapper_space, entry.val/PAGE_MMUCOUNT); } spin_unlock(&mapping->page_lock); @@ -229,7 +229,7 @@ int move_from_swap_cache(struct page *pa BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; spin_lock(&swapper_space.page_lock); spin_lock(&mapping->page_lock); @@ -312,7 +312,7 @@ struct page * lookup_swap_cache(swp_entr { struct page *found; - found = find_get_page(&swapper_space, entry.val); + found = find_get_page(&swapper_space, entry.val/PAGE_MMUCOUNT); /* * Unsafe to assert PageSwapCache and mapping on page found: * if SMP nothing prevents swapoff from deleting this page from @@ -343,7 +343,7 @@ struct page * read_swap_cache_async(swp_ * that would confuse statistics: use find_get_page() * directly. 
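 * Note the index below: swap entries count MMUPAGE_SIZE units while
 * the swap address space, like the rest of the page cache, is indexed
 * by PAGE_SIZE pages, so lookups divide entry.val by PAGE_MMUCOUNT and
 * page->index is multiplied back by PAGE_MMUCOUNT wherever an entry is
 * rebuilt from a page (see delete_from_swap_cache() above).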
*/ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(&swapper_space, entry.val/PAGE_MMUCOUNT); if (found_page) break; diff -prauN linux-2.6.0-test11/mm/swapfile.c pgcl-2.6.0-test11-1/mm/swapfile.c --- linux-2.6.0-test11/mm/swapfile.c 2003-11-26 12:43:24.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/swapfile.c 2003-11-27 21:55:21.000000000 -0800 @@ -3,6 +3,7 @@ * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie + * Page clustering swap adjustments Nov 2003, William Irwin */ #include @@ -46,6 +47,25 @@ struct swap_info_struct swap_info[MAX_SW #define SWAPFILE_CLUSTER 256 +/* + * returns offset into ->swap_map[] array, each entry of which + * tracks PAGE_SIZE (not MMUPAGE_SIZE). + * Here comes the pain. Every usage of swp_offset() needs to be + * watched like a hawk. swp_offset(entry) % PAGE_MMUCOUNT tracks the + * MMUPAGE_SIZE -sized subblock within the PAGE_SIZE region swapped + * and is largely for the benefit of ptes, so that unaligned swapped + * areas can have their proper pieces of pages recovered from ptes. + * More advanced implementations may utilize rmap information to + * accomplish private_folio_page() and the like, at which time the + * swap layer will have no means of recovering sub-block information + * from virtual addresses. This is a serious concern, as incremental + * methods that would scatter pieces of a page to the four winds are + * required for scaling to very large values of PAGE_SIZE. + * + * The upshot is that indexing wildly off of ->swap_map[] without + * scaling the results of swp_offset() will hurt. Badly. I've seen bad + * swp_type() results here too; I may be in trouble. + */ static inline int scan_swap_map(struct swap_info_struct *si) { unsigned long offset; @@ -132,7 +152,7 @@ swp_entry_t get_swap_page(void) offset = scan_swap_map(p); swap_device_unlock(p); if (offset) { - entry = swp_entry(type,offset); + entry = swp_entry(type, offset*PAGE_MMUCOUNT); type = swap_info[type].next; if (type < 0 || p->prio != swap_info[type].prio) { @@ -160,21 +180,29 @@ out: static struct swap_info_struct * swap_info_get(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p = NULL; unsigned long offset, type; if (!entry.val) goto out; type = swp_type(entry); - if (type >= nr_swapfiles) + if (type >= nr_swapfiles) { + printk(KERN_ERR "bad type %lu beyond nr_swapfiles %u " + "in swap_info_get()\n", type, nr_swapfiles); goto bad_nofile; - p = & swap_info[type]; + } + p = &swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); - if (offset >= p->max) + + /* + * offset returned by swp_offset() is in MMUPAGE_SIZE units, + * p->max is in PAGE_SIZE units + */ + if (offset >= p->max*PAGE_MMUCOUNT) goto bad_offset; - if (!p->swap_map[offset]) + if (!p->swap_map[offset/PAGE_MMUCOUNT]) goto bad_free; swap_list_lock(); if (p->prio > swap_info[swap_list.next].prio) @@ -184,15 +212,75 @@ static struct swap_info_struct * swap_in bad_free: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); - goto out; + WARN_ON(1); + goto out_err; bad_offset: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); - goto out; + WARN_ON(1); + goto out_err; bad_device: printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); - goto out; + WARN_ON(1); + goto out_err; bad_nofile: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + WARN_ON(1); +out_err: + printk("swap_free: type = %x, offset = 0x%lx, max = 0x%lx\n", + 
swp_type(entry), + swp_offset(entry), + p ? p->max*PAGE_MMUCOUNT : 0); + +/* dump pagetables */ +#if 1 + { + struct mm_struct *mm = current->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long vaddr; + + if (!mm) { + /* we're dead here anyway, but... */ + printk(KERN_ERR "bug in free_swap_and_cache() " + "with no mm!\n"); + goto out_noscan; + } + + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += PGDIR_SIZE) { + pgd = pgd_offset(mm, vaddr); + printk(KERN_DEBUG "pgd for 0x%lx = 0x%Lx\n", + vaddr, (u64)pgd_val(*pgd)); + } + + if (PTRS_PER_PMD > 1) { + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += PMD_SIZE) { + pgd = pgd_offset(mm, vaddr); + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + pmd = pmd_offset(pgd, vaddr); + printk(KERN_DEBUG "pmd for 0x%lx = 0x%Lx\n", + vaddr, (u64)pmd_val(*pmd)); + } + } + + for (vaddr = 0; vaddr < TASK_SIZE; vaddr += MMUPAGE_SIZE) { + pgd = pgd_offset(mm, vaddr); + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + pmd = pmd_offset(pgd, vaddr); + if (pmd_none(*pmd) || !pmd_present(*pmd)) + continue; + pte = pte_offset_map_nested(pmd, vaddr); + if (!pte_none(*pte) && pte_present(*pte)) + printk(KERN_DEBUG "pte for 0x%lx = 0x%Lx\n", + vaddr, (u64)pte_val(*pte)); + pte_unmap_nested(pte); + } +out_noscan: + ; + } +#endif out: return NULL; } @@ -203,6 +291,9 @@ static void swap_info_put(struct swap_in swap_list_unlock(); } +/* + * offset is entry.val/PAGE_MMUCOUNT + */ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { int count = p->swap_map[offset]; @@ -231,10 +322,11 @@ void swap_free(swp_entry_t entry) struct swap_info_struct * p; p = swap_info_get(entry); - if (p) { - swap_entry_free(p, swp_offset(entry)); - swap_info_put(p); - } + if (!p) + return; + + swap_entry_free(p, swp_offset(entry)/PAGE_MMUCOUNT); + swap_info_put(p); } /* @@ -247,11 +339,11 @@ static int exclusive_swap_page(struct pa struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ - if (p->swap_map[swp_offset(entry)] == 1) { + if (p->swap_map[swp_offset(entry)/PAGE_MMUCOUNT] == 1) { /* Recheck the page count with the pagecache lock held.. */ spin_lock(&swapper_space.page_lock); if (page_count(page) - !!PagePrivate(page) == 2) @@ -315,14 +407,14 @@ int remove_exclusive_swap_page(struct pa if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->index*PAGE_MMUCOUNT; p = swap_info_get(entry); if (!p) return 0; /* Is the only swap cache user the cache itself? */ retval = 0; - if (p->swap_map[swp_offset(entry)] == 1) { + if (p->swap_map[swp_offset(entry)/PAGE_MMUCOUNT] == 1) { /* Recheck the page count with the pagecache lock held.. 
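 * (A count of 2 means the caller's reference plus the swap cache's
 * own; anything higher means a pte or in-flight I/O still holds the
 * page, so the swap entry must not be reclaimed yet.)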
*/ spin_lock(&swapper_space.page_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { @@ -353,8 +445,8 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)/PAGE_MMUCOUNT) == 1) + page = find_trylock_page(&swapper_space, entry.val/PAGE_MMUCOUNT); swap_info_put(p); } if (page) { @@ -387,9 +479,12 @@ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { + unsigned long pfn; + /* vma_suboffset() would be meaningless; these are anonymous */ + pfn = page_to_pfn(page) + (entry.val % PAGE_MMUCOUNT); vma->vm_mm->rss++; get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + set_pte(dir, pte_mkold(pfn_pte(pfn, vma->vm_page_prot))); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } @@ -427,7 +522,7 @@ static int unuse_pmd(struct vm_area_stru pte_unmap(pte); return 1; } - address += PAGE_SIZE; + address += MMUPAGE_SIZE; pte++; } while (address && (address < end)); pte_unmap(pte - 1); @@ -577,6 +672,9 @@ static int try_to_unuse(unsigned int typ * child immediately after parent. If we race with dup_mmap(), * we very much want to resolve parent before child, otherwise * we may miss some entries: using last mm would invert that. + * + * The whole of the preceding discussion is bogus now that + * physical scanning is in place. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -606,7 +704,7 @@ static int try_to_unuse(unsigned int typ * page and read the swap into it. */ swap_map = &si->swap_map[i]; - entry = swp_entry(type, i); + entry = swp_entry(type, i*PAGE_MMUCOUNT); page = read_swap_cache_async(entry); if (!page) { /* @@ -715,6 +813,10 @@ static int try_to_unuse(unsigned int typ * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. + * + * The whole of the preceding discussion is bogus given + * the new process capacities and there are probably + * resource leaks to fix up here. 
*/ if (*swap_map == SWAP_MAP_MAX) { swap_device_lock(si); @@ -791,6 +893,8 @@ sector_t map_swap_page(struct swap_info_ struct swap_extent *se = sis->curr_swap_extent; struct swap_extent *start_se = se; + offset /= PAGE_MMUCOUNT; + for ( ; ; ) { struct list_head *lh; @@ -997,7 +1101,7 @@ int page_queue_congested(struct page *pa bdi = page->mapping->backing_dev_info; if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->index*PAGE_MMUCOUNT }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); @@ -1365,20 +1469,20 @@ asmlinkage long sys_swapon(const char __ maxpages = swp_offset(swp_entry(0,~0UL)) - 1; if (maxpages > swap_header->info.last_page) maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; + p->highest_bit = maxpages/PAGE_MMUCOUNT - 1; error = -EINVAL; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + if (!(p->swap_map = vmalloc(maxpages/PAGE_MMUCOUNT * sizeof(short)))) { error = -ENOMEM; goto bad_swap; } error = 0; - memset(p->swap_map, 0, maxpages * sizeof(short)); + memset(p->swap_map, 0, maxpages/PAGE_MMUCOUNT * sizeof(short)); for (i=0; i<swap_header->info.nr_badpages; i++) { int page = swap_header->info.badpages[i]; if (page <= 0 || page >= swap_header->info.last_page) @@ -1386,14 +1490,14 @@ asmlinkage long sys_swapon(const char __ else p->swap_map[page] = SWAP_MAP_BAD; } - nr_good_pages = swap_header->info.last_page - + nr_good_pages = (swap_header->info.last_page - swap_header->info.nr_badpages - - 1 /* header page */; + 1)/PAGE_MMUCOUNT /* header page */; if (error) goto bad_swap; } - if (swapfilesize && maxpages > swapfilesize) { + if (swapfilesize && maxpages/PAGE_MMUCOUNT > swapfilesize) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); error = -EINVAL; @@ -1405,7 +1509,7 @@ asmlinkage long sys_swapon(const char __ goto bad_swap; } p->swap_map[0] = SWAP_MAP_BAD; - p->max = maxpages; + p->max = maxpages/PAGE_MMUCOUNT; p->pages = nr_good_pages; error = setup_swap_extents(p); @@ -1503,7 +1607,7 @@ int swap_duplicate(swp_entry_t entry) if (type >= nr_swapfiles) goto bad_file; p = type + swap_info; - offset = swp_offset(entry); + offset = swp_offset(entry)/PAGE_MMUCOUNT; swap_device_lock(p); if (offset < p->max && p->swap_map[offset]) { @@ -1523,6 +1627,7 @@ out: bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + BUG(); goto out; } @@ -1544,7 +1649,7 @@ int valid_swaphandles(swp_entry_t entry, if (!page_cluster) /* no readahead */ return 0; - toff = (swp_offset(entry) >> page_cluster) << page_cluster; + toff = (swp_offset(entry)/PAGE_MMUCOUNT) & ~((1UL << page_cluster)-1); if (!toff) /* first page is swap header */ toff++, i--; *offset = toff; @@ -1565,3 +1670,97 @@ int valid_swaphandles(swp_entry_t entry, swap_device_unlock(swapdev); return ret; } + +int swap_count(struct page *page) +{ + struct swap_info_struct * p; + unsigned long offset, type; + swp_entry_t entry; + int retval = 0; + + entry.val = page->index*PAGE_MMUCOUNT; + if (!entry.val) + goto bad_entry; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = swp_offset(entry); + if (offset >= p->max*PAGE_MMUCOUNT) + goto bad_offset; + if (!p->swap_map[offset/PAGE_MMUCOUNT]) + goto bad_unused; + retval = p->swap_map[offset/PAGE_MMUCOUNT]; +out: + return retval; +bad_entry: + printk(KERN_ERR "swap_count: 
null entry!\n"); + goto out; +bad_file: + printk("Bad swap file entry %08lx\n", entry.val); + goto out; +bad_offset: + printk("Bad swap offset entry %08lx\n", entry.val); + goto out; +bad_unused: + printk("Unused swap offset entry in swap_count %08lx\n", entry.val); + goto out; +} + +void __swap_free(swp_entry_t entry, unsigned short count) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = swp_offset(entry); + if (offset >= p->max*PAGE_MMUCOUNT) + goto bad_offset; + if (!p->swap_map[offset/PAGE_MMUCOUNT]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + if (p->swap_map[offset/PAGE_MMUCOUNT] < SWAP_MAP_MAX) { + if (p->swap_map[offset/PAGE_MMUCOUNT] < count) + goto bad_count; + if (!(p->swap_map[offset/PAGE_MMUCOUNT] -= count)) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + } + } + swap_device_unlock(p); + swap_list_unlock(); +out: + return; + +bad_nofile: + printk("swap_free: Trying to free nonexistent swap-page\n"); + goto out; +bad_device: + printk("swap_free: Trying to free swap from unused swap-device\n"); + goto out; +bad_offset: + printk("swap_free: offset exceeds max\n"); + goto out; +bad_free: + printk("VM: Bad swap entry %08lx\n", entry.val); + goto out; +bad_count: + swap_device_unlock(p); + swap_list_unlock(); + printk(KERN_ERR "VM: Bad count %hd current count %hd\n", count, p->swap_map[offset]); + goto out; +} diff -prauN linux-2.6.0-test11/mm/vmalloc.c pgcl-2.6.0-test11-1/mm/vmalloc.c --- linux-2.6.0-test11/mm/vmalloc.c 2003-11-26 12:44:23.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/vmalloc.c 2003-11-27 22:55:43.000000000 -0800 @@ -45,15 +45,12 @@ static void unmap_area_pte(pmd_t *pmd, u end = PMD_SIZE; do { - pte_t page; - page = ptep_get_and_clear(pte); - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - if (pte_present(page)) - continue; + if (pte_present(*pte)) + pte_clear(pte); + else if (!pte_none(*pte)) printk(KERN_CRIT "Whee.. 
Swapped out page in kernel page table\n"); + pte++; + address += MMUPAGE_SIZE; } while (address < end); } @@ -84,56 +81,6 @@ static void unmap_area_pmd(pgd_t *dir, u } while (address < end); } -static int map_area_pte(pte_t *pte, unsigned long address, - unsigned long size, pgprot_t prot, - struct page ***pages) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - - do { - struct page *page = **pages; - - WARN_ON(!pte_none(*pte)); - if (!page) - return -ENOMEM; - - set_pte(pte, mk_pte(page, prot)); - address += PAGE_SIZE; - pte++; - (*pages)++; - } while (address < end); - return 0; -} - -static int map_area_pmd(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t prot, - struct page ***pages) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - - do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); - if (!pte) - return -ENOMEM; - if (map_area_pte(pte, address, end - address, prot, pages)) - return -ENOMEM; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - - return 0; -} - void unmap_vm_area(struct vm_struct *area) { unsigned long address = (unsigned long) area->addr; @@ -150,30 +97,48 @@ void unmap_vm_area(struct vm_struct *are flush_tlb_kernel_range((unsigned long) area->addr, end); } +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long address = (unsigned long) area->addr; - unsigned long end = address + (area->size-PAGE_SIZE); - pgd_t *dir; + /* don't instantiate PTE's for the guard page */ + unsigned long end = address + area->size - MMUPAGE_SIZE; + unsigned long voffset = 0; + pgd_t *pgd; int err = 0; - dir = pgd_offset_k(address); + pgd = pgd_offset_k(address); spin_lock(&init_mm.page_table_lock); do { - pmd_t *pmd = pmd_alloc(&init_mm, dir, address); + pmd_t *pmd = pmd_alloc(&init_mm, pgd, address); if (!pmd) { err = -ENOMEM; - break; - } - if (map_area_pmd(pmd, address, end - address, prot, pages)) { - err = -ENOMEM; - break; + goto out; } - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - + do { + pte_t *pte = pte_alloc_kernel(&init_mm, pmd, address); + if (!pte) { + err = -ENOMEM; + goto out; + } + + do { + unsigned long pfn; + pfn = page_to_pfn((*pages)[voffset/PAGE_SIZE]); + pfn += (voffset/MMUPAGE_SIZE) % PAGE_MMUCOUNT; + set_pte(pte, pfn_pte(pfn, prot)); + ++pte; + address += MMUPAGE_SIZE; + voffset += MMUPAGE_SIZE; + } while (((unsigned long)pte & PTE_TABLE_MASK) && address < end); + ++pmd; + } while (((unsigned long)pmd & PMD_TABLE_MASK) && address < end); + /* presumably address could wrap to 0, but I doubt it */ + } while (address && address < end); +out: spin_unlock(&init_mm.page_table_lock); flush_cache_vmap((unsigned long) area->addr, end); return err; @@ -192,7 +157,7 @@ struct vm_struct *__get_vm_area(unsigned /* * We always allocate a guard page. 
*/ - size += PAGE_SIZE; + size += MMUPAGE_SIZE; if (unlikely(!size)) { kfree (area); return NULL; @@ -223,6 +188,9 @@ found: area->phys_addr = 0; write_unlock(&vmlist_lock); + printk("vmalloc, returning [0x%p, 0x%p)\n", + area->addr, ((char *)area->addr) + area->size); + return area; out: @@ -251,17 +219,20 @@ struct vm_struct *get_vm_area(unsigned l * * @addr: base address * - * Search for the kernel VM area starting at @addr, and remove it. + * Search for the kernel VM area containing @addr, and remove it. * This function returns the found VM area, but using it is NOT safe - * on SMP machines. + * on SMP machines; the final removal of an area must be serialized + * externally, and those who allocated the area own it. */ -struct vm_struct *remove_vm_area(void *addr) +struct vm_struct *remove_vm_area(void *__addr) { struct vm_struct **p, *tmp; + unsigned long addr = (unsigned long)__addr; write_lock(&vmlist_lock); - for (p = &vmlist ; (tmp = *p) ;p = &tmp->next) { - if (tmp->addr == addr) + for (p = &vmlist; (tmp = *p); p = &tmp->next) { + unsigned long tmp_addr = (unsigned long)tmp->addr; + if (addr >= tmp_addr && addr - tmp_addr < tmp->size) goto found; } write_unlock(&vmlist_lock); @@ -281,7 +252,7 @@ void __vunmap(void *addr, int deallocate if (!addr) return; - if ((PAGE_SIZE-1) & (unsigned long)addr) { + if ((MMUPAGE_SIZE-1) & (unsigned long)addr) { printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); return; } @@ -297,8 +268,7 @@ void __vunmap(void *addr, int deallocate int i; for (i = 0; i < area->nr_pages; i++) { - if (unlikely(!area->pages[i])) - BUG(); + BUG_ON(unlikely(!area->pages[i])); __free_page(area->pages[i]); } @@ -361,10 +331,10 @@ void *vmap(struct page **pages, unsigned { struct vm_struct *area; - if (count > num_physpages) + if (PAGE_MMUCOUNT*count > num_physpages) return NULL; - area = get_vm_area((count << PAGE_SHIFT), flags); + area = get_vm_area(PAGE_SIZE*count, flags); if (!area) return NULL; if (map_vm_area(area, prot, &pages)) { @@ -394,16 +364,16 @@ void *__vmalloc(unsigned long size, int struct page **pages; unsigned int nr_pages, array_size, i; - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + size = MMUPAGE_ALIGN(size); + if (!size || (size >> MMUPAGE_SHIFT) > num_physpages) return NULL; area = get_vm_area(size, VM_ALLOC); if (!area) return NULL; - nr_pages = size >> PAGE_SHIFT; - array_size = (nr_pages * sizeof(struct page *)); + nr_pages = PAGE_ALIGN(size)/PAGE_SIZE; + array_size = nr_pages * sizeof(struct page *); area->nr_pages = nr_pages; area->pages = pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); @@ -480,7 +450,7 @@ long vread(char *buf, char *addr, unsign read_lock(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size - MMUPAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -490,7 +460,7 @@ long vread(char *buf, char *addr, unsign addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + tmp->size - MMUPAGE_SIZE - addr; do { if (count == 0) goto finished; @@ -518,7 +488,7 @@ long vwrite(char *buf, char *addr, unsig read_lock(&vmlist_lock); for (tmp = vmlist; tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + if (addr >= vaddr + tmp->size - MMUPAGE_SIZE) continue; while (addr < vaddr) { if (count == 0) @@ -527,7 +497,7 @@ long vwrite(char *buf, char *addr, unsig addr++; count--; } - n = vaddr + tmp->size - 
PAGE_SIZE - addr; + n = vaddr + tmp->size - MMUPAGE_SIZE - addr; do { if (count == 0) goto finished; diff -prauN linux-2.6.0-test11/mm/vmscan.c pgcl-2.6.0-test11-1/mm/vmscan.c --- linux-2.6.0-test11/mm/vmscan.c 2003-11-26 12:43:06.000000000 -0800 +++ pgcl-2.6.0-test11-1/mm/vmscan.c 2003-11-27 21:55:21.000000000 -0800 @@ -433,7 +433,7 @@ shrink_list(struct list_head *page_list, #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->index }; + swp_entry_t swap = { .val = page->index*PAGE_MMUCOUNT }; __delete_from_swap_cache(page); spin_unlock(&mapping->page_lock); swap_free(swap); diff -prauN linux-2.6.0-test11/net/decnet/dn_route.c pgcl-2.6.0-test11-1/net/decnet/dn_route.c --- linux-2.6.0-test11/net/decnet/dn_route.c 2003-11-26 12:45:40.000000000 -0800 +++ pgcl-2.6.0-test11-1/net/decnet/dn_route.c 2003-11-27 21:55:21.000000000 -0800 @@ -1793,7 +1793,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = num_physpages >> (26 - PAGE_SHIFT); + goal = num_physpages >> (26 - MMUPAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff -prauN linux-2.6.0-test11/net/ipv4/netfilter/arp_tables.c pgcl-2.6.0-test11-1/net/ipv4/netfilter/arp_tables.c --- linux-2.6.0-test11/net/ipv4/netfilter/arp_tables.c 2003-11-26 12:45:10.000000000 -0800 +++ pgcl-2.6.0-test11-1/net/ipv4/netfilter/arp_tables.c 2003-11-27 21:55:21.000000000 -0800 @@ -880,7 +880,7 @@ static int do_replace(void *user, unsign return -ENOPROTOOPT; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + if ((SMP_ALIGN(tmp.size) >> MMUPAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; newinfo = vmalloc(sizeof(struct arpt_table_info) diff -prauN linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c pgcl-2.6.0-test11-1/net/ipv4/netfilter/ip_conntrack_core.c --- linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c 2003-11-26 12:42:40.000000000 -0800 +++ pgcl-2.6.0-test11-1/net/ipv4/netfilter/ip_conntrack_core.c 2003-11-27 21:55:21.000000000 -0800 @@ -1372,9 +1372,9 @@ int __init ip_conntrack_init(void) ip_conntrack_htable_size = hashsize; } else { ip_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) + = (((num_physpages << MMUPAGE_SHIFT) / 16384) / sizeof(struct list_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (num_physpages > (1024 * 1024 * 1024 / MMUPAGE_SIZE)) ip_conntrack_htable_size = 8192; if (ip_conntrack_htable_size < 16) ip_conntrack_htable_size = 16; diff -prauN linux-2.6.0-test11/net/ipv4/netfilter/ip_tables.c pgcl-2.6.0-test11-1/net/ipv4/netfilter/ip_tables.c --- linux-2.6.0-test11/net/ipv4/netfilter/ip_tables.c 2003-11-26 12:43:25.000000000 -0800 +++ pgcl-2.6.0-test11-1/net/ipv4/netfilter/ip_tables.c 2003-11-27 21:55:21.000000000 -0800 @@ -1059,7 +1059,7 @@ do_replace(void *user, unsigned int len) return -ENOPROTOOPT; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + if ((SMP_ALIGN(tmp.size) >> MMUPAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; newinfo = vmalloc(sizeof(struct ipt_table_info) diff -prauN linux-2.6.0-test11/net/ipv4/route.c pgcl-2.6.0-test11-1/net/ipv4/route.c --- linux-2.6.0-test11/net/ipv4/route.c 2003-11-26 12:45:53.000000000 -0800 +++ pgcl-2.6.0-test11-1/net/ipv4/route.c 2003-11-27 21:55:21.000000000 -0800 @@ -2745,7 +2745,7 @@ int __init ip_rt_init(void) if 
(!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); - goal = num_physpages >> (26 - PAGE_SHIFT); + goal = num_physpages >> (26 - MMUPAGE_SHIFT); for (order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff -prauN linux-2.6.0-test11/net/ipv4/tcp.c pgcl-2.6.0-test11-1/net/ipv4/tcp.c --- linux-2.6.0-test11/net/ipv4/tcp.c 2003-11-26 12:43:27.000000000 -0800 +++ pgcl-2.6.0-test11-1/net/ipv4/tcp.c 2003-11-27 21:55:21.000000000 -0800 @@ -2606,9 +2606,9 @@ void __init tcp_init(void) * The methodology is similar to that of the buffer cache. */ if (num_physpages >= (128 * 1024)) - goal = num_physpages >> (21 - PAGE_SHIFT); + goal = num_physpages >> (21 - MMUPAGE_SHIFT); else - goal = num_physpages >> (23 - PAGE_SHIFT); + goal = num_physpages >> (23 - MMUPAGE_SHIFT); for (order = 0; (1UL << order) < goal; order++) ; diff -prauN linux-2.6.0-test11/net/ipv6/netfilter/ip6_tables.c pgcl-2.6.0-test11-1/net/ipv6/netfilter/ip6_tables.c --- linux-2.6.0-test11/net/ipv6/netfilter/ip6_tables.c 2003-11-26 12:45:30.000000000 -0800 +++ pgcl-2.6.0-test11-1/net/ipv6/netfilter/ip6_tables.c 2003-11-27 21:55:21.000000000 -0800 @@ -1140,7 +1140,7 @@ do_replace(void *user, unsigned int len) return -EFAULT; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + if ((SMP_ALIGN(tmp.size) >> MMUPAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; newinfo = vmalloc(sizeof(struct ip6t_table_info) diff -prauN linux-2.6.0-test11/security/commoncap.c pgcl-2.6.0-test11-1/security/commoncap.c --- linux-2.6.0-test11/security/commoncap.c 2003-11-26 12:43:36.000000000 -0800 +++ pgcl-2.6.0-test11-1/security/commoncap.c 2003-11-27 22:26:32.000000000 -0800 @@ -312,6 +312,7 @@ int cap_vm_enough_memory(long pages) * cache and most inode caches should fall into this */ free += atomic_read(&slab_reclaim_pages); + free *= PAGE_MMUCOUNT; /* * Leave the last 3% for root @@ -327,6 +328,7 @@ int cap_vm_enough_memory(long pages) allowed = totalram_pages * sysctl_overcommit_ratio / 100; allowed += total_swap_pages; + allowed *= PAGE_MMUCOUNT; if (atomic_read(&vm_committed_space) < allowed) return 0; diff -prauN linux-2.6.0-test11/security/dummy.c pgcl-2.6.0-test11-1/security/dummy.c --- linux-2.6.0-test11/security/dummy.c 2003-11-26 12:43:24.000000000 -0800 +++ pgcl-2.6.0-test11-1/security/dummy.c 2003-11-27 22:26:08.000000000 -0800 @@ -124,6 +124,7 @@ static int dummy_vm_enough_memory(long p * cache and most inode caches should fall into this */ free += atomic_read(&slab_reclaim_pages); + free *= PAGE_MMUCOUNT; /* * Leave the last 3% for root @@ -139,6 +140,7 @@ static int dummy_vm_enough_memory(long p allowed = totalram_pages * sysctl_overcommit_ratio / 100; allowed += total_swap_pages; + allowed *= PAGE_MMUCOUNT; if (atomic_read(&vm_committed_space) < allowed) return 0; diff -prauN linux-2.6.0-test11/security/selinux/hooks.c pgcl-2.6.0-test11-1/security/selinux/hooks.c --- linux-2.6.0-test11/security/selinux/hooks.c 2003-11-26 12:45:38.000000000 -0800 +++ pgcl-2.6.0-test11-1/security/selinux/hooks.c 2003-11-27 22:25:31.000000000 -0800 @@ -1290,6 +1290,7 @@ static int selinux_vm_enough_memory(long * cache and most inode caches should fall into this */ free += atomic_read(&slab_reclaim_pages); + free *= PAGE_MMUCOUNT; /* * Leave the last 3% for privileged processes. 
@@ -1314,6 +1315,7 @@ static int selinux_vm_enough_memory(long allowed = totalram_pages * sysctl_overcommit_ratio / 100; allowed += total_swap_pages; + allowed *= PAGE_MMUCOUNT; if (atomic_read(&vm_committed_space) < allowed) return 0;
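
For reference, the offset convention used throughout the mm/swapfile.c changes can be summarized as follows. Under page clustering, swp_entry_t offsets stay in MMUPAGE_SIZE units (get_swap_page() multiplies the slot returned by scan_swap_map() by PAGE_MMUCOUNT), while ->swap_map[] keeps one entry per PAGE_SIZE block, so lookups divide swp_offset() by PAGE_MMUCOUNT and the remainder names the MMUPAGE-sized piece within the clustered page. The stand-alone sketch below only models that arithmetic; the constant values (MMUPAGE_SIZE of 4096, PAGE_MMUCOUNT of 8) and the example numbers are assumptions for illustration, not values taken from the patch.

/* Illustrative sketch only, not part of the patch. Assumes a 4KB MMU
 * page and PAGE_MMUCOUNT = 8 (32KB PAGE_SIZE); slot numbers invented. */
#include <stdio.h>

#define MMUPAGE_SIZE	4096UL
#define PAGE_MMUCOUNT	8UL

int main(void)
{
	unsigned long map_slot = 42;	/* index into ->swap_map[], PAGE_SIZE units */
	unsigned long swp_off;		/* what swp_offset() would report */

	/* get_swap_page(): entry = swp_entry(type, offset*PAGE_MMUCOUNT) */
	swp_off = map_slot * PAGE_MMUCOUNT;

	/* a pte may refer to the 5th MMU page within that clustered page */
	swp_off += 5;

	/* swap_free()/swap_duplicate(): recover the swap_map slot and sub-block */
	printf("swap_map index %lu, sub-block %lu\n",
	       swp_off / PAGE_MMUCOUNT, swp_off % PAGE_MMUCOUNT);
	return 0;
}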
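
The reworked map_vm_area() in mm/vmalloc.c relies on the same split: its pte loop advances in MMUPAGE_SIZE steps, selecting the backing struct page with voffset/PAGE_SIZE and the MMU page within it with (voffset/MMUPAGE_SIZE) % PAGE_MMUCOUNT. The following user-space sketch models only that index arithmetic; the pfn values and the page_pfn array are invented, and the constants are again assumptions.

/* Sketch of map_vm_area()'s voffset -> pfn arithmetic; values assumed. */
#include <stdio.h>

#define MMUPAGE_SIZE	4096UL
#define PAGE_MMUCOUNT	8UL
#define PAGE_SIZE	(MMUPAGE_SIZE * PAGE_MMUCOUNT)

int main(void)
{
	/* pretend page_to_pfn() results for two PAGE_SIZE pages backing the
	 * area; pfns of clustered pages are PAGE_MMUCOUNT-aligned */
	unsigned long page_pfn[2] = { 0x1000, 0x2040 };
	unsigned long voffset;

	for (voffset = 0; voffset < 2 * PAGE_SIZE; voffset += MMUPAGE_SIZE) {
		unsigned long pfn = page_pfn[voffset / PAGE_SIZE]
				  + (voffset / MMUPAGE_SIZE) % PAGE_MMUCOUNT;
		printf("voffset 0x%05lx maps pfn 0x%lx\n", voffset, pfn);
	}
	return 0;
}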
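
The security/*_vm_enough_memory() hunks scale both the reclaimable estimate and the overcommit allowance by PAGE_MMUCOUNT before the comparison, which suggests that vm_committed_space and the request are accounted in MMUPAGE-sized units under this patch. A rough arithmetic model with invented figures and the same assumed PAGE_MMUCOUNT:

/* Rough model of the overcommit check scaling; all numbers invented. */
#include <stdio.h>

#define PAGE_MMUCOUNT	8UL

int main(void)
{
	unsigned long totalram_pages = 262144;		/* PAGE_SIZE units */
	unsigned long total_swap_pages = 65536;		/* PAGE_SIZE units */
	unsigned long sysctl_overcommit_ratio = 50;	/* percent */
	unsigned long vm_committed_space = 900000;	/* MMUPAGE units */
	unsigned long allowed;

	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
	allowed += total_swap_pages;
	allowed *= PAGE_MMUCOUNT;	/* compare in MMUPAGE units */

	printf("allowed %lu, committed %lu: %s\n", allowed, vm_committed_space,
	       vm_committed_space < allowed ? "grant" : "refuse");
	return 0;
}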