From: Andi Kleen I ported the Athlon prefetch fix to 2.6.0test6mm2 now. I tested that it handles user space prefetch faults correctly. This is Jamie's "mutilated" version with segment/limit checking and some additional changes: - Handle the mmap_sem deadlock in the way Jamie proposed: just check the address for >= TASK_SIZE and if true never take the semaphore - Do the same for 4/4 (untested). On some thought 4/4 is actually easy because it should never do user exceptions from kernel space - its *_user functions do all call handle_mm_fault() directly. This means if an exception came from ring 0 it can be only a bug, a prefetch fault or a vmalloc fault. None of these require taking mmap_sem. I also fixed a bug in the process - the 4/4 code didn't handle lazy vmalloc SMP faults correctly IMHO. Note I didn't test if it works. - Port LDT checking to -mm*. This is a bit ugly. The 4/4 patch allows LDT pages to be in highmem and require a kmap. I put that code into #if 1. For submission to Linus all #if 1 code should be removed. - Added a printk for now just for testing (should be also removed, but doesn't harm right now) - Removed the #ifdefs. The code is now always compiled in. - Removed the eip==addr check. - Some other minor cleanup. arch/i386/mm/fault.c | 204 ++++++++++++++++++++++++++++++++++++++++--- include/asm-i386/processor.h | 6 - 2 files changed, 196 insertions(+), 14 deletions(-) diff -puN arch/i386/mm/fault.c~athlon-prefetch-handling arch/i386/mm/fault.c --- 25/arch/i386/mm/fault.c~athlon-prefetch-handling 2003-10-04 02:39:10.000000000 -0700 +++ 25-akpm/arch/i386/mm/fault.c 2003-10-04 02:39:10.000000000 -0700 @@ -19,6 +19,7 @@ #include #include #include /* For unblank_screen() */ +#include #include #include @@ -55,6 +56,161 @@ void bust_spinlocks(int yes) console_loglevel = loglevel_save; } +/* + * Return EIP plus the CS segment base. 
The segment limit is also + * adjusted, clamped to the kernel/user address space (whichever is + * appropriate), and returned in *eip_limit. + * + * The segment is checked, because it might have been changed by another + * task between the original faulting instruction and here. + * + * If CS is no longer a valid code segment, or if EIP is beyond the + * limit, or if it is a kernel address when CS is not a kernel segment, + * then the returned value will be greater than *eip_limit. + */ +static inline unsigned long get_segment_eip(struct pt_regs *regs, + unsigned long *eip_limit) +{ + unsigned long eip = regs->eip; + unsigned seg = regs->xcs & 0xffff; + u32 seg_ar, seg_limit, base, *desc; + + /* The standard kernel/user address space limit. */ + *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + + /* Unlikely, but must come before segment checks. */ + if (unlikely((regs->eflags & VM_MASK) != 0)) + return eip + (seg << 4); + + /* By far the commonest cases. */ + if (likely(seg == __USER_CS || seg == __KERNEL_CS)) + return eip; + + /* Check the segment exists, is within the current LDT/GDT size, + that kernel/user (ring 0..3) has the appropriate privilege, + that it's a code segment, and get the limit. */ + __asm__ ("larl %3,%0; lsll %3,%1" + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); + if ((~seg_ar & 0x9800) || eip > seg_limit) { + *eip_limit = 0; + return 1; /* So that returned eip > *eip_limit. */ + } + + /* Get the GDT/LDT descriptor base. + When you look for races in this code remember that + LDT and other horrors are only used in user space. */ + if (seg & (1<<2)) { + /* Must lock the LDT while reading it. */ + down(&current->mm->context.sem); +#if 1 + /* horrible hack for 4/4 disabled kernels. 
+ I'm not quite sure what the TLB flush is good for, + it's mindlessly copied from the read_ldt code */ + __flush_tlb_global(); + desc = kmap(current->mm->context.ldt_pages[(seg & ~7)/PAGE_SIZE]); + desc = (void *)desc + ((seg & ~7) % PAGE_SIZE); +#else + desc = current->mm->context.ldt; + desc = (void *)desc + (seg & ~7); +#endif + } else { + /* Must disable preemption while reading the GDT. */ + desc = (u32 *)&cpu_gdt_table[get_cpu()]; + desc = (void *)desc + (seg & ~7); + } + base = (desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000); + if (seg & (1<<2)) { +#if 1 + kunmap((void *)((unsigned long)desc & PAGE_MASK)); +#endif + up(&current->mm->context.sem); + } else + put_cpu(); + + /* Adjust EIP and segment limit, and clamp at the kernel limit. + It's legitimate for segments to wrap at 0xffffffff. */ + seg_limit += base; + if (seg_limit < *eip_limit && seg_limit >= base) + *eip_limit = seg_limit; + return eip + base; +} + +/* + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + */ +static int __is_prefetch(struct pt_regs *regs, unsigned long addr) +{ + unsigned long limit; + unsigned long instr = get_segment_eip (regs, &limit); + int scan_more = 1; + int prefetch = 0; + int i; + + for (i = 0; scan_more && i < 15; i++) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (instr > limit) + break; + if (__get_user(opcode, (unsigned char *) instr)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ + scan_more = ((instr_lo & 7) == 0x6); + break; + + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. 
*/ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + if (instr > limit) + break; + if (__get_user(opcode, (unsigned char *) instr)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + +#if 1 + if (prefetch) + printk("prefetch handled at %lx eip %lx instr %lx cs %x\n", + addr, regs->eip, instr, regs->xcs); +#endif + + return prefetch; +} + +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr) +{ + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 6)) + return __is_prefetch(regs, addr); + return 0; +} + asmlinkage void do_invalid_op(struct pt_regs *, unsigned long); /* @@ -86,6 +242,8 @@ asmlinkage void do_page_fault(struct pt_ tsk = current; + info.si_code = SEGV_MAPERR; + /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. @@ -99,18 +257,26 @@ asmlinkage void do_page_fault(struct pt_ * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 1) == 0. */ - if (address >= TASK_SIZE && !(error_code & 5)) +#ifdef CONFIG_X86_4G + /* On 4/4 all kernels faults are either bugs, vmalloc or prefetch */ + if (unlikely((regs->xcs & 3) == 0)) goto vmalloc_fault; +#else + if (unlikely(address >= TASK_SIZE)) { + if (!(error_code & 5)) + goto vmalloc_fault; + goto bad_area_nosemaphore; + } +#endif mm = tsk->mm; - info.si_code = SEGV_MAPERR; /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault.. 
*/ if (in_atomic() || !mm) - goto no_context; + goto bad_area_nosemaphore; down_read(&mm->mmap_sem); @@ -198,8 +364,16 @@ good_area: bad_area: up_read(&mm->mmap_sem); +bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, address)) + return; + tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -232,6 +406,14 @@ no_context: if (fixup_exception(regs)) return; + /* + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + */ + if (is_prefetch(regs, address)) + return; + /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. @@ -292,10 +474,14 @@ out_of_memory: do_sigbus: up_read(&mm->mmap_sem); - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & 4)) + goto no_context; + + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address)) + return; + tsk->thread.cr2 = address; tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; @@ -304,10 +490,6 @@ do_sigbus: info.si_code = BUS_ADRERR; info.si_addr = (void *)address; force_sig_info(SIGBUS, &info, tsk); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; return; vmalloc_fault: diff -puN include/asm-i386/processor.h~athlon-prefetch-handling include/asm-i386/processor.h --- 25/include/asm-i386/processor.h~athlon-prefetch-handling 2003-10-04 02:39:10.000000000 -0700 +++ 25-akpm/include/asm-i386/processor.h 2003-10-04 02:39:10.000000000 -0700 @@ -588,12 +588,12 @@ static inline void rep_nop(void) /* Prefetch instructions for Pentium III and AMD Athlon */ /* It's not worth to care about 3dnow! 
prefetches for the K6 - because they are microcoded there and very slow. */ + because they are microcoded there and very slow. + However we don't do prefetches for pre XP Athlons currently + That should be fixed. */ #define ARCH_HAS_PREFETCH extern inline void prefetch(const void *x) { - if (cpu_data[0].x86_vendor == X86_VENDOR_AMD) - return; /* Some athlons fault if the address is bad */ alternative_input(ASM_NOP4, "prefetchnta (%1)", X86_FEATURE_XMM, _