From: "Martin J. Bligh" At the moment, we initialise physnode_map from the various BIOS tables, which can create problems, as holes inside a node return 1 for pfn_valid, and yet pfn_to_nid is not correct for them. I'd hacked around this in my tree by defaulting the mapping to 0, not -1, but that's not the correct fix ... this is. I consolidated all the code back into 1 place, and use node_start_pfn[] and node_end_pfn[] to walk over it instead - that means it matches up perfectly with lmem_map's as we're using the same data. It also cleans up a lot of the code. Tested on both NUMA-Q and x440 ... and it only affects i386 NUMA boxen. Signed-off-by: Andrew Morton --- 25-akpm/arch/i386/kernel/numaq.c | 36 ------------------------------------ 25-akpm/arch/i386/kernel/srat.c | 19 ------------------- 25-akpm/arch/i386/mm/discontig.c | 30 +++++++++++++++++------------- 3 files changed, 17 insertions(+), 68 deletions(-) diff -puN arch/i386/kernel/numaq.c~fix-up-physnode_map arch/i386/kernel/numaq.c --- 25/arch/i386/kernel/numaq.c~fix-up-physnode_map Fri Jul 2 16:36:30 2004 +++ 25-akpm/arch/i386/kernel/numaq.c Fri Jul 2 16:36:30 2004 @@ -65,41 +65,6 @@ static void __init smp_dump_qct(void) } /* - * for each node mark the regions - * TOPOFMEM = hi_shrd_mem_start + hi_shrd_mem_size - * - * need to be very careful to not mark 1024+ as belonging - * to node 0. 
will want 1027 to show as belonging to node 1 - * example: - * TOPOFMEM = 1024 - * 1024 >> 8 = 4 (subtract 1 for starting at 0] - * tmpvar = TOPOFMEM - 256 = 768 - * 1024 >> 8 = 4 (subtract 1 for starting at 0] - * - */ -static void __init initialize_physnode_map(void) -{ - int nid; - unsigned int topofmem, cur; - struct eachquadmem *eq; - struct sys_cfg_data *scd = - (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR); - - - for(nid = 0; nid < numnodes; nid++) { - if(scd->quads_present31_0 & (1 << nid)) { - eq = &scd->eq[nid]; - cur = eq->hi_shrd_mem_start; - topofmem = eq->hi_shrd_mem_start + eq->hi_shrd_mem_size; - while (cur < topofmem) { - physnode_map[cur >> 8] = (s8) nid; - cur ++; - } - } - } -} - -/* * Unlike Summit, we don't really care to let the NUMA-Q * fall back to flat mode. Don't compile for NUMA-Q * unless you really need it! @@ -107,6 +72,5 @@ static void __init initialize_physnode_m int __init get_memcfg_numaq(void) { smp_dump_qct(); - initialize_physnode_map(); return 1; } diff -puN arch/i386/kernel/srat.c~fix-up-physnode_map arch/i386/kernel/srat.c --- 25/arch/i386/kernel/srat.c~fix-up-physnode_map Fri Jul 2 16:36:30 2004 +++ 25-akpm/arch/i386/kernel/srat.c Fri Jul 2 16:36:30 2004 @@ -181,23 +181,6 @@ static __init void chunk_to_zones(unsign } } -static void __init initialize_physnode_map(void) -{ - int i; - unsigned long pfn; - struct node_memory_chunk_s *nmcp; - - /* Run the list of memory chunks and fill in the phymap. 
*/ - nmcp = node_memory_chunk; - for (i = num_memory_chunks; --i >= 0; nmcp++) { - for (pfn = nmcp->start_pfn; pfn <= nmcp->end_pfn; - pfn += PAGES_PER_ELEMENT) - { - physnode_map[pfn / PAGES_PER_ELEMENT] = (s8) nmcp->nid; - } - } -} - /* Parse the ACPI Static Resource Affinity Table */ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) { @@ -265,8 +248,6 @@ static int __init acpi20_parse_srat(stru for (i = 0; i < num_memory_chunks; i++) node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm]; - initialize_physnode_map(); - printk("pxm bitmap: "); for (i = 0; i < sizeof(pxm_bitmap); i++) { printk("%02X ", pxm_bitmap[i]); diff -puN arch/i386/mm/discontig.c~fix-up-physnode_map arch/i386/mm/discontig.c --- 25/arch/i386/mm/discontig.c~fix-up-physnode_map Fri Jul 2 16:36:30 2004 +++ 25-akpm/arch/i386/mm/discontig.c Fri Jul 2 16:37:50 2004 @@ -87,8 +87,6 @@ void set_pmd_pfn(unsigned long vaddr, un */ int __init get_memcfg_numa_flat(void) { - int pfn; - printk("NUMA - single node, flat memory mode\n"); /* Run the memory configuration and find the top of memory. */ @@ -96,16 +94,7 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; - /* Fill in the physnode_map with our simplistic memory model, - * all memory is in node 0. - */ - for (pfn = node_start_pfn[0]; pfn <= node_end_pfn[0]; - pfn += PAGES_PER_ELEMENT) - { - physnode_map[pfn / PAGES_PER_ELEMENT] = 0; - } - - /* Indicate there is one node available. */ + /* Indicate there is one node available. */ node_set_online(0); numnodes = 1; return 1; @@ -234,7 +223,7 @@ unsigned long __init setup_memory(void) { int nid; unsigned long bootmap_size, system_start_pfn, system_max_low_pfn; - unsigned long reserve_pages; + unsigned long reserve_pages, pfn; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -244,6 +233,21 @@ unsigned long __init setup_memory(void) * and ZONE_HIGHMEM. 
*/ get_memcfg_numa(); + + /* Fill in the physnode_map */ + for (nid = 0; nid < numnodes; nid++) { + printk("Node: %d, start_pfn: %ld, end_pfn: %ld\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); + printk(" Setting physnode_map array to node %d for pfns:\n ", + nid); + for (pfn = node_start_pfn[nid]; pfn < node_end_pfn[nid]; + pfn += PAGES_PER_ELEMENT) { + physnode_map[pfn / PAGES_PER_ELEMENT] = nid; + printk("%ld ", pfn); + } + printk("\n"); + } + reserve_pages = calculate_numa_remap_pages(); /* partially used pages are not usable - thus round upwards */ _