Jul 12 2012

Linux kernel memory model

稍微了解 Linux 内核的人都知道，在 x86 上内核中所有的 struct page 是放在一个数组中管理的，它就是 mem_map，通过它我们就可以用 pfn 作为 index 来找到对应的 struct page 了：

[c]

/*
 * Flat memory model: every struct page lives in the single mem_map array,
 * so pfn <-> page conversion is plain array indexing, shifted by
 * ARCH_PFN_OFFSET.
 */
#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
				 ARCH_PFN_OFFSET)

[/c]

这是以前旧的 memory model（这个词很流行，你应该在很多地方见过，比如 C/C++标准），现在情况变了。

因为对 NUMA 和内存热插拔（memory hot-plug）的支持，Linux 内核中现在又引入两个新的 memory model，之前那个旧的被称为 Flat memory，新的两个被称为 Discontiguous memory 和 Sparse memory。它们对应的选项是：CONFIG_FLATMEM，CONFIG_DISCONTIGMEM，CONFIG_SPARSEMEM。让我们来看看这两个新的 memory model。

顺便多说两句，NUMA 和内存热插拔并没有直接的联系，NUMA 系统不一定支持内存的热插拔，而可以进行内存热插拔的系统也不一定是 NUMA 的！NUMA 支持对应的选项是 CONFIG_NUMA，而内存热插拔对应的选项是 CONFIG_MEMORY_HOTPLUG 和 CONFIG_MEMORY_HOTREMOVE。由此也可以看出 Linux 内核配置是多么灵活，你可以任意搭配你需要的选项。

CONFIG_DISCONTIGMEM

mm/Kconfig 中对它的介绍是：

This option provides enhanced support for discontiguous memory systems, over FLATMEM. These systems have holes in their physical address spaces, and this option provides more efficient handling of these holes. However, the vast majority of hardware has quite flat address spaces, and can have degraded performance from the extra overhead that this option imposes.

Many NUMA configurations will have this as the only option.
Discontiguous memory 其实很简单，它是在 flat memory 的基础上对 NUMA 进行扩展得出来的。每一个 node 都有一个 struct pglist_data，对于 discontiguous memory 其中记录了每个 node 的 node_start_pfn、node_spanned_pages、node_mem_map，分别表示该 node 的起始页的 PFN、该 node 所跨越的物理页数量（包含空洞）、这些页面的 mem_map。我们可以看 alloc_node_mem_map() 是如何根据前两个值来初始化 node_mem_map 的：

[c]
/*
 * Set up pgdat->node_mem_map for a node that spans at least one page.
 * This is an excerpt: the tail of the function is elided below.
 */
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
	/* ia64 gets its own node_mem_map, before this, without bootmem */
	if (!pgdat->node_mem_map) {
		unsigned long size, start, end;
		struct page *map;

		/*
		 * The zone's endpoints aren't required to be MAX_ORDER
		 * aligned but the node_mem_map endpoints must be in order
		 * for the buddy allocator to function correctly.
		 */
		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
		end = ALIGN(end, MAX_ORDER_NR_PAGES);
		size = (end - start) * sizeof(struct page);
		map = alloc_remap(pgdat->node_id, size);
		if (!map)
			map = alloc_bootmem_node_nopanic(pgdat, size);
		/*
		 * The array was sized from the aligned-down start, so skip
		 * ahead to the entry that corresponds to node_start_pfn.
		 */
		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
	}
	//....
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
[/c]

所以，它对应的 pfn_to_page() 和 page_to_pfn() 定义如下：

[c]

/* Offset of pfn from the first pfn of node nid. */
#define arch_local_page_offset(pfn, nid)	\
	((pfn) - NODE_DATA(nid)->node_start_pfn)

/*
 * Discontiguous model: find the node that owns pfn, then index into that
 * node's own node_mem_map. The argument is copied into __pfn so it is
 * evaluated only once.
 */
#define __pfn_to_page(pfn)			\
({	unsigned long __pfn = (pfn);		\
	unsigned long __nid = arch_pfn_to_nid(__pfn);  \
	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
})

/*
 * Inverse mapping: the page's index inside its node's node_mem_map plus
 * that node's node_start_pfn.
 */
#define __page_to_pfn(pg)						\
({	const struct page *__pg = (pg);					\
	struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));	\
	(unsigned long)(__pg - __pgdat->node_mem_map) +			\
	 __pgdat->node_start_pfn;					\
})
[/c]

两个 node 之间的内存未必是连续的，中间可能有内存空洞，空洞的单位是 64M，但整个内存依然是 flat 的：

[c]
/*
 * generic node memory support, the following assumptions apply:
 *
 * 1) memory comes in 64Mb contiguous chunks which are either present or not
 * 2) we will not have more than 64Gb in total
 *
 * for now assume that 64Gb is max amount of RAM for whole system
 *    64Gb / 4096bytes/page = 16777216 pages
 */
#define MAX_NR_PAGES 16777216
#define MAX_SECTIONS 1024
/* 16777216 / 1024 = 16384 pages, i.e. one 64Mb chunk of 4096-byte pages. */
#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)

/* Node id table; pfn_to_nid() indexes it with pfn / PAGES_PER_SECTION. */
extern s8 physnode_map[];

/*
 * Return the node id for a page frame number.
 *
 * With CONFIG_NUMA the id is read from physnode_map, which holds one entry
 * per PAGES_PER_SECTION pages; without it the function always returns 0.
 */
static inline int pfn_to_nid(unsigned long pfn)
{
#ifdef CONFIG_NUMA
	return((int) physnode_map[(pfn) / PAGES_PER_SECTION]);
#else
	return 0;
#endif
}
[/c]

CONFIG_SPARSEMEM

Sparse memory 是一个相对比较复杂的模型。mm/Kconfig 中对它的介绍是：

This will be the only option for some systems, including memory hotplug systems. This is normal. For many other systems, this will be an alternative to “Discontiguous Memory”. This option provides some potential performance benefits, along with decreased code complexity, but it is newer, and more experimental.

它主要是因为支持内存热插拔引入的。对于支持内存热插拔的系统，系统中的某一部分内存可以随时被移除和添加，这就使得原本连续的内存空间变得稀疏。这个内存模型是把所有的内存空间划分成一个个 section，每个 section 都是同样大小的，可以进行热插拔的内存大小就是以 section 为单位的。在 x86_64 上面，每个 section 是 128M（即 SECTION_SIZE_BITS 为 27）：

[c]

/*
 * Section geometry per configuration. A section covers
 * 2^SECTION_SIZE_BITS bytes.
 */
#ifdef CONFIG_X86_32
# ifdef CONFIG_X86_PAE
#  define SECTION_SIZE_BITS	29
#  define MAX_PHYSADDR_BITS	36
#  define MAX_PHYSMEM_BITS	36
# else
#  define SECTION_SIZE_BITS	26
#  define MAX_PHYSADDR_BITS	32
#  define MAX_PHYSMEM_BITS	32
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
# define MAX_PHYSADDR_BITS	44
# define MAX_PHYSMEM_BITS	46
#endif

[/c]

每个 section 有自己的 mem_map，所以其 __page_to_pfn 和 __pfn_to_page 的定义如下：

[c]
/*
 * Note: section's mem_map is encoded to reflect its start_pfn.
 * section[i].section_mem_map == mem_map's address - start_pfn;
 */
/*
 * Because the stored base already has start_pfn subtracted, the pointer
 * difference against it is the pfn itself.
 */
#define __page_to_pfn(pg)					\
({	const struct page *__pg = (pg);				\
	int __sec = page_to_section(__pg);			\
	(unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec)));	\
})

/*
 * Look up the section that covers pfn and add the full pfn to that
 * section's encoded mem_map base.
 */
#define __pfn_to_page(pfn)				\
({	unsigned long __pfn = (pfn);			\
	struct mem_section *__sec = __pfn_to_section(__pfn);	\
	__section_mem_map_addr(__sec) + __pfn;		\
})
[/c]

Sparse memory 还有两个变体：SPARSEMEM_EXTREME 和 SPARSEMEM_VMEMMAP，前者是对更稀疏的内存做了优化，采用了两层 memory section，即二维数组：

[c]

ifdef CONFIG_SPARSEMEM_EXTREME

define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))

else

define SECTIONS_PER_ROOT 1

endif

define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)

define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)

define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)

ifdef CONFIG_SPARSEMEM_EXTREME

extern struct mem_section *mem_section[NR_SECTION_ROOTS];

else

extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];

endif

/*
 * Return the mem_section for section number nr, or NULL when the root
 * entry that would hold it is NULL.
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
	unsigned long root = SECTION_NR_TO_ROOT(nr);

	if (mem_section[root] == NULL)
		return NULL;

	/* The low bits of nr select the slot inside the root. */
	return mem_section[root] + (nr & SECTION_ROOT_MASK);
}
[/c]

后者是针对 x86_64 这种虚拟地址空间比较大的平台做了速度的优化，把 mem_map 里所有 struct page 的地址一一映射进 vmemmap 地址中去，这样它们的虚拟地址就是连续的了：

[c]

/* Fixed virtual address at which the struct page array is mapped. */
#define VMEMMAP_START	 _AC(0xffffea0000000000, UL)
#define vmemmap ((struct page *)VMEMMAP_START)

/* memmap is virtually contiguous.  */
#define __pfn_to_page(pfn)	(vmemmap + (pfn))
#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)

[/c]

不过额外的代价就是初始化的时候要对每一个 struct page 做 populate，参见 sparse_mem_map_populate() 函数。另外可参考内存热插拔代码中函数 add_section() 和 remove_section() 的实现。

define __pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET))

define __page_to_pfn(page) ((unsigned long)((page) - mem_map) +

ifdef CONFIG_FLAT_NODE_MEM_MAP

define arch_local_page_offset(pfn, nid)

define __pfn_to_page(pfn)

define __page_to_pfn(pg)

define MAX_NR_PAGES 16777216

define MAX_SECTIONS 1024

define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)

ifdef CONFIG_NUMA

else

endif

ifdef CONFIG_X86_32

ifdef CONFIG_X86_PAE

define SECTION_SIZE_BITS 29

define MAX_PHYSADDR_BITS 36

define MAX_PHYSMEM_BITS 36

else

define SECTION_SIZE_BITS 26

define MAX_PHYSADDR_BITS 32

define MAX_PHYSMEM_BITS 32

endif

else / CONFIG_X86_32 /

define SECTION_SIZE_BITS 27 / matt - 128 is convenient right now /

define MAX_PHYSADDR_BITS 44

define MAX_PHYSMEM_BITS 46

endif

define __page_to_pfn(pg)

define __pfn_to_page(pfn)

ifdef CONFIG_SPARSEMEM_EXTREME

define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))

else

define SECTIONS_PER_ROOT 1

endif

define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)

define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)

define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)

ifdef CONFIG_SPARSEMEM_EXTREME

else

endif

define VMEMMAP_START _AC(0xffffea0000000000, UL)

define vmemmap ((struct page *)VMEMMAP_START)

define __pfn_to_page(pfn) (vmemmap + (pfn))

define __page_to_pfn(page) (unsigned long)((page) - vmemmap)

Comments