一、引言
为了实现虚拟内存管理机制,操作系统对内存实行分页管理。自内存“分页机制”提出之始,内存页面的默认大小便被设置为 4096 字节(4KB),虽然原则上内存页面大小是可配置的,但绝大多数的操作系统实现中仍然采用默认的 4KB 页面。当某些应用需要使用的内存达到几 GB、甚至几十 GB 的时候,4KB 的内存页面将严重制约程序的性能。
CPU 中有一组专门的缓存——TLB(Translation Lookaside Buffer),用于缓存页表项(虚拟地址到物理地址的映射),但其大小是有限的。当采用默认的 4KB 页面时,映射同样大小的内存所需的 TLB 表项较多,因而将会产生较多 TLB Miss 和缺页中断,从而大大影响应用程序的性能。操作系统以 2MB 甚至更大作为分页的单位时,将会大大减少 TLB Miss 和缺页中断的数量,显著提高应用程序的性能。这也正是 Linux 内核引入大页面支持的直接原因。好处是很明显的,假设应用程序需要 2MB 的内存,如果操作系统以 4KB 作为分页的单位,则需要 512 个页面,进而在 TLB 中需要 512 个表项,同时也需要 512 个页表项,操作系统需要经历至少 512 次 TLB Miss 和 512 次缺页中断才能将 2MB 应用程序空间全部映射到物理内存;然而,当操作系统采用 2MB 作为分页的基本单位时,只需要一次 TLB Miss 和一次缺页中断,就可以为 2MB 的应用程序空间建立虚实映射,并在运行过程中无需再经历 TLB Miss 和缺页中断(假设未发生 TLB 项替换和 Swap)。
为了能以最小的代价实现大页面支持,Linux 操作系统采用了基于 hugetlbfs 特殊文件系统的 2MB 大页面支持。这种采用特殊文件系统形式支持大页面的方式,使得应用程序可以根据需要灵活地选择虚存页面大小,而不会被强制使用 2MB 大页面。
二、HugePage的使用
本文的例子摘自 Linux 内核源码中提供的有关说明文档 (Documentation/vm/hugetlbpage.txt) 。使用 hugetlbfs 之前,首先需要在编译内核 (make menuconfig) 时配置CONFIG_HUGETLB_PAGE和CONFIG_HUGETLBFS选项,这两个选项均可在 File systems 内核配置菜单中找到。
内核编译完成并成功启动内核之后,将 hugetlbfs 特殊文件系统挂载到根文件系统的某个目录上去,以使得 hugetlbfs 可以访问。命令如下:
mount none /mnt/huge -t hugetlbfs
此后,只要是在 /mnt/huge/ 目录下创建的文件,将其映射到内存中时都会使用 2MB 作为分页的基本单位。值得一提的是,hugetlbfs 中的文件是不支持读 / 写系统调用 ( 如read()或write()等 ) 的,一般对它的访问都是以内存映射的形式进行的。为了更好地介绍大页面的应用,接下来将给出一个大页面应用的例子,该例子同样也是摘自于上述提到的内核文档,只是略有简化。
/* Listing 1. Linux huge page application example */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>

/* Must be a multiple of the huge page size (10MB = 5 x 2MB pages). */
#define MAP_LENGTH (10*1024*1024)

/*
 * Create a file on the hugetlbfs mount at /mnt/huge, map it into the
 * address space, then unmap it and remove the file again.
 *
 * Returns 0 on success and -1 if open() or mmap() fails.
 */
int main(void)
{
	int fd;
	void *addr;

	/*
	 * Create a file in hugetlbfs.  open() requires the third (mode)
	 * argument whenever O_CREAT is given; without it the permission
	 * bits of the new file are taken from whatever is on the stack.
	 */
	fd = open("/mnt/huge/test", O_CREAT | O_RDWR, 0755);
	if (fd < 0) {
		perror("Err: ");
		return -1;
	}

	/* map the file into address space of current application process */
	addr = mmap(0, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) {
		perror("Err: ");
		close(fd);
		unlink("/mnt/huge/test");
		return -1;
	}

	/* from now on, you can store application data on huge pages via addr */

	munmap(addr, MAP_LENGTH);
	close(fd);
	unlink("/mnt/huge/test");
	return 0;
}
对于系统中大页面的统计信息可以在 Proc 特殊文件系统(/proc)中查到,如/proc/sys/vm/nr_hugepages给出了当前内核中配置的大页面的数目,也可以通过该文件配置大页面的数目,如:
echo 20 > /proc/sys/vm/nr_hugepages
三、Hugetlbfs的初始化(基于Linux-3.4.51)
1、hugetlb的初始化
hugetlb初始化是通过hugetlb_init()函数实现的,主要是初始化hstates[HUGE_MAX_HSTATE]全局数组以及创建sysfs相关目录文件。
1 static int __init hugetlb_init(void) 2 { 3 /* Some platform decide whether they support huge pages at boot 4 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when 5 * there is no such support 6 */ 7 if (HPAGE_SHIFT == 0) 8 return 0; 9 10 if (!size_to_hstate(default_hstate_size)) {11 default_hstate_size = HPAGE_SIZE; /*默认大小为2M*/12 if (!size_to_hstate(default_hstate_size))13 /* 初始化hstates[MAX_NUMNODES]数组,数组中只有一个成员;14 * HUGETLB_PAGE_ORDER = 9,即,h->order = 9;15 */16 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);17 }18 /*由于hstates[]只有一个成员,default_hstate_idx = 0*/19 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;20 /*默认最大页数为0*/21 if (default_hstate_max_huge_pages)22 default_hstate.max_huge_pages = default_hstate_max_huge_pages;23 24 /*由于最大页数为0,没有为hstate[]分配任何页*/25 hugetlb_init_hstates();26 /*这个函数不知道干啥???*/27 gather_bootmem_prealloc();28 /*打印初始化后的相关信息*/29 report_hugepages();30 /*初始化/sys/kernel/mm/hugepages相关目录文件*/31 hugetlb_sysfs_init();32 /*初始化/sys/device/system/node/node*/hugepages相关目录文件*/33 hugetlb_register_all_nodes();34 return 0;35 }36 module_init(hugetlb_init);
另外,hugepage的默认大小也可以通过配置内核启动参数“default_hugepagesz”指定,例如:default_hugepagesz=4M,指定default_hstate_size的大小为4M,其内核实现如下:
1 static int __init hugetlb_default_setup(char *s)2 {3 default_hstate_size = memparse(s, &s);4 return 1;5 }6 __setup("default_hugepagesz=", hugetlb_default_setup);
hugepage 是通过将 N 个连续的 4KB 页组成一个复合页(compound page)来实现大页面的。hugepage 的页数也可以通过内核启动参数“hugepages”指定。例如:hugepages=1024,其内核实现如下:
1 static int __init hugetlb_nrpages_setup(char *s) 2 { 3 unsigned long *mhp; 4 static unsigned long *last_mhp; 5 /* 6 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 7 * so this hugepages= parameter goes to the "default hstate". 8 */ 9 if (!max_hstate)10 mhp = &default_hstate_max_huge_pages;11 else12 mhp = &parsed_hstate->max_huge_pages;13 if (mhp == last_mhp) {14 printk(KERN_WARNING "hugepages= specified twice without "15 "interleaving hugepagesz=, ignoring/n");16 return 1;17 }18 if (sscanf(s, "%lu", mhp) <= 0)19 *mhp = 0;20 /*21 * Global state is always initialized later in hugetlb_init.22 * But we need to allocate >= MAX_ORDER hstates here early to still23 * use the bootmem allocator.24 */25 /* parsed_hstate->order = 9, MAX_ORDER = 11, 不会调用hugetlb_hstate_alloc_pages();26 * 通过内核启动参数配置页面数,什么时候分配具体的内存页???27 */28 if (max_hstate && parsed_hstate->order >= MAX_ORDER)29 hugetlb_hstate_alloc_pages(parsed_hstate);30 last_mhp = mhp;31 return 1;32 }33 __setup("hugepages=", hugetlb_nrpages_setup);
hugepage的页数也可以通过命令配置,echo 20 > /proc/sys/vm/nr_hugepages,此时,对该文件的写操作是通过 sysctl(/proc/sys)的处理函数实现的。内核实现如下:
1 int hugetlb_sysctl_handler(struct ctl_table *table, int write,2 void __user *buffer, size_t *length, loff_t *ppos)3 {4 return hugetlb_sysctl_handler_common(false, table, write,5 buffer, length, ppos);6 }
1 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 2 struct ctl_table *table, int write, 3 void __user *buffer, size_t *length, loff_t *ppos) 4 { 5 struct hstate *h = &default_hstate; 6 unsigned long tmp; 7 int ret; 8 tmp = h->max_huge_pages; 9 if (write && h->order >= MAX_ORDER)10 return -EINVAL;11 table->data = &tmp;12 table->maxlen = sizeof(unsigned long);13 /*从用户空间将数值copy赋值给tabel->data,即tmp,并做相关检查*/14 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);15 if (ret)16 goto out;17 if (write) { 18 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);19 if (!(obey_mempolicy &&20 init_nodemask_of_mempolicy(nodes_allowed))) {21 NODEMASK_FREE(nodes_allowed);22 nodes_allowed = &node_states[N_HIGH_MEMORY];23 }24 /*设置最大页数,并分配具体内存页*/25 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);26 if (nodes_allowed != &node_states[N_HIGH_MEMORY])27 NODEMASK_FREE(nodes_allowed);28 }29 out:30 return ret;31 }
1 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, 2 nodemask_t *nodes_allowed) 3 { 4 unsigned long min_count, ret; 5 if (h->order >= MAX_ORDER) 6 return h->max_huge_pages; 7 /* 8 * Increase the pool size 9 * First take pages out of surplus state. Then make up the10 * remaining difference by allocating fresh huge pages.11 *12 * We might race with alloc_buddy_huge_page() here and be unable13 * to convert a surplus huge page to a normal huge page. That is14 * not critical, though, it just means the overall size of the15 * pool might be one hugepage larger than it needs to be, but16 * within all the constraints specified by the sysctls.17 */18 spin_lock(&hugetlb_lock);19 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {20 if (!adjust_pool_surplus(h, nodes_allowed, -1))21 break;22 }23 while (count > persistent_huge_pages(h)) {24 /*25 * If this allocation races such that we no longer need the26 * page, free_huge_page will handle it by freeing the page27 * and reducing the surplus.28 */29 spin_unlock(&hugetlb_lock);30 /*分配内存页*/31 ret = alloc_fresh_huge_page(h, nodes_allowed);32 spin_lock(&hugetlb_lock);33 if (!ret)34 goto out;35 /* Bail for signals. Probably ctrl-c from user */36 if (signal_pending(current))37 goto out;38 }39 /*40 * Decrease the pool size41 * First return free pages to the buddy allocator (being careful42 * to keep enough around to satisfy reservations). Then place43 * pages into surplus state as needed so the pool will shrink44 * to the desired size as pages become free.45 *46 * By placing pages into the surplus state independent of the47 * overcommit value, we are allowing the surplus pool size to48 * exceed overcommit. There are few sane options here. Since49 * alloc_buddy_huge_page() is checking the global counter,50 * though, we'll note that we're not allowed to exceed surplus51 * and won't grow the pool anywhere else. 
Not until one of the52 * sysctls are changed, or the surplus pages go out of use.53 */54 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;55 min_count = max(count, min_count);56 try_to_free_low(h, min_count, nodes_allowed);57 while (min_count < persistent_huge_pages(h)) {58 if (!free_pool_huge_page(h, nodes_allowed, 0))59 break;60 }61 while (count < persistent_huge_pages(h)) {62 if (!adjust_pool_surplus(h, nodes_allowed, 1))63 break;64 }65 out:66 ret = persistent_huge_pages(h);67 spin_unlock(&hugetlb_lock);68 return ret;69 }
1 static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 2 { 3 struct page *page; 4 int start_nid; 5 int next_nid; 6 int ret = 0; 7 start_nid = hstate_next_node_to_alloc(h, nodes_allowed); 8 next_nid = start_nid; 9 do {10 /* 从内存Node的zonelist上分配2^h->order个4K的内存页,返回第一个page的地址;11 * 如果分配不成功,从下一个内存Node上尝试;12 */13 page = alloc_fresh_huge_page_node(h, next_nid);14 if (page) {15 ret = 1;16 break;17 }18 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);19 } while (next_nid != start_nid);20 if (ret)21 count_vm_event(HTLB_BUDDY_PGALLOC);22 else23 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);24 return ret;25 }
1 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 2 { 3 struct page *page; 4 if (h->order >= MAX_ORDER) 5 return NULL; 6 /*__GFP_COMP标志:分配2^h->order个连续的4K大小的page,返回第一个Page的地址,并设置PG_compound标记*/ 7 page = alloc_pages_exact_node(nid, 8 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 9 __GFP_REPEAT|__GFP_NOWARN,10 huge_page_order(h));11 if (page) {12 if (arch_prepare_hugepage(page)) {13 __free_pages(page, huge_page_order(h));14 return NULL;15 }16 /* 1、将已分配的2^h->order个数的page中的第二个page的lru.next执行函数free_huge_page();17 * 2、在put_page()函数中,最后调用free_huge_page()-->enqueue_huge_page(),将page加入到h->hugepages_freelists[nid]链表;18 */19 prep_new_huge_page(h, page, nid);20 }21 return page;22 }
2、hugetlbfs的初始化
hugetlbfs的创建,主要是建立VFS层的super_block、dentry、inode之间的相关映射,同时也和hugetlb_init()函数中初始化的hstates[]数组关联起来了,也就和分配的大内存页关联起来了。如下图(有点乱):
1 static int __init init_hugetlbfs_fs(void) 2 { 3 int error; 4 struct vfsmount *vfsmount; 5 6 /*初始化hugetlbfs回写数据结构*/ 7 error = bdi_init(&hugetlbfs_backing_dev_info); 8 if (error) 9 return error;10 11 error = -ENOMEM;12 /*创建slab缓存hugetlbfs_inode_cachep,后续hugetlbfs的inode从这里面分配*/13 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",14 sizeof(struct hugetlbfs_inode_info),15 0, 0, init_once);16 if (hugetlbfs_inode_cachep == NULL)17 goto out2;18 19 /*将hugetlbfs_fs_type加入到全局file_systems链表中*/20 error = register_filesystem(&hugetlbfs_fs_type);21 if (error)22 goto out;23 24 /* 创建hugetlbfs的super_block、entry、inode,并建立它们之间的相互映射,25 * 以及它们与hugetlbfs_fs_type、default_hstate、hugetlbfs_inode_cachep之间的映射关系26 */27 vfsmount = kern_mount(&hugetlbfs_fs_type);28 29 if (!IS_ERR(vfsmount)) {30 hugetlbfs_vfsmount = vfsmount;31 return 0;32 }33 34 error = PTR_ERR(vfsmount);35 36 out:37 kmem_cache_destroy(hugetlbfs_inode_cachep);38 out2:39 bdi_destroy(&hugetlbfs_backing_dev_info);40 return error;41 }42
有不足或错误之处,欢迎指出。
参考:
http://www.ibm.com/developerworks/cn/linux/l-cn-hugetlb/
新闻热点
疑难解答