0%

Principles:共享内存mmap底层原理

共享内存基础知识

共享内存有两个,一个 mmap,一个 systemV 的 shm

由于所有用户进程总的虚拟地址空间比可用的物理内存大很多,因此只有最常用的部分才与物理页帧关联(这不是问题,因为大多数程序只占用实际可用内存的一小部分)

  • 在将磁盘上的数据映射到进程的虚拟地址空间的时,内核必须提供数据结构,以建立虚拟地址空间的区域和相关数据所在位置之间的关联,Linux 软件系统多级页表映射机制
  • 共享内存使得多个进程可以访问同一块内存空间(节约了内存空间),不同进程可以及时看到对方进程中对共享内存中数据的更新(多个进程可以同时操作,所以需要进行同步,一般与信号量配合使用)

本文主要介绍 mmap

共享内存的 API

1
void *mmap(void *addr, size_t len, int prot, int flags, int fd, off_t offset);
  • addr:
    • 指定了映射被放置的虚拟地址,首选做法是将 addr 指定为 NULL,内核会为映射选择一个合适的地址(将 addr 指定为非 NULL,内核会将该参数值作为一个提示信息来处理)
  • length:
    • 指定了映射字节数,如果 length 不是分页的整数倍,内核会以分页大小为单位建立映射
  • prot:是一个位掩码,指定了新内存映射上的保护信息
  • flags:是一个控制映射操作各个方面的选项的位掩码(MAP_SHARED 与 MAP_PRIVATE 互斥,只能二选一;其余标志可以按位或组合进来)
    • MAP_PRIVATE - 私有:对映射区域的写入操作会产生一个映射文件的复制,即私人的“写入时复制”(copy on write)对此区域作的任何修改都不会写回原来的文件内容
    • MAP_SHARED - 共有:对映射区域的写入数据会复制回文件内,而且允许其他映射该文件的进程共享
    • MAP_ANONYMOUS - 匿名:建立匿名映射,此时会忽略参数fd,不涉及文件(其实是使用 /dev/zero 文件),映射内容被初始化为 0;与 MAP_PRIVATE 组合时无法和其他进程共享,与 MAP_SHARED 组合时可以被 fork 出的子进程共享
  • 匿名映射会忽略下面两个参数:
    • fd:表示映射的文件的文件描述符
    • offset:指定了映射在文件中的起点,必须是系统分页大小的倍数
  • return:
    • 成功:返回被映射区的指针
    • 出错:返回 MAP_FAILED(即 (void *)-1),错误原因存于 errno 中

共享内存使用案例

mmap:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <stdio.h>
#include <sys/mman.h>
#include <fcntl.h>   /* open, O_RDWR */
#include <unistd.h>  /* read, write, close */

/*
 * Demo: map ./flag.txt into memory with MAP_SHARED, dump the first bytes
 * to stdout, then let the user overwrite them from stdin (with MAP_SHARED
 * the modification is written back to the file).
 */
int main(int argc, char *argv[])
{
    /* Bug fix: open()'s second argument is the flag mask, not the file
     * mode — the original passed 0666 there, so the fd was not opened
     * O_RDWR and a PROT_WRITE|MAP_SHARED mapping would fail (EACCES). */
    int fd = open("./flag.txt", O_RDWR);
    if (-1 == fd) {
        perror("open");
        return -1;
    }

    int length = 1; /* the kernel rounds the mapping up to one full page */
    // char *addr = (char *)mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    char *addr = (char *)mmap(NULL, length, PROT_READ | PROT_WRITE,
                              MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        close(fd);
        return -1;
    }
    close(fd); /* the mapping stays valid after the fd is closed */

    /* NOTE(review): 0x40 bytes are accessed although length is 1; this
     * stays inside the single mapped page, but touching bytes beyond the
     * end of a short file raises SIGBUS — assumes flag.txt is big enough. */
    puts("get data from mmap:");
    write(1, addr, 0x40);
    puts(" ");
    puts("input data to mmap:");
    if (read(0, addr, 0x40) == -1)
        perror("read");

    if (munmap(addr, length) == -1) { /* tear down the mapping */
        perror("munmap");
        return -1;
    }
    return 0;
}
  • 效果:
1
2
3
4
5
6
0x7ffff7fcf000     0x7ffff7fd0000 r--p     1000 0      /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7fd0000 0x7ffff7ff3000 r-xp 23000 1000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7ff3000 0x7ffff7ffb000 r--p 8000 24000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7ffb000 0x7ffff7ffc000 rw-p 1000 0 /home/yhellow/桌面/exp/flag.txt /* target */
0x7ffff7ffc000 0x7ffff7ffd000 r--p 1000 2c000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7ffd000 0x7ffff7ffe000 rw-p 1000 2d000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
  • 其实 mmap 也可以用来进程间通信,但是用它分配内存的情况多一点

Linux 中 mmap 的实现

mmap 的作用就是把磁盘文件的一部分(指定 fd)直接映射到进程的内存中

1
2
3
4
5
6
7
0x7ffff7eda8e4 <mmap64+36>    syscall  <SYS_mmap>
addr: 0x0
len: 0x1
prot: 0x3
flags: 0x2
fd: 0x3 (/home/yhellow/桌面/exp/flag.txt)
offset: 0x0
1
2
3
4
5
6
7
8
9
10
11
/* Architecture syscall entry for mmap(): validates the byte offset and
 * hands over to the generic helper with the offset converted to pages. */
asmlinkage unsigned long
sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
{
/* the file offset must be page-aligned */
if (offset_in_page(off) != 0)
return -EINVAL;

addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); /* core worker: takes the offset in pages */
if (!IS_ERR((void *) addr))
force_successful_syscall_return();
return addr;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/* Generic mmap helper: resolves the fd (or sets up a hugetlb pseudo-file
 * for anonymous huge-page mappings), then calls vm_mmap_pgoff(). */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval;

if (!(flags & MAP_ANONYMOUS)) { /* file-backed mapping (MAP_ANONYMOUS not set) */
audit_mmap_fd(fd, flags); /* record fd and flags for the audit subsystem */
file = fget(fd); /* take a reference on the backing file */
if (!file)
return -EBADF;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file))); /* round len up to the huge page size */
retval = -EINVAL;
if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
goto out_fput;
} else if (flags & MAP_HUGETLB) { /* MAP_HUGETLB: anonymous huge-page mapping */
struct user_struct *user = NULL;
struct hstate *hs;

hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); /* look up the hstate for the requested huge page size */
if (!hs)
return -EINVAL;

len = ALIGN(len, huge_page_size(hs)); /* round len up to the huge page size */
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); /* back the mapping with a hugetlbfs pseudo-file */
if (IS_ERR(file))
return PTR_ERR(file);
}

flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); /* these legacy flags are ignored: mask them off */

retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); /* core worker */
out_fput:
if (file)
fput(file);
return retval;
}
  • 简单检查并处理了一下标志位,然后进行对齐
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/* Runs the LSM permission check, then performs the mapping under the
 * mmap_sem write lock and optionally pre-faults it (mm_populate). */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm; /* memory descriptor of the calling process */
unsigned long populate;
LIST_HEAD(uf);

ret = security_mmap_file(file, prot, flag); /* LSM sandboxing hook: for a file-backed mapping this runs permission checks on the file before the mapping is allowed */
if (!ret) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate, &uf); /* core worker */
up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
  • security_mmap_file 最终会调用 ima_file_mmap
1
2
3
4
5
6
7
8
/* Thin wrapper: forwards to do_mmap() with vm_flags fixed to 0. */
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate,
struct list_head *uf)
{
return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
/*
 * NOTE(review): this listing is the !CONFIG_MMU variant of do_mmap()
 * (mm/nommu.c) — it manages vm_region objects and nommu_region_tree,
 * which do not exist in the MMU build; confirm against the kernel tree.
 */
unsigned long do_mmap(struct file *file,
unsigned long addr,
unsigned long len,
unsigned long prot,
unsigned long flags,
vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
{
struct vm_area_struct *vma; /* vm_area_struct describes virtual addresses used by a process (vm_struct describes kernel virtual addresses) */
struct vm_region *region;
struct rb_node *rb;
unsigned long capabilities, result;
int ret;

*populate = 0;

/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
&capabilities); /* decides whether the mapping should be attempted at all */
if (ret < 0)
return ret;

/* we ignore the address hint */
addr = 0;
len = PAGE_ALIGN(len);

/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
vm_flags |= determine_vm_flags(file, prot, flags, capabilities);

/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); /* allocate the region record (kmem_cache_zalloc also zero-fills the new object) */
if (!region)
goto error_getting_region;

vma = vm_area_alloc(current->mm); /* allocates a new vma via kmem_cache_alloc, then initialises it with vma_init */
if (!vma)
goto error_getting_vma;

region->vm_usage = 1; /* fill in the vm_region */
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;

vma->vm_flags = vm_flags; /* fill in the vm_area_struct */
vma->vm_pgoff = pgoff;

if (file) { /* 'file' was looked up from the mmap 'fd' argument */
region->vm_file = get_file(file);
vma->vm_file = get_file(file);
}

down_write(&nommu_region_sem);

/* if we want to share, we need to check for regions created by other
* mmap() calls that overlap with our proposed mapping
* - we can only share with a superset match on most regular files
* - shared mappings on character devices and memory backed files are
* permitted to overlap inexactly as far as we are concerned for in
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
if (vm_flags & VM_MAYSHARE) { /* VM_MAYSHARE: decides whether VM_SHARED (shareable between processes) may be set */
struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start;

pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgend = pgoff + pglen;

for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb);

if (!(pregion->vm_flags & VM_MAYSHARE))
continue;

/* search for overlapping mappings on the same file */
if (file_inode(pregion->vm_file) !=
file_inode(file))
continue;

if (pregion->vm_pgoff >= pgend)
continue;

rpglen = pregion->vm_end - pregion->vm_start;
rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
rpgend = pregion->vm_pgoff + rpglen;
if (pgoff >= rpgend)
continue;

/* handle inexactly overlapping matches between
* mappings */
if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
!(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
/* new mapping is not a subset of the region */
if (!(capabilities & NOMMU_MAP_DIRECT))
goto sharing_violation;
continue;
}

/* we've found a region we can share */
pregion->vm_usage++;
vma->vm_region = pregion; /* point the vm_area_struct at the existing shared region */
start = pregion->vm_start;
start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = start + len;

if (pregion->vm_flags & VM_MAPPED_COPY)
vma->vm_flags |= VM_MAPPED_COPY;
else {
ret = do_mmap_shared_file(vma); /* set up a shared mapping on the file (the driver or filesystem provides and pins the storage) */
if (ret < 0) {
vma->vm_region = NULL;
vma->vm_start = 0;
vma->vm_end = 0;
pregion->vm_usage--;
pregion = NULL;
goto error_just_free;
}
}
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
region = pregion;
result = start;
goto share;
}

/* obtain the address at which to make a shared mapping
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
if (capabilities & NOMMU_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
/* get_unmapped_area resolves per architecture to different functions, but their basic approach is the same everywhere */
if (IS_ERR_VALUE(addr)) {
ret = addr;
if (ret != -ENOSYS)
goto error_just_free;

/* the driver refused to tell us where to site
* the mapping so we'll have to attempt to copy
* it */
ret = -ENODEV;
if (!(capabilities & NOMMU_MAP_COPY))
goto error_just_free;

capabilities &= ~NOMMU_MAP_DIRECT;
} else {
vma->vm_start = region->vm_start = addr;
vma->vm_end = region->vm_end = addr + len;
}
}
}

vma->vm_region = region;

/* set up the mapping
* - the region is filled in if NOMMU_MAP_DIRECT is still set
*/
if (file && vma->vm_flags & VM_SHARED) /* VM_SHARED: may be shared between multiple processes */
ret = do_mmap_shared_file(vma); /* set up a shared mapping on the file (the driver or filesystem provides and pins the storage) */
else
ret = do_mmap_private(vma, region, len, capabilities); /* set up a private mapping, or an anonymous shared one */
if (ret < 0)
goto error_just_free;
add_nommu_region(region);

/* clear anonymous mappings that don't ask for uninitialized data */
if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) /* zero anonymous mappings unless uninitialised data was explicitly requested */
memset((void *)region->vm_start, 0,
region->vm_end - region->vm_start);

/* okay... we have a mapping; now we have to register it */
result = vma->vm_start;

current->mm->total_vm += len >> PAGE_SHIFT;

share:
add_vma_to_mm(current->mm, vma); /* insert the VMA at the right place in the process's mm_struct list and tree, and into the address space's page tree if it is not anonymous */

/* we flush the region from the icache only when the first executable
* mapping of it is made */
if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
flush_icache_range(region->vm_start, region->vm_end);
region->vm_icache_flushed = true;
}

up_write(&nommu_region_sem);

return result;

error_just_free:
up_write(&nommu_region_sem);
error:
if (region->vm_file)
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
if (vma->vm_file)
fput(vma->vm_file);
vm_area_free(vma);
return ret;

sharing_violation:
up_write(&nommu_region_sem);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;

error_getting_vma:
kmem_cache_free(vm_region_jar, region);
pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;

error_getting_region:
pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
}
  • 首先调用 vm_area_alloc(底层还是调用 kmem_cache_alloc 分配新的 vma,然后调用 vma_init 进行初始化;把 vma 插入链表和红黑树是由后面的 add_vma_to_mm 完成的)
  • 新分配的 vm_area_struct 用于管理进程使用的虚拟地址(虚存管理的最基本的管理单元):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/* Describes one contiguous region of a process's virtual address space —
 * the basic unit of user-space virtual memory management. */
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */

unsigned long vm_start; /* Our start address within vm_mm. */
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */

/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next, *vm_prev;

struct rb_node vm_rb;

/*
* Largest free memory gap in bytes to the left of this VMA.
* Either between this VMA and vma->vm_prev, or between one of the
* VMAs below us in the VMA rbtree and its ->vm_prev. This helps
* get_unmapped_area find a free area of the right size.
*/
unsigned long rb_subtree_gap;

/* Second cache line starts here. */

struct mm_struct *vm_mm; /* The address space we belong to. */
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, see mm.h. */

/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
*/
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;

/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_chain; /* Serialized by mmap_sem &
* page_table_lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */

/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;

/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units */
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */

atomic_long_t swap_readahead_info;
#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;

/* One node in the list of threads taking part in a core dump. */
struct core_thread {
struct task_struct *task;
struct core_thread *next;
};

/* Per-mm bookkeeping for a core dump in progress. */
struct core_state {
atomic_t nr_threads;
struct core_thread dumper;
struct completion startup;
};
  • 核心函数 get_unmapped_area 调用的是 current->mm->get_unmapped_area,在 Linux 中,实际上调用的是 arch_get_unmapped_area(进程中能够找到查找空闲虚拟内存的方法)
1
2
3
4
5
6
7
8
enum mmap_allocation_direction {UP, DOWN}; /* UP == '0', DOWN == '1' */

/* Bottom-up search for an unmapped area; addr0 is the caller's hint. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
return arch_get_unmapped_area_common(filp,
addr0, len, pgoff, flags, UP); /* always searches upwards (dir == UP) */
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/* Find a free virtual address range of 'len' bytes: honour MAP_FIXED and
 * the caller's hint if possible, otherwise delegate to vm_unmapped_area(). */
static unsigned long arch_get_unmapped_area_common(struct file *filp,
unsigned long addr0, unsigned long len, unsigned long pgoff,
unsigned long flags, enum mmap_allocation_direction dir) /* dir == UP on this call path */
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long addr = addr0;
int do_color_align;
struct vm_unmapped_area_info info; /* describes the allocation request for vm_unmapped_area() */

if (unlikely(len > TASK_SIZE))
return -ENOMEM;

if (flags & MAP_FIXED) {
/* Even MAP_FIXED mappings must reside within TASK_SIZE */
if (TASK_SIZE - len < addr)
return -EINVAL;

/*
* We do not accept a shared mapping if it would violate
* cache aliasing constraints.
*/
if ((flags & MAP_SHARED) &&
((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
return -EINVAL;
return addr;
}

do_color_align = 0;
if (filp || (flags & MAP_SHARED))
do_color_align = 1;

/* requesting a specific address */
if (addr) {
if (do_color_align)
addr = COLOUR_ALIGN(addr, pgoff);
else
addr = PAGE_ALIGN(addr);

vma = find_vma(mm, addr); /* look up the vma covering/following addr */
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}

info.length = len;
info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0;
info.align_offset = pgoff << PAGE_SHIFT;

if (dir == DOWN) { /* top-down mapping (never taken on this call path) */
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
addr = vm_unmapped_area(&info); /* scan the mmap region, guided by vm_unmapped_area_info, for memory satisfying the request */

if (!(addr & ~PAGE_MASK)) /* "addr & ~PAGE_MASK" is zero iff addr is page-aligned (a multiple of 4096) */
return addr; /* page-aligned result: the top-down search succeeded */

/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
}

info.flags = 0;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
return vm_unmapped_area(&info); /* scan the mmap region, guided by vm_unmapped_area_info, for memory satisfying the request */
}
  • vm_unmapped_area 用于在 mmap 映射区域中查找满足请求的内存(以 vm_area_struct 为单位),这是内存分配中最底层的内容
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Search for an unmapped address range satisfying all of:
 * - it does not intersect any VMA
 * - it lies within [low_limit, high_limit)
 * - it is at least 'length' bytes long
 * - (begin_addr & align_mask) == (align_offset & align_mask)
 */
static inline unsigned long
vm_unmapped_area(struct vm_unmapped_area_info *info)
{
if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) /* VM_UNMAPPED_AREA_TOPDOWN: search top-down (never set on this call path) */
return unmapped_area_topdown(info); /* reverse (top-down) search */
else
return unmapped_area(info); /* forward (bottom-up) search */
}

  • 看来 mmap 还支持反向映射,我们这里主要研究正向映射 unmapped_area
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
 * We implement the search by looking for the rbtree node that
 * immediately follows a suitable gap, i.e.:
 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
 * - gap_end = vma->vm_start >= info->low_limit + length;
 * - gap_end - gap_start >= length
 */

struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;

/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;

/* Adjust search limits by the desired length */
if (info->high_limit < length)
return -ENOMEM;
high_limit = info->high_limit - length;

if (info->low_limit > high_limit)
return -ENOMEM;
low_limit = info->low_limit + length;

/* Check if rbtree root looks promising */
if (RB_EMPTY_ROOT(&mm->mm_rb))
goto check_highest;
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
if (vma->rb_subtree_gap < length)
goto check_highest;

while (true) {
/* Visit left subtree if it looks promising */
gap_end = vm_start_gap(vma);
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb);
if (left->rb_subtree_gap >= length) {
vma = left;
continue;
}
}

gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
if (gap_end >= low_limit &&
gap_end > gap_start && gap_end - gap_start >= length)
goto found;

/* Visit right subtree if it looks promising */
if (vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb);
if (right->rb_subtree_gap >= length) {
vma = right;
continue;
}
}

/* Go back up the rbtree to find next candidate node */
while (true) {
struct rb_node *prev = &vma->vm_rb;
if (!rb_parent(prev))
goto check_highest;
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
gap_end = vm_start_gap(vma);
goto check_current;
}
}
}

check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;

found:
/* We found a suitable gap. Clip it with the original low_limit. */
if (gap_start < info->low_limit)
gap_start = info->low_limit;

/* Adjust gap address to the desired alignment */
gap_start += (info->align_offset - gap_start) & info->align_mask;

VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
return gap_start; /* the address the search finally settled on */
}
  • 最底层的查找过程是用 红黑树 实现的(由于本人对红黑树还不是很了解,这里就先跳过了)
  • 至于 mmap 映射区域的由来,这就是分页机制和内容了
  • 最后返回到之前的函数中,mmap 也设置了两种机制:共享和私有
    • 如果是共享映射,那么在内存中对文件进行修改,磁盘中对应的文件也会被修改,相反,磁盘中的文件有了修改,内存中的文件也被修改
    • 如果是私有映射,那么内存中的文件是独立的,二者进行修改都不会对对方造成影响
  • 不管是调用 do_mmap_shared_file 或者 do_mmap_private,他们底层都会调用 call_mmap 完成最后的设置
1
2
3
4
/* Dispatch to the filesystem/driver mmap hook for this file. */
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
return file->f_op->mmap(file, vma);
}
  • 在 Ext4 文件系统中 file->f_op->mmap 指向 ext4_file_mmap(Linux 默认的文件系统为 Ext2 Ext3 Ext4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* ext4 implementation of f_op->mmap: rejects mappings on a shut-down
 * filesystem, then installs the appropriate vm_ops for the vma. */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;

if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;

/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;

file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops; /* install vma->vm_ops (used later by the page fault handler) */
vma->vm_flags |= VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
  • 当所有的剩余工作都处理完成后,mmap 就会返回在 mmap 映射区找到的 addr