
Principles: How Shared Memory (shm) Works Under the Hood

Shared Memory Basics

Linux provides two shared-memory mechanisms: mmap and System V shm.

Since the total virtual address space of all user processes is far larger than the available physical memory, only the most frequently used parts are backed by physical page frames (this is not a problem, because most programs use only a small fraction of the memory actually available to them).

  • When mapping data on disk into a process's virtual address space, the kernel must provide data structures that associate regions of the virtual address space with the locations of the corresponding data; on Linux this is the multi-level page table mechanism
  • Shared memory lets multiple processes access the same block of memory (saving memory), and each process sees the others' updates to the shared data immediately (since multiple processes can operate on it concurrently, synchronization is needed, typically with semaphores)

This article focuses on shm.

The Shared Memory API

int shmget(key_t key, size_t size, int shmflg); /* create or obtain a shared memory segment */
  • key:
    • IPC_PRIVATE ("0"): always creates a new shared memory object
    • a positive 32-bit integer: behavior depends on shmflg (this value usually comes from the IPC key returned by ftok)
  • size:
    • "0": used when obtaining an existing segment
    • a positive integer: size of the new segment, in bytes
  • shmflg:
    • "0": obtain the identifier of an existing segment; fails if it does not exist
    • IPC_CREAT: if no segment whose key equals key exists in the kernel, create one; if one exists, return its identifier
    • IPC_CREAT | IPC_EXCL: if no segment whose key equals key exists, create one; if one exists, fail
  • return:
    • on success: the shared memory identifier
    • on error: "-1", with the cause stored in errno
void *shmat(int shmid, const void *shmaddr, int shmflg); /* map the segment into the address space */
  • shmid:
    • the shared memory identifier
  • shmaddr:
    • if shmaddr is "0", the segment is attached at the first available address chosen by the kernel
    • if shmaddr is non-zero and SHM_RND is not specified, the segment is attached at the address given by shmaddr, which must be a page-aligned address
    • if shmaddr is non-zero and SHM_RND is specified, the system rounds shmaddr down to a page boundary automatically
  • shmflg:
    • "0": read-write mode
    • SHM_RDONLY: read-only mode
    • SHM_EXEC: request execute permission on the segment (for shared memory, execute permission is effectively the same as read permission)
    • SHM_RND: round down to the nearest SHMLBA boundary (effective when shmaddr is non-NULL)
    • SHM_REMAP: take over any existing mapping in the attach range
  • return:
    • on success: the address of the shared memory
    • on error: "-1", with the cause stored in errno
  • PS:
    • after fork, the child inherits the attached shared memory address
    • after exec, the process is automatically detached from the attached shared memory
    • when the process exits, the attached shared memory is automatically detached
int shmdt(const void *shmaddr); /* detach the segment from the address space */
  • shmaddr:
    • the starting address of the attached shared memory
  • return:
    • on success: "0"
    • on error: "-1", with the cause stored in errno
int shmctl(int shmid, int cmd, struct shmid_ds *buf); /* operate on a shared memory segment */
  • shmid:
    • the shared memory identifier
  • cmd:
    • IPC_STAT: get the segment's state by copying its shmid_ds structure into buf
    • IPC_SET: change the segment's state by copying the uid, gid, and mode from the shmid_ds structure pointed to by buf into the segment's shmid_ds
    • IPC_RMID: delete the segment (destroy the shmid created by shmget)
  • buf:
    • the shared memory management structure
  • return:
    • on success: "0"
    • on error: "-1", with the cause stored in errno

Shared Memory Usage Examples

shm writer:

#include <stdio.h>
#include <sys/shm.h>
#include <unistd.h>
#include <string.h>

int main(int argc, char **argv) {
    int shmid;
    int i = 0;
    char *pshm;
    char buf[1024];
    key_t key = ftok(".", 'z');

    printf("key = 0x%x\n", key);
    shmid = shmget(key, 1024 * 10, 0666 | IPC_CREAT);
    pshm = shmat(shmid, 0, 0);

    printf("input node 0-9\n");
    scanf("%d", &i);
    printf("node is %d\n", i);

    memset(buf, 0, sizeof(buf));
    printf("input data\n");
    scanf("%s", buf);
    memcpy(pshm + i * 1024, buf, 1024);
    shmdt(pshm);
    return 0;
}

shm reader:

#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/shm.h>

int main(int argc, char **argv) {
    int i;
    char *pshm;
    char buf[1024];
    int shmid;
    key_t key = ftok(".", 'z');

    printf("key = 0x%x\n", key);
    shmid = shmget(key, 1024 * 10, 0666 | IPC_CREAT);
    pshm = shmat(shmid, 0, 0);

    printf("input node 0-9\n");
    scanf("%d", &i);
    printf("node is %d\n", i);

    memset(buf, 0, 1024);
    memcpy(buf, pshm + i * 1024, 1024);
    fprintf(stderr, "data [%s]\n", buf);
    shmdt(pshm);
    return 0;
}
  • Result (the reader and writer are split into two programs here to demonstrate the communication):
exp ./send
key = 0x7a05274f
input node 0-9
0
node is 0
input data
yhellow
exp ./read
key = 0x7a05274f
input node 0-9
0
node is 0
data [yhellow]
  • On the surface this looks a lot like msg:
    • the "shared memory segment - shmid_kernel" is analogous to the "message queue - msg_queue"
    • the "mapped memory - shm_file_data" is analogous to the "message - msg_msg"
  • The visible difference is just an extra region of virtual memory:
0x7ffff7fb0000     0x7ffff7fb6000 rw-p     6000 0      [anon_7ffff7fb0]
0x7ffff7fc6000     0x7ffff7fc9000 rw-p     3000 0      /SYSV7a05274f (deleted)
0x7ffff7fc9000     0x7ffff7fcd000 r--p     4000 0      [vvar]
0x7ffff7fcd000     0x7ffff7fcf000 r-xp     2000 0      [vdso]
pwndbg> telescope 0x7ffff7fc6000
00:0000│  0x7ffff7fc6000 ◂— 0x776f6c6c656879 /* 'yhellow' */
01:0008│  0x7ffff7fc6008 ◂— 0x0

shm parent-child communication:

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BUFFER_SIZE 2048

int main()
{
    pid_t pid;
    int shmid;
    char *shm_addr;
    char flag[] = "WROTE";
    char buff[2048];

    system("ipcs -m");
    if ((shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, 0666)) < 0) {
        perror("shmget");
        exit(1);
    }
    else {
        printf("Create shared-memory: %d\n", shmid);
    }
    system("ipcs -m");

    pid = fork();
    if (pid == -1) {
        perror("fork");
        exit(1);
    }
    else if (pid == 0) { /* child */
        if ((shm_addr = shmat(shmid, 0, 0)) == (void *)-1) {
            perror("Child: shmat");
            exit(1);
        }
        else {
            printf("Child: Attach shared-memory addr: %p\n", shm_addr);
        }
        system("ipcs -m");

        while (strncmp(shm_addr, flag, strlen(flag))) {
            printf("Child: Wait for enable data...\n");
            sleep(5);
        }

        strcpy(buff, shm_addr + strlen(flag));
        printf("Child: Shared-memory :%s\n", buff);

        if ((shmdt(shm_addr)) < 0) {
            perror("shmdt");
            exit(1);
        }
        else {
            printf("Child: Deattach shared-memory\n");
        }
        system("ipcs -m");

        if (shmctl(shmid, IPC_RMID, NULL) == -1) {
            perror("Child: shmctl(IPC_RMID)\n");
            exit(1);
        }
        else {
            printf("Delete shared-memory\n");
        }
        system("ipcs -m");
    }
    else { /* parent */
        sleep(1);
        if ((shm_addr = shmat(shmid, 0, 0)) == (void *)-1) {
            perror("Parent: shmat");
            exit(1);
        }
        else {
            printf("Parent: Attach shared-memory addr: %p\n", shm_addr);
        }

        printf("\nInput some string:(Parent)\n");
        fgets(buff, BUFFER_SIZE, stdin);
        strncpy(shm_addr + strlen(flag), buff, strlen(buff));
        strncpy(shm_addr, flag, strlen(flag));
        system("ipcs -m");

        if ((shmdt(shm_addr)) < 0) {
            perror("Parent: shmdt");
            exit(1);
        }
        else {
            printf("Parent: Deattach shared-memory\n");
        }
        system("ipcs -m");

        waitpid(pid, NULL, 0);
        printf("Finished\n");
    }
    exit(0);
}
  • Result:
exp ./shmem

------ Shared Memory Segments -------- /* initial state */
key        shmid      owner      perms      bytes      nattch     status
0x00000000 4          yhellow    600        524288     2          dest
0x00000000 7          yhellow    600        524288     2          dest
0x00000000 11         yhellow    600        524288     2          dest

Create shared-memory: 13

------ Shared Memory Segments -------- /* the segment newly created by shmget (shmid==13) */
key        shmid      owner      perms      bytes      nattch     status
0x00000000 4          yhellow    600        524288     2          dest
0x00000000 7          yhellow    600        524288     2          dest
0x00000000 11         yhellow    600        524288     2          dest
0x00000000 13         yhellow    666        2048       0

Child: Attach shared-memory addr: 0x7fe24a52d000

------ Shared Memory Segments -------- /* the child attaches the segment, incrementing nattch */
key        shmid      owner      perms      bytes      nattch     status
0x00000000 4          yhellow    600        524288     2          dest
0x00000000 7          yhellow    600        524288     2          dest
0x00000000 11         yhellow    600        524288     2          dest
0x00000000 13         yhellow    666        2048       1

Child: Wait for enable data...
Parent: Attach shared-memory addr: 0x7fe24a52d000

Input some string:(Parent)
yhellow /* input typed into the parent */

------ Shared Memory Segments -------- /* the parent attaches the segment, incrementing nattch */
key        shmid      owner      perms      bytes      nattch     status
0x00000000 4          yhellow    600        524288     2          dest
0x00000000 7          yhellow    600        524288     2          dest
0x00000000 11         yhellow    600        524288     2          dest
0x00000000 13         yhellow    666        2048       2

Parent: Deattach shared-memory

------ Shared Memory Segments -------- /* the parent detaches the segment, decrementing nattch */
key        shmid      owner      perms      bytes      nattch     status
0x00000000 4          yhellow    600        524288     2          dest
0x00000000 7          yhellow    600        524288     2          dest
0x00000000 11         yhellow    600        524288     2          dest
0x00000000 13         yhellow    666        2048       1

Child: Shared-memory :yhellow /* output from the child */

Child: Deattach shared-memory

------ Shared Memory Segments -------- /* the child detaches the segment, decrementing nattch */
key        shmid      owner      perms      bytes      nattch     status
0x00000000 4          yhellow    600        524288     2          dest
0x00000000 7          yhellow    600        524288     2          dest
0x00000000 11         yhellow    600        524288     2          dest
0x00000000 13         yhellow    666        2048       0

Delete shared-memory

------ Shared Memory Segments -------- /* shmctl destroyed the specified segment */
key        shmid      owner      perms      bytes      nattch     status
0x00000000 4          yhellow    600        524288     2          dest
0x00000000 7          yhellow    600        524288     2          dest
0x00000000 11         yhellow    600        524288     2          dest

Finished

How shm Is Implemented in Linux

0x7ffff7ee11fc <shmget+12>       syscall  <SYS_shmget>
key: 0x7a05274f
size: 0x2800
shmflg: 0x3b6
struct ipc_ops {
    int (*getnew)(struct ipc_namespace *, struct ipc_params *);
    int (*associate)(struct kern_ipc_perm *, int);
    int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *);
};

long ksys_shmget(key_t key, size_t size, int shmflg)
{
    struct ipc_namespace *ns;
    static const struct ipc_ops shm_ops = {
        .getnew = newseg,
        .associate = security_shm_associate,
        .more_checks = shm_more_checks,
    }; /* initialize the "creation routines" */
    struct ipc_params shm_params;

    ns = current->nsproxy->ipc_ns; /* get the current IPC namespace */

    shm_params.key = key;     /* key */
    shm_params.flg = shmflg;  /* flags */
    shm_params.u.size = size; /* size */

    return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); /* core function */
}
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
           const struct ipc_ops *ops, struct ipc_params *params)
{
    if (params->key == IPC_PRIVATE) /* private? */
        return ipcget_new(ns, ids, ops, params); /* create a new ipc object */
    else
        return ipcget_public(ns, ids, ops, params); /* get an existing ipc object or create a new one */
}
static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
                         const struct ipc_ops *ops, struct ipc_params *params)
{
    // *ns: the ipc namespace
    // *ids: the ipc identifier set
    // *ops: the actual creation routines to call
    // *params: their parameters
    struct kern_ipc_perm *ipcp;
    int flg = params->flg;
    int err;

    /*
     * Take the lock as a writer since we are potentially going to add
     * a new entry + read locks are not "upgradable"
     */
    down_write(&ids->rwsem);
    ipcp = ipc_findkey(ids, params->key); /* look up an ids object by key */
    if (ipcp == NULL) {
        if (!(flg & IPC_CREAT)) /* IPC_CREAT: create a new segment if none with this key exists in the kernel */
            err = -ENOENT;
        else
            err = ops->getnew(ns, params); /* create a new shared memory segment */
    } else {
        if (flg & IPC_CREAT && flg & IPC_EXCL) /* IPC_CREAT|IPC_EXCL: fail if a segment with the same key already exists */
            err = -EEXIST;
        else {
            err = 0;
            if (ops->more_checks)
                err = ops->more_checks(ipcp, params);
            if (!err)
                /* ipc_check_perms returns the IPC id on success */
                err = ipc_check_perms(ns, ipcp, ops, params);
        }
        ipc_unlock(ipcp);
    }
    up_write(&ids->rwsem);

    return err;
}
  • Everything so far is common to shared memory, semaphores, and message queues; only the initialization of the ipc_ops structure differs
  • This is really object-oriented thinking: a set of functions and data structures describes the IPC "class" (it just cannot be defined as a literal class in C)
  • The source of newseg, which creates a new shared memory segment, is as follows:
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
    // *ns: the namespace
    // *params: points to the structure holding key and flags (ipc_params)
    key_t key = params->key;
    int shmflg = params->flg;
    size_t size = params->u.size;
    int error;
    struct shmid_kernel *shp;
    size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
    struct file *file;
    char name[13];
    vm_flags_t acctflag = 0;

    if (size < SHMMIN || size > ns->shm_ctlmax)
        return -EINVAL;

    if (numpages << PAGE_SHIFT < size)
        return -ENOSPC;

    if (ns->shm_tot + numpages < ns->shm_tot ||
        ns->shm_tot + numpages > ns->shm_ctlall)
        return -ENOSPC;

    shp = kvmalloc(sizeof(*shp), GFP_KERNEL); /* allocate kernel heap space for the shmid_kernel */
    if (unlikely(!shp))
        return -ENOMEM;

    shp->shm_perm.key = key; /* fill in shmid_kernel->shm_perm from the parameters */
    shp->shm_perm.mode = (shmflg & S_IRWXUGO);
    shp->mlock_user = NULL;

    shp->shm_perm.security = NULL;
    error = security_shm_alloc(&shp->shm_perm); /* allocate the LSM security state for shm_perm */
    if (error) {
        kvfree(shp);
        return error;
    }

    sprintf(name, "SYSV%08x", key); /* PS: this name is what the "vmmap" command shows in GDB */
    if (shmflg & SHM_HUGETLB) { /* SHM_HUGETLB: huge-page mapping */
        struct hstate *hs;
        size_t hugesize;

        hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); /* look up the hstate for the requested huge page size */
        if (!hs) {
            error = -EINVAL;
            goto no_file;
        }
        hugesize = ALIGN(size, huge_page_size(hs)); /* align */

        /* hugetlb_file_setup applies strict accounting */
        if (shmflg & SHM_NORESERVE)
            acctflag = VM_NORESERVE;
        file = hugetlb_file_setup(name, hugesize, acctflag,
                                  &shp->mlock_user, HUGETLB_SHMFS_INODE,
                                  (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
    } else {
        /*
         * Do not allow no accounting for OVERCOMMIT_NEVER, even
         * if it's asked for.
         */
        if ((shmflg & SHM_NORESERVE) &&
            sysctl_overcommit_memory != OVERCOMMIT_NEVER)
            acctflag = VM_NORESERVE;
        file = shmem_kernel_file_setup(name, size, acctflag); /* creates a file in the shmem filesystem */
        /* creates the dentry and inode for the new shmem file, links the two together, then allocates a struct file to represent the new shmem file */
    }
    error = PTR_ERR(file);
    if (IS_ERR(file))
        goto no_file;

    shp->shm_cprid = get_pid(task_tgid(current)); /* record the creator pid */
    shp->shm_lprid = NULL;
    shp->shm_atim = shp->shm_dtim = 0;
    shp->shm_ctim = ktime_get_real_seconds();
    shp->shm_segsz = size;
    shp->shm_nattch = 0;
    shp->shm_file = file;
    shp->shm_creator = current;

    /* ipc_addid() locks shp upon success. */
    error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); /* hang the new shmid_kernel on the radix tree inside shm_ids */
    if (error < 0)
        goto no_id;

    list_add(&shp->shm_clist, &current->sysvshm.shm_clist); /* insert into the per-task shm list */

    /*
     * shmid gets reported as "inode#" in /proc/pid/maps.
     * proc-ps tools use this. Changing this will break them.
     */
    file_inode(file)->i_ino = shp->shm_perm.id;

    ns->shm_tot += numpages;
    error = shp->shm_perm.id;

    ipc_unlock_object(&shp->shm_perm);
    rcu_read_unlock();
    return error;

no_id:
    ipc_update_pid(&shp->shm_cprid, NULL);
    ipc_update_pid(&shp->shm_lprid, NULL);
    if (is_file_hugepages(file) && shp->mlock_user)
        user_shm_unlock(size, shp->mlock_user);
    fput(file);
    ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); /* release the object */
    return error;
no_file:
    call_rcu(&shp->shm_perm.rcu, shm_rcu_free);
    return error;
}
  • Next, look at how shmem_kernel_file_setup produces the file (the newly created struct file exists purely to back the memory mapping):
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
    return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
                                       unsigned long flags, unsigned int i_flags)
{
    struct inode *inode;
    struct file *res;

    if (IS_ERR(mnt))
        return ERR_CAST(mnt);

    if (size < 0 || size > MAX_LFS_FILESIZE)
        return ERR_PTR(-EINVAL);

    if (shmem_acct_size(flags, size))
        return ERR_PTR(-ENOMEM);

    inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
                            flags); /* allocate an inode and perform a series of initializations */
    if (unlikely(!inode)) {
        shmem_unacct_size(flags, size);
        return ERR_PTR(-ENOSPC);
    }
    inode->i_flags |= i_flags;
    inode->i_size = size;
    clear_nlink(inode); /* It is unlinked */
    res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
    if (!IS_ERR(res))
        res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
                                &shmem_file_operations); /* allocate a (pseudo) file based on the inode */
    if (IS_ERR(res))
        iput(inode);
    return res;
}
  • Besides producing a new file (not a real on-disk file, but an abstract file that lives in memory, which is the fundamental difference between shm and mmap), newseg also creates an important structure on the kernel heap
  • The structure shmid_kernel manages the shared memory segment produced by shmget:
struct shmid_kernel /* private to the kernel */
{
    struct kern_ipc_perm shm_perm;
    struct file *shm_file;
    unsigned long shm_nattch;
    unsigned long shm_segsz;
    time64_t shm_atim;
    time64_t shm_dtim;
    time64_t shm_ctim;
    struct pid *shm_cprid;
    struct pid *shm_lprid;
    struct user_struct *mlock_user;

    /* The task created the shm object. NULL if the task is dead. */
    struct task_struct *shm_creator;
    struct list_head shm_clist; /* list by creator */
} __randomize_layout;
  • Both shmat and shmdt rely on another important structure, shm_file_data:
struct shm_file_data {
    int id;
    struct ipc_namespace *ns;
    struct file *file;
    const struct vm_operations_struct *vm_ops;
};

The core function for mapping shared memory: shmat

0x7ffff7ee1199 <shmat+9>        syscall  <SYS_shmat>
shmid: 0x24
shmaddr: 0x0
shmflg: 0x0
  • Under the hood, shmat calls do_shmat:
long do_shmat(int shmid, char __user *shmaddr, int shmflg,
              ulong *raddr, unsigned long shmlba)
{
    struct shmid_kernel *shp;
    unsigned long addr = (unsigned long)shmaddr;
    unsigned long size;
    struct file *file, *base;
    int err;
    unsigned long flags = MAP_SHARED;
    unsigned long prot;
    int acc_mode;
    struct ipc_namespace *ns;
    struct shm_file_data *sfd;
    int f_flags;
    unsigned long populate = 0;

    err = -EINVAL;
    if (shmid < 0)
        goto out;

    if (addr) { /* shmaddr is non-NULL (uncommon) */
        if (addr & (shmlba - 1)) {
            if (shmflg & SHM_RND) {
                addr &= ~(shmlba - 1);  /* round down */

                /*
                 * Ensure that the round-down is non-nil
                 * when remapping. This can happen for
                 * cases when addr < shmlba.
                 */
                if (!addr && (shmflg & SHM_REMAP))
                    goto out;
            } else
#ifndef __ARCH_FORCE_SHMLBA
                if (addr & ~PAGE_MASK)
#endif
                    goto out;
        }

        flags |= MAP_FIXED;
    } else if ((shmflg & SHM_REMAP))
        goto out;

    if (shmflg & SHM_RDONLY) { /* SHM_RDONLY: read-only mode */
        prot = PROT_READ;
        acc_mode = S_IRUGO;
        f_flags = O_RDONLY;
    } else {
        prot = PROT_READ | PROT_WRITE;
        acc_mode = S_IRUGO | S_IWUGO;
        f_flags = O_RDWR;
    }
    if (shmflg & SHM_EXEC) { /* SHM_EXEC: request execute permission on the segment */
        prot |= PROT_EXEC;
        acc_mode |= S_IXUGO;
    }

    /*
     * We cannot rely on the fs check since SYSV IPC does have an
     * additional creator id...
     */
    ns = current->nsproxy->ipc_ns;
    rcu_read_lock();
    shp = shm_obtain_object_check(ns, shmid); /* find the corresponding struct shmid_kernel in the radix tree by shmid */
    if (IS_ERR(shp)) {
        err = PTR_ERR(shp);
        goto out_unlock;
    }

    err = -EACCES;
    if (ipcperms(ns, &shp->shm_perm, acc_mode))
        goto out_unlock;

    err = security_shm_shmat(&shp->shm_perm, shmaddr, shmflg);
    if (err)
        goto out_unlock;

    ipc_lock_object(&shp->shm_perm);

    /* check if shm_destroy() is tearing down shp */
    if (!ipc_valid_object(&shp->shm_perm)) {
        ipc_unlock_object(&shp->shm_perm);
        err = -EIDRM;
        goto out_unlock;
    }

    /*
     * We need to take a reference to the real shm file to prevent the
     * pointer from becoming stale in cases where the lifetime of the outer
     * file extends beyond that of the shm segment. It's not usually
     * possible, but it can happen during remap_file_pages() emulation as
     * that unmaps the memory, then does ->mmap() via file reference only.
     * We'll deny the ->mmap() if the shm segment was since removed, but to
     * detect shm ID reuse we need to compare the file pointers.
     */
    base = get_file(shp->shm_file); /* take a reference to the in-memory shmem file as base */
    shp->shm_nattch++;
    size = i_size_read(file_inode(base)); /* get the size of shm_file */
    ipc_unlock_object(&shp->shm_perm);
    rcu_read_unlock();

    err = -ENOMEM;
    sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); /* allocate memory for the shm_file_data */
    if (!sfd) {
        fput(base);
        goto out_nattch;
    }

    file = alloc_file_clone(base, f_flags,
                            is_file_hugepages(base) ?
                            &shm_file_operations_huge :
                            &shm_file_operations);
    /* clone a struct file instance; its private_data field is set to sfd below */
    err = PTR_ERR(file);
    if (IS_ERR(file)) {
        kfree(sfd);
        fput(base);
        goto out_nattch;
    }

    sfd->id = shp->shm_perm.id;
    sfd->ns = get_ipc_ns(ns);
    sfd->file = base;
    sfd->vm_ops = NULL;
    file->private_data = sfd;

    err = security_mmap_file(file, prot, flags);
    if (err)
        goto out_fput;

    if (down_write_killable(&current->mm->mmap_sem)) {
        err = -EINTR;
        goto out_fput;
    }

    if (addr && !(shmflg & SHM_REMAP)) {
        err = -EINVAL;
        if (addr + size < addr)
            goto invalid;

        if (find_vma_intersection(current->mm, addr, addr + size))
            goto invalid;
    }

    addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL); /* the core mapping function (also called underneath mmap) */
    *raddr = addr;
    err = 0;
    if (IS_ERR_VALUE(addr))
        err = (long)addr;
invalid:
    up_write(&current->mm->mmap_sem);
    if (populate)
        mm_populate(addr, populate);

out_fput:
    fput(file);

out_nattch:
    down_write(&shm_ids(ns).rwsem);
    shp = shm_lock(ns, shmid);
    shp->shm_nattch--;
    if (shm_may_destroy(ns, shp))
        shm_destroy(ns, shp);
    else
        shm_unlock(shp);
    up_write(&shm_ids(ns).rwsem);
    return err;

out_unlock:
    rcu_read_unlock();
out:
    return err;
}
  • The memory-allocation part at the bottom of do_shmat is the same as mmap's; do_mmap_pgoff has already been analyzed in an earlier post
  • PS: since do_mmap_pgoff still calls do_mmap underneath, the memory can be released with do_munmap, which is exactly what shmdt exploits under the hood

The core function for detaching shared memory: shmdt

0x7ffff7ee11c9 <shmdt+9>        syscall  <SYS_shmdt>
shmaddr: 0x7ffff7ffb000 ◂— 0x0
  • Under the hood, shmdt calls ksys_shmdt:
long ksys_shmdt(char __user *shmaddr)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long addr = (unsigned long)shmaddr;
    int retval = -EINVAL;
#ifdef CONFIG_MMU
    loff_t size = 0;
    struct file *file;
    struct vm_area_struct *next;
#endif

    if (addr & ~PAGE_MASK)
        return retval;

    if (down_write_killable(&mm->mmap_sem))
        return -EINTR;

    /*
     * This function tries to be smart and unmap shm segments that
     * were modified by partial mlock or munmap calls:
     * - It first determines the size of the shm segment that should be
     *   unmapped: It searches for a vma that is backed by shm and that
     *   started at address shmaddr. It records it's size and then unmaps
     *   it.
     * - Then it unmaps all shm vmas that started at shmaddr and that
     *   are within the initially determined size and that are from the
     *   same shm segment from which we determined the size.
     * Errors from do_munmap are ignored: the function only fails if
     * it's called with invalid parameters or if it's called to unmap
     * a part of a vma. Both calls in this function are for full vmas,
     * the parameters are directly copied from the vma itself and always
     * valid - therefore do_munmap cannot fail. (famous last words?)
     */
    /*
     * If it had been mremap()'d, the starting address would not
     * match the usual checks anyway. So assume all vma's are
     * above the starting address given.
     */
    vma = find_vma(mm, addr); /* given a virtual address belonging to a process, find the virtual memory area containing it and return the corresponding vm_area_struct pointer */

#ifdef CONFIG_MMU
    while (vma) {
        next = vma->vm_next;

        /*
         * Check if the starting address would match, i.e. it's
         * a fragment created by mprotect() and/or munmap(), or it
         * otherwise it starts at this address with no hassles.
         */
        if ((vma->vm_ops == &shm_vm_ops) &&
            (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {

            /*
             * Record the file of the shm segment being
             * unmapped. With mremap(), someone could place
             * page from another segment but with equal offsets
             * in the range we are unmapping.
             */
            file = vma->vm_file;
            size = i_size_read(file_inode(vma->vm_file));
            do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); /* release the memory produced by do_mmap */
            /*
             * We discovered the size of the shm segment, so
             * break out of here and fall through to the next
             * loop that uses the size information to stop
             * searching for matching vma's.
             */
            retval = 0;
            vma = next;
            break;
        }
        vma = next;
    }

    /*
     * We need look no further than the maximum address a fragment
     * could possibly have landed at. Also cast things to loff_t to
     * prevent overflows and make comparisons vs. equal-width types.
     */
    size = PAGE_ALIGN(size);
    while (vma && (loff_t)(vma->vm_end - addr) <= size) {
        next = vma->vm_next;

        /* finding a matching vma now does not alter retval */
        if ((vma->vm_ops == &shm_vm_ops) &&
            ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
            (vma->vm_file == file))
            do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); /* release the memory produced by do_mmap */
        vma = next;
    }

#else /* CONFIG_MMU */
    /* under NOMMU conditions, the exact address to be destroyed must be
     * given
     */
    if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
        do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); /* release the memory produced by do_mmap */
        retval = 0;
    }

#endif

    up_write(&mm->mmap_sem);
    return retval;
}
  • The kernel developers have already documented the traversal-and-release process in the comments; these steps exist either for safety or for efficiency

mmap vs. shm

  • How mmap works:
    • a file is created on disk, and each process maps it into its own address space; with many processes, this does not add much to actual memory consumption
    • data is persisted to the disk file, and the actual storage is not reflected in main memory (low memory use, but slower)
  • How shm works:
    • each process's shared memory is mapped directly into memory
    • data lives in memory, and the actual storage is reflected directly in main memory (fast, but consumes memory)

What I find curious is that both shm and mmap rely on files underneath (even anonymous mmap uses the /dev/zero file underneath):

  • mmap uses the file directly to store the data
  • shm uses a file to set up the mapping

Anonymous pipes likewise use alloc_file_pseudo to produce an "abstract file" and use it for data transfer.

  • This makes sense: the inode managed by a file is directly tied to memory, and file->f_op supplies many driver-related kernel functions (for example, the call chain of do_mmap_pgoff uses file->f_op->get_unmapped_area)

POSIX Shared Memory

Traditional System V shm has an upgraded API:

static void shm_open(struct vm_area_struct *vma); /* creates a file under /dev/shm/ to serve as the process's shared memory */
static void shm_close(struct vm_area_struct *vma); /* releases the target shared memory */
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); /* destroys the corresponding file under /dev/shm/ */

/dev/shm/ is mounted as a tmpfs filesystem; it can be thought of as files that exist only in memory.